verifiers 0.1.13.dev1__tar.gz → 0.1.13.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/PKG-INFO +1 -1
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_composable_env.py +77 -0
- verifiers-0.1.13.dev2/tests/test_context_token_metrics.py +200 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_environment_extra.py +3 -1
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rlm_composable_env.py +146 -10
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/__init__.py +1 -1
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/environment.py +4 -6
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/cli_agent_env.py +11 -8
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/composable_env.py +24 -3
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harness.py +17 -1
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -2
- verifiers-0.1.13.dev2/verifiers/envs/experimental/composable/harnesses/rlm.py +287 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/stateful_tool_env.py +2 -2
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/tool_env.py +11 -11
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/tui.py +283 -134
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/types.py +2 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/eval_display.py +28 -13
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/eval_utils.py +31 -12
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/metric_utils.py +27 -11
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/save_utils.py +29 -5
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/usage_utils.py +52 -0
- verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -82
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/.gitignore +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/LICENSE +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/README.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/pyproject.toml +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/AGENTS.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/README.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/conftest.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_build_script.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_client_config.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_env_group.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_env_server.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_environment.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_envs.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_eval_cli.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_imports.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_logging.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rubric.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/decorators.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/harbor_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/errors.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/eval.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/version_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.13.dev1
|
|
3
|
+
Version: 0.1.13.dev2
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -429,6 +429,83 @@ async def test_composable_env_no_upload_when_no_dirs(tmp_path, monkeypatch):
|
|
|
429
429
|
assert env.upload_file.await_count == 0
|
|
430
430
|
|
|
431
431
|
|
|
432
|
+
@pytest.mark.asyncio
|
|
433
|
+
async def test_composable_env_uploads_harness_dirs(tmp_path):
|
|
434
|
+
taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
435
|
+
harness_dir = tmp_path / "agent-src"
|
|
436
|
+
harness_dir.mkdir()
|
|
437
|
+
(harness_dir / "marker.txt").write_text("agent\n")
|
|
438
|
+
|
|
439
|
+
env = ComposableEnv(
|
|
440
|
+
taskset=taskset,
|
|
441
|
+
harness=Harness(
|
|
442
|
+
run_command="true",
|
|
443
|
+
install_script="install-agent",
|
|
444
|
+
get_upload_dirs=lambda: {"agent_src": harness_dir},
|
|
445
|
+
upload_dir_mapping={"agent_src": "/tmp/agent-src"},
|
|
446
|
+
),
|
|
447
|
+
)
|
|
448
|
+
env.sandbox_client = SimpleNamespace(
|
|
449
|
+
execute_command=AsyncMock(
|
|
450
|
+
return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
|
|
451
|
+
),
|
|
452
|
+
teardown=lambda: None,
|
|
453
|
+
)
|
|
454
|
+
env.taskset.setup = AsyncMock()
|
|
455
|
+
env.upload_content = AsyncMock()
|
|
456
|
+
env.upload_file = AsyncMock()
|
|
457
|
+
|
|
458
|
+
await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
|
|
459
|
+
|
|
460
|
+
env.upload_file.assert_awaited_once()
|
|
461
|
+
upload_call = env.upload_file.await_args
|
|
462
|
+
assert upload_call.args[0] == "sbx"
|
|
463
|
+
assert upload_call.args[1] == "/tmp/_upload_tmp_agent-src.tar.gz"
|
|
464
|
+
|
|
465
|
+
extract_call = env.sandbox_client.execute_command.await_args_list[1]
|
|
466
|
+
assert extract_call == call(
|
|
467
|
+
"sbx",
|
|
468
|
+
"mkdir -p /tmp && tar -xzf /tmp/_upload_tmp_agent-src.tar.gz -C / && rm -f /tmp/_upload_tmp_agent-src.tar.gz",
|
|
469
|
+
timeout=60,
|
|
470
|
+
)
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
@pytest.mark.asyncio
|
|
474
|
+
async def test_composable_env_rejects_duplicate_task_and_harness_upload_names(
|
|
475
|
+
tmp_path, monkeypatch
|
|
476
|
+
):
|
|
477
|
+
mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=True)
|
|
478
|
+
monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
|
|
479
|
+
taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
|
|
480
|
+
harness_dir = tmp_path / "skills"
|
|
481
|
+
harness_dir.mkdir()
|
|
482
|
+
|
|
483
|
+
env = ComposableEnv(
|
|
484
|
+
taskset=taskset,
|
|
485
|
+
harness=Harness(
|
|
486
|
+
run_command="true",
|
|
487
|
+
install_script="install-agent",
|
|
488
|
+
get_upload_dirs=lambda: {"skills": harness_dir},
|
|
489
|
+
skills_path="/task/skills",
|
|
490
|
+
),
|
|
491
|
+
)
|
|
492
|
+
env.sandbox_client = SimpleNamespace(
|
|
493
|
+
execute_command=AsyncMock(
|
|
494
|
+
return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
|
|
495
|
+
),
|
|
496
|
+
teardown=lambda: None,
|
|
497
|
+
)
|
|
498
|
+
env.taskset.setup = AsyncMock()
|
|
499
|
+
env.upload_content = AsyncMock()
|
|
500
|
+
env.upload_file = AsyncMock()
|
|
501
|
+
|
|
502
|
+
with pytest.raises(
|
|
503
|
+
ValueError,
|
|
504
|
+
match="Upload directory names must be unique across task and harness",
|
|
505
|
+
):
|
|
506
|
+
await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
|
|
507
|
+
|
|
508
|
+
|
|
432
509
|
# ── discover_sibling_dir ─────────────────────────────────────────────────
|
|
433
510
|
|
|
434
511
|
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Tests for per-turn context token metrics.
|
|
2
|
+
|
|
3
|
+
Tests the trajectory-based context token computation
|
|
4
|
+
(final_input_tokens, final_output_tokens) which assumes a linear rollout
|
|
5
|
+
using the last trajectory step.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from unittest.mock import MagicMock
|
|
9
|
+
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
from verifiers.utils.usage_utils import compute_context_token_metrics
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# =========================================================================
|
|
16
|
+
# Helpers
|
|
17
|
+
# =========================================================================
|
|
18
|
+
|
|
19
|
+
SYS = {"role": "system", "content": "You are helpful"}
|
|
20
|
+
USER = {"role": "user", "content": "hi"}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _make_response(prompt_tokens: int, completion_tokens: int) -> MagicMock:
|
|
24
|
+
response = MagicMock()
|
|
25
|
+
response.usage = MagicMock(
|
|
26
|
+
prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
|
|
27
|
+
)
|
|
28
|
+
return response
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _asst(i: int) -> dict:
|
|
32
|
+
return {"role": "assistant", "content": f"response {i}"}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# =========================================================================
|
|
36
|
+
# compute_context_token_metrics
|
|
37
|
+
# =========================================================================
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class TestContextMetrics:
|
|
41
|
+
def test_empty_trajectory(self):
|
|
42
|
+
metrics = compute_context_token_metrics([])
|
|
43
|
+
assert metrics["final_output_tokens"] == 0
|
|
44
|
+
assert metrics["final_input_tokens"] == 0
|
|
45
|
+
|
|
46
|
+
def test_single_turn(self):
|
|
47
|
+
trajectory = [
|
|
48
|
+
{
|
|
49
|
+
"prompt": [SYS, USER],
|
|
50
|
+
"completion": [_asst(0)],
|
|
51
|
+
"response": _make_response(100, 20),
|
|
52
|
+
},
|
|
53
|
+
]
|
|
54
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
55
|
+
assert metrics["final_output_tokens"] == 20
|
|
56
|
+
assert metrics["final_input_tokens"] == 100
|
|
57
|
+
|
|
58
|
+
def test_multi_turn(self):
|
|
59
|
+
trajectory = [
|
|
60
|
+
{
|
|
61
|
+
"response": _make_response(100, 20),
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
"response": _make_response(150, 25),
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"response": _make_response(200, 30),
|
|
68
|
+
},
|
|
69
|
+
]
|
|
70
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
71
|
+
# Last step total = 200 + 30 = 230
|
|
72
|
+
# Sum of completion tokens = 20 + 25 + 30 = 75
|
|
73
|
+
assert metrics["final_output_tokens"] == 75
|
|
74
|
+
assert metrics["final_input_tokens"] == 230 - 75
|
|
75
|
+
|
|
76
|
+
def test_invariant_total_equals_last_step(self):
|
|
77
|
+
trajectory = [
|
|
78
|
+
{"response": _make_response(100, 20)},
|
|
79
|
+
{"response": _make_response(150, 25)},
|
|
80
|
+
{"response": _make_response(200, 30)},
|
|
81
|
+
]
|
|
82
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
83
|
+
total = metrics["final_output_tokens"] + metrics["final_input_tokens"]
|
|
84
|
+
# Total should equal last step's prompt_tokens + completion_tokens
|
|
85
|
+
assert total == 200 + 30
|
|
86
|
+
|
|
87
|
+
def test_no_response_on_any_step(self):
|
|
88
|
+
trajectory = [{"response": None}]
|
|
89
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
90
|
+
assert metrics["final_output_tokens"] == 0
|
|
91
|
+
assert metrics["final_input_tokens"] == 0
|
|
92
|
+
|
|
93
|
+
def test_last_step_used_not_largest(self):
|
|
94
|
+
"""Even if an earlier step has a larger context, we use the last step."""
|
|
95
|
+
trajectory = [
|
|
96
|
+
{"response": _make_response(500, 100)}, # larger context
|
|
97
|
+
{"response": _make_response(100, 20)}, # last step, smaller
|
|
98
|
+
]
|
|
99
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
100
|
+
# Last step total = 120, sum completions = 100 + 20 = 120
|
|
101
|
+
assert metrics["final_output_tokens"] == 120
|
|
102
|
+
assert metrics["final_input_tokens"] == 0 # clamped to 0
|
|
103
|
+
|
|
104
|
+
def test_skips_none_responses_for_last_step(self):
|
|
105
|
+
"""Last step with response=None is skipped; uses previous step."""
|
|
106
|
+
trajectory = [
|
|
107
|
+
{"response": _make_response(100, 20)},
|
|
108
|
+
{"response": _make_response(200, 30)},
|
|
109
|
+
{"response": None},
|
|
110
|
+
]
|
|
111
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
112
|
+
# Last step with response is step 1: total = 230
|
|
113
|
+
# Sum completions from all steps with responses: 20 + 30 = 50
|
|
114
|
+
assert metrics["final_output_tokens"] == 50
|
|
115
|
+
assert metrics["final_input_tokens"] == 230 - 50
|
|
116
|
+
|
|
117
|
+
def test_skips_responses_without_usage(self):
|
|
118
|
+
"""Responses with no .usage attribute are skipped entirely."""
|
|
119
|
+
no_usage = MagicMock()
|
|
120
|
+
no_usage.usage = None
|
|
121
|
+
trajectory = [
|
|
122
|
+
{"response": _make_response(100, 20)},
|
|
123
|
+
{"response": _make_response(200, 30)},
|
|
124
|
+
{"response": no_usage}, # last step, but no usage
|
|
125
|
+
]
|
|
126
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
127
|
+
# Should use step 1 (last with usage): total = 230
|
|
128
|
+
assert metrics["final_output_tokens"] == 50
|
|
129
|
+
assert metrics["final_input_tokens"] == 230 - 50
|
|
130
|
+
|
|
131
|
+
def test_all_responses_lack_usage(self):
|
|
132
|
+
"""If no response has usage data, return zeros."""
|
|
133
|
+
no_usage = MagicMock()
|
|
134
|
+
no_usage.usage = None
|
|
135
|
+
trajectory = [
|
|
136
|
+
{"response": no_usage},
|
|
137
|
+
{"response": no_usage},
|
|
138
|
+
]
|
|
139
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
140
|
+
assert metrics["final_output_tokens"] == 0
|
|
141
|
+
assert metrics["final_input_tokens"] == 0
|
|
142
|
+
|
|
143
|
+
def test_final_input_tokens_clamped_to_zero(self):
|
|
144
|
+
"""If sum of completions exceeds last step total, input is clamped to 0."""
|
|
145
|
+
trajectory = [
|
|
146
|
+
{"response": _make_response(10, 500)}, # huge completion
|
|
147
|
+
{"response": _make_response(50, 10)},
|
|
148
|
+
]
|
|
149
|
+
metrics = compute_context_token_metrics(trajectory)
|
|
150
|
+
# Last step total = 60, sum completions = 510
|
|
151
|
+
assert metrics["final_output_tokens"] == 510
|
|
152
|
+
assert metrics["final_input_tokens"] == 0
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# =========================================================================
|
|
156
|
+
# Metric classes
|
|
157
|
+
# =========================================================================
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class TestContextTokenMetricClasses:
|
|
161
|
+
def test_input_tokens_metric(self):
|
|
162
|
+
from verifiers.utils.metric_utils import InputTokensMetric
|
|
163
|
+
|
|
164
|
+
m = InputTokensMetric()
|
|
165
|
+
m.add_output({"token_usage": {"input_tokens": 100.0}})
|
|
166
|
+
m.add_output({"token_usage": {"input_tokens": 200.0}})
|
|
167
|
+
assert m.compute() == pytest.approx(150.0)
|
|
168
|
+
|
|
169
|
+
def test_output_tokens_metric(self):
|
|
170
|
+
from verifiers.utils.metric_utils import OutputTokensMetric
|
|
171
|
+
|
|
172
|
+
m = OutputTokensMetric()
|
|
173
|
+
m.add_output({"token_usage": {"output_tokens": 40.0}})
|
|
174
|
+
m.add_output({"token_usage": {"output_tokens": 60.0}})
|
|
175
|
+
assert m.compute() == pytest.approx(50.0)
|
|
176
|
+
|
|
177
|
+
def test_final_input_tokens_metric(self):
|
|
178
|
+
from verifiers.utils.metric_utils import FinalInputTokensMetric
|
|
179
|
+
|
|
180
|
+
m = FinalInputTokensMetric()
|
|
181
|
+
m.add_output({"token_usage": {"final_input_tokens": 50.0}})
|
|
182
|
+
m.add_output({"token_usage": {"final_input_tokens": 100.0}})
|
|
183
|
+
assert m.compute() == pytest.approx(75.0)
|
|
184
|
+
|
|
185
|
+
def test_final_output_tokens_metric(self):
|
|
186
|
+
from verifiers.utils.metric_utils import FinalOutputTokensMetric
|
|
187
|
+
|
|
188
|
+
m = FinalOutputTokensMetric()
|
|
189
|
+
m.add_output({"token_usage": {"final_output_tokens": 150.0}})
|
|
190
|
+
m.add_output({"token_usage": {"final_output_tokens": 250.0}})
|
|
191
|
+
assert m.compute() == pytest.approx(200.0)
|
|
192
|
+
|
|
193
|
+
def test_skips_outputs_without_token_usage(self):
|
|
194
|
+
from verifiers.utils.metric_utils import FinalInputTokensMetric
|
|
195
|
+
|
|
196
|
+
m = FinalInputTokensMetric()
|
|
197
|
+
m.add_output({})
|
|
198
|
+
m.add_output({"token_usage": {}})
|
|
199
|
+
assert m.count == 0
|
|
200
|
+
assert m.compute() == 0.0
|
|
@@ -237,7 +237,9 @@ async def test_state_to_output_uses_state_usage_not_trajectory(
|
|
|
237
237
|
state["reward"] = 0.0
|
|
238
238
|
|
|
239
239
|
output = state_to_output(state, state_columns=[])
|
|
240
|
-
|
|
240
|
+
usage = output["token_usage"]
|
|
241
|
+
assert usage["input_tokens"] == 5.0
|
|
242
|
+
assert usage["output_tokens"] == 4.0
|
|
241
243
|
|
|
242
244
|
|
|
243
245
|
@pytest.mark.asyncio
|
|
@@ -6,6 +6,8 @@ fields and that the install script is generated correctly.
|
|
|
6
6
|
|
|
7
7
|
import importlib
|
|
8
8
|
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import subprocess
|
|
9
11
|
from types import SimpleNamespace
|
|
10
12
|
from unittest.mock import AsyncMock, call
|
|
11
13
|
|
|
@@ -18,9 +20,11 @@ from verifiers.envs.experimental.composable import (
|
|
|
18
20
|
SandboxSpec,
|
|
19
21
|
SandboxTaskSet,
|
|
20
22
|
)
|
|
23
|
+
from verifiers.envs.experimental.composable.harnesses import rlm as rlm_module
|
|
21
24
|
from verifiers.envs.experimental.composable.harnesses.rlm import (
|
|
22
25
|
build_install_script,
|
|
23
26
|
rlm_harness,
|
|
27
|
+
resolve_local_checkout,
|
|
24
28
|
)
|
|
25
29
|
|
|
26
30
|
|
|
@@ -86,28 +90,160 @@ def _make_temp_taskset_package(tmp_path, monkeypatch, *, with_skills: bool):
|
|
|
86
90
|
return mod
|
|
87
91
|
|
|
88
92
|
|
|
93
|
+
def _make_git_checkout(target: Path) -> Path:
|
|
94
|
+
checkout = target
|
|
95
|
+
checkout.mkdir()
|
|
96
|
+
(checkout / "install.sh").write_text("#!/usr/bin/env bash\n")
|
|
97
|
+
(checkout / "pyproject.toml").write_text("[project]\nname='rlm'\nversion='0.0.0'\n")
|
|
98
|
+
subprocess.run(["git", "init", "-b", "main"], cwd=checkout, check=True)
|
|
99
|
+
subprocess.run(
|
|
100
|
+
["git", "add", "install.sh", "pyproject.toml"], cwd=checkout, check=True
|
|
101
|
+
)
|
|
102
|
+
subprocess.run(
|
|
103
|
+
[
|
|
104
|
+
"git",
|
|
105
|
+
"-c",
|
|
106
|
+
"user.name=Codex",
|
|
107
|
+
"-c",
|
|
108
|
+
"user.email=codex@example.com",
|
|
109
|
+
"commit",
|
|
110
|
+
"-m",
|
|
111
|
+
"init",
|
|
112
|
+
],
|
|
113
|
+
cwd=checkout,
|
|
114
|
+
check=True,
|
|
115
|
+
)
|
|
116
|
+
return checkout
|
|
117
|
+
|
|
118
|
+
|
|
89
119
|
# ── RLM harness ──────────────────────────────────────────────────────────
|
|
90
120
|
|
|
91
121
|
|
|
92
|
-
def
|
|
122
|
+
def test_rlm_harness_install_script_requires_uploaded_checkout():
|
|
93
123
|
script = build_install_script()
|
|
94
|
-
assert
|
|
95
|
-
assert "
|
|
96
|
-
assert
|
|
124
|
+
assert 'test -f "$RLM_CHECKOUT_PATH/install.sh"' in script
|
|
125
|
+
assert "git clone" not in script
|
|
126
|
+
assert 'bash "$RLM_CHECKOUT_PATH/install.sh"' in script
|
|
97
127
|
|
|
98
128
|
|
|
99
|
-
def test_rlm_harness_sets_metrics_fields():
|
|
100
|
-
harness = rlm_harness()
|
|
129
|
+
def test_rlm_harness_sets_metrics_fields(tmp_path):
|
|
130
|
+
harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
|
|
101
131
|
assert harness.metrics_path == "{workdir}/.rlm/sessions/*/meta.json"
|
|
102
132
|
assert harness.metrics_key == "metrics"
|
|
103
133
|
assert harness.metrics_prefix == "rlm_"
|
|
104
134
|
|
|
105
135
|
|
|
106
|
-
def test_rlm_harness_sets_skills_path():
|
|
107
|
-
harness = rlm_harness()
|
|
136
|
+
def test_rlm_harness_sets_skills_path(tmp_path):
|
|
137
|
+
harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
|
|
108
138
|
assert harness.skills_path == "/task/rlm-skills"
|
|
109
139
|
|
|
110
140
|
|
|
141
|
+
def test_resolve_local_checkout_validates_explicit_path(tmp_path):
|
|
142
|
+
checkout = _make_git_checkout(tmp_path / "rlm")
|
|
143
|
+
|
|
144
|
+
resolved = resolve_local_checkout(checkout)
|
|
145
|
+
|
|
146
|
+
assert resolved == checkout.resolve()
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_rlm_harness_uploads_explicit_local_checkout(tmp_path):
|
|
150
|
+
checkout = _make_git_checkout(tmp_path / "rlm")
|
|
151
|
+
|
|
152
|
+
harness = rlm_harness(local_checkout=checkout)
|
|
153
|
+
|
|
154
|
+
assert harness.get_upload_dirs is not None
|
|
155
|
+
assert harness.get_upload_dirs() == {"rlm_checkout": checkout.resolve()}
|
|
156
|
+
assert harness.upload_dir_mapping == {"rlm_checkout": "/tmp/rlm-checkout"}
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_resolve_local_checkout_materializes_host_cache(tmp_path):
|
|
160
|
+
source_checkout = _make_git_checkout(tmp_path / "rlm-source")
|
|
161
|
+
checkout_dir = tmp_path / "checkout-root" / "rlm"
|
|
162
|
+
|
|
163
|
+
resolved = resolve_local_checkout(
|
|
164
|
+
local_checkout=checkout_dir,
|
|
165
|
+
rlm_repo_url=str(source_checkout),
|
|
166
|
+
rlm_branch="main",
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
assert resolved == checkout_dir.resolve()
|
|
170
|
+
assert (checkout_dir / ".git").is_dir()
|
|
171
|
+
assert (checkout_dir / "install.sh").is_file()
|
|
172
|
+
assert (checkout_dir / "pyproject.toml").is_file()
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def test_rlm_harness_uses_default_host_cache_when_local_checkout_unspecified(
|
|
176
|
+
tmp_path, monkeypatch
|
|
177
|
+
):
|
|
178
|
+
source_checkout = _make_git_checkout(tmp_path / "rlm-source")
|
|
179
|
+
monkeypatch.setattr(
|
|
180
|
+
rlm_module,
|
|
181
|
+
"DEFAULT_RLM_LOCAL_CHECKOUT_CACHE_ROOT",
|
|
182
|
+
tmp_path / "cache-root",
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
harness = rlm_harness(
|
|
186
|
+
rlm_repo_url=str(source_checkout),
|
|
187
|
+
rlm_branch="main",
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
assert harness.get_upload_dirs is not None
|
|
191
|
+
upload_checkout = harness.get_upload_dirs()["rlm_checkout"]
|
|
192
|
+
assert isinstance(upload_checkout, Path)
|
|
193
|
+
assert upload_checkout.is_dir()
|
|
194
|
+
assert upload_checkout.name.startswith("rlm-source-main-")
|
|
195
|
+
assert harness.upload_dir_mapping == {"rlm_checkout": "/tmp/rlm-checkout"}
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_rlm_harness_always_uploads_checkout(tmp_path, monkeypatch):
|
|
199
|
+
source_checkout = _make_git_checkout(tmp_path / "rlm-source")
|
|
200
|
+
monkeypatch.setattr(
|
|
201
|
+
rlm_module,
|
|
202
|
+
"DEFAULT_RLM_LOCAL_CHECKOUT_CACHE_ROOT",
|
|
203
|
+
tmp_path / "cache-root",
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
harness = rlm_harness(
|
|
207
|
+
rlm_repo_url=str(source_checkout),
|
|
208
|
+
rlm_branch="main",
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
assert harness.get_upload_dirs is not None
|
|
212
|
+
assert harness.upload_dir_mapping is not None
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def test_resolve_local_checkout_redacts_gh_token_on_clone_failure(
|
|
216
|
+
tmp_path, monkeypatch
|
|
217
|
+
):
|
|
218
|
+
failing_checkout = tmp_path / "checkout-root" / "rlm"
|
|
219
|
+
token = "super/secret token"
|
|
220
|
+
quoted_token = "super%2Fsecret%20token"
|
|
221
|
+
|
|
222
|
+
def _raise_clone_error(*args, **kwargs):
|
|
223
|
+
raise subprocess.CalledProcessError(
|
|
224
|
+
128,
|
|
225
|
+
args[0],
|
|
226
|
+
stderr=(
|
|
227
|
+
"fatal: could not read from "
|
|
228
|
+
f"https://{quoted_token}@github.com/PrimeIntellect-ai/rlm.git"
|
|
229
|
+
),
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
monkeypatch.setattr(rlm_module.subprocess, "run", _raise_clone_error)
|
|
233
|
+
|
|
234
|
+
with pytest.raises(RuntimeError) as exc_info:
|
|
235
|
+
resolve_local_checkout(
|
|
236
|
+
local_checkout=failing_checkout,
|
|
237
|
+
rlm_repo_url="github.com/PrimeIntellect-ai/rlm.git",
|
|
238
|
+
rlm_branch="main",
|
|
239
|
+
gh_token=token,
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
message = str(exc_info.value)
|
|
243
|
+
assert token not in message
|
|
244
|
+
assert "<redacted>" in message
|
|
245
|
+
|
|
246
|
+
|
|
111
247
|
# ── install_env ──────────────────────────────────────────────────────────
|
|
112
248
|
|
|
113
249
|
|
|
@@ -201,7 +337,7 @@ async def test_rlm_uploads_skills_before_install(tmp_path, monkeypatch):
|
|
|
201
337
|
|
|
202
338
|
|
|
203
339
|
@pytest.mark.asyncio
|
|
204
|
-
async def test_rlm_collects_logs_and_metrics():
|
|
340
|
+
async def test_rlm_collects_logs_and_metrics(tmp_path):
|
|
205
341
|
taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
|
|
206
342
|
metrics = {
|
|
207
343
|
"turns": 3,
|
|
@@ -209,7 +345,7 @@ async def test_rlm_collects_logs_and_metrics():
|
|
|
209
345
|
"prompt_tokens": 100,
|
|
210
346
|
"completion_tokens": 25,
|
|
211
347
|
}
|
|
212
|
-
harness = rlm_harness()
|
|
348
|
+
harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
|
|
213
349
|
env = ComposableEnv(
|
|
214
350
|
taskset=taskset,
|
|
215
351
|
harness=Harness(
|
|
@@ -483,14 +483,12 @@ class Environment(ABC):
|
|
|
483
483
|
usage = state.get("usage")
|
|
484
484
|
if isinstance(usage, Mapping):
|
|
485
485
|
try:
|
|
486
|
-
|
|
487
|
-
|
|
486
|
+
return {
|
|
487
|
+
"input_tokens": float(usage.get("input_tokens", 0.0)),
|
|
488
|
+
"output_tokens": float(usage.get("output_tokens", 0.0)),
|
|
489
|
+
}
|
|
488
490
|
except (TypeError, ValueError):
|
|
489
491
|
return None
|
|
490
|
-
return {
|
|
491
|
-
"input_tokens": input_tokens,
|
|
492
|
-
"output_tokens": output_tokens,
|
|
493
|
-
}
|
|
494
492
|
return None
|
|
495
493
|
|
|
496
494
|
async def get_model_response(
|
{verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/cli_agent_env.py
RENAMED
|
@@ -374,17 +374,20 @@ class CliAgentEnv(SandboxMixin, vf.MultiTurnEnv):
|
|
|
374
374
|
f"Agent completed successfully (exit_code={status.exit_code})"
|
|
375
375
|
)
|
|
376
376
|
else:
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
if len(state.get("trajectory", [])) == 0:
|
|
381
|
-
stderr_snippet = (status.stderr or "")[:500]
|
|
377
|
+
stderr_full = status.stderr or ""
|
|
378
|
+
num_turns = len(state.get("trajectory", []))
|
|
379
|
+
if num_turns == 0:
|
|
382
380
|
error = AgentError(
|
|
383
381
|
f"Agent crashed before any LLM call "
|
|
384
|
-
f"(exit_code={status.exit_code}): {
|
|
382
|
+
f"(exit_code={status.exit_code}): {stderr_full}"
|
|
383
|
+
)
|
|
384
|
+
else:
|
|
385
|
+
error = AgentError(
|
|
386
|
+
f"Agent crashed after {num_turns} turn(s) "
|
|
387
|
+
f"(exit_code={status.exit_code}): {stderr_full}"
|
|
385
388
|
)
|
|
386
|
-
|
|
387
|
-
|
|
389
|
+
state["error"] = error
|
|
390
|
+
self.logger.error(str(error))
|
|
388
391
|
return
|
|
389
392
|
await asyncio.sleep(self.poll_interval)
|
|
390
393
|
|
|
@@ -50,6 +50,7 @@ import verifiers as vf
|
|
|
50
50
|
from verifiers.envs.experimental.cli_agent_env import CliAgentEnv
|
|
51
51
|
from verifiers.envs.experimental.composable.harness import Harness
|
|
52
52
|
from verifiers.envs.experimental.composable.task import TaskSet
|
|
53
|
+
from verifiers.envs.tool_env import ToolMonitorRubric
|
|
53
54
|
from verifiers.types import State
|
|
54
55
|
|
|
55
56
|
logger = logging.getLogger(__name__)
|
|
@@ -86,6 +87,9 @@ class ComposableEnv(CliAgentEnv):
|
|
|
86
87
|
self.harness = harness
|
|
87
88
|
self.install_env = dict(install_env) if install_env else None
|
|
88
89
|
|
|
90
|
+
if harness.tool_names:
|
|
91
|
+
self.add_rubric(ToolMonitorRubric(tool_names=list(harness.tool_names)))
|
|
92
|
+
|
|
89
93
|
# -- CliAgentEnv hooks --------------------------------------------------
|
|
90
94
|
|
|
91
95
|
def _get_spec(self, state: State) -> Any:
|
|
@@ -211,11 +215,11 @@ class ComposableEnv(CliAgentEnv):
|
|
|
211
215
|
async def _after_harness_inputs_uploaded(self, state: State) -> None:
|
|
212
216
|
"""Upload task-declared directories to harness-declared sandbox paths.
|
|
213
217
|
|
|
214
|
-
Joins
|
|
215
|
-
|
|
218
|
+
Joins task-declared and harness-declared upload directories with
|
|
219
|
+
``Harness.upload_dir_mapping`` (logical name → sandbox path).
|
|
216
220
|
Only directories whose logical name appears in both are uploaded.
|
|
217
221
|
"""
|
|
218
|
-
upload_dirs = self.
|
|
222
|
+
upload_dirs = self._get_upload_dirs()
|
|
219
223
|
mapping = self.harness.get_effective_upload_dir_mapping()
|
|
220
224
|
if not upload_dirs or not mapping:
|
|
221
225
|
return
|
|
@@ -225,6 +229,23 @@ class ComposableEnv(CliAgentEnv):
|
|
|
225
229
|
if remote_dest is not None:
|
|
226
230
|
await self._upload_dir(sandbox_id, local_source, remote_dest)
|
|
227
231
|
|
|
232
|
+
def _get_upload_dirs(self) -> dict[str, Traversable | Path]:
|
|
233
|
+
"""Merge task-owned and harness-owned upload directories."""
|
|
234
|
+
task_upload_dirs = dict(self.taskset.get_upload_dirs() or {})
|
|
235
|
+
harness_upload_dirs_value = (
|
|
236
|
+
self.harness.get_upload_dirs() if self.harness.get_upload_dirs else None
|
|
237
|
+
)
|
|
238
|
+
harness_upload_dirs = dict(harness_upload_dirs_value or {})
|
|
239
|
+
duplicate_names = sorted(set(task_upload_dirs) & set(harness_upload_dirs))
|
|
240
|
+
if duplicate_names:
|
|
241
|
+
names = ", ".join(repr(name) for name in duplicate_names)
|
|
242
|
+
raise ValueError(
|
|
243
|
+
"Upload directory names must be unique across task and harness; "
|
|
244
|
+
f"duplicates: {names}."
|
|
245
|
+
)
|
|
246
|
+
task_upload_dirs.update(harness_upload_dirs)
|
|
247
|
+
return task_upload_dirs
|
|
248
|
+
|
|
228
249
|
def _get_install_execute_kwargs(self) -> dict[str, Any]:
|
|
229
250
|
"""Keyword arguments passed to sandbox install command execution."""
|
|
230
251
|
kwargs: dict[str, Any] = {"timeout": self.harness.install_timeout}
|
{verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harness.py
RENAMED
|
@@ -17,7 +17,9 @@ connects them.
|
|
|
17
17
|
from __future__ import annotations
|
|
18
18
|
|
|
19
19
|
from dataclasses import dataclass
|
|
20
|
-
from
|
|
20
|
+
from importlib.abc import Traversable
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import TYPE_CHECKING, Callable
|
|
21
23
|
|
|
22
24
|
if TYPE_CHECKING:
|
|
23
25
|
from verifiers.envs.experimental.composable.task import SandboxSpec
|
|
@@ -58,6 +60,12 @@ class Harness:
|
|
|
58
60
|
``skills_path`` is merged into this mapping automatically.
|
|
59
61
|
Use for non-skills directories; for skills prefer
|
|
60
62
|
``skills_path``.
|
|
63
|
+
get_upload_dirs:
|
|
64
|
+
Optional callable returning harness-owned local directories to
|
|
65
|
+
upload into the sandbox before install. These are merged with
|
|
66
|
+
task-declared upload dirs by ``ComposableEnv`` and resolved via
|
|
67
|
+
the same ``upload_dir_mapping`` logical-name contract.
|
|
68
|
+
Example: ``lambda: {"agent_src": Path("/path/to/checkout")}``.
|
|
61
69
|
metrics_path:
|
|
62
70
|
Glob pattern for a JSON metrics file inside the sandbox,
|
|
63
71
|
collected after the rollout. May contain ``{workdir}`` which is
|
|
@@ -75,6 +83,12 @@ class Harness:
|
|
|
75
83
|
metrics_keys:
|
|
76
84
|
Optional whitelist of metric keys to surface. ``None`` means
|
|
77
85
|
surface all keys found.
|
|
86
|
+
tool_names:
|
|
87
|
+
Names of the tools the agent uses internally. When non-empty,
|
|
88
|
+
``ComposableEnv`` auto-registers a ``ToolMonitorRubric`` that
|
|
89
|
+
counts calls to each named tool (plus a total) from the
|
|
90
|
+
assistant messages the harness emits into the trajectory.
|
|
91
|
+
Example: ``["ipython", "summarize"]`` for the RLM harness.
|
|
78
92
|
"""
|
|
79
93
|
|
|
80
94
|
install_script: str | None = None
|
|
@@ -87,10 +101,12 @@ class Harness:
|
|
|
87
101
|
sandbox_spec: SandboxSpec | None = None
|
|
88
102
|
skills_path: str | None = None
|
|
89
103
|
upload_dir_mapping: dict[str, str] | None = None
|
|
104
|
+
get_upload_dirs: Callable[[], dict[str, Traversable | Path] | None] | None = None
|
|
90
105
|
metrics_path: str | None = None
|
|
91
106
|
metrics_prefix: str = ""
|
|
92
107
|
metrics_key: str | None = None
|
|
93
108
|
metrics_keys: list[str] | None = None
|
|
109
|
+
tool_names: list[str] | None = None
|
|
94
110
|
|
|
95
111
|
def get_effective_upload_dir_mapping(self) -> dict[str, str] | None:
|
|
96
112
|
"""Return the merged upload mapping (skills_path + upload_dir_mapping)."""
|