verifiers 0.1.13.dev7__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/.gitignore +1 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/PKG-INFO +77 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/README.md +73 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/pyproject.toml +26 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/conftest.py +8 -13
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_cli_agent_env.py +175 -14
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_composable_env.py +4 -4
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_decorator_ranks.py +43 -4
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_endpoint_registry.py +33 -65
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_env_group.py +51 -52
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_env_server.py +1 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_environment.py +86 -7
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_environment_extra.py +2 -4
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_envs.py +25 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_eval_cli.py +191 -16
- verifiers-0.1.14/tests/test_gepa_cli.py +251 -0
- verifiers-0.1.14/tests/test_gepa_utils.py +155 -0
- verifiers-0.1.14/tests/test_lean_task.py +344 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_math_rubric.py +89 -21
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_multiturn_env.py +81 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_nemorl_client.py +146 -35
- verifiers-0.1.14/tests/test_openai_responses_client.py +338 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_opencode_rlm_env.py +7 -9
- verifiers-0.1.14/tests/test_per_turn_timing.py +68 -0
- verifiers-0.1.14/tests/test_renderer_client.py +600 -0
- verifiers-0.1.14/tests/test_renderer_e2e.py +417 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rlm_composable_env.py +333 -40
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rlm_env.py +71 -71
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rubric.py +9 -59
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_rubric_group.py +72 -43
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_save_utils.py +11 -11
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_setup_script.py +2 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_singleturn_env.py +7 -35
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_stateful_tool_env.py +3 -5
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_tool_env.py +4 -8
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_tool_utils.py +31 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_trajectory_processing.py +0 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_tui_info_formatting.py +9 -1
- verifiers-0.1.14/tests/test_types.py +11 -0
- verifiers-0.1.14/tests/test_v1_bfcl.py +55 -0
- verifiers-0.1.14/tests/test_v1_config_extension.py +1599 -0
- verifiers-0.1.14/tests/test_v1_endpoint_protocols.py +222 -0
- verifiers-0.1.14/tests/test_v1_group_reward_env.py +39 -0
- verifiers-0.1.14/tests/test_v1_harbor_cli.py +178 -0
- verifiers-0.1.14/tests/test_v1_mini_swe_agent.py +63 -0
- verifiers-0.1.14/tests/test_v1_rlm_swe.py +70 -0
- verifiers-0.1.14/tests/test_v1_runtime_lifecycle.py +1731 -0
- verifiers-0.1.14/tests/test_v1_scoring_functions.py +152 -0
- verifiers-0.1.14/tests/test_wordle_env.py +22 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/__init__.py +88 -8
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/__init__.py +29 -0
- verifiers-0.1.14/verifiers/clients/nemorl_chat_completions_client.py +117 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/openai_chat_completions_client.py +2 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/openai_chat_completions_token_client.py +17 -2
- verifiers-0.1.14/verifiers/clients/openai_responses_client.py +443 -0
- verifiers-0.1.14/verifiers/clients/renderer_client.py +603 -0
- verifiers-0.1.14/verifiers/decorators.py +296 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/AGENTS.md +2 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/env_group.py +192 -62
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/environment.py +113 -76
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/cli_agent_env.py +33 -38
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/README.md +1 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/composable_env.py +101 -25
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harness.py +32 -14
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/opencode.py +20 -4
- verifiers-0.1.14/verifiers/envs/experimental/composable/harnesses/rlm.py +281 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/task.py +44 -22
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -1
- verifiers-0.1.14/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +13 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +138 -27
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +3 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +3 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +11 -8
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +2 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +2 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +2 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/harbor_env/env.py +4 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/mcp_env.py +9 -12
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/opencode_env.py +2 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/opencode_rlm_env.py +2 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/rlm_env.py +3 -5
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/sandbox_mixin.py +51 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/utils/git_checkout_cache.py +45 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/browser_env.py +3 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/base.py +2 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +1 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +1 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/openenv_env.py +2 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/reasoninggym_env.py +1 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/textarena_env.py +7 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/multiturn_env.py +54 -12
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/python_env.py +2 -3
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/sandbox_env.py +2 -2
- verifiers-0.1.14/verifiers/gepa/gepa_utils.py +322 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/math_rubric.py +2 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/rubric.py +137 -34
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/rubric_group.py +23 -1
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/eval.py +21 -7
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/gepa.py +274 -37
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/setup.py +14 -11
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/tui.py +34 -10
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/types.py +176 -17
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/async_utils.py +18 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/display_utils.py +90 -3
- verifiers-0.1.14/verifiers/utils/env_config_utils.py +45 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/env_utils.py +53 -2
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/error_utils.py +33 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/eval_display.py +61 -50
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/eval_utils.py +185 -109
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/interception_utils.py +344 -7
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/logging_utils.py +18 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/response_utils.py +2 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/save_utils.py +33 -13
- verifiers-0.1.14/verifiers/v1/README.md +1587 -0
- verifiers-0.1.14/verifiers/v1/RE_MIGRATION.md +804 -0
- verifiers-0.1.14/verifiers/v1/__init__.py +85 -0
- verifiers-0.1.14/verifiers/v1/config.py +455 -0
- verifiers-0.1.14/verifiers/v1/env.py +136 -0
- verifiers-0.1.14/verifiers/v1/harness.py +598 -0
- verifiers-0.1.14/verifiers/v1/packages/__init__.py +1 -0
- verifiers-0.1.14/verifiers/v1/packages/harnesses/__init__.py +7 -0
- verifiers-0.1.14/verifiers/v1/packages/harnesses/cli.py +121 -0
- verifiers-0.1.14/verifiers/v1/packages/harnesses/mini_swe_agent.py +247 -0
- verifiers-0.1.14/verifiers/v1/packages/harnesses/opencode.py +273 -0
- verifiers-0.1.14/verifiers/v1/packages/harnesses/pi.py +212 -0
- verifiers-0.1.14/verifiers/v1/packages/harnesses/rlm.py +264 -0
- verifiers-0.1.14/verifiers/v1/packages/tasksets/__init__.py +3 -0
- verifiers-0.1.14/verifiers/v1/packages/tasksets/harbor.py +405 -0
- verifiers-0.1.14/verifiers/v1/runtime.py +1931 -0
- verifiers-0.1.14/verifiers/v1/state.py +401 -0
- verifiers-0.1.14/verifiers/v1/task.py +177 -0
- verifiers-0.1.14/verifiers/v1/taskset.py +269 -0
- verifiers-0.1.14/verifiers/v1/toolset.py +352 -0
- verifiers-0.1.14/verifiers/v1/user.py +85 -0
- verifiers-0.1.14/verifiers/v1/utils/__init__.py +1 -0
- verifiers-0.1.14/verifiers/v1/utils/artifact_utils.py +31 -0
- verifiers-0.1.14/verifiers/v1/utils/endpoint_utils.py +669 -0
- verifiers-0.1.14/verifiers/v1/utils/json_utils.py +11 -0
- verifiers-0.1.14/verifiers/v1/utils/judge_utils.py +63 -0
- verifiers-0.1.14/verifiers/v1/utils/lifecycle_utils.py +96 -0
- verifiers-0.1.14/verifiers/v1/utils/mcp_proxy_utils.py +233 -0
- verifiers-0.1.14/verifiers/v1/utils/mcp_utils.py +148 -0
- verifiers-0.1.14/verifiers/v1/utils/program_utils.py +483 -0
- verifiers-0.1.14/verifiers/v1/utils/prompt_utils.py +136 -0
- verifiers-0.1.14/verifiers/v1/utils/sandbox_program_utils.py +650 -0
- verifiers-0.1.14/verifiers/v1/utils/sandbox_utils.py +753 -0
- verifiers-0.1.14/verifiers/v1/utils/scoring_utils.py +379 -0
- verifiers-0.1.14/verifiers/v1/utils/timing_utils.py +36 -0
- verifiers-0.1.14/verifiers/v1/utils/tool_utils.py +19 -0
- verifiers-0.1.14/verifiers/v1/utils/trajectory_utils.py +78 -0
- verifiers-0.1.13.dev7/tests/test_gepa_cli.py +0 -115
- verifiers-0.1.13.dev7/verifiers/clients/nemorl_chat_completions_client.py +0 -87
- verifiers-0.1.13.dev7/verifiers/decorators.py +0 -147
- verifiers-0.1.13.dev7/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -211
- verifiers-0.1.13.dev7/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -3
- verifiers-0.1.13.dev7/verifiers/gepa/gepa_utils.py +0 -116
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/LICENSE +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/AGENTS.md +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/README.md +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_build_script.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_client_config.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_imports.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_logging.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/errors.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.13.dev7 → verifiers-0.1.14}/verifiers/utils/version_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.13.dev7
|
|
3
|
+
Version: 0.1.14
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -26,6 +26,7 @@ Requires-Dist: aiolimiter>=1.2.1
|
|
|
26
26
|
Requires-Dist: anthropic>=0.78.0
|
|
27
27
|
Requires-Dist: datasets<4.7.0,>=3.0.0
|
|
28
28
|
Requires-Dist: gepa
|
|
29
|
+
Requires-Dist: httpx>=0.27.0
|
|
29
30
|
Requires-Dist: jinja2>=3.1.6
|
|
30
31
|
Requires-Dist: math-verify>=0.8.0
|
|
31
32
|
Requires-Dist: mcp>=1.14.1
|
|
@@ -53,6 +54,8 @@ Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
|
|
|
53
54
|
Requires-Dist: stagehand>=3.0.0; extra == 'browser'
|
|
54
55
|
Provides-Extra: openenv
|
|
55
56
|
Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
|
|
57
|
+
Provides-Extra: renderers
|
|
58
|
+
Requires-Dist: renderers>=0.1.6; extra == 'renderers'
|
|
56
59
|
Provides-Extra: rg
|
|
57
60
|
Requires-Dist: reasoning-gym; extra == 'rg'
|
|
58
61
|
Provides-Extra: rl
|
|
@@ -197,11 +200,81 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
|
|
|
197
200
|
async def correct_answer(completion, answer) -> float:
|
|
198
201
|
completion_ans = completion[-1]['content']
|
|
199
202
|
return 1.0 if completion_ans == answer else 0.0
|
|
200
|
-
rubric = Rubric(funcs=[correct_answer])
|
|
203
|
+
rubric = vf.Rubric(funcs=[correct_answer])
|
|
201
204
|
env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
|
|
202
205
|
return env
|
|
203
206
|
```
|
|
204
207
|
|
|
208
|
+
For composable environments with reusable tasksets, toolsets, custom programs,
|
|
209
|
+
or custom harnesses, use the v1 BYO Harness path:
|
|
210
|
+
```python
|
|
211
|
+
# my_env.py
|
|
212
|
+
import verifiers.v1 as vf
|
|
213
|
+
|
|
214
|
+
def source():
|
|
215
|
+
yield {
|
|
216
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
217
|
+
"answer": "cba",
|
|
218
|
+
"max_turns": 1,
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
@vf.reward(weight=1.0)
|
|
222
|
+
async def contains_answer(task, state) -> float:
|
|
223
|
+
return float(task["answer"] in str(state.get("completion") or ""))
|
|
224
|
+
|
|
225
|
+
def load_taskset(config: vf.TasksetConfig | None = None):
|
|
226
|
+
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
227
|
+
|
|
228
|
+
def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
|
|
229
|
+
config = config or vf.EnvConfig()
|
|
230
|
+
return vf.Env(taskset=load_taskset(config=config.taskset))
|
|
231
|
+
```
|
|
232
|
+
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
233
|
+
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
234
|
+
Reusable taskset and harness packages live under `verifiers.v1.packages` while
|
|
235
|
+
the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
|
|
236
|
+
For example, Harbor task directories can run through the bundled OpenCode CLI
|
|
237
|
+
harness with:
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
env = vf.Env(
|
|
241
|
+
taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
|
|
242
|
+
harness=vf.OpenCode(),
|
|
243
|
+
)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
The same environment package is the unit used by evals and `prime-rl`. The
|
|
247
|
+
trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
|
|
248
|
+
and harness options stay under `env.taskset` and `env.harness`:
|
|
249
|
+
|
|
250
|
+
```toml
|
|
251
|
+
# configs/rl/my-v1-env.toml
|
|
252
|
+
model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
|
|
253
|
+
max_steps = 100
|
|
254
|
+
batch_size = 256
|
|
255
|
+
rollouts_per_example = 8
|
|
256
|
+
|
|
257
|
+
[sampling]
|
|
258
|
+
max_tokens = 4096
|
|
259
|
+
|
|
260
|
+
[[env]]
|
|
261
|
+
id = "my-env"
|
|
262
|
+
|
|
263
|
+
[env.args]
|
|
264
|
+
arg1 = "non-th-arg"
|
|
265
|
+
|
|
266
|
+
[env.harness]
|
|
267
|
+
max_turns = 1
|
|
268
|
+
|
|
269
|
+
[env.taskset.scoring.contains_answer]
|
|
270
|
+
weight = 1.0
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
```bash
|
|
274
|
+
prime env install my-env
|
|
275
|
+
uv run prime-rl configs/rl/my-v1-env.toml
|
|
276
|
+
```
|
|
277
|
+
|
|
205
278
|
To install the environment module into your project, do:
|
|
206
279
|
```bash
|
|
207
280
|
prime env install my-env # installs from ./environments/my_env
|
|
@@ -237,6 +310,8 @@ prime eval run primeintellect/math-python
|
|
|
237
310
|
|
|
238
311
|
**[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
239
312
|
|
|
313
|
+
**[BYO Harness](docs/byo-harness.md)** — Build composable v1 taskset/harness environments with custom tools, sandboxes, users, and custom programs.
|
|
314
|
+
|
|
240
315
|
**[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
|
|
241
316
|
|
|
242
317
|
**[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
|
|
@@ -124,11 +124,81 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
|
|
|
124
124
|
async def correct_answer(completion, answer) -> float:
|
|
125
125
|
completion_ans = completion[-1]['content']
|
|
126
126
|
return 1.0 if completion_ans == answer else 0.0
|
|
127
|
-
rubric = Rubric(funcs=[correct_answer])
|
|
127
|
+
rubric = vf.Rubric(funcs=[correct_answer])
|
|
128
128
|
env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
|
|
129
129
|
return env
|
|
130
130
|
```
|
|
131
131
|
|
|
132
|
+
For composable environments with reusable tasksets, toolsets, custom programs,
|
|
133
|
+
or custom harnesses, use the v1 BYO Harness path:
|
|
134
|
+
```python
|
|
135
|
+
# my_env.py
|
|
136
|
+
import verifiers.v1 as vf
|
|
137
|
+
|
|
138
|
+
def source():
|
|
139
|
+
yield {
|
|
140
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
141
|
+
"answer": "cba",
|
|
142
|
+
"max_turns": 1,
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
@vf.reward(weight=1.0)
|
|
146
|
+
async def contains_answer(task, state) -> float:
|
|
147
|
+
return float(task["answer"] in str(state.get("completion") or ""))
|
|
148
|
+
|
|
149
|
+
def load_taskset(config: vf.TasksetConfig | None = None):
|
|
150
|
+
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
151
|
+
|
|
152
|
+
def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
|
|
153
|
+
config = config or vf.EnvConfig()
|
|
154
|
+
return vf.Env(taskset=load_taskset(config=config.taskset))
|
|
155
|
+
```
|
|
156
|
+
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
157
|
+
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
158
|
+
Reusable taskset and harness packages live under `verifiers.v1.packages` while
|
|
159
|
+
the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
|
|
160
|
+
For example, Harbor task directories can run through the bundled OpenCode CLI
|
|
161
|
+
harness with:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
env = vf.Env(
|
|
165
|
+
taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
|
|
166
|
+
harness=vf.OpenCode(),
|
|
167
|
+
)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
The same environment package is the unit used by evals and `prime-rl`. The
|
|
171
|
+
trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
|
|
172
|
+
and harness options stay under `env.taskset` and `env.harness`:
|
|
173
|
+
|
|
174
|
+
```toml
|
|
175
|
+
# configs/rl/my-v1-env.toml
|
|
176
|
+
model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
|
|
177
|
+
max_steps = 100
|
|
178
|
+
batch_size = 256
|
|
179
|
+
rollouts_per_example = 8
|
|
180
|
+
|
|
181
|
+
[sampling]
|
|
182
|
+
max_tokens = 4096
|
|
183
|
+
|
|
184
|
+
[[env]]
|
|
185
|
+
id = "my-env"
|
|
186
|
+
|
|
187
|
+
[env.args]
|
|
188
|
+
arg1 = "non-th-arg"
|
|
189
|
+
|
|
190
|
+
[env.harness]
|
|
191
|
+
max_turns = 1
|
|
192
|
+
|
|
193
|
+
[env.taskset.scoring.contains_answer]
|
|
194
|
+
weight = 1.0
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
prime env install my-env
|
|
199
|
+
uv run prime-rl configs/rl/my-v1-env.toml
|
|
200
|
+
```
|
|
201
|
+
|
|
132
202
|
To install the environment module into your project, do:
|
|
133
203
|
```bash
|
|
134
204
|
prime env install my-env # installs from ./environments/my_env
|
|
@@ -164,6 +234,8 @@ prime eval run primeintellect/math-python
|
|
|
164
234
|
|
|
165
235
|
**[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
166
236
|
|
|
237
|
+
**[BYO Harness](docs/byo-harness.md)** — Build composable v1 taskset/harness environments with custom tools, sandboxes, users, and custom programs.
|
|
238
|
+
|
|
167
239
|
**[Evaluation](docs/evaluation.md)** — Evaluate models using your environments.
|
|
168
240
|
|
|
169
241
|
**[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
|
|
@@ -52,7 +52,8 @@ dependencies = [
|
|
|
52
52
|
"msgpack>=1.1.2",
|
|
53
53
|
"aiolimiter>=1.2.1",
|
|
54
54
|
"setproctitle>=1.3.0",
|
|
55
|
-
"regex<2026.4.4",
|
|
55
|
+
"regex<2026.4.4",
|
|
56
|
+
"httpx>=0.27.0",
|
|
56
57
|
]
|
|
57
58
|
|
|
58
59
|
[dependency-groups]
|
|
@@ -73,6 +74,7 @@ dev = [
|
|
|
73
74
|
"aiohttp>=3.9.0",
|
|
74
75
|
"python-dotenv>=1.0.0",
|
|
75
76
|
"nltk",
|
|
77
|
+
"renderers>=0.1.6",
|
|
76
78
|
]
|
|
77
79
|
|
|
78
80
|
[project.optional-dependencies]
|
|
@@ -91,6 +93,9 @@ browser = [
|
|
|
91
93
|
"aiohttp>=3.9.0",
|
|
92
94
|
"python-dotenv>=1.0.0",
|
|
93
95
|
]
|
|
96
|
+
renderers = [
|
|
97
|
+
"renderers>=0.1.6",
|
|
98
|
+
]
|
|
94
99
|
rl = [
|
|
95
100
|
"torch>=2.8.0,<2.9.0",
|
|
96
101
|
"transformers>=4.56.2",
|
|
@@ -108,6 +113,24 @@ rl = [
|
|
|
108
113
|
preview = true
|
|
109
114
|
required-version = ">=0.11.1"
|
|
110
115
|
|
|
116
|
+
[[tool.uv.index]]
|
|
117
|
+
name = "pypi"
|
|
118
|
+
url = "https://pypi.org/simple"
|
|
119
|
+
default = true
|
|
120
|
+
exclude-newer = "7 days"
|
|
121
|
+
|
|
122
|
+
[tool.uv.exclude-newer-package]
|
|
123
|
+
# PrimeIntellect-published on PyPI (trusted publisher)
|
|
124
|
+
prime-tunnel = false
|
|
125
|
+
prime-sandboxes = false
|
|
126
|
+
renderers = false
|
|
127
|
+
|
|
128
|
+
[tool.uv.sources]
|
|
129
|
+
# Pinned to renderers main until the next PyPI release lands; drop after.
|
|
130
|
+
# fe67f9f = renderers main: PR #4 squash-merge — construction-time
|
|
131
|
+
# preserve_*_thinking flags on create_renderer / create_renderer_pool.
|
|
132
|
+
renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "fe67f9f" }
|
|
133
|
+
|
|
111
134
|
[tool.uv.extra-build-dependencies]
|
|
112
135
|
flash-attn = [{ requirement = "torch", match-runtime = true }]
|
|
113
136
|
|
|
@@ -170,6 +193,7 @@ addopts = [
|
|
|
170
193
|
markers = [
|
|
171
194
|
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
|
|
172
195
|
"integration: marks tests as integration tests",
|
|
196
|
+
"prime_sandbox: marks tests that provision real Prime sandbox or tunnel resources",
|
|
173
197
|
"unit: marks tests as unit tests",
|
|
174
198
|
"asyncio: marks tests as async tests",
|
|
175
199
|
"parsers: marks tests for parser components",
|
|
@@ -195,7 +219,7 @@ unknown-argument = "warn"
|
|
|
195
219
|
redundant-cast = "ignore"
|
|
196
220
|
|
|
197
221
|
[tool.ty.src]
|
|
198
|
-
exclude = ["environments"]
|
|
222
|
+
exclude = ["environments", "verifiers/v1/sketch.py"]
|
|
199
223
|
|
|
200
224
|
[[tool.ty.overrides]]
|
|
201
225
|
include = ["verifiers/envs/experimental/composable/tasksets/**"]
|
|
@@ -425,10 +425,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
|
|
|
425
425
|
super().__init__(tools=[offset_tool], **kwargs)
|
|
426
426
|
|
|
427
427
|
async def setup_state(self, state, **kwargs):
|
|
428
|
-
|
|
428
|
+
await super().setup_state(state, **kwargs)
|
|
429
429
|
state["offset"] = 3
|
|
430
430
|
state["update_calls"] = 0
|
|
431
|
-
return state
|
|
432
431
|
|
|
433
432
|
def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
|
|
434
433
|
state["update_calls"] += 1
|
|
@@ -458,13 +457,15 @@ def make_input() -> Callable[..., RolloutInput]:
|
|
|
458
457
|
|
|
459
458
|
def _make_input(
|
|
460
459
|
example_id: int = 0,
|
|
461
|
-
task: str = "default",
|
|
462
460
|
prompt: Messages = DEFAULT_PROMPT,
|
|
463
461
|
info: Info = {},
|
|
464
462
|
answer: str = "4",
|
|
465
463
|
) -> RolloutInput:
|
|
466
464
|
return RolloutInput(
|
|
467
|
-
example_id=example_id,
|
|
465
|
+
example_id=example_id,
|
|
466
|
+
prompt=prompt,
|
|
467
|
+
answer=answer,
|
|
468
|
+
info=info,
|
|
468
469
|
)
|
|
469
470
|
|
|
470
471
|
return _make_input
|
|
@@ -476,7 +477,6 @@ def make_state() -> Callable[..., State]:
|
|
|
476
477
|
|
|
477
478
|
def _make_state(
|
|
478
479
|
example_id: int = 0,
|
|
479
|
-
task: str = "default",
|
|
480
480
|
prompt: Messages = DEFAULT_PROMPT,
|
|
481
481
|
answer: str = "4",
|
|
482
482
|
info: Info = {},
|
|
@@ -488,17 +488,12 @@ def make_state() -> Callable[..., State]:
|
|
|
488
488
|
stop_condition: str | None = "max_turns_reached",
|
|
489
489
|
tool_defs: list[Tool] | None = None,
|
|
490
490
|
trajectory: list[TrajectoryStep] = [],
|
|
491
|
-
timing=RolloutTiming(
|
|
492
|
-
generation_ms=0.0,
|
|
493
|
-
scoring_ms=0.0,
|
|
494
|
-
total_ms=0.0,
|
|
495
|
-
),
|
|
491
|
+
timing=RolloutTiming(),
|
|
496
492
|
foo: str = "bar", # custom field
|
|
497
493
|
**kwargs,
|
|
498
494
|
) -> State:
|
|
499
495
|
return State(
|
|
500
496
|
example_id=example_id,
|
|
501
|
-
task=task,
|
|
502
497
|
prompt=prompt,
|
|
503
498
|
answer=answer,
|
|
504
499
|
info=info,
|
|
@@ -551,7 +546,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
|
|
|
551
546
|
rollouts_per_example: int = 1,
|
|
552
547
|
sampling_args: SamplingArgs = {},
|
|
553
548
|
date: str = "1970-01-01",
|
|
554
|
-
|
|
549
|
+
time: float = 0.0,
|
|
555
550
|
avg_reward: float = 0.0,
|
|
556
551
|
avg_metrics: dict[str, float] = {},
|
|
557
552
|
pass_at_k: dict[str, float] = {},
|
|
@@ -579,7 +574,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
|
|
|
579
574
|
rollouts_per_example=rollouts_per_example,
|
|
580
575
|
sampling_args=sampling_args,
|
|
581
576
|
date=date,
|
|
582
|
-
|
|
577
|
+
time=time,
|
|
583
578
|
avg_reward=avg_reward,
|
|
584
579
|
avg_metrics=avg_metrics,
|
|
585
580
|
pass_at_k=pass_at_k,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Tests for CliAgentEnv and HarborEnv."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import tempfile
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
@@ -8,6 +9,7 @@ import pytest
|
|
|
8
9
|
from datasets import Dataset
|
|
9
10
|
|
|
10
11
|
import verifiers as vf
|
|
12
|
+
from verifiers.utils.interception_utils import serialize_intercept_response
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
@pytest.fixture
|
|
@@ -62,7 +64,8 @@ class TestCliAgentEnv:
|
|
|
62
64
|
assert env.run_command == "python agent.py"
|
|
63
65
|
assert env.docker_image == "python:3.11-slim"
|
|
64
66
|
assert env.interception_port == 8765
|
|
65
|
-
assert env.timeout_seconds
|
|
67
|
+
assert env.timeout_seconds is None
|
|
68
|
+
assert env.sandbox_timeout_minutes is None
|
|
66
69
|
|
|
67
70
|
def test_init_custom_config(self, sample_dataset):
|
|
68
71
|
"""Test initialization with custom configuration."""
|
|
@@ -130,22 +133,34 @@ class TestCliAgentEnv:
|
|
|
130
133
|
state = {"agent_completed": True}
|
|
131
134
|
assert await env.agent_completed(state) is True
|
|
132
135
|
|
|
133
|
-
@pytest.mark.
|
|
134
|
-
|
|
135
|
-
|
|
136
|
+
@pytest.mark.parametrize(
|
|
137
|
+
"timeout_seconds,expected_minutes",
|
|
138
|
+
[
|
|
139
|
+
(None, 24 * 60), # no rollout cap → SDK ceiling
|
|
140
|
+
(600.0, 10 + 60), # finite → ceil + scoring buffer
|
|
141
|
+
(24 * 3600.0, 24 * 60), # buffer would overflow → clamped to ceiling
|
|
142
|
+
],
|
|
143
|
+
)
|
|
144
|
+
def test_sandbox_timeout_auto_derived(
|
|
145
|
+
self, sample_dataset, timeout_seconds, expected_minutes
|
|
146
|
+
):
|
|
136
147
|
env = vf.CliAgentEnv(
|
|
137
148
|
run_command="python agent.py",
|
|
138
149
|
dataset=sample_dataset,
|
|
139
150
|
rubric=vf.Rubric(),
|
|
140
|
-
timeout_seconds=
|
|
151
|
+
timeout_seconds=timeout_seconds,
|
|
141
152
|
)
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
state = {"timing": {"start_time": time.time()}}
|
|
145
|
-
assert await env.timeout_reached(state) is False
|
|
153
|
+
assert env.get_sandbox_resources({})["timeout_minutes"] == expected_minutes
|
|
146
154
|
|
|
147
|
-
|
|
148
|
-
|
|
155
|
+
def test_sandbox_timeout_explicit_override(self, sample_dataset):
|
|
156
|
+
env = vf.CliAgentEnv(
|
|
157
|
+
run_command="python agent.py",
|
|
158
|
+
dataset=sample_dataset,
|
|
159
|
+
rubric=vf.Rubric(),
|
|
160
|
+
timeout_seconds=600.0,
|
|
161
|
+
sandbox_timeout_minutes=30,
|
|
162
|
+
)
|
|
163
|
+
assert env.get_sandbox_resources({})["timeout_minutes"] == 30
|
|
149
164
|
|
|
150
165
|
@pytest.mark.asyncio
|
|
151
166
|
async def test_env_response_returns_empty(self, sample_dataset):
|
|
@@ -204,6 +219,152 @@ class TestCliAgentEnv:
|
|
|
204
219
|
assert kwargs["tools"][0].name == "echo"
|
|
205
220
|
|
|
206
221
|
|
|
222
|
+
@pytest.mark.asyncio
|
|
223
|
+
async def test_cli_agent_env_delivers_intercepted_tool_call_response(
|
|
224
|
+
sample_dataset, mock_client
|
|
225
|
+
):
|
|
226
|
+
env = vf.CliAgentEnv(
|
|
227
|
+
run_command="python agent.py",
|
|
228
|
+
dataset=sample_dataset,
|
|
229
|
+
rubric=vf.Rubric(),
|
|
230
|
+
)
|
|
231
|
+
prompt = sample_dataset[0]["prompt"]
|
|
232
|
+
tool_call = {
|
|
233
|
+
"id": "call_echo",
|
|
234
|
+
"type": "function",
|
|
235
|
+
"function": {"name": "echo", "arguments": '{"text": "hello"}'},
|
|
236
|
+
}
|
|
237
|
+
mock_client.add_response(
|
|
238
|
+
prompt,
|
|
239
|
+
"",
|
|
240
|
+
finish_reason="tool_calls",
|
|
241
|
+
tool_calls=[tool_call],
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
state = await env.init_state(
|
|
245
|
+
input=sample_dataset[0],
|
|
246
|
+
client=mock_client,
|
|
247
|
+
model="test-model",
|
|
248
|
+
)
|
|
249
|
+
response_future = asyncio.Future()
|
|
250
|
+
request_id = "req-tool-call"
|
|
251
|
+
state["current_request_id"] = request_id
|
|
252
|
+
env._interception_server.intercepts[request_id] = {
|
|
253
|
+
"stream": False,
|
|
254
|
+
"tools": [
|
|
255
|
+
{
|
|
256
|
+
"type": "function",
|
|
257
|
+
"function": {
|
|
258
|
+
"name": "echo",
|
|
259
|
+
"description": "Return the provided text.",
|
|
260
|
+
"parameters": {
|
|
261
|
+
"type": "object",
|
|
262
|
+
"properties": {"text": {"type": "string"}},
|
|
263
|
+
},
|
|
264
|
+
},
|
|
265
|
+
}
|
|
266
|
+
],
|
|
267
|
+
"response_future": response_future,
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
response = await env.get_model_response(
|
|
271
|
+
state=state,
|
|
272
|
+
prompt=prompt,
|
|
273
|
+
client=mock_client,
|
|
274
|
+
model="test-model",
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
assert response_future.done()
|
|
278
|
+
assert response_future.result() is response
|
|
279
|
+
assert state["current_request_id"] is None
|
|
280
|
+
|
|
281
|
+
payload = serialize_intercept_response(response_future.result())
|
|
282
|
+
choice = payload["choices"][0]
|
|
283
|
+
assert choice["finish_reason"] == "tool_calls"
|
|
284
|
+
assert choice["message"]["tool_calls"] == [tool_call]
|
|
285
|
+
assert mock_client.last_call_kwargs["tools"][0].name == "echo"
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@pytest.mark.asyncio
|
|
289
|
+
async def test_cli_agent_env_synthesizes_stream_for_intercepted_tool_call_response(
|
|
290
|
+
sample_dataset, mock_client
|
|
291
|
+
):
|
|
292
|
+
env = vf.CliAgentEnv(
|
|
293
|
+
run_command="python agent.py",
|
|
294
|
+
dataset=sample_dataset,
|
|
295
|
+
rubric=vf.Rubric(),
|
|
296
|
+
)
|
|
297
|
+
prompt = sample_dataset[0]["prompt"]
|
|
298
|
+
tool_call = {
|
|
299
|
+
"id": "call_echo",
|
|
300
|
+
"type": "function",
|
|
301
|
+
"function": {"name": "echo", "arguments": '{"text": "hello"}'},
|
|
302
|
+
}
|
|
303
|
+
mock_client.add_response(
|
|
304
|
+
prompt,
|
|
305
|
+
"",
|
|
306
|
+
finish_reason="tool_calls",
|
|
307
|
+
tool_calls=[tool_call],
|
|
308
|
+
)
|
|
309
|
+
|
|
310
|
+
state = await env.init_state(
|
|
311
|
+
input=sample_dataset[0],
|
|
312
|
+
client=mock_client,
|
|
313
|
+
model="test-model",
|
|
314
|
+
)
|
|
315
|
+
chunk_queue = asyncio.Queue()
|
|
316
|
+
response_future = asyncio.Future()
|
|
317
|
+
request_id = "req-stream-tool-call"
|
|
318
|
+
state["current_request_id"] = request_id
|
|
319
|
+
env._interception_server.intercepts[request_id] = {
|
|
320
|
+
"stream": True,
|
|
321
|
+
"tools": [
|
|
322
|
+
{
|
|
323
|
+
"type": "function",
|
|
324
|
+
"function": {
|
|
325
|
+
"name": "echo",
|
|
326
|
+
"description": "Return the provided text.",
|
|
327
|
+
"parameters": {
|
|
328
|
+
"type": "object",
|
|
329
|
+
"properties": {"text": {"type": "string"}},
|
|
330
|
+
},
|
|
331
|
+
},
|
|
332
|
+
}
|
|
333
|
+
],
|
|
334
|
+
"chunk_queue": chunk_queue,
|
|
335
|
+
"response_future": response_future,
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
response = await env.get_model_response(
|
|
339
|
+
state=state,
|
|
340
|
+
prompt=prompt,
|
|
341
|
+
client=mock_client,
|
|
342
|
+
model="test-model",
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
chunks = []
|
|
346
|
+
while True:
|
|
347
|
+
chunk = await asyncio.wait_for(chunk_queue.get(), timeout=1.0)
|
|
348
|
+
if chunk is None:
|
|
349
|
+
break
|
|
350
|
+
chunks.append(chunk)
|
|
351
|
+
|
|
352
|
+
assert response_future.done()
|
|
353
|
+
assert response_future.result() is response
|
|
354
|
+
assert state["current_request_id"] is None
|
|
355
|
+
|
|
356
|
+
assert chunks[0]["object"] == "chat.completion.chunk"
|
|
357
|
+
assert chunks[0]["choices"][0]["delta"]["tool_calls"][0]["id"] == "call_echo"
|
|
358
|
+
assert (
|
|
359
|
+
chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["name"] == "echo"
|
|
360
|
+
)
|
|
361
|
+
assert (
|
|
362
|
+
chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["arguments"]
|
|
363
|
+
== '{"text": "hello"}'
|
|
364
|
+
)
|
|
365
|
+
assert chunks[-1]["choices"][0]["finish_reason"] == "tool_calls"
|
|
366
|
+
|
|
367
|
+
|
|
207
368
|
class TestHarborEnv:
|
|
208
369
|
"""Tests for HarborEnv."""
|
|
209
370
|
|
|
@@ -231,7 +392,7 @@ class TestHarborEnv:
|
|
|
231
392
|
dataset_path=harbor_task_dir,
|
|
232
393
|
)
|
|
233
394
|
assert len(env.dataset) == 1
|
|
234
|
-
assert env.dataset[0]["
|
|
395
|
+
assert env.dataset[0]["info"]["task_name"] == "test_task"
|
|
235
396
|
|
|
236
397
|
def test_init_filters_tasks(self, harbor_task_dir):
|
|
237
398
|
"""Test that HarborEnv can filter tasks by name."""
|
|
@@ -247,7 +408,7 @@ class TestHarborEnv:
|
|
|
247
408
|
tasks=["test_task"],
|
|
248
409
|
)
|
|
249
410
|
assert len(env.dataset) == 1
|
|
250
|
-
assert env.dataset[0]["
|
|
411
|
+
assert env.dataset[0]["info"]["task_name"] == "test_task"
|
|
251
412
|
|
|
252
413
|
def test_init_raises_on_empty_dataset(self):
|
|
253
414
|
"""Test that HarborEnv raises when no valid tasks found."""
|
|
@@ -301,7 +462,7 @@ class TestHarborEnv:
|
|
|
301
462
|
)
|
|
302
463
|
state = {
|
|
303
464
|
"interception_base_url": "https://test.trycloudflare.com/v1",
|
|
304
|
-
"
|
|
465
|
+
"info": {"task_name": "my_task"},
|
|
305
466
|
}
|
|
306
467
|
env_vars = await env.build_env_vars(state)
|
|
307
468
|
|
|
@@ -251,7 +251,7 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
|
|
|
251
251
|
teardown=lambda: None,
|
|
252
252
|
)
|
|
253
253
|
|
|
254
|
-
state = {"sandbox_id": "sbx", "timing": {"
|
|
254
|
+
state = {"sandbox_id": "sbx", "timing": {"total": 0}}
|
|
255
255
|
|
|
256
256
|
await env.post_rollout(state)
|
|
257
257
|
|
|
@@ -594,7 +594,7 @@ async def test_composable_env_collects_harness_metrics():
|
|
|
594
594
|
state = {
|
|
595
595
|
"sandbox_id": "sbx",
|
|
596
596
|
"info": {"id": 0},
|
|
597
|
-
"timing": {"
|
|
597
|
+
"timing": {"total": 0},
|
|
598
598
|
"trajectory": [],
|
|
599
599
|
}
|
|
600
600
|
|
|
@@ -633,7 +633,7 @@ async def test_composable_env_metrics_with_key_whitelist():
|
|
|
633
633
|
state = {
|
|
634
634
|
"sandbox_id": "sbx",
|
|
635
635
|
"info": {"id": 0},
|
|
636
|
-
"timing": {"
|
|
636
|
+
"timing": {"total": 0},
|
|
637
637
|
"trajectory": [],
|
|
638
638
|
}
|
|
639
639
|
|
|
@@ -659,7 +659,7 @@ async def test_composable_env_no_metrics_when_path_not_set():
|
|
|
659
659
|
state = {
|
|
660
660
|
"sandbox_id": "sbx",
|
|
661
661
|
"info": {"id": 0},
|
|
662
|
-
"timing": {"
|
|
662
|
+
"timing": {"total": 0},
|
|
663
663
|
"trajectory": [],
|
|
664
664
|
}
|
|
665
665
|
|