verifiers 0.1.10.dev5__tar.gz → 0.1.11.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/PKG-INFO +9 -8
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/README.md +7 -7
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/pyproject.toml +1 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/README.md +16 -20
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/conftest.py +149 -118
- verifiers-0.1.11.dev1/tests/test_build_script.py +29 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_cli_agent_env.py +43 -0
- verifiers-0.1.11.dev1/tests/test_client_auth_errors.py +189 -0
- verifiers-0.1.11.dev1/tests/test_client_multimodal_types.py +239 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_decorator_ranks.py +29 -29
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_endpoint_registry.py +45 -0
- verifiers-0.1.11.dev1/tests/test_env_crash_recovery.py +237 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_env_group.py +47 -47
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_environment.py +210 -161
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_environment_extra.py +145 -88
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_eval_cli.py +28 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_gym_env.py +68 -95
- verifiers-0.1.11.dev1/tests/test_interception_utils.py +63 -0
- verifiers-0.1.11.dev1/tests/test_message_utils.py +57 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_multiturn_env.py +53 -54
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rlm_env.py +880 -133
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rlm_env_sandbox.py +14 -50
- verifiers-0.1.11.dev1/tests/test_rollout_gateway_env.py +350 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_save_utils.py +233 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_singleturn_env.py +21 -24
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_stateful_tool_env.py +23 -29
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_tool_env.py +34 -38
- verifiers-0.1.11.dev1/tests/test_tool_utils.py +160 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_trajectory_processing.py +104 -48
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/AGENTS.md +1 -1
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/__init__.py +19 -1
- verifiers-0.1.11.dev1/verifiers/clients/__init__.py +39 -0
- verifiers-0.1.11.dev1/verifiers/clients/anthropic_messages_client.py +470 -0
- verifiers-0.1.11.dev1/verifiers/clients/client.py +128 -0
- verifiers-0.1.11.dev1/verifiers/clients/openai_chat_completions_client.py +510 -0
- verifiers-0.1.11.dev1/verifiers/clients/openai_chat_completions_token_client.py +236 -0
- verifiers-0.1.11.dev1/verifiers/clients/openai_completions_client.py +188 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/env_group.py +13 -15
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/environment.py +233 -358
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/README.md +4 -2
- verifiers-0.1.11.dev1/verifiers/envs/experimental/__init__.py +4 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/cli_agent_env.py +104 -26
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/gym_env.py +4 -9
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/mcp_env.py +18 -47
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/rlm_env.py +658 -921
- verifiers-0.1.11.dev1/verifiers/envs/experimental/rollout_gateway_mixin.py +397 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/browser_env.py +7 -1
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +60 -44
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +1 -1
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/openenv_env.py +71 -46
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/textarena_env.py +33 -11
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/multiturn_env.py +37 -25
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/singleturn_env.py +3 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/stateful_tool_env.py +26 -44
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/tool_env.py +45 -56
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/errors.py +8 -2
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/adapter.py +3 -2
- verifiers-0.1.11.dev1/verifiers/parsers/parser.py +85 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/think_parser.py +14 -3
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/xml_parser.py +12 -6
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/build.py +32 -7
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/eval.py +131 -29
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/gepa.py +6 -11
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/types.py +179 -34
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/async_utils.py +2 -6
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/client_utils.py +62 -31
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/data_utils.py +3 -3
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/eval_display.py +7 -1
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/eval_utils.py +95 -11
- verifiers-0.1.11.dev1/verifiers/utils/heartbeat.py +31 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/interception_utils.py +103 -26
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/logging_utils.py +44 -6
- verifiers-0.1.11.dev1/verifiers/utils/message_utils.py +318 -0
- verifiers-0.1.11.dev1/verifiers/utils/metric_utils.py +69 -0
- verifiers-0.1.11.dev1/verifiers/utils/response_utils.py +73 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/save_utils.py +43 -14
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/tool_utils.py +9 -14
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/worker_utils.py +15 -32
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/client/env_client.py +22 -2
- verifiers-0.1.11.dev1/verifiers/workers/client/zmq_env_client.py +408 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/server/env_server.py +68 -34
- verifiers-0.1.11.dev1/verifiers/workers/server/zmq_env_server.py +246 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/types.py +21 -0
- verifiers-0.1.10.dev5/tests/mock_client_guide.md +0 -207
- verifiers-0.1.10.dev5/tests/mock_openai_client.py +0 -155
- verifiers-0.1.10.dev5/tests/test_environment_audio_modality.py +0 -112
- verifiers-0.1.10.dev5/tests/test_tool_utils.py +0 -175
- verifiers-0.1.10.dev5/verifiers/envs/experimental/__init__.py +0 -3
- verifiers-0.1.10.dev5/verifiers/parsers/parser.py +0 -59
- verifiers-0.1.10.dev5/verifiers/utils/message_utils.py +0 -165
- verifiers-0.1.10.dev5/verifiers/utils/response_utils.py +0 -142
- verifiers-0.1.10.dev5/verifiers/utils/token_utils.py +0 -187
- verifiers-0.1.10.dev5/verifiers/workers/client/zmq_env_client.py +0 -198
- verifiers-0.1.10.dev5/verifiers/workers/server/zmq_env_server.py +0 -148
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/.gitignore +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/LICENSE +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/AGENTS.md +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_client_config.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_envs.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_imports.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_logging.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_message_utils_audio.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_parser.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rubric.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/decorators.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/harbor_env.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.10.dev5 → verifiers-0.1.11.dev1}/verifiers/workers/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.10.dev5
|
|
3
|
+
Version: 0.1.11.dev1
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
23
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
24
|
Requires-Python: <3.14,>=3.10
|
|
25
|
+
Requires-Dist: anthropic>=0.78.0
|
|
25
26
|
Requires-Dist: datasets>=3.0.0
|
|
26
27
|
Requires-Dist: gepa
|
|
27
28
|
Requires-Dist: jinja2>=3.1.6
|
|
@@ -105,7 +106,7 @@ Verifiers: Environments for LLM Reinforcement Learning
|
|
|
105
106
|
|
|
106
107
|
- [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
|
|
107
108
|
- [11/19/25] v0.1.8 is released, featuring a major refactor of the rollout system to use trajectory-based tracking for token-in token-out training across turns, as well as support for truncated or branching rollouts.
|
|
108
|
-
- [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl], a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
|
|
109
|
+
- [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl), a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
|
|
109
110
|
- [10/27/25] A new iteration of the Prime Intellect [Environments Program](https://docs.google.com/spreadsheets/d/13UDfRDjgIZXsMI2s9-Lmn8KSMMsgk2_zsfju6cx_pNU/edit?gid=0#gid=0) is live!
|
|
110
111
|
|
|
111
112
|
|
|
@@ -228,17 +229,17 @@ prime eval run primeintellect/math-python
|
|
|
228
229
|
|
|
229
230
|
## Documentation
|
|
230
231
|
|
|
231
|
-
**[Environments](environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
232
|
+
**[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
232
233
|
|
|
233
|
-
**[Evaluation](evaluation.md)** - Evaluate models using your environments.
|
|
234
|
+
**[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
|
|
234
235
|
|
|
235
|
-
**[Training](training.md)** — Train models in your environments with reinforcement learning.
|
|
236
|
+
**[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
|
|
236
237
|
|
|
237
|
-
**[Development](development.md)** — Contributing to verifiers
|
|
238
|
+
**[Development](docs/development.md)** — Contributing to verifiers
|
|
238
239
|
|
|
239
|
-
**[API Reference](reference.md)** — Understanding the API and data structures
|
|
240
|
+
**[API Reference](docs/reference.md)** — Understanding the API and data structures
|
|
240
241
|
|
|
241
|
-
**[FAQs](faqs.md)** - Other frequently asked questions.
|
|
242
|
+
**[FAQs](docs/faqs.md)** - Other frequently asked questions.
|
|
242
243
|
|
|
243
244
|
|
|
244
245
|
## Citation
|
|
@@ -36,7 +36,7 @@ Verifiers: Environments for LLM Reinforcement Learning
|
|
|
36
36
|
|
|
37
37
|
- [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
|
|
38
38
|
- [11/19/25] v0.1.8 is released, featuring a major refactor of the rollout system to use trajectory-based tracking for token-in token-out training across turns, as well as support for truncated or branching rollouts.
|
|
39
|
-
- [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl], a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
|
|
39
|
+
- [11/07/25] Verifiers v0.1.7 is released! This includes an improved quickstart configuration for training with [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl), a new included "nano" trainer (`vf.RLTrainer`, replacing `vf.GRPOTrainer`), and a number of bug fixes and improvements to the documentation.
|
|
40
40
|
- [10/27/25] A new iteration of the Prime Intellect [Environments Program](https://docs.google.com/spreadsheets/d/13UDfRDjgIZXsMI2s9-Lmn8KSMMsgk2_zsfju6cx_pNU/edit?gid=0#gid=0) is live!
|
|
41
41
|
|
|
42
42
|
|
|
@@ -159,17 +159,17 @@ prime eval run primeintellect/math-python
|
|
|
159
159
|
|
|
160
160
|
## Documentation
|
|
161
161
|
|
|
162
|
-
**[Environments](environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
162
|
+
**[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
163
163
|
|
|
164
|
-
**[Evaluation](evaluation.md)** - Evaluate models using your environments.
|
|
164
|
+
**[Evaluation](docs/evaluation.md)** - Evaluate models using your environments.
|
|
165
165
|
|
|
166
|
-
**[Training](training.md)** — Train models in your environments with reinforcement learning.
|
|
166
|
+
**[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
|
|
167
167
|
|
|
168
|
-
**[Development](development.md)** — Contributing to verifiers
|
|
168
|
+
**[Development](docs/development.md)** — Contributing to verifiers
|
|
169
169
|
|
|
170
|
-
**[API Reference](reference.md)** — Understanding the API and data structures
|
|
170
|
+
**[API Reference](docs/reference.md)** — Understanding the API and data structures
|
|
171
171
|
|
|
172
|
-
**[FAQs](faqs.md)** - Other frequently asked questions.
|
|
172
|
+
**[FAQs](docs/faqs.md)** - Other frequently asked questions.
|
|
173
173
|
|
|
174
174
|
|
|
175
175
|
## Citation
|
|
@@ -63,9 +63,11 @@ uv run pytest -m unit
|
|
|
63
63
|
|
|
64
64
|
The test suite includes comprehensive support for testing async Environment classes:
|
|
65
65
|
|
|
66
|
-
###
|
|
67
|
-
- `
|
|
68
|
-
-
|
|
66
|
+
### MockClient (conftest.py)
|
|
67
|
+
- `MockClient(Client)` implements the `get_response()` interface returning `vf.Response` objects
|
|
68
|
+
- `mock_client` fixture provides an instance for tests
|
|
69
|
+
- Supports prompt-to-response mappings via `add_response()`
|
|
70
|
+
- Tracks calls via `call_count` and `last_call_kwargs`
|
|
69
71
|
- No actual API calls are made during testing
|
|
70
72
|
|
|
71
73
|
### Test Datasets
|
|
@@ -76,22 +78,17 @@ The test suite includes comprehensive support for testing async Environment clas
|
|
|
76
78
|
### Async Test Examples
|
|
77
79
|
```python
|
|
78
80
|
@pytest.mark.asyncio
|
|
79
|
-
async def test_my_async_function(
|
|
80
|
-
env = SingleTurnEnv(client=
|
|
81
|
+
async def test_my_async_function(mock_client):
|
|
82
|
+
env = SingleTurnEnv(client=mock_client, model="test", ...)
|
|
81
83
|
result = await env.rollout(...)
|
|
82
|
-
assert
|
|
83
|
-
|
|
84
|
-
#
|
|
85
|
-
@pytest.mark.asyncio
|
|
86
|
-
async def
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
create_mock_response(resp) for resp in responses
|
|
91
|
-
]
|
|
92
|
-
|
|
93
|
-
completion, state = await mock_multiturn_env.rollout(...)
|
|
94
|
-
assert len(completion) > 1 # Multiple turns
|
|
84
|
+
assert mock_client.call_count == 1
|
|
85
|
+
|
|
86
|
+
# Custom response mapping
|
|
87
|
+
@pytest.mark.asyncio
|
|
88
|
+
async def test_with_custom_response(mock_client):
|
|
89
|
+
mock_client.set_default_response("DONE")
|
|
90
|
+
env = SimpleMultiTurnEnv(client=mock_client, model="test", ...)
|
|
91
|
+
completion, state = await env.rollout(...)
|
|
95
92
|
```
|
|
96
93
|
|
|
97
94
|
### Environment Testing
|
|
@@ -103,7 +100,6 @@ async def test_multiturn_conversation(mock_multiturn_env):
|
|
|
103
100
|
- Completion detection logic
|
|
104
101
|
- State management across turns
|
|
105
102
|
- Tests cover both chat and completion message formats
|
|
106
|
-
- Mocked responses simulate real OpenAI API behavior
|
|
107
103
|
- Error handling and edge cases are tested
|
|
108
104
|
- No real LLM requests are made
|
|
109
105
|
|
|
@@ -112,5 +108,5 @@ async def test_multiturn_conversation(mock_multiturn_env):
|
|
|
112
108
|
1. Create test files following the `test_*.py` naming convention
|
|
113
109
|
2. Use the fixtures from `conftest.py` for common instances
|
|
114
110
|
3. Add appropriate test markers (`@pytest.mark.asyncio` for async tests)
|
|
115
|
-
4. Use `
|
|
111
|
+
4. Use `mock_client` for Environment testing
|
|
116
112
|
5. Follow the existing test structure and naming conventions
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
"""Pytest configuration and fixtures for verifiers tests."""
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from pathlib import Path
|
|
4
|
-
from typing import Callable
|
|
5
|
-
from unittest.mock import AsyncMock, MagicMock
|
|
5
|
+
from typing import Any, Callable
|
|
6
6
|
|
|
7
7
|
import pytest
|
|
8
8
|
from datasets import Dataset
|
|
9
|
-
from openai.types.chat import ChatCompletionToolParam
|
|
10
9
|
|
|
11
10
|
from verifiers import (
|
|
12
11
|
MaybeThinkParser,
|
|
@@ -23,13 +22,18 @@ from verifiers import (
|
|
|
23
22
|
XMLParser,
|
|
24
23
|
stop,
|
|
25
24
|
)
|
|
25
|
+
from verifiers.clients.client import Client
|
|
26
26
|
from verifiers.types import (
|
|
27
27
|
GenerateMetadata,
|
|
28
28
|
Info,
|
|
29
|
+
Response,
|
|
30
|
+
ResponseMessage,
|
|
29
31
|
RolloutInput,
|
|
30
32
|
RolloutOutput,
|
|
31
33
|
RolloutTiming,
|
|
32
34
|
SamplingArgs,
|
|
35
|
+
Tool,
|
|
36
|
+
ToolCall,
|
|
33
37
|
TrajectoryStep,
|
|
34
38
|
)
|
|
35
39
|
from verifiers.utils.save_utils import state_to_output
|
|
@@ -82,135 +86,156 @@ def think_parser_with_extractor():
|
|
|
82
86
|
# Async test fixtures for Environment testing
|
|
83
87
|
|
|
84
88
|
|
|
85
|
-
class
|
|
86
|
-
"""
|
|
89
|
+
class MockClient(Client):
|
|
90
|
+
"""Mocked vf.Client with get_response() to return provider-agnostic vf.Response objects"""
|
|
87
91
|
|
|
88
92
|
def __init__(self):
|
|
89
|
-
self.
|
|
90
|
-
self.
|
|
91
|
-
self.default_chat_response = "This is a test response"
|
|
92
|
-
self.default_text_response = "This is a test completion"
|
|
93
|
-
self.base_url = "http://localhost/v1/" # For testing URL parsing
|
|
94
|
-
|
|
95
|
-
# Create mock structure
|
|
96
|
-
self.chat = MagicMock()
|
|
97
|
-
self.completions = MagicMock()
|
|
98
|
-
self.chat.completions = MagicMock()
|
|
99
|
-
|
|
100
|
-
# Set up async methods
|
|
101
|
-
self.chat.completions.create = AsyncMock(
|
|
102
|
-
side_effect=self._handle_chat_completion
|
|
103
|
-
)
|
|
104
|
-
self.completions.create = AsyncMock(side_effect=self._handle_text_completion)
|
|
93
|
+
self.logger = logging.getLogger(f"{__name__}.MockClient")
|
|
94
|
+
self._client = None
|
|
105
95
|
|
|
106
|
-
|
|
107
|
-
self
|
|
108
|
-
|
|
96
|
+
self._responses: dict[tuple, dict] = {}
|
|
97
|
+
self.default_response = "This is a test response"
|
|
98
|
+
|
|
99
|
+
# Call tracking
|
|
100
|
+
self.call_count = 0
|
|
101
|
+
self.last_call_kwargs: dict[str, Any] = {}
|
|
102
|
+
|
|
103
|
+
def add_response(self, messages, response, finish_reason="stop", tool_calls=None):
|
|
109
104
|
"""Add a mapped response for specific messages."""
|
|
110
|
-
|
|
111
|
-
key =
|
|
112
|
-
self.chat_completions[key] = {
|
|
105
|
+
key = self._messages_to_key(self._normalize_input(messages))
|
|
106
|
+
self._responses[key] = {
|
|
113
107
|
"content": response,
|
|
114
108
|
"finish_reason": finish_reason,
|
|
115
109
|
"tool_calls": tool_calls,
|
|
116
110
|
}
|
|
117
111
|
|
|
118
|
-
def
|
|
119
|
-
"""
|
|
120
|
-
self.
|
|
121
|
-
|
|
122
|
-
|
|
112
|
+
def set_default_response(self, response):
|
|
113
|
+
"""Set default response when no mapping found."""
|
|
114
|
+
self.default_response = response
|
|
115
|
+
|
|
116
|
+
async def get_response(
|
|
117
|
+
self,
|
|
118
|
+
prompt,
|
|
119
|
+
model,
|
|
120
|
+
sampling_args,
|
|
121
|
+
tools=None,
|
|
122
|
+
**kwargs,
|
|
123
|
+
) -> Response:
|
|
124
|
+
"""Return a Response based on the prompt-to-response mapping."""
|
|
125
|
+
self.call_count += 1
|
|
126
|
+
self.last_call_kwargs = {
|
|
127
|
+
"prompt": prompt,
|
|
128
|
+
"model": model,
|
|
129
|
+
"sampling_args": sampling_args,
|
|
130
|
+
"tools": tools,
|
|
131
|
+
**kwargs,
|
|
123
132
|
}
|
|
124
133
|
|
|
125
|
-
|
|
126
|
-
"""Set default responses when no mapping found."""
|
|
127
|
-
if chat_response:
|
|
128
|
-
self.default_chat_response = chat_response
|
|
129
|
-
if text_response:
|
|
130
|
-
self.default_text_response = text_response
|
|
134
|
+
return self._make_response(prompt)
|
|
131
135
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
key = self._messages_to_key(messages)
|
|
136
|
+
def setup_client(self, config):
|
|
137
|
+
return None
|
|
135
138
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
else:
|
|
139
|
-
response_data = {
|
|
140
|
-
"content": self.default_chat_response,
|
|
141
|
-
"finish_reason": "stop",
|
|
142
|
-
"tool_calls": None,
|
|
143
|
-
}
|
|
139
|
+
async def to_native_tool(self, tool):
|
|
140
|
+
pass
|
|
144
141
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
mock_message = MagicMock(spec=ChatCompletionMessage)
|
|
153
|
-
|
|
154
|
-
# Set the attributes
|
|
155
|
-
mock_message.content = response_data["content"]
|
|
156
|
-
mock_message.role = "assistant"
|
|
157
|
-
mock_message.tool_calls = response_data.get("tool_calls", None)
|
|
158
|
-
mock_choice.message = mock_message
|
|
159
|
-
mock_choice.finish_reason = response_data["finish_reason"]
|
|
160
|
-
mock_choice.index = 0
|
|
161
|
-
|
|
162
|
-
mock_response.choices = [mock_choice]
|
|
163
|
-
mock_response.id = "test-id"
|
|
164
|
-
mock_response.model = "test-model"
|
|
165
|
-
mock_response.object = "chat.completion"
|
|
166
|
-
|
|
167
|
-
return mock_response
|
|
168
|
-
|
|
169
|
-
async def _handle_text_completion(self, prompt, **kwargs):
|
|
170
|
-
"""Handle text completion requests."""
|
|
171
|
-
if prompt in self.text_completions:
|
|
172
|
-
response_data = self.text_completions[prompt]
|
|
173
|
-
else:
|
|
174
|
-
response_data = {
|
|
175
|
-
"text": self.default_text_response,
|
|
176
|
-
"finish_reason": "stop",
|
|
177
|
-
}
|
|
142
|
+
async def to_native_prompt(self, messages):
|
|
143
|
+
return [], {}
|
|
144
|
+
|
|
145
|
+
async def get_native_response(
|
|
146
|
+
self, prompt, model, sampling_args, tools=None, **kwargs
|
|
147
|
+
):
|
|
148
|
+
pass
|
|
178
149
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
from openai.types.completion_choice import CompletionChoice
|
|
150
|
+
async def raise_from_native_response(self, response):
|
|
151
|
+
pass
|
|
182
152
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
mock_choice = MagicMock(spec=CompletionChoice)
|
|
153
|
+
async def from_native_response(self, response):
|
|
154
|
+
pass
|
|
186
155
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
mock_choice.finish_reason = response_data["finish_reason"]
|
|
190
|
-
mock_choice.index = 0
|
|
156
|
+
async def close(self) -> None:
|
|
157
|
+
pass
|
|
191
158
|
|
|
192
|
-
|
|
193
|
-
mock_response.id = "test-id"
|
|
194
|
-
mock_response.model = "test-model"
|
|
195
|
-
mock_response.object = "text_completion"
|
|
159
|
+
# -- Internal helpers --
|
|
196
160
|
|
|
197
|
-
|
|
161
|
+
@staticmethod
|
|
162
|
+
def _normalize_input(messages):
|
|
163
|
+
"""Normalize prompt to list-of-dicts form for keying."""
|
|
164
|
+
if isinstance(messages, str):
|
|
165
|
+
return [{"role": "text", "content": messages}]
|
|
166
|
+
return messages
|
|
198
167
|
|
|
199
168
|
def _messages_to_key(self, messages):
|
|
200
169
|
"""Convert messages list to a hashable key."""
|
|
201
|
-
# Create a simplified representation for hashing
|
|
202
170
|
key_parts = []
|
|
203
171
|
for msg in messages:
|
|
204
|
-
|
|
205
|
-
|
|
172
|
+
if isinstance(msg, dict):
|
|
173
|
+
role = msg.get("role", "")
|
|
174
|
+
content = msg.get("content", "")
|
|
175
|
+
else:
|
|
176
|
+
role = getattr(msg, "role", "")
|
|
177
|
+
content = getattr(msg, "content", "")
|
|
206
178
|
key_parts.append(f"{role}:{content}")
|
|
207
179
|
return tuple(key_parts)
|
|
208
180
|
|
|
181
|
+
def _convert_tool_calls(self, raw_tool_calls) -> list[ToolCall] | None:
|
|
182
|
+
"""Convert OAI-style tool call objects to vf.ToolCall."""
|
|
183
|
+
if not raw_tool_calls:
|
|
184
|
+
return None
|
|
185
|
+
result: list[ToolCall] = []
|
|
186
|
+
for tc in raw_tool_calls:
|
|
187
|
+
if hasattr(tc, "function"):
|
|
188
|
+
result.append(
|
|
189
|
+
ToolCall(
|
|
190
|
+
id=tc.id,
|
|
191
|
+
name=tc.function.name,
|
|
192
|
+
arguments=tc.function.arguments,
|
|
193
|
+
)
|
|
194
|
+
)
|
|
195
|
+
elif isinstance(tc, dict):
|
|
196
|
+
func = tc.get("function", {})
|
|
197
|
+
result.append(
|
|
198
|
+
ToolCall(
|
|
199
|
+
id=tc.get("id", ""),
|
|
200
|
+
name=func.get("name", ""),
|
|
201
|
+
arguments=func.get("arguments", ""),
|
|
202
|
+
)
|
|
203
|
+
)
|
|
204
|
+
return result or None
|
|
205
|
+
|
|
206
|
+
def _make_response(self, prompt) -> Response:
|
|
207
|
+
key = self._messages_to_key(self._normalize_input(prompt))
|
|
208
|
+
if key in self._responses:
|
|
209
|
+
data = self._responses[key]
|
|
210
|
+
else:
|
|
211
|
+
data = {
|
|
212
|
+
"content": self.default_response,
|
|
213
|
+
"finish_reason": "stop",
|
|
214
|
+
"tool_calls": None,
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
tool_calls = self._convert_tool_calls(data.get("tool_calls"))
|
|
218
|
+
|
|
219
|
+
return Response(
|
|
220
|
+
id="test-id",
|
|
221
|
+
created=0,
|
|
222
|
+
model="test-model",
|
|
223
|
+
usage=None,
|
|
224
|
+
message=ResponseMessage(
|
|
225
|
+
content=data["content"],
|
|
226
|
+
reasoning_content=None,
|
|
227
|
+
finish_reason=data["finish_reason"],
|
|
228
|
+
is_truncated=data["finish_reason"] == "length",
|
|
229
|
+
tokens=None,
|
|
230
|
+
tool_calls=tool_calls,
|
|
231
|
+
),
|
|
232
|
+
)
|
|
233
|
+
|
|
209
234
|
|
|
210
235
|
@pytest.fixture
|
|
211
|
-
def
|
|
212
|
-
"""Return a
|
|
213
|
-
return
|
|
236
|
+
def mock_client():
|
|
237
|
+
"""Return a MockClient with input-output mapping."""
|
|
238
|
+
return MockClient()
|
|
214
239
|
|
|
215
240
|
|
|
216
241
|
@pytest.fixture
|
|
@@ -240,10 +265,10 @@ def sample_chat_dataset():
|
|
|
240
265
|
|
|
241
266
|
|
|
242
267
|
@pytest.fixture
|
|
243
|
-
def mock_singleturn_env(
|
|
268
|
+
def mock_singleturn_env(mock_client, sample_dataset):
|
|
244
269
|
"""Return a SingleTurnEnv with mocked client and dataset."""
|
|
245
270
|
return SingleTurnEnv(
|
|
246
|
-
client=
|
|
271
|
+
client=mock_client,
|
|
247
272
|
model="test-model",
|
|
248
273
|
dataset=sample_dataset,
|
|
249
274
|
system_prompt="You are a helpful assistant.",
|
|
@@ -253,7 +278,7 @@ def mock_singleturn_env(mock_openai_client, sample_dataset):
|
|
|
253
278
|
|
|
254
279
|
|
|
255
280
|
@pytest.fixture
|
|
256
|
-
def mock_singleturn_env_completion(
|
|
281
|
+
def mock_singleturn_env_completion(mock_client):
|
|
257
282
|
"""Return a SingleTurnEnv for completion format testing."""
|
|
258
283
|
completion_dataset = Dataset.from_dict(
|
|
259
284
|
{
|
|
@@ -262,7 +287,7 @@ def mock_singleturn_env_completion(mock_openai_client):
|
|
|
262
287
|
}
|
|
263
288
|
)
|
|
264
289
|
return SingleTurnEnv(
|
|
265
|
-
client=
|
|
290
|
+
client=mock_client,
|
|
266
291
|
model="test-model",
|
|
267
292
|
dataset=completion_dataset,
|
|
268
293
|
message_type="completion",
|
|
@@ -335,10 +360,10 @@ class SimpleMultiTurnEnv(MultiTurnEnv):
|
|
|
335
360
|
|
|
336
361
|
|
|
337
362
|
@pytest.fixture
|
|
338
|
-
def mock_multiturn_env(
|
|
363
|
+
def mock_multiturn_env(mock_client, sample_chat_dataset):
|
|
339
364
|
"""Return a MultiTurnEnv for basic testing."""
|
|
340
365
|
return SimpleMultiTurnEnv(
|
|
341
|
-
client=
|
|
366
|
+
client=mock_client,
|
|
342
367
|
model="test-model",
|
|
343
368
|
dataset=sample_chat_dataset,
|
|
344
369
|
max_turns=3,
|
|
@@ -349,10 +374,10 @@ def mock_multiturn_env(mock_openai_client, sample_chat_dataset):
|
|
|
349
374
|
|
|
350
375
|
|
|
351
376
|
@pytest.fixture
|
|
352
|
-
def mock_multiturn_env_max_turns(
|
|
377
|
+
def mock_multiturn_env_max_turns(mock_client, sample_chat_dataset):
|
|
353
378
|
"""Return a MultiTurnEnv that tests max_turns limiting."""
|
|
354
379
|
return SimpleMultiTurnEnv(
|
|
355
|
-
client=
|
|
380
|
+
client=mock_client,
|
|
356
381
|
model="test-model",
|
|
357
382
|
dataset=sample_chat_dataset,
|
|
358
383
|
max_turns=2,
|
|
@@ -377,9 +402,9 @@ class BasicToolEnv(ToolEnv):
|
|
|
377
402
|
|
|
378
403
|
|
|
379
404
|
@pytest.fixture
|
|
380
|
-
def mock_tool_env(
|
|
405
|
+
def mock_tool_env(mock_client, sample_chat_dataset):
|
|
381
406
|
return BasicToolEnv(
|
|
382
|
-
client=
|
|
407
|
+
client=mock_client,
|
|
383
408
|
model="test-model",
|
|
384
409
|
dataset=sample_chat_dataset,
|
|
385
410
|
parser=Parser(),
|
|
@@ -413,9 +438,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
|
|
|
413
438
|
|
|
414
439
|
|
|
415
440
|
@pytest.fixture
|
|
416
|
-
def mock_stateful_tool_env(
|
|
441
|
+
def mock_stateful_tool_env(mock_client, sample_chat_dataset):
|
|
417
442
|
return ExampleStatefulToolEnv(
|
|
418
|
-
client=
|
|
443
|
+
client=mock_client,
|
|
419
444
|
model="test-model",
|
|
420
445
|
dataset=sample_chat_dataset,
|
|
421
446
|
parser=Parser(),
|
|
@@ -461,7 +486,7 @@ def make_state() -> Callable[..., State]:
|
|
|
461
486
|
is_completed: bool = True,
|
|
462
487
|
is_truncated: bool = False,
|
|
463
488
|
stop_condition: str | None = "max_turns_reached",
|
|
464
|
-
|
|
489
|
+
tool_defs: list[Tool] | None = None,
|
|
465
490
|
trajectory: list[TrajectoryStep] = [],
|
|
466
491
|
timing=RolloutTiming(
|
|
467
492
|
generation_ms=0.0,
|
|
@@ -483,7 +508,7 @@ def make_state() -> Callable[..., State]:
|
|
|
483
508
|
is_completed=is_completed,
|
|
484
509
|
is_truncated=is_truncated,
|
|
485
510
|
stop_condition=stop_condition,
|
|
486
|
-
|
|
511
|
+
tool_defs=tool_defs,
|
|
487
512
|
trajectory=trajectory,
|
|
488
513
|
timing=timing,
|
|
489
514
|
error=None,
|
|
@@ -529,11 +554,14 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
|
|
|
529
554
|
time_ms: float = 0.0,
|
|
530
555
|
avg_reward: float = 0.0,
|
|
531
556
|
avg_metrics: dict[str, float] = {},
|
|
557
|
+
pass_at_k: dict[str, float] = {},
|
|
558
|
+
pass_all_k: dict[str, float] = {},
|
|
559
|
+
pass_threshold: float = 0.5,
|
|
532
560
|
usage: dict[str, float] | None = None,
|
|
533
561
|
version_info: dict | None = None,
|
|
534
562
|
state_columns: list[str] = ["foo"],
|
|
535
563
|
path_to_save: Path = Path("test.jsonl"),
|
|
536
|
-
tools: list[
|
|
564
|
+
tools: list[Tool] | None = None,
|
|
537
565
|
) -> GenerateMetadata:
|
|
538
566
|
if version_info is None:
|
|
539
567
|
version_info = {
|
|
@@ -554,6 +582,9 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
|
|
|
554
582
|
time_ms=time_ms,
|
|
555
583
|
avg_reward=avg_reward,
|
|
556
584
|
avg_metrics=avg_metrics,
|
|
585
|
+
pass_at_k=pass_at_k,
|
|
586
|
+
pass_all_k=pass_all_k,
|
|
587
|
+
pass_threshold=pass_threshold,
|
|
557
588
|
usage=usage,
|
|
558
589
|
version_info=version_info,
|
|
559
590
|
state_columns=state_columns,
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from verifiers.scripts import build
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_resolve_env_push_target_defaults_to_environments_dir(tmp_path: Path):
|
|
7
|
+
base_dir = tmp_path / "workspace" / "environments"
|
|
8
|
+
env_name, env_path = build._resolve_env_push_target("my-env", str(base_dir))
|
|
9
|
+
|
|
10
|
+
assert env_name == "my-env"
|
|
11
|
+
assert env_path == (base_dir / "my_env").resolve()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_resolve_env_push_target_appends_env_id_to_custom_base_path(tmp_path: Path):
|
|
15
|
+
base_dir = tmp_path / "workspace" / "custom_envs"
|
|
16
|
+
env_name, env_path = build._resolve_env_push_target("env-name", str(base_dir))
|
|
17
|
+
|
|
18
|
+
assert env_name == "env-name"
|
|
19
|
+
assert env_path == (base_dir / "env_name").resolve()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_resolve_env_push_target_uses_explicit_environment_path_when_env_id_missing(
|
|
23
|
+
tmp_path: Path,
|
|
24
|
+
):
|
|
25
|
+
explicit_env_path = tmp_path / "workspace" / "environments" / "already_normalized"
|
|
26
|
+
env_name, env_path = build._resolve_env_push_target(None, str(explicit_env_path))
|
|
27
|
+
|
|
28
|
+
assert env_name == "already-normalized"
|
|
29
|
+
assert env_path == explicit_env_path.resolve()
|
|
@@ -159,6 +159,49 @@ class TestCliAgentEnv:
|
|
|
159
159
|
response = await env.env_response(messages, state)
|
|
160
160
|
assert response == []
|
|
161
161
|
|
|
162
|
+
@pytest.mark.asyncio
|
|
163
|
+
async def test_non_streaming_intercept_tools_use_oai_schema(
|
|
164
|
+
self, sample_dataset, mock_client
|
|
165
|
+
):
|
|
166
|
+
"""OpenAI-formatted intercepted tools should work for non-streaming requests."""
|
|
167
|
+
env = vf.CliAgentEnv(
|
|
168
|
+
run_command="python agent.py",
|
|
169
|
+
dataset=sample_dataset,
|
|
170
|
+
rubric=vf.Rubric(),
|
|
171
|
+
)
|
|
172
|
+
state = await env.init_state(
|
|
173
|
+
input=sample_dataset[0],
|
|
174
|
+
client=mock_client,
|
|
175
|
+
model="test-model",
|
|
176
|
+
)
|
|
177
|
+
request_id = "req-test"
|
|
178
|
+
state["current_request_id"] = request_id
|
|
179
|
+
env._interception_server.intercepts[request_id] = {
|
|
180
|
+
"stream": False,
|
|
181
|
+
"tools": [
|
|
182
|
+
{
|
|
183
|
+
"type": "function",
|
|
184
|
+
"function": {
|
|
185
|
+
"name": "echo",
|
|
186
|
+
"description": "echo tool",
|
|
187
|
+
"parameters": {},
|
|
188
|
+
},
|
|
189
|
+
}
|
|
190
|
+
],
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
response = await env.get_model_response(
|
|
194
|
+
state=state,
|
|
195
|
+
prompt=sample_dataset[0]["prompt"],
|
|
196
|
+
client=mock_client,
|
|
197
|
+
model="test-model",
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
assert isinstance(response, vf.Response)
|
|
201
|
+
kwargs = mock_client.last_call_kwargs
|
|
202
|
+
assert kwargs["tools"] is not None
|
|
203
|
+
assert kwargs["tools"][0].name == "echo"
|
|
204
|
+
|
|
162
205
|
|
|
163
206
|
class TestHarborEnv:
|
|
164
207
|
"""Tests for HarborEnv."""
|