verifiers 0.1.13.dev8__tar.gz → 0.1.15.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/.gitignore +1 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/PKG-INFO +82 -5
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/README.md +77 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/pyproject.toml +27 -5
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/conftest.py +9 -12
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_cli_agent_env.py +156 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_composable_env.py +4 -4
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_decorator_ranks.py +43 -4
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_endpoint_registry.py +33 -65
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_env_group.py +51 -52
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_env_server.py +67 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_environment.py +85 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_environment_extra.py +0 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_envs.py +28 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_eval_cli.py +140 -16
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_eval_utils.py +24 -0
- verifiers-0.1.15.dev0/tests/test_gepa_cli.py +251 -0
- verifiers-0.1.15.dev0/tests/test_gepa_utils.py +155 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_interception_utils.py +105 -0
- verifiers-0.1.15.dev0/tests/test_langchain_deep_agents_wikispeedia.py +312 -0
- verifiers-0.1.15.dev0/tests/test_lean_task.py +344 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_logging.py +50 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_math_rubric.py +4 -21
- verifiers-0.1.15.dev0/tests/test_mcp_search_env.py +61 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_multiturn_env.py +70 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_nemorl_client.py +146 -35
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_openai_chat_completions_token_client.py +6 -2
- verifiers-0.1.15.dev0/tests/test_openai_responses_client.py +338 -0
- verifiers-0.1.15.dev0/tests/test_opencode_harbor.py +100 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_opencode_rlm_env.py +8 -3
- verifiers-0.1.15.dev0/tests/test_per_turn_timing.py +68 -0
- verifiers-0.1.15.dev0/tests/test_renderer_client.py +651 -0
- verifiers-0.1.15.dev0/tests/test_renderer_e2e.py +417 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rlm_composable_env.py +2 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rlm_env.py +3 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rubric.py +9 -59
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_rubric_group.py +12 -67
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_save_utils.py +11 -11
- verifiers-0.1.15.dev0/tests/test_setup_script.py +32 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_singleturn_env.py +7 -35
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_stateful_tool_env.py +3 -5
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_tool_env.py +4 -8
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_tool_utils.py +31 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_trajectory_processing.py +0 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_tui_info_formatting.py +9 -1
- verifiers-0.1.15.dev0/tests/test_types.py +11 -0
- verifiers-0.1.15.dev0/tests/test_v1_bfcl.py +55 -0
- verifiers-0.1.15.dev0/tests/test_v1_config_extension.py +1599 -0
- verifiers-0.1.15.dev0/tests/test_v1_endpoint_protocols.py +222 -0
- verifiers-0.1.15.dev0/tests/test_v1_example_counts.py +117 -0
- verifiers-0.1.15.dev0/tests/test_v1_group_reward_env.py +39 -0
- verifiers-0.1.15.dev0/tests/test_v1_harbor_cli.py +209 -0
- verifiers-0.1.15.dev0/tests/test_v1_mini_swe_agent.py +65 -0
- verifiers-0.1.15.dev0/tests/test_v1_rlm_swe.py +275 -0
- verifiers-0.1.15.dev0/tests/test_v1_runtime_lifecycle.py +1897 -0
- verifiers-0.1.15.dev0/tests/test_v1_scoring_functions.py +152 -0
- verifiers-0.1.15.dev0/tests/test_wordle_env.py +22 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/__init__.py +88 -8
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/__init__.py +29 -0
- verifiers-0.1.15.dev0/verifiers/clients/nemorl_chat_completions_client.py +117 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/openai_chat_completions_client.py +2 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/openai_chat_completions_token_client.py +38 -4
- verifiers-0.1.15.dev0/verifiers/clients/openai_responses_client.py +443 -0
- verifiers-0.1.15.dev0/verifiers/clients/renderer_client.py +606 -0
- verifiers-0.1.15.dev0/verifiers/decorators.py +296 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/AGENTS.md +2 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/env_group.py +192 -62
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/environment.py +113 -74
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/README.md +5 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/__init__.py +2 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/cli_agent_env.py +51 -13
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/README.md +8 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/__init__.py +2 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/composable_env.py +12 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harness.py +9 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +5 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/opencode.py +44 -12
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/rlm.py +52 -5
- verifiers-0.1.15.dev0/verifiers/envs/experimental/composable/swe_debug_env.py +327 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/task.py +13 -8
- verifiers-0.1.15.dev0/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +13 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +138 -27
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -11
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -5
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -7
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -8
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -5
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +1 -12
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -6
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +2 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/harbor_env/env.py +4 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/mcp_env.py +9 -12
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/opencode_env.py +29 -14
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/opencode_rlm_env.py +28 -14
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/rlm_env.py +22 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/sandbox_mixin.py +2 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/browser_env.py +6 -3
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/base.py +1 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +2 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +2 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/openenv_env.py +3 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/reasoninggym_env.py +1 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/textarena_env.py +2 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/multiturn_env.py +33 -7
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/python_env.py +5 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/sandbox_env.py +6 -3
- verifiers-0.1.15.dev0/verifiers/gepa/gepa_utils.py +322 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/experimental/hybrid_math_rubric.py +1 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/rubric.py +137 -34
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/rubric_group.py +23 -14
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/eval.py +10 -6
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/gepa.py +274 -37
- verifiers-0.1.15.dev0/verifiers/scripts/setup.py +33 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/tui.py +34 -10
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/client/zmq_env_client.py +4 -1
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/types.py +176 -18
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/async_utils.py +18 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/display_utils.py +90 -3
- verifiers-0.1.15.dev0/verifiers/utils/env_config_utils.py +45 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/env_utils.py +53 -2
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/error_utils.py +33 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/eval_display.py +61 -50
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/eval_utils.py +200 -116
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/interception_utils.py +385 -17
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/logging_utils.py +9 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/response_utils.py +2 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/save_utils.py +33 -13
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/threaded_sandbox_client.py +35 -5
- verifiers-0.1.15.dev0/verifiers/v1/README.md +1594 -0
- verifiers-0.1.15.dev0/verifiers/v1/RE_MIGRATION.md +804 -0
- verifiers-0.1.15.dev0/verifiers/v1/__init__.py +93 -0
- verifiers-0.1.15.dev0/verifiers/v1/config.py +455 -0
- verifiers-0.1.15.dev0/verifiers/v1/env.py +136 -0
- verifiers-0.1.15.dev0/verifiers/v1/harness.py +598 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/__init__.py +1 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/__init__.py +8 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/cli.py +121 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/configs.py +74 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/mini_swe_agent.py +247 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/opencode.py +298 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/pi.py +212 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/harnesses/rlm.py +265 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/tasksets/__init__.py +3 -0
- verifiers-0.1.15.dev0/verifiers/v1/packages/tasksets/harbor.py +407 -0
- verifiers-0.1.15.dev0/verifiers/v1/runtime.py +1931 -0
- verifiers-0.1.15.dev0/verifiers/v1/state.py +401 -0
- verifiers-0.1.15.dev0/verifiers/v1/task.py +177 -0
- verifiers-0.1.15.dev0/verifiers/v1/taskset.py +269 -0
- verifiers-0.1.15.dev0/verifiers/v1/toolset.py +352 -0
- verifiers-0.1.15.dev0/verifiers/v1/user.py +85 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/__init__.py +1 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/artifact_utils.py +31 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/endpoint_utils.py +671 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/json_utils.py +11 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/judge_utils.py +63 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/lifecycle_utils.py +96 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/mcp_proxy_utils.py +233 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/mcp_utils.py +148 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/program_utils.py +483 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/prompt_utils.py +136 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/sandbox_program_utils.py +770 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/sandbox_utils.py +822 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/scoring_utils.py +379 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/timing_utils.py +36 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/tool_utils.py +19 -0
- verifiers-0.1.15.dev0/verifiers/v1/utils/trajectory_utils.py +78 -0
- verifiers-0.1.13.dev8/tests/test_gepa_cli.py +0 -115
- verifiers-0.1.13.dev8/tests/test_opencode_harbor.py +0 -57
- verifiers-0.1.13.dev8/tests/test_setup_script.py +0 -288
- verifiers-0.1.13.dev8/verifiers/clients/nemorl_chat_completions_client.py +0 -87
- verifiers-0.1.13.dev8/verifiers/decorators.py +0 -147
- verifiers-0.1.13.dev8/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -3
- verifiers-0.1.13.dev8/verifiers/gepa/gepa_utils.py +0 -116
- verifiers-0.1.13.dev8/verifiers/scripts/prime_rl.py +0 -197
- verifiers-0.1.13.dev8/verifiers/scripts/setup.py +0 -611
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/LICENSE +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/AGENTS.md +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/README.md +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_build_script.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_client_config.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_imports.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/errors.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.13.dev8 → verifiers-0.1.15.dev0}/verifiers/utils/version_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.13.dev8
|
|
3
|
+
Version: 0.1.15.dev0
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -26,6 +26,7 @@ Requires-Dist: aiolimiter>=1.2.1
|
|
|
26
26
|
Requires-Dist: anthropic>=0.78.0
|
|
27
27
|
Requires-Dist: datasets<4.7.0,>=3.0.0
|
|
28
28
|
Requires-Dist: gepa
|
|
29
|
+
Requires-Dist: httpx>=0.27.0
|
|
29
30
|
Requires-Dist: jinja2>=3.1.6
|
|
30
31
|
Requires-Dist: math-verify>=0.8.0
|
|
31
32
|
Requires-Dist: mcp>=1.14.1
|
|
@@ -34,7 +35,7 @@ Requires-Dist: nest-asyncio>=1.6.0
|
|
|
34
35
|
Requires-Dist: numpy
|
|
35
36
|
Requires-Dist: openai-agents>=0.0.7
|
|
36
37
|
Requires-Dist: openai>=1.108.1
|
|
37
|
-
Requires-Dist: prime-sandboxes>=0.2.
|
|
38
|
+
Requires-Dist: prime-sandboxes>=0.2.25
|
|
38
39
|
Requires-Dist: prime-tunnel>=0.1.6
|
|
39
40
|
Requires-Dist: pydantic>=2.11.9
|
|
40
41
|
Requires-Dist: pyzmq>=27.1.0
|
|
@@ -46,13 +47,14 @@ Requires-Dist: tenacity>=8.5.0
|
|
|
46
47
|
Requires-Dist: textual
|
|
47
48
|
Requires-Dist: tomli; python_version < '3.11'
|
|
48
49
|
Requires-Dist: typing-extensions; python_version < '3.12'
|
|
49
|
-
Requires-Dist: wget>=3.2
|
|
50
50
|
Provides-Extra: browser
|
|
51
51
|
Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
|
|
52
52
|
Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
|
|
53
53
|
Requires-Dist: stagehand>=3.0.0; extra == 'browser'
|
|
54
54
|
Provides-Extra: openenv
|
|
55
55
|
Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
|
|
56
|
+
Provides-Extra: renderers
|
|
57
|
+
Requires-Dist: renderers>=0.1.6; extra == 'renderers'
|
|
56
58
|
Provides-Extra: rg
|
|
57
59
|
Requires-Dist: reasoning-gym; extra == 'rg'
|
|
58
60
|
Provides-Extra: rl
|
|
@@ -107,7 +109,9 @@ Verifiers: Environments for LLM Reinforcement Learning
|
|
|
107
109
|
|
|
108
110
|
## News & Updates
|
|
109
111
|
|
|
110
|
-
- [
|
|
112
|
+
- [05/07/26] v0.1.14 is released, featuring the v1 Taskset/Harness API, shared eval and training config shape, model-family starter configs, OpenAI Responses and renderer-backed clients, per-turn timing, GEPA prompt artifacts, Lean guard markers, and release/infrastructure hardening.
|
|
113
|
+
- [04/28/26] v0.1.13.dev8 is released, featuring per-rollout wall-clock timeouts for `MultiTurnEnv`, CLI timeout config, sandbox timeout propagation, and smaller `CliAgentEnv` and RLM fixes.
|
|
114
|
+
- [04/17/26] v0.1.12 is released, featuring upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
|
|
111
115
|
- [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
|
|
112
116
|
- [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
|
|
113
117
|
- [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
|
|
@@ -197,11 +201,82 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
|
|
|
197
201
|
async def correct_answer(completion, answer) -> float:
|
|
198
202
|
completion_ans = completion[-1]['content']
|
|
199
203
|
return 1.0 if completion_ans == answer else 0.0
|
|
200
|
-
rubric = Rubric(funcs=[correct_answer])
|
|
204
|
+
rubric = vf.Rubric(funcs=[correct_answer])
|
|
201
205
|
env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
|
|
202
206
|
return env
|
|
203
207
|
```
|
|
204
208
|
|
|
209
|
+
For new environments with reusable tasksets, toolsets, custom programs, or
|
|
210
|
+
custom harnesses, use the v1 Taskset/Harness path:
|
|
211
|
+
```python
|
|
212
|
+
# my_env.py
|
|
213
|
+
import verifiers.v1 as vf
|
|
214
|
+
|
|
215
|
+
def source():
|
|
216
|
+
yield {
|
|
217
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
218
|
+
"answer": "cba",
|
|
219
|
+
"max_turns": 1,
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
@vf.reward(weight=1.0)
|
|
223
|
+
async def contains_answer(task, state) -> float:
|
|
224
|
+
return float(task["answer"] in str(state.get("completion") or ""))
|
|
225
|
+
|
|
226
|
+
def load_taskset(config: vf.TasksetConfig | None = None):
|
|
227
|
+
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
228
|
+
|
|
229
|
+
def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
|
|
230
|
+
config = config or vf.EnvConfig()
|
|
231
|
+
return vf.Env(taskset=load_taskset(config=config.taskset))
|
|
232
|
+
```
|
|
233
|
+
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
234
|
+
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
235
|
+
Reusable taskset and harness packages live under `verifiers.v1.packages` while
|
|
236
|
+
the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
|
|
237
|
+
For example, Harbor task directories can run through the bundled OpenCode CLI
|
|
238
|
+
harness with:
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
env = vf.Env(
|
|
242
|
+
taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
|
|
243
|
+
harness=vf.OpenCode(),
|
|
244
|
+
)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
The same environment package is the unit used by evals and `prime-rl`. The
|
|
248
|
+
trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
|
|
249
|
+
and harness options stay under `env.taskset` and `env.harness`:
|
|
250
|
+
|
|
251
|
+
```toml
|
|
252
|
+
# configs/rl/my-v1-env.toml
|
|
253
|
+
model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
|
|
254
|
+
max_steps = 100
|
|
255
|
+
batch_size = 256
|
|
256
|
+
rollouts_per_example = 8
|
|
257
|
+
|
|
258
|
+
[sampling]
|
|
259
|
+
max_tokens = 4096
|
|
260
|
+
|
|
261
|
+
[[env]]
|
|
262
|
+
id = "my-env"
|
|
263
|
+
|
|
264
|
+
[env.args]
|
|
265
|
+
arg1 = "non-th-arg"
|
|
266
|
+
|
|
267
|
+
[env.harness]
|
|
268
|
+
max_turns = 1
|
|
269
|
+
|
|
270
|
+
[env.taskset.scoring.contains_answer]
|
|
271
|
+
weight = 1.0
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
prime env install my-env
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
For self-managed training launch commands, use the `prime-rl` documentation.
|
|
279
|
+
|
|
205
280
|
To install the environment module into your project, do:
|
|
206
281
|
```bash
|
|
207
282
|
prime env install my-env # installs from ./environments/my_env
|
|
@@ -237,6 +312,8 @@ prime eval run primeintellect/math-python
|
|
|
237
312
|
|
|
238
313
|
**[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
239
314
|
|
|
315
|
+
**[BYO Harness](docs/byo-harness.md)** — Build v1 Taskset/Harness environments with custom tools, sandboxes, users, and custom programs.
|
|
316
|
+
|
|
240
317
|
**[Evaluation](docs/evaluation.md)** — Evaluate models using your environments.
|
|
241
318
|
|
|
242
319
|
**[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
|
|
@@ -34,7 +34,9 @@ Verifiers: Environments for LLM Reinforcement Learning
|
|
|
34
34
|
|
|
35
35
|
## News & Updates
|
|
36
36
|
|
|
37
|
-
- [
|
|
37
|
+
- [05/07/26] v0.1.14 is released, featuring the v1 Taskset/Harness API, shared eval and training config shape, model-family starter configs, OpenAI Responses and renderer-backed clients, per-turn timing, GEPA prompt artifacts, Lean guard markers, and release/infrastructure hardening.
|
|
38
|
+
- [04/28/26] v0.1.13.dev8 is released, featuring per-rollout wall-clock timeouts for `MultiTurnEnv`, CLI timeout config, sandbox timeout propagation, and smaller `CliAgentEnv` and RLM fixes.
|
|
39
|
+
- [04/17/26] v0.1.12 is released, featuring upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
|
|
38
40
|
- [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
|
|
39
41
|
- [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
|
|
40
42
|
- [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
|
|
@@ -124,11 +126,82 @@ def load_environment(dataset_name: str = 'gsm8k') -> vf.Environment:
|
|
|
124
126
|
async def correct_answer(completion, answer) -> float:
|
|
125
127
|
completion_ans = completion[-1]['content']
|
|
126
128
|
return 1.0 if completion_ans == answer else 0.0
|
|
127
|
-
rubric = Rubric(funcs=[correct_answer])
|
|
129
|
+
rubric = vf.Rubric(funcs=[correct_answer])
|
|
128
130
|
env = vf.SingleTurnEnv(dataset=dataset, rubric=rubric)
|
|
129
131
|
return env
|
|
130
132
|
```
|
|
131
133
|
|
|
134
|
+
For new environments with reusable tasksets, toolsets, custom programs, or
|
|
135
|
+
custom harnesses, use the v1 Taskset/Harness path:
|
|
136
|
+
```python
|
|
137
|
+
# my_env.py
|
|
138
|
+
import verifiers.v1 as vf
|
|
139
|
+
|
|
140
|
+
def source():
|
|
141
|
+
yield {
|
|
142
|
+
"prompt": [{"role": "user", "content": "Reverse abc."}],
|
|
143
|
+
"answer": "cba",
|
|
144
|
+
"max_turns": 1,
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
@vf.reward(weight=1.0)
|
|
148
|
+
async def contains_answer(task, state) -> float:
|
|
149
|
+
return float(task["answer"] in str(state.get("completion") or ""))
|
|
150
|
+
|
|
151
|
+
def load_taskset(config: vf.TasksetConfig | None = None):
|
|
152
|
+
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
153
|
+
|
|
154
|
+
def load_environment(config: vf.EnvConfig | None = None) -> vf.Env:
|
|
155
|
+
config = config or vf.EnvConfig()
|
|
156
|
+
return vf.Env(taskset=load_taskset(config=config.taskset))
|
|
157
|
+
```
|
|
158
|
+
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
159
|
+
**[BYO Harness](docs/byo-harness.md)** for the advanced v1 taskset/harness API.
|
|
160
|
+
Reusable taskset and harness packages live under `verifiers.v1.packages` while
|
|
161
|
+
the v1 API stabilizes, and are re-exported from `verifiers.v1` for normal use.
|
|
162
|
+
For example, Harbor task directories can run through the bundled OpenCode CLI
|
|
163
|
+
harness with:
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
env = vf.Env(
|
|
167
|
+
taskset=vf.HarborTaskset(tasks="/path/to/harbor/tasks"),
|
|
168
|
+
harness=vf.OpenCode(),
|
|
169
|
+
)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
The same environment package is the unit used by evals and `prime-rl`. The
|
|
173
|
+
trainer owns model, endpoint, sampling, and rollout count; v1-specific taskset
|
|
174
|
+
and harness options stay under `env.taskset` and `env.harness`:
|
|
175
|
+
|
|
176
|
+
```toml
|
|
177
|
+
# configs/rl/my-v1-env.toml
|
|
178
|
+
model = "Qwen/Qwen3-30B-A3B-Instruct-2507"
|
|
179
|
+
max_steps = 100
|
|
180
|
+
batch_size = 256
|
|
181
|
+
rollouts_per_example = 8
|
|
182
|
+
|
|
183
|
+
[sampling]
|
|
184
|
+
max_tokens = 4096
|
|
185
|
+
|
|
186
|
+
[[env]]
|
|
187
|
+
id = "my-env"
|
|
188
|
+
|
|
189
|
+
[env.args]
|
|
190
|
+
arg1 = "non-th-arg"
|
|
191
|
+
|
|
192
|
+
[env.harness]
|
|
193
|
+
max_turns = 1
|
|
194
|
+
|
|
195
|
+
[env.taskset.scoring.contains_answer]
|
|
196
|
+
weight = 1.0
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
prime env install my-env
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
For self-managed training launch commands, use the `prime-rl` documentation.
|
|
204
|
+
|
|
132
205
|
To install the environment module into your project, do:
|
|
133
206
|
```bash
|
|
134
207
|
prime env install my-env # installs from ./environments/my_env
|
|
@@ -164,6 +237,8 @@ prime eval run primeintellect/math-python
|
|
|
164
237
|
|
|
165
238
|
**[Environments](docs/environments.md)** — Create datasets, rubrics, and custom multi-turn interaction protocols.
|
|
166
239
|
|
|
240
|
+
**[BYO Harness](docs/byo-harness.md)** — Build v1 Taskset/Harness environments with custom tools, sandboxes, users, and custom programs.
|
|
241
|
+
|
|
167
242
|
**[Evaluation](docs/evaluation.md)** — Evaluate models using your environments.
|
|
168
243
|
|
|
169
244
|
**[Training](docs/training.md)** — Train models in your environments with reinforcement learning.
|
|
@@ -38,7 +38,7 @@ dependencies = [
|
|
|
38
38
|
"openai>=1.108.1",
|
|
39
39
|
"openai-agents>=0.0.7",
|
|
40
40
|
"prime-tunnel>=0.1.6",
|
|
41
|
-
"prime-sandboxes>=0.2.
|
|
41
|
+
"prime-sandboxes>=0.2.25",
|
|
42
42
|
"pydantic>=2.11.9",
|
|
43
43
|
"requests",
|
|
44
44
|
"rich",
|
|
@@ -46,13 +46,13 @@ dependencies = [
|
|
|
46
46
|
"textual",
|
|
47
47
|
"tomli; python_version < '3.11'",
|
|
48
48
|
"typing_extensions; python_version < '3.12'",
|
|
49
|
-
"wget>=3.2",
|
|
50
49
|
"gepa",
|
|
51
50
|
"pyzmq>=27.1.0",
|
|
52
51
|
"msgpack>=1.1.2",
|
|
53
52
|
"aiolimiter>=1.2.1",
|
|
54
53
|
"setproctitle>=1.3.0",
|
|
55
|
-
"regex<2026.4.4",
|
|
54
|
+
"regex<2026.4.4",
|
|
55
|
+
"httpx>=0.27.0",
|
|
56
56
|
]
|
|
57
57
|
|
|
58
58
|
[dependency-groups]
|
|
@@ -73,6 +73,7 @@ dev = [
|
|
|
73
73
|
"aiohttp>=3.9.0",
|
|
74
74
|
"python-dotenv>=1.0.0",
|
|
75
75
|
"nltk",
|
|
76
|
+
"renderers>=0.1.6",
|
|
76
77
|
]
|
|
77
78
|
|
|
78
79
|
[project.optional-dependencies]
|
|
@@ -91,6 +92,9 @@ browser = [
|
|
|
91
92
|
"aiohttp>=3.9.0",
|
|
92
93
|
"python-dotenv>=1.0.0",
|
|
93
94
|
]
|
|
95
|
+
renderers = [
|
|
96
|
+
"renderers>=0.1.6",
|
|
97
|
+
]
|
|
94
98
|
rl = [
|
|
95
99
|
"torch>=2.8.0,<2.9.0",
|
|
96
100
|
"transformers>=4.56.2",
|
|
@@ -108,6 +112,24 @@ rl = [
|
|
|
108
112
|
preview = true
|
|
109
113
|
required-version = ">=0.11.1"
|
|
110
114
|
|
|
115
|
+
[[tool.uv.index]]
|
|
116
|
+
name = "pypi"
|
|
117
|
+
url = "https://pypi.org/simple"
|
|
118
|
+
default = true
|
|
119
|
+
exclude-newer = "7 days"
|
|
120
|
+
|
|
121
|
+
[tool.uv.exclude-newer-package]
|
|
122
|
+
# PrimeIntellect-published on PyPI (trusted publisher)
|
|
123
|
+
prime-tunnel = false
|
|
124
|
+
prime-sandboxes = false
|
|
125
|
+
renderers = false
|
|
126
|
+
|
|
127
|
+
[tool.uv.sources]
|
|
128
|
+
# Pinned to renderers main until the next PyPI release lands; drop after.
|
|
129
|
+
# fe67f9f = renderers main: PR #4 squash-merge — construction-time
|
|
130
|
+
# preserve_*_thinking flags on create_renderer / create_renderer_pool.
|
|
131
|
+
renderers = { git = "https://github.com/PrimeIntellect-ai/renderers.git", rev = "fe67f9f" }
|
|
132
|
+
|
|
111
133
|
[tool.uv.extra-build-dependencies]
|
|
112
134
|
flash-attn = [{ requirement = "torch", match-runtime = true }]
|
|
113
135
|
|
|
@@ -125,7 +147,6 @@ vf-rl = "verifiers.scripts.rl:main"
|
|
|
125
147
|
vf-train = "verifiers.scripts.train:main"
|
|
126
148
|
vf-tui = "verifiers.scripts.tui:main"
|
|
127
149
|
vf-vllm = "verifiers.scripts.vllm:main"
|
|
128
|
-
prime-rl = "verifiers.scripts.prime_rl:main"
|
|
129
150
|
|
|
130
151
|
# hatchling configuration
|
|
131
152
|
[tool.hatch.version]
|
|
@@ -170,6 +191,7 @@ addopts = [
|
|
|
170
191
|
markers = [
|
|
171
192
|
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
|
|
172
193
|
"integration: marks tests as integration tests",
|
|
194
|
+
"prime_sandbox: marks tests that provision real Prime sandbox or tunnel resources",
|
|
173
195
|
"unit: marks tests as unit tests",
|
|
174
196
|
"asyncio: marks tests as async tests",
|
|
175
197
|
"parsers: marks tests for parser components",
|
|
@@ -195,7 +217,7 @@ unknown-argument = "warn"
|
|
|
195
217
|
redundant-cast = "ignore"
|
|
196
218
|
|
|
197
219
|
[tool.ty.src]
|
|
198
|
-
exclude = ["environments"]
|
|
220
|
+
exclude = ["environments", "verifiers/v1/sketch.py"]
|
|
199
221
|
|
|
200
222
|
[[tool.ty.overrides]]
|
|
201
223
|
include = ["verifiers/envs/experimental/composable/tasksets/**"]
|
|
@@ -425,9 +425,10 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
|
|
|
425
425
|
super().__init__(tools=[offset_tool], **kwargs)
|
|
426
426
|
|
|
427
427
|
async def setup_state(self, state, **kwargs):
|
|
428
|
-
await super().setup_state(state, **kwargs)
|
|
428
|
+
state = await super().setup_state(state, **kwargs)
|
|
429
429
|
state["offset"] = 3
|
|
430
430
|
state["update_calls"] = 0
|
|
431
|
+
return state
|
|
431
432
|
|
|
432
433
|
def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
|
|
433
434
|
state["update_calls"] += 1
|
|
@@ -457,13 +458,15 @@ def make_input() -> Callable[..., RolloutInput]:
|
|
|
457
458
|
|
|
458
459
|
def _make_input(
|
|
459
460
|
example_id: int = 0,
|
|
460
|
-
task: str = "default",
|
|
461
461
|
prompt: Messages = DEFAULT_PROMPT,
|
|
462
462
|
info: Info = {},
|
|
463
463
|
answer: str = "4",
|
|
464
464
|
) -> RolloutInput:
|
|
465
465
|
return RolloutInput(
|
|
466
|
-
example_id=example_id,
|
|
466
|
+
example_id=example_id,
|
|
467
|
+
prompt=prompt,
|
|
468
|
+
answer=answer,
|
|
469
|
+
info=info,
|
|
467
470
|
)
|
|
468
471
|
|
|
469
472
|
return _make_input
|
|
@@ -475,7 +478,6 @@ def make_state() -> Callable[..., State]:
|
|
|
475
478
|
|
|
476
479
|
def _make_state(
|
|
477
480
|
example_id: int = 0,
|
|
478
|
-
task: str = "default",
|
|
479
481
|
prompt: Messages = DEFAULT_PROMPT,
|
|
480
482
|
answer: str = "4",
|
|
481
483
|
info: Info = {},
|
|
@@ -487,17 +489,12 @@ def make_state() -> Callable[..., State]:
|
|
|
487
489
|
stop_condition: str | None = "max_turns_reached",
|
|
488
490
|
tool_defs: list[Tool] | None = None,
|
|
489
491
|
trajectory: list[TrajectoryStep] = [],
|
|
490
|
-
timing=RolloutTiming(
|
|
491
|
-
generation_ms=0.0,
|
|
492
|
-
scoring_ms=0.0,
|
|
493
|
-
total_ms=0.0,
|
|
494
|
-
),
|
|
492
|
+
timing=RolloutTiming(),
|
|
495
493
|
foo: str = "bar", # custom field
|
|
496
494
|
**kwargs,
|
|
497
495
|
) -> State:
|
|
498
496
|
return State(
|
|
499
497
|
example_id=example_id,
|
|
500
|
-
task=task,
|
|
501
498
|
prompt=prompt,
|
|
502
499
|
answer=answer,
|
|
503
500
|
info=info,
|
|
@@ -550,7 +547,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
|
|
|
550
547
|
rollouts_per_example: int = 1,
|
|
551
548
|
sampling_args: SamplingArgs = {},
|
|
552
549
|
date: str = "1970-01-01",
|
|
553
|
-
|
|
550
|
+
time: float = 0.0,
|
|
554
551
|
avg_reward: float = 0.0,
|
|
555
552
|
avg_metrics: dict[str, float] = {},
|
|
556
553
|
pass_at_k: dict[str, float] = {},
|
|
@@ -578,7 +575,7 @@ def make_metadata() -> Callable[..., GenerateMetadata]:
|
|
|
578
575
|
rollouts_per_example=rollouts_per_example,
|
|
579
576
|
sampling_args=sampling_args,
|
|
580
577
|
date=date,
|
|
581
|
-
|
|
578
|
+
time=time,
|
|
582
579
|
avg_reward=avg_reward,
|
|
583
580
|
avg_metrics=avg_metrics,
|
|
584
581
|
pass_at_k=pass_at_k,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Tests for CliAgentEnv and HarborEnv."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import tempfile
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
@@ -8,6 +9,7 @@ import pytest
|
|
|
8
9
|
from datasets import Dataset
|
|
9
10
|
|
|
10
11
|
import verifiers as vf
|
|
12
|
+
from verifiers.utils.interception_utils import serialize_intercept_response
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
@pytest.fixture
|
|
@@ -100,6 +102,11 @@ class TestCliAgentEnv:
|
|
|
100
102
|
env_vars = await env.build_env_vars(state)
|
|
101
103
|
|
|
102
104
|
assert env_vars["OPENAI_BASE_URL"] == "https://test.trycloudflare.com/v1"
|
|
105
|
+
assert env_vars["OPENAI_API_KEY"] == env._require_interception_server().secret
|
|
106
|
+
assert env_vars["ANTHROPIC_BASE_URL"] == "https://test.trycloudflare.com"
|
|
107
|
+
assert (
|
|
108
|
+
env_vars["ANTHROPIC_API_KEY"] == env._require_interception_server().secret
|
|
109
|
+
)
|
|
103
110
|
assert env_vars["OPENAI_MODEL"] == "gpt-4"
|
|
104
111
|
assert env_vars["CUSTOM_VAR"] == "value"
|
|
105
112
|
|
|
@@ -217,6 +224,152 @@ class TestCliAgentEnv:
|
|
|
217
224
|
assert kwargs["tools"][0].name == "echo"
|
|
218
225
|
|
|
219
226
|
|
|
227
|
+
@pytest.mark.asyncio
|
|
228
|
+
async def test_cli_agent_env_delivers_intercepted_tool_call_response(
|
|
229
|
+
sample_dataset, mock_client
|
|
230
|
+
):
|
|
231
|
+
env = vf.CliAgentEnv(
|
|
232
|
+
run_command="python agent.py",
|
|
233
|
+
dataset=sample_dataset,
|
|
234
|
+
rubric=vf.Rubric(),
|
|
235
|
+
)
|
|
236
|
+
prompt = sample_dataset[0]["prompt"]
|
|
237
|
+
tool_call = {
|
|
238
|
+
"id": "call_echo",
|
|
239
|
+
"type": "function",
|
|
240
|
+
"function": {"name": "echo", "arguments": '{"text": "hello"}'},
|
|
241
|
+
}
|
|
242
|
+
mock_client.add_response(
|
|
243
|
+
prompt,
|
|
244
|
+
"",
|
|
245
|
+
finish_reason="tool_calls",
|
|
246
|
+
tool_calls=[tool_call],
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
state = await env.init_state(
|
|
250
|
+
input=sample_dataset[0],
|
|
251
|
+
client=mock_client,
|
|
252
|
+
model="test-model",
|
|
253
|
+
)
|
|
254
|
+
response_future = asyncio.Future()
|
|
255
|
+
request_id = "req-tool-call"
|
|
256
|
+
state["current_request_id"] = request_id
|
|
257
|
+
env._interception_server.intercepts[request_id] = {
|
|
258
|
+
"stream": False,
|
|
259
|
+
"tools": [
|
|
260
|
+
{
|
|
261
|
+
"type": "function",
|
|
262
|
+
"function": {
|
|
263
|
+
"name": "echo",
|
|
264
|
+
"description": "Return the provided text.",
|
|
265
|
+
"parameters": {
|
|
266
|
+
"type": "object",
|
|
267
|
+
"properties": {"text": {"type": "string"}},
|
|
268
|
+
},
|
|
269
|
+
},
|
|
270
|
+
}
|
|
271
|
+
],
|
|
272
|
+
"response_future": response_future,
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
response = await env.get_model_response(
|
|
276
|
+
state=state,
|
|
277
|
+
prompt=prompt,
|
|
278
|
+
client=mock_client,
|
|
279
|
+
model="test-model",
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
assert response_future.done()
|
|
283
|
+
assert response_future.result() is response
|
|
284
|
+
assert state["current_request_id"] is None
|
|
285
|
+
|
|
286
|
+
payload = serialize_intercept_response(response_future.result())
|
|
287
|
+
choice = payload["choices"][0]
|
|
288
|
+
assert choice["finish_reason"] == "tool_calls"
|
|
289
|
+
assert choice["message"]["tool_calls"] == [tool_call]
|
|
290
|
+
assert mock_client.last_call_kwargs["tools"][0].name == "echo"
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
@pytest.mark.asyncio
|
|
294
|
+
async def test_cli_agent_env_synthesizes_stream_for_intercepted_tool_call_response(
|
|
295
|
+
sample_dataset, mock_client
|
|
296
|
+
):
|
|
297
|
+
env = vf.CliAgentEnv(
|
|
298
|
+
run_command="python agent.py",
|
|
299
|
+
dataset=sample_dataset,
|
|
300
|
+
rubric=vf.Rubric(),
|
|
301
|
+
)
|
|
302
|
+
prompt = sample_dataset[0]["prompt"]
|
|
303
|
+
tool_call = {
|
|
304
|
+
"id": "call_echo",
|
|
305
|
+
"type": "function",
|
|
306
|
+
"function": {"name": "echo", "arguments": '{"text": "hello"}'},
|
|
307
|
+
}
|
|
308
|
+
mock_client.add_response(
|
|
309
|
+
prompt,
|
|
310
|
+
"",
|
|
311
|
+
finish_reason="tool_calls",
|
|
312
|
+
tool_calls=[tool_call],
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
state = await env.init_state(
|
|
316
|
+
input=sample_dataset[0],
|
|
317
|
+
client=mock_client,
|
|
318
|
+
model="test-model",
|
|
319
|
+
)
|
|
320
|
+
chunk_queue = asyncio.Queue()
|
|
321
|
+
response_future = asyncio.Future()
|
|
322
|
+
request_id = "req-stream-tool-call"
|
|
323
|
+
state["current_request_id"] = request_id
|
|
324
|
+
env._interception_server.intercepts[request_id] = {
|
|
325
|
+
"stream": True,
|
|
326
|
+
"tools": [
|
|
327
|
+
{
|
|
328
|
+
"type": "function",
|
|
329
|
+
"function": {
|
|
330
|
+
"name": "echo",
|
|
331
|
+
"description": "Return the provided text.",
|
|
332
|
+
"parameters": {
|
|
333
|
+
"type": "object",
|
|
334
|
+
"properties": {"text": {"type": "string"}},
|
|
335
|
+
},
|
|
336
|
+
},
|
|
337
|
+
}
|
|
338
|
+
],
|
|
339
|
+
"chunk_queue": chunk_queue,
|
|
340
|
+
"response_future": response_future,
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
response = await env.get_model_response(
|
|
344
|
+
state=state,
|
|
345
|
+
prompt=prompt,
|
|
346
|
+
client=mock_client,
|
|
347
|
+
model="test-model",
|
|
348
|
+
)
|
|
349
|
+
|
|
350
|
+
chunks = []
|
|
351
|
+
while True:
|
|
352
|
+
chunk = await asyncio.wait_for(chunk_queue.get(), timeout=1.0)
|
|
353
|
+
if chunk is None:
|
|
354
|
+
break
|
|
355
|
+
chunks.append(chunk)
|
|
356
|
+
|
|
357
|
+
assert response_future.done()
|
|
358
|
+
assert response_future.result() is response
|
|
359
|
+
assert state["current_request_id"] is None
|
|
360
|
+
|
|
361
|
+
assert chunks[0]["object"] == "chat.completion.chunk"
|
|
362
|
+
assert chunks[0]["choices"][0]["delta"]["tool_calls"][0]["id"] == "call_echo"
|
|
363
|
+
assert (
|
|
364
|
+
chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["name"] == "echo"
|
|
365
|
+
)
|
|
366
|
+
assert (
|
|
367
|
+
chunks[0]["choices"][0]["delta"]["tool_calls"][0]["function"]["arguments"]
|
|
368
|
+
== '{"text": "hello"}'
|
|
369
|
+
)
|
|
370
|
+
assert chunks[-1]["choices"][0]["finish_reason"] == "tool_calls"
|
|
371
|
+
|
|
372
|
+
|
|
220
373
|
class TestHarborEnv:
|
|
221
374
|
"""Tests for HarborEnv."""
|
|
222
375
|
|
|
@@ -244,7 +397,7 @@ class TestHarborEnv:
|
|
|
244
397
|
dataset_path=harbor_task_dir,
|
|
245
398
|
)
|
|
246
399
|
assert len(env.dataset) == 1
|
|
247
|
-
assert env.dataset[0]["
|
|
400
|
+
assert env.dataset[0]["info"]["task_name"] == "test_task"
|
|
248
401
|
|
|
249
402
|
def test_init_filters_tasks(self, harbor_task_dir):
|
|
250
403
|
"""Test that HarborEnv can filter tasks by name."""
|
|
@@ -260,7 +413,7 @@ class TestHarborEnv:
|
|
|
260
413
|
tasks=["test_task"],
|
|
261
414
|
)
|
|
262
415
|
assert len(env.dataset) == 1
|
|
263
|
-
assert env.dataset[0]["
|
|
416
|
+
assert env.dataset[0]["info"]["task_name"] == "test_task"
|
|
264
417
|
|
|
265
418
|
def test_init_raises_on_empty_dataset(self):
|
|
266
419
|
"""Test that HarborEnv raises when no valid tasks found."""
|
|
@@ -314,7 +467,7 @@ class TestHarborEnv:
|
|
|
314
467
|
)
|
|
315
468
|
state = {
|
|
316
469
|
"interception_base_url": "https://test.trycloudflare.com/v1",
|
|
317
|
-
"
|
|
470
|
+
"info": {"task_name": "my_task"},
|
|
318
471
|
}
|
|
319
472
|
env_vars = await env.build_env_vars(state)
|
|
320
473
|
|
|
@@ -251,7 +251,7 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
|
|
|
251
251
|
teardown=lambda: None,
|
|
252
252
|
)
|
|
253
253
|
|
|
254
|
-
state = {"sandbox_id": "sbx", "timing": {"
|
|
254
|
+
state = {"sandbox_id": "sbx", "timing": {"total": 0}}
|
|
255
255
|
|
|
256
256
|
await env.post_rollout(state)
|
|
257
257
|
|
|
@@ -594,7 +594,7 @@ async def test_composable_env_collects_harness_metrics():
|
|
|
594
594
|
state = {
|
|
595
595
|
"sandbox_id": "sbx",
|
|
596
596
|
"info": {"id": 0},
|
|
597
|
-
"timing": {"
|
|
597
|
+
"timing": {"total": 0},
|
|
598
598
|
"trajectory": [],
|
|
599
599
|
}
|
|
600
600
|
|
|
@@ -633,7 +633,7 @@ async def test_composable_env_metrics_with_key_whitelist():
|
|
|
633
633
|
state = {
|
|
634
634
|
"sandbox_id": "sbx",
|
|
635
635
|
"info": {"id": 0},
|
|
636
|
-
"timing": {"
|
|
636
|
+
"timing": {"total": 0},
|
|
637
637
|
"trajectory": [],
|
|
638
638
|
}
|
|
639
639
|
|
|
@@ -659,7 +659,7 @@ async def test_composable_env_no_metrics_when_path_not_set():
|
|
|
659
659
|
state = {
|
|
660
660
|
"sandbox_id": "sbx",
|
|
661
661
|
"info": {"id": 0},
|
|
662
|
-
"timing": {"
|
|
662
|
+
"timing": {"total": 0},
|
|
663
663
|
"trajectory": [],
|
|
664
664
|
}
|
|
665
665
|
|