verifiers 0.1.15.dev5__tar.gz → 0.1.15.dev6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/PKG-INFO +14 -8
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/README.md +13 -7
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_langchain_deep_agents_wikispeedia.py +74 -19
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_mcp_search_env.py +5 -3
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_opencode_harbor.py +2 -2
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_save_utils.py +4 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_bfcl.py +18 -10
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_config_extension.py +181 -29
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_group_reward_env.py +8 -3
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_rlm_swe.py +3 -3
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/__init__.py +1 -1
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/openai_chat_completions_client.py +3 -24
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/openai_completions_client.py +5 -2
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/environment.py +4 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/init.py +77 -15
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/types.py +13 -8
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/types.py +2 -2
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/response_utils.py +29 -3
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/save_utils.py +1 -3
- verifiers-0.1.15.dev6/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +252 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/README.md +21 -37
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/RE_MIGRATION.md +4 -4
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/config.py +66 -27
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/config_utils.py +24 -1
- verifiers-0.1.15.dev5/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -73
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/.gitignore +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/LICENSE +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/pyproject.toml +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/README.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_envs.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_eval_cli.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_eval_display.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_eval_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_imports.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_lean_task.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_openenv_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_renderer_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_renderer_e2e.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_example_counts.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_harbor_cli.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_runtime_lifecycle.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_scoring_functions.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_v1_taskset_bindings.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/clients/renderer_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/eval.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/eval_display.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/eval_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/env.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/harness.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/command.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/configs.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/pi.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/harnesses/terminus_2.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/packages/tasksets/harbor.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/runtime.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/taskset.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/toolset.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/types.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/user.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/artifact_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/binding_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/config_callable_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/endpoint_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/object_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/program_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/prompt_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/sandbox_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/scoring_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/taskset_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/timing_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev5
|
|
3
|
+
Version: 0.1.15.dev6
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -174,6 +174,10 @@ Environments built with Verifiers are self-contained Python modules. To initiali
|
|
|
174
174
|
```bash
|
|
175
175
|
prime env init my-env # creates a new template in ./environments/my_env
|
|
176
176
|
```
|
|
177
|
+
Add an explicit harness loader when the environment owns harness behavior:
|
|
178
|
+
```bash
|
|
179
|
+
prime env init my-env --with-harness
|
|
180
|
+
```
|
|
177
181
|
For OpenEnv integration, use:
|
|
178
182
|
```bash
|
|
179
183
|
prime env init my-openenv --openenv
|
|
@@ -191,7 +195,9 @@ environments/my_env/
|
|
|
191
195
|
└── README.md # Documentation
|
|
192
196
|
```
|
|
193
197
|
|
|
194
|
-
Environment modules should expose a `load_environment` function which returns an
|
|
198
|
+
Environment modules should expose a `load_environment` function which returns an
|
|
199
|
+
environment object. For simple legacy environments, this can still be a direct
|
|
200
|
+
constructor:
|
|
195
201
|
```python
|
|
196
202
|
# my_env.py
|
|
197
203
|
import verifiers as vf
|
|
@@ -223,7 +229,7 @@ def source():
|
|
|
223
229
|
async def contains_answer(task, state) -> float:
|
|
224
230
|
return float(task["answer"] in str(state.get("completion") or ""))
|
|
225
231
|
|
|
226
|
-
def load_taskset(config: vf.TasksetConfig
|
|
232
|
+
def load_taskset(config: vf.TasksetConfig):
|
|
227
233
|
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
228
234
|
|
|
229
235
|
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
@@ -244,8 +250,8 @@ env = vf.Env(
|
|
|
244
250
|
```
|
|
245
251
|
|
|
246
252
|
The same environment package is the unit used by evals and `prime-rl`. The
|
|
247
|
-
trainer owns model, endpoint, sampling, and rollout count; v1-specific
|
|
248
|
-
|
|
253
|
+
trainer owns model, endpoint, sampling, and rollout count; v1-specific options
|
|
254
|
+
stay on the taskset or harness config that owns them:
|
|
249
255
|
|
|
250
256
|
```toml
|
|
251
257
|
# configs/rl/my-v1-env.toml
|
|
@@ -260,12 +266,12 @@ max_tokens = 4096
|
|
|
260
266
|
[[env]]
|
|
261
267
|
id = "my-env"
|
|
262
268
|
|
|
263
|
-
[env.args]
|
|
264
|
-
arg1 = "non-th-arg"
|
|
265
|
-
|
|
266
269
|
[env.harness]
|
|
267
270
|
max_turns = 1
|
|
268
271
|
|
|
272
|
+
[env.taskset]
|
|
273
|
+
split = "train"
|
|
274
|
+
|
|
269
275
|
[env.taskset.scoring.contains_answer]
|
|
270
276
|
weight = 1.0
|
|
271
277
|
```
|
|
@@ -99,6 +99,10 @@ Environments built with Verifiers are self-contained Python modules. To initiali
|
|
|
99
99
|
```bash
|
|
100
100
|
prime env init my-env # creates a new template in ./environments/my_env
|
|
101
101
|
```
|
|
102
|
+
Add an explicit harness loader when the environment owns harness behavior:
|
|
103
|
+
```bash
|
|
104
|
+
prime env init my-env --with-harness
|
|
105
|
+
```
|
|
102
106
|
For OpenEnv integration, use:
|
|
103
107
|
```bash
|
|
104
108
|
prime env init my-openenv --openenv
|
|
@@ -116,7 +120,9 @@ environments/my_env/
|
|
|
116
120
|
└── README.md # Documentation
|
|
117
121
|
```
|
|
118
122
|
|
|
119
|
-
Environment modules should expose a `load_environment` function which returns an
|
|
123
|
+
Environment modules should expose a `load_environment` function which returns an
|
|
124
|
+
environment object. For simple legacy environments, this can still be a direct
|
|
125
|
+
constructor:
|
|
120
126
|
```python
|
|
121
127
|
# my_env.py
|
|
122
128
|
import verifiers as vf
|
|
@@ -148,7 +154,7 @@ def source():
|
|
|
148
154
|
async def contains_answer(task, state) -> float:
|
|
149
155
|
return float(task["answer"] in str(state.get("completion") or ""))
|
|
150
156
|
|
|
151
|
-
def load_taskset(config: vf.TasksetConfig
|
|
157
|
+
def load_taskset(config: vf.TasksetConfig):
|
|
152
158
|
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
153
159
|
|
|
154
160
|
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
@@ -169,8 +175,8 @@ env = vf.Env(
|
|
|
169
175
|
```
|
|
170
176
|
|
|
171
177
|
The same environment package is the unit used by evals and `prime-rl`. The
|
|
172
|
-
trainer owns model, endpoint, sampling, and rollout count; v1-specific
|
|
173
|
-
|
|
178
|
+
trainer owns model, endpoint, sampling, and rollout count; v1-specific options
|
|
179
|
+
stay on the taskset or harness config that owns them:
|
|
174
180
|
|
|
175
181
|
```toml
|
|
176
182
|
# configs/rl/my-v1-env.toml
|
|
@@ -185,12 +191,12 @@ max_tokens = 4096
|
|
|
185
191
|
[[env]]
|
|
186
192
|
id = "my-env"
|
|
187
193
|
|
|
188
|
-
[env.args]
|
|
189
|
-
arg1 = "non-th-arg"
|
|
190
|
-
|
|
191
194
|
[env.harness]
|
|
192
195
|
max_turns = 1
|
|
193
196
|
|
|
197
|
+
[env.taskset]
|
|
198
|
+
split = "train"
|
|
199
|
+
|
|
194
200
|
[env.taskset.scoring.contains_answer]
|
|
195
201
|
weight = 1.0
|
|
196
202
|
```
|
{verifiers-0.1.15.dev5 → verifiers-0.1.15.dev6}/tests/test_langchain_deep_agents_wikispeedia.py
RENAMED
|
@@ -57,7 +57,7 @@ def test_wikispeedia_loads_as_v1_taskset_harness(
|
|
|
57
57
|
) -> None:
|
|
58
58
|
module = load_module(monkeypatch)
|
|
59
59
|
|
|
60
|
-
env = module.load_environment(config=
|
|
60
|
+
env = module.load_environment(config=module.WikispeediaEnvConfig())
|
|
61
61
|
|
|
62
62
|
assert isinstance(env, vf.Env)
|
|
63
63
|
assert isinstance(env.taskset, vf.Taskset)
|
|
@@ -65,6 +65,43 @@ def test_wikispeedia_loads_as_v1_taskset_harness(
|
|
|
65
65
|
assert env.taskset.taskset_id == "langchain-deep-agents-wikispeedia"
|
|
66
66
|
|
|
67
67
|
|
|
68
|
+
def test_wikispeedia_env_config_reaches_taskset_and_harness(
|
|
69
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
70
|
+
) -> None:
|
|
71
|
+
module = load_module(monkeypatch)
|
|
72
|
+
wiki = make_small_wiki(module)
|
|
73
|
+
monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
|
|
74
|
+
|
|
75
|
+
env = module.load_environment(
|
|
76
|
+
config=module.WikispeediaEnvConfig(
|
|
77
|
+
taskset={
|
|
78
|
+
"train_size": 2,
|
|
79
|
+
"eval_size": 1,
|
|
80
|
+
"min_path_length": 1,
|
|
81
|
+
"max_path_length": 1,
|
|
82
|
+
"eval_target_fraction": 0.5,
|
|
83
|
+
"allow_go_back": False,
|
|
84
|
+
"links_only": True,
|
|
85
|
+
"max_turns": 7,
|
|
86
|
+
},
|
|
87
|
+
harness={
|
|
88
|
+
"max_turns": 8,
|
|
89
|
+
"timeout_seconds": 9.0,
|
|
90
|
+
},
|
|
91
|
+
)
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
train_rows = list(env.taskset.source())
|
|
95
|
+
eval_rows = list(env.taskset.eval_source())
|
|
96
|
+
|
|
97
|
+
assert len(train_rows) == 2
|
|
98
|
+
assert len(eval_rows) == 1
|
|
99
|
+
assert train_rows[0]["max_turns"] == 7
|
|
100
|
+
assert env.harness.config.max_turns == 8
|
|
101
|
+
assert env.harness.config.timeout_seconds == 9.0
|
|
102
|
+
assert [tool.__name__ for tool in env.taskset.toolsets[0].tools] == ["click_link"]
|
|
103
|
+
|
|
104
|
+
|
|
68
105
|
def test_wikispeedia_rows_use_v1_task_shape(
|
|
69
106
|
monkeypatch: pytest.MonkeyPatch,
|
|
70
107
|
) -> None:
|
|
@@ -90,11 +127,13 @@ def test_wikispeedia_taskset_sources_use_disjoint_target_split(
|
|
|
90
127
|
wiki = make_small_wiki(module)
|
|
91
128
|
monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
|
|
92
129
|
taskset = module.load_taskset(
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
130
|
+
config=module.WikispeediaTasksetConfig(
|
|
131
|
+
train_size=2,
|
|
132
|
+
eval_size=1,
|
|
133
|
+
min_path_length=1,
|
|
134
|
+
max_path_length=1,
|
|
135
|
+
eval_target_fraction=0.5,
|
|
136
|
+
)
|
|
98
137
|
)
|
|
99
138
|
|
|
100
139
|
train_rows = list(taskset.source())
|
|
@@ -114,8 +153,12 @@ def test_wikispeedia_efficiency_weight_uses_fresh_reward_wrapper(
|
|
|
114
153
|
wiki = make_small_wiki(module)
|
|
115
154
|
monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
|
|
116
155
|
|
|
117
|
-
weighted = module.load_taskset(
|
|
118
|
-
|
|
156
|
+
weighted = module.load_taskset(
|
|
157
|
+
config=module.WikispeediaTasksetConfig(efficiency_weight=0.5)
|
|
158
|
+
)
|
|
159
|
+
plain = module.load_taskset(
|
|
160
|
+
config=module.WikispeediaTasksetConfig(efficiency_weight=0.0)
|
|
161
|
+
)
|
|
119
162
|
|
|
120
163
|
assert any(fn.__name__ == "path_efficiency" for fn in weighted.rewards)
|
|
121
164
|
assert any(fn is module.path_efficiency for fn in plain.metrics)
|
|
@@ -127,13 +170,17 @@ def test_wikispeedia_taskset_owns_navigation_tools(
|
|
|
127
170
|
) -> None:
|
|
128
171
|
module = load_module(monkeypatch)
|
|
129
172
|
|
|
130
|
-
taskset = module.load_taskset(
|
|
173
|
+
taskset = module.load_taskset(
|
|
174
|
+
config=module.WikispeediaTasksetConfig(allow_go_back=True)
|
|
175
|
+
)
|
|
131
176
|
names = [tool.__name__ for tool in taskset.toolsets[0].tools]
|
|
132
|
-
no_back = module.load_taskset(
|
|
177
|
+
no_back = module.load_taskset(
|
|
178
|
+
config=module.WikispeediaTasksetConfig(allow_go_back=False)
|
|
179
|
+
)
|
|
133
180
|
|
|
134
181
|
assert names == ["click_link", "go_back"]
|
|
135
182
|
assert [tool.__name__ for tool in no_back.toolsets[0].tools] == ["click_link"]
|
|
136
|
-
assert module.load_harness().toolsets == []
|
|
183
|
+
assert module.load_harness(config=module.WikispeediaHarnessConfig()).toolsets == []
|
|
137
184
|
|
|
138
185
|
|
|
139
186
|
def test_wikispeedia_system_prompt_matches_available_tools(
|
|
@@ -141,8 +188,12 @@ def test_wikispeedia_system_prompt_matches_available_tools(
|
|
|
141
188
|
) -> None:
|
|
142
189
|
module = load_module(monkeypatch)
|
|
143
190
|
|
|
144
|
-
with_back = module.load_taskset(
|
|
145
|
-
|
|
191
|
+
with_back = module.load_taskset(
|
|
192
|
+
config=module.WikispeediaTasksetConfig(allow_go_back=True)
|
|
193
|
+
)
|
|
194
|
+
without_back = module.load_taskset(
|
|
195
|
+
config=module.WikispeediaTasksetConfig(allow_go_back=False)
|
|
196
|
+
)
|
|
146
197
|
|
|
147
198
|
assert "go_back" in with_back.system_prompt[0]["content"]
|
|
148
199
|
assert "go_back" not in without_back.system_prompt[0]["content"]
|
|
@@ -156,12 +207,16 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
|
|
|
156
207
|
module = load_module(monkeypatch)
|
|
157
208
|
wiki = make_small_wiki(module)
|
|
158
209
|
monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
|
|
159
|
-
env =
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
210
|
+
env = vf.Env(
|
|
211
|
+
taskset=module.load_taskset(
|
|
212
|
+
config=module.WikispeediaTasksetConfig(
|
|
213
|
+
train_size=2,
|
|
214
|
+
eval_size=1,
|
|
215
|
+
min_path_length=1,
|
|
216
|
+
max_path_length=1,
|
|
217
|
+
)
|
|
218
|
+
),
|
|
219
|
+
harness=module.load_harness(config=module.WikispeediaHarnessConfig()),
|
|
165
220
|
)
|
|
166
221
|
task = module.vf.Task(list(env.taskset.source())[0]).freeze()
|
|
167
222
|
state = module.vf.State.for_task(task)
|
|
@@ -26,7 +26,9 @@ def _load_mcp_search_module() -> Any:
|
|
|
26
26
|
def test_mcp_search_env_is_v1_only() -> None:
|
|
27
27
|
module = _load_mcp_search_module()
|
|
28
28
|
|
|
29
|
-
env = module.load_environment(
|
|
29
|
+
env = module.load_environment(
|
|
30
|
+
config=module.MCPSearchEnvConfig(taskset={"max_turns": 4})
|
|
31
|
+
)
|
|
30
32
|
|
|
31
33
|
assert isinstance(env, vf.Env)
|
|
32
34
|
assert isinstance(env.taskset, vf.Taskset)
|
|
@@ -40,7 +42,7 @@ def test_mcp_search_env_is_v1_only() -> None:
|
|
|
40
42
|
def test_mcp_search_default_taskset_has_stable_non_doc_fixture() -> None:
|
|
41
43
|
module = _load_mcp_search_module()
|
|
42
44
|
|
|
43
|
-
rows = module.load_taskset().rows()
|
|
45
|
+
rows = module.load_taskset(config=module.MCPSearchTasksetConfig()).rows()
|
|
44
46
|
|
|
45
47
|
assert len(rows) >= 10
|
|
46
48
|
assert len({row["answer"] for row in rows}) == len(rows)
|
|
@@ -52,7 +54,7 @@ def test_mcp_search_taskset_accepts_v1_taskset_config() -> None:
|
|
|
52
54
|
module = _load_mcp_search_module()
|
|
53
55
|
|
|
54
56
|
env = module.load_environment(
|
|
55
|
-
config=
|
|
57
|
+
config=module.MCPSearchEnvConfig(taskset={"max_turns": 3}),
|
|
56
58
|
)
|
|
57
59
|
rows = env.taskset.rows()
|
|
58
60
|
|
|
@@ -28,7 +28,7 @@ def _load_opencode_module() -> Any:
|
|
|
28
28
|
def test_load_environment_uses_v1_taskset_and_harness() -> None:
|
|
29
29
|
module = _load_opencode_module()
|
|
30
30
|
|
|
31
|
-
env = module.load_environment(config=
|
|
31
|
+
env = module.load_environment(config=module.OpenCodeHarborEnvConfig())
|
|
32
32
|
|
|
33
33
|
assert isinstance(env, vf.Env)
|
|
34
34
|
assert isinstance(env.taskset, vf.HarborTaskset)
|
|
@@ -52,7 +52,7 @@ def test_load_environment_accepts_v1_taskset_and_harness_config() -> None:
|
|
|
52
52
|
module = _load_opencode_module()
|
|
53
53
|
|
|
54
54
|
env = module.load_environment(
|
|
55
|
-
config=
|
|
55
|
+
config=module.OpenCodeHarborEnvConfig(
|
|
56
56
|
taskset={
|
|
57
57
|
"task_names": ["task-a"],
|
|
58
58
|
"cpu_cores": 1.5,
|
|
@@ -32,6 +32,7 @@ from verifiers.utils.save_utils import (
|
|
|
32
32
|
make_serializable,
|
|
33
33
|
save_new_outputs,
|
|
34
34
|
states_to_outputs,
|
|
35
|
+
truncate_malformed_trailing_line,
|
|
35
36
|
validate_resume_metadata,
|
|
36
37
|
)
|
|
37
38
|
from verifiers.utils.usage_utils import StateUsageTracker, response_usage_tokens
|
|
@@ -488,6 +489,9 @@ class TestSaveNewOutputs:
|
|
|
488
489
|
"\n".join(lines + [malformed_trailing_line]), encoding="utf-8"
|
|
489
490
|
)
|
|
490
491
|
|
|
492
|
+
# Caller drops the partial trailing row before appending so the new
|
|
493
|
+
# row lands on a valid JSONL boundary.
|
|
494
|
+
truncate_malformed_trailing_line(outputs_path)
|
|
491
495
|
save_new_outputs(
|
|
492
496
|
[{"example_id": 3, "label": "row-3"}],
|
|
493
497
|
results_path,
|
|
@@ -75,12 +75,12 @@ def test_bfcl_public_loader_is_v1_only(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
|
75
75
|
seen_taskset_config: vf.TasksetConfig | None = None
|
|
76
76
|
seen_harness_config: vf.HarnessConfig | None = None
|
|
77
77
|
|
|
78
|
-
def fake_taskset(config: vf.TasksetConfig
|
|
78
|
+
def fake_taskset(config: vf.TasksetConfig) -> vf.Taskset:
|
|
79
79
|
nonlocal seen_taskset_config
|
|
80
80
|
seen_taskset_config = config
|
|
81
81
|
return vf.Taskset(source=[], config=config)
|
|
82
82
|
|
|
83
|
-
def fake_harness(config: vf.HarnessConfig
|
|
83
|
+
def fake_harness(config: vf.HarnessConfig) -> vf.Harness:
|
|
84
84
|
nonlocal seen_harness_config
|
|
85
85
|
seen_harness_config = config
|
|
86
86
|
return vf.Harness(config=config)
|
|
@@ -89,9 +89,13 @@ def test_bfcl_public_loader_is_v1_only(monkeypatch: pytest.MonkeyPatch) -> None:
|
|
|
89
89
|
monkeypatch.setattr(bfcl, "load_harness", fake_harness)
|
|
90
90
|
|
|
91
91
|
env = bfcl.load_environment(
|
|
92
|
-
config=
|
|
93
|
-
|
|
94
|
-
|
|
92
|
+
config=bfcl.BFCLEnvConfig(
|
|
93
|
+
taskset=bfcl.BFCLTasksetConfig(
|
|
94
|
+
test_category="simple_python",
|
|
95
|
+
examples_per_category=0,
|
|
96
|
+
),
|
|
97
|
+
harness=bfcl.BFCLHarnessConfig(),
|
|
98
|
+
)
|
|
95
99
|
)
|
|
96
100
|
|
|
97
101
|
assert isinstance(env, vf.Env)
|
|
@@ -110,12 +114,12 @@ def test_bfcl_loader_supports_category_groups(
|
|
|
110
114
|
seen_taskset_categories = []
|
|
111
115
|
seen_harness_categories = []
|
|
112
116
|
|
|
113
|
-
def fake_taskset(config: vf.TasksetConfig
|
|
117
|
+
def fake_taskset(config: vf.TasksetConfig) -> vf.Taskset:
|
|
114
118
|
assert isinstance(config, bfcl.BFCLTasksetConfig)
|
|
115
119
|
seen_taskset_categories.append(config.test_category)
|
|
116
120
|
return vf.Taskset(source=[{"question": "q", "answer": "a"}], config=config)
|
|
117
121
|
|
|
118
|
-
def fake_harness(config: vf.HarnessConfig
|
|
122
|
+
def fake_harness(config: vf.HarnessConfig) -> vf.Harness:
|
|
119
123
|
assert isinstance(config, bfcl.BFCLHarnessConfig)
|
|
120
124
|
seen_harness_categories.append(config.test_category)
|
|
121
125
|
return vf.Harness(config=config)
|
|
@@ -124,9 +128,13 @@ def test_bfcl_loader_supports_category_groups(
|
|
|
124
128
|
monkeypatch.setattr(bfcl, "load_harness", fake_harness)
|
|
125
129
|
|
|
126
130
|
env = bfcl.load_environment(
|
|
127
|
-
config=
|
|
128
|
-
|
|
129
|
-
|
|
131
|
+
config=bfcl.BFCLEnvConfig(
|
|
132
|
+
taskset=bfcl.BFCLTasksetConfig(
|
|
133
|
+
test_categories=["simple_python", "simple_java"],
|
|
134
|
+
examples_per_category=0,
|
|
135
|
+
),
|
|
136
|
+
harness=bfcl.BFCLHarnessConfig(),
|
|
137
|
+
)
|
|
130
138
|
)
|
|
131
139
|
|
|
132
140
|
assert isinstance(env, root_vf.EnvGroup)
|
|
@@ -1185,6 +1185,29 @@ def test_config_schema_is_visible_from_primary_types() -> None:
|
|
|
1185
1185
|
assert "bindings" in vf.ToolsetConfig.schema_text()
|
|
1186
1186
|
|
|
1187
1187
|
|
|
1188
|
+
def test_config_annotation_only_nested_config_defaults_recursively() -> None:
|
|
1189
|
+
class LeafConfig(Config):
|
|
1190
|
+
value: int = 1
|
|
1191
|
+
|
|
1192
|
+
class ChildConfig(Config):
|
|
1193
|
+
leaf: LeafConfig
|
|
1194
|
+
|
|
1195
|
+
class ParentConfig(Config):
|
|
1196
|
+
child: ChildConfig
|
|
1197
|
+
|
|
1198
|
+
first = ParentConfig()
|
|
1199
|
+
second = ParentConfig()
|
|
1200
|
+
configured = ParentConfig({"child": {"leaf": {"value": 3}}})
|
|
1201
|
+
|
|
1202
|
+
assert isinstance(first.child, ChildConfig)
|
|
1203
|
+
assert isinstance(first.child.leaf, LeafConfig)
|
|
1204
|
+
assert first.child.leaf.value == 1
|
|
1205
|
+
assert first.child is not second.child
|
|
1206
|
+
assert first.child.leaf is not second.child.leaf
|
|
1207
|
+
assert configured.child.leaf.value == 3
|
|
1208
|
+
assert "child: ChildConfig = <factory>" in ParentConfig.schema_text()
|
|
1209
|
+
|
|
1210
|
+
|
|
1188
1211
|
def test_env_config_normalizes_mapping_config_to_attributes() -> None:
|
|
1189
1212
|
config = EnvConfig(
|
|
1190
1213
|
{
|
|
@@ -1193,8 +1216,17 @@ def test_env_config_normalizes_mapping_config_to_attributes() -> None:
|
|
|
1193
1216
|
}
|
|
1194
1217
|
)
|
|
1195
1218
|
|
|
1196
|
-
assert config.taskset
|
|
1197
|
-
assert config.harness
|
|
1219
|
+
assert isinstance(config.taskset, TasksetConfig)
|
|
1220
|
+
assert isinstance(config.harness, HarnessConfig)
|
|
1221
|
+
assert config.taskset.taskset_id == "dict"
|
|
1222
|
+
assert config.harness.model == "configured-model"
|
|
1223
|
+
|
|
1224
|
+
|
|
1225
|
+
def test_env_config_defaults_taskset_and_harness_to_base_configs() -> None:
|
|
1226
|
+
config = EnvConfig()
|
|
1227
|
+
|
|
1228
|
+
assert isinstance(config.taskset, TasksetConfig)
|
|
1229
|
+
assert isinstance(config.harness, HarnessConfig)
|
|
1198
1230
|
|
|
1199
1231
|
|
|
1200
1232
|
def test_env_config_rejects_unknown_top_level_sections() -> None:
|
|
@@ -1205,6 +1237,34 @@ def test_env_config_rejects_unknown_top_level_sections() -> None:
|
|
|
1205
1237
|
def test_env_config_requires_child_sections_to_be_configs() -> None:
|
|
1206
1238
|
with pytest.raises(ValueError):
|
|
1207
1239
|
EnvConfig({"taskset": 1})
|
|
1240
|
+
with pytest.raises(ValueError, match="EnvConfig.taskset cannot be None"):
|
|
1241
|
+
EnvConfig({"taskset": None})
|
|
1242
|
+
with pytest.raises(ValueError, match="EnvConfig.harness cannot be None"):
|
|
1243
|
+
EnvConfig(harness=None)
|
|
1244
|
+
|
|
1245
|
+
|
|
1246
|
+
def test_env_config_child_config_objects_must_match_domain() -> None:
|
|
1247
|
+
class LocalTasksetConfig(TasksetConfig):
|
|
1248
|
+
split: str = "train"
|
|
1249
|
+
|
|
1250
|
+
class LocalHarnessConfig(HarnessConfig):
|
|
1251
|
+
mode: str = "default"
|
|
1252
|
+
|
|
1253
|
+
config = EnvConfig(
|
|
1254
|
+
taskset=LocalTasksetConfig(split="test"),
|
|
1255
|
+
harness=LocalHarnessConfig(mode="custom"),
|
|
1256
|
+
)
|
|
1257
|
+
|
|
1258
|
+
assert isinstance(config.taskset, LocalTasksetConfig)
|
|
1259
|
+
assert isinstance(config.harness, LocalHarnessConfig)
|
|
1260
|
+
|
|
1261
|
+
class LocalConfig(Config):
|
|
1262
|
+
split: str = "train"
|
|
1263
|
+
|
|
1264
|
+
with pytest.raises(ValueError):
|
|
1265
|
+
EnvConfig(taskset=LocalConfig())
|
|
1266
|
+
with pytest.raises(ValueError):
|
|
1267
|
+
EnvConfig(harness=LocalConfig())
|
|
1208
1268
|
|
|
1209
1269
|
|
|
1210
1270
|
def test_env_config_merges_child_config_defaults_with_nested_sections() -> None:
|
|
@@ -1234,25 +1294,53 @@ def test_env_config_merges_child_config_defaults_with_nested_sections() -> None:
|
|
|
1234
1294
|
assert default_config.taskset.split == "kwarg"
|
|
1235
1295
|
|
|
1236
1296
|
|
|
1237
|
-
def
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1297
|
+
def test_config_object_merge_omits_nested_none_values() -> None:
|
|
1298
|
+
base = HarnessConfig(
|
|
1299
|
+
sampling_args={
|
|
1300
|
+
"temperature": 0.7,
|
|
1301
|
+
"extra_body": {
|
|
1302
|
+
"top_k": 40,
|
|
1303
|
+
"top_p": 0.9,
|
|
1304
|
+
},
|
|
1305
|
+
}
|
|
1245
1306
|
)
|
|
1307
|
+
override = HarnessConfig(
|
|
1308
|
+
sampling_args={
|
|
1309
|
+
"extra_body": {
|
|
1310
|
+
"top_p": None,
|
|
1311
|
+
"min_p": 0.05,
|
|
1312
|
+
},
|
|
1313
|
+
"stop": [None, "DONE"],
|
|
1314
|
+
}
|
|
1315
|
+
)
|
|
1316
|
+
config = EnvConfig(EnvConfig(harness=override), harness=base)
|
|
1317
|
+
|
|
1318
|
+
assert config.harness.sampling_args == {
|
|
1319
|
+
"temperature": 0.7,
|
|
1320
|
+
"extra_body": {
|
|
1321
|
+
"top_k": 40,
|
|
1322
|
+
"top_p": 0.9,
|
|
1323
|
+
"min_p": 0.05,
|
|
1324
|
+
},
|
|
1325
|
+
"stop": [None, "DONE"],
|
|
1326
|
+
}
|
|
1246
1327
|
|
|
1247
|
-
assert isinstance(config.args, LocalArgsConfig)
|
|
1248
|
-
assert config.args.split == "args"
|
|
1249
|
-
assert config.args.max_turns == 7
|
|
1250
1328
|
|
|
1329
|
+
def test_env_config_subclasses_cannot_define_root_fields() -> None:
|
|
1330
|
+
with pytest.raises(TypeError, match="unsupported root env config fields"):
|
|
1251
1331
|
|
|
1252
|
-
|
|
1253
|
-
|
|
1332
|
+
class LocalEnvConfig(EnvConfig):
|
|
1333
|
+
split: str = "train"
|
|
1254
1334
|
|
|
1255
|
-
|
|
1335
|
+
|
|
1336
|
+
def test_env_config_subclasses_must_use_domain_child_configs() -> None:
|
|
1337
|
+
class LocalConfig(Config):
|
|
1338
|
+
split: str = "train"
|
|
1339
|
+
|
|
1340
|
+
with pytest.raises(TypeError, match="taskset must be typed"):
|
|
1341
|
+
|
|
1342
|
+
class LocalEnvConfig(EnvConfig):
|
|
1343
|
+
taskset: LocalConfig
|
|
1256
1344
|
|
|
1257
1345
|
|
|
1258
1346
|
def test_env_config_harness_section_extends_imported_config() -> None:
|
|
@@ -1335,6 +1423,56 @@ def test_load_environment_coerces_typed_env_config_arg(
|
|
|
1335
1423
|
}
|
|
1336
1424
|
|
|
1337
1425
|
|
|
1426
|
+
def test_load_environment_coerces_env_config_subclass_sections(
|
|
1427
|
+
monkeypatch: pytest.MonkeyPatch,
|
|
1428
|
+
) -> None:
|
|
1429
|
+
module_name = "typed_env_config_subclass"
|
|
1430
|
+
module = types.ModuleType(module_name)
|
|
1431
|
+
seen: dict[str, object] = {}
|
|
1432
|
+
|
|
1433
|
+
class LocalTasksetConfig(TasksetConfig):
|
|
1434
|
+
split: str = "train"
|
|
1435
|
+
|
|
1436
|
+
class LocalHarnessConfig(HarnessConfig):
|
|
1437
|
+
mode: str = "default"
|
|
1438
|
+
|
|
1439
|
+
class LocalEnvConfig(EnvConfig):
|
|
1440
|
+
taskset: LocalTasksetConfig
|
|
1441
|
+
harness: LocalHarnessConfig
|
|
1442
|
+
|
|
1443
|
+
class LocalTaskset(Taskset):
|
|
1444
|
+
config_type = LocalTasksetConfig
|
|
1445
|
+
|
|
1446
|
+
class LocalHarness(Harness):
|
|
1447
|
+
config_type = LocalHarnessConfig
|
|
1448
|
+
|
|
1449
|
+
def load_environment(config: LocalEnvConfig) -> Env:
|
|
1450
|
+
seen["config"] = config
|
|
1451
|
+
return Env(
|
|
1452
|
+
taskset=LocalTaskset(source=source_loader, config=config.taskset),
|
|
1453
|
+
harness=LocalHarness(config=config.harness),
|
|
1454
|
+
)
|
|
1455
|
+
|
|
1456
|
+
module.load_environment = load_environment
|
|
1457
|
+
monkeypatch.setitem(sys.modules, module_name, module)
|
|
1458
|
+
|
|
1459
|
+
env = vf.load_environment(
|
|
1460
|
+
"typed-env-config-subclass",
|
|
1461
|
+
config={
|
|
1462
|
+
"taskset": {"taskset_id": "typed", "split": "test"},
|
|
1463
|
+
"harness": {"mode": "custom"},
|
|
1464
|
+
},
|
|
1465
|
+
)
|
|
1466
|
+
config = seen["config"]
|
|
1467
|
+
|
|
1468
|
+
assert isinstance(config, LocalEnvConfig)
|
|
1469
|
+
assert isinstance(config.taskset, LocalTasksetConfig)
|
|
1470
|
+
assert isinstance(config.harness, LocalHarnessConfig)
|
|
1471
|
+
assert env.taskset.config.taskset_id == "typed"
|
|
1472
|
+
assert env.taskset.config.split == "test"
|
|
1473
|
+
assert env.harness.config.mode == "custom"
|
|
1474
|
+
|
|
1475
|
+
|
|
1338
1476
|
def test_load_environment_supplies_default_typed_env_config(
|
|
1339
1477
|
monkeypatch: pytest.MonkeyPatch,
|
|
1340
1478
|
) -> None:
|
|
@@ -1457,9 +1595,24 @@ def test_reference_v1_harness_loaders_preserve_child_defaults() -> None:
|
|
|
1457
1595
|
"environments.hello_self_judge_v1.hello_self_judge_v1"
|
|
1458
1596
|
)
|
|
1459
1597
|
|
|
1460
|
-
assert
|
|
1461
|
-
|
|
1462
|
-
|
|
1598
|
+
assert (
|
|
1599
|
+
group_reward.load_harness(
|
|
1600
|
+
config=group_reward.GroupRewardHarnessConfig()
|
|
1601
|
+
).config.max_turns
|
|
1602
|
+
== 1
|
|
1603
|
+
)
|
|
1604
|
+
assert (
|
|
1605
|
+
parallel_sandbox.load_harness(
|
|
1606
|
+
config=parallel_sandbox.ParallelSandboxHarnessConfig()
|
|
1607
|
+
).config.max_turns
|
|
1608
|
+
== 4
|
|
1609
|
+
)
|
|
1610
|
+
assert (
|
|
1611
|
+
self_judge.load_harness(
|
|
1612
|
+
config=self_judge.SelfJudgeHarnessConfig()
|
|
1613
|
+
).config.max_turns
|
|
1614
|
+
== 8
|
|
1615
|
+
)
|
|
1463
1616
|
|
|
1464
1617
|
|
|
1465
1618
|
def test_bfcl_loader_preserves_mapping_config_sections(
|
|
@@ -1482,7 +1635,7 @@ def test_bfcl_loader_preserves_mapping_config_sections(
|
|
|
1482
1635
|
monkeypatch.setattr(module, "load_harness", fake_harness)
|
|
1483
1636
|
|
|
1484
1637
|
env = module.load_environment(
|
|
1485
|
-
config=
|
|
1638
|
+
config=module.BFCLEnvConfig(
|
|
1486
1639
|
taskset={"taskset_id": "bfcl-env-args"},
|
|
1487
1640
|
harness={"model": "bfcl-model"},
|
|
1488
1641
|
)
|
|
@@ -1508,7 +1661,7 @@ def test_tau2_loader_forwards_mapping_harness_config(
|
|
|
1508
1661
|
monkeypatch.setattr(module, "load_taskset", fake_taskset)
|
|
1509
1662
|
|
|
1510
1663
|
env = module.load_environment(
|
|
1511
|
-
config=
|
|
1664
|
+
config=module.Tau2EnvConfig(
|
|
1512
1665
|
taskset={"max_turns": 7},
|
|
1513
1666
|
harness={"model": "configured-model", "max_turns": 3},
|
|
1514
1667
|
)
|
|
@@ -1623,17 +1776,16 @@ def test_self_judge_loader_projects_shortcuts_to_child_configs() -> None:
|
|
|
1623
1776
|
"environments.hello_self_judge_v1.hello_self_judge_v1"
|
|
1624
1777
|
)
|
|
1625
1778
|
|
|
1626
|
-
taskset = module.load_taskset(num_examples=2)
|
|
1627
|
-
harness = module.load_harness(max_turns=3)
|
|
1779
|
+
taskset = module.load_taskset(config=module.SelfJudgeTasksetConfig(num_examples=2))
|
|
1780
|
+
harness = module.load_harness(config=module.SelfJudgeHarnessConfig(max_turns=3))
|
|
1628
1781
|
shortcut_env = module.load_environment(
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1782
|
+
config=module.SelfJudgeEnvConfig(
|
|
1783
|
+
taskset={"num_examples": 2},
|
|
1784
|
+
harness={"max_turns": 3},
|
|
1785
|
+
),
|
|
1632
1786
|
)
|
|
1633
1787
|
override_env = module.load_environment(
|
|
1634
|
-
|
|
1635
|
-
max_turns=3,
|
|
1636
|
-
config=EnvConfig(
|
|
1788
|
+
config=module.SelfJudgeEnvConfig(
|
|
1637
1789
|
taskset={"num_examples": 1},
|
|
1638
1790
|
harness={"max_turns": 5},
|
|
1639
1791
|
),
|