verifiers 0.1.15.dev2__tar.gz → 0.1.15.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/PKG-INFO +5 -6
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/README.md +3 -4
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/pyproject.toml +18 -5
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_context_token_metrics.py +37 -15
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_environment_extra.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_eval_display.py +43 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_eval_utils.py +51 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_gym_env.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_harbor_env_mcp.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_interception_utils.py +2 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_langchain_deep_agents_wikispeedia.py +15 -5
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_lean_task.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_mcp_search_env.py +17 -3
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_message_utils.py +33 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_nemorl_client.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_opencode_harbor.py +15 -29
- verifiers-0.1.15.dev4/tests/test_openenv_client.py +162 -0
- verifiers-0.1.15.dev4/tests/test_pricing_utils.py +127 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_renderer_e2e.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_save_utils.py +51 -37
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_setup_script.py +0 -2
- verifiers-0.1.15.dev4/tests/test_v1_bfcl.py +135 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_config_extension.py +275 -37
- verifiers-0.1.15.dev4/tests/test_v1_empty_completions.py +57 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_example_counts.py +13 -15
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_group_reward_env.py +2 -3
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_harbor_cli.py +148 -26
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_mini_swe_agent.py +32 -6
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_rlm_swe.py +105 -12
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_runtime_lifecycle.py +126 -44
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_scoring_functions.py +6 -5
- verifiers-0.1.15.dev4/tests/test_v1_taskset_bindings.py +188 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/__init__.py +66 -8
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/plugins/prime.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/__init__.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/nemorl_chat_completions_client.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/openai_responses_client.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/renderer_client.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/env_group.py +4 -6
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/environment.py +19 -21
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/_filter.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/composable_env.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harness.py +4 -8
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/task.py +4 -6
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +8 -10
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +2 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/gym_env.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/harbor_env/env.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/harbor_env/mcp.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/utils/file_locks.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/README.md +17 -15
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/openenv_env.py +99 -328
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/build.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/init.py +16 -19
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/setup.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/tui.py +6 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/server/env_router.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/types.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/types.py +428 -6
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/data_utils.py +3 -5
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/display_utils.py +5 -5
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/env_config_utils.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/env_utils.py +8 -5
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/eval_display.py +30 -3
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/eval_utils.py +110 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/import_utils.py +0 -2
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/interception_utils.py +15 -7
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/message_utils.py +65 -3
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/metric_utils.py +5 -5
- verifiers-0.1.15.dev4/verifiers/utils/pricing_utils.py +170 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/save_utils.py +66 -56
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/usage_utils.py +20 -56
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/version_utils.py +0 -2
- verifiers-0.1.15.dev4/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +73 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/README.md +123 -79
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/RE_MIGRATION.md +59 -40
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/__init__.py +42 -3
- verifiers-0.1.15.dev4/verifiers/v1/config.py +381 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/env.py +2 -4
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/harness.py +100 -112
- verifiers-0.1.15.dev4/verifiers/v1/packages/harnesses/__init__.py +14 -0
- verifiers-0.1.15.dev4/verifiers/v1/packages/harnesses/command.py +116 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/configs.py +32 -4
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/mini_swe_agent.py +43 -39
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/opencode.py +71 -52
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/packages/harnesses/pi.py +44 -40
- verifiers-0.1.15.dev4/verifiers/v1/packages/harnesses/rlm.py +347 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/packages/tasksets/harbor.py +93 -51
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/runtime.py +472 -374
- verifiers-0.1.15.dev4/verifiers/v1/state.py +10 -0
- verifiers-0.1.15.dev4/verifiers/v1/task.py +92 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/taskset.py +124 -99
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/toolset.py +94 -77
- verifiers-0.1.15.dev4/verifiers/v1/types.py +59 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/user.py +24 -17
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/artifact_utils.py +5 -7
- verifiers-0.1.15.dev4/verifiers/v1/utils/binding_utils.py +216 -0
- verifiers-0.1.15.dev4/verifiers/v1/utils/config_callable_utils.py +123 -0
- verifiers-0.1.15.dev4/verifiers/v1/utils/config_utils.py +177 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/endpoint_utils.py +42 -42
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/json_utils.py +3 -4
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/judge_utils.py +5 -17
- verifiers-0.1.15.dev4/verifiers/v1/utils/lifecycle_utils.py +98 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/mcp_proxy_utils.py +26 -31
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/mcp_utils.py +23 -21
- verifiers-0.1.15.dev4/verifiers/v1/utils/object_utils.py +32 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/program_utils.py +128 -111
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/prompt_utils.py +17 -19
- verifiers-0.1.15.dev4/verifiers/v1/utils/runtime_registry.py +37 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/sandbox_program_utils.py +23 -27
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/sandbox_utils.py +164 -102
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/scoring_utils.py +156 -84
- verifiers-0.1.15.dev4/verifiers/v1/utils/serialization_utils.py +14 -0
- verifiers-0.1.15.dev4/verifiers/v1/utils/task_freeze_utils.py +89 -0
- verifiers-0.1.15.dev4/verifiers/v1/utils/taskset_utils.py +56 -0
- verifiers-0.1.15.dev4/verifiers/v1/utils/timing_utils.py +119 -0
- verifiers-0.1.15.dev4/verifiers/v1/utils/tool_utils.py +54 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/trajectory_utils.py +24 -18
- verifiers-0.1.15.dev4/verifiers/v1/utils/usage_utils.py +21 -0
- verifiers-0.1.15.dev2/tests/test_v1_bfcl.py +0 -55
- verifiers-0.1.15.dev2/verifiers/v1/config.py +0 -455
- verifiers-0.1.15.dev2/verifiers/v1/packages/harnesses/__init__.py +0 -8
- verifiers-0.1.15.dev2/verifiers/v1/packages/harnesses/cli.py +0 -121
- verifiers-0.1.15.dev2/verifiers/v1/packages/harnesses/rlm.py +0 -265
- verifiers-0.1.15.dev2/verifiers/v1/state.py +0 -401
- verifiers-0.1.15.dev2/verifiers/v1/task.py +0 -177
- verifiers-0.1.15.dev2/verifiers/v1/utils/lifecycle_utils.py +0 -96
- verifiers-0.1.15.dev2/verifiers/v1/utils/timing_utils.py +0 -36
- verifiers-0.1.15.dev2/verifiers/v1/utils/tool_utils.py +0 -19
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/.gitignore +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/LICENSE +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/README.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_envs.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_eval_cli.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_imports.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_path_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_renderer_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/eval.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/path_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/packages/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/packages/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/verifiers/v1/utils/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev2
|
|
3
|
+
Version: 0.1.15.dev4
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -52,7 +52,7 @@ Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
|
|
|
52
52
|
Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
|
|
53
53
|
Requires-Dist: stagehand>=3.0.0; extra == 'browser'
|
|
54
54
|
Provides-Extra: openenv
|
|
55
|
-
Requires-Dist: openenv-core[core]==0.2.1; extra == 'openenv'
|
|
55
|
+
Requires-Dist: openenv-core>=0.3.0; extra == 'openenv'
|
|
56
56
|
Provides-Extra: renderers
|
|
57
57
|
Requires-Dist: renderers>=0.1.8.dev0; extra == 'renderers'
|
|
58
58
|
Provides-Extra: rg
|
|
@@ -210,7 +210,7 @@ For new environments with reusable tasksets, toolsets, custom programs, or
|
|
|
210
210
|
custom harnesses, use the v1 Taskset/Harness path:
|
|
211
211
|
```python
|
|
212
212
|
# my_env.py
|
|
213
|
-
import verifiers
|
|
213
|
+
import verifiers as vf
|
|
214
214
|
|
|
215
215
|
def source():
|
|
216
216
|
yield {
|
|
@@ -226,8 +226,7 @@ async def contains_answer(task, state) -> float:
|
|
|
226
226
|
def load_taskset(config: vf.TasksetConfig | None = None):
|
|
227
227
|
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
228
228
|
|
|
229
|
-
def load_environment(config: vf.EnvConfig | None = None):
|
|
230
|
-
config = config or vf.EnvConfig()
|
|
229
|
+
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
231
230
|
return vf.Env(taskset=load_taskset(config=config.taskset))
|
|
232
231
|
```
|
|
233
232
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
@@ -239,7 +238,7 @@ harness with:
|
|
|
239
238
|
|
|
240
239
|
```python
|
|
241
240
|
env = vf.Env(
|
|
242
|
-
taskset=vf.HarborTaskset(
|
|
241
|
+
taskset=vf.HarborTaskset(),
|
|
243
242
|
harness=vf.OpenCode(),
|
|
244
243
|
)
|
|
245
244
|
```
|
|
@@ -135,7 +135,7 @@ For new environments with reusable tasksets, toolsets, custom programs, or
|
|
|
135
135
|
custom harnesses, use the v1 Taskset/Harness path:
|
|
136
136
|
```python
|
|
137
137
|
# my_env.py
|
|
138
|
-
import verifiers
|
|
138
|
+
import verifiers as vf
|
|
139
139
|
|
|
140
140
|
def source():
|
|
141
141
|
yield {
|
|
@@ -151,8 +151,7 @@ async def contains_answer(task, state) -> float:
|
|
|
151
151
|
def load_taskset(config: vf.TasksetConfig | None = None):
|
|
152
152
|
return vf.Taskset(source=source, rewards=[contains_answer], config=config)
|
|
153
153
|
|
|
154
|
-
def load_environment(config: vf.EnvConfig | None = None):
|
|
155
|
-
config = config or vf.EnvConfig()
|
|
154
|
+
def load_environment(config: vf.EnvConfig) -> vf.Env:
|
|
156
155
|
return vf.Env(taskset=load_taskset(config=config.taskset))
|
|
157
156
|
```
|
|
158
157
|
If no harness is passed, `vf.Env` uses the base endpoint-backed harness. See
|
|
@@ -164,7 +163,7 @@ harness with:
|
|
|
164
163
|
|
|
165
164
|
```python
|
|
166
165
|
env = vf.Env(
|
|
167
|
-
taskset=vf.HarborTaskset(
|
|
166
|
+
taskset=vf.HarborTaskset(),
|
|
168
167
|
harness=vf.OpenCode(),
|
|
169
168
|
)
|
|
170
169
|
```
|
|
@@ -68,13 +68,15 @@ dev = [
|
|
|
68
68
|
"ipywidgets",
|
|
69
69
|
"reasoning-gym",
|
|
70
70
|
"textarena",
|
|
71
|
-
"openenv-core[core]==0.2.1",
|
|
72
71
|
"stagehand>=3.0.0",
|
|
73
72
|
"aiohttp>=3.9.0",
|
|
74
73
|
"python-dotenv>=1.0.0",
|
|
75
74
|
"nltk",
|
|
76
75
|
"renderers>=0.1.8.dev0",
|
|
77
76
|
]
|
|
77
|
+
policy = [
|
|
78
|
+
"semgrep>=1.150.0",
|
|
79
|
+
]
|
|
78
80
|
|
|
79
81
|
[project.optional-dependencies]
|
|
80
82
|
rg = [
|
|
@@ -84,14 +86,14 @@ ta = [
|
|
|
84
86
|
"textarena",
|
|
85
87
|
"nltk",
|
|
86
88
|
]
|
|
87
|
-
openenv = [
|
|
88
|
-
"openenv-core[core]==0.2.1",
|
|
89
|
-
]
|
|
90
89
|
browser = [
|
|
91
90
|
"stagehand>=3.0.0",
|
|
92
91
|
"aiohttp>=3.9.0",
|
|
93
92
|
"python-dotenv>=1.0.0",
|
|
94
93
|
]
|
|
94
|
+
openenv = [
|
|
95
|
+
"openenv-core>=0.3.0",
|
|
96
|
+
]
|
|
95
97
|
renderers = [
|
|
96
98
|
"renderers>=0.1.8.dev0",
|
|
97
99
|
]
|
|
@@ -111,7 +113,12 @@ rl = [
|
|
|
111
113
|
[tool.uv]
|
|
112
114
|
preview = true
|
|
113
115
|
required-version = ">=0.11.1"
|
|
114
|
-
|
|
116
|
+
conflicts = [
|
|
117
|
+
[
|
|
118
|
+
{ extra = "openenv" },
|
|
119
|
+
{ group = "policy" },
|
|
120
|
+
],
|
|
121
|
+
]
|
|
115
122
|
[[tool.uv.index]]
|
|
116
123
|
name = "pypi"
|
|
117
124
|
url = "https://pypi.org/simple"
|
|
@@ -123,6 +130,7 @@ exclude-newer = "7 days"
|
|
|
123
130
|
prime-tunnel = false
|
|
124
131
|
prime-sandboxes = false
|
|
125
132
|
renderers = false
|
|
133
|
+
openenv-core = false
|
|
126
134
|
|
|
127
135
|
[tool.uv.extra-build-dependencies]
|
|
128
136
|
flash-attn = [{ requirement = "torch", match-runtime = true }]
|
|
@@ -130,6 +138,11 @@ flash-attn = [{ requirement = "torch", match-runtime = true }]
|
|
|
130
138
|
[tool.uv.extra-build-variables]
|
|
131
139
|
flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }
|
|
132
140
|
|
|
141
|
+
[tool.ruff]
|
|
142
|
+
exclude = [
|
|
143
|
+
".semgrep",
|
|
144
|
+
]
|
|
145
|
+
|
|
133
146
|
[project.scripts]
|
|
134
147
|
vf-eval = "verifiers.scripts.eval:main"
|
|
135
148
|
vf-gepa = "verifiers.scripts.gepa:main"
|
|
@@ -5,10 +5,9 @@ Tests the trajectory-based context token computation
|
|
|
5
5
|
using the last trajectory step.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
-
from unittest.mock import MagicMock
|
|
9
|
-
|
|
10
8
|
import pytest
|
|
11
9
|
|
|
10
|
+
from verifiers.types import Response, ResponseMessage, Usage
|
|
12
11
|
from verifiers.utils.usage_utils import compute_context_token_metrics
|
|
13
12
|
|
|
14
13
|
|
|
@@ -20,12 +19,39 @@ SYS = {"role": "system", "content": "You are helpful"}
|
|
|
20
19
|
USER = {"role": "user", "content": "hi"}
|
|
21
20
|
|
|
22
21
|
|
|
23
|
-
def _make_response(prompt_tokens: int, completion_tokens: int) ->
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
22
|
+
def _make_response(prompt_tokens: int, completion_tokens: int) -> Response:
|
|
23
|
+
return Response(
|
|
24
|
+
id="test",
|
|
25
|
+
created=0,
|
|
26
|
+
model="test",
|
|
27
|
+
usage=Usage(
|
|
28
|
+
prompt_tokens=prompt_tokens,
|
|
29
|
+
reasoning_tokens=0,
|
|
30
|
+
completion_tokens=completion_tokens,
|
|
31
|
+
total_tokens=prompt_tokens + completion_tokens,
|
|
32
|
+
),
|
|
33
|
+
message=ResponseMessage(
|
|
34
|
+
role="assistant",
|
|
35
|
+
content="",
|
|
36
|
+
finish_reason="stop",
|
|
37
|
+
is_truncated=False,
|
|
38
|
+
),
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _make_response_without_usage() -> Response:
|
|
43
|
+
return Response(
|
|
44
|
+
id="test",
|
|
45
|
+
created=0,
|
|
46
|
+
model="test",
|
|
47
|
+
usage=None,
|
|
48
|
+
message=ResponseMessage(
|
|
49
|
+
role="assistant",
|
|
50
|
+
content="",
|
|
51
|
+
finish_reason="stop",
|
|
52
|
+
is_truncated=False,
|
|
53
|
+
),
|
|
27
54
|
)
|
|
28
|
-
return response
|
|
29
55
|
|
|
30
56
|
|
|
31
57
|
def _asst(i: int) -> dict:
|
|
@@ -115,13 +141,11 @@ class TestContextMetrics:
|
|
|
115
141
|
assert metrics["final_input_tokens"] == 230 - 50
|
|
116
142
|
|
|
117
143
|
def test_skips_responses_without_usage(self):
|
|
118
|
-
"""Responses with
|
|
119
|
-
no_usage = MagicMock()
|
|
120
|
-
no_usage.usage = None
|
|
144
|
+
"""Responses with usage=None are skipped entirely."""
|
|
121
145
|
trajectory = [
|
|
122
146
|
{"response": _make_response(100, 20)},
|
|
123
147
|
{"response": _make_response(200, 30)},
|
|
124
|
-
{"response":
|
|
148
|
+
{"response": _make_response_without_usage()},
|
|
125
149
|
]
|
|
126
150
|
metrics = compute_context_token_metrics(trajectory)
|
|
127
151
|
# Should use step 1 (last with usage): total = 230
|
|
@@ -130,11 +154,9 @@ class TestContextMetrics:
|
|
|
130
154
|
|
|
131
155
|
def test_all_responses_lack_usage(self):
|
|
132
156
|
"""If no response has usage data, return zeros."""
|
|
133
|
-
no_usage = MagicMock()
|
|
134
|
-
no_usage.usage = None
|
|
135
157
|
trajectory = [
|
|
136
|
-
{"response":
|
|
137
|
-
{"response":
|
|
158
|
+
{"response": _make_response_without_usage()},
|
|
159
|
+
{"response": _make_response_without_usage()},
|
|
138
160
|
]
|
|
139
161
|
metrics = compute_context_token_metrics(trajectory)
|
|
140
162
|
assert metrics["final_output_tokens"] == 0
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
|
|
1
3
|
from verifiers.types import ClientConfig, EvalConfig
|
|
2
4
|
from verifiers.utils.eval_display import EvalDisplay
|
|
3
5
|
|
|
@@ -78,3 +80,44 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
|
|
|
78
80
|
)
|
|
79
81
|
|
|
80
82
|
assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def render_plain(renderable) -> str:
|
|
86
|
+
console = Console(width=100, record=True)
|
|
87
|
+
console.print(renderable)
|
|
88
|
+
return console.export_text()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_tokens_row_omits_cost_when_unavailable() -> None:
|
|
92
|
+
display = EvalDisplay([make_config(max_concurrent=1)])
|
|
93
|
+
|
|
94
|
+
rendered = render_plain(
|
|
95
|
+
display._make_tokens_row({"input_tokens": 12.0, "output_tokens": 7.0})
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
assert "input 12" in rendered
|
|
99
|
+
assert "output 7" in rendered
|
|
100
|
+
assert "cost" not in rendered
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_tokens_row_includes_cost_when_available() -> None:
|
|
104
|
+
display = EvalDisplay([make_config(max_concurrent=1)])
|
|
105
|
+
|
|
106
|
+
rendered = render_plain(
|
|
107
|
+
display._make_tokens_row(
|
|
108
|
+
{
|
|
109
|
+
"input_tokens": 12.0,
|
|
110
|
+
"output_tokens": 7.0,
|
|
111
|
+
"final_input_tokens": 10.0,
|
|
112
|
+
"final_output_tokens": 5.0,
|
|
113
|
+
},
|
|
114
|
+
{"input_usd": 0.005, "output_usd": 0.0073, "total_usd": 0.0123},
|
|
115
|
+
)
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
assert "input 12" in rendered
|
|
119
|
+
assert "output 7" in rendered
|
|
120
|
+
assert "final_input 10" in rendered
|
|
121
|
+
assert "final_output 5" in rendered
|
|
122
|
+
assert "cost (all) $0.0123" in rendered
|
|
123
|
+
assert rendered.index("final_output 5") < rendered.index("cost (all) $0.0123")
|
|
@@ -4,6 +4,8 @@ Covers:
|
|
|
4
4
|
- print_results indexing with multiple rollouts per example
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
7
9
|
from verifiers.types import GenerateOutputs
|
|
8
10
|
from verifiers.utils.save_utils import states_to_outputs
|
|
9
11
|
|
|
@@ -138,6 +140,55 @@ def test_print_results_includes_usage(capsys, make_metadata, make_output):
|
|
|
138
140
|
assert "output_tokens (avg): 3.000" in captured.out
|
|
139
141
|
|
|
140
142
|
|
|
143
|
+
def test_attach_metadata_cost_uses_total_output_usage(make_metadata, make_output):
|
|
144
|
+
from verifiers.utils.eval_utils import _attach_metadata_cost
|
|
145
|
+
|
|
146
|
+
outputs = [
|
|
147
|
+
make_output(example_id=0, reward=1.0, metrics={"test_metric": 1.0}),
|
|
148
|
+
make_output(example_id=1, reward=0.0, metrics={"test_metric": 2.0}),
|
|
149
|
+
]
|
|
150
|
+
outputs[0]["token_usage"] = {"input_tokens": 10.0, "output_tokens": 4.0}
|
|
151
|
+
outputs[1]["token_usage"] = {"input_tokens": 6.0, "output_tokens": 2.0}
|
|
152
|
+
metadata = make_metadata(
|
|
153
|
+
num_examples=2,
|
|
154
|
+
rollouts_per_example=1,
|
|
155
|
+
usage={"input_tokens": 8.0, "output_tokens": 3.0},
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
cost = _attach_metadata_cost(
|
|
159
|
+
metadata,
|
|
160
|
+
{"input_usd_per_mtok": 1.0, "output_usd_per_mtok": 5.0},
|
|
161
|
+
outputs,
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
assert cost == {
|
|
165
|
+
"input_usd": pytest.approx(0.000016),
|
|
166
|
+
"output_usd": pytest.approx(0.000030),
|
|
167
|
+
"total_usd": pytest.approx(0.000046),
|
|
168
|
+
}
|
|
169
|
+
assert metadata["cost"] == cost
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def test_print_results_labels_cost_as_all(capsys, make_metadata, make_output):
|
|
173
|
+
from verifiers.utils.eval_utils import print_results
|
|
174
|
+
|
|
175
|
+
outputs = [
|
|
176
|
+
make_output(example_id=0, reward=1.0, metrics={"test_metric": 1.0}),
|
|
177
|
+
]
|
|
178
|
+
outputs[0]["token_usage"] = {"input_tokens": 10.0, "output_tokens": 4.0}
|
|
179
|
+
metadata = make_metadata(num_examples=1, rollouts_per_example=1, usage=None)
|
|
180
|
+
metadata["cost"] = {
|
|
181
|
+
"input_usd": 0.005,
|
|
182
|
+
"output_usd": 0.0073,
|
|
183
|
+
"total_usd": 0.0123,
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
print_results(GenerateOutputs(outputs=outputs, metadata=metadata))
|
|
187
|
+
captured = capsys.readouterr()
|
|
188
|
+
|
|
189
|
+
assert "cost (all): $0.0123" in captured.out
|
|
190
|
+
|
|
191
|
+
|
|
141
192
|
def test_print_results_handles_heterogeneous_metrics(
|
|
142
193
|
capsys, make_metadata, make_output
|
|
143
194
|
):
|
|
@@ -261,7 +261,7 @@ async def test_keepalive_write_failure_surfaces_to_state(monkeypatch):
|
|
|
261
261
|
|
|
262
262
|
assert isinstance(state["error"], StreamInterrupted)
|
|
263
263
|
msg = str(state["error"])
|
|
264
|
-
assert "
|
|
264
|
+
assert "Keepalive write failed" in msg
|
|
265
265
|
assert "ConnectionResetError" in msg
|
|
266
266
|
|
|
267
267
|
|
|
@@ -306,6 +306,6 @@ async def test_non_streaming_response_future_failure_surfaces_to_state(monkeypat
|
|
|
306
306
|
f"expected InterceptionError, got {type(state.get('error'))}"
|
|
307
307
|
)
|
|
308
308
|
msg = str(state["error"])
|
|
309
|
-
assert "
|
|
309
|
+
assert "Intercepted request failed" in msg
|
|
310
310
|
assert "RuntimeError" in msg
|
|
311
311
|
assert "vLLM raised" in msg
|
{verifiers-0.1.15.dev2 → verifiers-0.1.15.dev4}/tests/test_langchain_deep_agents_wikispeedia.py
RENAMED
|
@@ -57,7 +57,7 @@ def test_wikispeedia_loads_as_v1_taskset_harness(
|
|
|
57
57
|
) -> None:
|
|
58
58
|
module = load_module(monkeypatch)
|
|
59
59
|
|
|
60
|
-
env = module.load_environment(train_size=1, eval_size=1)
|
|
60
|
+
env = module.load_environment(config=vf.EnvConfig(), train_size=1, eval_size=1)
|
|
61
61
|
|
|
62
62
|
assert isinstance(env, vf.Env)
|
|
63
63
|
assert isinstance(env.taskset, vf.Taskset)
|
|
@@ -157,6 +157,7 @@ async def test_wikispeedia_tools_resolve_through_v1_runtime(
|
|
|
157
157
|
wiki = make_small_wiki(module)
|
|
158
158
|
monkeypatch.setattr(module, "load_wiki_graph", lambda cache_dir=None: wiki)
|
|
159
159
|
env = module.load_environment(
|
|
160
|
+
config=vf.EnvConfig(),
|
|
160
161
|
train_size=2,
|
|
161
162
|
eval_size=1,
|
|
162
163
|
min_path_length=1,
|
|
@@ -248,6 +249,12 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
|
|
|
248
249
|
async def ainvoke(self, payload, config=None):
|
|
249
250
|
raise GraphRecursionError("recursion limit")
|
|
250
251
|
|
|
252
|
+
created_system_prompts = []
|
|
253
|
+
|
|
254
|
+
def fake_create_deep_agent(**kwargs):
|
|
255
|
+
created_system_prompts.append(kwargs["system_prompt"])
|
|
256
|
+
return FakeAgent()
|
|
257
|
+
|
|
251
258
|
fake_deepagents = types.ModuleType("deepagents")
|
|
252
259
|
fake_langchain_openai = types.ModuleType("langchain_openai")
|
|
253
260
|
fake_langgraph = types.ModuleType("langgraph")
|
|
@@ -255,7 +262,7 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
|
|
|
255
262
|
fake_langchain_core = types.ModuleType("langchain_core")
|
|
256
263
|
fake_tools_module = types.ModuleType("langchain_core.tools")
|
|
257
264
|
|
|
258
|
-
fake_deepagents.create_deep_agent =
|
|
265
|
+
fake_deepagents.create_deep_agent = fake_create_deep_agent
|
|
259
266
|
fake_langchain_openai.ChatOpenAI = FakeChatOpenAI
|
|
260
267
|
fake_langgraph_errors.GraphRecursionError = GraphRecursionError
|
|
261
268
|
fake_langgraph.errors = fake_langgraph_errors
|
|
@@ -276,12 +283,16 @@ async def test_wikispeedia_graph_recursion_limit_stops_rollout(
|
|
|
276
283
|
{
|
|
277
284
|
"info": {"source": "A"},
|
|
278
285
|
"prompt": [{"role": "user", "content": "start"}],
|
|
279
|
-
"system_prompt": [
|
|
286
|
+
"system_prompt": [
|
|
287
|
+
{"role": "user", "content": "first prompt chunk"},
|
|
288
|
+
{"role": "system", "content": "second prompt chunk"},
|
|
289
|
+
],
|
|
280
290
|
}
|
|
281
291
|
)
|
|
282
292
|
|
|
283
293
|
result = await program({}, state)
|
|
284
294
|
|
|
295
|
+
assert created_system_prompts == ["first prompt chunk\n\nsecond prompt chunk"]
|
|
285
296
|
assert result["agent_timeout"] is True
|
|
286
297
|
assert result["stop_reason"] == "agent_recursion_limit"
|
|
287
298
|
assert result["agent_completion"] == []
|
|
@@ -298,11 +309,10 @@ async def test_wikispeedia_tool_metrics_use_agent_completion(
|
|
|
298
309
|
{
|
|
299
310
|
"role": "assistant",
|
|
300
311
|
"content": "",
|
|
301
|
-
"tool_calls": [{"id": "call_1", "name": "click_link"}],
|
|
312
|
+
"tool_calls": [{"id": "call_1", "name": "click_link", "arguments": "{}"}],
|
|
302
313
|
},
|
|
303
314
|
{
|
|
304
315
|
"role": "tool",
|
|
305
|
-
"name": "click_link",
|
|
306
316
|
"tool_call_id": "call_1",
|
|
307
317
|
"content": "'C' is not a valid link from 'A'.",
|
|
308
318
|
},
|
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
1
|
import importlib.util
|
|
4
2
|
import inspect
|
|
5
3
|
from pathlib import Path
|
|
6
4
|
from typing import Any
|
|
7
5
|
|
|
6
|
+
import pytest
|
|
8
7
|
import verifiers.v1 as vf
|
|
9
8
|
|
|
10
9
|
|
|
@@ -27,7 +26,7 @@ def _load_mcp_search_module() -> Any:
|
|
|
27
26
|
def test_mcp_search_env_is_v1_only() -> None:
|
|
28
27
|
module = _load_mcp_search_module()
|
|
29
28
|
|
|
30
|
-
env = module.load_environment(max_turns=4)
|
|
29
|
+
env = module.load_environment(config=vf.EnvConfig(), max_turns=4)
|
|
31
30
|
|
|
32
31
|
assert isinstance(env, vf.Env)
|
|
33
32
|
assert isinstance(env.taskset, vf.Taskset)
|
|
@@ -59,3 +58,18 @@ def test_mcp_search_taskset_accepts_v1_taskset_config() -> None:
|
|
|
59
58
|
|
|
60
59
|
assert env.taskset.config.max_turns == 3
|
|
61
60
|
assert all(row["max_turns"] == 3 for row in rows)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@pytest.mark.asyncio
|
|
64
|
+
async def test_mcp_search_reward_handles_missing_assistant() -> None:
|
|
65
|
+
module = _load_mcp_search_module()
|
|
66
|
+
|
|
67
|
+
task = vf.Task({"answer": "expected"})
|
|
68
|
+
assert await module.exact_title_reward(task, vf.State({"completion": []})) == 0.0
|
|
69
|
+
assert (
|
|
70
|
+
await module.exact_title_reward(
|
|
71
|
+
task,
|
|
72
|
+
vf.State({"completion": [{"role": "user", "content": "expected"}]}),
|
|
73
|
+
)
|
|
74
|
+
== 0.0
|
|
75
|
+
)
|
|
@@ -1,5 +1,9 @@
|
|
|
1
|
-
from verifiers.types import AssistantMessage
|
|
2
|
-
from verifiers.utils.message_utils import
|
|
1
|
+
from verifiers.types import AssistantMessage, UserMessage
|
|
2
|
+
from verifiers.utils.message_utils import (
|
|
3
|
+
from_raw_message,
|
|
4
|
+
get_messages,
|
|
5
|
+
normalize_messages,
|
|
6
|
+
)
|
|
3
7
|
|
|
4
8
|
|
|
5
9
|
def test_from_raw_message_normalizes_oai_tool_calls():
|
|
@@ -55,3 +59,30 @@ def test_normalize_messages_accepts_oai_tool_call_dicts():
|
|
|
55
59
|
assert assistant.tool_calls[0].id == "call_2"
|
|
56
60
|
assert assistant.tool_calls[0].name == "lookup"
|
|
57
61
|
assert assistant.tool_calls[0].arguments == '{"q": "hello"}'
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_get_messages_returns_typed_messages():
|
|
65
|
+
messages = get_messages(
|
|
66
|
+
[
|
|
67
|
+
{"role": "user", "content": "question"},
|
|
68
|
+
{"role": "assistant", "content": "answer"},
|
|
69
|
+
]
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
assert isinstance(messages[0], UserMessage)
|
|
73
|
+
assert isinstance(messages[1], AssistantMessage)
|
|
74
|
+
assert messages[-1].content == "answer"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_get_messages_filters_by_role_with_typed_return():
|
|
78
|
+
messages = get_messages(
|
|
79
|
+
[
|
|
80
|
+
{"role": "user", "content": "question"},
|
|
81
|
+
{"role": "assistant", "content": "answer"},
|
|
82
|
+
],
|
|
83
|
+
role="assistant",
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
assert len(messages) == 1
|
|
87
|
+
assert isinstance(messages[0], AssistantMessage)
|
|
88
|
+
assert messages[0].content == "answer"
|
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
1
|
import importlib.util
|
|
2
|
+
import sys
|
|
4
3
|
from pathlib import Path
|
|
5
4
|
from typing import Any, cast
|
|
6
5
|
|
|
7
|
-
import pytest
|
|
8
|
-
|
|
9
6
|
import verifiers.v1 as vf
|
|
10
7
|
|
|
11
8
|
|
|
@@ -23,6 +20,7 @@ def _load_opencode_module() -> Any:
|
|
|
23
20
|
assert spec.loader is not None
|
|
24
21
|
|
|
25
22
|
module = importlib.util.module_from_spec(spec)
|
|
23
|
+
sys.modules[spec.name] = module
|
|
26
24
|
spec.loader.exec_module(module)
|
|
27
25
|
return module
|
|
28
26
|
|
|
@@ -30,33 +28,32 @@ def _load_opencode_module() -> Any:
|
|
|
30
28
|
def test_load_environment_uses_v1_taskset_and_harness() -> None:
|
|
31
29
|
module = _load_opencode_module()
|
|
32
30
|
|
|
33
|
-
env = module.load_environment()
|
|
31
|
+
env = module.load_environment(config=vf.EnvConfig())
|
|
34
32
|
|
|
35
33
|
assert isinstance(env, vf.Env)
|
|
36
34
|
assert isinstance(env.taskset, vf.HarborTaskset)
|
|
37
35
|
assert isinstance(env.harness, vf.OpenCode)
|
|
38
36
|
assert isinstance(env.harness.config, vf.OpenCodeConfig)
|
|
39
37
|
assert not hasattr(module, "OpenCodeHarborHarnessConfig")
|
|
40
|
-
assert
|
|
38
|
+
assert not hasattr(module, "TERMINAL_BENCH_SAMPLE_TASKS")
|
|
39
|
+
assert env.taskset.resolve_tasks_root() == Path(module.__file__).parent / "tasks"
|
|
41
40
|
assert env.harness.config.max_turns == 4
|
|
42
|
-
assert env.harness.config.disabled_tools ==
|
|
41
|
+
assert env.harness.config.disabled_tools == vf.OpenCodeConfig().disabled_tools
|
|
42
|
+
assert "webfetch" in env.harness.config.disabled_tools
|
|
43
|
+
assert "question" in env.harness.config.disabled_tools
|
|
43
44
|
|
|
44
45
|
program = cast(dict[str, object], env.harness.program)
|
|
45
|
-
mcp_setup = cast(dict[str, object], program["
|
|
46
|
+
mcp_setup = cast(dict[str, object], program["channels"])["mcp"]
|
|
46
47
|
assert '"webfetch": false' in cast(str, mcp_setup)
|
|
47
48
|
assert '"question": false' in cast(str, mcp_setup)
|
|
48
|
-
assert '"read": false' not in cast(str, mcp_setup)
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def test_load_environment_accepts_v1_taskset_and_harness_config(
|
|
52
|
-
tmp_path: Path,
|
|
53
|
-
) -> None:
|
|
51
|
+
def test_load_environment_accepts_v1_taskset_and_harness_config() -> None:
|
|
54
52
|
module = _load_opencode_module()
|
|
55
53
|
|
|
56
54
|
env = module.load_environment(
|
|
57
55
|
config=vf.EnvConfig(
|
|
58
56
|
taskset={
|
|
59
|
-
"tasks": str(tmp_path),
|
|
60
57
|
"task_names": ["task-a"],
|
|
61
58
|
"cpu_cores": 1.5,
|
|
62
59
|
},
|
|
@@ -68,7 +65,7 @@ def test_load_environment_accepts_v1_taskset_and_harness_config(
|
|
|
68
65
|
)
|
|
69
66
|
)
|
|
70
67
|
|
|
71
|
-
assert
|
|
68
|
+
assert env.taskset.resolve_tasks_root() == Path(module.__file__).parent / "tasks"
|
|
72
69
|
assert env.taskset.task_names == ["task-a"]
|
|
73
70
|
assert env.taskset.cpu_cores == 1.5
|
|
74
71
|
assert env.harness.config.agent_workdir == "/workspace"
|
|
@@ -76,25 +73,14 @@ def test_load_environment_accepts_v1_taskset_and_harness_config(
|
|
|
76
73
|
|
|
77
74
|
program = cast(dict[str, object], env.harness.program)
|
|
78
75
|
command = cast(list[object], program["command"])
|
|
79
|
-
mcp_setup = cast(dict[str, object], program["
|
|
76
|
+
mcp_setup = cast(dict[str, object], program["channels"])["mcp"]
|
|
80
77
|
assert "/workspace" in cast(str, command[2])
|
|
81
78
|
assert '"webfetch": false' in cast(str, mcp_setup)
|
|
82
79
|
assert '"question": false' not in cast(str, mcp_setup)
|
|
83
80
|
|
|
84
81
|
|
|
85
|
-
def
|
|
86
|
-
module = _load_opencode_module()
|
|
87
|
-
|
|
88
|
-
env = module.load_environment(dataset="terminal-bench-sample")
|
|
89
|
-
|
|
90
|
-
assert env.taskset.task_names == module.TERMINAL_BENCH_SAMPLE_TASKS
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def test_dataset_rejects_explicit_task_names() -> None:
|
|
82
|
+
def test_pyproject_does_not_define_unsupported_harness_defaults() -> None:
|
|
94
83
|
module = _load_opencode_module()
|
|
84
|
+
pyproject = Path(module.__file__).parent / "pyproject.toml"
|
|
95
85
|
|
|
96
|
-
|
|
97
|
-
module.load_environment(
|
|
98
|
-
dataset="terminal-bench-sample",
|
|
99
|
-
task_names=["hello-world"],
|
|
100
|
-
)
|
|
86
|
+
assert "[tool.verifiers.harness]" not in pyproject.read_text()
|