verifiers 0.1.15.dev6__tar.gz → 0.1.15.dev7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/PKG-INFO +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_cli.py +51 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_display.py +16 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_utils.py +16 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_path_utils.py +14 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/__init__.py +1 -1
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/eval.py +5 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/types.py +2 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/eval_display.py +25 -9
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/eval_utils.py +30 -16
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/path_utils.py +9 -3
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/.gitignore +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/LICENSE +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/pyproject.toml +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/AGENTS.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/conftest.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_browser_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_build_script.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_client_auth_errors.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_client_config.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_client_multimodal_types.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_composable_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_context_token_metrics.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_endpoint_registry.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_env_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_env_server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_environment.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_environment_extra.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_envs.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_gepa_cli.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_gepa_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_gym_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_harbor_env_mcp.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_imports.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_install_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_interception_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_lean_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_logging.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_math_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_mcp_search_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_message_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_message_utils_multimodal.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_multiturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_nemorl_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_opencode_harbor.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_openenv_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_per_turn_timing.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_pricing_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_prime_plugin.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_renderer_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_renderer_e2e.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rlm_composable_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rubric_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_save_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_setup_script.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_singleturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_trajectory_processing.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_tui_info_formatting.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_types.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_bfcl.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_config_extension.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_empty_completions.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_endpoint_protocols.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_example_counts.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_group_reward_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_harbor_cli.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_rlm_swe.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_runtime_lifecycle.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_scoring_functions.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_taskset_bindings.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_wordle_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/AGENTS.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/build.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/eval.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/gepa.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/init.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/install.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/setup.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/plugins/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/plugins/prime.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/tui.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/anthropic_messages_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_chat_completions_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_completions_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_responses_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/renderer_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/decorators.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/AGENTS.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/env_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/environment.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/cli_agent_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/_filter.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/composable_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harness.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/rlm_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/file_locks.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/openenv_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/multiturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/sandbox_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/tool_env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/errors.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/adapter.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/config.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/display.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/gepa_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/orchestrator.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/trainer.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/rubric.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/rubric_group.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/build.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/gepa.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/install.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/tui.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/vllm.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/client/env_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/client/zmq_env_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_router.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_worker.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/zmq_env_server.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/types.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/async_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/client_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/data_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/display_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/env_config_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/error_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/heartbeat.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/import_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/install_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/interception_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/logging_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/message_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/metric_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/pricing_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/process_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/save_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/serve_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/threaded_sandbox_client.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/tunnel_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/usage_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/version_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/README.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/RE_MIGRATION.md +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/config.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/env.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/harness.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/command.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/configs.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/mini_swe_agent.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/opencode.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/pi.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/rlm.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/terminus_2.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/tasksets/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/tasksets/harbor.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/runtime.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/state.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/task.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/taskset.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/toolset.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/types.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/user.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/__init__.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/artifact_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/binding_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/config_callable_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/config_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/endpoint_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/json_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/judge_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/lifecycle_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/mcp_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/object_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/program_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/prompt_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/runtime_registry.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/sandbox_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/scoring_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/serialization_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/task_freeze_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/taskset_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/timing_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/tool_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/trajectory_utils.py +0 -0
- {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/usage_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.15.dev6
|
|
3
|
+
Version: 0.1.15.dev7
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -13,6 +13,7 @@ import verifiers.scripts.eval as vf_eval
|
|
|
13
13
|
import verifiers.utils.eval_utils
|
|
14
14
|
from verifiers.types import GenerateOutputs
|
|
15
15
|
from verifiers.utils.eval_utils import load_toml_config
|
|
16
|
+
from verifiers.utils.path_utils import get_eval_results_path
|
|
16
17
|
from verifiers.utils.save_utils import states_to_outputs
|
|
17
18
|
|
|
18
19
|
|
|
@@ -706,6 +707,34 @@ def test_load_toml_config_multi_env():
|
|
|
706
707
|
assert result[1]["env_id"] == "env2"
|
|
707
708
|
|
|
708
709
|
|
|
710
|
+
def test_load_toml_config_duplicate_envs_accept_names():
|
|
711
|
+
"""Duplicate env ids can be labeled and configured independently."""
|
|
712
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
713
|
+
f.write(
|
|
714
|
+
'[[eval]]\nid = "env1"\nname = "env1-short"\n'
|
|
715
|
+
"[eval.args]\n"
|
|
716
|
+
'split = "short"\n\n'
|
|
717
|
+
'[[eval]]\nid = "env1"\nname = "env1-long"\n'
|
|
718
|
+
"[eval.args]\n"
|
|
719
|
+
'split = "long"\n'
|
|
720
|
+
)
|
|
721
|
+
f.flush()
|
|
722
|
+
result = load_toml_config(Path(f.name))
|
|
723
|
+
|
|
724
|
+
assert len(result) == 2
|
|
725
|
+
assert [config["env_id"] for config in result] == ["env1", "env1"]
|
|
726
|
+
assert [config["name"] for config in result] == ["env1-short", "env1-long"]
|
|
727
|
+
assert [config["env_args"]["split"] for config in result] == ["short", "long"]
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def test_load_toml_config_rejects_global_name():
|
|
731
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
732
|
+
f.write('name = "shared-name"\n\n[[eval]]\nid = "env1"\n')
|
|
733
|
+
f.flush()
|
|
734
|
+
with pytest.raises(ValueError, match="Invalid global field"):
|
|
735
|
+
load_toml_config(Path(f.name))
|
|
736
|
+
|
|
737
|
+
|
|
709
738
|
def test_load_toml_config_with_env_args():
|
|
710
739
|
"""Multiple sections with env_args field loads correctly."""
|
|
711
740
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
@@ -815,6 +844,28 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
|
|
|
815
844
|
assert configs[1].env_id == "env2"
|
|
816
845
|
|
|
817
846
|
|
|
847
|
+
def test_cli_duplicate_env_names_disambiguate_result_paths(monkeypatch, run_cli):
|
|
848
|
+
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
849
|
+
f.write(
|
|
850
|
+
'[[eval]]\nid = "env1"\nname = "env1-short"\n'
|
|
851
|
+
"[eval.args]\n"
|
|
852
|
+
'split = "short"\n\n'
|
|
853
|
+
'[[eval]]\nid = "env1"\nname = "env1-long"\n'
|
|
854
|
+
"[eval.args]\n"
|
|
855
|
+
'split = "long"\n'
|
|
856
|
+
)
|
|
857
|
+
f.flush()
|
|
858
|
+
captured = run_cli(monkeypatch, {"env_id_or_config": f.name})
|
|
859
|
+
|
|
860
|
+
configs = captured["configs"]
|
|
861
|
+
assert len(configs) == 2
|
|
862
|
+
assert [config.env_id for config in configs] == ["env1", "env1"]
|
|
863
|
+
assert [config.name for config in configs] == ["env1-short", "env1-long"]
|
|
864
|
+
assert [config.env_args["split"] for config in configs] == ["short", "long"]
|
|
865
|
+
assert get_eval_results_path(configs[0]).parent.name.startswith("env1-short--")
|
|
866
|
+
assert get_eval_results_path(configs[1]).parent.name.startswith("env1-long--")
|
|
867
|
+
|
|
868
|
+
|
|
818
869
|
def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
|
|
819
870
|
"""TOML config ignores CLI args, uses defaults for unspecified values."""
|
|
820
871
|
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
|
|
@@ -11,9 +11,11 @@ def make_config(
|
|
|
11
11
|
independent_scoring: bool = False,
|
|
12
12
|
endpoint_id: str | None = None,
|
|
13
13
|
client_config: ClientConfig | None = None,
|
|
14
|
+
name: str | None = None,
|
|
14
15
|
) -> EvalConfig:
|
|
15
16
|
return EvalConfig(
|
|
16
17
|
env_id="dummy-env",
|
|
18
|
+
name=name,
|
|
17
19
|
env_args={},
|
|
18
20
|
env_dir_path="./environments",
|
|
19
21
|
endpoint_id=endpoint_id,
|
|
@@ -82,6 +84,20 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
|
|
|
82
84
|
assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
|
|
83
85
|
|
|
84
86
|
|
|
87
|
+
def test_display_uses_eval_name_for_duplicate_env_labels() -> None:
|
|
88
|
+
display = EvalDisplay(
|
|
89
|
+
[
|
|
90
|
+
make_config(max_concurrent=1, name="dummy-env-short"),
|
|
91
|
+
make_config(max_concurrent=1, name="dummy-env-long"),
|
|
92
|
+
]
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
rendered = render_plain(display._make_compact_env_row(0))
|
|
96
|
+
|
|
97
|
+
assert "dummy-env-short" in rendered
|
|
98
|
+
assert "dummy-env-long" not in rendered
|
|
99
|
+
|
|
100
|
+
|
|
85
101
|
def render_plain(renderable) -> str:
|
|
86
102
|
console = Console(width=100, record=True)
|
|
87
103
|
console.print(renderable)
|
|
@@ -87,6 +87,22 @@ def test_print_results_single_rollout(capsys, make_metadata, make_state, make_in
|
|
|
87
87
|
assert "r1: [0.1, 0.2, 0.3]" in captured.out
|
|
88
88
|
|
|
89
89
|
|
|
90
|
+
def test_print_results_includes_eval_name(capsys, make_metadata, make_output):
|
|
91
|
+
from verifiers.utils.eval_utils import print_results
|
|
92
|
+
|
|
93
|
+
metadata = make_metadata(env_id="env1")
|
|
94
|
+
metadata["name"] = "env1-short"
|
|
95
|
+
results = GenerateOutputs(
|
|
96
|
+
outputs=[make_output(example_id=0, reward=1.0)],
|
|
97
|
+
metadata=metadata,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
print_results(results)
|
|
101
|
+
captured = capsys.readouterr()
|
|
102
|
+
|
|
103
|
+
assert "Environment: env1-short (env1)" in captured.out
|
|
104
|
+
|
|
105
|
+
|
|
90
106
|
def test_print_results_three_rollouts(capsys, make_metadata, make_state, make_input):
|
|
91
107
|
"""Test print_results with three rollouts per example."""
|
|
92
108
|
from verifiers.utils.eval_utils import print_results
|
|
@@ -3,6 +3,7 @@ from pathlib import Path
|
|
|
3
3
|
|
|
4
4
|
from verifiers.utils.path_utils import (
|
|
5
5
|
find_latest_incomplete_eval_results_path,
|
|
6
|
+
get_eval_runs_dir,
|
|
6
7
|
is_valid_eval_results_path,
|
|
7
8
|
)
|
|
8
9
|
|
|
@@ -69,6 +70,19 @@ def test_find_latest_incomplete_eval_results_path_returns_none_when_no_match(
|
|
|
69
70
|
assert result is None
|
|
70
71
|
|
|
71
72
|
|
|
73
|
+
def test_get_eval_runs_dir_uses_name_as_result_label(tmp_path: Path):
|
|
74
|
+
runs_dir = get_eval_runs_dir(
|
|
75
|
+
env_id="dummy-env",
|
|
76
|
+
name="dummy-env-short",
|
|
77
|
+
model="openai/gpt-4.1-mini",
|
|
78
|
+
output_dir=str(tmp_path / "outputs"),
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
assert runs_dir == (
|
|
82
|
+
tmp_path / "outputs" / "evals" / "dummy-env-short--openai--gpt-4.1-mini"
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
|
|
72
86
|
def test_is_valid_eval_results_path_requires_files(tmp_path: Path):
|
|
73
87
|
run_dir = tmp_path / "run"
|
|
74
88
|
run_dir.mkdir()
|
|
@@ -536,6 +536,9 @@ def main(argv: list[str] | None = None):
|
|
|
536
536
|
def build_eval_config(raw: dict) -> EvalConfig:
|
|
537
537
|
"""Build EvalConfig from a raw config dict."""
|
|
538
538
|
env_id = raw["env_id"]
|
|
539
|
+
name = raw.get("name")
|
|
540
|
+
if name is not None and (not isinstance(name, str) or not name):
|
|
541
|
+
raise ValueError("'name' must be a non-empty string when provided.")
|
|
539
542
|
|
|
540
543
|
# Resolve num_examples and rollouts_per_example with env defaults
|
|
541
544
|
env_defaults = get_env_eval_defaults(env_id)
|
|
@@ -775,6 +778,7 @@ def main(argv: list[str] | None = None):
|
|
|
775
778
|
rollouts_per_example=rollouts_per_example,
|
|
776
779
|
env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
|
|
777
780
|
output_dir=raw.get("output_dir"),
|
|
781
|
+
name=name,
|
|
778
782
|
)
|
|
779
783
|
if auto_resume_path is not None:
|
|
780
784
|
resume_path = auto_resume_path
|
|
@@ -794,6 +798,7 @@ def main(argv: list[str] | None = None):
|
|
|
794
798
|
|
|
795
799
|
return EvalConfig(
|
|
796
800
|
env_id=env_id,
|
|
801
|
+
name=name,
|
|
797
802
|
env_args=raw.get("env_args", {}),
|
|
798
803
|
env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
|
|
799
804
|
output_dir=raw.get("output_dir"),
|
|
@@ -937,6 +937,7 @@ class GenerateMetadata(TypedDict):
|
|
|
937
937
|
"""Pydantic model for generation metadata."""
|
|
938
938
|
|
|
939
939
|
env_id: str
|
|
940
|
+
name: NotRequired[str]
|
|
940
941
|
env_args: dict
|
|
941
942
|
model: str
|
|
942
943
|
base_url: str
|
|
@@ -1109,6 +1110,7 @@ class EvalConfig(BaseModel):
|
|
|
1109
1110
|
|
|
1110
1111
|
# environment
|
|
1111
1112
|
env_id: str
|
|
1113
|
+
name: str | None = None
|
|
1112
1114
|
env_args: dict
|
|
1113
1115
|
env_dir_path: str
|
|
1114
1116
|
# evaluation
|
|
@@ -36,6 +36,17 @@ from verifiers.utils.message_utils import format_messages
|
|
|
36
36
|
from verifiers.utils.pricing_utils import format_cost_usd
|
|
37
37
|
|
|
38
38
|
|
|
39
|
+
def _eval_label(config: EvalConfig) -> str:
|
|
40
|
+
return config.name or config.env_id
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _eval_title(config: EvalConfig) -> str:
|
|
44
|
+
label = _eval_label(config)
|
|
45
|
+
if config.name and config.name != config.env_id:
|
|
46
|
+
return f"{label} ({config.env_id})"
|
|
47
|
+
return label
|
|
48
|
+
|
|
49
|
+
|
|
39
50
|
@dataclass
|
|
40
51
|
class EnvEvalState:
|
|
41
52
|
"""Dynamic eval state for a single env."""
|
|
@@ -572,7 +583,7 @@ class EvalDisplay(BaseDisplay):
|
|
|
572
583
|
|
|
573
584
|
# build title with env name (and index if multi-env)
|
|
574
585
|
title = Text()
|
|
575
|
-
title.append(config.env_id, style="bold cyan")
|
|
586
|
+
title.append(_eval_title(config), style="bold cyan")
|
|
576
587
|
if len(self.configs) > 1:
|
|
577
588
|
title.append(f" (env {env_idx + 1}/{len(self.configs)})", style="dim")
|
|
578
589
|
|
|
@@ -740,9 +751,10 @@ class EvalDisplay(BaseDisplay):
|
|
|
740
751
|
|
|
741
752
|
prefix = "\u25b6 " if selected else " "
|
|
742
753
|
line = Text()
|
|
754
|
+
label = _eval_label(config)
|
|
743
755
|
if env_state.status == "completed":
|
|
744
756
|
line.append(f"{prefix}\u2713 ", style="bold green")
|
|
745
|
-
line.append(config.env_id, style="green")
|
|
757
|
+
line.append(label, style="green")
|
|
746
758
|
line.append(" reward ", style="dim")
|
|
747
759
|
line.append(format_numeric(env_state.reward), style="bold")
|
|
748
760
|
color = self._get_error_rate_color(env_state.error_rate)
|
|
@@ -754,7 +766,7 @@ class EvalDisplay(BaseDisplay):
|
|
|
754
766
|
line.append(f" {time_str}", style="dim")
|
|
755
767
|
elif env_state.status == "failed":
|
|
756
768
|
line.append(f"{prefix}\u2717 ", style="bold red")
|
|
757
|
-
line.append(config.env_id, style="red")
|
|
769
|
+
line.append(label, style="red")
|
|
758
770
|
if env_state.error:
|
|
759
771
|
line.append(" ", style="dim")
|
|
760
772
|
line.append(env_state.error[:80], style="red")
|
|
@@ -770,7 +782,7 @@ class EvalDisplay(BaseDisplay):
|
|
|
770
782
|
)
|
|
771
783
|
total_str = "..." if env_state.total <= 0 else str(env_state.total)
|
|
772
784
|
line.append(f"{prefix}\u25cf ", style="bold yellow")
|
|
773
|
-
line.append(config.env_id, style="yellow")
|
|
785
|
+
line.append(label, style="yellow")
|
|
774
786
|
line.append(f" {pct:.0f}%", style="bold")
|
|
775
787
|
line.append(f" ({env_state.progress}/{total_str})", style="dim")
|
|
776
788
|
line.append(" reward ", style="dim")
|
|
@@ -784,7 +796,7 @@ class EvalDisplay(BaseDisplay):
|
|
|
784
796
|
line.append(f" {time_str}", style="dim")
|
|
785
797
|
else:
|
|
786
798
|
line.append(f"{prefix}\u25cb ", style="dim")
|
|
787
|
-
line.append(config.env_id, style="dim")
|
|
799
|
+
line.append(label, style="dim")
|
|
788
800
|
line.append(" pending", style="dim")
|
|
789
801
|
|
|
790
802
|
return line
|
|
@@ -958,7 +970,7 @@ class EvalDisplay(BaseDisplay):
|
|
|
958
970
|
self.console.print(
|
|
959
971
|
Panel(
|
|
960
972
|
self._make_env_detail(config, env_state, results),
|
|
961
|
-
title=f"[bold blue]{config.env_id}[/bold blue]",
|
|
973
|
+
title=f"[bold blue]{_eval_title(config)}[/bold blue]",
|
|
962
974
|
border_style="dim",
|
|
963
975
|
)
|
|
964
976
|
)
|
|
@@ -980,12 +992,12 @@ class EvalDisplay(BaseDisplay):
|
|
|
980
992
|
env_state = self.state.envs[idx]
|
|
981
993
|
if env_state.error:
|
|
982
994
|
self.console.print()
|
|
983
|
-
self.console.print(f"[red]error in {config.env_id}:[/red]")
|
|
995
|
+
self.console.print(f"[red]error in {_eval_label(config)}:[/red]")
|
|
984
996
|
self.console.print(f" {env_state.error}")
|
|
985
997
|
|
|
986
998
|
# Summary table with main metrics (printed last)
|
|
987
999
|
table = Table(title="Evaluation Summary")
|
|
988
|
-
table.add_column("
|
|
1000
|
+
table.add_column("eval", style="cyan")
|
|
989
1001
|
table.add_column("status", justify="center")
|
|
990
1002
|
table.add_column("examples", justify="center")
|
|
991
1003
|
table.add_column("rollouts", justify="center")
|
|
@@ -1060,7 +1072,7 @@ class EvalDisplay(BaseDisplay):
|
|
|
1060
1072
|
mins, secs = divmod(int(elapsed), 60)
|
|
1061
1073
|
time_str = f"{mins}m {secs:02d}s" if mins > 0 else f"{secs}s"
|
|
1062
1074
|
|
|
1063
|
-
row = [config.env_id, status, examples_str, rollouts_str, reward]
|
|
1075
|
+
row = [_eval_label(config), status, examples_str, rollouts_str, reward]
|
|
1064
1076
|
if show_usage:
|
|
1065
1077
|
row.extend([input_tokens or "-", output_tokens or "-"])
|
|
1066
1078
|
if show_cost:
|
|
@@ -1079,6 +1091,10 @@ class EvalDisplay(BaseDisplay):
|
|
|
1079
1091
|
text = Text()
|
|
1080
1092
|
text.append("model: ", style="dim")
|
|
1081
1093
|
text.append(config.model, style="bold")
|
|
1094
|
+
if config.name:
|
|
1095
|
+
text.append("\n")
|
|
1096
|
+
text.append("env: ", style="dim")
|
|
1097
|
+
text.append(config.env_id, style="bold")
|
|
1082
1098
|
text.append("\n")
|
|
1083
1099
|
text.append("endpoint: ", style="dim")
|
|
1084
1100
|
text.append(self._format_client_target(config))
|
|
@@ -109,25 +109,35 @@ def _attach_metadata_cost(
|
|
|
109
109
|
return cost
|
|
110
110
|
|
|
111
111
|
|
|
112
|
-
def _with_metadata_cost(
|
|
112
|
+
def _attach_metadata_name(metadata: GenerateMetadata, name: str | None) -> bool:
|
|
113
|
+
if name is None:
|
|
114
|
+
return False
|
|
115
|
+
|
|
116
|
+
metadata["name"] = name
|
|
117
|
+
return True
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _with_eval_metadata(
|
|
113
121
|
on_progress: ProgressCallback | list[ProgressCallback] | None,
|
|
114
122
|
model_pricing: ModelPricing | None,
|
|
123
|
+
name: str | None,
|
|
115
124
|
) -> ProgressCallback | list[ProgressCallback] | None:
|
|
116
|
-
if model_pricing is None:
|
|
125
|
+
if model_pricing is None and name is None:
|
|
117
126
|
return on_progress
|
|
118
127
|
|
|
119
|
-
def
|
|
128
|
+
def attach_metadata(
|
|
120
129
|
all_outputs: list[RolloutOutput],
|
|
121
130
|
new_outputs: list[RolloutOutput],
|
|
122
131
|
metadata: GenerateMetadata,
|
|
123
132
|
) -> None:
|
|
133
|
+
_attach_metadata_name(metadata, name)
|
|
124
134
|
_attach_metadata_cost(metadata, model_pricing, all_outputs)
|
|
125
135
|
|
|
126
136
|
if on_progress is None:
|
|
127
|
-
return [
|
|
137
|
+
return [attach_metadata]
|
|
128
138
|
|
|
129
139
|
if isinstance(on_progress, list):
|
|
130
|
-
callbacks: list[ProgressCallback] = [
|
|
140
|
+
callbacks: list[ProgressCallback] = [attach_metadata]
|
|
131
141
|
callbacks.extend(cast(list[ProgressCallback], on_progress))
|
|
132
142
|
return callbacks
|
|
133
143
|
|
|
@@ -136,7 +146,7 @@ def _with_metadata_cost(
|
|
|
136
146
|
new_outputs: list[RolloutOutput],
|
|
137
147
|
metadata: GenerateMetadata,
|
|
138
148
|
) -> None:
|
|
139
|
-
|
|
149
|
+
attach_metadata(all_outputs, new_outputs, metadata)
|
|
140
150
|
on_progress(all_outputs, new_outputs, metadata)
|
|
141
151
|
|
|
142
152
|
return wrapped_progress
|
|
@@ -526,6 +536,7 @@ def load_toml_config(
|
|
|
526
536
|
valid_fields = {
|
|
527
537
|
# environment
|
|
528
538
|
"env_id",
|
|
539
|
+
"name",
|
|
529
540
|
"args",
|
|
530
541
|
"env_args",
|
|
531
542
|
"taskset",
|
|
@@ -573,11 +584,12 @@ def load_toml_config(
|
|
|
573
584
|
|
|
574
585
|
# validate global fields
|
|
575
586
|
if global_defaults:
|
|
576
|
-
|
|
587
|
+
global_valid_fields = valid_fields - {"name"}
|
|
588
|
+
invalid_global = set(global_defaults.keys()) - global_valid_fields
|
|
577
589
|
if invalid_global:
|
|
578
590
|
raise ValueError(
|
|
579
591
|
f"Invalid global field(s) {invalid_global}. "
|
|
580
|
-
f"Valid fields are: {sorted(valid_fields)}"
|
|
592
|
+
f"Valid fields are: {sorted(global_valid_fields)}"
|
|
581
593
|
)
|
|
582
594
|
|
|
583
595
|
# merge global defaults with per-eval configs
|
|
@@ -856,7 +868,10 @@ def print_usage(results: GenerateOutputs):
|
|
|
856
868
|
def print_results(results: GenerateOutputs, num_samples: int = 1):
|
|
857
869
|
assert results["metadata"] is not None
|
|
858
870
|
print("--- Evaluation ---")
|
|
859
|
-
|
|
871
|
+
env_id = results["metadata"]["env_id"]
|
|
872
|
+
name = results["metadata"].get("name")
|
|
873
|
+
env_label = f"{name} ({env_id})" if name and name != env_id else env_id
|
|
874
|
+
print(f"Environment: {env_label}")
|
|
860
875
|
print(f"Model: {results['metadata']['model']}")
|
|
861
876
|
print(f"Provider: {results['metadata']['base_url']}")
|
|
862
877
|
print(f"Examples: {results['metadata']['num_examples']}")
|
|
@@ -932,7 +947,7 @@ async def run_evaluation(
|
|
|
932
947
|
|
|
933
948
|
results_path = config.resume_path or get_eval_results_path(config)
|
|
934
949
|
model_pricing = await _resolve_model_pricing(config)
|
|
935
|
-
on_progress = _with_metadata_cost(on_progress, model_pricing)
|
|
950
|
+
on_progress = _with_eval_metadata(on_progress, model_pricing, config.name)
|
|
936
951
|
|
|
937
952
|
try:
|
|
938
953
|
if not config.disable_env_server:
|
|
@@ -1022,12 +1037,11 @@ async def run_evaluation(
|
|
|
1022
1037
|
if not config.disable_env_server:
|
|
1023
1038
|
await vf_env.stop_server()
|
|
1024
1039
|
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
await asyncio.to_thread(save_metadata, outputs["metadata"], results_path)
|
|
1040
|
+
metadata_changed = _attach_metadata_name(outputs["metadata"], config.name)
|
|
1041
|
+
if _attach_metadata_cost(outputs["metadata"], model_pricing, outputs["outputs"]):
|
|
1042
|
+
metadata_changed = True
|
|
1043
|
+
if metadata_changed and config.save_results:
|
|
1044
|
+
await asyncio.to_thread(save_metadata, outputs["metadata"], results_path)
|
|
1031
1045
|
|
|
1032
1046
|
return outputs
|
|
1033
1047
|
|
|
@@ -51,7 +51,7 @@ def get_eval_results_path(config: EvalConfig) -> Path:
|
|
|
51
51
|
base_path = _get_outputs_base_path(
|
|
52
52
|
config.env_id, config.env_dir_path, config.output_dir
|
|
53
53
|
)
|
|
54
|
-
return get_results_path(config.env_id, config.model, base_path)
|
|
54
|
+
return get_results_path(config.name or config.env_id, config.model, base_path)
|
|
55
55
|
|
|
56
56
|
|
|
57
57
|
def get_eval_runs_dir(
|
|
@@ -59,10 +59,11 @@ def get_eval_runs_dir(
|
|
|
59
59
|
model: str,
|
|
60
60
|
env_dir_path: str = "./environments",
|
|
61
61
|
output_dir: str | None = None,
|
|
62
|
+
name: str | None = None,
|
|
62
63
|
) -> Path:
|
|
63
64
|
"""Return directory containing all eval run directories for env/model."""
|
|
64
65
|
base_path = _get_outputs_base_path(env_id, env_dir_path, output_dir)
|
|
65
|
-
env_model_str = f"{env_id}--{model.replace('/', '--')}"
|
|
66
|
+
env_model_str = f"{name or env_id}--{model.replace('/', '--')}"
|
|
66
67
|
return base_path / "evals" / env_model_str
|
|
67
68
|
|
|
68
69
|
|
|
@@ -108,10 +109,15 @@ def find_latest_incomplete_eval_results_path(
|
|
|
108
109
|
rollouts_per_example: int,
|
|
109
110
|
env_dir_path: str = "./environments",
|
|
110
111
|
output_dir: str | None = None,
|
|
112
|
+
name: str | None = None,
|
|
111
113
|
) -> Path | None:
|
|
112
114
|
"""Find the newest resumable, incomplete eval run for the provided config."""
|
|
113
115
|
runs_dir = get_eval_runs_dir(
|
|
114
|
-
env_id=env_id, model=model, env_dir_path=env_dir_path, output_dir=output_dir
|
|
116
|
+
env_id=env_id,
|
|
117
|
+
model=model,
|
|
118
|
+
env_dir_path=env_dir_path,
|
|
119
|
+
output_dir=output_dir,
|
|
120
|
+
name=name,
|
|
115
121
|
)
|
|
116
122
|
if not runs_dir.exists():
|
|
117
123
|
return None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_langchain_deep_agents_wikispeedia.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_openai_chat_completions_token_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|