verifiers 0.1.15.dev6__tar.gz → 0.1.15.dev7__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (311)
  1. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/PKG-INFO +1 -1
  2. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_cli.py +51 -0
  3. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_display.py +16 -0
  4. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_eval_utils.py +16 -0
  5. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_path_utils.py +14 -0
  6. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/__init__.py +1 -1
  7. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/eval.py +5 -0
  8. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/types.py +2 -0
  9. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/eval_display.py +25 -9
  10. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/eval_utils.py +30 -16
  11. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/path_utils.py +9 -3
  12. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/.gitignore +0 -0
  13. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/LICENSE +0 -0
  14. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/README.md +0 -0
  15. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/pyproject.toml +0 -0
  16. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/AGENTS.md +0 -0
  17. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/README.md +0 -0
  18. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/__init__.py +0 -0
  19. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/conftest.py +0 -0
  20. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_browser_env.py +0 -0
  21. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_build_script.py +0 -0
  22. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_cli_agent_env.py +0 -0
  23. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_client_auth_errors.py +0 -0
  24. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_client_config.py +0 -0
  25. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_client_multimodal_types.py +0 -0
  26. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_composable_env.py +0 -0
  27. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_context_token_metrics.py +0 -0
  28. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_decorator_ranks.py +0 -0
  29. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_endpoint_registry.py +0 -0
  30. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_env_group.py +0 -0
  31. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_env_server.py +0 -0
  32. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_environment.py +0 -0
  33. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_environment_extra.py +0 -0
  34. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_envs.py +0 -0
  35. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_error_chain.py +0 -0
  36. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_gepa_cli.py +0 -0
  37. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_gepa_utils.py +0 -0
  38. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_gym_env.py +0 -0
  39. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_harbor_env_mcp.py +0 -0
  40. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_imports.py +0 -0
  41. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_install_utils.py +0 -0
  42. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_interception_utils.py +0 -0
  43. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
  44. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_lean_task.py +0 -0
  45. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_logging.py +0 -0
  46. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_math_rubric.py +0 -0
  47. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_maybe_think_parser.py +0 -0
  48. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_mcp_search_env.py +0 -0
  49. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_message_utils.py +0 -0
  50. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_message_utils_multimodal.py +0 -0
  51. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_multiturn_env.py +0 -0
  52. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_nemorl_client.py +0 -0
  53. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_openai_chat_completions_token_client.py +0 -0
  54. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_openai_responses_client.py +0 -0
  55. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_opencode_harbor.py +0 -0
  56. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_opencode_rlm_env.py +0 -0
  57. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_openenv_client.py +0 -0
  58. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_parser.py +0 -0
  59. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_per_turn_timing.py +0 -0
  60. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_pricing_utils.py +0 -0
  61. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_prime_plugin.py +0 -0
  62. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_renderer_client.py +0 -0
  63. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_renderer_e2e.py +0 -0
  64. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rlm_composable_env.py +0 -0
  65. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rlm_env.py +0 -0
  66. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rubric.py +0 -0
  67. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_rubric_group.py +0 -0
  68. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_sandbox_env.py +0 -0
  69. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_sandbox_mixin.py +0 -0
  70. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_save_utils.py +0 -0
  71. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_setup_script.py +0 -0
  72. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_singleturn_env.py +0 -0
  73. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_stateful_tool_env.py +0 -0
  74. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_think_parser.py +0 -0
  75. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_tool_env.py +0 -0
  76. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_tool_utils.py +0 -0
  77. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_trajectory_processing.py +0 -0
  78. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_tui_info_formatting.py +0 -0
  79. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_types.py +0 -0
  80. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_bfcl.py +0 -0
  81. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_config_extension.py +0 -0
  82. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_empty_completions.py +0 -0
  83. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_endpoint_protocols.py +0 -0
  84. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_example_counts.py +0 -0
  85. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_group_reward_env.py +0 -0
  86. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_harbor_cli.py +0 -0
  87. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_mini_swe_agent.py +0 -0
  88. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_rlm_swe.py +0 -0
  89. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_runtime_lifecycle.py +0 -0
  90. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_scoring_functions.py +0 -0
  91. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_v1_taskset_bindings.py +0 -0
  92. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_wordle_env.py +0 -0
  93. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/tests/test_xml_parser.py +0 -0
  94. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/AGENTS.md +0 -0
  95. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/__init__.py +0 -0
  96. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/__init__.py +0 -0
  97. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/build.py +0 -0
  98. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/eval.py +0 -0
  99. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/gepa.py +0 -0
  100. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/init.py +0 -0
  101. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/install.py +0 -0
  102. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/commands/setup.py +0 -0
  103. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/plugins/__init__.py +0 -0
  104. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/plugins/prime.py +0 -0
  105. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/cli/tui.py +0 -0
  106. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/__init__.py +0 -0
  107. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/anthropic_messages_client.py +0 -0
  108. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/client.py +0 -0
  109. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  110. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_chat_completions_client.py +0 -0
  111. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  112. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_completions_client.py +0 -0
  113. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/openai_responses_client.py +0 -0
  114. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/clients/renderer_client.py +0 -0
  115. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/decorators.py +0 -0
  116. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/AGENTS.md +0 -0
  117. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/__init__.py +0 -0
  118. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/env_group.py +0 -0
  119. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/environment.py +0 -0
  120. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/README.md +0 -0
  121. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/__init__.py +0 -0
  122. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/cli_agent_env.py +0 -0
  123. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/README.md +0 -0
  124. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/__init__.py +0 -0
  125. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/_filter.py +0 -0
  126. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/composable_env.py +0 -0
  127. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harness.py +0 -0
  128. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  129. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
  130. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  131. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  132. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
  133. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
  134. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/task.py +0 -0
  135. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  136. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  137. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  138. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  139. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  140. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  141. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  142. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  143. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  144. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  145. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  146. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
  147. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  148. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  149. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
  150. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
  151. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
  152. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
  153. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
  154. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -0
  155. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
  156. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
  157. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  158. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/gym_env.py +0 -0
  159. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  160. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/env.py +0 -0
  161. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  162. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/mcp_env.py +0 -0
  163. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_env.py +0 -0
  164. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  165. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  166. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/rlm_env.py +0 -0
  167. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  168. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/__init__.py +0 -0
  169. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  170. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
  171. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/README.md +0 -0
  172. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/__init__.py +0 -0
  173. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/README.md +0 -0
  174. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  175. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  176. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  177. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  178. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  179. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  180. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/openenv_env.py +0 -0
  181. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  182. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/integrations/textarena_env.py +0 -0
  183. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/multiturn_env.py +0 -0
  184. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/python_env.py +0 -0
  185. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/sandbox_env.py +0 -0
  186. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/singleturn_env.py +0 -0
  187. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/stateful_tool_env.py +0 -0
  188. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/envs/tool_env.py +0 -0
  189. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/errors.py +0 -0
  190. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/__init__.py +0 -0
  191. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/adapter.py +0 -0
  192. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/config.py +0 -0
  193. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/display.py +0 -0
  194. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/gepa/gepa_utils.py +0 -0
  195. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/__init__.py +0 -0
  196. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/maybe_think_parser.py +0 -0
  197. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/parser.py +0 -0
  198. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/think_parser.py +0 -0
  199. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/parsers/xml_parser.py +0 -0
  200. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/README.md +0 -0
  201. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/__init__.py +0 -0
  202. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/inference/__init__.py +0 -0
  203. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/inference/client.py +0 -0
  204. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/inference/server.py +0 -0
  205. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/__init__.py +0 -0
  206. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/config.py +0 -0
  207. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/orchestrator.py +0 -0
  208. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/trainer.py +0 -0
  209. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rl/trainer/utils.py +0 -0
  210. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/__init__.py +0 -0
  211. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  212. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/judge_rubric.py +0 -0
  213. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/math_rubric.py +0 -0
  214. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/rubric.py +0 -0
  215. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/rubrics/rubric_group.py +0 -0
  216. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/__init__.py +0 -0
  217. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/build.py +0 -0
  218. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/gepa.py +0 -0
  219. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/init.py +0 -0
  220. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/install.py +0 -0
  221. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/rl.py +0 -0
  222. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/setup.py +0 -0
  223. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/train.py +0 -0
  224. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/tui.py +0 -0
  225. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/scripts/vllm.py +0 -0
  226. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/__init__.py +0 -0
  227. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/client/env_client.py +0 -0
  228. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/client/zmq_env_client.py +0 -0
  229. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/__init__.py +0 -0
  230. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_router.py +0 -0
  231. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_server.py +0 -0
  232. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/env_worker.py +0 -0
  233. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/server/zmq_env_server.py +0 -0
  234. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/serve/types.py +0 -0
  235. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/__init__.py +0 -0
  236. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/async_utils.py +0 -0
  237. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/client_utils.py +0 -0
  238. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/config_utils.py +0 -0
  239. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/data_utils.py +0 -0
  240. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/display_utils.py +0 -0
  241. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/env_config_utils.py +0 -0
  242. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/env_utils.py +0 -0
  243. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/error_utils.py +0 -0
  244. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/heartbeat.py +0 -0
  245. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/import_utils.py +0 -0
  246. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/install_utils.py +0 -0
  247. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/interception_utils.py +0 -0
  248. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/logging_utils.py +0 -0
  249. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/message_utils.py +0 -0
  250. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/metric_utils.py +0 -0
  251. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/pricing_utils.py +0 -0
  252. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/process_utils.py +0 -0
  253. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/response_utils.py +0 -0
  254. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/save_utils.py +0 -0
  255. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/serve_utils.py +0 -0
  256. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/thread_utils.py +0 -0
  257. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/threaded_sandbox_client.py +0 -0
  258. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/tool_utils.py +0 -0
  259. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/tunnel_utils.py +0 -0
  260. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/usage_utils.py +0 -0
  261. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/utils/version_utils.py +0 -0
  262. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -0
  263. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/README.md +0 -0
  264. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/RE_MIGRATION.md +0 -0
  265. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/__init__.py +0 -0
  266. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/config.py +0 -0
  267. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/env.py +0 -0
  268. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/harness.py +0 -0
  269. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/__init__.py +0 -0
  270. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/__init__.py +0 -0
  271. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/command.py +0 -0
  272. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/configs.py +0 -0
  273. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/mini_swe_agent.py +0 -0
  274. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/opencode.py +0 -0
  275. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/pi.py +0 -0
  276. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/rlm.py +0 -0
  277. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/harnesses/terminus_2.py +0 -0
  278. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/tasksets/__init__.py +0 -0
  279. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/packages/tasksets/harbor.py +0 -0
  280. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/runtime.py +0 -0
  281. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/state.py +0 -0
  282. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/task.py +0 -0
  283. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/taskset.py +0 -0
  284. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/toolset.py +0 -0
  285. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/types.py +0 -0
  286. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/user.py +0 -0
  287. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/__init__.py +0 -0
  288. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/artifact_utils.py +0 -0
  289. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/binding_utils.py +0 -0
  290. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/config_callable_utils.py +0 -0
  291. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/config_utils.py +0 -0
  292. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/endpoint_utils.py +0 -0
  293. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/json_utils.py +0 -0
  294. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/judge_utils.py +0 -0
  295. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/lifecycle_utils.py +0 -0
  296. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
  297. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/mcp_utils.py +0 -0
  298. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/object_utils.py +0 -0
  299. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/program_utils.py +0 -0
  300. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/prompt_utils.py +0 -0
  301. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/runtime_registry.py +0 -0
  302. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
  303. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/sandbox_utils.py +0 -0
  304. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/scoring_utils.py +0 -0
  305. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/serialization_utils.py +0 -0
  306. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/task_freeze_utils.py +0 -0
  307. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/taskset_utils.py +0 -0
  308. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/timing_utils.py +0 -0
  309. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/tool_utils.py +0 -0
  310. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/trajectory_utils.py +0 -0
  311. {verifiers-0.1.15.dev6 → verifiers-0.1.15.dev7}/verifiers/v1/utils/usage_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.15.dev6
3
+ Version: 0.1.15.dev7
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -13,6 +13,7 @@ import verifiers.scripts.eval as vf_eval
13
13
  import verifiers.utils.eval_utils
14
14
  from verifiers.types import GenerateOutputs
15
15
  from verifiers.utils.eval_utils import load_toml_config
16
+ from verifiers.utils.path_utils import get_eval_results_path
16
17
  from verifiers.utils.save_utils import states_to_outputs
17
18
 
18
19
 
@@ -706,6 +707,34 @@ def test_load_toml_config_multi_env():
706
707
  assert result[1]["env_id"] == "env2"
707
708
 
708
709
 
710
+ def test_load_toml_config_duplicate_envs_accept_names():
711
+ """Duplicate env ids can be labeled and configured independently."""
712
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
713
+ f.write(
714
+ '[[eval]]\nid = "env1"\nname = "env1-short"\n'
715
+ "[eval.args]\n"
716
+ 'split = "short"\n\n'
717
+ '[[eval]]\nid = "env1"\nname = "env1-long"\n'
718
+ "[eval.args]\n"
719
+ 'split = "long"\n'
720
+ )
721
+ f.flush()
722
+ result = load_toml_config(Path(f.name))
723
+
724
+ assert len(result) == 2
725
+ assert [config["env_id"] for config in result] == ["env1", "env1"]
726
+ assert [config["name"] for config in result] == ["env1-short", "env1-long"]
727
+ assert [config["env_args"]["split"] for config in result] == ["short", "long"]
728
+
729
+
730
+ def test_load_toml_config_rejects_global_name():
731
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
732
+ f.write('name = "shared-name"\n\n[[eval]]\nid = "env1"\n')
733
+ f.flush()
734
+ with pytest.raises(ValueError, match="Invalid global field"):
735
+ load_toml_config(Path(f.name))
736
+
737
+
709
738
  def test_load_toml_config_with_env_args():
710
739
  """Multiple sections with env_args field loads correctly."""
711
740
  with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
@@ -815,6 +844,28 @@ def test_cli_multi_env_via_toml_config(monkeypatch, run_cli):
815
844
  assert configs[1].env_id == "env2"
816
845
 
817
846
 
847
+ def test_cli_duplicate_env_names_disambiguate_result_paths(monkeypatch, run_cli):
848
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
849
+ f.write(
850
+ '[[eval]]\nid = "env1"\nname = "env1-short"\n'
851
+ "[eval.args]\n"
852
+ 'split = "short"\n\n'
853
+ '[[eval]]\nid = "env1"\nname = "env1-long"\n'
854
+ "[eval.args]\n"
855
+ 'split = "long"\n'
856
+ )
857
+ f.flush()
858
+ captured = run_cli(monkeypatch, {"env_id_or_config": f.name})
859
+
860
+ configs = captured["configs"]
861
+ assert len(configs) == 2
862
+ assert [config.env_id for config in configs] == ["env1", "env1"]
863
+ assert [config.name for config in configs] == ["env1-short", "env1-long"]
864
+ assert [config.env_args["split"] for config in configs] == ["short", "long"]
865
+ assert get_eval_results_path(configs[0]).parent.name.startswith("env1-short--")
866
+ assert get_eval_results_path(configs[1]).parent.name.startswith("env1-long--")
867
+
868
+
818
869
  def test_cli_toml_ignores_cli_args(monkeypatch, run_cli):
819
870
  """TOML config ignores CLI args, uses defaults for unspecified values."""
820
871
  with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
@@ -11,9 +11,11 @@ def make_config(
11
11
  independent_scoring: bool = False,
12
12
  endpoint_id: str | None = None,
13
13
  client_config: ClientConfig | None = None,
14
+ name: str | None = None,
14
15
  ) -> EvalConfig:
15
16
  return EvalConfig(
16
17
  env_id="dummy-env",
18
+ name=name,
17
19
  env_args={},
18
20
  env_dir_path="./environments",
19
21
  endpoint_id=endpoint_id,
@@ -82,6 +84,20 @@ def test_format_client_target_uses_single_resolved_base_url() -> None:
82
84
  assert EvalDisplay._format_client_target(config) == "http://localhost:8001/v1"
83
85
 
84
86
 
87
+ def test_display_uses_eval_name_for_duplicate_env_labels() -> None:
88
+ display = EvalDisplay(
89
+ [
90
+ make_config(max_concurrent=1, name="dummy-env-short"),
91
+ make_config(max_concurrent=1, name="dummy-env-long"),
92
+ ]
93
+ )
94
+
95
+ rendered = render_plain(display._make_compact_env_row(0))
96
+
97
+ assert "dummy-env-short" in rendered
98
+ assert "dummy-env-long" not in rendered
99
+
100
+
85
101
  def render_plain(renderable) -> str:
86
102
  console = Console(width=100, record=True)
87
103
  console.print(renderable)
@@ -87,6 +87,22 @@ def test_print_results_single_rollout(capsys, make_metadata, make_state, make_in
87
87
  assert "r1: [0.1, 0.2, 0.3]" in captured.out
88
88
 
89
89
 
90
+ def test_print_results_includes_eval_name(capsys, make_metadata, make_output):
91
+ from verifiers.utils.eval_utils import print_results
92
+
93
+ metadata = make_metadata(env_id="env1")
94
+ metadata["name"] = "env1-short"
95
+ results = GenerateOutputs(
96
+ outputs=[make_output(example_id=0, reward=1.0)],
97
+ metadata=metadata,
98
+ )
99
+
100
+ print_results(results)
101
+ captured = capsys.readouterr()
102
+
103
+ assert "Environment: env1-short (env1)" in captured.out
104
+
105
+
90
106
  def test_print_results_three_rollouts(capsys, make_metadata, make_state, make_input):
91
107
  """Test print_results with three rollouts per example."""
92
108
  from verifiers.utils.eval_utils import print_results
@@ -3,6 +3,7 @@ from pathlib import Path
3
3
 
4
4
  from verifiers.utils.path_utils import (
5
5
  find_latest_incomplete_eval_results_path,
6
+ get_eval_runs_dir,
6
7
  is_valid_eval_results_path,
7
8
  )
8
9
 
@@ -69,6 +70,19 @@ def test_find_latest_incomplete_eval_results_path_returns_none_when_no_match(
69
70
  assert result is None
70
71
 
71
72
 
73
+ def test_get_eval_runs_dir_uses_name_as_result_label(tmp_path: Path):
74
+ runs_dir = get_eval_runs_dir(
75
+ env_id="dummy-env",
76
+ name="dummy-env-short",
77
+ model="openai/gpt-4.1-mini",
78
+ output_dir=str(tmp_path / "outputs"),
79
+ )
80
+
81
+ assert runs_dir == (
82
+ tmp_path / "outputs" / "evals" / "dummy-env-short--openai--gpt-4.1-mini"
83
+ )
84
+
85
+
72
86
  def test_is_valid_eval_results_path_requires_files(tmp_path: Path):
73
87
  run_dir = tmp_path / "run"
74
88
  run_dir.mkdir()
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.15.dev6"
1
+ __version__ = "0.1.15.dev7"
2
2
 
3
3
  import importlib
4
4
  import os
@@ -536,6 +536,9 @@ def main(argv: list[str] | None = None):
536
536
  def build_eval_config(raw: dict) -> EvalConfig:
537
537
  """Build EvalConfig from a raw config dict."""
538
538
  env_id = raw["env_id"]
539
+ name = raw.get("name")
540
+ if name is not None and (not isinstance(name, str) or not name):
541
+ raise ValueError("'name' must be a non-empty string when provided.")
539
542
 
540
543
  # Resolve num_examples and rollouts_per_example with env defaults
541
544
  env_defaults = get_env_eval_defaults(env_id)
@@ -775,6 +778,7 @@ def main(argv: list[str] | None = None):
775
778
  rollouts_per_example=rollouts_per_example,
776
779
  env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
777
780
  output_dir=raw.get("output_dir"),
781
+ name=name,
778
782
  )
779
783
  if auto_resume_path is not None:
780
784
  resume_path = auto_resume_path
@@ -794,6 +798,7 @@ def main(argv: list[str] | None = None):
794
798
 
795
799
  return EvalConfig(
796
800
  env_id=env_id,
801
+ name=name,
797
802
  env_args=raw.get("env_args", {}),
798
803
  env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
799
804
  output_dir=raw.get("output_dir"),
@@ -937,6 +937,7 @@ class GenerateMetadata(TypedDict):
937
937
  """Pydantic model for generation metadata."""
938
938
 
939
939
  env_id: str
940
+ name: NotRequired[str]
940
941
  env_args: dict
941
942
  model: str
942
943
  base_url: str
@@ -1109,6 +1110,7 @@ class EvalConfig(BaseModel):
1109
1110
 
1110
1111
  # environment
1111
1112
  env_id: str
1113
+ name: str | None = None
1112
1114
  env_args: dict
1113
1115
  env_dir_path: str
1114
1116
  # evaluation
@@ -36,6 +36,17 @@ from verifiers.utils.message_utils import format_messages
36
36
  from verifiers.utils.pricing_utils import format_cost_usd
37
37
 
38
38
 
39
+ def _eval_label(config: EvalConfig) -> str:
40
+ return config.name or config.env_id
41
+
42
+
43
+ def _eval_title(config: EvalConfig) -> str:
44
+ label = _eval_label(config)
45
+ if config.name and config.name != config.env_id:
46
+ return f"{label} ({config.env_id})"
47
+ return label
48
+
49
+
39
50
  @dataclass
40
51
  class EnvEvalState:
41
52
  """Dynamic eval state for a single env."""
@@ -572,7 +583,7 @@ class EvalDisplay(BaseDisplay):
572
583
 
573
584
  # build title with env name (and index if multi-env)
574
585
  title = Text()
575
- title.append(config.env_id, style="bold cyan")
586
+ title.append(_eval_title(config), style="bold cyan")
576
587
  if len(self.configs) > 1:
577
588
  title.append(f" (env {env_idx + 1}/{len(self.configs)})", style="dim")
578
589
 
@@ -740,9 +751,10 @@ class EvalDisplay(BaseDisplay):
740
751
 
741
752
  prefix = "\u25b6 " if selected else " "
742
753
  line = Text()
754
+ label = _eval_label(config)
743
755
  if env_state.status == "completed":
744
756
  line.append(f"{prefix}\u2713 ", style="bold green")
745
- line.append(config.env_id, style="green")
757
+ line.append(label, style="green")
746
758
  line.append(" reward ", style="dim")
747
759
  line.append(format_numeric(env_state.reward), style="bold")
748
760
  color = self._get_error_rate_color(env_state.error_rate)
@@ -754,7 +766,7 @@ class EvalDisplay(BaseDisplay):
754
766
  line.append(f" {time_str}", style="dim")
755
767
  elif env_state.status == "failed":
756
768
  line.append(f"{prefix}\u2717 ", style="bold red")
757
- line.append(config.env_id, style="red")
769
+ line.append(label, style="red")
758
770
  if env_state.error:
759
771
  line.append(" ", style="dim")
760
772
  line.append(env_state.error[:80], style="red")
@@ -770,7 +782,7 @@ class EvalDisplay(BaseDisplay):
770
782
  )
771
783
  total_str = "..." if env_state.total <= 0 else str(env_state.total)
772
784
  line.append(f"{prefix}\u25cf ", style="bold yellow")
773
- line.append(config.env_id, style="yellow")
785
+ line.append(label, style="yellow")
774
786
  line.append(f" {pct:.0f}%", style="bold")
775
787
  line.append(f" ({env_state.progress}/{total_str})", style="dim")
776
788
  line.append(" reward ", style="dim")
@@ -784,7 +796,7 @@ class EvalDisplay(BaseDisplay):
784
796
  line.append(f" {time_str}", style="dim")
785
797
  else:
786
798
  line.append(f"{prefix}\u25cb ", style="dim")
787
- line.append(config.env_id, style="dim")
799
+ line.append(label, style="dim")
788
800
  line.append(" pending", style="dim")
789
801
 
790
802
  return line
@@ -958,7 +970,7 @@ class EvalDisplay(BaseDisplay):
958
970
  self.console.print(
959
971
  Panel(
960
972
  self._make_env_detail(config, env_state, results),
961
- title=f"[bold blue]{config.env_id}[/bold blue]",
973
+ title=f"[bold blue]{_eval_title(config)}[/bold blue]",
962
974
  border_style="dim",
963
975
  )
964
976
  )
@@ -980,12 +992,12 @@ class EvalDisplay(BaseDisplay):
980
992
  env_state = self.state.envs[idx]
981
993
  if env_state.error:
982
994
  self.console.print()
983
- self.console.print(f"[red]error in {config.env_id}:[/red]")
995
+ self.console.print(f"[red]error in {_eval_label(config)}:[/red]")
984
996
  self.console.print(f" {env_state.error}")
985
997
 
986
998
  # Summary table with main metrics (printed last)
987
999
  table = Table(title="Evaluation Summary")
988
- table.add_column("env_id", style="cyan")
1000
+ table.add_column("eval", style="cyan")
989
1001
  table.add_column("status", justify="center")
990
1002
  table.add_column("examples", justify="center")
991
1003
  table.add_column("rollouts", justify="center")
@@ -1060,7 +1072,7 @@ class EvalDisplay(BaseDisplay):
1060
1072
  mins, secs = divmod(int(elapsed), 60)
1061
1073
  time_str = f"{mins}m {secs:02d}s" if mins > 0 else f"{secs}s"
1062
1074
 
1063
- row = [config.env_id, status, examples_str, rollouts_str, reward]
1075
+ row = [_eval_label(config), status, examples_str, rollouts_str, reward]
1064
1076
  if show_usage:
1065
1077
  row.extend([input_tokens or "-", output_tokens or "-"])
1066
1078
  if show_cost:
@@ -1079,6 +1091,10 @@ class EvalDisplay(BaseDisplay):
1079
1091
  text = Text()
1080
1092
  text.append("model: ", style="dim")
1081
1093
  text.append(config.model, style="bold")
1094
+ if config.name:
1095
+ text.append("\n")
1096
+ text.append("env: ", style="dim")
1097
+ text.append(config.env_id, style="bold")
1082
1098
  text.append("\n")
1083
1099
  text.append("endpoint: ", style="dim")
1084
1100
  text.append(self._format_client_target(config))
@@ -109,25 +109,35 @@ def _attach_metadata_cost(
109
109
  return cost
110
110
 
111
111
 
112
- def _with_metadata_cost(
112
+ def _attach_metadata_name(metadata: GenerateMetadata, name: str | None) -> bool:
113
+ if name is None:
114
+ return False
115
+
116
+ metadata["name"] = name
117
+ return True
118
+
119
+
120
+ def _with_eval_metadata(
113
121
  on_progress: ProgressCallback | list[ProgressCallback] | None,
114
122
  model_pricing: ModelPricing | None,
123
+ name: str | None,
115
124
  ) -> ProgressCallback | list[ProgressCallback] | None:
116
- if model_pricing is None:
125
+ if model_pricing is None and name is None:
117
126
  return on_progress
118
127
 
119
- def attach_cost(
128
+ def attach_metadata(
120
129
  all_outputs: list[RolloutOutput],
121
130
  new_outputs: list[RolloutOutput],
122
131
  metadata: GenerateMetadata,
123
132
  ) -> None:
133
+ _attach_metadata_name(metadata, name)
124
134
  _attach_metadata_cost(metadata, model_pricing, all_outputs)
125
135
 
126
136
  if on_progress is None:
127
- return [attach_cost]
137
+ return [attach_metadata]
128
138
 
129
139
  if isinstance(on_progress, list):
130
- callbacks: list[ProgressCallback] = [attach_cost]
140
+ callbacks: list[ProgressCallback] = [attach_metadata]
131
141
  callbacks.extend(cast(list[ProgressCallback], on_progress))
132
142
  return callbacks
133
143
 
@@ -136,7 +146,7 @@ def _with_metadata_cost(
136
146
  new_outputs: list[RolloutOutput],
137
147
  metadata: GenerateMetadata,
138
148
  ) -> None:
139
- attach_cost(all_outputs, new_outputs, metadata)
149
+ attach_metadata(all_outputs, new_outputs, metadata)
140
150
  on_progress(all_outputs, new_outputs, metadata)
141
151
 
142
152
  return wrapped_progress
@@ -526,6 +536,7 @@ def load_toml_config(
526
536
  valid_fields = {
527
537
  # environment
528
538
  "env_id",
539
+ "name",
529
540
  "args",
530
541
  "env_args",
531
542
  "taskset",
@@ -573,11 +584,12 @@ def load_toml_config(
573
584
 
574
585
  # validate global fields
575
586
  if global_defaults:
576
- invalid_global = set(global_defaults.keys()) - valid_fields
587
+ global_valid_fields = valid_fields - {"name"}
588
+ invalid_global = set(global_defaults.keys()) - global_valid_fields
577
589
  if invalid_global:
578
590
  raise ValueError(
579
591
  f"Invalid global field(s) {invalid_global}. "
580
- f"Valid fields are: {sorted(valid_fields)}"
592
+ f"Valid fields are: {sorted(global_valid_fields)}"
581
593
  )
582
594
 
583
595
  # merge global defaults with per-eval configs
@@ -856,7 +868,10 @@ def print_usage(results: GenerateOutputs):
856
868
  def print_results(results: GenerateOutputs, num_samples: int = 1):
857
869
  assert results["metadata"] is not None
858
870
  print("--- Evaluation ---")
859
- print(f"Environment: {results['metadata']['env_id']}")
871
+ env_id = results["metadata"]["env_id"]
872
+ name = results["metadata"].get("name")
873
+ env_label = f"{name} ({env_id})" if name and name != env_id else env_id
874
+ print(f"Environment: {env_label}")
860
875
  print(f"Model: {results['metadata']['model']}")
861
876
  print(f"Provider: {results['metadata']['base_url']}")
862
877
  print(f"Examples: {results['metadata']['num_examples']}")
@@ -932,7 +947,7 @@ async def run_evaluation(
932
947
 
933
948
  results_path = config.resume_path or get_eval_results_path(config)
934
949
  model_pricing = await _resolve_model_pricing(config)
935
- on_progress = _with_metadata_cost(on_progress, model_pricing)
950
+ on_progress = _with_eval_metadata(on_progress, model_pricing, config.name)
936
951
 
937
952
  try:
938
953
  if not config.disable_env_server:
@@ -1022,12 +1037,11 @@ async def run_evaluation(
1022
1037
  if not config.disable_env_server:
1023
1038
  await vf_env.stop_server()
1024
1039
 
1025
- if (
1026
- _attach_metadata_cost(outputs["metadata"], model_pricing, outputs["outputs"])
1027
- is not None
1028
- ):
1029
- if config.save_results:
1030
- await asyncio.to_thread(save_metadata, outputs["metadata"], results_path)
1040
+ metadata_changed = _attach_metadata_name(outputs["metadata"], config.name)
1041
+ if _attach_metadata_cost(outputs["metadata"], model_pricing, outputs["outputs"]):
1042
+ metadata_changed = True
1043
+ if metadata_changed and config.save_results:
1044
+ await asyncio.to_thread(save_metadata, outputs["metadata"], results_path)
1031
1045
 
1032
1046
  return outputs
1033
1047
 
@@ -51,7 +51,7 @@ def get_eval_results_path(config: EvalConfig) -> Path:
51
51
  base_path = _get_outputs_base_path(
52
52
  config.env_id, config.env_dir_path, config.output_dir
53
53
  )
54
- return get_results_path(config.env_id, config.model, base_path)
54
+ return get_results_path(config.name or config.env_id, config.model, base_path)
55
55
 
56
56
 
57
57
  def get_eval_runs_dir(
@@ -59,10 +59,11 @@ def get_eval_runs_dir(
59
59
  model: str,
60
60
  env_dir_path: str = "./environments",
61
61
  output_dir: str | None = None,
62
+ name: str | None = None,
62
63
  ) -> Path:
63
64
  """Return directory containing all eval run directories for env/model."""
64
65
  base_path = _get_outputs_base_path(env_id, env_dir_path, output_dir)
65
- env_model_str = f"{env_id}--{model.replace('/', '--')}"
66
+ env_model_str = f"{name or env_id}--{model.replace('/', '--')}"
66
67
  return base_path / "evals" / env_model_str
67
68
 
68
69
 
@@ -108,10 +109,15 @@ def find_latest_incomplete_eval_results_path(
108
109
  rollouts_per_example: int,
109
110
  env_dir_path: str = "./environments",
110
111
  output_dir: str | None = None,
112
+ name: str | None = None,
111
113
  ) -> Path | None:
112
114
  """Find the newest resumable, incomplete eval run for the provided config."""
113
115
  runs_dir = get_eval_runs_dir(
114
- env_id=env_id, model=model, env_dir_path=env_dir_path, output_dir=output_dir
116
+ env_id=env_id,
117
+ model=model,
118
+ env_dir_path=env_dir_path,
119
+ output_dir=output_dir,
120
+ name=name,
115
121
  )
116
122
  if not runs_dir.exists():
117
123
  return None
File without changes