verifiers 0.1.15.dev12__tar.gz → 0.1.15.dev14__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/PKG-INFO +1 -1
  2. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_eval_cli.py +112 -7
  3. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_config_extension.py +22 -0
  4. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_textarena_taskset.py +44 -0
  5. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/__init__.py +1 -1
  6. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/eval.py +142 -3
  7. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/types.py +9 -2
  8. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/env_utils.py +37 -23
  9. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/harness.py +20 -0
  10. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/taskset.py +13 -1
  11. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/.gitignore +0 -0
  12. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/LICENSE +0 -0
  13. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/README.md +0 -0
  14. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/pyproject.toml +0 -0
  15. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/AGENTS.md +0 -0
  16. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/README.md +0 -0
  17. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/__init__.py +0 -0
  18. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/conftest.py +0 -0
  19. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_browser_env.py +0 -0
  20. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_build_script.py +0 -0
  21. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_cli_agent_env.py +0 -0
  22. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_client_auth_errors.py +0 -0
  23. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_client_config.py +0 -0
  24. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_client_multimodal_types.py +0 -0
  25. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_composable_env.py +0 -0
  26. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_context_token_metrics.py +0 -0
  27. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_decorator_ranks.py +0 -0
  28. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_endpoint_registry.py +0 -0
  29. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_env_group.py +0 -0
  30. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_env_server.py +0 -0
  31. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_environment.py +0 -0
  32. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_environment_extra.py +0 -0
  33. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_envs.py +0 -0
  34. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_error_chain.py +0 -0
  35. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_eval_display.py +0 -0
  36. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_eval_utils.py +0 -0
  37. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_gepa_cli.py +0 -0
  38. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_gepa_utils.py +0 -0
  39. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_gym_env.py +0 -0
  40. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_harbor_env_mcp.py +0 -0
  41. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_imports.py +0 -0
  42. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_init_script.py +0 -0
  43. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_install_utils.py +0 -0
  44. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_interception_utils.py +0 -0
  45. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_langchain_deep_agents_wikispeedia.py +0 -0
  46. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_lean_task.py +0 -0
  47. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_logging.py +0 -0
  48. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_math_rubric.py +0 -0
  49. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_maybe_think_parser.py +0 -0
  50. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_mcp_search_env.py +0 -0
  51. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_message_utils.py +0 -0
  52. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_message_utils_multimodal.py +0 -0
  53. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_multiturn_env.py +0 -0
  54. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_nemorl_client.py +0 -0
  55. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_openai_chat_completions_token_client.py +0 -0
  56. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_openai_responses_client.py +0 -0
  57. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_opencode_harbor.py +0 -0
  58. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_opencode_rlm_env.py +0 -0
  59. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_openenv_client.py +0 -0
  60. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_parser.py +0 -0
  61. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_path_utils.py +0 -0
  62. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_per_turn_timing.py +0 -0
  63. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_pricing_utils.py +0 -0
  64. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_prime_plugin.py +0 -0
  65. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_renderer_client.py +0 -0
  66. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_renderer_e2e.py +0 -0
  67. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_rlm_composable_env.py +0 -0
  68. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_rlm_env.py +0 -0
  69. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_rubric.py +0 -0
  70. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_rubric_group.py +0 -0
  71. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_sandbox_env.py +0 -0
  72. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_sandbox_mixin.py +0 -0
  73. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_save_utils.py +0 -0
  74. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_setup_script.py +0 -0
  75. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_singleturn_env.py +0 -0
  76. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_stateful_tool_env.py +0 -0
  77. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_think_parser.py +0 -0
  78. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_tool_env.py +0 -0
  79. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_tool_utils.py +0 -0
  80. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_trajectory_processing.py +0 -0
  81. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_tui_info_formatting.py +0 -0
  82. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_types.py +0 -0
  83. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_bfcl.py +0 -0
  84. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_empty_completions.py +0 -0
  85. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_endpoint_protocols.py +0 -0
  86. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_example_counts.py +0 -0
  87. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_group_reward_env.py +0 -0
  88. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_harbor_cli.py +0 -0
  89. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_mini_swe_agent.py +0 -0
  90. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_nemo_gym_harness.py +0 -0
  91. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_openenv_taskset.py +0 -0
  92. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_openreward_taskset.py +0 -0
  93. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_rlm_swe.py +0 -0
  94. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_runtime_lifecycle.py +0 -0
  95. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_scoring_functions.py +0 -0
  96. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_v1_taskset_bindings.py +0 -0
  97. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_wiki_search_v1.py +0 -0
  98. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_wordle_env.py +0 -0
  99. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_wordle_v1_env.py +0 -0
  100. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/tests/test_xml_parser.py +0 -0
  101. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/AGENTS.md +0 -0
  102. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/__init__.py +0 -0
  103. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/commands/__init__.py +0 -0
  104. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/commands/build.py +0 -0
  105. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/commands/eval.py +0 -0
  106. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/commands/gepa.py +0 -0
  107. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/commands/init.py +0 -0
  108. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/commands/install.py +0 -0
  109. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/commands/setup.py +0 -0
  110. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/plugins/__init__.py +0 -0
  111. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/plugins/prime.py +0 -0
  112. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/cli/tui.py +0 -0
  113. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/__init__.py +0 -0
  114. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/anthropic_messages_client.py +0 -0
  115. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/client.py +0 -0
  116. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  117. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/openai_chat_completions_client.py +0 -0
  118. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  119. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/openai_completions_client.py +0 -0
  120. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/openai_responses_client.py +0 -0
  121. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/clients/renderer_client.py +0 -0
  122. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/decorators.py +0 -0
  123. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/AGENTS.md +0 -0
  124. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/__init__.py +0 -0
  125. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/env_group.py +0 -0
  126. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/environment.py +0 -0
  127. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/README.md +0 -0
  128. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/__init__.py +0 -0
  129. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/cli_agent_env.py +0 -0
  130. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/README.md +0 -0
  131. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/__init__.py +0 -0
  132. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/_filter.py +0 -0
  133. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/composable_env.py +0 -0
  134. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/harness.py +0 -0
  135. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -0
  136. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
  137. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  138. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  139. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -0
  140. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/swe_debug_env.py +0 -0
  141. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/task.py +0 -0
  142. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  143. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  144. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  145. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  146. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  147. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  148. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  149. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  150. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  151. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  152. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  153. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
  154. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  155. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  156. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
  157. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
  158. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
  159. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
  160. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
  161. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +0 -0
  162. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
  163. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +0 -0
  164. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  165. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/gym_env.py +0 -0
  166. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  167. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/harbor_env/env.py +0 -0
  168. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  169. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/mcp_env.py +0 -0
  170. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/opencode_env.py +0 -0
  171. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  172. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  173. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/rlm_env.py +0 -0
  174. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  175. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/utils/__init__.py +0 -0
  176. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  177. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/experimental/utils/git_checkout_cache.py +0 -0
  178. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/README.md +0 -0
  179. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/__init__.py +0 -0
  180. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/browser_env/README.md +0 -0
  181. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  182. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  183. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  184. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  185. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  186. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  187. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/openenv_env.py +0 -0
  188. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  189. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/integrations/textarena_env.py +0 -0
  190. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/multiturn_env.py +0 -0
  191. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/python_env.py +0 -0
  192. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/sandbox_env.py +0 -0
  193. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/singleturn_env.py +0 -0
  194. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/stateful_tool_env.py +0 -0
  195. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/envs/tool_env.py +0 -0
  196. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/errors.py +0 -0
  197. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/gepa/__init__.py +0 -0
  198. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/gepa/adapter.py +0 -0
  199. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/gepa/config.py +0 -0
  200. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/gepa/display.py +0 -0
  201. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/gepa/gepa_utils.py +0 -0
  202. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/parsers/__init__.py +0 -0
  203. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/parsers/maybe_think_parser.py +0 -0
  204. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/parsers/parser.py +0 -0
  205. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/parsers/think_parser.py +0 -0
  206. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/parsers/xml_parser.py +0 -0
  207. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/README.md +0 -0
  208. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/__init__.py +0 -0
  209. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/inference/__init__.py +0 -0
  210. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/inference/client.py +0 -0
  211. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/inference/server.py +0 -0
  212. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/trainer/__init__.py +0 -0
  213. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/trainer/config.py +0 -0
  214. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/trainer/orchestrator.py +0 -0
  215. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/trainer/trainer.py +0 -0
  216. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rl/trainer/utils.py +0 -0
  217. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rubrics/__init__.py +0 -0
  218. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  219. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rubrics/judge_rubric.py +0 -0
  220. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rubrics/math_rubric.py +0 -0
  221. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rubrics/rubric.py +0 -0
  222. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/rubrics/rubric_group.py +0 -0
  223. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/__init__.py +0 -0
  224. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/build.py +0 -0
  225. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/gepa.py +0 -0
  226. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/init.py +0 -0
  227. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/install.py +0 -0
  228. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/rl.py +0 -0
  229. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/setup.py +0 -0
  230. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/train.py +0 -0
  231. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/tui.py +0 -0
  232. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/scripts/vllm.py +0 -0
  233. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/__init__.py +0 -0
  234. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/client/env_client.py +0 -0
  235. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/client/zmq_env_client.py +0 -0
  236. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/server/__init__.py +0 -0
  237. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/server/env_router.py +0 -0
  238. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/server/env_server.py +0 -0
  239. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/server/env_worker.py +0 -0
  240. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/server/zmq_env_server.py +0 -0
  241. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/serve/types.py +0 -0
  242. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/__init__.py +0 -0
  243. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/async_utils.py +0 -0
  244. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/client_utils.py +0 -0
  245. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/config_utils.py +0 -0
  246. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/data_utils.py +0 -0
  247. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/display_utils.py +0 -0
  248. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/env_config_utils.py +0 -0
  249. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/error_utils.py +0 -0
  250. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/eval_display.py +0 -0
  251. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/eval_utils.py +0 -0
  252. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/heartbeat.py +0 -0
  253. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/import_utils.py +0 -0
  254. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/install_utils.py +0 -0
  255. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/interception_utils.py +0 -0
  256. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/logging_utils.py +0 -0
  257. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/message_utils.py +0 -0
  258. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/metric_utils.py +0 -0
  259. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/path_utils.py +0 -0
  260. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/pricing_utils.py +0 -0
  261. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/process_utils.py +0 -0
  262. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/response_utils.py +0 -0
  263. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/save_utils.py +0 -0
  264. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/serve_utils.py +0 -0
  265. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/thread_utils.py +0 -0
  266. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/threaded_sandbox_client.py +0 -0
  267. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/tool_utils.py +0 -0
  268. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/usage_utils.py +0 -0
  269. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/utils/version_utils.py +0 -0
  270. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/ENVIRONMENT_BEST_PRACTICES.md +0 -0
  271. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/README.md +0 -0
  272. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/RE_MIGRATION.md +0 -0
  273. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/__init__.py +0 -0
  274. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/artifact.py +0 -0
  275. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/config.py +0 -0
  276. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/env.py +0 -0
  277. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/model.py +0 -0
  278. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/program.py +0 -0
  279. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/runtime.py +0 -0
  280. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/runtime_handles.py +0 -0
  281. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/sandbox.py +0 -0
  282. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/state.py +0 -0
  283. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/task.py +0 -0
  284. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/toolset.py +0 -0
  285. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/types.py +0 -0
  286. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/user.py +0 -0
  287. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/__init__.py +0 -0
  288. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/binding_utils.py +0 -0
  289. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/config_callable_utils.py +0 -0
  290. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/config_utils.py +0 -0
  291. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/endpoint_utils.py +0 -0
  292. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/json_utils.py +0 -0
  293. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/judge_utils.py +0 -0
  294. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/lifecycle_utils.py +0 -0
  295. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/mcp_proxy_utils.py +0 -0
  296. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/mcp_utils.py +0 -0
  297. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/object_utils.py +0 -0
  298. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/program_utils.py +0 -0
  299. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/prompt_utils.py +0 -0
  300. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/runtime_owner_utils.py +0 -0
  301. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/runtime_registry.py +0 -0
  302. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/sandbox_program_utils.py +0 -0
  303. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/sandbox_python_utils.py +0 -0
  304. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/sandbox_utils.py +0 -0
  305. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/scoring_utils.py +0 -0
  306. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/serialization_utils.py +0 -0
  307. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/task_freeze_utils.py +0 -0
  308. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/taskset_utils.py +0 -0
  309. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/tool_utils.py +0 -0
  310. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/toolset_utils.py +0 -0
  311. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/trajectory_utils.py +0 -0
  312. {verifiers-0.1.15.dev12 → verifiers-0.1.15.dev14}/verifiers/v1/utils/usage_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.15.dev12
3
+ Version: 0.1.15.dev14
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -1,4 +1,3 @@
1
- import argparse
2
1
  import importlib
3
2
  import os
4
3
  import sys
@@ -11,7 +10,7 @@ import pytest
11
10
 
12
11
  import verifiers.scripts.eval as vf_eval
13
12
  import verifiers.utils.eval_utils
14
- from verifiers.types import EndpointConfig, GenerateOutputs
13
+ from verifiers.types import EndpointConfig, EvalConfig, GenerateOutputs
15
14
  from verifiers.utils.eval_utils import load_toml_config
16
15
  from verifiers.utils.path_utils import get_eval_results_path
17
16
  from verifiers.utils.save_utils import states_to_outputs
@@ -25,6 +24,24 @@ def endpoint(**values: object) -> EndpointConfig:
25
24
  return EndpointConfig.model_validate(values)
26
25
 
27
26
 
27
+ def test_eval_config_accepts_id_shorthand():
28
+ config = EvalConfig.model_validate(
29
+ {
30
+ "id": "env1",
31
+ "env_args": {},
32
+ "env_dir_path": "./environments",
33
+ "model": "openai/gpt-4.1-mini",
34
+ "client_config": {},
35
+ "sampling_args": {},
36
+ "num_examples": 1,
37
+ "rollouts_per_example": 1,
38
+ "max_concurrent": 1,
39
+ }
40
+ )
41
+
42
+ assert config.env_id == "env1"
43
+
44
+
28
45
  @pytest.fixture
29
46
  def run_cli(make_metadata, make_state, make_input):
30
47
  def _run_cli(
@@ -69,6 +86,7 @@ def run_cli(make_metadata, make_state, make_input):
69
86
  "save_to_hf_hub": False,
70
87
  "hf_hub_dataset_name": "",
71
88
  "extra_env_kwargs": {},
89
+ "env_config_overrides": [],
72
90
  "max_retries": 0,
73
91
  "fullscreen": False,
74
92
  "disable_tui": False,
@@ -80,11 +98,7 @@ def run_cli(make_metadata, make_state, make_input):
80
98
 
81
99
  captured: dict = {"sampling_args": None, "configs": []}
82
100
 
83
- monkeypatch.setattr(
84
- argparse.ArgumentParser,
85
- "parse_args",
86
- lambda self: args_namespace,
87
- )
101
+ monkeypatch.setattr(vf_eval, "parse_args", lambda argv=None: args_namespace)
88
102
  monkeypatch.setattr(vf_eval, "setup_logging", lambda *_, **__: None)
89
103
  if fail_on_load_endpoints:
90
104
  monkeypatch.setattr(vf_eval, "load_endpoints", fail_load_endpoints)
@@ -135,6 +149,97 @@ def test_cli_single_env_id(monkeypatch, run_cli):
135
149
  assert configs[0].env_id == "env1"
136
150
 
137
151
 
152
+ def test_parse_args_accepts_v1_env_config_overrides():
153
+ args = vf_eval.parse_args(
154
+ [
155
+ "env1",
156
+ "--taskset.id",
157
+ "my-id",
158
+ "--harness.id",
159
+ "my-harness",
160
+ "--harness.max-turns",
161
+ "4",
162
+ ]
163
+ )
164
+
165
+ assert args.env_config_overrides == [
166
+ "--taskset.id",
167
+ "my-id",
168
+ "--harness.id",
169
+ "my-harness",
170
+ "--harness.max-turns",
171
+ "4",
172
+ ]
173
+
174
+
175
+ def test_parse_args_rejects_unknown_eval_flags():
176
+ with pytest.raises(SystemExit):
177
+ vf_eval.parse_args(["env1", "--unknown-flag", "value"])
178
+
179
+
180
+ def test_cli_v1_env_config_overrides_preserve_env_args_config(
181
+ tmp_path: Path, monkeypatch, run_cli
182
+ ):
183
+ module_name = f"cli_override_env_{time.time_ns()}"
184
+ (tmp_path / f"{module_name}.py").write_text(
185
+ """
186
+ import verifiers as vf
187
+
188
+
189
+ class DemoTasksetConfig(vf.TasksetConfig):
190
+ count: int = 1
191
+ enabled: bool = True
192
+
193
+
194
+ class DemoHarnessConfig(vf.HarnessConfig):
195
+ label: str = "base"
196
+
197
+
198
+ def load_taskset(config: DemoTasksetConfig):
199
+ raise RuntimeError("not used")
200
+
201
+
202
+ def load_harness(config: DemoHarnessConfig):
203
+ raise RuntimeError("not used")
204
+
205
+
206
+ def load_environment(config: vf.EnvConfig):
207
+ raise RuntimeError("not used")
208
+ """,
209
+ encoding="utf-8",
210
+ )
211
+ monkeypatch.syspath_prepend(str(tmp_path))
212
+ importlib.invalidate_caches()
213
+
214
+ captured = run_cli(
215
+ monkeypatch,
216
+ {
217
+ "env_id_or_config": module_name,
218
+ "env_args": {"config": {"taskset": {"count": 2}}},
219
+ "env_config_overrides": [
220
+ "--taskset.id",
221
+ "override-id",
222
+ "--no-taskset.enabled",
223
+ "--harness.id",
224
+ "demo-harness",
225
+ "--harness.max-turns",
226
+ "4",
227
+ ],
228
+ },
229
+ )
230
+
231
+ assert captured["configs"][0].env_args == {
232
+ "config": {
233
+ "taskset": {
234
+ "taskset_id": "override-id",
235
+ "count": 2,
236
+ "enabled": False,
237
+ },
238
+ "harness": {"harness_id": "demo-harness", "max_turns": 4},
239
+ }
240
+ }
241
+
242
+
138
243
  def test_get_env_eval_defaults_for_package_module(tmp_path: Path, monkeypatch):
139
244
  module_name = f"pkg_env_{time.time_ns()}"
140
245
  env_id = module_name.replace("_", "-")
@@ -2027,6 +2027,28 @@ def test_env_config_tracks_prebuilt_children() -> None:
2027
2027
  assert env.config.harness.max_turns == 3
2028
2028
 
2029
2029
 
2030
+ def test_taskset_and_harness_configs_accept_id_shorthand() -> None:
2031
+ class CustomTasksetConfig(TasksetConfig):
2032
+ taskset_id: str | None = "default-taskset"
2033
+
2034
+ class CustomHarnessConfig(HarnessConfig):
2035
+ harness_id: str | None = "default-harness"
2036
+
2037
+ taskset_config = CustomTasksetConfig.model_validate({"id": "taskset-short"})
2038
+ harness_config = CustomHarnessConfig.model_validate({"id": "harness-short"})
2039
+
2040
+ assert taskset_config.taskset_id == "taskset-short"
2041
+ assert harness_config.harness_id == "harness-short"
2042
+ assert explicit_config_data(taskset_config) == {"taskset_id": "taskset-short"}
2043
+ assert explicit_config_data(harness_config) == {"harness_id": "harness-short"}
2044
+
2045
+ taskset = Taskset(config={"id": "taskset-short"})
2046
+ harness = Harness(config={"id": "harness-short"})
2047
+
2048
+ assert taskset.taskset_id == "taskset-short"
2049
+ assert harness.harness_id == "harness-short"
2050
+
2051
+
2030
2052
  def test_env_rejects_taskset_builders() -> None:
2031
2053
  def load_taskset() -> Taskset:
2032
2054
  return Taskset(config=TasksetConfig())
@@ -198,6 +198,50 @@ async def test_textarena_user_steps_env_and_stops_when_game_finishes(fake_textar
198
198
  assert state["stop_condition"] == "textarena_done"
199
199
 
200
200
 
201
+ @pytest.mark.asyncio
202
+ async def test_textarena_user_accepts_structured_assistant_content(fake_textarena):
203
+ _, fake_ta = fake_textarena
204
+ taskset = textarena.TextArenaTaskset(
205
+ config=textarena.TextArenaTasksetConfig(
206
+ game="FakeWordle-v0",
207
+ answer_state_key="secret_word",
208
+ num_train_examples=1,
209
+ num_eval_examples=0,
210
+ )
211
+ )
212
+ task = taskset.to_task(
213
+ vf.Task(
214
+ {
215
+ "example_id": 0,
216
+ "prompt": [],
217
+ "answer": "apple",
218
+ "textarena": {
219
+ "game": "FakeWordle-v0",
220
+ "answer_state_key": "secret_word",
221
+ },
222
+ }
223
+ )
224
+ )
225
+ state = vf.State.for_task(task)
226
+ state["completion"] = [
227
+ vf.AssistantMessage(
228
+ content=[
229
+ vf.TextContentPart(text="I will guess "),
230
+ vf.TextContentPart(text="<guess>[apple]</guess>."),
231
+ ]
232
+ )
233
+ ]
234
+
235
+ env = vf.Env(taskset=taskset, harness=vf.Harness(config=vf.HarnessConfig()))
236
+ state = await env.harness.setup_state(task, state)
237
+ messages = await env.harness.runtime.user_messages(task, state)
238
+ ta_env = fake_ta.envs[-1]
239
+
240
+ assert ta_env.guesses == ["[apple]"]
241
+ assert messages == [{"role": "user", "content": "Solved."}]
242
+ assert state["stop_condition"] == "textarena_done"
243
+
244
+
201
245
  @pytest.mark.asyncio
202
246
  async def test_textarena_user_returns_wordle_feedback_for_unfinished_game(
203
247
  fake_textarena,
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.15.dev12"
1
+ __version__ = "0.1.15.dev14"
2
2
 
3
3
  import importlib
4
4
  import os
@@ -10,12 +10,17 @@ os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
10
10
 
11
11
  import argparse
12
12
  import asyncio
13
+ import inspect
13
14
  import importlib.util
14
15
  import json
15
16
  import logging
16
17
  from pathlib import Path
17
18
  from typing import Any, cast
18
19
 
20
+ from pydantic import BaseModel, create_model
21
+ from pydantic_config import ConfigFileError
22
+ from pydantic_config import cli as parse_pydantic_config_cli
23
+
19
24
  from verifiers import setup_logging
20
25
  from verifiers.types import (
21
26
  ClientConfig,
@@ -34,8 +39,16 @@ from verifiers.utils.eval_utils import (
34
39
  run_evaluations,
35
40
  run_evaluations_tui,
36
41
  )
42
+ from verifiers.utils.env_utils import (
43
+ env_config_annotation,
44
+ env_config_child_types,
45
+ import_env_module,
46
+ load_env_config,
47
+ )
37
48
  from verifiers.utils.import_utils import load_toml
38
49
  from verifiers.utils.install_utils import check_hub_env_installed
50
+ from verifiers.v1.env import EnvConfig
51
+ from verifiers.v1.utils.config_utils import explicit_config_data
39
52
 
40
53
  logger = logging.getLogger(__name__)
41
54
 
@@ -46,6 +59,13 @@ DEFAULT_NUM_EXAMPLES = 5
46
59
  DEFAULT_ROLLOUTS_PER_EXAMPLE = 3
47
60
  DEFAULT_MAX_CONCURRENT = 32
48
61
  DEFAULT_CLIENT_TYPE = "openai_chat_completions"
62
+ ENV_CONFIG_OVERRIDE_FLAG_PREFIXES = (
63
+ "--taskset.",
64
+ "--harness.",
65
+ "--no-taskset.",
66
+ "--no-harness.",
67
+ )
68
+ ENV_CONFIG_OVERRIDE_GROUP_FLAGS = {"--taskset", "--harness"}
49
69
 
50
70
  # Provider shorthand configs: maps provider name to (base_url, api_key_var[, client_type])
51
71
  PROVIDER_CONFIGS: dict[str, dict[str, str]] = {
@@ -240,6 +260,111 @@ def get_env_eval_defaults(env_id: str) -> dict[str, Any]:
240
260
  return defaults
241
261
 
242
262
 
263
+ def is_env_config_override_flag(token: str) -> bool:
264
+ return token in ENV_CONFIG_OVERRIDE_GROUP_FLAGS or token.startswith(
265
+ ENV_CONFIG_OVERRIDE_FLAG_PREFIXES
266
+ )
267
+
268
+
269
+ def validate_env_config_override_args(
270
+ parser: argparse.ArgumentParser,
271
+ override_args: list[str],
272
+ ) -> None:
273
+ if not override_args:
274
+ return
275
+ if not is_env_config_override_flag(override_args[0]):
276
+ parser.error(f"unrecognized arguments: {' '.join(override_args)}")
277
+ invalid_flags = [
278
+ token
279
+ for token in override_args
280
+ if token.startswith("--") and not is_env_config_override_flag(token)
281
+ ]
282
+ if invalid_flags:
283
+ parser.error(f"unrecognized arguments: {' '.join(invalid_flags)}")
284
+
285
+
286
+ def merge_config_data(
287
+ base: dict[str, Any],
288
+ overrides: dict[str, Any],
289
+ ) -> dict[str, Any]:
290
+ merged = dict(base)
291
+ for key, value in overrides.items():
292
+ existing = merged.get(key)
293
+ if isinstance(existing, dict) and isinstance(value, dict):
294
+ merged[key] = merge_config_data(existing, value)
295
+ else:
296
+ merged[key] = value
297
+ return merged
298
+
299
+
300
+ def env_config_cli_type(
301
+ config_type: type[EnvConfig],
302
+ default_config: EnvConfig,
303
+ child_types: dict[str, type[BaseModel]],
304
+ ) -> type[EnvConfig]:
305
+ fields = {
306
+ field_name: (child_type, getattr(default_config, field_name))
307
+ for field_name, child_type in child_types.items()
308
+ }
309
+ create_env_model = cast(Any, create_model)
310
+ return cast(
311
+ type[EnvConfig],
312
+ create_env_model(
313
+ f"{config_type.__name__}CliOverrides",
314
+ __base__=config_type,
315
+ **fields,
316
+ ),
317
+ )
318
+
319
+
320
+ def apply_env_config_cli_overrides(
321
+ env_id: str,
322
+ env_args: dict[str, Any],
323
+ override_args: list[str],
324
+ ) -> dict[str, Any]:
325
+ if not override_args:
326
+ return dict(env_args)
327
+
328
+ module = import_env_module(env_id)
329
+ env_load_func = getattr(module, "load_environment", None)
330
+ if env_load_func is None:
331
+ raise ValueError(f"Environment '{env_id}' does not expose load_environment.")
332
+
333
+ sig = inspect.signature(env_load_func)
334
+ config_type = env_config_annotation(env_load_func, sig)
335
+ if config_type is None:
336
+ raise ValueError(
337
+ "Taskset/harness CLI overrides require a v1 loader shaped as "
338
+ "load_environment(config: vf.EnvConfig)."
339
+ )
340
+
341
+ merged_env_args = dict(env_args)
342
+ child_types = env_config_child_types(module, config_type)
343
+ base_config = load_env_config(
344
+ module,
345
+ config_type,
346
+ merged_env_args.get("config", {}),
347
+ child_types=child_types,
348
+ )
349
+ cli_type = env_config_cli_type(config_type, base_config, child_types)
350
+ try:
351
+ config = parse_pydantic_config_cli(
352
+ cli_type,
353
+ args=override_args,
354
+ default=base_config,
355
+ )
356
+ except ConfigFileError as exc:
357
+ raise ValueError(f"Invalid taskset/harness override: {exc}") from exc
358
+
359
+ base_config_data = explicit_config_data(merged_env_args.get("config", {}))
360
+ override_config_data = explicit_config_data(config)
361
+ merged_env_args["config"] = merge_config_data(
362
+ base_config_data,
363
+ override_config_data,
364
+ )
365
+ return merged_env_args
366
+
367
+
243
368
  def build_parser() -> argparse.ArgumentParser:
244
369
  parser = argparse.ArgumentParser()
245
370
  parser.add_argument(
@@ -502,8 +627,12 @@ def build_parser() -> argparse.ArgumentParser:
502
627
  def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
503
628
  parser = build_parser()
504
629
  if argv is None:
505
- return parser.parse_args()
506
- return parser.parse_args(argv)
630
+ args, env_config_overrides = parser.parse_known_args()
631
+ else:
632
+ args, env_config_overrides = parser.parse_known_args(argv)
633
+ validate_env_config_override_args(parser, env_config_overrides)
634
+ args.env_config_overrides = env_config_overrides
635
+ return args
507
636
 
508
637
 
509
638
  def main(argv: list[str] | None = None):
@@ -521,6 +650,10 @@ def main(argv: list[str] | None = None):
521
650
 
522
651
  # Build raw configs: both paths produce list[dict]
523
652
  if args.env_id_or_config.endswith(".toml"):
653
+ if args.env_config_overrides:
654
+ raise ValueError(
655
+ "Taskset/harness CLI overrides are only supported with a single environment id, not TOML config files."
656
+ )
524
657
  path = Path(args.env_id_or_config)
525
658
  if not path.is_file():
526
659
  raise FileNotFoundError(
@@ -800,6 +933,12 @@ def main(argv: list[str] | None = None):
800
933
  else:
801
934
  raise ValueError(f"Invalid value for --resume: {resume_arg!r}")
802
935
 
936
+ env_args = apply_env_config_cli_overrides(
937
+ env_id,
938
+ dict(raw.get("env_args", {})),
939
+ list(raw.get("env_config_overrides", [])),
940
+ )
941
+
803
942
  extra_env_kwargs = dict(raw.get("extra_env_kwargs", {}))
804
943
  if raw.get("timeout") is not None:
805
944
  extra_env_kwargs["timeout_seconds"] = raw["timeout"]
@@ -807,7 +946,7 @@ def main(argv: list[str] | None = None):
807
946
  return EvalConfig(
808
947
  env_id=env_id,
809
948
  name=name,
810
- env_args=raw.get("env_args", {}),
949
+ env_args=env_args,
811
950
  env_dir_path=raw.get("env_dir_path", DEFAULT_ENV_DIR_PATH),
812
951
  output_dir=raw.get("output_dir"),
813
952
  extra_env_kwargs=extra_env_kwargs,
@@ -17,7 +17,14 @@ from typing import (
17
17
  cast,
18
18
  )
19
19
 
20
- from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator
20
+ from pydantic import (
21
+ AliasChoices,
22
+ BaseModel,
23
+ ConfigDict,
24
+ Field,
25
+ computed_field,
26
+ field_validator,
27
+ )
21
28
 
22
29
  if TYPE_CHECKING:
23
30
  from anthropic.types import RedactedThinkingBlock
@@ -1307,7 +1314,7 @@ class EvalConfig(BaseModel):
1307
1314
  """Pydantic model for evaluation configuration."""
1308
1315
 
1309
1316
  # environment
1310
- env_id: str
1317
+ env_id: str = Field(validation_alias=AliasChoices("env_id", "id"))
1311
1318
  name: str | None = None
1312
1319
  env_args: dict
1313
1320
  env_dir_path: str
@@ -235,28 +235,14 @@ def load_env_config(
235
235
  module: ModuleType,
236
236
  config_type: type[EnvConfig],
237
237
  value: object,
238
+ *,
239
+ child_types: Mapping[str, type[BaseModel]] | None = None,
238
240
  ) -> EnvConfig:
239
- child_types: dict[str, type[BaseModel]] = {}
240
- for field_name, factory_name, base_type in (
241
- ("taskset", "load_taskset", TasksetConfig),
242
- ("harness", "load_harness", HarnessConfig),
243
- ):
244
- field_type = config_type_from_annotation(
245
- config_type.model_fields[field_name].annotation,
246
- base_type,
247
- f"{config_type.__name__}.{field_name}",
248
- )
249
- factory_type = factory_config_type(module, factory_name, base_type)
250
- if factory_type is not None:
251
- if not issubclass(factory_type, field_type):
252
- raise TypeError(
253
- f"{module.__name__}.{factory_name} config type "
254
- f"{factory_type.__name__} does not match "
255
- f"{config_type.__name__}.{field_name}: {field_type.__name__}."
256
- )
257
- child_types[field_name] = factory_type
258
- else:
259
- child_types[field_name] = field_type
241
+ resolved_child_types = (
242
+ env_config_child_types(module, config_type)
243
+ if child_types is None
244
+ else child_types
245
+ )
260
246
 
261
247
  data: dict[str, object]
262
248
  if isinstance(value, config_type):
@@ -271,7 +257,7 @@ def load_env_config(
271
257
  else:
272
258
  data = dict(explicit_config_data(value))
273
259
  defaults: EnvConfig | None = None
274
- for field_name, child_type in child_types.items():
260
+ for field_name, child_type in resolved_child_types.items():
275
261
  if field_name not in data:
276
262
  defaults = config_type() if defaults is None else defaults
277
263
  child = getattr(defaults, field_name)
@@ -284,7 +270,7 @@ def load_env_config(
284
270
  raise TypeError(f"config.{field_name} cannot be None.")
285
271
  data[field_name] = child_type.model_validate(explicit_config_data(child))
286
272
  config = config_type.model_validate(data)
287
- for field_name, child_type in child_types.items():
273
+ for field_name, child_type in resolved_child_types.items():
288
274
  child = getattr(config, field_name)
289
275
  if not isinstance(child, child_type):
290
276
  raise TypeError(
@@ -294,6 +280,34 @@ def load_env_config(
294
280
  return config
295
281
 
296
282
 
283
+ def env_config_child_types(
284
+ module: ModuleType,
285
+ config_type: type[EnvConfig],
286
+ ) -> dict[str, type[BaseModel]]:
287
+ child_types: dict[str, type[BaseModel]] = {}
288
+ for field_name, factory_name, base_type in (
289
+ ("taskset", "load_taskset", TasksetConfig),
290
+ ("harness", "load_harness", HarnessConfig),
291
+ ):
292
+ field_type = config_type_from_annotation(
293
+ config_type.model_fields[field_name].annotation,
294
+ base_type,
295
+ f"{config_type.__name__}.{field_name}",
296
+ )
297
+ factory_type = factory_config_type(module, factory_name, base_type)
298
+ if factory_type is not None:
299
+ if not issubclass(factory_type, field_type):
300
+ raise TypeError(
301
+ f"{module.__name__}.{factory_name} config type "
302
+ f"{factory_type.__name__} does not match "
303
+ f"{config_type.__name__}.{field_name}: {field_type.__name__}."
304
+ )
305
+ child_types[field_name] = factory_type
306
+ else:
307
+ child_types[field_name] = field_type
308
+ return child_types
309
+
310
+
297
311
  def factory_config_type(
298
312
  module: ModuleType,
299
313
  factory_name: str,
@@ -2,6 +2,8 @@ import asyncio
2
2
  from collections.abc import Awaitable, Callable
3
3
  from typing import TYPE_CHECKING, Generic, TypeAlias, TypeVar, cast, final
4
4
 
5
+ from pydantic import AliasChoices, Field
6
+
5
7
  import verifiers as vf
6
8
  from verifiers.clients.client import Client
7
9
  from verifiers.errors import Error, OverlongPromptError
@@ -97,6 +99,10 @@ ProgramRunner: TypeAlias = Callable[[Task, State], Awaitable[ProgramResult]]
97
99
 
98
100
  class HarnessConfig(LifecycleConfig):
99
101
  # Core fields configure harness-owned runtime behavior.
102
+ harness_id: str | None = Field(
103
+ default=None,
104
+ validation_alias=AliasChoices("harness_id", "id"),
105
+ )
100
106
  program: ProgramConfig = ProgramConfig()
101
107
  model: ModelConfig = ModelConfig()
102
108
  system_prompt: PromptInput | SystemPromptConfig | None = None
@@ -108,6 +114,14 @@ class HarnessConfig(LifecycleConfig):
108
114
  artifacts: ArtifactsConfig = ArtifactsConfig()
109
115
  max_turns: int = 10
110
116
 
117
+ @classmethod
118
+ def __pydantic_init_subclass__(cls, **kwargs: object) -> None:
119
+ super().__pydantic_init_subclass__(**kwargs)
120
+ field = cls.model_fields.get("harness_id")
121
+ if field is not None:
122
+ field.validation_alias = AliasChoices("harness_id", "id")
123
+ cls.model_rebuild(force=True)
124
+
111
125
 
112
126
  ConfigT = TypeVar("ConfigT", bound=HarnessConfig)
113
127
 
@@ -168,6 +182,12 @@ class Harness(RuntimeOwnerMixin[ConfigT], Generic[ConfigT]):
168
182
  self.config = cast(ConfigT, coerce_config(config_type, config))
169
183
  with config_ref_context(self.config):
170
184
  self.initialize_runtime_refresh()
185
+ resolved_harness_id = self.config.harness_id
186
+ if resolved_harness_id is not None and not isinstance(
187
+ resolved_harness_id, str
188
+ ):
189
+ raise TypeError("harness_id must be a string.")
190
+ self.harness_id = resolved_harness_id or type(self).__name__
171
191
  self.program_config = self.config.program.resolve()
172
192
  system_prompt_value = self.load_system_prompt(self.config)
173
193
  self.system_prompt = normalize_system_prompt(
@@ -3,6 +3,7 @@ from pathlib import Path
3
3
  from typing import Generic, TypeVar, cast, final
4
4
 
5
5
  from datasets import Dataset
6
+ from pydantic import AliasChoices, Field
6
7
 
7
8
  from .config import (
8
9
  ConfigSource,
@@ -43,13 +44,24 @@ from .types import (
43
44
 
44
45
  class TasksetConfig(LifecycleConfig):
45
46
  # Core fields configure taskset-owned loaders and runtime behavior.
46
- taskset_id: str | None = None
47
+ taskset_id: str | None = Field(
48
+ default=None,
49
+ validation_alias=AliasChoices("taskset_id", "id"),
50
+ )
47
51
  system_prompt: PromptInput | SystemPromptConfig | None = None
48
52
  user: UserConfig | None = None
49
53
  bindings: BindingsConfig = BindingsConfig()
50
54
  objects: ObjectsConfig = ObjectsConfig()
51
55
  artifacts: ArtifactsConfig = ArtifactsConfig()
52
56
 
57
+ @classmethod
58
+ def __pydantic_init_subclass__(cls, **kwargs: object) -> None:
59
+ super().__pydantic_init_subclass__(**kwargs)
60
+ field = cls.model_fields.get("taskset_id")
61
+ if field is not None:
62
+ field.validation_alias = AliasChoices("taskset_id", "id")
63
+ cls.model_rebuild(force=True)
64
+
53
65
 
54
66
  ConfigT = TypeVar("ConfigT", bound=TasksetConfig)
55
67