verifiers 0.1.13.dev6__tar.gz → 0.1.13.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/PKG-INFO +1 -1
  2. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/conftest.py +1 -2
  3. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_cli_agent_env.py +24 -11
  4. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_environment.py +1 -4
  5. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_environment_extra.py +2 -2
  6. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_eval_cli.py +51 -0
  7. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_math_rubric.py +85 -0
  8. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_multiturn_env.py +81 -0
  9. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_opencode_rlm_env.py +7 -9
  10. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rlm_composable_env.py +331 -38
  11. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rlm_env.py +71 -71
  12. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rubric_group.py +84 -0
  13. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/__init__.py +1 -1
  14. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/openai_chat_completions_token_client.py +14 -1
  15. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/environment.py +4 -6
  16. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/cli_agent_env.py +24 -35
  17. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/composable_env.py +101 -25
  18. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harness.py +33 -15
  19. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -2
  20. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/opencode.py +6 -0
  21. verifiers-0.1.13.dev8/verifiers/envs/experimental/composable/harnesses/rlm.py +281 -0
  22. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/task.py +40 -20
  23. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -1
  24. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -1
  25. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +3 -3
  26. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +3 -3
  27. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +3 -3
  28. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +11 -8
  29. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +2 -2
  30. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py +2 -2
  31. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py +2 -2
  32. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/opencode_rlm_env.py +2 -3
  33. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/rlm_env.py +3 -5
  34. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/sandbox_mixin.py +51 -1
  35. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/utils/git_checkout_cache.py +45 -2
  36. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/browser_env.py +3 -3
  37. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/base.py +2 -2
  38. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +1 -3
  39. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +1 -2
  40. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/openenv_env.py +2 -3
  41. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/textarena_env.py +7 -2
  42. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/multiturn_env.py +31 -11
  43. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/python_env.py +2 -3
  44. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/sandbox_env.py +2 -2
  45. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/math_rubric.py +2 -1
  46. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/rubric_group.py +13 -0
  47. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/eval.py +11 -1
  48. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/types.py +1 -0
  49. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/eval_utils.py +1 -0
  50. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/logging_utils.py +18 -0
  51. verifiers-0.1.13.dev6/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -186
  52. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/.gitignore +0 -0
  53. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/LICENSE +0 -0
  54. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/README.md +0 -0
  55. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/pyproject.toml +0 -0
  56. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/AGENTS.md +0 -0
  57. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/README.md +0 -0
  58. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/__init__.py +0 -0
  59. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_browser_env.py +0 -0
  60. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_build_script.py +0 -0
  61. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_client_auth_errors.py +0 -0
  62. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_client_config.py +0 -0
  63. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_client_multimodal_types.py +0 -0
  64. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_composable_env.py +0 -0
  65. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_context_token_metrics.py +0 -0
  66. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_decorator_ranks.py +0 -0
  67. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_endpoint_registry.py +0 -0
  68. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_env_group.py +0 -0
  69. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_env_server.py +0 -0
  70. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_envs.py +0 -0
  71. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_error_chain.py +0 -0
  72. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_eval_display.py +0 -0
  73. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_eval_utils.py +0 -0
  74. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_gepa_cli.py +0 -0
  75. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_gym_env.py +0 -0
  76. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_harbor_env_mcp.py +0 -0
  77. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_imports.py +0 -0
  78. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_install_utils.py +0 -0
  79. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_interception_utils.py +0 -0
  80. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_logging.py +0 -0
  81. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_maybe_think_parser.py +0 -0
  82. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_message_utils.py +0 -0
  83. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_message_utils_multimodal.py +0 -0
  84. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_nemorl_client.py +0 -0
  85. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_openai_chat_completions_token_client.py +0 -0
  86. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_opencode_harbor.py +0 -0
  87. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_parser.py +0 -0
  88. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_path_utils.py +0 -0
  89. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_prime_plugin.py +0 -0
  90. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_rubric.py +0 -0
  91. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_sandbox_env.py +0 -0
  92. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_sandbox_mixin.py +0 -0
  93. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_save_utils.py +0 -0
  94. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_setup_script.py +0 -0
  95. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_singleturn_env.py +0 -0
  96. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_stateful_tool_env.py +0 -0
  97. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_think_parser.py +0 -0
  98. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_tool_env.py +0 -0
  99. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_tool_utils.py +0 -0
  100. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_trajectory_processing.py +0 -0
  101. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_tui_info_formatting.py +0 -0
  102. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/tests/test_xml_parser.py +0 -0
  103. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/AGENTS.md +0 -0
  104. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/__init__.py +0 -0
  105. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/__init__.py +0 -0
  106. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/build.py +0 -0
  107. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/eval.py +0 -0
  108. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/gepa.py +0 -0
  109. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/init.py +0 -0
  110. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/install.py +0 -0
  111. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/commands/setup.py +0 -0
  112. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/plugins/__init__.py +0 -0
  113. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/plugins/prime.py +0 -0
  114. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/cli/tui.py +0 -0
  115. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/__init__.py +0 -0
  116. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/anthropic_messages_client.py +0 -0
  117. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/client.py +0 -0
  118. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  119. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/openai_chat_completions_client.py +0 -0
  120. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/clients/openai_completions_client.py +0 -0
  121. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/decorators.py +0 -0
  122. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/AGENTS.md +0 -0
  123. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/__init__.py +0 -0
  124. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/env_group.py +0 -0
  125. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/README.md +0 -0
  126. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/__init__.py +0 -0
  127. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/README.md +0 -0
  128. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/__init__.py +0 -0
  129. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/_filter.py +0 -0
  130. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/mini_swe_agent.py +0 -0
  131. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  132. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  133. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  134. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  135. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  136. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  137. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  138. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  139. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  140. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  141. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/_test_patch.py +0 -0
  142. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  143. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  144. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2_log_parsers.py +0 -0
  145. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  146. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/gym_env.py +0 -0
  147. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/harbor_env/__init__.py +0 -0
  148. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/harbor_env/env.py +0 -0
  149. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/harbor_env/mcp.py +0 -0
  150. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/mcp_env.py +0 -0
  151. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/opencode_env.py +0 -0
  152. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  153. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/utils/__init__.py +0 -0
  154. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/experimental/utils/file_locks.py +0 -0
  155. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/README.md +0 -0
  156. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/__init__.py +0 -0
  157. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/README.md +0 -0
  158. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  159. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  160. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  161. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/singleturn_env.py +0 -0
  162. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/stateful_tool_env.py +0 -0
  163. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/envs/tool_env.py +0 -0
  164. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/errors.py +0 -0
  165. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/__init__.py +0 -0
  166. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/adapter.py +0 -0
  167. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/config.py +0 -0
  168. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/display.py +0 -0
  169. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/gepa/gepa_utils.py +0 -0
  170. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/__init__.py +0 -0
  171. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/maybe_think_parser.py +0 -0
  172. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/parser.py +0 -0
  173. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/think_parser.py +0 -0
  174. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/parsers/xml_parser.py +0 -0
  175. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/README.md +0 -0
  176. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/__init__.py +0 -0
  177. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/inference/__init__.py +0 -0
  178. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/inference/client.py +0 -0
  179. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/inference/server.py +0 -0
  180. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/__init__.py +0 -0
  181. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/config.py +0 -0
  182. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/orchestrator.py +0 -0
  183. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/trainer.py +0 -0
  184. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rl/trainer/utils.py +0 -0
  185. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/__init__.py +0 -0
  186. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  187. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/judge_rubric.py +0 -0
  188. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/rubrics/rubric.py +0 -0
  189. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/__init__.py +0 -0
  190. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/build.py +0 -0
  191. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/gepa.py +0 -0
  192. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/init.py +0 -0
  193. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/install.py +0 -0
  194. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/prime_rl.py +0 -0
  195. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/rl.py +0 -0
  196. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/setup.py +0 -0
  197. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/train.py +0 -0
  198. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/tui.py +0 -0
  199. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/scripts/vllm.py +0 -0
  200. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/__init__.py +0 -0
  201. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/client/env_client.py +0 -0
  202. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/client/zmq_env_client.py +0 -0
  203. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/__init__.py +0 -0
  204. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/env_router.py +0 -0
  205. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/env_server.py +0 -0
  206. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/env_worker.py +0 -0
  207. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/server/zmq_env_server.py +0 -0
  208. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/serve/types.py +0 -0
  209. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/__init__.py +0 -0
  210. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/async_utils.py +0 -0
  211. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/client_utils.py +0 -0
  212. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/config_utils.py +0 -0
  213. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/data_utils.py +0 -0
  214. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/display_utils.py +0 -0
  215. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/env_utils.py +0 -0
  216. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/error_utils.py +0 -0
  217. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/eval_display.py +0 -0
  218. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/heartbeat.py +0 -0
  219. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/import_utils.py +0 -0
  220. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/install_utils.py +0 -0
  221. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/interception_utils.py +0 -0
  222. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/message_utils.py +0 -0
  223. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/metric_utils.py +0 -0
  224. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/path_utils.py +0 -0
  225. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/process_utils.py +0 -0
  226. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/response_utils.py +0 -0
  227. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/save_utils.py +0 -0
  228. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/serve_utils.py +0 -0
  229. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/thread_utils.py +0 -0
  230. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/threaded_sandbox_client.py +0 -0
  231. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/tool_utils.py +0 -0
  232. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/tunnel_utils.py +0 -0
  233. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/usage_utils.py +0 -0
  234. {verifiers-0.1.13.dev6 → verifiers-0.1.13.dev8}/verifiers/utils/version_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.13.dev6
3
+ Version: 0.1.13.dev8
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -425,10 +425,9 @@ class ExampleStatefulToolEnv(StatefulToolEnv):
425
425
  super().__init__(tools=[offset_tool], **kwargs)
426
426
 
427
427
  async def setup_state(self, state, **kwargs):
428
- state = await super().setup_state(state, **kwargs)
428
+ await super().setup_state(state, **kwargs)
429
429
  state["offset"] = 3
430
430
  state["update_calls"] = 0
431
- return state
432
431
 
433
432
  def update_tool_args(self, tool_name, tool_args, messages, state, **kwargs):
434
433
  state["update_calls"] += 1
@@ -62,7 +62,8 @@ class TestCliAgentEnv:
62
62
  assert env.run_command == "python agent.py"
63
63
  assert env.docker_image == "python:3.11-slim"
64
64
  assert env.interception_port == 8765
65
- assert env.timeout_seconds == 3600.0
65
+ assert env.timeout_seconds is None
66
+ assert env.sandbox_timeout_minutes is None
66
67
 
67
68
  def test_init_custom_config(self, sample_dataset):
68
69
  """Test initialization with custom configuration."""
@@ -130,22 +131,34 @@ class TestCliAgentEnv:
130
131
  state = {"agent_completed": True}
131
132
  assert await env.agent_completed(state) is True
132
133
 
133
- @pytest.mark.asyncio
134
- async def test_timeout_reached_stop_condition(self, sample_dataset):
135
- """Test the timeout_reached stop condition."""
134
+ @pytest.mark.parametrize(
135
+ "timeout_seconds,expected_minutes",
136
+ [
137
+ (None, 24 * 60), # no rollout cap → SDK ceiling
138
+ (600.0, 10 + 60), # finite → ceil + scoring buffer
139
+ (24 * 3600.0, 24 * 60), # buffer would overflow → clamped to ceiling
140
+ ],
141
+ )
142
+ def test_sandbox_timeout_auto_derived(
143
+ self, sample_dataset, timeout_seconds, expected_minutes
144
+ ):
136
145
  env = vf.CliAgentEnv(
137
146
  run_command="python agent.py",
138
147
  dataset=sample_dataset,
139
148
  rubric=vf.Rubric(),
140
- timeout_seconds=10.0,
149
+ timeout_seconds=timeout_seconds,
141
150
  )
142
- import time
151
+ assert env.get_sandbox_resources({})["timeout_minutes"] == expected_minutes
143
152
 
144
- state = {"timing": {"start_time": time.time()}}
145
- assert await env.timeout_reached(state) is False
146
-
147
- state = {"timing": {"start_time": time.time() - 20}}
148
- assert await env.timeout_reached(state) is True
153
+ def test_sandbox_timeout_explicit_override(self, sample_dataset):
154
+ env = vf.CliAgentEnv(
155
+ run_command="python agent.py",
156
+ dataset=sample_dataset,
157
+ rubric=vf.Rubric(),
158
+ timeout_seconds=600.0,
159
+ sandbox_timeout_minutes=30,
160
+ )
161
+ assert env.get_sandbox_resources({})["timeout_minutes"] == 30
149
162
 
150
163
  @pytest.mark.asyncio
151
164
  async def test_env_response_returns_empty(self, sample_dataset):
@@ -26,7 +26,6 @@ class SimpleEnvironment(Environment):
26
26
 
27
27
  async def setup_state(self, state):
28
28
  """Setup state for SimpleEnvironment."""
29
- return state
30
29
 
31
30
  async def rollout(
32
31
  self,
@@ -38,7 +37,7 @@ class SimpleEnvironment(Environment):
38
37
  """Simple test rollout implementation."""
39
38
  state = await self.init_state(input, client=client, model=model)
40
39
  try:
41
- state = await self.setup_state(state)
40
+ await self.setup_state(state)
42
41
 
43
42
  prompt_messages = state["prompt"]
44
43
  response = await self.get_model_response(state, prompt_messages)
@@ -551,8 +550,6 @@ class RetryCounterEnv(SimpleEnvironment):
551
550
  f"Simulated failure {self.call_counts[example_id]}/{self.fail_count}"
552
551
  )
553
552
 
554
- return state
555
-
556
553
 
557
554
  class TestMaybeRetry:
558
555
  """Test cases for maybe_retry functionality in Environment.generate()."""
@@ -40,7 +40,7 @@ from verifiers.utils.save_utils import state_to_output
40
40
  # Local simple concrete Environment for testing
41
41
  class DummyEnvironment(Environment):
42
42
  async def setup_state(self, state):
43
- return state
43
+ pass
44
44
 
45
45
  async def rollout(
46
46
  self,
@@ -52,7 +52,7 @@ class DummyEnvironment(Environment):
52
52
  state = await self.init_state(
53
53
  input, client=client, model=model, sampling_args=sampling_args
54
54
  )
55
- state = await self.setup_state(state)
55
+ await self.setup_state(state)
56
56
 
57
57
  prompt_messages = state["prompt"]
58
58
  response = await self.get_model_response(state=state, prompt=prompt_messages)
@@ -232,6 +232,36 @@ def test_cli_temperature_not_added_when_none(monkeypatch, run_cli):
232
232
  assert "temperature" not in sa
233
233
 
234
234
 
235
+ def test_cli_extra_env_kwargs_support_timeout_seconds(monkeypatch, run_cli):
236
+ captured = run_cli(
237
+ monkeypatch,
238
+ {
239
+ "extra_env_kwargs": {"timeout_seconds": 30, "foo": "bar"},
240
+ },
241
+ )
242
+
243
+ assert captured["configs"][0].extra_env_kwargs == {
244
+ "timeout_seconds": 30,
245
+ "foo": "bar",
246
+ }
247
+
248
+
249
+ def test_cli_timeout_flag_overrides_extra_env_kwargs(monkeypatch, run_cli):
250
+ """--timeout wins over timeout_seconds in --extra-env-kwargs."""
251
+ captured = run_cli(
252
+ monkeypatch,
253
+ {
254
+ "extra_env_kwargs": {"timeout_seconds": 30, "foo": "bar"},
255
+ "timeout": 600,
256
+ },
257
+ )
258
+
259
+ assert captured["configs"][0].extra_env_kwargs == {
260
+ "timeout_seconds": 600,
261
+ "foo": "bar",
262
+ }
263
+
264
+
235
265
  def test_cli_headers_table_and_list_merge(monkeypatch, run_cli):
236
266
  captured = run_cli(
237
267
  monkeypatch,
@@ -874,6 +904,27 @@ def test_load_toml_config_global_values_with_per_eval_override():
874
904
  assert result[1]["num_examples"] == 50 # per-eval override
875
905
 
876
906
 
907
+ def test_load_toml_config_with_extra_env_kwargs():
908
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
909
+ f.write(
910
+ '[[eval]]\nenv_id = "env1"\n[eval.extra_env_kwargs]\ntimeout_seconds = 600\n'
911
+ )
912
+ f.flush()
913
+ result = load_toml_config(Path(f.name))
914
+
915
+ assert result[0]["extra_env_kwargs"] == {"timeout_seconds": 600}
916
+
917
+
918
+ def test_load_toml_config_with_top_level_timeout():
919
+ """Top-level `timeout` is a recognized field on [[eval]] tables."""
920
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
921
+ f.write('[[eval]]\nenv_id = "env1"\ntimeout = 600\n')
922
+ f.flush()
923
+ result = load_toml_config(Path(f.name))
924
+
925
+ assert result[0]["timeout"] == 600
926
+
927
+
877
928
  def test_load_toml_config_invalid_global_field():
878
929
  """Invalid global field raises ValueError."""
879
930
  with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
@@ -1,8 +1,11 @@
1
1
  """Tests for the MathRubric class."""
2
2
 
3
+ import asyncio
4
+
3
5
  import pytest
4
6
 
5
7
  import verifiers as vf
8
+ from verifiers.rubrics import math_rubric
6
9
 
7
10
 
8
11
  class TestMathRubric:
@@ -127,3 +130,85 @@ class TestMathRubric:
127
130
  assert state["metrics"]["correct_answer"] == 1.0
128
131
  else:
129
132
  assert state["metrics"]["correct_answer"] == 0.0
133
+
134
+
135
+ class TestVerifyResponseExceptionHandling:
136
+ """Regression tests for the exception handling in verify_response.
137
+
138
+ See commit narrowing ``except BaseException`` to
139
+ ``except (Exception, MathVerifyTimeout)`` so that ``CancelledError``,
140
+ ``KeyboardInterrupt``, and ``SystemExit`` propagate instead of being
141
+ silently reported as a 0.0 score.
142
+ """
143
+
144
+ def test_cancellederror_propagates(self, monkeypatch):
145
+ """CancelledError raised during math_verify must propagate, not
146
+ get swallowed and reported as a score of 0.0."""
147
+
148
+ def raise_cancelled(*args, **kwargs):
149
+ raise asyncio.CancelledError
150
+
151
+ monkeypatch.setattr(math_rubric, "parse", raise_cancelled)
152
+
153
+ with pytest.raises(asyncio.CancelledError):
154
+ math_rubric.verify_response(
155
+ response="\\boxed{1}",
156
+ answer="1",
157
+ max_verify_chars=50_000,
158
+ timeout_seconds=5,
159
+ )
160
+
161
+ def test_keyboardinterrupt_propagates(self, monkeypatch):
162
+ """KeyboardInterrupt must propagate so Ctrl-C still works during
163
+ scoring."""
164
+
165
+ def raise_kbd(*args, **kwargs):
166
+ raise KeyboardInterrupt
167
+
168
+ monkeypatch.setattr(math_rubric, "parse", raise_kbd)
169
+
170
+ with pytest.raises(KeyboardInterrupt):
171
+ math_rubric.verify_response(
172
+ response="\\boxed{1}",
173
+ answer="1",
174
+ max_verify_chars=50_000,
175
+ timeout_seconds=5,
176
+ )
177
+
178
+ def test_math_verify_timeout_returns_zero(self, monkeypatch):
179
+ """A real math_verify.errors.TimeoutException (which inherits from
180
+ BaseException, not Exception) must still be caught and reported as
181
+ a 0.0 score — that's why the catch is wider than just Exception."""
182
+ from math_verify.errors import TimeoutException
183
+
184
+ def raise_timeout(*args, **kwargs):
185
+ raise TimeoutException("simulated math_verify timeout")
186
+
187
+ monkeypatch.setattr(math_rubric, "parse", raise_timeout)
188
+
189
+ score, elapsed = math_rubric.verify_response(
190
+ response="\\boxed{1}",
191
+ answer="1",
192
+ max_verify_chars=50_000,
193
+ timeout_seconds=5,
194
+ )
195
+ assert score == 0.0
196
+ assert elapsed >= 0.0
197
+
198
+ def test_regular_exception_returns_zero(self, monkeypatch):
199
+ """A regular Exception from math_verify should continue to be
200
+ swallowed and reported as 0.0 (library-raised something weird)."""
201
+
202
+ def raise_exc(*args, **kwargs):
203
+ raise ValueError("simulated parse failure")
204
+
205
+ monkeypatch.setattr(math_rubric, "parse", raise_exc)
206
+
207
+ score, elapsed = math_rubric.verify_response(
208
+ response="\\boxed{1}",
209
+ answer="1",
210
+ max_verify_chars=50_000,
211
+ timeout_seconds=5,
212
+ )
213
+ assert score == 0.0
214
+ assert elapsed >= 0.0
@@ -1,5 +1,7 @@
1
1
  """Tests for the MultiTurnEnv class."""
2
2
 
3
+ import asyncio
4
+
3
5
  import pytest
4
6
  from datasets import Dataset
5
7
 
@@ -12,6 +14,7 @@ class TestMultiTurnEnv:
12
14
  def test_multiturn_env_initialization(self, mock_multiturn_env):
13
15
  """Test MultiTurnEnv initialization."""
14
16
  assert mock_multiturn_env.max_turns == 3
17
+ assert mock_multiturn_env.timeout_seconds is None
15
18
  assert mock_multiturn_env.message_type == "chat" # Default from parent
16
19
 
17
20
  def test_multiturn_env_default_max_turns(self, mock_client, sample_chat_dataset):
@@ -26,6 +29,7 @@ class TestMultiTurnEnv:
26
29
  rubric=Rubric(),
27
30
  )
28
31
  assert env.max_turns == -1 # Default value
32
+ assert env.timeout_seconds is None
29
33
 
30
34
  @pytest.mark.asyncio
31
35
  async def test_basic_multiturn_rollout(self, mock_multiturn_env, make_input):
@@ -103,6 +107,83 @@ class TestMultiTurnEnv:
103
107
  assert completion[1]["role"] == "user"
104
108
  assert completion[2]["role"] == "assistant"
105
109
 
110
+ @pytest.mark.asyncio
111
+ async def test_timeout_seconds_limits_rollout(
112
+ self, mock_client, sample_chat_dataset, make_input
113
+ ):
114
+ """Test that rollout stops when the wall-clock timeout is reached."""
115
+
116
+ class SlowMultiTurnEnv(MultiTurnEnv):
117
+ async def env_response(self, messages, state, **kwargs): # type: ignore[override]
118
+ return [{"role": "user", "content": "Continue"}]
119
+
120
+ async def add_model_response(self, state, prompt_messages, response): # type: ignore[override]
121
+ await super().add_model_response(state, prompt_messages, response)
122
+ await asyncio.sleep(0.05)
123
+
124
+ env = SlowMultiTurnEnv(
125
+ client=mock_client,
126
+ model="test-model",
127
+ dataset=sample_chat_dataset,
128
+ parser=Parser(),
129
+ rubric=Rubric(),
130
+ timeout_seconds=0.01,
131
+ )
132
+ mock_client.set_default_response("Still going")
133
+
134
+ prompt = [{"role": "user", "content": "Start conversation"}]
135
+ state = await env.rollout(
136
+ input=make_input(prompt=prompt, answer="target_answer"),
137
+ client=mock_client,
138
+ model="test-model",
139
+ )
140
+
141
+ assert len(state["trajectory"]) == 1
142
+ assert state["timed_out"] is True
143
+ assert state["is_completed"] is True
144
+ assert state["stop_condition"] == "timeout_reached"
145
+ completion = state["completion"]
146
+ assert len(completion) == 1
147
+ assert completion[0]["role"] == "assistant"
148
+ assert completion[0]["content"] == "Still going"
149
+
150
+ @pytest.mark.asyncio
151
+ async def test_timeout_seconds_limits_setup(
152
+ self, mock_client, sample_chat_dataset, make_input
153
+ ):
154
+ """Test that the rollout timeout applies while setup is in flight."""
155
+
156
+ class SlowSetupEnv(MultiTurnEnv):
157
+ async def setup_state(self, state): # type: ignore[override]
158
+ await asyncio.sleep(1)
159
+
160
+ async def env_response(self, messages, state, **kwargs): # type: ignore[override]
161
+ return [{"role": "user", "content": "Continue"}]
162
+
163
+ env = SlowSetupEnv(
164
+ client=mock_client,
165
+ model="test-model",
166
+ dataset=sample_chat_dataset,
167
+ parser=Parser(),
168
+ rubric=Rubric(),
169
+ timeout_seconds=0.01,
170
+ )
171
+
172
+ state = await env.rollout(
173
+ input=make_input(
174
+ prompt=[{"role": "user", "content": "Start conversation"}],
175
+ answer="target_answer",
176
+ ),
177
+ client=mock_client,
178
+ model="test-model",
179
+ )
180
+
181
+ assert state["timed_out"] is True
182
+ assert state["is_completed"] is True
183
+ assert state["stop_condition"] == "timeout_reached"
184
+ assert state["trajectory"] == []
185
+ assert state["completion"] == []
186
+
106
187
  @pytest.mark.asyncio
107
188
  async def test_override_is_completed_respects_max_turns(
108
189
  self, mock_client, sample_chat_dataset, make_input
@@ -289,13 +289,12 @@ class TestSetupState:
289
289
  OpenCodeRLMEnv.__bases__[0],
290
290
  "setup_state",
291
291
  new_callable=AsyncMock,
292
- return_value=state,
293
292
  ):
294
- result = await env.setup_state(state)
295
- assert result["sub_llm_turns"] == 0
296
- assert result["sub_llm_prompt_tokens"] == 0
297
- assert result["sub_llm_completion_tokens"] == 0
298
- assert result["_sub_llm_tasks"] == set()
293
+ await env.setup_state(state)
294
+ assert state["sub_llm_turns"] == 0
295
+ assert state["sub_llm_prompt_tokens"] == 0
296
+ assert state["sub_llm_completion_tokens"] == 0
297
+ assert state["_sub_llm_tasks"] == set()
299
298
 
300
299
  @pytest.mark.asyncio
301
300
  async def test_preserves_existing_sub_metrics(self):
@@ -305,10 +304,9 @@ class TestSetupState:
305
304
  OpenCodeRLMEnv.__bases__[0],
306
305
  "setup_state",
307
306
  new_callable=AsyncMock,
308
- return_value=state,
309
307
  ):
310
- result = await env.setup_state(state)
311
- assert result["sub_llm_turns"] == 3
308
+ await env.setup_state(state)
309
+ assert state["sub_llm_turns"] == 3
312
310
 
313
311
 
314
312
  # =============================================================================