verifiers 0.1.13.dev1__tar.gz → 0.1.13.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/PKG-INFO +1 -1
  2. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_composable_env.py +77 -0
  3. verifiers-0.1.13.dev3/tests/test_context_token_metrics.py +200 -0
  4. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_environment_extra.py +3 -1
  5. verifiers-0.1.13.dev3/tests/test_interception_utils.py +133 -0
  6. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_rlm_composable_env.py +146 -10
  7. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/__init__.py +1 -1
  8. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/environment.py +4 -6
  9. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/cli_agent_env.py +16 -10
  10. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/composable_env.py +24 -3
  11. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/harness.py +17 -1
  12. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -2
  13. verifiers-0.1.13.dev3/verifiers/envs/experimental/composable/harnesses/rlm.py +290 -0
  14. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/stateful_tool_env.py +2 -2
  15. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/tool_env.py +11 -11
  16. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/tui.py +283 -134
  17. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/types.py +2 -0
  18. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/eval_display.py +28 -13
  19. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/eval_utils.py +31 -12
  20. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/interception_utils.py +34 -1
  21. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/metric_utils.py +27 -11
  22. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/save_utils.py +29 -5
  23. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/usage_utils.py +52 -0
  24. verifiers-0.1.13.dev1/tests/test_interception_utils.py +0 -63
  25. verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -82
  26. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/.gitignore +0 -0
  27. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/LICENSE +0 -0
  28. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/README.md +0 -0
  29. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/pyproject.toml +0 -0
  30. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/AGENTS.md +0 -0
  31. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/README.md +0 -0
  32. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/__init__.py +0 -0
  33. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/conftest.py +0 -0
  34. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_browser_env.py +0 -0
  35. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_build_script.py +0 -0
  36. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_cli_agent_env.py +0 -0
  37. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_client_auth_errors.py +0 -0
  38. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_client_config.py +0 -0
  39. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_client_multimodal_types.py +0 -0
  40. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_decorator_ranks.py +0 -0
  41. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_endpoint_registry.py +0 -0
  42. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_env_group.py +0 -0
  43. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_env_server.py +0 -0
  44. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_environment.py +0 -0
  45. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_envs.py +0 -0
  46. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_error_chain.py +0 -0
  47. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_eval_cli.py +0 -0
  48. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_eval_display.py +0 -0
  49. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_eval_utils.py +0 -0
  50. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_gepa_cli.py +0 -0
  51. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_gym_env.py +0 -0
  52. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_imports.py +0 -0
  53. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_install_utils.py +0 -0
  54. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_logging.py +0 -0
  55. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_math_rubric.py +0 -0
  56. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_maybe_think_parser.py +0 -0
  57. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_message_utils.py +0 -0
  58. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_message_utils_multimodal.py +0 -0
  59. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_multiturn_env.py +0 -0
  60. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_nemorl_client.py +0 -0
  61. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_openai_chat_completions_token_client.py +0 -0
  62. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_opencode_harbor.py +0 -0
  63. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_opencode_rlm_env.py +0 -0
  64. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_parser.py +0 -0
  65. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_path_utils.py +0 -0
  66. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_prime_plugin.py +0 -0
  67. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_rlm_env.py +0 -0
  68. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_rubric.py +0 -0
  69. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_rubric_group.py +0 -0
  70. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_sandbox_env.py +0 -0
  71. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_sandbox_mixin.py +0 -0
  72. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_save_utils.py +0 -0
  73. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_setup_script.py +0 -0
  74. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_singleturn_env.py +0 -0
  75. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_stateful_tool_env.py +0 -0
  76. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_think_parser.py +0 -0
  77. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_tool_env.py +0 -0
  78. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_tool_utils.py +0 -0
  79. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_trajectory_processing.py +0 -0
  80. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_tui_info_formatting.py +0 -0
  81. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/tests/test_xml_parser.py +0 -0
  82. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/AGENTS.md +0 -0
  83. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/__init__.py +0 -0
  84. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/commands/__init__.py +0 -0
  85. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/commands/build.py +0 -0
  86. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/commands/eval.py +0 -0
  87. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/commands/gepa.py +0 -0
  88. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/commands/init.py +0 -0
  89. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/commands/install.py +0 -0
  90. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/commands/setup.py +0 -0
  91. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/plugins/__init__.py +0 -0
  92. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/plugins/prime.py +0 -0
  93. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/cli/tui.py +0 -0
  94. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/clients/__init__.py +0 -0
  95. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/clients/anthropic_messages_client.py +0 -0
  96. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/clients/client.py +0 -0
  97. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  98. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/clients/openai_chat_completions_client.py +0 -0
  99. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  100. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/clients/openai_completions_client.py +0 -0
  101. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/decorators.py +0 -0
  102. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/AGENTS.md +0 -0
  103. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/__init__.py +0 -0
  104. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/env_group.py +0 -0
  105. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/README.md +0 -0
  106. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/__init__.py +0 -0
  107. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/README.md +0 -0
  108. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/__init__.py +0 -0
  109. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  110. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  111. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/task.py +0 -0
  112. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  113. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  114. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  115. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  116. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  117. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  118. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  119. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  120. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  121. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  122. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  123. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  124. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  125. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
  126. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
  127. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
  128. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
  129. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
  130. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  131. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/gym_env.py +0 -0
  132. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/harbor_env.py +0 -0
  133. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/mcp_env.py +0 -0
  134. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/opencode_env.py +0 -0
  135. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  136. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  137. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/rlm_env.py +0 -0
  138. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  139. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/README.md +0 -0
  140. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/__init__.py +0 -0
  141. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/browser_env/README.md +0 -0
  142. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  143. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  144. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  145. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  146. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  147. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  148. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/openenv_env.py +0 -0
  149. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  150. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/integrations/textarena_env.py +0 -0
  151. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/multiturn_env.py +0 -0
  152. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/python_env.py +0 -0
  153. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/sandbox_env.py +0 -0
  154. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/envs/singleturn_env.py +0 -0
  155. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/errors.py +0 -0
  156. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/gepa/__init__.py +0 -0
  157. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/gepa/adapter.py +0 -0
  158. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/gepa/config.py +0 -0
  159. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/gepa/display.py +0 -0
  160. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/gepa/gepa_utils.py +0 -0
  161. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/parsers/__init__.py +0 -0
  162. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/parsers/maybe_think_parser.py +0 -0
  163. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/parsers/parser.py +0 -0
  164. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/parsers/think_parser.py +0 -0
  165. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/parsers/xml_parser.py +0 -0
  166. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/README.md +0 -0
  167. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/__init__.py +0 -0
  168. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/inference/__init__.py +0 -0
  169. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/inference/client.py +0 -0
  170. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/inference/server.py +0 -0
  171. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/trainer/__init__.py +0 -0
  172. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/trainer/config.py +0 -0
  173. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/trainer/orchestrator.py +0 -0
  174. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/trainer/trainer.py +0 -0
  175. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rl/trainer/utils.py +0 -0
  176. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rubrics/__init__.py +0 -0
  177. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  178. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rubrics/judge_rubric.py +0 -0
  179. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rubrics/math_rubric.py +0 -0
  180. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rubrics/rubric.py +0 -0
  181. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/rubrics/rubric_group.py +0 -0
  182. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/__init__.py +0 -0
  183. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/build.py +0 -0
  184. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/eval.py +0 -0
  185. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/gepa.py +0 -0
  186. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/init.py +0 -0
  187. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/install.py +0 -0
  188. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/prime_rl.py +0 -0
  189. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/rl.py +0 -0
  190. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/setup.py +0 -0
  191. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/train.py +0 -0
  192. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/scripts/vllm.py +0 -0
  193. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/__init__.py +0 -0
  194. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/client/env_client.py +0 -0
  195. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/client/zmq_env_client.py +0 -0
  196. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/server/__init__.py +0 -0
  197. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/server/env_router.py +0 -0
  198. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/server/env_server.py +0 -0
  199. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/server/env_worker.py +0 -0
  200. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/server/zmq_env_server.py +0 -0
  201. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/serve/types.py +0 -0
  202. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/__init__.py +0 -0
  203. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/async_utils.py +0 -0
  204. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/client_utils.py +0 -0
  205. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/config_utils.py +0 -0
  206. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/data_utils.py +0 -0
  207. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/display_utils.py +0 -0
  208. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/env_utils.py +0 -0
  209. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/error_utils.py +0 -0
  210. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/heartbeat.py +0 -0
  211. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/import_utils.py +0 -0
  212. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/install_utils.py +0 -0
  213. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/logging_utils.py +0 -0
  214. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/message_utils.py +0 -0
  215. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/path_utils.py +0 -0
  216. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/process_utils.py +0 -0
  217. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/response_utils.py +0 -0
  218. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/serve_utils.py +0 -0
  219. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/thread_utils.py +0 -0
  220. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/threaded_sandbox_client.py +0 -0
  221. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/tool_utils.py +0 -0
  222. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/tunnel_utils.py +0 -0
  223. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev3}/verifiers/utils/version_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.13.dev1
3
+ Version: 0.1.13.dev3
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -429,6 +429,83 @@ async def test_composable_env_no_upload_when_no_dirs(tmp_path, monkeypatch):
429
429
  assert env.upload_file.await_count == 0
430
430
 
431
431
 
432
+ @pytest.mark.asyncio
433
+ async def test_composable_env_uploads_harness_dirs(tmp_path):
434
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
435
+ harness_dir = tmp_path / "agent-src"
436
+ harness_dir.mkdir()
437
+ (harness_dir / "marker.txt").write_text("agent\n")
438
+
439
+ env = ComposableEnv(
440
+ taskset=taskset,
441
+ harness=Harness(
442
+ run_command="true",
443
+ install_script="install-agent",
444
+ get_upload_dirs=lambda: {"agent_src": harness_dir},
445
+ upload_dir_mapping={"agent_src": "/tmp/agent-src"},
446
+ ),
447
+ )
448
+ env.sandbox_client = SimpleNamespace(
449
+ execute_command=AsyncMock(
450
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
451
+ ),
452
+ teardown=lambda: None,
453
+ )
454
+ env.taskset.setup = AsyncMock()
455
+ env.upload_content = AsyncMock()
456
+ env.upload_file = AsyncMock()
457
+
458
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
459
+
460
+ env.upload_file.assert_awaited_once()
461
+ upload_call = env.upload_file.await_args
462
+ assert upload_call.args[0] == "sbx"
463
+ assert upload_call.args[1] == "/tmp/_upload_tmp_agent-src.tar.gz"
464
+
465
+ extract_call = env.sandbox_client.execute_command.await_args_list[1]
466
+ assert extract_call == call(
467
+ "sbx",
468
+ "mkdir -p /tmp && tar -xzf /tmp/_upload_tmp_agent-src.tar.gz -C / && rm -f /tmp/_upload_tmp_agent-src.tar.gz",
469
+ timeout=60,
470
+ )
471
+
472
+
473
+ @pytest.mark.asyncio
474
+ async def test_composable_env_rejects_duplicate_task_and_harness_upload_names(
475
+ tmp_path, monkeypatch
476
+ ):
477
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=True)
478
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
479
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
480
+ harness_dir = tmp_path / "skills"
481
+ harness_dir.mkdir()
482
+
483
+ env = ComposableEnv(
484
+ taskset=taskset,
485
+ harness=Harness(
486
+ run_command="true",
487
+ install_script="install-agent",
488
+ get_upload_dirs=lambda: {"skills": harness_dir},
489
+ skills_path="/task/skills",
490
+ ),
491
+ )
492
+ env.sandbox_client = SimpleNamespace(
493
+ execute_command=AsyncMock(
494
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
495
+ ),
496
+ teardown=lambda: None,
497
+ )
498
+ env.taskset.setup = AsyncMock()
499
+ env.upload_content = AsyncMock()
500
+ env.upload_file = AsyncMock()
501
+
502
+ with pytest.raises(
503
+ ValueError,
504
+ match="Upload directory names must be unique across task and harness",
505
+ ):
506
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
507
+
508
+
432
509
  # ── discover_sibling_dir ─────────────────────────────────────────────────
433
510
 
434
511
 
@@ -0,0 +1,200 @@
1
+ """Tests for per-turn context token metrics.
2
+
3
+ Tests the trajectory-based context token computation
4
+ (final_input_tokens, final_output_tokens) which assumes a linear rollout
5
+ using the last trajectory step.
6
+ """
7
+
8
+ from unittest.mock import MagicMock
9
+
10
+ import pytest
11
+
12
+ from verifiers.utils.usage_utils import compute_context_token_metrics
13
+
14
+
15
+ # =========================================================================
16
+ # Helpers
17
+ # =========================================================================
18
+
19
+ SYS = {"role": "system", "content": "You are helpful"}
20
+ USER = {"role": "user", "content": "hi"}
21
+
22
+
23
+ def _make_response(prompt_tokens: int, completion_tokens: int) -> MagicMock:
24
+ response = MagicMock()
25
+ response.usage = MagicMock(
26
+ prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
27
+ )
28
+ return response
29
+
30
+
31
+ def _asst(i: int) -> dict:
32
+ return {"role": "assistant", "content": f"response {i}"}
33
+
34
+
35
+ # =========================================================================
36
+ # compute_context_token_metrics
37
+ # =========================================================================
38
+
39
+
40
+ class TestContextMetrics:
41
+ def test_empty_trajectory(self):
42
+ metrics = compute_context_token_metrics([])
43
+ assert metrics["final_output_tokens"] == 0
44
+ assert metrics["final_input_tokens"] == 0
45
+
46
+ def test_single_turn(self):
47
+ trajectory = [
48
+ {
49
+ "prompt": [SYS, USER],
50
+ "completion": [_asst(0)],
51
+ "response": _make_response(100, 20),
52
+ },
53
+ ]
54
+ metrics = compute_context_token_metrics(trajectory)
55
+ assert metrics["final_output_tokens"] == 20
56
+ assert metrics["final_input_tokens"] == 100
57
+
58
+ def test_multi_turn(self):
59
+ trajectory = [
60
+ {
61
+ "response": _make_response(100, 20),
62
+ },
63
+ {
64
+ "response": _make_response(150, 25),
65
+ },
66
+ {
67
+ "response": _make_response(200, 30),
68
+ },
69
+ ]
70
+ metrics = compute_context_token_metrics(trajectory)
71
+ # Last step total = 200 + 30 = 230
72
+ # Sum of completion tokens = 20 + 25 + 30 = 75
73
+ assert metrics["final_output_tokens"] == 75
74
+ assert metrics["final_input_tokens"] == 230 - 75
75
+
76
+ def test_invariant_total_equals_last_step(self):
77
+ trajectory = [
78
+ {"response": _make_response(100, 20)},
79
+ {"response": _make_response(150, 25)},
80
+ {"response": _make_response(200, 30)},
81
+ ]
82
+ metrics = compute_context_token_metrics(trajectory)
83
+ total = metrics["final_output_tokens"] + metrics["final_input_tokens"]
84
+ # Total should equal last step's prompt_tokens + completion_tokens
85
+ assert total == 200 + 30
86
+
87
+ def test_no_response_on_any_step(self):
88
+ trajectory = [{"response": None}]
89
+ metrics = compute_context_token_metrics(trajectory)
90
+ assert metrics["final_output_tokens"] == 0
91
+ assert metrics["final_input_tokens"] == 0
92
+
93
+ def test_last_step_used_not_largest(self):
94
+ """Even if an earlier step has a larger context, we use the last step."""
95
+ trajectory = [
96
+ {"response": _make_response(500, 100)}, # larger context
97
+ {"response": _make_response(100, 20)}, # last step, smaller
98
+ ]
99
+ metrics = compute_context_token_metrics(trajectory)
100
+ # Last step total = 120, sum completions = 100 + 20 = 120
101
+ assert metrics["final_output_tokens"] == 120
102
+ assert metrics["final_input_tokens"] == 0 # clamped to 0
103
+
104
+ def test_skips_none_responses_for_last_step(self):
105
+ """Last step with response=None is skipped; uses previous step."""
106
+ trajectory = [
107
+ {"response": _make_response(100, 20)},
108
+ {"response": _make_response(200, 30)},
109
+ {"response": None},
110
+ ]
111
+ metrics = compute_context_token_metrics(trajectory)
112
+ # Last step with response is step 1: total = 230
113
+ # Sum completions from all steps with responses: 20 + 30 = 50
114
+ assert metrics["final_output_tokens"] == 50
115
+ assert metrics["final_input_tokens"] == 230 - 50
116
+
117
+ def test_skips_responses_without_usage(self):
118
+ """Responses with no .usage attribute are skipped entirely."""
119
+ no_usage = MagicMock()
120
+ no_usage.usage = None
121
+ trajectory = [
122
+ {"response": _make_response(100, 20)},
123
+ {"response": _make_response(200, 30)},
124
+ {"response": no_usage}, # last step, but no usage
125
+ ]
126
+ metrics = compute_context_token_metrics(trajectory)
127
+ # Should use step 1 (last with usage): total = 230
128
+ assert metrics["final_output_tokens"] == 50
129
+ assert metrics["final_input_tokens"] == 230 - 50
130
+
131
+ def test_all_responses_lack_usage(self):
132
+ """If no response has usage data, return zeros."""
133
+ no_usage = MagicMock()
134
+ no_usage.usage = None
135
+ trajectory = [
136
+ {"response": no_usage},
137
+ {"response": no_usage},
138
+ ]
139
+ metrics = compute_context_token_metrics(trajectory)
140
+ assert metrics["final_output_tokens"] == 0
141
+ assert metrics["final_input_tokens"] == 0
142
+
143
+ def test_final_input_tokens_clamped_to_zero(self):
144
+ """If sum of completions exceeds last step total, input is clamped to 0."""
145
+ trajectory = [
146
+ {"response": _make_response(10, 500)}, # huge completion
147
+ {"response": _make_response(50, 10)},
148
+ ]
149
+ metrics = compute_context_token_metrics(trajectory)
150
+ # Last step total = 60, sum completions = 510
151
+ assert metrics["final_output_tokens"] == 510
152
+ assert metrics["final_input_tokens"] == 0
153
+
154
+
155
+ # =========================================================================
156
+ # Metric classes
157
+ # =========================================================================
158
+
159
+
160
+ class TestContextTokenMetricClasses:
161
+ def test_input_tokens_metric(self):
162
+ from verifiers.utils.metric_utils import InputTokensMetric
163
+
164
+ m = InputTokensMetric()
165
+ m.add_output({"token_usage": {"input_tokens": 100.0}})
166
+ m.add_output({"token_usage": {"input_tokens": 200.0}})
167
+ assert m.compute() == pytest.approx(150.0)
168
+
169
+ def test_output_tokens_metric(self):
170
+ from verifiers.utils.metric_utils import OutputTokensMetric
171
+
172
+ m = OutputTokensMetric()
173
+ m.add_output({"token_usage": {"output_tokens": 40.0}})
174
+ m.add_output({"token_usage": {"output_tokens": 60.0}})
175
+ assert m.compute() == pytest.approx(50.0)
176
+
177
+ def test_final_input_tokens_metric(self):
178
+ from verifiers.utils.metric_utils import FinalInputTokensMetric
179
+
180
+ m = FinalInputTokensMetric()
181
+ m.add_output({"token_usage": {"final_input_tokens": 50.0}})
182
+ m.add_output({"token_usage": {"final_input_tokens": 100.0}})
183
+ assert m.compute() == pytest.approx(75.0)
184
+
185
+ def test_final_output_tokens_metric(self):
186
+ from verifiers.utils.metric_utils import FinalOutputTokensMetric
187
+
188
+ m = FinalOutputTokensMetric()
189
+ m.add_output({"token_usage": {"final_output_tokens": 150.0}})
190
+ m.add_output({"token_usage": {"final_output_tokens": 250.0}})
191
+ assert m.compute() == pytest.approx(200.0)
192
+
193
+ def test_skips_outputs_without_token_usage(self):
194
+ from verifiers.utils.metric_utils import FinalInputTokensMetric
195
+
196
+ m = FinalInputTokensMetric()
197
+ m.add_output({})
198
+ m.add_output({"token_usage": {}})
199
+ assert m.count == 0
200
+ assert m.compute() == 0.0
@@ -237,7 +237,9 @@ async def test_state_to_output_uses_state_usage_not_trajectory(
237
237
  state["reward"] = 0.0
238
238
 
239
239
  output = state_to_output(state, state_columns=[])
240
- assert output["token_usage"] == {"input_tokens": 5.0, "output_tokens": 4.0}
240
+ usage = output["token_usage"]
241
+ assert usage["input_tokens"] == 5.0
242
+ assert usage["output_tokens"] == 4.0
241
243
 
242
244
 
243
245
  @pytest.mark.asyncio
@@ -0,0 +1,133 @@
1
+ import asyncio
2
+ from unittest.mock import AsyncMock, MagicMock
3
+
4
+ from verifiers.errors import InfraError
5
+ from verifiers.types import (
6
+ Response,
7
+ ResponseMessage,
8
+ TextContentPart,
9
+ ToolCall,
10
+ Usage,
11
+ )
12
+ from verifiers.utils import interception_utils
13
+ from verifiers.utils.interception_utils import (
14
+ InterceptionServer,
15
+ StreamInterrupted,
16
+ create_empty_completion,
17
+ serialize_intercept_response,
18
+ )
19
+
20
+
21
+ def test_serialize_intercept_response_from_vf_response_uses_chat_completion_shape():
22
+ response = Response(
23
+ id="resp_1",
24
+ created=123,
25
+ model="test-model",
26
+ usage=Usage(
27
+ prompt_tokens=10,
28
+ reasoning_tokens=0,
29
+ completion_tokens=5,
30
+ total_tokens=15,
31
+ ),
32
+ message=ResponseMessage(
33
+ content=[TextContentPart(text="hello "), {"type": "text", "text": "world"}],
34
+ reasoning_content=None,
35
+ tool_calls=[
36
+ ToolCall(id="call_1", name="echo", arguments='{"x": 1}'),
37
+ ],
38
+ finish_reason="tool_calls",
39
+ is_truncated=False,
40
+ tokens=None,
41
+ ),
42
+ )
43
+
44
+ payload = serialize_intercept_response(response)
45
+
46
+ assert payload["id"] == "resp_1"
47
+ assert payload["object"] == "chat.completion"
48
+ assert payload["model"] == "test-model"
49
+ assert payload["choices"][0]["message"]["role"] == "assistant"
50
+ assert payload["choices"][0]["message"]["content"] == "hello world"
51
+ assert payload["choices"][0]["message"]["tool_calls"] == [
52
+ {
53
+ "id": "call_1",
54
+ "type": "function",
55
+ "function": {"name": "echo", "arguments": '{"x": 1}'},
56
+ }
57
+ ]
58
+ assert payload["choices"][0]["finish_reason"] == "tool_calls"
59
+ assert payload["usage"]["prompt_tokens"] == 10
60
+ assert payload["usage"]["completion_tokens"] == 5
61
+ assert payload["usage"]["total_tokens"] == 15
62
+
63
+
64
+ def test_serialize_intercept_response_passthrough_native_chat_completion():
65
+ native = create_empty_completion("native-model")
66
+ payload = serialize_intercept_response(native)
67
+
68
+ assert payload["object"] == "chat.completion"
69
+ assert payload["model"] == "native-model"
70
+ assert len(payload["choices"]) == 1
71
+
72
+
73
+ def test_set_rollout_error_attaches_stream_interrupted_to_state():
74
+ server = InterceptionServer(port=0)
75
+ state: dict = {}
76
+ server.register_rollout("r1", state=state)
77
+
78
+ err = StreamInterrupted("tunnel died")
79
+ server._set_rollout_error("r1", err)
80
+
81
+ assert state["error"] is err
82
+ assert isinstance(state["error"], InfraError)
83
+
84
+
85
+ def test_set_rollout_error_does_not_clobber_existing_error():
86
+ # First error wins — later write failures must not hide the original cause.
87
+ server = InterceptionServer(port=0)
88
+ original = InfraError("original")
89
+ state: dict = {"error": original}
90
+ server.register_rollout("r1", state=state)
91
+
92
+ server._set_rollout_error("r1", StreamInterrupted("later"))
93
+
94
+ assert state["error"] is original
95
+
96
+
97
+ async def test_streaming_write_failure_surfaces_to_state(monkeypatch):
98
+ """The real failure path: a mid-SSE transport close on the client side
99
+ raises out of ``response.write(...)``. The except branch must funnel
100
+ that into ``state["error"]`` so the rollout halts via ``has_error``."""
101
+ server = InterceptionServer(port=0)
102
+ state: dict = {}
103
+ server.register_rollout("r1", state=state)
104
+
105
+ # Mock StreamResponse whose second write raises (first write succeeds
106
+ # to prove we're in the streaming loop, not failing at prepare()).
107
+ writes: list[bytes] = []
108
+
109
+ async def fake_write(data: bytes) -> None:
110
+ writes.append(data)
111
+ if len(writes) >= 2:
112
+ raise ConnectionResetError("client closed transport")
113
+
114
+ fake_response = MagicMock()
115
+ fake_response.prepare = AsyncMock()
116
+ fake_response.write = AsyncMock(side_effect=fake_write)
117
+ fake_response.write_eof = AsyncMock()
118
+ monkeypatch.setattr(
119
+ interception_utils.web, "StreamResponse", lambda **_: fake_response
120
+ )
121
+
122
+ chunk_queue: asyncio.Queue = asyncio.Queue()
123
+ await chunk_queue.put({"choices": [{"delta": {"content": "hi"}}]})
124
+ await chunk_queue.put({"choices": [{"delta": {"content": " there"}}]})
125
+ intercept = {
126
+ "chunk_queue": chunk_queue,
127
+ "response_future": asyncio.Future(),
128
+ }
129
+
130
+ await server._handle_streaming_response(MagicMock(), "r1", intercept)
131
+
132
+ assert isinstance(state["error"], StreamInterrupted)
133
+ assert "ConnectionResetError" in str(state["error"])
@@ -6,6 +6,8 @@ fields and that the install script is generated correctly.
6
6
 
7
7
  import importlib
8
8
  import json
9
+ from pathlib import Path
10
+ import subprocess
9
11
  from types import SimpleNamespace
10
12
  from unittest.mock import AsyncMock, call
11
13
 
@@ -18,9 +20,11 @@ from verifiers.envs.experimental.composable import (
18
20
  SandboxSpec,
19
21
  SandboxTaskSet,
20
22
  )
23
+ from verifiers.envs.experimental.composable.harnesses import rlm as rlm_module
21
24
  from verifiers.envs.experimental.composable.harnesses.rlm import (
22
25
  build_install_script,
23
26
  rlm_harness,
27
+ resolve_local_checkout,
24
28
  )
25
29
 
26
30
 
@@ -86,28 +90,160 @@ def _make_temp_taskset_package(tmp_path, monkeypatch, *, with_skills: bool):
86
90
  return mod
87
91
 
88
92
 
93
+ def _make_git_checkout(target: Path) -> Path:
94
+ checkout = target
95
+ checkout.mkdir()
96
+ (checkout / "install.sh").write_text("#!/usr/bin/env bash\n")
97
+ (checkout / "pyproject.toml").write_text("[project]\nname='rlm'\nversion='0.0.0'\n")
98
+ subprocess.run(["git", "init", "-b", "main"], cwd=checkout, check=True)
99
+ subprocess.run(
100
+ ["git", "add", "install.sh", "pyproject.toml"], cwd=checkout, check=True
101
+ )
102
+ subprocess.run(
103
+ [
104
+ "git",
105
+ "-c",
106
+ "user.name=Codex",
107
+ "-c",
108
+ "user.email=codex@example.com",
109
+ "commit",
110
+ "-m",
111
+ "init",
112
+ ],
113
+ cwd=checkout,
114
+ check=True,
115
+ )
116
+ return checkout
117
+
118
+
89
119
  # ── RLM harness ──────────────────────────────────────────────────────────
90
120
 
91
121
 
92
- def test_rlm_harness_install_script_downloads_repo_install_sh():
122
+ def test_rlm_harness_install_script_requires_uploaded_checkout():
93
123
  script = build_install_script()
94
- assert "git clone --depth 1 --branch main" in script
95
- assert "github.com/PrimeIntellect-ai/rlm.git" in script
96
- assert "bash /tmp/rlm-checkout/install.sh" in script
124
+ assert 'test -f "$RLM_CHECKOUT_PATH/install.sh"' in script
125
+ assert "git clone" not in script
126
+ assert 'bash "$RLM_CHECKOUT_PATH/install.sh"' in script
97
127
 
98
128
 
99
- def test_rlm_harness_sets_metrics_fields():
100
- harness = rlm_harness()
129
+ def test_rlm_harness_sets_metrics_fields(tmp_path):
130
+ harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
101
131
  assert harness.metrics_path == "{workdir}/.rlm/sessions/*/meta.json"
102
132
  assert harness.metrics_key == "metrics"
103
133
  assert harness.metrics_prefix == "rlm_"
104
134
 
105
135
 
106
- def test_rlm_harness_sets_skills_path():
107
- harness = rlm_harness()
136
+ def test_rlm_harness_sets_skills_path(tmp_path):
137
+ harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
108
138
  assert harness.skills_path == "/task/rlm-skills"
109
139
 
110
140
 
141
+ def test_resolve_local_checkout_validates_explicit_path(tmp_path):
142
+ checkout = _make_git_checkout(tmp_path / "rlm")
143
+
144
+ resolved = resolve_local_checkout(checkout)
145
+
146
+ assert resolved == checkout.resolve()
147
+
148
+
149
+ def test_rlm_harness_uploads_explicit_local_checkout(tmp_path):
150
+ checkout = _make_git_checkout(tmp_path / "rlm")
151
+
152
+ harness = rlm_harness(local_checkout=checkout)
153
+
154
+ assert harness.get_upload_dirs is not None
155
+ assert harness.get_upload_dirs() == {"rlm_checkout": checkout.resolve()}
156
+ assert harness.upload_dir_mapping == {"rlm_checkout": "/tmp/rlm-checkout"}
157
+
158
+
159
+ def test_resolve_local_checkout_materializes_host_cache(tmp_path):
160
+ source_checkout = _make_git_checkout(tmp_path / "rlm-source")
161
+ checkout_dir = tmp_path / "checkout-root" / "rlm"
162
+
163
+ resolved = resolve_local_checkout(
164
+ local_checkout=checkout_dir,
165
+ rlm_repo_url=str(source_checkout),
166
+ rlm_branch="main",
167
+ )
168
+
169
+ assert resolved == checkout_dir.resolve()
170
+ assert (checkout_dir / ".git").is_dir()
171
+ assert (checkout_dir / "install.sh").is_file()
172
+ assert (checkout_dir / "pyproject.toml").is_file()
173
+
174
+
175
+ def test_rlm_harness_uses_default_host_cache_when_local_checkout_unspecified(
176
+ tmp_path, monkeypatch
177
+ ):
178
+ source_checkout = _make_git_checkout(tmp_path / "rlm-source")
179
+ monkeypatch.setattr(
180
+ rlm_module,
181
+ "DEFAULT_RLM_LOCAL_CHECKOUT_CACHE_ROOT",
182
+ tmp_path / "cache-root",
183
+ )
184
+
185
+ harness = rlm_harness(
186
+ rlm_repo_url=str(source_checkout),
187
+ rlm_branch="main",
188
+ )
189
+
190
+ assert harness.get_upload_dirs is not None
191
+ upload_checkout = harness.get_upload_dirs()["rlm_checkout"]
192
+ assert isinstance(upload_checkout, Path)
193
+ assert upload_checkout.is_dir()
194
+ assert upload_checkout.name.startswith("rlm-source-main-")
195
+ assert harness.upload_dir_mapping == {"rlm_checkout": "/tmp/rlm-checkout"}
196
+
197
+
198
+ def test_rlm_harness_always_uploads_checkout(tmp_path, monkeypatch):
199
+ source_checkout = _make_git_checkout(tmp_path / "rlm-source")
200
+ monkeypatch.setattr(
201
+ rlm_module,
202
+ "DEFAULT_RLM_LOCAL_CHECKOUT_CACHE_ROOT",
203
+ tmp_path / "cache-root",
204
+ )
205
+
206
+ harness = rlm_harness(
207
+ rlm_repo_url=str(source_checkout),
208
+ rlm_branch="main",
209
+ )
210
+
211
+ assert harness.get_upload_dirs is not None
212
+ assert harness.upload_dir_mapping is not None
213
+
214
+
215
+ def test_resolve_local_checkout_redacts_gh_token_on_clone_failure(
216
+ tmp_path, monkeypatch
217
+ ):
218
+ failing_checkout = tmp_path / "checkout-root" / "rlm"
219
+ token = "super/secret token"
220
+ quoted_token = "super%2Fsecret%20token"
221
+
222
+ def _raise_clone_error(*args, **kwargs):
223
+ raise subprocess.CalledProcessError(
224
+ 128,
225
+ args[0],
226
+ stderr=(
227
+ "fatal: could not read from "
228
+ f"https://{quoted_token}@github.com/PrimeIntellect-ai/rlm.git"
229
+ ),
230
+ )
231
+
232
+ monkeypatch.setattr(rlm_module.subprocess, "run", _raise_clone_error)
233
+
234
+ with pytest.raises(RuntimeError) as exc_info:
235
+ resolve_local_checkout(
236
+ local_checkout=failing_checkout,
237
+ rlm_repo_url="github.com/PrimeIntellect-ai/rlm.git",
238
+ rlm_branch="main",
239
+ gh_token=token,
240
+ )
241
+
242
+ message = str(exc_info.value)
243
+ assert token not in message
244
+ assert "<redacted>" in message
245
+
246
+
111
247
  # ── install_env ──────────────────────────────────────────────────────────
112
248
 
113
249
 
@@ -201,7 +337,7 @@ async def test_rlm_uploads_skills_before_install(tmp_path, monkeypatch):
201
337
 
202
338
 
203
339
  @pytest.mark.asyncio
204
- async def test_rlm_collects_logs_and_metrics():
340
+ async def test_rlm_collects_logs_and_metrics(tmp_path):
205
341
  taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
206
342
  metrics = {
207
343
  "turns": 3,
@@ -209,7 +345,7 @@ async def test_rlm_collects_logs_and_metrics():
209
345
  "prompt_tokens": 100,
210
346
  "completion_tokens": 25,
211
347
  }
212
- harness = rlm_harness()
348
+ harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
213
349
  env = ComposableEnv(
214
350
  taskset=taskset,
215
351
  harness=Harness(
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.13.dev1"
1
+ __version__ = "0.1.13.dev3"
2
2
 
3
3
  import importlib
4
4
  import os
@@ -483,14 +483,12 @@ class Environment(ABC):
483
483
  usage = state.get("usage")
484
484
  if isinstance(usage, Mapping):
485
485
  try:
486
- input_tokens = float(usage.get("input_tokens", 0.0))
487
- output_tokens = float(usage.get("output_tokens", 0.0))
486
+ return {
487
+ "input_tokens": float(usage.get("input_tokens", 0.0)),
488
+ "output_tokens": float(usage.get("output_tokens", 0.0)),
489
+ }
488
490
  except (TypeError, ValueError):
489
491
  return None
490
- return {
491
- "input_tokens": input_tokens,
492
- "output_tokens": output_tokens,
493
- }
494
492
  return None
495
493
 
496
494
  async def get_model_response(