verifiers 0.1.13.dev1__tar.gz → 0.1.13.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222)
  1. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/PKG-INFO +1 -1
  2. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_composable_env.py +77 -0
  3. verifiers-0.1.13.dev2/tests/test_context_token_metrics.py +200 -0
  4. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_environment_extra.py +3 -1
  5. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rlm_composable_env.py +146 -10
  6. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/__init__.py +1 -1
  7. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/environment.py +4 -6
  8. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/cli_agent_env.py +11 -8
  9. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/composable_env.py +24 -3
  10. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harness.py +17 -1
  11. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harnesses/__init__.py +0 -2
  12. verifiers-0.1.13.dev2/verifiers/envs/experimental/composable/harnesses/rlm.py +287 -0
  13. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/stateful_tool_env.py +2 -2
  14. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/tool_env.py +11 -11
  15. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/tui.py +283 -134
  16. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/types.py +2 -0
  17. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/eval_display.py +28 -13
  18. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/eval_utils.py +31 -12
  19. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/metric_utils.py +27 -11
  20. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/save_utils.py +29 -5
  21. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/usage_utils.py +52 -0
  22. verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -82
  23. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/.gitignore +0 -0
  24. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/LICENSE +0 -0
  25. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/README.md +0 -0
  26. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/pyproject.toml +0 -0
  27. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/AGENTS.md +0 -0
  28. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/README.md +0 -0
  29. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/__init__.py +0 -0
  30. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/conftest.py +0 -0
  31. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_browser_env.py +0 -0
  32. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_build_script.py +0 -0
  33. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_cli_agent_env.py +0 -0
  34. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_client_auth_errors.py +0 -0
  35. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_client_config.py +0 -0
  36. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_client_multimodal_types.py +0 -0
  37. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_decorator_ranks.py +0 -0
  38. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_endpoint_registry.py +0 -0
  39. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_env_group.py +0 -0
  40. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_env_server.py +0 -0
  41. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_environment.py +0 -0
  42. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_envs.py +0 -0
  43. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_error_chain.py +0 -0
  44. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_eval_cli.py +0 -0
  45. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_eval_display.py +0 -0
  46. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_eval_utils.py +0 -0
  47. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_gepa_cli.py +0 -0
  48. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_gym_env.py +0 -0
  49. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_imports.py +0 -0
  50. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_install_utils.py +0 -0
  51. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_interception_utils.py +0 -0
  52. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_logging.py +0 -0
  53. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_math_rubric.py +0 -0
  54. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_maybe_think_parser.py +0 -0
  55. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_message_utils.py +0 -0
  56. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_message_utils_multimodal.py +0 -0
  57. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_multiturn_env.py +0 -0
  58. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_nemorl_client.py +0 -0
  59. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_openai_chat_completions_token_client.py +0 -0
  60. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_opencode_harbor.py +0 -0
  61. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_opencode_rlm_env.py +0 -0
  62. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_parser.py +0 -0
  63. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_path_utils.py +0 -0
  64. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_prime_plugin.py +0 -0
  65. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rlm_env.py +0 -0
  66. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rubric.py +0 -0
  67. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_rubric_group.py +0 -0
  68. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_sandbox_env.py +0 -0
  69. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_sandbox_mixin.py +0 -0
  70. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_save_utils.py +0 -0
  71. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_setup_script.py +0 -0
  72. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_singleturn_env.py +0 -0
  73. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_stateful_tool_env.py +0 -0
  74. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_think_parser.py +0 -0
  75. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_tool_env.py +0 -0
  76. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_tool_utils.py +0 -0
  77. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_trajectory_processing.py +0 -0
  78. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_tui_info_formatting.py +0 -0
  79. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/tests/test_xml_parser.py +0 -0
  80. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/AGENTS.md +0 -0
  81. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/__init__.py +0 -0
  82. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/__init__.py +0 -0
  83. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/build.py +0 -0
  84. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/eval.py +0 -0
  85. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/gepa.py +0 -0
  86. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/init.py +0 -0
  87. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/install.py +0 -0
  88. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/commands/setup.py +0 -0
  89. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/plugins/__init__.py +0 -0
  90. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/plugins/prime.py +0 -0
  91. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/cli/tui.py +0 -0
  92. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/__init__.py +0 -0
  93. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/anthropic_messages_client.py +0 -0
  94. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/client.py +0 -0
  95. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/nemorl_chat_completions_client.py +0 -0
  96. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/openai_chat_completions_client.py +0 -0
  97. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/openai_chat_completions_token_client.py +0 -0
  98. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/clients/openai_completions_client.py +0 -0
  99. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/decorators.py +0 -0
  100. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/AGENTS.md +0 -0
  101. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/__init__.py +0 -0
  102. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/env_group.py +0 -0
  103. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/README.md +0 -0
  104. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/__init__.py +0 -0
  105. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/README.md +0 -0
  106. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/__init__.py +0 -0
  107. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  108. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  109. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/task.py +0 -0
  110. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/__init__.py +0 -0
  111. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  112. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  113. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +0 -0
  114. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  115. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +0 -0
  116. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  117. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  118. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  119. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  120. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +0 -0
  121. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  122. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  123. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +0 -0
  124. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +0 -0
  125. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +0 -0
  126. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +0 -0
  127. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +0 -0
  128. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +0 -0
  129. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/gym_env.py +0 -0
  130. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/harbor_env.py +0 -0
  131. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/mcp_env.py +0 -0
  132. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/opencode_env.py +0 -0
  133. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  134. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  135. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/rlm_env.py +0 -0
  136. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  137. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/README.md +0 -0
  138. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/__init__.py +0 -0
  139. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/README.md +0 -0
  140. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  141. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  142. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  143. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  144. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  145. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  146. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/openenv_env.py +0 -0
  147. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  148. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/integrations/textarena_env.py +0 -0
  149. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/multiturn_env.py +0 -0
  150. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/python_env.py +0 -0
  151. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/sandbox_env.py +0 -0
  152. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/envs/singleturn_env.py +0 -0
  153. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/errors.py +0 -0
  154. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/__init__.py +0 -0
  155. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/adapter.py +0 -0
  156. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/config.py +0 -0
  157. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/display.py +0 -0
  158. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/gepa/gepa_utils.py +0 -0
  159. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/__init__.py +0 -0
  160. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/maybe_think_parser.py +0 -0
  161. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/parser.py +0 -0
  162. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/think_parser.py +0 -0
  163. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/parsers/xml_parser.py +0 -0
  164. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/README.md +0 -0
  165. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/__init__.py +0 -0
  166. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/inference/__init__.py +0 -0
  167. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/inference/client.py +0 -0
  168. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/inference/server.py +0 -0
  169. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/__init__.py +0 -0
  170. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/config.py +0 -0
  171. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/orchestrator.py +0 -0
  172. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/trainer.py +0 -0
  173. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rl/trainer/utils.py +0 -0
  174. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/__init__.py +0 -0
  175. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  176. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/judge_rubric.py +0 -0
  177. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/math_rubric.py +0 -0
  178. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/rubric.py +0 -0
  179. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/rubrics/rubric_group.py +0 -0
  180. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/__init__.py +0 -0
  181. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/build.py +0 -0
  182. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/eval.py +0 -0
  183. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/gepa.py +0 -0
  184. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/init.py +0 -0
  185. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/install.py +0 -0
  186. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/prime_rl.py +0 -0
  187. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/rl.py +0 -0
  188. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/setup.py +0 -0
  189. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/train.py +0 -0
  190. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/scripts/vllm.py +0 -0
  191. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/__init__.py +0 -0
  192. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/client/env_client.py +0 -0
  193. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/client/zmq_env_client.py +0 -0
  194. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/__init__.py +0 -0
  195. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/env_router.py +0 -0
  196. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/env_server.py +0 -0
  197. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/env_worker.py +0 -0
  198. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/server/zmq_env_server.py +0 -0
  199. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/serve/types.py +0 -0
  200. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/__init__.py +0 -0
  201. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/async_utils.py +0 -0
  202. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/client_utils.py +0 -0
  203. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/config_utils.py +0 -0
  204. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/data_utils.py +0 -0
  205. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/display_utils.py +0 -0
  206. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/env_utils.py +0 -0
  207. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/error_utils.py +0 -0
  208. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/heartbeat.py +0 -0
  209. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/import_utils.py +0 -0
  210. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/install_utils.py +0 -0
  211. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/interception_utils.py +0 -0
  212. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/logging_utils.py +0 -0
  213. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/message_utils.py +0 -0
  214. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/path_utils.py +0 -0
  215. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/process_utils.py +0 -0
  216. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/response_utils.py +0 -0
  217. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/serve_utils.py +0 -0
  218. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/thread_utils.py +0 -0
  219. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/threaded_sandbox_client.py +0 -0
  220. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/tool_utils.py +0 -0
  221. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/tunnel_utils.py +0 -0
  222. {verifiers-0.1.13.dev1 → verifiers-0.1.13.dev2}/verifiers/utils/version_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.13.dev1
3
+ Version: 0.1.13.dev2
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -429,6 +429,83 @@ async def test_composable_env_no_upload_when_no_dirs(tmp_path, monkeypatch):
429
429
  assert env.upload_file.await_count == 0
430
430
 
431
431
 
432
+ @pytest.mark.asyncio
433
+ async def test_composable_env_uploads_harness_dirs(tmp_path):
434
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
435
+ harness_dir = tmp_path / "agent-src"
436
+ harness_dir.mkdir()
437
+ (harness_dir / "marker.txt").write_text("agent\n")
438
+
439
+ env = ComposableEnv(
440
+ taskset=taskset,
441
+ harness=Harness(
442
+ run_command="true",
443
+ install_script="install-agent",
444
+ get_upload_dirs=lambda: {"agent_src": harness_dir},
445
+ upload_dir_mapping={"agent_src": "/tmp/agent-src"},
446
+ ),
447
+ )
448
+ env.sandbox_client = SimpleNamespace(
449
+ execute_command=AsyncMock(
450
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
451
+ ),
452
+ teardown=lambda: None,
453
+ )
454
+ env.taskset.setup = AsyncMock()
455
+ env.upload_content = AsyncMock()
456
+ env.upload_file = AsyncMock()
457
+
458
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
459
+
460
+ env.upload_file.assert_awaited_once()
461
+ upload_call = env.upload_file.await_args
462
+ assert upload_call.args[0] == "sbx"
463
+ assert upload_call.args[1] == "/tmp/_upload_tmp_agent-src.tar.gz"
464
+
465
+ extract_call = env.sandbox_client.execute_command.await_args_list[1]
466
+ assert extract_call == call(
467
+ "sbx",
468
+ "mkdir -p /tmp && tar -xzf /tmp/_upload_tmp_agent-src.tar.gz -C / && rm -f /tmp/_upload_tmp_agent-src.tar.gz",
469
+ timeout=60,
470
+ )
471
+
472
+
473
+ @pytest.mark.asyncio
474
+ async def test_composable_env_rejects_duplicate_task_and_harness_upload_names(
475
+ tmp_path, monkeypatch
476
+ ):
477
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=True)
478
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
479
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
480
+ harness_dir = tmp_path / "skills"
481
+ harness_dir.mkdir()
482
+
483
+ env = ComposableEnv(
484
+ taskset=taskset,
485
+ harness=Harness(
486
+ run_command="true",
487
+ install_script="install-agent",
488
+ get_upload_dirs=lambda: {"skills": harness_dir},
489
+ skills_path="/task/skills",
490
+ ),
491
+ )
492
+ env.sandbox_client = SimpleNamespace(
493
+ execute_command=AsyncMock(
494
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
495
+ ),
496
+ teardown=lambda: None,
497
+ )
498
+ env.taskset.setup = AsyncMock()
499
+ env.upload_content = AsyncMock()
500
+ env.upload_file = AsyncMock()
501
+
502
+ with pytest.raises(
503
+ ValueError,
504
+ match="Upload directory names must be unique across task and harness",
505
+ ):
506
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
507
+
508
+
432
509
  # ── discover_sibling_dir ─────────────────────────────────────────────────
433
510
 
434
511
 
@@ -0,0 +1,200 @@
1
+ """Tests for per-turn context token metrics.
2
+
3
+ Tests the trajectory-based context token computation
4
+ (final_input_tokens, final_output_tokens) which assumes a linear rollout
5
+ using the last trajectory step.
6
+ """
7
+
8
+ from unittest.mock import MagicMock
9
+
10
+ import pytest
11
+
12
+ from verifiers.utils.usage_utils import compute_context_token_metrics
13
+
14
+
15
+ # =========================================================================
16
+ # Helpers
17
+ # =========================================================================
18
+
19
+ SYS = {"role": "system", "content": "You are helpful"}
20
+ USER = {"role": "user", "content": "hi"}
21
+
22
+
23
+ def _make_response(prompt_tokens: int, completion_tokens: int) -> MagicMock:
24
+ response = MagicMock()
25
+ response.usage = MagicMock(
26
+ prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
27
+ )
28
+ return response
29
+
30
+
31
+ def _asst(i: int) -> dict:
32
+ return {"role": "assistant", "content": f"response {i}"}
33
+
34
+
35
+ # =========================================================================
36
+ # compute_context_token_metrics
37
+ # =========================================================================
38
+
39
+
40
+ class TestContextMetrics:
41
+ def test_empty_trajectory(self):
42
+ metrics = compute_context_token_metrics([])
43
+ assert metrics["final_output_tokens"] == 0
44
+ assert metrics["final_input_tokens"] == 0
45
+
46
+ def test_single_turn(self):
47
+ trajectory = [
48
+ {
49
+ "prompt": [SYS, USER],
50
+ "completion": [_asst(0)],
51
+ "response": _make_response(100, 20),
52
+ },
53
+ ]
54
+ metrics = compute_context_token_metrics(trajectory)
55
+ assert metrics["final_output_tokens"] == 20
56
+ assert metrics["final_input_tokens"] == 100
57
+
58
+ def test_multi_turn(self):
59
+ trajectory = [
60
+ {
61
+ "response": _make_response(100, 20),
62
+ },
63
+ {
64
+ "response": _make_response(150, 25),
65
+ },
66
+ {
67
+ "response": _make_response(200, 30),
68
+ },
69
+ ]
70
+ metrics = compute_context_token_metrics(trajectory)
71
+ # Last step total = 200 + 30 = 230
72
+ # Sum of completion tokens = 20 + 25 + 30 = 75
73
+ assert metrics["final_output_tokens"] == 75
74
+ assert metrics["final_input_tokens"] == 230 - 75
75
+
76
+ def test_invariant_total_equals_last_step(self):
77
+ trajectory = [
78
+ {"response": _make_response(100, 20)},
79
+ {"response": _make_response(150, 25)},
80
+ {"response": _make_response(200, 30)},
81
+ ]
82
+ metrics = compute_context_token_metrics(trajectory)
83
+ total = metrics["final_output_tokens"] + metrics["final_input_tokens"]
84
+ # Total should equal last step's prompt_tokens + completion_tokens
85
+ assert total == 200 + 30
86
+
87
+ def test_no_response_on_any_step(self):
88
+ trajectory = [{"response": None}]
89
+ metrics = compute_context_token_metrics(trajectory)
90
+ assert metrics["final_output_tokens"] == 0
91
+ assert metrics["final_input_tokens"] == 0
92
+
93
+ def test_last_step_used_not_largest(self):
94
+ """Even if an earlier step has a larger context, we use the last step."""
95
+ trajectory = [
96
+ {"response": _make_response(500, 100)}, # larger context
97
+ {"response": _make_response(100, 20)}, # last step, smaller
98
+ ]
99
+ metrics = compute_context_token_metrics(trajectory)
100
+ # Last step total = 120, sum completions = 100 + 20 = 120
101
+ assert metrics["final_output_tokens"] == 120
102
+ assert metrics["final_input_tokens"] == 0 # clamped to 0
103
+
104
+ def test_skips_none_responses_for_last_step(self):
105
+ """Last step with response=None is skipped; uses previous step."""
106
+ trajectory = [
107
+ {"response": _make_response(100, 20)},
108
+ {"response": _make_response(200, 30)},
109
+ {"response": None},
110
+ ]
111
+ metrics = compute_context_token_metrics(trajectory)
112
+ # Last step with response is step 1: total = 230
113
+ # Sum completions from all steps with responses: 20 + 30 = 50
114
+ assert metrics["final_output_tokens"] == 50
115
+ assert metrics["final_input_tokens"] == 230 - 50
116
+
117
+ def test_skips_responses_without_usage(self):
118
+ """Responses with no .usage attribute are skipped entirely."""
119
+ no_usage = MagicMock()
120
+ no_usage.usage = None
121
+ trajectory = [
122
+ {"response": _make_response(100, 20)},
123
+ {"response": _make_response(200, 30)},
124
+ {"response": no_usage}, # last step, but no usage
125
+ ]
126
+ metrics = compute_context_token_metrics(trajectory)
127
+ # Should use step 1 (last with usage): total = 230
128
+ assert metrics["final_output_tokens"] == 50
129
+ assert metrics["final_input_tokens"] == 230 - 50
130
+
131
+ def test_all_responses_lack_usage(self):
132
+ """If no response has usage data, return zeros."""
133
+ no_usage = MagicMock()
134
+ no_usage.usage = None
135
+ trajectory = [
136
+ {"response": no_usage},
137
+ {"response": no_usage},
138
+ ]
139
+ metrics = compute_context_token_metrics(trajectory)
140
+ assert metrics["final_output_tokens"] == 0
141
+ assert metrics["final_input_tokens"] == 0
142
+
143
+ def test_final_input_tokens_clamped_to_zero(self):
144
+ """If sum of completions exceeds last step total, input is clamped to 0."""
145
+ trajectory = [
146
+ {"response": _make_response(10, 500)}, # huge completion
147
+ {"response": _make_response(50, 10)},
148
+ ]
149
+ metrics = compute_context_token_metrics(trajectory)
150
+ # Last step total = 60, sum completions = 510
151
+ assert metrics["final_output_tokens"] == 510
152
+ assert metrics["final_input_tokens"] == 0
153
+
154
+
155
+ # =========================================================================
156
+ # Metric classes
157
+ # =========================================================================
158
+
159
+
160
+ class TestContextTokenMetricClasses:
161
+ def test_input_tokens_metric(self):
162
+ from verifiers.utils.metric_utils import InputTokensMetric
163
+
164
+ m = InputTokensMetric()
165
+ m.add_output({"token_usage": {"input_tokens": 100.0}})
166
+ m.add_output({"token_usage": {"input_tokens": 200.0}})
167
+ assert m.compute() == pytest.approx(150.0)
168
+
169
+ def test_output_tokens_metric(self):
170
+ from verifiers.utils.metric_utils import OutputTokensMetric
171
+
172
+ m = OutputTokensMetric()
173
+ m.add_output({"token_usage": {"output_tokens": 40.0}})
174
+ m.add_output({"token_usage": {"output_tokens": 60.0}})
175
+ assert m.compute() == pytest.approx(50.0)
176
+
177
+ def test_final_input_tokens_metric(self):
178
+ from verifiers.utils.metric_utils import FinalInputTokensMetric
179
+
180
+ m = FinalInputTokensMetric()
181
+ m.add_output({"token_usage": {"final_input_tokens": 50.0}})
182
+ m.add_output({"token_usage": {"final_input_tokens": 100.0}})
183
+ assert m.compute() == pytest.approx(75.0)
184
+
185
+ def test_final_output_tokens_metric(self):
186
+ from verifiers.utils.metric_utils import FinalOutputTokensMetric
187
+
188
+ m = FinalOutputTokensMetric()
189
+ m.add_output({"token_usage": {"final_output_tokens": 150.0}})
190
+ m.add_output({"token_usage": {"final_output_tokens": 250.0}})
191
+ assert m.compute() == pytest.approx(200.0)
192
+
193
+ def test_skips_outputs_without_token_usage(self):
194
+ from verifiers.utils.metric_utils import FinalInputTokensMetric
195
+
196
+ m = FinalInputTokensMetric()
197
+ m.add_output({})
198
+ m.add_output({"token_usage": {}})
199
+ assert m.count == 0
200
+ assert m.compute() == 0.0
@@ -237,7 +237,9 @@ async def test_state_to_output_uses_state_usage_not_trajectory(
237
237
  state["reward"] = 0.0
238
238
 
239
239
  output = state_to_output(state, state_columns=[])
240
- assert output["token_usage"] == {"input_tokens": 5.0, "output_tokens": 4.0}
240
+ usage = output["token_usage"]
241
+ assert usage["input_tokens"] == 5.0
242
+ assert usage["output_tokens"] == 4.0
241
243
 
242
244
 
243
245
  @pytest.mark.asyncio
@@ -6,6 +6,8 @@ fields and that the install script is generated correctly.
6
6
 
7
7
  import importlib
8
8
  import json
9
+ from pathlib import Path
10
+ import subprocess
9
11
  from types import SimpleNamespace
10
12
  from unittest.mock import AsyncMock, call
11
13
 
@@ -18,9 +20,11 @@ from verifiers.envs.experimental.composable import (
18
20
  SandboxSpec,
19
21
  SandboxTaskSet,
20
22
  )
23
+ from verifiers.envs.experimental.composable.harnesses import rlm as rlm_module
21
24
  from verifiers.envs.experimental.composable.harnesses.rlm import (
22
25
  build_install_script,
23
26
  rlm_harness,
27
+ resolve_local_checkout,
24
28
  )
25
29
 
26
30
 
@@ -86,28 +90,160 @@ def _make_temp_taskset_package(tmp_path, monkeypatch, *, with_skills: bool):
86
90
  return mod
87
91
 
88
92
 
93
+ def _make_git_checkout(target: Path) -> Path:
94
+ checkout = target
95
+ checkout.mkdir()
96
+ (checkout / "install.sh").write_text("#!/usr/bin/env bash\n")
97
+ (checkout / "pyproject.toml").write_text("[project]\nname='rlm'\nversion='0.0.0'\n")
98
+ subprocess.run(["git", "init", "-b", "main"], cwd=checkout, check=True)
99
+ subprocess.run(
100
+ ["git", "add", "install.sh", "pyproject.toml"], cwd=checkout, check=True
101
+ )
102
+ subprocess.run(
103
+ [
104
+ "git",
105
+ "-c",
106
+ "user.name=Codex",
107
+ "-c",
108
+ "user.email=codex@example.com",
109
+ "commit",
110
+ "-m",
111
+ "init",
112
+ ],
113
+ cwd=checkout,
114
+ check=True,
115
+ )
116
+ return checkout
117
+
118
+
89
119
  # ── RLM harness ──────────────────────────────────────────────────────────
90
120
 
91
121
 
92
- def test_rlm_harness_install_script_downloads_repo_install_sh():
122
+ def test_rlm_harness_install_script_requires_uploaded_checkout():
93
123
  script = build_install_script()
94
- assert "git clone --depth 1 --branch main" in script
95
- assert "github.com/PrimeIntellect-ai/rlm.git" in script
96
- assert "bash /tmp/rlm-checkout/install.sh" in script
124
+ assert 'test -f "$RLM_CHECKOUT_PATH/install.sh"' in script
125
+ assert "git clone" not in script
126
+ assert 'bash "$RLM_CHECKOUT_PATH/install.sh"' in script
97
127
 
98
128
 
99
- def test_rlm_harness_sets_metrics_fields():
100
- harness = rlm_harness()
129
+ def test_rlm_harness_sets_metrics_fields(tmp_path):
130
+ harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
101
131
  assert harness.metrics_path == "{workdir}/.rlm/sessions/*/meta.json"
102
132
  assert harness.metrics_key == "metrics"
103
133
  assert harness.metrics_prefix == "rlm_"
104
134
 
105
135
 
106
- def test_rlm_harness_sets_skills_path():
107
- harness = rlm_harness()
136
+ def test_rlm_harness_sets_skills_path(tmp_path):
137
+ harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
108
138
  assert harness.skills_path == "/task/rlm-skills"
109
139
 
110
140
 
141
+ def test_resolve_local_checkout_validates_explicit_path(tmp_path):
142
+ checkout = _make_git_checkout(tmp_path / "rlm")
143
+
144
+ resolved = resolve_local_checkout(checkout)
145
+
146
+ assert resolved == checkout.resolve()
147
+
148
+
149
+ def test_rlm_harness_uploads_explicit_local_checkout(tmp_path):
150
+ checkout = _make_git_checkout(tmp_path / "rlm")
151
+
152
+ harness = rlm_harness(local_checkout=checkout)
153
+
154
+ assert harness.get_upload_dirs is not None
155
+ assert harness.get_upload_dirs() == {"rlm_checkout": checkout.resolve()}
156
+ assert harness.upload_dir_mapping == {"rlm_checkout": "/tmp/rlm-checkout"}
157
+
158
+
159
+ def test_resolve_local_checkout_materializes_host_cache(tmp_path):
160
+ source_checkout = _make_git_checkout(tmp_path / "rlm-source")
161
+ checkout_dir = tmp_path / "checkout-root" / "rlm"
162
+
163
+ resolved = resolve_local_checkout(
164
+ local_checkout=checkout_dir,
165
+ rlm_repo_url=str(source_checkout),
166
+ rlm_branch="main",
167
+ )
168
+
169
+ assert resolved == checkout_dir.resolve()
170
+ assert (checkout_dir / ".git").is_dir()
171
+ assert (checkout_dir / "install.sh").is_file()
172
+ assert (checkout_dir / "pyproject.toml").is_file()
173
+
174
+
175
+ def test_rlm_harness_uses_default_host_cache_when_local_checkout_unspecified(
176
+ tmp_path, monkeypatch
177
+ ):
178
+ source_checkout = _make_git_checkout(tmp_path / "rlm-source")
179
+ monkeypatch.setattr(
180
+ rlm_module,
181
+ "DEFAULT_RLM_LOCAL_CHECKOUT_CACHE_ROOT",
182
+ tmp_path / "cache-root",
183
+ )
184
+
185
+ harness = rlm_harness(
186
+ rlm_repo_url=str(source_checkout),
187
+ rlm_branch="main",
188
+ )
189
+
190
+ assert harness.get_upload_dirs is not None
191
+ upload_checkout = harness.get_upload_dirs()["rlm_checkout"]
192
+ assert isinstance(upload_checkout, Path)
193
+ assert upload_checkout.is_dir()
194
+ assert upload_checkout.name.startswith("rlm-source-main-")
195
+ assert harness.upload_dir_mapping == {"rlm_checkout": "/tmp/rlm-checkout"}
196
+
197
+
198
+ def test_rlm_harness_always_uploads_checkout(tmp_path, monkeypatch):
199
+ source_checkout = _make_git_checkout(tmp_path / "rlm-source")
200
+ monkeypatch.setattr(
201
+ rlm_module,
202
+ "DEFAULT_RLM_LOCAL_CHECKOUT_CACHE_ROOT",
203
+ tmp_path / "cache-root",
204
+ )
205
+
206
+ harness = rlm_harness(
207
+ rlm_repo_url=str(source_checkout),
208
+ rlm_branch="main",
209
+ )
210
+
211
+ assert harness.get_upload_dirs is not None
212
+ assert harness.upload_dir_mapping is not None
213
+
214
+
215
+ def test_resolve_local_checkout_redacts_gh_token_on_clone_failure(
216
+ tmp_path, monkeypatch
217
+ ):
218
+ failing_checkout = tmp_path / "checkout-root" / "rlm"
219
+ token = "super/secret token"
220
+ quoted_token = "super%2Fsecret%20token"
221
+
222
+ def _raise_clone_error(*args, **kwargs):
223
+ raise subprocess.CalledProcessError(
224
+ 128,
225
+ args[0],
226
+ stderr=(
227
+ "fatal: could not read from "
228
+ f"https://{quoted_token}@github.com/PrimeIntellect-ai/rlm.git"
229
+ ),
230
+ )
231
+
232
+ monkeypatch.setattr(rlm_module.subprocess, "run", _raise_clone_error)
233
+
234
+ with pytest.raises(RuntimeError) as exc_info:
235
+ resolve_local_checkout(
236
+ local_checkout=failing_checkout,
237
+ rlm_repo_url="github.com/PrimeIntellect-ai/rlm.git",
238
+ rlm_branch="main",
239
+ gh_token=token,
240
+ )
241
+
242
+ message = str(exc_info.value)
243
+ assert token not in message
244
+ assert "<redacted>" in message
245
+
246
+
111
247
  # ── install_env ──────────────────────────────────────────────────────────
112
248
 
113
249
 
@@ -201,7 +337,7 @@ async def test_rlm_uploads_skills_before_install(tmp_path, monkeypatch):
201
337
 
202
338
 
203
339
  @pytest.mark.asyncio
204
- async def test_rlm_collects_logs_and_metrics():
340
+ async def test_rlm_collects_logs_and_metrics(tmp_path):
205
341
  taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
206
342
  metrics = {
207
343
  "turns": 3,
@@ -209,7 +345,7 @@ async def test_rlm_collects_logs_and_metrics():
209
345
  "prompt_tokens": 100,
210
346
  "completion_tokens": 25,
211
347
  }
212
- harness = rlm_harness()
348
+ harness = rlm_harness(local_checkout=_make_git_checkout(tmp_path / "rlm"))
213
349
  env = ComposableEnv(
214
350
  taskset=taskset,
215
351
  harness=Harness(
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.13.dev1"
1
+ __version__ = "0.1.13.dev2"
2
2
 
3
3
  import importlib
4
4
  import os
@@ -483,14 +483,12 @@ class Environment(ABC):
483
483
  usage = state.get("usage")
484
484
  if isinstance(usage, Mapping):
485
485
  try:
486
- input_tokens = float(usage.get("input_tokens", 0.0))
487
- output_tokens = float(usage.get("output_tokens", 0.0))
486
+ return {
487
+ "input_tokens": float(usage.get("input_tokens", 0.0)),
488
+ "output_tokens": float(usage.get("output_tokens", 0.0)),
489
+ }
488
490
  except (TypeError, ValueError):
489
491
  return None
490
- return {
491
- "input_tokens": input_tokens,
492
- "output_tokens": output_tokens,
493
- }
494
492
  return None
495
493
 
496
494
  async def get_model_response(
@@ -374,17 +374,20 @@ class CliAgentEnv(SandboxMixin, vf.MultiTurnEnv):
374
374
  f"Agent completed successfully (exit_code={status.exit_code})"
375
375
  )
376
376
  else:
377
- self.logger.warning(
378
- f"Agent failed (exit_code={status.exit_code}) stdout={status.stdout}, stderr={status.stderr}"
379
- )
380
- if len(state.get("trajectory", [])) == 0:
381
- stderr_snippet = (status.stderr or "")[:500]
377
+ stderr_full = status.stderr or ""
378
+ num_turns = len(state.get("trajectory", []))
379
+ if num_turns == 0:
382
380
  error = AgentError(
383
381
  f"Agent crashed before any LLM call "
384
- f"(exit_code={status.exit_code}): {stderr_snippet}"
382
+ f"(exit_code={status.exit_code}): {stderr_full}"
383
+ )
384
+ else:
385
+ error = AgentError(
386
+ f"Agent crashed after {num_turns} turn(s) "
387
+ f"(exit_code={status.exit_code}): {stderr_full}"
385
388
  )
386
- state["error"] = error
387
- self.logger.error(str(error))
389
+ state["error"] = error
390
+ self.logger.error(str(error))
388
391
  return
389
392
  await asyncio.sleep(self.poll_interval)
390
393
 
@@ -50,6 +50,7 @@ import verifiers as vf
50
50
  from verifiers.envs.experimental.cli_agent_env import CliAgentEnv
51
51
  from verifiers.envs.experimental.composable.harness import Harness
52
52
  from verifiers.envs.experimental.composable.task import TaskSet
53
+ from verifiers.envs.tool_env import ToolMonitorRubric
53
54
  from verifiers.types import State
54
55
 
55
56
  logger = logging.getLogger(__name__)
@@ -86,6 +87,9 @@ class ComposableEnv(CliAgentEnv):
86
87
  self.harness = harness
87
88
  self.install_env = dict(install_env) if install_env else None
88
89
 
90
+ if harness.tool_names:
91
+ self.add_rubric(ToolMonitorRubric(tool_names=list(harness.tool_names)))
92
+
89
93
  # -- CliAgentEnv hooks --------------------------------------------------
90
94
 
91
95
  def _get_spec(self, state: State) -> Any:
@@ -211,11 +215,11 @@ class ComposableEnv(CliAgentEnv):
211
215
  async def _after_harness_inputs_uploaded(self, state: State) -> None:
212
216
  """Upload task-declared directories to harness-declared sandbox paths.
213
217
 
214
- Joins ``TaskSet.get_upload_dirs()`` (logical name local source)
215
- with ``Harness.upload_dir_mapping`` (logical name → sandbox path).
218
+ Joins task-declared and harness-declared upload directories with
219
+ ``Harness.upload_dir_mapping`` (logical name → sandbox path).
216
220
  Only directories whose logical name appears in both are uploaded.
217
221
  """
218
- upload_dirs = self.taskset.get_upload_dirs()
222
+ upload_dirs = self._get_upload_dirs()
219
223
  mapping = self.harness.get_effective_upload_dir_mapping()
220
224
  if not upload_dirs or not mapping:
221
225
  return
@@ -225,6 +229,23 @@ class ComposableEnv(CliAgentEnv):
225
229
  if remote_dest is not None:
226
230
  await self._upload_dir(sandbox_id, local_source, remote_dest)
227
231
 
232
+ def _get_upload_dirs(self) -> dict[str, Traversable | Path]:
233
+ """Merge task-owned and harness-owned upload directories."""
234
+ task_upload_dirs = dict(self.taskset.get_upload_dirs() or {})
235
+ harness_upload_dirs_value = (
236
+ self.harness.get_upload_dirs() if self.harness.get_upload_dirs else None
237
+ )
238
+ harness_upload_dirs = dict(harness_upload_dirs_value or {})
239
+ duplicate_names = sorted(set(task_upload_dirs) & set(harness_upload_dirs))
240
+ if duplicate_names:
241
+ names = ", ".join(repr(name) for name in duplicate_names)
242
+ raise ValueError(
243
+ "Upload directory names must be unique across task and harness; "
244
+ f"duplicates: {names}."
245
+ )
246
+ task_upload_dirs.update(harness_upload_dirs)
247
+ return task_upload_dirs
248
+
228
249
  def _get_install_execute_kwargs(self) -> dict[str, Any]:
229
250
  """Keyword arguments passed to sandbox install command execution."""
230
251
  kwargs: dict[str, Any] = {"timeout": self.harness.install_timeout}
@@ -17,7 +17,9 @@ connects them.
17
17
  from __future__ import annotations
18
18
 
19
19
  from dataclasses import dataclass
20
- from typing import TYPE_CHECKING
20
+ from importlib.abc import Traversable
21
+ from pathlib import Path
22
+ from typing import TYPE_CHECKING, Callable
21
23
 
22
24
  if TYPE_CHECKING:
23
25
  from verifiers.envs.experimental.composable.task import SandboxSpec
@@ -58,6 +60,12 @@ class Harness:
58
60
  ``skills_path`` is merged into this mapping automatically.
59
61
  Use for non-skills directories; for skills prefer
60
62
  ``skills_path``.
63
+ get_upload_dirs:
64
+ Optional callable returning harness-owned local directories to
65
+ upload into the sandbox before install. These are merged with
66
+ task-declared upload dirs by ``ComposableEnv`` and resolved via
67
+ the same ``upload_dir_mapping`` logical-name contract.
68
+ Example: ``lambda: {"agent_src": Path("/path/to/checkout")}``.
61
69
  metrics_path:
62
70
  Glob pattern for a JSON metrics file inside the sandbox,
63
71
  collected after the rollout. May contain ``{workdir}`` which is
@@ -75,6 +83,12 @@ class Harness:
75
83
  metrics_keys:
76
84
  Optional whitelist of metric keys to surface. ``None`` means
77
85
  surface all keys found.
86
+ tool_names:
87
+ Names of the tools the agent uses internally. When non-empty,
88
+ ``ComposableEnv`` auto-registers a ``ToolMonitorRubric`` that
89
+ counts calls to each named tool (plus a total) from the
90
+ assistant messages the harness emits into the trajectory.
91
+ Example: ``["ipython", "summarize"]`` for the RLM harness.
78
92
  """
79
93
 
80
94
  install_script: str | None = None
@@ -87,10 +101,12 @@ class Harness:
87
101
  sandbox_spec: SandboxSpec | None = None
88
102
  skills_path: str | None = None
89
103
  upload_dir_mapping: dict[str, str] | None = None
104
+ get_upload_dirs: Callable[[], dict[str, Traversable | Path] | None] | None = None
90
105
  metrics_path: str | None = None
91
106
  metrics_prefix: str = ""
92
107
  metrics_key: str | None = None
93
108
  metrics_keys: list[str] | None = None
109
+ tool_names: list[str] | None = None
94
110
 
95
111
  def get_effective_upload_dir_mapping(self) -> dict[str, str] | None:
96
112
  """Return the merged upload mapping (skills_path + upload_dir_mapping)."""