verifiers 0.1.12.dev5__tar.gz → 0.1.13.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/PKG-INFO +3 -3
  2. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/README.md +1 -1
  3. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/pyproject.toml +1 -1
  4. verifiers-0.1.13.dev1/tests/test_composable_env.py +592 -0
  5. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_eval_cli.py +32 -0
  6. verifiers-0.1.13.dev1/tests/test_nemorl_client.py +219 -0
  7. verifiers-0.1.13.dev1/tests/test_rlm_composable_env.py +262 -0
  8. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/__init__.py +1 -1
  9. verifiers-0.1.13.dev1/verifiers/cli/commands/eval.py +21 -0
  10. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/clients/__init__.py +6 -0
  11. verifiers-0.1.13.dev1/verifiers/clients/nemorl_chat_completions_client.py +87 -0
  12. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/clients/openai_chat_completions_token_client.py +39 -4
  13. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/cli_agent_env.py +7 -2
  14. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/README.md +23 -1
  15. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/__init__.py +2 -0
  16. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/composable_env.py +143 -28
  17. verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/harness.py +100 -0
  18. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/harnesses/__init__.py +2 -0
  19. verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/harnesses/rlm.py +82 -0
  20. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/task.py +74 -0
  21. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/__init__.py +2 -0
  22. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/cp/test_utils.py +4 -1
  23. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/harbor/harbor.py +5 -1
  24. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/__init__.py +2 -0
  25. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py +4 -1
  26. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/openswe.py +6 -1
  27. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py +6 -1
  28. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py +14 -1
  29. verifiers-0.1.13.dev1/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py +384 -0
  30. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py +15 -0
  31. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/eval.py +73 -30
  32. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/server/env_router.py +3 -0
  33. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/server/env_server.py +1 -0
  34. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/server/env_worker.py +2 -0
  35. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/types.py +1 -0
  36. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/eval_utils.py +14 -1
  37. verifiers-0.1.12.dev5/tests/test_composable_env.py +0 -260
  38. verifiers-0.1.12.dev5/verifiers/cli/commands/eval.py +0 -7
  39. verifiers-0.1.12.dev5/verifiers/envs/experimental/composable/harness.py +0 -58
  40. verifiers-0.1.12.dev5/verifiers/envs/experimental/composable/harnesses/rlm.py +0 -50
  41. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/.gitignore +0 -0
  42. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/LICENSE +0 -0
  43. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/AGENTS.md +0 -0
  44. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/README.md +0 -0
  45. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/__init__.py +0 -0
  46. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/conftest.py +0 -0
  47. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_browser_env.py +0 -0
  48. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_build_script.py +0 -0
  49. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_cli_agent_env.py +0 -0
  50. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_client_auth_errors.py +0 -0
  51. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_client_config.py +0 -0
  52. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_client_multimodal_types.py +0 -0
  53. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_decorator_ranks.py +0 -0
  54. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_endpoint_registry.py +0 -0
  55. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_env_group.py +0 -0
  56. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_env_server.py +0 -0
  57. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_environment.py +0 -0
  58. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_environment_extra.py +0 -0
  59. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_envs.py +0 -0
  60. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_error_chain.py +0 -0
  61. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_eval_display.py +0 -0
  62. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_eval_utils.py +0 -0
  63. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_gepa_cli.py +0 -0
  64. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_gym_env.py +0 -0
  65. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_imports.py +0 -0
  66. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_install_utils.py +0 -0
  67. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_interception_utils.py +0 -0
  68. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_logging.py +0 -0
  69. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_math_rubric.py +0 -0
  70. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_maybe_think_parser.py +0 -0
  71. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_message_utils.py +0 -0
  72. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_message_utils_multimodal.py +0 -0
  73. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_multiturn_env.py +0 -0
  74. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_openai_chat_completions_token_client.py +0 -0
  75. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_opencode_harbor.py +0 -0
  76. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_opencode_rlm_env.py +0 -0
  77. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_parser.py +0 -0
  78. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_path_utils.py +0 -0
  79. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_prime_plugin.py +0 -0
  80. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_rlm_env.py +0 -0
  81. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_rubric.py +0 -0
  82. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_rubric_group.py +0 -0
  83. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_sandbox_env.py +0 -0
  84. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_sandbox_mixin.py +0 -0
  85. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_save_utils.py +0 -0
  86. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_setup_script.py +0 -0
  87. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_singleturn_env.py +0 -0
  88. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_stateful_tool_env.py +0 -0
  89. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_think_parser.py +0 -0
  90. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_tool_env.py +0 -0
  91. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_tool_utils.py +0 -0
  92. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_trajectory_processing.py +0 -0
  93. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_tui_info_formatting.py +0 -0
  94. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/tests/test_xml_parser.py +0 -0
  95. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/AGENTS.md +0 -0
  96. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/__init__.py +0 -0
  97. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/commands/__init__.py +0 -0
  98. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/commands/build.py +0 -0
  99. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/commands/gepa.py +0 -0
  100. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/commands/init.py +0 -0
  101. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/commands/install.py +0 -0
  102. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/commands/setup.py +0 -0
  103. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/plugins/__init__.py +0 -0
  104. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/plugins/prime.py +0 -0
  105. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/cli/tui.py +0 -0
  106. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/clients/anthropic_messages_client.py +0 -0
  107. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/clients/client.py +0 -0
  108. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/clients/openai_chat_completions_client.py +0 -0
  109. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/clients/openai_completions_client.py +0 -0
  110. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/decorators.py +0 -0
  111. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/AGENTS.md +0 -0
  112. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/__init__.py +0 -0
  113. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/env_group.py +0 -0
  114. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/environment.py +0 -0
  115. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/README.md +0 -0
  116. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/__init__.py +0 -0
  117. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/harnesses/opencode.py +0 -0
  118. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/harnesses/prompt.txt +0 -0
  119. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/cp/__init__.py +0 -0
  120. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/cp/cp_task.py +0 -0
  121. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/harbor/__init__.py +0 -0
  122. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/lean/__init__.py +0 -0
  123. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/lean/lean_task.py +0 -0
  124. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/math/__init__.py +0 -0
  125. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/math/math_task.py +0 -0
  126. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/create_fix_patch.sh +0 -0
  127. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/composable/tasksets/swe/log_parser.py +0 -0
  128. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/gym_env.py +0 -0
  129. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/harbor_env.py +0 -0
  130. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/mcp_env.py +0 -0
  131. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/opencode_env.py +0 -0
  132. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/opencode_qa_env.py +0 -0
  133. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/opencode_rlm_env.py +0 -0
  134. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/rlm_env.py +0 -0
  135. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/experimental/sandbox_mixin.py +0 -0
  136. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/README.md +0 -0
  137. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/__init__.py +0 -0
  138. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/README.md +0 -0
  139. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/__init__.py +0 -0
  140. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/browser_env.py +0 -0
  141. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/__init__.py +0 -0
  142. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/base.py +0 -0
  143. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/cua_mode.py +0 -0
  144. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/browser_env/modes/dom_mode.py +0 -0
  145. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/openenv_env.py +0 -0
  146. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  147. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/integrations/textarena_env.py +0 -0
  148. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/multiturn_env.py +0 -0
  149. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/python_env.py +0 -0
  150. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/sandbox_env.py +0 -0
  151. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/singleturn_env.py +0 -0
  152. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/stateful_tool_env.py +0 -0
  153. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/envs/tool_env.py +0 -0
  154. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/errors.py +0 -0
  155. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/gepa/__init__.py +0 -0
  156. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/gepa/adapter.py +0 -0
  157. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/gepa/config.py +0 -0
  158. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/gepa/display.py +0 -0
  159. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/gepa/gepa_utils.py +0 -0
  160. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/parsers/__init__.py +0 -0
  161. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/parsers/maybe_think_parser.py +0 -0
  162. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/parsers/parser.py +0 -0
  163. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/parsers/think_parser.py +0 -0
  164. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/parsers/xml_parser.py +0 -0
  165. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/README.md +0 -0
  166. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/__init__.py +0 -0
  167. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/inference/__init__.py +0 -0
  168. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/inference/client.py +0 -0
  169. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/inference/server.py +0 -0
  170. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/__init__.py +0 -0
  171. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/config.py +0 -0
  172. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/orchestrator.py +0 -0
  173. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/trainer.py +0 -0
  174. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rl/trainer/utils.py +0 -0
  175. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rubrics/__init__.py +0 -0
  176. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rubrics/experimental/hybrid_math_rubric.py +0 -0
  177. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rubrics/judge_rubric.py +0 -0
  178. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rubrics/math_rubric.py +0 -0
  179. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rubrics/rubric.py +0 -0
  180. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/rubrics/rubric_group.py +0 -0
  181. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/__init__.py +0 -0
  182. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/build.py +0 -0
  183. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/gepa.py +0 -0
  184. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/init.py +0 -0
  185. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/install.py +0 -0
  186. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/prime_rl.py +0 -0
  187. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/rl.py +0 -0
  188. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/setup.py +0 -0
  189. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/train.py +0 -0
  190. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/tui.py +0 -0
  191. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/scripts/vllm.py +0 -0
  192. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/__init__.py +0 -0
  193. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/client/env_client.py +0 -0
  194. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/client/zmq_env_client.py +0 -0
  195. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/server/__init__.py +0 -0
  196. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/server/zmq_env_server.py +0 -0
  197. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/serve/types.py +0 -0
  198. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/__init__.py +0 -0
  199. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/async_utils.py +0 -0
  200. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/client_utils.py +0 -0
  201. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/config_utils.py +0 -0
  202. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/data_utils.py +0 -0
  203. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/display_utils.py +0 -0
  204. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/env_utils.py +0 -0
  205. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/error_utils.py +0 -0
  206. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/eval_display.py +0 -0
  207. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/heartbeat.py +0 -0
  208. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/import_utils.py +0 -0
  209. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/install_utils.py +0 -0
  210. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/interception_utils.py +0 -0
  211. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/logging_utils.py +0 -0
  212. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/message_utils.py +0 -0
  213. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/metric_utils.py +0 -0
  214. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/path_utils.py +0 -0
  215. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/process_utils.py +0 -0
  216. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/response_utils.py +0 -0
  217. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/save_utils.py +0 -0
  218. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/serve_utils.py +0 -0
  219. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/thread_utils.py +0 -0
  220. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/threaded_sandbox_client.py +0 -0
  221. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/tool_utils.py +0 -0
  222. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/tunnel_utils.py +0 -0
  223. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/usage_utils.py +0 -0
  224. {verifiers-0.1.12.dev5 → verifiers-0.1.13.dev1}/verifiers/utils/version_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.12.dev5
3
+ Version: 0.1.13.dev1
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -34,7 +34,7 @@ Requires-Dist: nest-asyncio>=1.6.0
34
34
  Requires-Dist: numpy
35
35
  Requires-Dist: openai-agents>=0.0.7
36
36
  Requires-Dist: openai>=1.108.1
37
- Requires-Dist: prime-sandboxes>=0.2.19
37
+ Requires-Dist: prime-sandboxes>=0.2.20
38
38
  Requires-Dist: prime-tunnel>=0.1.6
39
39
  Requires-Dist: pydantic>=2.11.9
40
40
  Requires-Dist: pyzmq>=27.1.0
@@ -107,7 +107,7 @@ Verifiers: Environments for LLM Reinforcement Learning
107
107
 
108
108
  ## News & Updates
109
109
 
110
- - [03/22/26] v0.1.12.dev0 release prep is up, featuring opencode RLM environments, performance and autoscaling improvements, stronger cancellation/runtime handling, multimodal save fidelity, and updated development docs.
110
+ - [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
111
111
  - [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
112
112
  - [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
113
113
  - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
@@ -34,7 +34,7 @@ Verifiers: Environments for LLM Reinforcement Learning
34
34
 
35
35
  ## News & Updates
36
36
 
37
- - [03/22/26] v0.1.12.dev0 release prep is up, featuring opencode RLM environments, performance and autoscaling improvements, stronger cancellation/runtime handling, multimodal save fidelity, and updated development docs.
37
+ - [04/17/26] v0.1.12 is released, featuring a new composable Task/Agent/Environment architecture, upstreamed opencode and RLM harnesses/tasksets, major `RLMEnv` improvements (context dropping, prompt builder, hardened transport), multi-worker env server support, expanded `vf-tui` capabilities, and richer eval configuration.
38
38
  - [03/12/26] v0.1.11 is released, featuring a unified client stack, major `RLMEnv` and env server reliability improvements, a substantially refined eval TUI, new pass@k and ablation sweep support, and bundled opencode environments.
39
39
  - [02/10/26] v0.1.10 is released, featuring OpenEnv and BrowserEnv integrations, resumed evals, improved rollout and token tracking, safer sandbox lifecycle behavior, refreshed workspace setup, and opencode harbor improvements.
40
40
  - [01/08/26] v0.1.9 is released, featuring a number of new experimental environment class types, monitor rubrics for automatic metric collection, improved workspace setup flow, improved error handling, bug fixes, and a documentation overhaul.
@@ -38,7 +38,7 @@ dependencies = [
38
38
  "openai>=1.108.1",
39
39
  "openai-agents>=0.0.7",
40
40
  "prime-tunnel>=0.1.6",
41
- "prime-sandboxes>=0.2.19",
41
+ "prime-sandboxes>=0.2.20",
42
42
  "pydantic>=2.11.9",
43
43
  "requests",
44
44
  "rich",
@@ -0,0 +1,592 @@
1
+ """Tests for the composable architecture: Task, TaskSet, SandboxTaskSet, SandboxSpec."""
2
+
3
+ import importlib
4
+ import json
5
+ from types import SimpleNamespace
6
+ from unittest.mock import AsyncMock, call
7
+
8
+ import pytest
9
+
10
+ import verifiers as vf
11
+ from verifiers.envs.experimental.composable import (
12
+ ComposableEnv,
13
+ Harness,
14
+ SandboxSpec,
15
+ SandboxTaskSet,
16
+ Task,
17
+ TaskSet,
18
+ discover_sibling_dir,
19
+ )
20
+
21
+
22
+ # ── Mock Rubrics ──────────────────────────────────────────────────────
23
+
24
+
25
+ class MockSandboxRubric(vf.Rubric):
26
+ def __init__(self, **kwargs):
27
+ super().__init__(**kwargs)
28
+ self.add_reward_func(self.solved)
29
+
30
+ async def solved(self, state, **kwargs) -> float:
31
+ return 1.0 if state.get("test_output") == "PASS" else 0.0
32
+
33
+
34
+ class MockMathRubric(vf.Rubric):
35
+ def __init__(self, **kwargs):
36
+ super().__init__(**kwargs)
37
+ self.add_reward_func(self.correct)
38
+
39
+ async def correct(self, state, **kwargs) -> float:
40
+ return 1.0 if state.get("info", {}).get("id") == 0 else 0.0
41
+
42
+
43
+ # ── Mock TaskSets ───────────────────────────────────────────────────────
44
+
45
+
46
+ class MockSandboxTaskSet(SandboxTaskSet):
47
+ """SandboxTaskSet for testing."""
48
+
49
+ def get_instruction(self, info):
50
+ return f"Fix bug #{info.get('id', 0)}"
51
+
52
+ def get_sandbox_spec(self, info):
53
+ return SandboxSpec(image="python:3.11-slim", cpu_cores=2, memory_gb=2)
54
+
55
+ def get_rubric(self):
56
+ return MockSandboxRubric()
57
+
58
+ def get_workdir(self, info):
59
+ return "/testbed"
60
+
61
+ def get_env_vars(self):
62
+ return {"FOO": "bar"}
63
+
64
+
65
+ class MockTaskSet(TaskSet):
66
+ """Plain TaskSet (no sandbox) for testing."""
67
+
68
+ def get_instruction(self, info):
69
+ return info.get("question", "")
70
+
71
+ def get_rubric(self):
72
+ return MockMathRubric()
73
+
74
+
75
+ def _make_dataset(n=3):
76
+ from datasets import Dataset
77
+
78
+ return Dataset.from_dict(
79
+ {
80
+ "info": [{"id": i, "question": f"q{i}"} for i in range(n)],
81
+ "answer": ["" for _ in range(n)],
82
+ }
83
+ )
84
+
85
+
86
+ # ── SandboxSpec ─────────────────────────────────────────────────────────
87
+
88
+
89
+ def test_sandbox_spec_defaults():
90
+ spec = SandboxSpec()
91
+ assert spec.image == "python:3.11-slim"
92
+ assert spec.cpu_cores == 4
93
+
94
+
95
+ def test_sandbox_spec_custom():
96
+ spec = SandboxSpec(image="lean-tactic:v4.27", gpu_count=1)
97
+ assert spec.image == "lean-tactic:v4.27"
98
+ assert spec.gpu_count == 1
99
+
100
+
101
+ # ── Task from SandboxTaskSet ───────────────────────────────────────────
102
+
103
+
104
+ def test_task_sandbox_spec():
105
+ ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
106
+ task = ts[0]
107
+ assert isinstance(task, Task)
108
+ assert task.sandbox_spec is not None
109
+ assert task.sandbox_spec.image == "python:3.11-slim"
110
+ assert task.sandbox_spec.cpu_cores == 2
111
+
112
+
113
+ def test_task_image():
114
+ ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
115
+ task = ts[0]
116
+ assert task.image == "python:3.11-slim"
117
+
118
+
119
+ def test_task_workdir():
120
+ ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
121
+ task = ts[0]
122
+ assert task.workdir == "/testbed"
123
+
124
+
125
+ def test_task_repr_sandbox():
126
+ ts = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
127
+ task = ts[0]
128
+ assert "python:3.11-slim" in repr(task)
129
+
130
+
131
+ # ── Task from plain TaskSet ────────────────────────────────────────────
132
+
133
+
134
+ def test_task_no_sandbox():
135
+ ts = MockTaskSet(dataset=_make_dataset(), name="math")
136
+ task = ts[0]
137
+ assert task.sandbox_spec is None
138
+ assert task.image is None
139
+
140
+
141
+ def test_task_repr_no_sandbox():
142
+ ts = MockTaskSet(dataset=_make_dataset(), name="math")
143
+ task = ts[0]
144
+ assert "no sandbox" in repr(task)
145
+
146
+
147
+ # ── TaskSet ─────────────────────────────────────────────────────────────
148
+
149
+
150
+ def test_taskset_isinstance():
151
+ ts = MockTaskSet(dataset=_make_dataset(), name="math")
152
+ assert not isinstance(ts, SandboxTaskSet)
153
+
154
+ ts2 = MockSandboxTaskSet(dataset=_make_dataset(), name="swe")
155
+ assert isinstance(ts2, SandboxTaskSet)
156
+
157
+
158
+ def test_taskset_len():
159
+ ts = MockTaskSet(dataset=_make_dataset(5), name="test")
160
+ assert len(ts) == 5
161
+
162
+
163
+ def test_taskset_iter():
164
+ ts = MockTaskSet(dataset=_make_dataset(3), name="test")
165
+ tasks = list(ts)
166
+ assert len(tasks) == 3
167
+ assert all(isinstance(t, Task) for t in tasks)
168
+
169
+
170
+ def test_taskset_filter():
171
+ ts = MockSandboxTaskSet(dataset=_make_dataset(5), name="test")
172
+ filtered = ts.filter(lambda ex: ex["info"]["id"] < 3)
173
+ assert len(filtered) == 3
174
+ assert isinstance(filtered, MockSandboxTaskSet)
175
+
176
+
177
+ def test_taskset_take():
178
+ ts = MockSandboxTaskSet(dataset=_make_dataset(5), name="test")
179
+ taken = ts.take(2)
180
+ assert len(taken) == 2
181
+ assert isinstance(taken, MockSandboxTaskSet)
182
+
183
+
184
+ def test_taskset_repr():
185
+ ts = MockTaskSet(dataset=_make_dataset(), name="mytest")
186
+ assert "mytest" in repr(ts)
187
+ assert "3" in repr(ts)
188
+
189
+
190
+ @pytest.mark.asyncio
191
+ async def test_composable_env_exports_task_workdir():
192
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
193
+ env = ComposableEnv(
194
+ taskset=taskset,
195
+ harness=Harness(run_command="true"),
196
+ )
197
+
198
+ env_vars = await env.build_env_vars(
199
+ {
200
+ "info": {"id": 0},
201
+ "interception_base_url": "https://test.trycloudflare.com/v1",
202
+ }
203
+ )
204
+
205
+ assert env_vars["AGENT_WORKDIR"] == "/testbed"
206
+ assert env_vars["FOO"] == "bar"
207
+
208
+
209
+ @pytest.mark.asyncio
210
+ async def test_composable_env_quotes_paths_in_mkdir_command():
211
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
212
+ env = ComposableEnv(
213
+ taskset=taskset,
214
+ harness=Harness(
215
+ run_command="true",
216
+ instruction_path="/tmp/with space/prompt.txt",
217
+ system_prompt="system",
218
+ system_prompt_path="/tmp/other path/system.txt",
219
+ ),
220
+ )
221
+ env.sandbox_client = SimpleNamespace(
222
+ execute_command=AsyncMock(),
223
+ teardown=lambda: None,
224
+ )
225
+ env.taskset.setup = AsyncMock()
226
+ env.upload_content = AsyncMock()
227
+
228
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
229
+
230
+ env.sandbox_client.execute_command.assert_awaited_once_with(
231
+ "sbx",
232
+ "mkdir -p '/tmp/other path' '/tmp/with space'",
233
+ timeout=10,
234
+ )
235
+
236
+
237
+ @pytest.mark.asyncio
238
+ async def test_composable_env_quotes_log_path_when_collecting_logs():
239
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
240
+ env = ComposableEnv(
241
+ taskset=taskset,
242
+ harness=Harness(
243
+ run_command="true",
244
+ log_path="/tmp/log dir/agent.log",
245
+ ),
246
+ )
247
+ env.sandbox_client = SimpleNamespace(
248
+ execute_command=AsyncMock(
249
+ return_value=SimpleNamespace(stdout="agent log\n", stderr="", exit_code=0)
250
+ ),
251
+ teardown=lambda: None,
252
+ )
253
+
254
+ state = {"sandbox_id": "sbx", "timing": {"total_ms": 0}}
255
+
256
+ await env.post_rollout(state)
257
+
258
+ env.sandbox_client.execute_command.assert_awaited_once_with(
259
+ "sbx",
260
+ "cat '/tmp/log dir/agent.log' 2>/dev/null || echo '<no logs>'",
261
+ working_dir=None,
262
+ )
263
+ assert state["agent_logs"] == "agent log"
264
+
265
+
266
+ # ── install_env ──────────────────────────────────────────────────────────
267
+
268
+
269
+ @pytest.mark.asyncio
270
+ async def test_composable_env_install_env_passes_to_execute():
271
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
272
+ env = ComposableEnv(
273
+ taskset=taskset,
274
+ harness=Harness(
275
+ run_command="true",
276
+ install_script="install-agent",
277
+ instruction_path="/tmp/prompt.txt",
278
+ ),
279
+ install_env={"GH_TOKEN": "secret"},
280
+ )
281
+ env.sandbox_client = SimpleNamespace(
282
+ execute_command=AsyncMock(
283
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
284
+ ),
285
+ teardown=lambda: None,
286
+ )
287
+ env.taskset.setup = AsyncMock()
288
+ env.upload_content = AsyncMock()
289
+
290
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
291
+
292
+ install_call = env.sandbox_client.execute_command.await_args_list[-1]
293
+ assert install_call == call(
294
+ "sbx", "install-agent", timeout=300, env={"GH_TOKEN": "secret"}
295
+ )
296
+
297
+
298
+ @pytest.mark.asyncio
299
+ async def test_composable_env_install_env_none_by_default():
300
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
301
+ env = ComposableEnv(
302
+ taskset=taskset,
303
+ harness=Harness(
304
+ run_command="true",
305
+ install_script="install-agent",
306
+ instruction_path="/tmp/prompt.txt",
307
+ ),
308
+ )
309
+ env.sandbox_client = SimpleNamespace(
310
+ execute_command=AsyncMock(
311
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
312
+ ),
313
+ teardown=lambda: None,
314
+ )
315
+ env.taskset.setup = AsyncMock()
316
+ env.upload_content = AsyncMock()
317
+
318
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
319
+
320
+ install_call = env.sandbox_client.execute_command.await_args_list[-1]
321
+ assert install_call == call("sbx", "install-agent", timeout=300)
322
+
323
+
324
+ # ── get_upload_dirs ──────────────────────────────────────────────────────
325
+
326
+
327
+ def _make_temp_taskset_package(tmp_path, monkeypatch, *, with_skills: bool):
328
+ package_name = f"fixture_{tmp_path.name.replace('-', '_')}"
329
+ pkg_dir = tmp_path / package_name
330
+ pkg_dir.mkdir()
331
+ (pkg_dir / "__init__.py").write_text("")
332
+ (pkg_dir / "taskset_mod.py").write_text("MARKER = 1\n")
333
+
334
+ if with_skills:
335
+ skill_dir = pkg_dir / "skills" / "demo"
336
+ skill_dir.mkdir(parents=True)
337
+ (skill_dir / "SKILL.md").write_text("---\nname: demo\n---\n")
338
+ (skill_dir / "pyproject.toml").write_text(
339
+ "[project]\nname = 'skill-demo'\nversion = '0.0.0'\n"
340
+ )
341
+
342
+ monkeypatch.syspath_prepend(str(tmp_path))
343
+ importlib.invalidate_caches()
344
+ mod = importlib.import_module(f"{package_name}.taskset_mod")
345
+ return mod, package_name
346
+
347
+
348
+ class MockSandboxTaskSetWithSkills(SandboxTaskSet):
349
+ """SandboxTaskSet — skills auto-discovered via get_skills_dir()."""
350
+
351
+ def get_instruction(self, info):
352
+ return f"Fix bug #{info.get('id', 0)}"
353
+
354
+ def get_sandbox_spec(self, info):
355
+ return SandboxSpec(image="python:3.11-slim", cpu_cores=2, memory_gb=2)
356
+
357
+ def get_rubric(self):
358
+ return MockSandboxRubric()
359
+
360
+ def get_workdir(self, info):
361
+ return "/testbed"
362
+
363
+
364
+ @pytest.mark.asyncio
365
+ async def test_composable_env_uploads_task_dirs(tmp_path, monkeypatch):
366
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=True)
367
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
368
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
369
+ env = ComposableEnv(
370
+ taskset=taskset,
371
+ harness=Harness(
372
+ run_command="true",
373
+ install_script="install-agent",
374
+ skills_path="/task/skills",
375
+ ),
376
+ )
377
+ env.sandbox_client = SimpleNamespace(
378
+ execute_command=AsyncMock(
379
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
380
+ ),
381
+ teardown=lambda: None,
382
+ )
383
+ env.taskset.setup = AsyncMock()
384
+ env.upload_content = AsyncMock()
385
+ env.upload_file = AsyncMock()
386
+
387
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
388
+
389
+ env.upload_file.assert_awaited_once()
390
+ upload_call = env.upload_file.await_args
391
+ assert upload_call.args[0] == "sbx"
392
+ assert upload_call.args[1] == "/tmp/_upload_task_skills.tar.gz"
393
+
394
+ install_call = env.sandbox_client.execute_command.await_args_list[-1]
395
+ assert install_call == call("sbx", "install-agent", timeout=300)
396
+ extract_call = env.sandbox_client.execute_command.await_args_list[1]
397
+ assert extract_call == call(
398
+ "sbx",
399
+ "mkdir -p /task && tar -xzf /tmp/_upload_task_skills.tar.gz -C / && rm -f /tmp/_upload_task_skills.tar.gz",
400
+ timeout=60,
401
+ )
402
+
403
+
404
+ @pytest.mark.asyncio
405
+ async def test_composable_env_no_upload_when_no_dirs(tmp_path, monkeypatch):
406
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=False)
407
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
408
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
409
+ env = ComposableEnv(
410
+ taskset=taskset,
411
+ harness=Harness(
412
+ run_command="true",
413
+ install_script="install-agent",
414
+ skills_path="/task/skills",
415
+ ),
416
+ )
417
+ env.sandbox_client = SimpleNamespace(
418
+ execute_command=AsyncMock(
419
+ return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
420
+ ),
421
+ teardown=lambda: None,
422
+ )
423
+ env.taskset.setup = AsyncMock()
424
+ env.upload_content = AsyncMock()
425
+ env.upload_file = AsyncMock()
426
+
427
+ await env.post_sandbox_setup({"sandbox_id": "sbx", "info": {"id": 0}})
428
+
429
+ assert env.upload_file.await_count == 0
430
+
431
+
432
+ # ── discover_sibling_dir ─────────────────────────────────────────────────
433
+
434
+
435
+ def test_discover_sibling_dir_finds_skills(tmp_path, monkeypatch):
436
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=True)
437
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
438
+ result = discover_sibling_dir(MockSandboxTaskSetWithSkills, "skills")
439
+ assert result is not None
440
+
441
+
442
+ def test_discover_sibling_dir_returns_none_without_skills(tmp_path, monkeypatch):
443
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=False)
444
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
445
+ result = discover_sibling_dir(MockSandboxTaskSetWithSkills, "skills")
446
+ assert result is None
447
+
448
+
449
+ # ── get_skills_dir / auto-discovery ──────────────────────────────────────
450
+
451
+
452
+ def test_get_skills_dir_auto_discovers(tmp_path, monkeypatch):
453
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=True)
454
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
455
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
456
+ assert taskset.get_skills_dir() is not None
457
+
458
+
459
+ def test_get_skills_dir_returns_none_without_skills(tmp_path, monkeypatch):
460
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=False)
461
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
462
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
463
+ assert taskset.get_skills_dir() is None
464
+
465
+
466
+ def test_get_upload_dirs_includes_skills_automatically(tmp_path, monkeypatch):
467
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=True)
468
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
469
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
470
+ upload_dirs = taskset.get_upload_dirs()
471
+ assert "skills" in upload_dirs
472
+
473
+
474
+ def test_get_upload_dirs_empty_without_skills(tmp_path, monkeypatch):
475
+ mod, _ = _make_temp_taskset_package(tmp_path, monkeypatch, with_skills=False)
476
+ monkeypatch.setattr(MockSandboxTaskSetWithSkills, "__module__", mod.__name__)
477
+ taskset = MockSandboxTaskSetWithSkills(dataset=_make_dataset(), name="test")
478
+ assert taskset.get_upload_dirs() == {}
479
+
480
+
481
+ # ── Harness metrics collection ───────────────────────────────────────────
482
+
483
+
484
+ @pytest.mark.asyncio
485
+ async def test_composable_env_collects_harness_metrics():
486
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
487
+ metrics_data = {
488
+ "turns": 3,
489
+ "stop_reason": "done",
490
+ "prompt_tokens": 100,
491
+ "completion_tokens": 25,
492
+ }
493
+ env = ComposableEnv(
494
+ taskset=taskset,
495
+ harness=Harness(
496
+ run_command="true",
497
+ log_path="/tmp/log dir/agent.log",
498
+ metrics_path="{workdir}/.rlm/sessions/*/meta.json",
499
+ metrics_key="metrics",
500
+ metrics_prefix="rlm_",
501
+ ),
502
+ )
503
+ env.sandbox_client = SimpleNamespace(
504
+ execute_command=AsyncMock(
505
+ side_effect=[
506
+ SimpleNamespace(stdout="agent log\n", stderr="", exit_code=0),
507
+ SimpleNamespace(
508
+ stdout=json.dumps({"metrics": metrics_data}),
509
+ stderr="",
510
+ exit_code=0,
511
+ ),
512
+ ]
513
+ ),
514
+ teardown=lambda: None,
515
+ )
516
+
517
+ state = {
518
+ "sandbox_id": "sbx",
519
+ "info": {"id": 0},
520
+ "timing": {"total_ms": 0},
521
+ "trajectory": [],
522
+ }
523
+
524
+ await env.post_rollout(state)
525
+
526
+ assert state["agent_logs"] == "agent log"
527
+ assert state["rlm_turns"] == 3
528
+ assert state["rlm_stop_reason"] == "done"
529
+ assert state["rlm_prompt_tokens"] == 100
530
+ assert state["rlm_completion_tokens"] == 25
531
+
532
+
533
+ @pytest.mark.asyncio
534
+ async def test_composable_env_metrics_with_key_whitelist():
535
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
536
+ env = ComposableEnv(
537
+ taskset=taskset,
538
+ harness=Harness(
539
+ run_command="true",
540
+ metrics_path="{workdir}/metrics.json",
541
+ metrics_prefix="agent_",
542
+ metrics_keys=["turns", "tokens"],
543
+ ),
544
+ )
545
+ env.sandbox_client = SimpleNamespace(
546
+ execute_command=AsyncMock(
547
+ return_value=SimpleNamespace(
548
+ stdout=json.dumps({"turns": 5, "tokens": 200, "secret": "hidden"}),
549
+ stderr="",
550
+ exit_code=0,
551
+ )
552
+ ),
553
+ teardown=lambda: None,
554
+ )
555
+
556
+ state = {
557
+ "sandbox_id": "sbx",
558
+ "info": {"id": 0},
559
+ "timing": {"total_ms": 0},
560
+ "trajectory": [],
561
+ }
562
+
563
+ await env.post_rollout(state)
564
+
565
+ assert state["agent_turns"] == 5
566
+ assert state["agent_tokens"] == 200
567
+ assert "agent_secret" not in state
568
+
569
+
570
+ @pytest.mark.asyncio
571
+ async def test_composable_env_no_metrics_when_path_not_set():
572
+ taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
573
+ env = ComposableEnv(
574
+ taskset=taskset,
575
+ harness=Harness(run_command="true"),
576
+ )
577
+ env.sandbox_client = SimpleNamespace(
578
+ execute_command=AsyncMock(),
579
+ teardown=lambda: None,
580
+ )
581
+
582
+ state = {
583
+ "sandbox_id": "sbx",
584
+ "info": {"id": 0},
585
+ "timing": {"total_ms": 0},
586
+ "trajectory": [],
587
+ }
588
+
589
+ await env.post_rollout(state)
590
+
591
+ # No execute_command calls since no log_path and no metrics_path
592
+ env.sandbox_client.execute_command.assert_not_awaited()
@@ -1061,6 +1061,38 @@ def test_ablation_global_defaults_apply():
1061
1061
  assert all(c["num_examples"] == 100 for c in configs)
1062
1062
 
1063
1063
 
1064
+ def test_ablation_endpoint_id_override_removes_global_model():
1065
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
1066
+ f.write(
1067
+ 'model = "gpt-4.1-mini"\n\n'
1068
+ '[[ablation]]\nenv_id = "my-env"\nendpoint_id = "proxy"\n\n'
1069
+ "[ablation.sweep]\n"
1070
+ "temperature = [0.0]\n"
1071
+ )
1072
+ f.flush()
1073
+ configs = load_toml_config(Path(f.name))
1074
+
1075
+ assert len(configs) == 1
1076
+ assert configs[0]["endpoint_id"] == "proxy"
1077
+ assert "model" not in configs[0]
1078
+
1079
+
1080
+ def test_ablation_swept_model_override_removes_global_endpoint_id():
1081
+ with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
1082
+ f.write(
1083
+ 'endpoint_id = "proxy"\n\n'
1084
+ '[[ablation]]\nenv_id = "my-env"\n\n'
1085
+ "[ablation.sweep]\n"
1086
+ 'model = ["gpt-4.1-mini"]\n'
1087
+ )
1088
+ f.flush()
1089
+ configs = load_toml_config(Path(f.name))
1090
+
1091
+ assert len(configs) == 1
1092
+ assert configs[0]["model"] == "gpt-4.1-mini"
1093
+ assert "endpoint_id" not in configs[0]
1094
+
1095
+
1064
1096
  def test_ablation_with_eval_blocks():
1065
1097
  """Ablation and eval blocks can coexist."""
1066
1098
  with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f: