eval-protocol 0.2.8__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (340)
  1. {eval_protocol-0.2.8/eval_protocol.egg-info → eval_protocol-0.2.9}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/_version.py +3 -3
  3. eval_protocol-0.2.9/eval_protocol/benchmarks/__init__.py +9 -0
  4. eval_protocol-0.2.9/eval_protocol/benchmarks/registry.py +174 -0
  5. eval_protocol-0.2.9/eval_protocol/benchmarks/run.py +100 -0
  6. eval_protocol-0.2.9/eval_protocol/benchmarks/suites/__init__.py +3 -0
  7. eval_protocol-0.2.9/eval_protocol/benchmarks/suites/aime25.py +118 -0
  8. eval_protocol-0.2.9/eval_protocol/benchmarks/suites/gpqa.py +100 -0
  9. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/base_policy.py +17 -11
  10. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/manager.py +27 -33
  11. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/policy.py +2 -1
  12. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/models.py +3 -3
  13. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/playback_policy.py +2 -2
  14. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_single_turn_rollout_process.py +19 -5
  15. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/evaluation_test.py +328 -0
  16. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/plugin.py +2 -3
  17. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/types/types.py +27 -3
  18. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/static_policy.py +4 -4
  19. {eval_protocol-0.2.8 → eval_protocol-0.2.9/eval_protocol.egg-info}/PKG-INFO +1 -1
  20. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/SOURCES.txt +9 -3
  21. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_rollout_control_plane_integration.py +7 -3
  22. eval_protocol-0.2.9/vite-app/dist/assets/index-CmEkuH8E.js +93 -0
  23. eval_protocol-0.2.9/vite-app/dist/assets/index-CmEkuH8E.js.map +1 -0
  24. eval_protocol-0.2.9/vite-app/dist/assets/index-DZwKPeo5.css +1 -0
  25. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vite-app/dist/index.html +2 -2
  26. eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +0 -1
  27. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +0 -88
  28. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +0 -1
  29. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/LICENSE +0 -0
  30. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/README.md +0 -0
  31. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/__init__.py +0 -0
  32. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/normalize_sandbox_fusion.py +0 -0
  33. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/utils/__init__.py +0 -0
  34. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/utils/generate_api_key.py +0 -0
  35. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/utils/subprocess_manager.py +0 -0
  36. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/__init__.py +0 -0
  37. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/__main__.py +0 -0
  38. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/__init__.py +0 -0
  39. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/braintrust.py +0 -0
  40. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/huggingface.py +0 -0
  41. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/langfuse.py +0 -0
  42. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/trl.py +0 -0
  43. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/__init__.py +0 -0
  44. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/models.py +0 -0
  45. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/orchestrator.py +0 -0
  46. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resource_abc.py +0 -0
  47. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resource_pool.py +0 -0
  48. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/__init__.py +0 -0
  49. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  50. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  51. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  52. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  53. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  54. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/docker_resource.py +0 -0
  55. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  56. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  57. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  58. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  59. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/sql_resource.py +0 -0
  60. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/task_manager.py +0 -0
  61. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/tool_registry.py +0 -0
  62. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/auth.py +0 -0
  63. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli.py +0 -0
  64. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/__init__.py +0 -0
  65. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  66. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/common.py +0 -0
  67. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/deploy.py +0 -0
  68. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  69. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/logs.py +0 -0
  70. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/preview.py +0 -0
  71. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  72. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/common_utils.py +0 -0
  73. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/config.py +0 -0
  74. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/__init__.py +0 -0
  75. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  76. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  77. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  78. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  79. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/datasets/__init__.py +0 -0
  80. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/datasets/loader.py +0 -0
  81. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/directory_utils.py +0 -0
  82. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/evaluation.py +0 -0
  83. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/__init__.py +0 -0
  84. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/event_bus.py +0 -0
  85. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/logger.py +0 -0
  86. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  87. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  88. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/execution/__init__.py +0 -0
  89. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/execution/pipeline.py +0 -0
  90. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/gcp_tools.py +0 -0
  91. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generation/cache.py +0 -0
  92. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generation/clients/base.py +0 -0
  93. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generation/clients.py +0 -0
  94. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generic_server.py +0 -0
  95. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/get_pep440_version.py +0 -0
  96. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/human_id/__init__.py +0 -0
  97. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/human_id/dictionary.py +0 -0
  98. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/__init__.py +0 -0
  99. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/braintrust.py +0 -0
  100. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/deepeval.py +0 -0
  101. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/openeval.py +0 -0
  102. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/trl.py +0 -0
  103. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/logging_utils.py +0 -0
  104. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/__init__.py +0 -0
  105. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/adapter.py +0 -0
  106. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/client/__init__.py +0 -0
  107. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/client/connection.py +0 -0
  108. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/clients.py +0 -0
  109. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/__init__.py +0 -0
  110. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/grid_renderer.py +0 -0
  111. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  112. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/mcpgym.py +0 -0
  113. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/process_manager.py +0 -0
  114. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/session/__init__.py +0 -0
  115. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/session/manager.py +0 -0
  116. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/simple_process_manager.py +0 -0
  117. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/simulation_server.py +0 -0
  118. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/__init__.py +0 -0
  119. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/config.py +0 -0
  120. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  121. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/main.py +0 -0
  122. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  123. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  124. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  125. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  126. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  127. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/session.py +0 -0
  128. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_env.py +0 -0
  129. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/packaging.py +0 -0
  130. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/platform_api.py +0 -0
  131. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/__init__.py +0 -0
  132. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  133. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  134. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  135. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  136. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/types.py +0 -0
  137. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/utils.py +0 -0
  138. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/resources.py +0 -0
  139. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/reward_function.py +0 -0
  140. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/__init__.py +0 -0
  141. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/accuracy.py +0 -0
  142. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/accuracy_length.py +0 -0
  143. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  144. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  145. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/apps_testing_util.py +0 -0
  146. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/bfcl_reward.py +0 -0
  147. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/code_execution.py +0 -0
  148. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/code_execution_utils.py +0 -0
  149. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/cpp_code.py +0 -0
  150. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  151. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/format.py +0 -0
  152. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/function_calling.py +0 -0
  153. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/json_schema.py +0 -0
  154. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/language_consistency.py +0 -0
  155. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/lean_prover.py +0 -0
  156. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/length.py +0 -0
  157. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  158. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/math.py +0 -0
  159. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  160. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/reasoning_steps.py +0 -0
  161. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/repetition.py +0 -0
  162. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/tag_count.py +0 -0
  163. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rl_processing.py +0 -0
  164. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/server.py +0 -0
  165. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/stats/__init__.py +0 -0
  166. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/stats/confidence_intervals.py +0 -0
  167. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/typed_interface.py +0 -0
  168. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/types/__init__.py +0 -0
  169. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/__init__.py +0 -0
  170. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/batch_evaluation.py +0 -0
  171. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/batch_transformation.py +0 -0
  172. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/dataset_helpers.py +0 -0
  173. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/logs_server.py +0 -0
  174. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/module_loader.py +0 -0
  175. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/packaging_utils.py +0 -0
  176. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/vite_server.py +0 -0
  177. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/dependency_links.txt +0 -0
  178. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/entry_points.txt +0 -0
  179. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/requires.txt +0 -0
  180. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/top_level.txt +0 -0
  181. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/pyproject.toml +0 -0
  182. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/setup.cfg +0 -0
  183. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/setup.py +0 -0
  184. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_accuracy.py +0 -0
  185. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_accuracy_length.py +0 -0
  186. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_adapters_e2e.py +0 -0
  187. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_agent_orchestrator.py +0 -0
  188. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_agent_resources.py +0 -0
  189. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_auth.py +0 -0
  190. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_batch_evaluation.py +0 -0
  191. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_braintrust_adapter.py +0 -0
  192. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_braintrust_example.py +0 -0
  193. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cli.py +0 -0
  194. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cli_agent.py +0 -0
  195. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cli_args.py +0 -0
  196. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_code_execution.py +0 -0
  197. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_config.py +0 -0
  198. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_control_plane_separation.py +0 -0
  199. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cpp_code.py +0 -0
  200. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_data_driven_task_manager.py +0 -0
  201. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_deepcoder_reward.py +0 -0
  202. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_deepeval_integration.py +0 -0
  203. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_deploy_integration.py +0 -0
  204. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_e2b_integration.py +0 -0
  205. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_e2b_js_integration.py +0 -0
  206. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_edge_cases.py +0 -0
  207. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_eval_protocol_import.py +0 -0
  208. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_evaluation.py +0 -0
  209. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_evaluation_integration.py +0 -0
  210. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_evaluation_preview_integration.py +0 -0
  211. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_event_bus.py +0 -0
  212. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_examples_end_to_end.py +0 -0
  213. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_fireworks_api.py +0 -0
  214. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_format.py +0 -0
  215. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_fractional_code.py +0 -0
  216. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_frozen_lake_http_server.py +0 -0
  217. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  218. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_function_calling.py +0 -0
  219. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_gcp_tools.py +0 -0
  220. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_generic_server.py +0 -0
  221. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_integration.py +0 -0
  222. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_json_schema.py +0 -0
  223. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_kwargs_validation.py +0 -0
  224. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_language_consistency.py +0 -0
  225. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_lean_prover.py +0 -0
  226. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_lean_prover_runner.py +0 -0
  227. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_length.py +0 -0
  228. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_list_comparison_math_reward.py +0 -0
  229. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_logs_server.py +0 -0
  230. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_logs_server_simple.py +0 -0
  231. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_math.py +0 -0
  232. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_minimal.py +0 -0
  233. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_models.py +0 -0
  234. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_models_rl.py +0 -0
  235. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_multiple_choice_math_reward.py +0 -0
  236. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_n_variant_batch_integration.py +0 -0
  237. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_n_variant_integration.py +0 -0
  238. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_openai_compatibility.py +0 -0
  239. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_openeval_integration.py +0 -0
  240. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_packaging.py +0 -0
  241. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_parallel_rollouts.py +0 -0
  242. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_platform_api.py +0 -0
  243. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_readiness.py +0 -0
  244. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_reasoning_steps.py +0 -0
  245. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_repetition.py +0 -0
  246. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_repetition_debug.py +0 -0
  247. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_reward_function.py +0 -0
  248. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_reward_protocol_import.py +0 -0
  249. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_rl_processing.py +0 -0
  250. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_server.py +0 -0
  251. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_tag_count.py +0 -0
  252. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_typed_interface.py +0 -0
  253. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_typed_interface_rl.py +0 -0
  254. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_url_handling.py +0 -0
  255. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_vite_server.py +0 -0
  256. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/__init__.py +0 -0
  257. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/agent/__init__.py +0 -0
  258. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/agent/base.py +0 -0
  259. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/agent/llm_agent.py +0 -0
  260. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/__init__.py +0 -0
  261. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/api_config.py +0 -0
  262. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/data_model.py +0 -0
  263. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/simulation_service.py +0 -0
  264. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/cli.py +0 -0
  265. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/config.py +0 -0
  266. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/__init__.py +0 -0
  267. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/message.py +0 -0
  268. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/simulation.py +0 -0
  269. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/tasks.py +0 -0
  270. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/__init__.py +0 -0
  271. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/__init__.py +0 -0
  272. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/data_model.py +0 -0
  273. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/environment.py +0 -0
  274. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/tools.py +0 -0
  275. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/utils.py +0 -0
  276. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/__init__.py +0 -0
  277. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/data_model.py +0 -0
  278. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/environment.py +0 -0
  279. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/tools.py +0 -0
  280. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/utils.py +0 -0
  281. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/__init__.py +0 -0
  282. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/data_model.py +0 -0
  283. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/environment.py +0 -0
  284. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/tools.py +0 -0
  285. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/utils.py +0 -0
  286. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/__init__.py +0 -0
  287. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/data_model.py +0 -0
  288. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/environment.py +0 -0
  289. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  290. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  291. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  292. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  293. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  294. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  295. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  296. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  297. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tools.py +0 -0
  298. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  299. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  300. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/utils.py +0 -0
  301. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/__init__.py +0 -0
  302. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/db.py +0 -0
  303. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/environment.py +0 -0
  304. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/server.py +0 -0
  305. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/tool.py +0 -0
  306. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/toolkit.py +0 -0
  307. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  308. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/__init__.py +0 -0
  309. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator.py +0 -0
  310. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  311. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  312. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  313. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  314. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  315. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/metrics/__init__.py +0 -0
  316. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/metrics/agent_metrics.py +0 -0
  317. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  318. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/__init__.py +0 -0
  319. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  320. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  321. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/utils.py +0 -0
  322. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/registry.py +0 -0
  323. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/run.py +0 -0
  324. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/__init__.py +0 -0
  325. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/check_data.py +0 -0
  326. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  327. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/start_servers.py +0 -0
  328. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/view_simulations.py +0 -0
  329. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/user/__init__.py +0 -0
  330. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/user/base.py +0 -0
  331. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/user/user_simulator.py +0 -0
  332. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/__init__.py +0 -0
  333. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/display.py +0 -0
  334. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/io_utils.py +0 -0
  335. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/llm_utils.py +0 -0
  336. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/pydantic_utils.py +0 -0
  337. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/utils.py +0 -0
  338. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/versioneer.py +0 -0
  339. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  340. {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-11T22:02:14-0700",
11
+ "date": "2025-08-12T13:33:17-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
15
- "version": "0.2.8"
14
+ "full-revisionid": "6b018d4d211d239896a5bda83b375b9bbb4fca34",
15
+ "version": "0.2.9"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,9 @@
1
+ from .registry import export_benchmark, get_benchmark_runner, list_benchmarks
2
+
3
+ __all__ = [
4
+ "export_benchmark",
5
+ "get_benchmark_runner",
6
+ "list_benchmarks",
7
+ ]
8
+
9
+
@@ -0,0 +1,174 @@
1
+ """
2
+ Benchmark registry and export decorator.
3
+
4
+ This module provides a lightweight registry for benchmarks and a decorator
5
+ `@export_benchmark(name)` that can be stacked with `@evaluation_test`.
6
+
7
+ It registers a runnable handle that executes the exact same evaluation pipeline
8
+ as the pytest flow by calling `run_evaluation_test_direct` with the parameters
9
+ captured from the decorated function.
10
+
11
+ Usage in a suite module (stack under @evaluation_test):
12
+
13
+ from eval_protocol.benchmarks.registry import export_benchmark
14
+
15
+ @export_benchmark("aime25_low")
16
+ @evaluation_test(...)
17
+ def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow:
18
+ ...
19
+
20
+ Programmatic run:
21
+
22
+ from eval_protocol.benchmarks.registry import get_benchmark_runner
23
+ get_benchmark_runner("aime25_low")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import json
29
+ import os
30
+ from typing import Any, Callable, Dict, List, Optional
31
+
32
+
33
+ # Global registry: name -> callable runner
34
+ _BENCHMARK_REGISTRY: Dict[str, Callable[..., Any]] = {}
35
+
36
+
37
def list_benchmarks() -> List[str]:
    """Return the names of all registered benchmarks, sorted ascending."""
    names = list(_BENCHMARK_REGISTRY)
    names.sort()
    return names
39
+
40
+
41
def get_benchmark_runner(name: str) -> Callable[..., Any]:
    """Return the runner registered under ``name``.

    Raises:
        KeyError: if ``name`` is not registered; the message lists the
            names that are currently available.
    """
    try:
        runner = _BENCHMARK_REGISTRY[name]
    except KeyError as exc:
        message = f"Benchmark '{name}' not found. Available: {list_benchmarks()}"
        raise KeyError(message) from exc
    return runner
46
+
47
+
48
def export_benchmark(name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """
    Decorator to export a benchmark test into the global registry.

    This expects to be stacked with `@evaluation_test`, so the decorated function
    should carry `__ep_config` and `__ep_original_test_func` attributes that the
    decorator can read to construct a direct runner. When those attributes are
    missing, an empty config and the decorated callable itself are used.

    The registered runner supports a subset of convenient overrides and maps them
    to the same EP_* environment variables used by the pytest plugin to ensure
    identical summaries and JSON artifact behavior.

    Args:
        name: Registry key for the benchmark. Registering the same name twice
            replaces the earlier runner.

    Returns:
        A decorator that registers a runner and returns the decorated callable
        unchanged.
    """

    def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]:
        # Recursively merge `over` into `base` in place; nested dicts are merged
        # only when both sides hold a dict, otherwise the override value wins.
        for key, value in over.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                _deep_update(base[key], value)
            else:
                base[key] = value
        return base

    def _decorator(test_wrapper: Callable[..., Any]) -> Callable[..., Any]:
        # Pull through metadata attached by evaluation_test
        ep_config: Dict[str, Any] = getattr(test_wrapper, "__ep_config", {})
        original_test_func: Optional[Callable[..., Any]] = getattr(
            test_wrapper, "__ep_original_test_func", None
        )

        def _runner(
            *,
            model: Optional[str] = None,
            print_summary: bool = False,
            out: Optional[str] = None,
            reasoning_effort: Optional[str] = None,
            max_rows: Optional[int | str] = None,
            num_runs: Optional[int] = None,
            input_params_override: Optional[Dict[str, Any]] = None,
            max_concurrency: Optional[int] = None,
        ) -> Any:
            """Run the exported benchmark through `run_evaluation_test_direct`.

            Args:
                model: Model to evaluate; defaults to the first captured model.
                print_summary: When true, sets EP_PRINT_SUMMARY=1.
                out: When set, stored in EP_SUMMARY_JSON.
                reasoning_effort: Stored as extra_body.reasoning_effort inside
                    EP_INPUT_PARAMS_JSON.
                max_rows: Integer limit or "all"; stored in EP_MAX_DATASET_ROWS
                    ("all" is written as the string "None").
                num_runs: Overrides the captured num_runs when not None.
                input_params_override: Deep-merged into EP_INPUT_PARAMS_JSON.
                max_concurrency: Overrides the captured max_concurrent_rollouts.

            Raises:
                ValueError: if no model is given and none was captured.
            """
            # Resolve the model first so a missing model fails before any
            # process-wide environment variable has been touched.
            models: List[str] = ep_config.get("model") or []
            model_to_use = model or (models[0] if models else None)
            if not model_to_use:
                raise ValueError(
                    f"No model provided and none captured from evaluation_test for benchmark '{name}'"
                )

            # Map convenience flags to EP_* env used by the pytest flow
            if print_summary:
                os.environ["EP_PRINT_SUMMARY"] = "1"
            if out:
                os.environ["EP_SUMMARY_JSON"] = out

            # Merge reasoning effort and arbitrary overrides into EP_INPUT_PARAMS_JSON
            merged: Dict[str, Any] = {}
            if reasoning_effort:
                # Fireworks OpenAI-compatible endpoint expects extra_body.reasoning_effort,
                # not a nested reasoning dict
                merged.setdefault("extra_body", {})["reasoning_effort"] = str(reasoning_effort)
            if input_params_override:
                merged = _deep_update(merged, dict(input_params_override))
            if merged:
                os.environ["EP_INPUT_PARAMS_JSON"] = json.dumps(merged)

            if max_rows is not None:
                if isinstance(max_rows, str) and max_rows.strip().lower() == "all":
                    os.environ["EP_MAX_DATASET_ROWS"] = "None"
                else:
                    os.environ["EP_MAX_DATASET_ROWS"] = str(max_rows)

            # Explicit concurrency override wins over the captured value
            max_concurrent_rollouts = ep_config.get("max_concurrent_rollouts")
            if max_concurrency is not None:
                max_concurrent_rollouts = int(max_concurrency)

            # Choose the first rollout param set by default
            rollout_input_params_list = ep_config.get("rollout_input_params")
            rollout_params = None
            if isinstance(rollout_input_params_list, list) and rollout_input_params_list:
                rollout_params = rollout_input_params_list[0]

            # Import runner lazily to avoid hard import dependencies and circulars
            import importlib

            _mod = importlib.import_module("eval_protocol.pytest.evaluation_test")
            run_evaluation_test_direct = getattr(_mod, "run_evaluation_test_direct")

            default_num_runs = ep_config.get("num_runs")
            return run_evaluation_test_direct(
                test_func=original_test_func or test_wrapper,
                model=model_to_use,
                input_messages=ep_config.get("input_messages"),
                input_dataset=ep_config.get("input_dataset"),
                dataset_adapter=ep_config.get("dataset_adapter"),
                rollout_input_params=rollout_params,
                rollout_processor=ep_config.get("rollout_processor"),
                aggregation_method=ep_config.get("aggregation_method"),
                threshold_of_success=ep_config.get("threshold_of_success"),
                num_runs=(num_runs if num_runs is not None else default_num_runs),
                max_dataset_rows=ep_config.get("max_dataset_rows"),
                mcp_config_path=ep_config.get("mcp_config_path"),
                max_concurrent_rollouts=max_concurrent_rollouts,
                server_script_path=ep_config.get("server_script_path"),
                steps=ep_config.get("steps"),
                mode=ep_config.get("mode"),
            )

        # Register runner; a later definition with the same name overwrites the earlier one
        _BENCHMARK_REGISTRY[name] = _runner

        return test_wrapper

    return _decorator
173
+
174
+
@@ -0,0 +1,100 @@
1
+ """
2
+ Minimal CLI runner for exported benchmarks.
3
+
4
+ Usage:
5
+
6
+ python -m eval_protocol.benchmarks.run aime25 \
7
+ --model fireworks_ai/accounts/fireworks/models/gpt-oss-120b \
8
+ --print-summary \
9
+ --out artifacts/aime25_low.json \
10
+ --max-rows 50 \
11
+ --reasoning-effort low
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ from typing import Any
18
+
19
+ from importlib import import_module
20
+ import pkgutil
21
+ import eval_protocol.benchmarks.suites as suites_pkg
22
+ from eval_protocol.benchmarks.registry import get_benchmark_runner, list_benchmarks
23
+
24
+
25
def _parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the benchmark runner CLI.

    Returns:
        The parsed namespace with ``name``, ``model``, ``print_summary``,
        ``out``, ``reasoning_effort``, ``max_rows``, ``num_runs``,
        ``max_tokens`` and ``max_concurrency``.
    """
    parser = argparse.ArgumentParser(description="Run an exported eval-protocol benchmark")
    parser.add_argument("name", help=f"Benchmark name. Known: {', '.join(list_benchmarks()) or '(none)'}")
    parser.add_argument("--model", required=True, help="Model identifier (provider/model)")
    parser.add_argument("--print-summary", action="store_true", help="Print concise EP summary line")
    parser.add_argument("--out", help="Write JSON summary artifact to path or directory")
    parser.add_argument(
        "--reasoning-effort",
        choices=["low", "medium", "high"],
        # The runner writes a flat extra_body.reasoning_effort key, not a nested reasoning dict.
        help="Sets extra_body.reasoning_effort via EP_INPUT_PARAMS_JSON",
    )
    parser.add_argument(
        "--max-rows",
        # Kept as a raw string so that 'all' can be passed; main() converts integers.
        help="Limit rows: integer or 'all' for no limit (maps to EP_MAX_DATASET_ROWS)",
    )
    parser.add_argument("--num-runs", type=int, help="Override num_runs if provided")
    parser.add_argument("--max-tokens", type=int, help="Override max_tokens for generation requests")
    parser.add_argument("--max-concurrency", type=int, help="Override max concurrent rollouts")
    return parser.parse_args()
46
+
47
+
48
def main() -> int:
    """CLI entry point: import the suites, look up the benchmark and run it.

    Returns:
        0 after the runner returns. An unknown benchmark name raises KeyError
        from ``get_benchmark_runner``; exceptions from the runner propagate.
    """
    import sys
    import traceback

    args = _parse_args()

    # Import all suite modules so their @export_benchmark decorators register.
    # Best-effort: one broken suite must not prevent the others from loading.
    for modinfo in pkgutil.iter_modules(suites_pkg.__path__):
        mod_name = f"{suites_pkg.__name__}.{modinfo.name}"
        try:
            import_module(mod_name)
        except Exception as e:
            print(f"[bench] failed to import suite module: {mod_name}: {e}", file=sys.stderr)
            traceback.print_exc()

    # Fallback: if nothing registered yet and a known suite was requested, try explicit import.
    # Keys are the names the bundled suites register via @export_benchmark.
    if not list_benchmarks():
        known_map = {
            "aime25": "eval_protocol.benchmarks.suites.aime25",
            "gpqa": "eval_protocol.benchmarks.suites.gpqa",
        }
        forced = known_map.get(args.name)
        if forced:
            try:
                import_module(forced)
            except Exception as e:
                print(f"[bench] explicit import failed for {forced}: {e}", file=sys.stderr)

    runner = get_benchmark_runner(args.name)

    # --max-rows arrives as a string: use an int when it parses, else pass it through (e.g. 'all')
    max_rows: int | str | None = None
    if args.max_rows is not None:
        try:
            max_rows = int(args.max_rows)
        except ValueError:
            max_rows = str(args.max_rows)

    # Build input params override if needed
    ip_override = {}
    if args.max_tokens is not None:
        ip_override["max_tokens"] = int(args.max_tokens)

    runner(
        model=args.model,
        print_summary=args.print_summary,
        out=args.out,
        reasoning_effort=args.reasoning_effort,
        max_rows=max_rows,
        num_runs=args.num_runs,
        input_params_override=(ip_override or None),
        max_concurrency=args.max_concurrency,
    )
    # Non-zero exit on failure gate is handled within the runner via assertions
    return 0
95
+
96
+
97
+ if __name__ == "__main__":
98
+ raise SystemExit(main())
99
+
100
+
@@ -0,0 +1,3 @@
1
+ # Suite modules are auto-imported by eval_protocol.benchmarks.run to register benchmarks.
2
+
3
+
@@ -0,0 +1,118 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+ from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
4
+ from eval_protocol.pytest.default_single_turn_rollout_process import (
5
+ default_single_turn_rollout_processor,
6
+ )
7
+ from eval_protocol.pytest.evaluation_test import evaluation_test
8
+ from eval_protocol.benchmarks.registry import export_benchmark
9
+
10
+
11
+ SYSTEM_PROMPT = (
12
+ "You are a helpful math assistant. Please reason step by step, and put your "
13
+ "final answer within \\boxed{...}."
14
+ )
15
+
16
+
17
+ def _extract_boxed_text(text: str) -> str:
18
+ import re
19
+
20
+ if not text:
21
+ return ""
22
+
23
+ pattern_boxed = r"boxed{(.*?)}|framebox{(.*?)}"
24
+ matches = re.findall(pattern_boxed, text, re.DOTALL)
25
+ if matches:
26
+ for match in matches[::-1]:
27
+ for group in match:
28
+ if group:
29
+ return group.split(",")[-1].strip()
30
+ matches_digits = re.findall(r"\d+", text, re.DOTALL)
31
+ if matches_digits:
32
+ return matches_digits[-1]
33
+ return ""
34
+
35
+
36
+ def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
37
+ import re
38
+
39
+ if s is None:
40
+ return None
41
+ m = re.match(r"\d+", str(s).strip())
42
+ if not m:
43
+ return None
44
+ try:
45
+ return int(m.group(0))
46
+ except ValueError:
47
+ return None
48
+
49
+
50
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Turn raw dataset records into evaluation rows.

    Each record's "question" becomes the user message after the shared system
    prompt; its "answer" (if present) becomes the string ground truth.
    """

    def _to_row(record: Dict[str, Any]) -> EvaluationRow:
        raw_answer = record.get("answer", None)
        prompt = [
            Message(role="system", content=SYSTEM_PROMPT),
            Message(role="user", content=str(record.get("question", ""))),
        ]
        truth = None if raw_answer is None else str(raw_answer)
        return EvaluationRow(messages=prompt, ground_truth=truth)

    return [_to_row(record) for record in rows]
63
+
64
+
65
@export_benchmark("aime25")
@evaluation_test(
    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_dataset=[
        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
    ],
    dataset_adapter=aime2025_dataset_adapter,
    rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    threshold_of_success=None,
    num_runs=8,
    max_dataset_rows=2,
    max_concurrent_rollouts=4,
    mode="pointwise",
)
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
    """Score one row: 1.0 when the integer parsed from the last assistant
    message equals the integer parsed from the ground truth, else 0.0.

    The score is flagged invalid when either integer cannot be parsed.
    """
    # Content of the last assistant message, or "" when there is none.
    last_reply = ""
    for msg in row.messages:
        if msg.role == "assistant":
            last_reply = msg.content

    extracted_text = _extract_boxed_text(last_reply or "")
    extracted_int = _normalize_to_int_or_none(extracted_text)
    gt_int = _normalize_to_int_or_none(row.ground_truth or "")

    is_valid = not (extracted_int is None or gt_int is None)
    matched = is_valid and extracted_int == gt_int
    score = 1.0 if matched else 0.0

    if score == 1.0:
        metric_reason = "Parsed both integers and they matched"
    elif is_valid:
        metric_reason = "Parsed integers did not match"
    else:
        metric_reason = "Failed to parse integer"

    exact_match = MetricResult(
        score=score,
        is_score_valid=is_valid,
        reason=metric_reason,
        data={
            "extracted_text": extracted_text,
            "extracted_int": extracted_int,
            "ground_truth_int": gt_int,
        },
    )

    overall_reason = "Answer correct" if score == 1.0 else "Answer incorrect"
    row.evaluation_result = EvaluateResult(
        score=score,
        reason=overall_reason,
        is_score_valid=is_valid,
        metrics={"exact_match": exact_match},
    )
    return row
117
+
118
+
@@ -0,0 +1,100 @@
1
+ from typing import List
2
+
3
+ import csv
4
+ import io
5
+ import re
6
+ import requests
7
+
8
+ from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
9
+ from eval_protocol.pytest.evaluation_test import evaluation_test
10
+ from eval_protocol.pytest.default_single_turn_rollout_process import (
11
+ default_single_turn_rollout_processor,
12
+ )
13
+ from eval_protocol.benchmarks.registry import export_benchmark
14
+
15
+
16
+ SYSTEM_PROMPT = (
17
+ "You are a helpful assistant. Read the question and options carefully. "
18
+ "Express your final answer strictly as a single letter: A, B, C, or D."
19
+ )
20
+
21
+
22
def _load_gpqa_messages_from_csv() -> List[List[Message]]:
    """Download the GPQA diamond CSV and build one message list per question.

    Each list holds the system prompt, a user message with the question and
    four lettered options, and a trailing system message carrying the ground
    truth marker.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on a failed download.
        RuntimeError: if the CSV yields no rows.
    """
    url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    incorrect_columns = ("Incorrect Answer 1", "Incorrect Answer 2", "Incorrect Answer 3")
    conversations: List[List[Message]] = []
    for record in csv.DictReader(io.StringIO(response.text)):
        q = str(record.get("Question", ""))
        # The correct answer is placed first, so it is always shown as option (A).
        choices = [str(record.get("Correct Answer", "")).strip()]
        choices.extend(str(record.get(column, "")) for column in incorrect_columns)
        user_content = (
            f"{q}\n\n(A) {choices[0]}\n(B) {choices[1]}\n(C) {choices[2]}\n(D) {choices[3]}\n\nAnswer with one letter."
        )
        conversation = [
            Message(role="system", content=SYSTEM_PROMPT),
            Message(role="user", content=user_content),
            # Correct answer is always option A by construction
            Message(role="system", content="__GT__:A"),
        ]
        conversations.append(conversation)

    if len(conversations) == 0:
        raise RuntimeError("Failed to load GPQA messages: no rows found from source")
    return conversations
50
+
51
+
52
+ def _extract_abcd_letter(text: str) -> str | None:
53
+ if not text:
54
+ return None
55
+ m = re.search(r"\b([ABCD])\b", text.upper())
56
+ return m.group(1) if m else None
57
+
58
+
59
+ _GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv()
60
+
61
+
62
@export_benchmark("gpqa")
@evaluation_test(
    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_messages=_GPQA_INPUT_MESSAGES,
    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    threshold_of_success=None,
    num_runs=8,
    mode="pointwise",
)
def gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
    """Score one row: 1.0 when the option letter extracted from the last
    assistant message equals the letter carried by the ground-truth marker
    message, else 0.0.

    The score is flagged invalid when no letter can be extracted or the
    marker does not hold one of A-D.
    """
    last_reply = ""
    gt_marker = None
    for msg in row.messages:
        if msg.role == "assistant":
            last_reply = msg.content
        elif msg.role == "system" and (msg.content or "").startswith("__GT__:"):
            # Keep the last marker; it is the trailing system message appended at load time.
            gt_marker = msg.content

    pred = _extract_abcd_letter(last_reply or "")
    gt = None
    if gt_marker is not None:
        gt = gt_marker.split(":", 1)[1].strip()

    is_valid = pred is not None and gt in {"A", "B", "C", "D"}
    score = 1.0 if (is_valid and pred == gt) else 0.0

    exact_match = MetricResult(
        score=score,
        is_score_valid=is_valid,
        reason=("Matched" if score == 1.0 else "Not matched"),
        data={"pred": pred, "gt": gt},
    )
    row.evaluation_result = EvaluateResult(
        score=score,
        reason=("Correct option" if score == 1.0 else "Incorrect option"),
        is_score_valid=is_valid,
        metrics={"exact_match": exact_match},
    )
    return row
99
+
100
+
@@ -151,7 +151,7 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
151
151
  tool_schemas: List[Dict],
152
152
  env_index: int,
153
153
  conversation_history: List[Dict[str, Any]],
154
- ) -> Tuple[List[MCPToolCall], CompletionUsage]:
154
+ ) -> Tuple[List[MCPToolCall], CompletionUsage, str]:
155
155
  """
156
156
  Generate tool calls using conversation history for proper OpenAI trajectories.
157
157
 
@@ -161,7 +161,7 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
161
161
  user_prompt: Current user prompt with observation
162
162
 
163
163
  Returns:
164
- List of MCPToolCall objects
164
+ List of MCPToolCall objects, LLM usage stats, and finish reason
165
165
  """
166
166
  # Convert MCP tools to LLM format
167
167
  llm_tools = self._convert_mcp_tools_to_llm_format(tool_schemas)
@@ -190,6 +190,8 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
190
190
  total_tokens=response["usage"]["total_tokens"],
191
191
  )
192
192
 
193
+ finish_reason = response["choices"][0]["finish_reason"]
194
+
193
195
  # Extract tool call from response
194
196
  message = response["choices"][0]["message"]
195
197
  logger.debug(f"Environment {env_index} - Response message: {message}")
@@ -217,15 +219,19 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
217
219
  if self.max_tools_per_turn:
218
220
  mcp_tool_calls = mcp_tool_calls[: self.max_tools_per_turn]
219
221
 
220
- return mcp_tool_calls, usage_stats
222
+ return mcp_tool_calls, usage_stats, finish_reason
221
223
  else:
222
224
  # No tool calls in response - this is normal when episode ends or LLM provides only text
223
225
  logger.debug(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
224
- return [
225
- MCPToolCall(
226
- tool_name="_no_tool_call",
227
- arguments={
228
- "reason": "no_tool_call_generated",
229
- },
230
- )
231
- ], usage_stats
226
+ return (
227
+ [
228
+ MCPToolCall(
229
+ tool_name="_no_tool_call",
230
+ arguments={
231
+ "reason": "no_tool_call_generated",
232
+ },
233
+ )
234
+ ],
235
+ usage_stats,
236
+ finish_reason,
237
+ )