eval-protocol 0.2.7__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (334) hide show
  1. {eval_protocol-0.2.7/eval_protocol.egg-info → eval_protocol-0.2.8}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli.py +1 -0
  4. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/logs.py +4 -3
  5. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +3 -3
  6. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/manager.py +4 -4
  7. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/models.py +47 -21
  8. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/evaluation_test.py +68 -36
  9. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/logs_server.py +70 -20
  10. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/vite_server.py +48 -17
  11. {eval_protocol-0.2.7 → eval_protocol-0.2.8/eval_protocol.egg-info}/PKG-INFO +1 -1
  12. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/SOURCES.txt +6 -3
  13. eval_protocol-0.2.8/tests/test_logs_server.py +585 -0
  14. eval_protocol-0.2.8/tests/test_logs_server_simple.py +88 -0
  15. eval_protocol-0.2.8/tests/test_vite_server.py +224 -0
  16. eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +1 -0
  17. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +88 -0
  18. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +1 -0
  19. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/index.html +2 -2
  20. eval_protocol-0.2.7/vite-app/dist/assets/index-DWfIf2rx.css +0 -1
  21. eval_protocol-0.2.7/vite-app/dist/assets/index-D_nkLTVA.js +0 -88
  22. eval_protocol-0.2.7/vite-app/dist/assets/index-D_nkLTVA.js.map +0 -1
  23. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/LICENSE +0 -0
  24. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/README.md +0 -0
  25. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/__init__.py +0 -0
  26. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/normalize_sandbox_fusion.py +0 -0
  27. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/__init__.py +0 -0
  28. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/generate_api_key.py +0 -0
  29. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/subprocess_manager.py +0 -0
  30. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/__init__.py +0 -0
  31. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/__main__.py +0 -0
  32. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/__init__.py +0 -0
  33. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/braintrust.py +0 -0
  34. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/huggingface.py +0 -0
  35. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/langfuse.py +0 -0
  36. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/trl.py +0 -0
  37. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/__init__.py +0 -0
  38. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/models.py +0 -0
  39. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/orchestrator.py +0 -0
  40. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resource_abc.py +0 -0
  41. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resource_pool.py +0 -0
  42. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/__init__.py +0 -0
  43. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  44. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  45. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  46. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  47. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  48. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/docker_resource.py +0 -0
  49. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  50. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  51. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  52. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  53. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/sql_resource.py +0 -0
  54. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/task_manager.py +0 -0
  55. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/tool_registry.py +0 -0
  56. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/auth.py +0 -0
  57. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/__init__.py +0 -0
  58. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  59. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/common.py +0 -0
  60. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy.py +0 -0
  61. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  62. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/preview.py +0 -0
  63. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  64. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/common_utils.py +0 -0
  65. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/config.py +0 -0
  66. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/__init__.py +0 -0
  67. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  68. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  69. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  70. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/datasets/__init__.py +0 -0
  71. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/datasets/loader.py +0 -0
  72. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/directory_utils.py +0 -0
  73. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/evaluation.py +0 -0
  74. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/__init__.py +0 -0
  75. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/event_bus.py +0 -0
  76. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/logger.py +0 -0
  77. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  78. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  79. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/execution/__init__.py +0 -0
  80. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/execution/pipeline.py +0 -0
  81. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/gcp_tools.py +0 -0
  82. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/cache.py +0 -0
  83. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/clients/base.py +0 -0
  84. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/clients.py +0 -0
  85. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generic_server.py +0 -0
  86. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/get_pep440_version.py +0 -0
  87. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/human_id/__init__.py +0 -0
  88. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/human_id/dictionary.py +0 -0
  89. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/__init__.py +0 -0
  90. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/braintrust.py +0 -0
  91. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/deepeval.py +0 -0
  92. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/openeval.py +0 -0
  93. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/trl.py +0 -0
  94. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/logging_utils.py +0 -0
  95. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/__init__.py +0 -0
  96. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/adapter.py +0 -0
  97. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/client/__init__.py +0 -0
  98. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/client/connection.py +0 -0
  99. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/clients.py +0 -0
  100. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/__init__.py +0 -0
  101. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/base_policy.py +0 -0
  102. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/policy.py +0 -0
  103. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/grid_renderer.py +0 -0
  104. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  105. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/mcpgym.py +0 -0
  106. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/process_manager.py +0 -0
  107. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/session/__init__.py +0 -0
  108. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/session/manager.py +0 -0
  109. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/simple_process_manager.py +0 -0
  110. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/simulation_server.py +0 -0
  111. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/__init__.py +0 -0
  112. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/config.py +0 -0
  113. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  114. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/main.py +0 -0
  115. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  116. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  117. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  118. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  119. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  120. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/session.py +0 -0
  121. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_env.py +0 -0
  122. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/packaging.py +0 -0
  123. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/platform_api.py +0 -0
  124. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/playback_policy.py +0 -0
  125. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/__init__.py +0 -0
  126. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  127. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  128. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  129. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  130. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  131. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/plugin.py +0 -0
  132. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/types.py +0 -0
  133. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/utils.py +0 -0
  134. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/resources.py +0 -0
  135. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/reward_function.py +0 -0
  136. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/__init__.py +0 -0
  137. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy.py +0 -0
  138. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy_length.py +0 -0
  139. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  140. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  141. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_testing_util.py +0 -0
  142. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/bfcl_reward.py +0 -0
  143. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution.py +0 -0
  144. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution_utils.py +0 -0
  145. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/cpp_code.py +0 -0
  146. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  147. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/format.py +0 -0
  148. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/function_calling.py +0 -0
  149. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/json_schema.py +0 -0
  150. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/language_consistency.py +0 -0
  151. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/lean_prover.py +0 -0
  152. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/length.py +0 -0
  153. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  154. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/math.py +0 -0
  155. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  156. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/reasoning_steps.py +0 -0
  157. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/repetition.py +0 -0
  158. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/tag_count.py +0 -0
  159. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rl_processing.py +0 -0
  160. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/server.py +0 -0
  161. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/stats/__init__.py +0 -0
  162. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/stats/confidence_intervals.py +0 -0
  163. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/typed_interface.py +0 -0
  164. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/types/__init__.py +0 -0
  165. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/types/types.py +0 -0
  166. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/__init__.py +0 -0
  167. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/batch_evaluation.py +0 -0
  168. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/batch_transformation.py +0 -0
  169. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/dataset_helpers.py +0 -0
  170. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/module_loader.py +0 -0
  171. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/packaging_utils.py +0 -0
  172. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/static_policy.py +0 -0
  173. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/dependency_links.txt +0 -0
  174. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/entry_points.txt +0 -0
  175. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/requires.txt +0 -0
  176. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/top_level.txt +0 -0
  177. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/pyproject.toml +0 -0
  178. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/setup.cfg +0 -0
  179. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/setup.py +0 -0
  180. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_accuracy.py +0 -0
  181. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_accuracy_length.py +0 -0
  182. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_adapters_e2e.py +0 -0
  183. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_agent_orchestrator.py +0 -0
  184. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_agent_resources.py +0 -0
  185. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_auth.py +0 -0
  186. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_batch_evaluation.py +0 -0
  187. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_braintrust_adapter.py +0 -0
  188. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_braintrust_example.py +0 -0
  189. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli.py +0 -0
  190. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli_agent.py +0 -0
  191. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli_args.py +0 -0
  192. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_code_execution.py +0 -0
  193. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_config.py +0 -0
  194. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_control_plane_separation.py +0 -0
  195. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cpp_code.py +0 -0
  196. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_data_driven_task_manager.py +0 -0
  197. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deepcoder_reward.py +0 -0
  198. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deepeval_integration.py +0 -0
  199. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deploy_integration.py +0 -0
  200. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_e2b_integration.py +0 -0
  201. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_e2b_js_integration.py +0 -0
  202. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_edge_cases.py +0 -0
  203. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_eval_protocol_import.py +0 -0
  204. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation.py +0 -0
  205. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation_integration.py +0 -0
  206. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation_preview_integration.py +0 -0
  207. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_event_bus.py +0 -0
  208. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_examples_end_to_end.py +0 -0
  209. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_fireworks_api.py +0 -0
  210. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_format.py +0 -0
  211. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_fractional_code.py +0 -0
  212. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_frozen_lake_http_server.py +0 -0
  213. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  214. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_function_calling.py +0 -0
  215. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_gcp_tools.py +0 -0
  216. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_generic_server.py +0 -0
  217. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_integration.py +0 -0
  218. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_json_schema.py +0 -0
  219. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_kwargs_validation.py +0 -0
  220. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_language_consistency.py +0 -0
  221. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_lean_prover.py +0 -0
  222. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_lean_prover_runner.py +0 -0
  223. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_length.py +0 -0
  224. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_list_comparison_math_reward.py +0 -0
  225. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_math.py +0 -0
  226. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_minimal.py +0 -0
  227. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_models.py +0 -0
  228. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_models_rl.py +0 -0
  229. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_multiple_choice_math_reward.py +0 -0
  230. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_n_variant_batch_integration.py +0 -0
  231. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_n_variant_integration.py +0 -0
  232. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_openai_compatibility.py +0 -0
  233. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_openeval_integration.py +0 -0
  234. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_packaging.py +0 -0
  235. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_parallel_rollouts.py +0 -0
  236. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_platform_api.py +0 -0
  237. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_readiness.py +0 -0
  238. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reasoning_steps.py +0 -0
  239. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_repetition.py +0 -0
  240. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_repetition_debug.py +0 -0
  241. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reward_function.py +0 -0
  242. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reward_protocol_import.py +0 -0
  243. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_rl_processing.py +0 -0
  244. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_rollout_control_plane_integration.py +0 -0
  245. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_server.py +0 -0
  246. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_tag_count.py +0 -0
  247. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_typed_interface.py +0 -0
  248. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_typed_interface_rl.py +0 -0
  249. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_url_handling.py +0 -0
  250. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/__init__.py +0 -0
  251. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/__init__.py +0 -0
  252. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/base.py +0 -0
  253. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/llm_agent.py +0 -0
  254. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/__init__.py +0 -0
  255. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/api_config.py +0 -0
  256. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/data_model.py +0 -0
  257. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/simulation_service.py +0 -0
  258. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/cli.py +0 -0
  259. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/config.py +0 -0
  260. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/__init__.py +0 -0
  261. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/message.py +0 -0
  262. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/simulation.py +0 -0
  263. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/tasks.py +0 -0
  264. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/__init__.py +0 -0
  265. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/__init__.py +0 -0
  266. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/data_model.py +0 -0
  267. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/environment.py +0 -0
  268. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/tools.py +0 -0
  269. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/utils.py +0 -0
  270. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/__init__.py +0 -0
  271. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/data_model.py +0 -0
  272. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/environment.py +0 -0
  273. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/tools.py +0 -0
  274. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/utils.py +0 -0
  275. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/__init__.py +0 -0
  276. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/data_model.py +0 -0
  277. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/environment.py +0 -0
  278. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/tools.py +0 -0
  279. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/utils.py +0 -0
  280. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/__init__.py +0 -0
  281. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/data_model.py +0 -0
  282. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/environment.py +0 -0
  283. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  284. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  285. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  286. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  287. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  288. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  289. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  290. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  291. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tools.py +0 -0
  292. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  293. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  294. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/utils.py +0 -0
  295. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/__init__.py +0 -0
  296. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/db.py +0 -0
  297. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/environment.py +0 -0
  298. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/server.py +0 -0
  299. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/tool.py +0 -0
  300. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/toolkit.py +0 -0
  301. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  302. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/__init__.py +0 -0
  303. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator.py +0 -0
  304. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  305. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  306. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  307. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  308. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  309. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/__init__.py +0 -0
  310. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/agent_metrics.py +0 -0
  311. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  312. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/__init__.py +0 -0
  313. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  314. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  315. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/utils.py +0 -0
  316. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/registry.py +0 -0
  317. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/run.py +0 -0
  318. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/__init__.py +0 -0
  319. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/check_data.py +0 -0
  320. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  321. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/start_servers.py +0 -0
  322. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/view_simulations.py +0 -0
  323. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/__init__.py +0 -0
  324. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/base.py +0 -0
  325. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/user_simulator.py +0 -0
  326. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/__init__.py +0 -0
  327. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/display.py +0 -0
  328. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/io_utils.py +0 -0
  329. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/llm_utils.py +0 -0
  330. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/pydantic_utils.py +0 -0
  331. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/utils.py +0 -0
  332. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/versioneer.py +0 -0
  333. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  334. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.7
+Version: 0.2.8
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2025-08-11T00:47:52-0700",
+ "date": "2025-08-11T22:02:14-0700",
 "dirty": false,
 "error": null,
- "full-revisionid": "38a44449f6d48a8a79eb11a0aaf873129df3e994",
- "version": "0.2.7"
+ "full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
+ "version": "0.2.8"
 }
 '''  # END VERSION_JSON

@@ -289,6 +289,7 @@ def parse_args(args=None):
289
289
 
290
290
  # Logs command
291
291
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
292
+ logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
292
293
 
293
294
  # Run command (for Hydra-based evaluations)
294
295
  # This subparser intentionally defines no arguments itself.
@@ -11,15 +11,16 @@ from ..utils.logs_server import serve_logs
11
11
  def logs_command(args):
12
12
  """Serve logs with file watching and real-time updates"""
13
13
 
14
+ port = args.port
14
15
  print(f"🚀 Starting Eval Protocol Logs Server")
15
- print(f"🌐 URL: http://localhost:8000")
16
- print(f"🔌 WebSocket: ws://localhost:8000/ws")
16
+ print(f"🌐 URL: http://localhost:{port}")
17
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
17
18
  print(f"👀 Watching paths: {['current directory']}")
18
19
  print("Press Ctrl+C to stop the server")
19
20
  print("-" * 50)
20
21
 
21
22
  try:
22
- serve_logs()
23
+ serve_logs(port=args.port)
23
24
  return 0
24
25
  except KeyboardInterrupt:
25
26
  print("\n🛑 Server stopped by user")
@@ -37,9 +37,9 @@ class SqliteEvaluationRowStore:
37
37
  return self._db_path
38
38
 
39
39
  def upsert_row(self, data: dict) -> None:
40
- rollout_id = data["rollout_id"]
41
- if "rollout_id" not in data:
42
- raise ValueError("rollout_id is required to upsert a row")
40
+ rollout_id = data["execution_metadata"]["rollout_id"]
41
+ if rollout_id is None:
42
+ raise ValueError("execution_metadata.rollout_id is required to upsert a row")
43
43
  if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
44
44
  self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
45
45
  else:
@@ -158,8 +158,8 @@ class ExecutionManager:
158
158
  messages.append(Message.model_validate(msg_dict))
159
159
 
160
160
  evaluation_rows[idx].messages = messages
161
- evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
- evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
161
+ # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
+ # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
163
163
  evaluation_rows[idx].tools = shared_tool_schema
164
164
  evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
165
165
  evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
@@ -482,11 +482,11 @@ class ExecutionManager:
482
482
  trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
483
483
  try:
484
484
  await envs.connection_manager.reset_session(session)
485
- except:
485
+ except: # noqa: E722
486
486
  logger.error(f"Error resetting session {session.session_id}")
487
487
  try:
488
488
  await envs.connection_manager.close_session(session)
489
- except:
489
+ except: # noqa: E722
490
490
  logger.error(f"Error closing session {session.session_id}")
491
491
  return trajectory
492
492
 
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
202
202
  )
203
203
 
204
204
 
205
+ class EvaluationThreshold(BaseModel):
206
+ """Threshold configuration for evaluation tests.
207
+
208
+ The success field is required - tests must specify a minimum success rate.
209
+ The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
210
+ """
211
+
212
+ success: float = Field(
213
+ ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
214
+ )
215
+ standard_deviation: Optional[float] = Field(
216
+ None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
217
+ )
218
+
219
+
205
220
  class EvalMetadata(BaseModel):
206
221
  """Metadata about the evaluation that was run."""
207
222
 
@@ -216,10 +231,36 @@ class EvalMetadata(BaseModel):
216
231
  )
217
232
  num_runs: int = Field(..., description="Number of times the evaluation was repeated")
218
233
  aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
219
- threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
234
+ passed_threshold: Optional[EvaluationThreshold] = Field(
235
+ None, description="Threshold configuration for test success"
236
+ )
220
237
  passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
221
238
 
222
239
 
240
+ class ExecutionMetadata(BaseModel):
241
+ """Metadata about the execution of the evaluation."""
242
+
243
+ invocation_id: Optional[str] = Field(
244
+ default_factory=generate_id,
245
+ description="The ID of the invocation that this row belongs to.",
246
+ )
247
+
248
+ experiment_id: Optional[str] = Field(
249
+ default_factory=generate_id,
250
+ description="The ID of the experiment that this row belongs to.",
251
+ )
252
+
253
+ rollout_id: Optional[str] = Field(
254
+ default_factory=generate_id,
255
+ description="The ID of the rollout that this row belongs to.",
256
+ )
257
+
258
+ run_id: Optional[str] = Field(
259
+ None,
260
+ description=("The ID of the run that this row belongs to."),
261
+ )
262
+
263
+
223
264
  class RolloutStatus(BaseModel):
224
265
  """Status of the rollout."""
225
266
 
@@ -264,26 +305,6 @@ class EvaluationRow(BaseModel):
264
305
  description="The status of the rollout.",
265
306
  )
266
307
 
267
- invocation_id: Optional[str] = Field(
268
- default_factory=generate_id,
269
- description="The ID of the invocation that this row belongs to.",
270
- )
271
-
272
- cohort_id: Optional[str] = Field(
273
- default_factory=generate_id,
274
- description="The ID of the cohort that this row belongs to.",
275
- )
276
-
277
- rollout_id: Optional[str] = Field(
278
- default_factory=generate_id,
279
- description="The ID of the rollout that this row belongs to.",
280
- )
281
-
282
- run_id: Optional[str] = Field(
283
- None,
284
- description=("The ID of the run that this row belongs to."),
285
- )
286
-
287
308
  # Ground truth reference (moved from EvaluateResult to top level)
288
309
  ground_truth: Optional[str] = Field(
289
310
  default=None, description="Optional ground truth reference for this evaluation."
@@ -294,6 +315,11 @@ class EvaluationRow(BaseModel):
294
315
  default=None, description="The evaluation result for this row/trajectory."
295
316
  )
296
317
 
318
+ execution_metadata: ExecutionMetadata = Field(
319
+ default_factory=ExecutionMetadata,
320
+ description="Metadata about the execution of the evaluation.",
321
+ )
322
+
297
323
  # LLM usage statistics
298
324
  usage: Optional[CompletionUsage] = Field(
299
325
  default=None, description="Token usage statistics from LLM calls during execution."
@@ -3,14 +3,21 @@ import inspect
3
3
  import math
4
4
  import os
5
5
  import statistics
6
- from typing import Any, Callable, Dict, List, Literal, Optional
6
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
7
7
 
8
8
  import pytest
9
9
 
10
10
  from eval_protocol.dataset_logger import default_logger
11
11
  from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
12
12
  from eval_protocol.human_id import generate_id
13
- from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata, Message
13
+ from eval_protocol.models import (
14
+ CompletionParams,
15
+ EvalMetadata,
16
+ EvaluationRow,
17
+ EvaluationThreshold,
18
+ InputMetadata,
19
+ Message,
20
+ )
14
21
  from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
15
22
  from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
16
23
  from eval_protocol.pytest.types import (
@@ -47,7 +54,7 @@ def evaluation_test( # noqa: C901
47
54
  rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
48
55
  evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
49
56
  aggregation_method: AggregationMethod = "mean",
50
- threshold_of_success: Optional[float] = None,
57
+ passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
51
58
  num_runs: int = 1,
52
59
  max_dataset_rows: Optional[int] = None,
53
60
  mcp_config_path: Optional[str] = None,
@@ -66,14 +73,14 @@ def evaluation_test( # noqa: C901
66
73
  Here are some key concepts to understand the terminology in EP:
67
74
 
68
75
  - "invocation" is a single execution of a test function. An invocation can
69
- generate 1 or more cohorts. Grouping by invocation might be useful to
76
+ generate 1 or more experiments. Grouping by invocation might be useful to
70
77
  aggregate eval scores across multiple invocations when you want to aggregate
71
78
  scores across multiple datasets.
72
- - "cohort" is a group of runs with for a combination of parameters. A single
73
- cohort will have multiple runs if num_runs > 1.
79
+ - "experiment" is a group of runs with for a combination of parameters. A single
80
+ experiment will have multiple runs if num_runs > 1.
74
81
  1. If your evaluation_test has combinations of parameters, it will generate
75
- multiple cohorts per combination of parameters.
76
- 2. A new execution of a test function will generate a new cohort.
82
+ multiple experiments per combination of parameters.
83
+ 2. A new execution of a test function will generate a new experiment.
77
84
  - "run" is a group of rollouts. For multiple num_runs > 1, there will be
78
85
  multiple "run_id"s.
79
86
  - "rollout" is the execution/process that produces a "trajectory". You
@@ -91,7 +98,7 @@ def evaluation_test( # noqa: C901
91
98
  decorated test. It simply produces a score from 0 to 1 and attached it
92
99
  to the row as the "evaluation_result" field.
93
100
 
94
- "invocation", "cohort", "run", "rollout", and "row" each have a unique ID
101
+ "invocation", "experiment", "run", "rollout", and "row" each have a unique ID
95
102
  which can be used to easily group and identify your dataset by.
96
103
 
97
104
  Args:
@@ -108,8 +115,8 @@ def evaluation_test( # noqa: C901
108
115
  rollout_processor: Function used to perform the rollout.
109
116
  evaluation_test_kwargs: Kwargs for the evaluation function.
110
117
  aggregation_method: How to aggregate scores across rows.
111
- threshold_of_success: If set, fail the test if the aggregated score is
112
- below this threshold.
118
+ passed_threshold: Threshold configuration for test success.
119
+ Success rate must be above success, and if set, standard deviation must be below standard_deviation.
113
120
  num_runs: Number of times to repeat the rollout and evaluations.
114
121
  max_dataset_rows: Limit dataset to the first N rows.
115
122
  mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -127,6 +134,14 @@ def evaluation_test( # noqa: C901
127
134
  def decorator(
128
135
  test_func: TestFunction,
129
136
  ):
137
+ if passed_threshold is not None:
138
+ if isinstance(passed_threshold, float):
139
+ threshold = EvaluationThreshold(success=passed_threshold)
140
+ else:
141
+ threshold = EvaluationThreshold(**passed_threshold)
142
+ else:
143
+ threshold = None
144
+
130
145
  sig = inspect.signature(test_func)
131
146
 
132
147
  # For pointwise/rowwise mode, we expect a different signature
@@ -285,9 +300,9 @@ def evaluation_test( # noqa: C901
285
300
  def wrapper_body(**kwargs):
286
301
  model_name = kwargs["model"]
287
302
  eval_metadata = None
288
- all_results: List[EvaluationRow] = []
303
+ all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
289
304
 
290
- cohort_id = generate_id()
305
+ experiment_id = generate_id()
291
306
 
292
307
  def _log_eval_error(
293
308
  status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
@@ -346,7 +361,7 @@ def evaluation_test( # noqa: C901
346
361
  status="running",
347
362
  num_runs=num_runs,
348
363
  aggregation_method=aggregation_method,
349
- threshold_of_success=threshold_of_success,
364
+ passed_threshold=threshold,
350
365
  passed=None,
351
366
  )
352
367
 
@@ -368,8 +383,8 @@ def evaluation_test( # noqa: C901
368
383
  row.input_metadata.session_data["mode"] = mode
369
384
  # Initialize eval_metadata for each row
370
385
  row.eval_metadata = eval_metadata
371
- row.cohort_id = cohort_id
372
- row.invocation_id = invocation_id
386
+ row.execution_metadata.experiment_id = experiment_id
387
+ row.execution_metadata.invocation_id = invocation_id
373
388
 
374
389
  # has to be done in the pytest main process since it's
375
390
  # used to determine whether this eval has stopped
@@ -386,19 +401,19 @@ def evaluation_test( # noqa: C901
386
401
  logger=active_logger,
387
402
  )
388
403
 
389
- for _ in range(num_runs):
404
+ for i in range(num_runs):
390
405
  # Regenerate outputs each run by deep-copying the pristine dataset
391
406
  # so model responses are not reused across runs.
392
407
  run_id = generate_id()
393
- fresh_dataset = [copy.deepcopy(r) for r in data]
408
+ fresh_dataset = [r.model_copy(deep=True) for r in data]
394
409
 
395
410
  # apply new run_id to fresh_dataset
396
411
  for row in fresh_dataset:
397
- row.run_id = run_id
412
+ row.execution_metadata.run_id = run_id
398
413
 
399
414
  # generate new rollout_id for each row
400
415
  for row in fresh_dataset:
401
- row.rollout_id = generate_id()
416
+ row.execution_metadata.rollout_id = generate_id()
402
417
 
403
418
  # log the fresh_dataset
404
419
  for row in fresh_dataset:
@@ -418,7 +433,7 @@ def evaluation_test( # noqa: C901
418
433
  raise ValueError(
419
434
  f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
420
435
  )
421
- all_results.append(result)
436
+ all_results[i].append(result)
422
437
  else:
423
438
  # Batch mode: call the test function with the full dataset
424
439
  results = execute_with_params(
@@ -442,17 +457,21 @@ def evaluation_test( # noqa: C901
442
457
  raise ValueError(
443
458
  f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
444
459
  )
445
- all_results.extend(results)
460
+ all_results[i] = results
446
461
 
447
- scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
462
+ scores = [
463
+ sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
464
+ for result in all_results
465
+ ]
448
466
  agg_score = aggregate(scores, aggregation_method)
467
+ score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
449
468
 
450
469
  # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
451
470
  ci_low: float | None = None
452
471
  ci_high: float | None = None
453
472
  if aggregation_method == "mean":
454
473
  try:
455
- result_ci = compute_fixed_set_mu_ci(all_results)
474
+ result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
456
475
  mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
457
476
  if mu_ci_low is not None and mu_ci_high is not None:
458
477
  ci_low = float(mu_ci_low)
@@ -464,15 +483,24 @@ def evaluation_test( # noqa: C901
464
483
 
465
484
  # Determine if the evaluation passed based on threshold
466
485
  passed = None
467
- if threshold_of_success is not None:
468
- passed = agg_score >= threshold_of_success
486
+
487
+ if threshold is not None:
488
+ success_passed, std_passed = True, True
489
+
490
+ success_passed = agg_score >= threshold.success
491
+
492
+ if threshold.standard_deviation is not None:
493
+ std_passed = score_std <= threshold.standard_deviation
494
+
495
+ passed = success_passed and std_passed
469
496
 
470
497
  # Update eval metadata status and passed field for all results
471
- for r in all_results:
472
- if r.eval_metadata is not None:
473
- r.eval_metadata.status = "finished"
474
- r.eval_metadata.passed = passed
475
- active_logger.log(r)
498
+ for result in all_results:
499
+ for r in result:
500
+ if r.eval_metadata is not None:
501
+ r.eval_metadata.status = "finished"
502
+ r.eval_metadata.passed = passed
503
+ active_logger.log(r)
476
504
 
477
505
  # Optional: print and/or persist a summary artifact for CI
478
506
  try:
@@ -480,7 +508,7 @@ def evaluation_test( # noqa: C901
480
508
  summary_path = os.getenv("EP_SUMMARY_JSON")
481
509
  suite_name = test_func.__name__
482
510
  model_used = model_name
483
- total_rows = len(all_results)
511
+ total_rows = len([item for sublist in all_results for item in sublist])
484
512
  summary_obj = {
485
513
  "suite": suite_name,
486
514
  "model": model_used,
@@ -497,7 +525,7 @@ def evaluation_test( # noqa: C901
497
525
  from collections import defaultdict
498
526
 
499
527
  metric_scores: Dict[str, list] = defaultdict(list)
500
- for r in all_results:
528
+ for r in [item for sublist in all_results for item in sublist]:
501
529
  if r.evaluation_result and r.evaluation_result.metrics:
502
530
  for m_name, m_res in r.evaluation_result.metrics.items():
503
531
  if m_res is not None and getattr(m_res, "score", None) is not None:
@@ -614,10 +642,14 @@ def evaluation_test( # noqa: C901
614
642
  # pass
615
643
 
616
644
  # Check threshold after logging
617
- if threshold_of_success is not None and not passed:
645
+ if threshold is not None and not passed:
618
646
  assert (
619
- agg_score >= threshold_of_success
620
- ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
647
+ agg_score >= threshold.success
648
+ ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
649
+ if threshold.standard_deviation is not None:
650
+ assert (
651
+ score_std <= threshold.standard_deviation
652
+ ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
621
653
 
622
654
  except AssertionError:
623
655
  _log_eval_error("finished", data if "data" in locals() else None, passed=False)
@@ -87,18 +87,32 @@ class WebSocketManager:
87
87
  return
88
88
 
89
89
  tasks = []
90
+ failed_connections = []
91
+
90
92
  for connection in connections:
91
93
  try:
92
94
  tasks.append(connection.send_text(text))
93
95
  except Exception as e:
94
96
  logger.error(f"Failed to send text to WebSocket: {e}")
95
- with self._lock:
96
- try:
97
- self.active_connections.remove(connection)
98
- except ValueError:
99
- pass
97
+ failed_connections.append(connection)
98
+
99
+ # Execute all sends in parallel
100
100
  if tasks:
101
- await asyncio.gather(*tasks, return_exceptions=True)
101
+ results = await asyncio.gather(*tasks, return_exceptions=True)
102
+
103
+ # Check for any exceptions that occurred during execution
104
+ for i, result in enumerate(results):
105
+ if isinstance(result, Exception):
106
+ logger.error(f"Failed to send text to WebSocket: {result}")
107
+ failed_connections.append(connections[i])
108
+
109
+ # Remove all failed connections
110
+ with self._lock:
111
+ for connection in failed_connections:
112
+ try:
113
+ self.active_connections.remove(connection)
114
+ except ValueError:
115
+ pass
102
116
 
103
117
  def start_broadcast_loop(self):
104
118
  """Start the broadcast loop in the current event loop."""
@@ -109,6 +123,7 @@ class WebSocketManager:
109
123
  """Stop the broadcast loop."""
110
124
  if self._broadcast_task and not self._broadcast_task.done():
111
125
  self._broadcast_task.cancel()
126
+ self._broadcast_task = None
112
127
 
113
128
 
114
129
  class EvaluationWatcher:
@@ -233,7 +248,6 @@ class LogsServer(ViteServer):
233
248
 
234
249
  # Subscribe to events and start listening for cross-process events
235
250
  event_bus.subscribe(self._handle_event)
236
- event_bus.start_listening()
237
251
 
238
252
  logger.info(f"LogsServer initialized on {host}:{port}")
239
253
 
@@ -273,6 +287,12 @@ class LogsServer(ViteServer):
273
287
  data = EvaluationRow(**data)
274
288
  self.websocket_manager.broadcast_row_upserted(data)
275
289
 
290
+ def start_loops(self):
291
+ """Start the broadcast loop and evaluation watcher."""
292
+ self.websocket_manager.start_broadcast_loop()
293
+ self.evaluation_watcher.start()
294
+ event_bus.start_listening()
295
+
276
296
  async def run_async(self):
277
297
  """
278
298
  Run the logs server asynchronously with file watching.
@@ -285,11 +305,7 @@ class LogsServer(ViteServer):
285
305
  logger.info(f"Serving files from: {self.build_dir}")
286
306
  logger.info("WebSocket endpoint available at /ws")
287
307
 
288
- # Start the broadcast loop
289
- self.websocket_manager.start_broadcast_loop()
290
-
291
- # Start the evaluation watcher
292
- self.evaluation_watcher.start()
308
+ self.start_loops()
293
309
 
294
310
  config = uvicorn.Config(
295
311
  self.app,
@@ -319,20 +335,54 @@ class LogsServer(ViteServer):
319
335
  asyncio.run(self.run_async())
320
336
 
321
337
 
322
- server = LogsServer()
323
- app = server.app
338
+ def create_app(host: str = "localhost", port: int = 8000, build_dir: Optional[str] = None) -> FastAPI:
339
+ """
340
+ Factory function to create a FastAPI app instance and start the server with async loops.
341
+
342
+ This creates a LogsServer instance and starts it in a background thread to ensure
343
+ all async loops (WebSocket broadcast, evaluation watching) are running.
324
344
 
345
+ Args:
346
+ host: Host to bind to
347
+ port: Port to bind to
348
+ build_dir: Optional custom build directory path
325
349
 
326
- def serve_logs():
350
+ Returns:
351
+ FastAPI app instance with server running in background
352
+ """
353
+ if build_dir is None:
354
+ build_dir = os.path.abspath(
355
+ os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "vite-app", "dist")
356
+ )
357
+
358
+ server = LogsServer(host=host, port=port, build_dir=build_dir)
359
+ server.start_loops()
360
+ return server.app
361
+
362
+
363
+ # For backward compatibility and direct usage
364
+ def serve_logs(port: Optional[int] = None):
327
365
  """
328
366
  Convenience function to create and run a LogsServer.
329
367
  """
330
- global server, app
331
- if server is None:
332
- server = LogsServer()
333
- app = server.app
368
+ server = LogsServer(port=port)
334
369
  server.run()
335
370
 
336
371
 
337
372
  if __name__ == "__main__":
338
- serve_logs()
373
+ import argparse
374
+
375
+ parser = argparse.ArgumentParser(description="Start the evaluation logs server")
376
+ parser.add_argument("--host", default="localhost", help="Host to bind to (default: localhost)")
377
+ parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
378
+ parser.add_argument("--build-dir", help="Path to Vite build directory")
379
+
380
+ args = parser.parse_args()
381
+
382
+ # Create server with command line arguments
383
+ if args.build_dir:
384
+ server = LogsServer(host=args.host, port=args.port, build_dir=args.build_dir)
385
+ else:
386
+ server = LogsServer(host=args.host, port=args.port)
387
+
388
+ server.run()