eval-protocol 0.2.6.dev2__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (334):
  1. {eval_protocol-0.2.6.dev2/eval_protocol.egg-info → eval_protocol-0.2.8}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli.py +1 -0
  4. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/logs.py +4 -3
  5. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +3 -4
  6. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +14 -11
  7. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/manager.py +4 -4
  8. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/models.py +47 -21
  9. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_agent_rollout_processor.py +5 -4
  10. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_single_turn_rollout_process.py +4 -5
  11. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/evaluation_test.py +96 -38
  12. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/types.py +8 -2
  13. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/logs_server.py +70 -20
  14. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/vite_server.py +48 -17
  15. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8/eval_protocol.egg-info}/PKG-INFO +1 -1
  16. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/SOURCES.txt +6 -3
  17. eval_protocol-0.2.8/tests/test_logs_server.py +585 -0
  18. eval_protocol-0.2.8/tests/test_logs_server_simple.py +88 -0
  19. eval_protocol-0.2.8/tests/test_vite_server.py +224 -0
  20. eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +1 -0
  21. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +88 -0
  22. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +1 -0
  23. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/index.html +2 -2
  24. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-D9iVTBbF.css +0 -1
  25. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js +0 -88
  26. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js.map +0 -1
  27. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/LICENSE +0 -0
  28. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/README.md +0 -0
  29. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/__init__.py +0 -0
  30. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/normalize_sandbox_fusion.py +0 -0
  31. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/__init__.py +0 -0
  32. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/generate_api_key.py +0 -0
  33. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/subprocess_manager.py +0 -0
  34. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/__init__.py +0 -0
  35. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/__main__.py +0 -0
  36. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/__init__.py +0 -0
  37. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/braintrust.py +0 -0
  38. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/huggingface.py +0 -0
  39. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/langfuse.py +0 -0
  40. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/trl.py +0 -0
  41. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/__init__.py +0 -0
  42. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/models.py +0 -0
  43. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/orchestrator.py +0 -0
  44. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resource_abc.py +0 -0
  45. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resource_pool.py +0 -0
  46. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/__init__.py +0 -0
  47. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  48. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  49. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  50. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  51. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  52. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/docker_resource.py +0 -0
  53. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  54. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  55. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  56. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  57. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/sql_resource.py +0 -0
  58. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/task_manager.py +0 -0
  59. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/tool_registry.py +0 -0
  60. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/auth.py +0 -0
  61. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/__init__.py +0 -0
  62. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  63. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/common.py +0 -0
  64. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy.py +0 -0
  65. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  66. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/preview.py +0 -0
  67. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  68. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/common_utils.py +0 -0
  69. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/config.py +0 -0
  70. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/__init__.py +0 -0
  71. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  72. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  73. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/datasets/__init__.py +0 -0
  74. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/datasets/loader.py +0 -0
  75. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/directory_utils.py +0 -0
  76. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/evaluation.py +0 -0
  77. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/__init__.py +0 -0
  78. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/event_bus.py +0 -0
  79. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/logger.py +0 -0
  80. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  81. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  82. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/execution/__init__.py +0 -0
  83. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/execution/pipeline.py +0 -0
  84. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/gcp_tools.py +0 -0
  85. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/cache.py +0 -0
  86. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/clients/base.py +0 -0
  87. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/clients.py +0 -0
  88. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generic_server.py +0 -0
  89. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/get_pep440_version.py +0 -0
  90. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/human_id/__init__.py +0 -0
  91. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/human_id/dictionary.py +0 -0
  92. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/__init__.py +0 -0
  93. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/braintrust.py +0 -0
  94. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/deepeval.py +0 -0
  95. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/openeval.py +0 -0
  96. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/trl.py +0 -0
  97. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/logging_utils.py +0 -0
  98. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/__init__.py +0 -0
  99. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/adapter.py +0 -0
  100. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/client/__init__.py +0 -0
  101. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/client/connection.py +0 -0
  102. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/clients.py +0 -0
  103. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/__init__.py +0 -0
  104. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/base_policy.py +0 -0
  105. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/policy.py +0 -0
  106. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/grid_renderer.py +0 -0
  107. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  108. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/mcpgym.py +0 -0
  109. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/process_manager.py +0 -0
  110. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/session/__init__.py +0 -0
  111. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/session/manager.py +0 -0
  112. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/simple_process_manager.py +0 -0
  113. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/simulation_server.py +0 -0
  114. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/__init__.py +0 -0
  115. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/config.py +0 -0
  116. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  117. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/main.py +0 -0
  118. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  119. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  120. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  121. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  122. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  123. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/session.py +0 -0
  124. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_env.py +0 -0
  125. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/packaging.py +0 -0
  126. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/platform_api.py +0 -0
  127. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/playback_policy.py +0 -0
  128. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/__init__.py +0 -0
  129. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  130. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  131. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  132. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/plugin.py +0 -0
  133. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/utils.py +0 -0
  134. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/resources.py +0 -0
  135. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/reward_function.py +0 -0
  136. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/__init__.py +0 -0
  137. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy.py +0 -0
  138. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy_length.py +0 -0
  139. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  140. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  141. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_testing_util.py +0 -0
  142. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/bfcl_reward.py +0 -0
  143. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution.py +0 -0
  144. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution_utils.py +0 -0
  145. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/cpp_code.py +0 -0
  146. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  147. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/format.py +0 -0
  148. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/function_calling.py +0 -0
  149. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/json_schema.py +0 -0
  150. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/language_consistency.py +0 -0
  151. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/lean_prover.py +0 -0
  152. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/length.py +0 -0
  153. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  154. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/math.py +0 -0
  155. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  156. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/reasoning_steps.py +0 -0
  157. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/repetition.py +0 -0
  158. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/tag_count.py +0 -0
  159. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rl_processing.py +0 -0
  160. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/server.py +0 -0
  161. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/stats/__init__.py +0 -0
  162. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/stats/confidence_intervals.py +0 -0
  163. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/typed_interface.py +0 -0
  164. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/types/__init__.py +0 -0
  165. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/types/types.py +0 -0
  166. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/__init__.py +0 -0
  167. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/batch_evaluation.py +0 -0
  168. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/batch_transformation.py +0 -0
  169. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/dataset_helpers.py +0 -0
  170. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/module_loader.py +0 -0
  171. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/packaging_utils.py +0 -0
  172. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/static_policy.py +0 -0
  173. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/dependency_links.txt +0 -0
  174. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/entry_points.txt +0 -0
  175. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/requires.txt +0 -0
  176. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/top_level.txt +0 -0
  177. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/pyproject.toml +0 -0
  178. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/setup.cfg +0 -0
  179. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/setup.py +0 -0
  180. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_accuracy.py +0 -0
  181. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_accuracy_length.py +0 -0
  182. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_adapters_e2e.py +0 -0
  183. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_agent_orchestrator.py +0 -0
  184. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_agent_resources.py +0 -0
  185. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_auth.py +0 -0
  186. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_batch_evaluation.py +0 -0
  187. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_braintrust_adapter.py +0 -0
  188. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_braintrust_example.py +0 -0
  189. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli.py +0 -0
  190. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli_agent.py +0 -0
  191. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli_args.py +0 -0
  192. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_code_execution.py +0 -0
  193. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_config.py +0 -0
  194. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_control_plane_separation.py +0 -0
  195. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cpp_code.py +0 -0
  196. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_data_driven_task_manager.py +0 -0
  197. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deepcoder_reward.py +0 -0
  198. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deepeval_integration.py +0 -0
  199. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deploy_integration.py +0 -0
  200. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_e2b_integration.py +0 -0
  201. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_e2b_js_integration.py +0 -0
  202. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_edge_cases.py +0 -0
  203. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_eval_protocol_import.py +0 -0
  204. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation.py +0 -0
  205. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation_integration.py +0 -0
  206. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation_preview_integration.py +0 -0
  207. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_event_bus.py +0 -0
  208. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_examples_end_to_end.py +0 -0
  209. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_fireworks_api.py +0 -0
  210. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_format.py +0 -0
  211. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_fractional_code.py +0 -0
  212. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_frozen_lake_http_server.py +0 -0
  213. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  214. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_function_calling.py +0 -0
  215. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_gcp_tools.py +0 -0
  216. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_generic_server.py +0 -0
  217. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_integration.py +0 -0
  218. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_json_schema.py +0 -0
  219. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_kwargs_validation.py +0 -0
  220. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_language_consistency.py +0 -0
  221. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_lean_prover.py +0 -0
  222. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_lean_prover_runner.py +0 -0
  223. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_length.py +0 -0
  224. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_list_comparison_math_reward.py +0 -0
  225. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_math.py +0 -0
  226. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_minimal.py +0 -0
  227. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_models.py +0 -0
  228. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_models_rl.py +0 -0
  229. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_multiple_choice_math_reward.py +0 -0
  230. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_n_variant_batch_integration.py +0 -0
  231. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_n_variant_integration.py +0 -0
  232. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_openai_compatibility.py +0 -0
  233. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_openeval_integration.py +0 -0
  234. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_packaging.py +0 -0
  235. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_parallel_rollouts.py +0 -0
  236. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_platform_api.py +0 -0
  237. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_readiness.py +0 -0
  238. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reasoning_steps.py +0 -0
  239. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_repetition.py +0 -0
  240. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_repetition_debug.py +0 -0
  241. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reward_function.py +0 -0
  242. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reward_protocol_import.py +0 -0
  243. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_rl_processing.py +0 -0
  244. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_rollout_control_plane_integration.py +0 -0
  245. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_server.py +0 -0
  246. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_tag_count.py +0 -0
  247. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_typed_interface.py +0 -0
  248. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_typed_interface_rl.py +0 -0
  249. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_url_handling.py +0 -0
  250. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/__init__.py +0 -0
  251. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/__init__.py +0 -0
  252. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/base.py +0 -0
  253. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/llm_agent.py +0 -0
  254. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/__init__.py +0 -0
  255. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/api_config.py +0 -0
  256. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/data_model.py +0 -0
  257. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/simulation_service.py +0 -0
  258. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/cli.py +0 -0
  259. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/config.py +0 -0
  260. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/__init__.py +0 -0
  261. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/message.py +0 -0
  262. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/simulation.py +0 -0
  263. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/tasks.py +0 -0
  264. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/__init__.py +0 -0
  265. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/__init__.py +0 -0
  266. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/data_model.py +0 -0
  267. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/environment.py +0 -0
  268. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/tools.py +0 -0
  269. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/utils.py +0 -0
  270. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/__init__.py +0 -0
  271. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/data_model.py +0 -0
  272. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/environment.py +0 -0
  273. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/tools.py +0 -0
  274. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/utils.py +0 -0
  275. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/__init__.py +0 -0
  276. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/data_model.py +0 -0
  277. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/environment.py +0 -0
  278. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/tools.py +0 -0
  279. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/utils.py +0 -0
  280. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/__init__.py +0 -0
  281. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/data_model.py +0 -0
  282. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/environment.py +0 -0
  283. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  284. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  285. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  286. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  287. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  288. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  289. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  290. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  291. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tools.py +0 -0
  292. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  293. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  294. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/utils.py +0 -0
  295. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/__init__.py +0 -0
  296. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/db.py +0 -0
  297. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/environment.py +0 -0
  298. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/server.py +0 -0
  299. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/tool.py +0 -0
  300. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/toolkit.py +0 -0
  301. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  302. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/__init__.py +0 -0
  303. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator.py +0 -0
  304. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  305. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  306. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  307. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  308. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  309. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/__init__.py +0 -0
  310. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/agent_metrics.py +0 -0
  311. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  312. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/__init__.py +0 -0
  313. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  314. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  315. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/utils.py +0 -0
  316. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/registry.py +0 -0
  317. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/run.py +0 -0
  318. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/__init__.py +0 -0
  319. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/check_data.py +0 -0
  320. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  321. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/start_servers.py +0 -0
  322. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/view_simulations.py +0 -0
  323. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/__init__.py +0 -0
  324. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/base.py +0 -0
  325. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/user_simulator.py +0 -0
  326. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/__init__.py +0 -0
  327. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/display.py +0 -0
  328. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/io_utils.py +0 -0
  329. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/llm_utils.py +0 -0
  330. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/pydantic_utils.py +0 -0
  331. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/utils.py +0 -0
  332. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/versioneer.py +0 -0
  333. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  334. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.6.dev2
3
+ Version: 0.2.8
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-10T19:39:17-0700",
11
+ "date": "2025-08-11T22:02:14-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "a50c3f62fdb5be7347741446338d8c1771e92547",
15
- "version": "0.2.6-dev2"
14
+ "full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
15
+ "version": "0.2.8"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -289,6 +289,7 @@ def parse_args(args=None):
289
289
 
290
290
  # Logs command
291
291
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
292
+ logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
292
293
 
293
294
  # Run command (for Hydra-based evaluations)
294
295
  # This subparser intentionally defines no arguments itself.
@@ -11,15 +11,16 @@ from ..utils.logs_server import serve_logs
11
11
  def logs_command(args):
12
12
  """Serve logs with file watching and real-time updates"""
13
13
 
14
+ port = args.port
14
15
  print(f"🚀 Starting Eval Protocol Logs Server")
15
- print(f"🌐 URL: http://localhost:8000")
16
- print(f"🔌 WebSocket: ws://localhost:8000/ws")
16
+ print(f"🌐 URL: http://localhost:{port}")
17
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
17
18
  print(f"👀 Watching paths: {['current directory']}")
18
19
  print("Press Ctrl+C to stop the server")
19
20
  print("-" * 50)
20
21
 
21
22
  try:
22
- serve_logs()
23
+ serve_logs(port=args.port)
23
24
  return 0
24
25
  except KeyboardInterrupt:
25
26
  print("\n🛑 Server stopped by user")
@@ -22,9 +22,8 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
22
22
  self._store = SqliteEvaluationRowStore(self.db_path)
23
23
 
24
24
  def log(self, row: "EvaluationRow") -> None:
25
- row_id = row.input_metadata.row_id
26
25
  data = row.model_dump(exclude_none=True, mode="json")
27
- self._store.upsert_row(row_id=row_id, data=data)
26
+ self._store.upsert_row(data=data)
28
27
  try:
29
28
  event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
30
29
  except Exception as e:
@@ -32,8 +31,8 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
32
31
  logger.error(f"Failed to emit row_upserted event: {e}")
33
32
  pass
34
33
 
35
- def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
34
+ def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
36
35
  from eval_protocol.models import EvaluationRow
37
36
 
38
- results = self._store.read_rows(row_id=row_id)
37
+ results = self._store.read_rows(rollout_id=rollout_id)
39
38
  return [EvaluationRow(**data) for data in results]
@@ -11,7 +11,7 @@ class SqliteEvaluationRowStore:
11
11
  """
12
12
  Lightweight reusable SQLite store for evaluation rows.
13
13
 
14
- Stores arbitrary row data as JSON keyed by a unique string `row_id`.
14
+ Stores arbitrary row data as JSON keyed by a unique string `rollout_id`.
15
15
  """
16
16
 
17
17
  def __init__(self, db_path: str):
@@ -24,7 +24,7 @@ class SqliteEvaluationRowStore:
24
24
  database = self._db
25
25
 
26
26
  class EvaluationRow(BaseModel): # type: ignore
27
- row_id = CharField(unique=True)
27
+ rollout_id = CharField(unique=True)
28
28
  data = JSONField()
29
29
 
30
30
  self._EvaluationRow = EvaluationRow
@@ -36,22 +36,25 @@ class SqliteEvaluationRowStore:
36
36
  def db_path(self) -> str:
37
37
  return self._db_path
38
38
 
39
- def upsert_row(self, row_id: str, data: dict) -> None:
40
- if self._EvaluationRow.select().where(self._EvaluationRow.row_id == row_id).exists():
41
- self._EvaluationRow.update(data=data).where(self._EvaluationRow.row_id == row_id).execute()
39
+ def upsert_row(self, data: dict) -> None:
40
+ rollout_id = data["execution_metadata"]["rollout_id"]
41
+ if rollout_id is None:
42
+ raise ValueError("execution_metadata.rollout_id is required to upsert a row")
43
+ if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
44
+ self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
42
45
  else:
43
- self._EvaluationRow.create(row_id=row_id, data=data)
46
+ self._EvaluationRow.create(rollout_id=rollout_id, data=data)
44
47
 
45
- def read_rows(self, row_id: Optional[str] = None) -> List[dict]:
46
- if row_id is None:
48
+ def read_rows(self, rollout_id: Optional[str] = None) -> List[dict]:
49
+ if rollout_id is None:
47
50
  query = self._EvaluationRow.select().dicts()
48
51
  else:
49
- query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.row_id == row_id)
52
+ query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.rollout_id == rollout_id)
50
53
  results = list(query)
51
54
  return [result["data"] for result in results]
52
55
 
53
- def delete_row(self, row_id: str) -> int:
54
- return self._EvaluationRow.delete().where(self._EvaluationRow.row_id == row_id).execute()
56
+ def delete_row(self, rollout_id: str) -> int:
57
+ return self._EvaluationRow.delete().where(self._EvaluationRow.rollout_id == rollout_id).execute()
55
58
 
56
59
  def delete_all_rows(self) -> int:
57
60
  return self._EvaluationRow.delete().execute()
@@ -158,8 +158,8 @@ class ExecutionManager:
158
158
  messages.append(Message.model_validate(msg_dict))
159
159
 
160
160
  evaluation_rows[idx].messages = messages
161
- evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
- evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
161
+ # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
+ # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
163
163
  evaluation_rows[idx].tools = shared_tool_schema
164
164
  evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
165
165
  evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
@@ -482,11 +482,11 @@ class ExecutionManager:
482
482
  trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
483
483
  try:
484
484
  await envs.connection_manager.reset_session(session)
485
- except:
485
+ except: # noqa: E722
486
486
  logger.error(f"Error resetting session {session.session_id}")
487
487
  try:
488
488
  await envs.connection_manager.close_session(session)
489
- except:
489
+ except: # noqa: E722
490
490
  logger.error(f"Error closing session {session.session_id}")
491
491
  return trajectory
492
492
 
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
202
202
  )
203
203
 
204
204
 
205
+ class EvaluationThreshold(BaseModel):
206
+ """Threshold configuration for evaluation tests.
207
+
208
+ The success field is required - tests must specify a minimum success rate.
209
+ The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
210
+ """
211
+
212
+ success: float = Field(
213
+ ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
214
+ )
215
+ standard_deviation: Optional[float] = Field(
216
+ None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
217
+ )
218
+
219
+
205
220
  class EvalMetadata(BaseModel):
206
221
  """Metadata about the evaluation that was run."""
207
222
 
@@ -216,10 +231,36 @@ class EvalMetadata(BaseModel):
216
231
  )
217
232
  num_runs: int = Field(..., description="Number of times the evaluation was repeated")
218
233
  aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
219
- threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
234
+ passed_threshold: Optional[EvaluationThreshold] = Field(
235
+ None, description="Threshold configuration for test success"
236
+ )
220
237
  passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
221
238
 
222
239
 
240
+ class ExecutionMetadata(BaseModel):
241
+ """Metadata about the execution of the evaluation."""
242
+
243
+ invocation_id: Optional[str] = Field(
244
+ default_factory=generate_id,
245
+ description="The ID of the invocation that this row belongs to.",
246
+ )
247
+
248
+ experiment_id: Optional[str] = Field(
249
+ default_factory=generate_id,
250
+ description="The ID of the experiment that this row belongs to.",
251
+ )
252
+
253
+ rollout_id: Optional[str] = Field(
254
+ default_factory=generate_id,
255
+ description="The ID of the rollout that this row belongs to.",
256
+ )
257
+
258
+ run_id: Optional[str] = Field(
259
+ None,
260
+ description=("The ID of the run that this row belongs to."),
261
+ )
262
+
263
+
223
264
  class RolloutStatus(BaseModel):
224
265
  """Status of the rollout."""
225
266
 
@@ -264,26 +305,6 @@ class EvaluationRow(BaseModel):
264
305
  description="The status of the rollout.",
265
306
  )
266
307
 
267
- invocation_id: Optional[str] = Field(
268
- default_factory=generate_id,
269
- description="The ID of the invocation that this row belongs to.",
270
- )
271
-
272
- cohort_id: Optional[str] = Field(
273
- default_factory=generate_id,
274
- description="The ID of the cohort that this row belongs to.",
275
- )
276
-
277
- rollout_id: Optional[str] = Field(
278
- default_factory=generate_id,
279
- description="The ID of the rollout that this row belongs to.",
280
- )
281
-
282
- run_id: Optional[str] = Field(
283
- None,
284
- description=("The ID of the run that this row belongs to."),
285
- )
286
-
287
308
  # Ground truth reference (moved from EvaluateResult to top level)
288
309
  ground_truth: Optional[str] = Field(
289
310
  default=None, description="Optional ground truth reference for this evaluation."
@@ -294,6 +315,11 @@ class EvaluationRow(BaseModel):
294
315
  default=None, description="The evaluation result for this row/trajectory."
295
316
  )
296
317
 
318
+ execution_metadata: ExecutionMetadata = Field(
319
+ default_factory=ExecutionMetadata,
320
+ description="Metadata about the execution of the evaluation.",
321
+ )
322
+
297
323
  # LLM usage statistics
298
324
  usage: Optional[CompletionUsage] = Field(
299
325
  default=None, description="Token usage statistics from LLM calls during execution."
@@ -8,7 +8,7 @@ from openai import NOT_GIVEN, NotGiven
8
8
  from openai.types.chat import ChatCompletionContentPartTextParam, ChatCompletionMessage, ChatCompletionToolParam
9
9
  from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
10
10
 
11
- from eval_protocol.dataset_logger import default_logger
11
+ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
12
12
  from eval_protocol.mcp.execution.policy import LiteLLMPolicy
13
13
  from eval_protocol.mcp.mcp_multi_client import MCPMultiClient
14
14
  from eval_protocol.models import EvaluationRow, Message
@@ -20,12 +20,13 @@ class Agent:
20
20
  A really simple agent that calls the model until no more tool calls are needed.
21
21
  """
22
22
 
23
- def __init__(self, model: str, row: EvaluationRow, config_path: str):
23
+ def __init__(self, model: str, row: EvaluationRow, config_path: str, logger: DatasetLogger):
24
24
  self.model = model
25
25
  self.evaluation_row: EvaluationRow = row
26
26
  self._policy = LiteLLMPolicy(model_id=model)
27
27
  self.mcp_client = MCPMultiClient(config_path=config_path) if config_path else None
28
28
  self.tools: Union[List[ChatCompletionToolParam], NotGiven] = NOT_GIVEN
29
+ self.logger: DatasetLogger = logger
29
30
 
30
31
  async def setup(self):
31
32
  if self.mcp_client:
@@ -42,7 +43,7 @@ class Agent:
42
43
 
43
44
  def append_message_and_log(self, message: Message):
44
45
  self.messages.append(message)
45
- default_logger.log(self.evaluation_row)
46
+ self.logger.log(self.evaluation_row)
46
47
 
47
48
  async def call_agent(self) -> str:
48
49
  """
@@ -116,7 +117,7 @@ async def default_agent_rollout_processor(
116
117
  ) -> List[EvaluationRow]:
117
118
  dataset: Dataset = []
118
119
  for row in rows:
119
- agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path)
120
+ agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
120
121
  await agent.setup()
121
122
  await agent.call_agent()
122
123
  dataset.append(agent.evaluation_row)
@@ -1,11 +1,9 @@
1
1
  import asyncio
2
- from typing import List
3
-
4
2
  import logging
5
3
  import os
4
+ from typing import List
6
5
 
7
- from eval_protocol.dataset_logger import default_logger
8
- from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall
6
+ from eval_protocol.models import ChatCompletionMessageToolCall, EvaluationRow, Message
9
7
  from eval_protocol.pytest.types import RolloutProcessorConfig
10
8
 
11
9
 
@@ -49,6 +47,7 @@ async def default_single_turn_rollout_processor(
49
47
 
50
48
  # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
51
49
  import importlib
50
+
52
51
  _litellm = importlib.import_module("litellm")
53
52
  acompletion = getattr(_litellm, "acompletion")
54
53
  response = await acompletion(**request_params)
@@ -79,7 +78,7 @@ async def default_single_turn_rollout_processor(
79
78
  ]
80
79
 
81
80
  row.messages = messages
82
- default_logger.log(row)
81
+ config.logger.log(row)
83
82
  return row
84
83
 
85
84
  # Process rows with bounded concurrency if configured
@@ -3,13 +3,21 @@ import inspect
3
3
  import math
4
4
  import os
5
5
  import statistics
6
- from typing import Any, Callable, Dict, List, Literal, Optional
6
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
7
7
 
8
8
  import pytest
9
9
 
10
10
  from eval_protocol.dataset_logger import default_logger
11
+ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
11
12
  from eval_protocol.human_id import generate_id
12
- from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata, Message
13
+ from eval_protocol.models import (
14
+ CompletionParams,
15
+ EvalMetadata,
16
+ EvaluationRow,
17
+ EvaluationThreshold,
18
+ InputMetadata,
19
+ Message,
20
+ )
13
21
  from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
14
22
  from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
15
23
  from eval_protocol.pytest.types import (
@@ -46,7 +54,7 @@ def evaluation_test( # noqa: C901
46
54
  rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
47
55
  evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
48
56
  aggregation_method: AggregationMethod = "mean",
49
- threshold_of_success: Optional[float] = None,
57
+ passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
50
58
  num_runs: int = 1,
51
59
  max_dataset_rows: Optional[int] = None,
52
60
  mcp_config_path: Optional[str] = None,
@@ -55,6 +63,7 @@ def evaluation_test( # noqa: C901
55
63
  steps: int = 30,
56
64
  mode: EvaluationTestMode = "batch",
57
65
  combine_datasets: bool = True,
66
+ logger: Optional[DatasetLogger] = None,
58
67
  ) -> Callable[
59
68
  [TestFunction],
60
69
  TestFunction,
@@ -64,14 +73,14 @@ def evaluation_test( # noqa: C901
64
73
  Here are some key concepts to understand the terminology in EP:
65
74
 
66
75
  - "invocation" is a single execution of a test function. An invocation can
67
- generate 1 or more cohorts. Grouping by invocation might be useful to
76
+ generate 1 or more experiments. Grouping by invocation might be useful to
68
77
  aggregate eval scores across multiple invocations when you want to aggregate
69
78
  scores across multiple datasets.
70
- - "cohort" is a group of runs with for a combination of parameters. A single
71
- cohort will have multiple runs if num_runs > 1.
79
+ - "experiment" is a group of runs with for a combination of parameters. A single
80
+ experiment will have multiple runs if num_runs > 1.
72
81
  1. If your evaluation_test has combinations of parameters, it will generate
73
- multiple cohorts per combination of parameters.
74
- 2. A new execution of a test function will generate a new cohort.
82
+ multiple experiments per combination of parameters.
83
+ 2. A new execution of a test function will generate a new experiment.
75
84
  - "run" is a group of rollouts. For multiple num_runs > 1, there will be
76
85
  multiple "run_id"s.
77
86
  - "rollout" is the execution/process that produces a "trajectory". You
@@ -89,7 +98,7 @@ def evaluation_test( # noqa: C901
89
98
  decorated test. It simply produces a score from 0 to 1 and attached it
90
99
  to the row as the "evaluation_result" field.
91
100
 
92
- "invocation", "cohort", "run", "rollout", and "row" each have a unique ID
101
+ "invocation", "experiment", "run", "rollout", and "row" each have a unique ID
93
102
  which can be used to easily group and identify your dataset by.
94
103
 
95
104
  Args:
@@ -106,8 +115,8 @@ def evaluation_test( # noqa: C901
106
115
  rollout_processor: Function used to perform the rollout.
107
116
  evaluation_test_kwargs: Kwargs for the evaluation function.
108
117
  aggregation_method: How to aggregate scores across rows.
109
- threshold_of_success: If set, fail the test if the aggregated score is
110
- below this threshold.
118
+ passed_threshold: Threshold configuration for test success.
119
+ Success rate must be above success, and if set, standard deviation must be below standard_deviation.
111
120
  num_runs: Number of times to repeat the rollout and evaluations.
112
121
  max_dataset_rows: Limit dataset to the first N rows.
113
122
  mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -117,11 +126,22 @@ def evaluation_test( # noqa: C901
117
126
  mode: Evaluation mode. "batch" (default) expects test function to handle
118
127
  full dataset. "pointwise" applies test function to each row. If your evaluation requires
119
128
  the full rollout of all rows to compute the score, use
129
+ logger: DatasetLogger to use for logging. If not provided, a default logger will be used.
120
130
  """
121
131
 
132
+ active_logger: DatasetLogger = logger if logger else default_logger
133
+
122
134
  def decorator(
123
135
  test_func: TestFunction,
124
136
  ):
137
+ if passed_threshold is not None:
138
+ if isinstance(passed_threshold, float):
139
+ threshold = EvaluationThreshold(success=passed_threshold)
140
+ else:
141
+ threshold = EvaluationThreshold(**passed_threshold)
142
+ else:
143
+ threshold = None
144
+
125
145
  sig = inspect.signature(test_func)
126
146
 
127
147
  # For pointwise/rowwise mode, we expect a different signature
@@ -280,14 +300,14 @@ def evaluation_test( # noqa: C901
280
300
  def wrapper_body(**kwargs):
281
301
  model_name = kwargs["model"]
282
302
  eval_metadata = None
283
- all_results: List[EvaluationRow] = []
303
+ all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
284
304
 
285
- cohort_id = generate_id()
305
+ experiment_id = generate_id()
286
306
 
287
307
  def _log_eval_error(
288
308
  status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
289
309
  ) -> None:
290
- log_eval_status_and_rows(eval_metadata, rows, status, passed, default_logger)
310
+ log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger)
291
311
 
292
312
  try:
293
313
  # Handle dataset loading
@@ -341,7 +361,7 @@ def evaluation_test( # noqa: C901
341
361
  status="running",
342
362
  num_runs=num_runs,
343
363
  aggregation_method=aggregation_method,
344
- threshold_of_success=threshold_of_success,
364
+ passed_threshold=threshold,
345
365
  passed=None,
346
366
  )
347
367
 
@@ -363,13 +383,12 @@ def evaluation_test( # noqa: C901
363
383
  row.input_metadata.session_data["mode"] = mode
364
384
  # Initialize eval_metadata for each row
365
385
  row.eval_metadata = eval_metadata
366
- row.cohort_id = cohort_id
367
- row.invocation_id = invocation_id
386
+ row.execution_metadata.experiment_id = experiment_id
387
+ row.execution_metadata.invocation_id = invocation_id
368
388
 
369
389
  # has to be done in the pytest main process since it's
370
390
  # used to determine whether this eval has stopped
371
391
  row.pid = os.getpid()
372
- default_logger.log(row)
373
392
 
374
393
  # Prepare rollout processor config once; we will generate fresh outputs per run
375
394
  config = RolloutProcessorConfig(
@@ -379,21 +398,26 @@ def evaluation_test( # noqa: C901
379
398
  max_concurrent_rollouts=max_concurrent_rollouts,
380
399
  server_script_path=server_script_path,
381
400
  steps=steps,
401
+ logger=active_logger,
382
402
  )
383
403
 
384
- for _ in range(num_runs):
404
+ for i in range(num_runs):
385
405
  # Regenerate outputs each run by deep-copying the pristine dataset
386
406
  # so model responses are not reused across runs.
387
407
  run_id = generate_id()
388
- fresh_dataset = [copy.deepcopy(r) for r in data]
408
+ fresh_dataset = [r.model_copy(deep=True) for r in data]
389
409
 
390
410
  # apply new run_id to fresh_dataset
391
411
  for row in fresh_dataset:
392
- row.run_id = run_id
412
+ row.execution_metadata.run_id = run_id
393
413
 
394
414
  # generate new rollout_id for each row
395
415
  for row in fresh_dataset:
396
- row.rollout_id = generate_id()
416
+ row.execution_metadata.rollout_id = generate_id()
417
+
418
+ # log the fresh_dataset
419
+ for row in fresh_dataset:
420
+ active_logger.log(row)
397
421
 
398
422
  processed_dataset = execute_function(rollout_processor, rows=fresh_dataset, config=config)
399
423
 
@@ -409,7 +433,7 @@ def evaluation_test( # noqa: C901
409
433
  raise ValueError(
410
434
  f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
411
435
  )
412
- all_results.append(result)
436
+ all_results[i].append(result)
413
437
  else:
414
438
  # Batch mode: call the test function with the full dataset
415
439
  results = execute_with_params(
@@ -433,17 +457,21 @@ def evaluation_test( # noqa: C901
433
457
  raise ValueError(
434
458
  f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
435
459
  )
436
- all_results.extend(results)
460
+ all_results[i] = results
437
461
 
438
- scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
462
+ scores = [
463
+ sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
464
+ for result in all_results
465
+ ]
439
466
  agg_score = aggregate(scores, aggregation_method)
467
+ score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
440
468
 
441
469
  # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
442
470
  ci_low: float | None = None
443
471
  ci_high: float | None = None
444
472
  if aggregation_method == "mean":
445
473
  try:
446
- result_ci = compute_fixed_set_mu_ci(all_results)
474
+ result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
447
475
  mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
448
476
  if mu_ci_low is not None and mu_ci_high is not None:
449
477
  ci_low = float(mu_ci_low)
@@ -455,15 +483,24 @@ def evaluation_test( # noqa: C901
455
483
 
456
484
  # Determine if the evaluation passed based on threshold
457
485
  passed = None
458
- if threshold_of_success is not None:
459
- passed = agg_score >= threshold_of_success
486
+
487
+ if threshold is not None:
488
+ success_passed, std_passed = True, True
489
+
490
+ success_passed = agg_score >= threshold.success
491
+
492
+ if threshold.standard_deviation is not None:
493
+ std_passed = score_std <= threshold.standard_deviation
494
+
495
+ passed = success_passed and std_passed
460
496
 
461
497
  # Update eval metadata status and passed field for all results
462
- for r in all_results:
463
- if r.eval_metadata is not None:
464
- r.eval_metadata.status = "finished"
465
- r.eval_metadata.passed = passed
466
- default_logger.log(r)
498
+ for result in all_results:
499
+ for r in result:
500
+ if r.eval_metadata is not None:
501
+ r.eval_metadata.status = "finished"
502
+ r.eval_metadata.passed = passed
503
+ active_logger.log(r)
467
504
 
468
505
  # Optional: print and/or persist a summary artifact for CI
469
506
  try:
@@ -471,7 +508,7 @@ def evaluation_test( # noqa: C901
471
508
  summary_path = os.getenv("EP_SUMMARY_JSON")
472
509
  suite_name = test_func.__name__
473
510
  model_used = model_name
474
- total_rows = len(all_results)
511
+ total_rows = len([item for sublist in all_results for item in sublist])
475
512
  summary_obj = {
476
513
  "suite": suite_name,
477
514
  "model": model_used,
@@ -488,7 +525,7 @@ def evaluation_test( # noqa: C901
488
525
  from collections import defaultdict
489
526
 
490
527
  metric_scores: Dict[str, list] = defaultdict(list)
491
- for r in all_results:
528
+ for r in [item for sublist in all_results for item in sublist]:
492
529
  if r.evaluation_result and r.evaluation_result.metrics:
493
530
  for m_name, m_res in r.evaluation_result.metrics.items():
494
531
  if m_res is not None and getattr(m_res, "score", None) is not None:
@@ -587,11 +624,32 @@ def evaluation_test( # noqa: C901
587
624
  # Do not fail evaluation if summary writing fails
588
625
  pass
589
626
 
627
+ # # Write all rows from active_logger.read() to a JSONL file in the same directory as the summary
628
+ # try:
629
+ # if active_logger is not None:
630
+ # rows = active_logger.read()
631
+ # # Write to a .jsonl file alongside the summary file
632
+ # jsonl_path = "logs.jsonl"
633
+ # import json
634
+
635
+ # with open(jsonl_path, "w", encoding="utf-8") as f_jsonl:
636
+ # for row in rows:
637
+ # json.dump(row.model_dump(exclude_none=True, mode="json"), f_jsonl)
638
+ # f_jsonl.write("\n")
639
+ # except Exception as e:
640
+ # # Do not fail evaluation if log writing fails
641
+ # print(e)
642
+ # pass
643
+
590
644
  # Check threshold after logging
591
- if threshold_of_success is not None and not passed:
645
+ if threshold is not None and not passed:
592
646
  assert (
593
- agg_score >= threshold_of_success
594
- ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
647
+ agg_score >= threshold.success
648
+ ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
649
+ if threshold.standard_deviation is not None:
650
+ assert (
651
+ score_std <= threshold.standard_deviation
652
+ ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
595
653
 
596
654
  except AssertionError:
597
655
  _log_eval_error("finished", data if "data" in locals() else None, passed=False)
@@ -5,6 +5,9 @@ Parameter types
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Callable, Dict, List, Literal, Optional
7
7
 
8
+ from eval_protocol.dataset_logger import default_logger
9
+ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
10
+
8
11
  from ..models import EvaluationRow, Message
9
12
 
10
13
  ModelParam = str # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
@@ -39,10 +42,13 @@ Rollout processor types
39
42
  class RolloutProcessorConfig:
40
43
  model: ModelParam
41
44
  input_params: RolloutInputParam # optional input parameters for inference
42
- mcp_config_path: str
43
- server_script_path: Optional[str] = None # TODO: change from server_script_path to mcp_config_path for agent rollout processor
45
+ mcp_config_path: str
46
+ server_script_path: Optional[str] = (
47
+ None # TODO: change from server_script_path to mcp_config_path for agent rollout processor
48
+ )
44
49
  max_concurrent_rollouts: int = 8 # maximum number of concurrent rollouts
45
50
  steps: int = 30 # max number of rollout steps
51
+ logger: DatasetLogger = default_logger # logger to use during rollout for mid-rollout logs
46
52
 
47
53
 
48
54
  RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[EvaluationRow]]