eval-protocol 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301) hide show
  1. {eval_protocol-0.2.1/eval_protocol.egg-info → eval_protocol-0.2.3}/PKG-INFO +48 -3
  2. eval_protocol-0.2.3/README.md +69 -0
  3. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_agent_rollout_processor.py +34 -8
  5. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +37 -7
  6. {eval_protocol-0.2.1 → eval_protocol-0.2.3/eval_protocol.egg-info}/PKG-INFO +48 -3
  7. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/pyproject.toml +1 -0
  8. eval_protocol-0.2.1/README.md +0 -24
  9. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/LICENSE +0 -0
  10. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/__init__.py +0 -0
  11. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/normalize_sandbox_fusion.py +0 -0
  12. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/utils/__init__.py +0 -0
  13. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/utils/generate_api_key.py +0 -0
  14. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/utils/subprocess_manager.py +0 -0
  15. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/__init__.py +0 -0
  16. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/__main__.py +0 -0
  17. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/__init__.py +0 -0
  18. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/braintrust.py +0 -0
  19. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/huggingface.py +0 -0
  20. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/langfuse.py +0 -0
  21. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/trl.py +0 -0
  22. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/__init__.py +0 -0
  23. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/models.py +0 -0
  24. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/orchestrator.py +0 -0
  25. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resource_abc.py +0 -0
  26. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resource_pool.py +0 -0
  27. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/__init__.py +0 -0
  28. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  29. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  30. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  31. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  32. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  33. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/docker_resource.py +0 -0
  34. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  35. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  36. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  37. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  38. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/sql_resource.py +0 -0
  39. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/task_manager.py +0 -0
  40. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/tool_registry.py +0 -0
  41. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/auth.py +0 -0
  42. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli.py +0 -0
  43. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/__init__.py +0 -0
  44. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  45. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/common.py +0 -0
  46. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/deploy.py +0 -0
  47. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  48. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/preview.py +0 -0
  49. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  50. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/common_utils.py +0 -0
  51. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/config.py +0 -0
  52. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/datasets/__init__.py +0 -0
  53. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/datasets/loader.py +0 -0
  54. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/evaluation.py +0 -0
  55. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/execution/__init__.py +0 -0
  56. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/execution/pipeline.py +0 -0
  57. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/gcp_tools.py +0 -0
  58. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generation/cache.py +0 -0
  59. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generation/clients/base.py +0 -0
  60. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generation/clients.py +0 -0
  61. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generic_server.py +0 -0
  62. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/__init__.py +0 -0
  63. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/braintrust.py +0 -0
  64. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/deepeval.py +0 -0
  65. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/openeval.py +0 -0
  66. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/trl.py +0 -0
  67. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/__init__.py +0 -0
  68. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/adapter.py +0 -0
  69. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/client/__init__.py +0 -0
  70. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/client/connection.py +0 -0
  71. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/clients.py +0 -0
  72. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/__init__.py +0 -0
  73. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/base_policy.py +0 -0
  74. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/manager.py +0 -0
  75. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/policy.py +0 -0
  76. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/grid_renderer.py +0 -0
  77. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  78. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/mcpgym.py +0 -0
  79. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/process_manager.py +0 -0
  80. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/session/__init__.py +0 -0
  81. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/session/manager.py +0 -0
  82. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/simple_process_manager.py +0 -0
  83. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/simulation_server.py +0 -0
  84. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/__init__.py +0 -0
  85. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/config.py +0 -0
  86. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  87. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/main.py +0 -0
  88. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  89. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  90. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  91. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  92. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  93. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/session.py +0 -0
  94. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_env.py +0 -0
  95. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/models.py +0 -0
  96. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/packaging.py +0 -0
  97. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/platform_api.py +0 -0
  98. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/playback_policy.py +0 -0
  99. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/__init__.py +0 -0
  100. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  101. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  102. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  103. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/evaluation_test.py +0 -0
  104. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/types.py +0 -0
  105. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/utils.py +0 -0
  106. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/resources.py +0 -0
  107. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/reward_function.py +0 -0
  108. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/__init__.py +0 -0
  109. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/accuracy.py +0 -0
  110. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/accuracy_length.py +0 -0
  111. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  112. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  113. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/apps_testing_util.py +0 -0
  114. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/bfcl_reward.py +0 -0
  115. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/code_execution.py +0 -0
  116. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/code_execution_utils.py +0 -0
  117. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/cpp_code.py +0 -0
  118. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  119. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/format.py +0 -0
  120. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/function_calling.py +0 -0
  121. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/json_schema.py +0 -0
  122. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/language_consistency.py +0 -0
  123. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/lean_prover.py +0 -0
  124. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/length.py +0 -0
  125. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  126. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/math.py +0 -0
  127. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  128. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/reasoning_steps.py +0 -0
  129. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/repetition.py +0 -0
  130. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/tag_count.py +0 -0
  131. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rl_processing.py +0 -0
  132. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/server.py +0 -0
  133. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/typed_interface.py +0 -0
  134. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/types/__init__.py +0 -0
  135. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/types/types.py +0 -0
  136. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/__init__.py +0 -0
  137. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/batch_evaluation.py +0 -0
  138. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/batch_transformation.py +0 -0
  139. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/dataset_helpers.py +0 -0
  140. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/module_loader.py +0 -0
  141. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/packaging_utils.py +0 -0
  142. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/static_policy.py +0 -0
  143. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/SOURCES.txt +0 -0
  144. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/dependency_links.txt +0 -0
  145. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/entry_points.txt +0 -0
  146. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/requires.txt +0 -0
  147. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/top_level.txt +0 -0
  148. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/setup.cfg +0 -0
  149. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/setup.py +0 -0
  150. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_accuracy.py +0 -0
  151. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_accuracy_length.py +0 -0
  152. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_adapters_e2e.py +0 -0
  153. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_agent_orchestrator.py +0 -0
  154. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_agent_resources.py +0 -0
  155. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_auth.py +0 -0
  156. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_batch_evaluation.py +0 -0
  157. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_braintrust_adapter.py +0 -0
  158. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_braintrust_example.py +0 -0
  159. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cli.py +0 -0
  160. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cli_agent.py +0 -0
  161. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cli_args.py +0 -0
  162. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_code_execution.py +0 -0
  163. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_config.py +0 -0
  164. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_control_plane_separation.py +0 -0
  165. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cpp_code.py +0 -0
  166. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_data_driven_task_manager.py +0 -0
  167. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_deepcoder_reward.py +0 -0
  168. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_deepeval_integration.py +0 -0
  169. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_deploy_integration.py +0 -0
  170. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_e2b_integration.py +0 -0
  171. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_e2b_js_integration.py +0 -0
  172. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_edge_cases.py +0 -0
  173. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_eval_protocol_import.py +0 -0
  174. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_evaluation.py +0 -0
  175. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_evaluation_integration.py +0 -0
  176. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_evaluation_preview_integration.py +0 -0
  177. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_examples_end_to_end.py +0 -0
  178. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_fireworks_api.py +0 -0
  179. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_format.py +0 -0
  180. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_fractional_code.py +0 -0
  181. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_frozen_lake_http_server.py +0 -0
  182. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  183. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_function_calling.py +0 -0
  184. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_gcp_tools.py +0 -0
  185. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_generic_server.py +0 -0
  186. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_integration.py +0 -0
  187. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_json_schema.py +0 -0
  188. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_kwargs_validation.py +0 -0
  189. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_language_consistency.py +0 -0
  190. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_lean_prover.py +0 -0
  191. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_lean_prover_runner.py +0 -0
  192. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_length.py +0 -0
  193. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_list_comparison_math_reward.py +0 -0
  194. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_math.py +0 -0
  195. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_minimal.py +0 -0
  196. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_models.py +0 -0
  197. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_models_rl.py +0 -0
  198. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_multiple_choice_math_reward.py +0 -0
  199. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_n_variant_batch_integration.py +0 -0
  200. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_n_variant_integration.py +0 -0
  201. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_openai_compatibility.py +0 -0
  202. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_openeval_integration.py +0 -0
  203. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_packaging.py +0 -0
  204. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_parallel_rollouts.py +0 -0
  205. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_platform_api.py +0 -0
  206. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_readiness.py +0 -0
  207. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_reasoning_steps.py +0 -0
  208. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_repetition.py +0 -0
  209. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_repetition_debug.py +0 -0
  210. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_reward_function.py +0 -0
  211. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_reward_protocol_import.py +0 -0
  212. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_rl_processing.py +0 -0
  213. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_rollout_control_plane_integration.py +0 -0
  214. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_server.py +0 -0
  215. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_tag_count.py +0 -0
  216. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_typed_interface.py +0 -0
  217. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_typed_interface_rl.py +0 -0
  218. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_url_handling.py +0 -0
  219. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/__init__.py +0 -0
  220. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/agent/__init__.py +0 -0
  221. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/agent/base.py +0 -0
  222. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/agent/llm_agent.py +0 -0
  223. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/__init__.py +0 -0
  224. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/api_config.py +0 -0
  225. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/data_model.py +0 -0
  226. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/simulation_service.py +0 -0
  227. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/cli.py +0 -0
  228. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/config.py +0 -0
  229. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/__init__.py +0 -0
  230. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/message.py +0 -0
  231. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/simulation.py +0 -0
  232. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/tasks.py +0 -0
  233. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/__init__.py +0 -0
  234. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/__init__.py +0 -0
  235. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/data_model.py +0 -0
  236. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/environment.py +0 -0
  237. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/tools.py +0 -0
  238. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/utils.py +0 -0
  239. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/__init__.py +0 -0
  240. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/data_model.py +0 -0
  241. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/environment.py +0 -0
  242. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/tools.py +0 -0
  243. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/utils.py +0 -0
  244. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/__init__.py +0 -0
  245. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/data_model.py +0 -0
  246. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/environment.py +0 -0
  247. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/tools.py +0 -0
  248. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/utils.py +0 -0
  249. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/__init__.py +0 -0
  250. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/data_model.py +0 -0
  251. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/environment.py +0 -0
  252. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  253. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  254. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  255. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  256. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  257. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  258. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  259. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  260. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tools.py +0 -0
  261. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  262. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  263. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/utils.py +0 -0
  264. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/__init__.py +0 -0
  265. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/db.py +0 -0
  266. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/environment.py +0 -0
  267. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/server.py +0 -0
  268. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/tool.py +0 -0
  269. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/toolkit.py +0 -0
  270. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  271. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/__init__.py +0 -0
  272. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator.py +0 -0
  273. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  274. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  275. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  276. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  277. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  278. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/metrics/__init__.py +0 -0
  279. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/metrics/agent_metrics.py +0 -0
  280. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  281. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/__init__.py +0 -0
  282. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  283. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  284. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/utils.py +0 -0
  285. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/registry.py +0 -0
  286. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/run.py +0 -0
  287. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/__init__.py +0 -0
  288. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/check_data.py +0 -0
  289. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  290. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/start_servers.py +0 -0
  291. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/view_simulations.py +0 -0
  292. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/user/__init__.py +0 -0
  293. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/user/base.py +0 -0
  294. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/user/user_simulator.py +0 -0
  295. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/__init__.py +0 -0
  296. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/display.py +0 -0
  297. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/io_utils.py +0 -0
  298. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/llm_utils.py +0 -0
  299. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/pydantic_utils.py +0 -0
  300. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/utils.py +0 -0
  301. {eval_protocol-0.2.1 → eval_protocol-0.2.3}/versioneer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: Apache-2.0
@@ -96,8 +96,53 @@ Dynamic: license-file
96
96
 
97
97
  [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
98
98
 
99
- EP is an open protocol that standardizes how developers author evals for large
100
- language model (LLM) applications.
99
+ EP is an open specification, Python SDK, and pytest wrapper that provides a
100
+ standardized way to write evaluations for large language model (LLM)
101
+ applications. Start with simple single-turn evals for model selection and prompt
102
+ engineering, then scale up to complex multi-turn reinforcement learning (RL) for
103
+ agents using Model Context Protocol (MCP). EP ensures consistent patterns for
104
+ writing evals, storing traces, and saving results—enabling you to build
105
+ sophisticated agent evaluations that work across real-world scenarios, from
106
+ markdown generation tasks to customer service agents with tool calling
107
+ capabilities.
108
+
109
+ ## Quick Example
110
+
111
+ Here's a simple test function that checks if a model's response contains **bold** text formatting:
112
+
113
+ ```python test_bold_format.py
114
+ from eval_protocol.models import EvaluateResult, EvaluationRow, Message
115
+ from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
116
+
117
+ @evaluation_test(
118
+ input_messages=[
119
+ [
120
+ Message(role="system", content="You are a helpful assistant. Use bold text to highlight important information."),
121
+ Message(role="user", content="Explain why **evaluations** matter for building AI agents. Make it dramatic!"),
122
+ ],
123
+ ],
124
+ model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
125
+ rollout_processor=default_single_turn_rollout_processor,
126
+ mode="pointwise",
127
+ )
128
+ def test_bold_format(row: EvaluationRow) -> EvaluationRow:
129
+ """
130
+ Simple evaluation that checks if the model's response contains bold text.
131
+ """
132
+
133
+ assistant_response = row.messages[-1].content
134
+
135
+ # Check if response contains **bold** text
136
+ has_bold = "**" in assistant_response
137
+
138
+ if has_bold:
139
+ result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
140
+ else:
141
+ result = EvaluateResult(score=0.0, reason="❌ No bold text found")
142
+
143
+ row.evaluation_result = result
144
+ return row
145
+ ```
101
146
 
102
147
  ## Documentation
103
148
 
@@ -0,0 +1,69 @@
1
+ # Eval Protocol (EP)
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
4
+
5
+ EP is an open specification, Python SDK, and pytest wrapper that provides a
6
+ standardized way to write evaluations for large language model (LLM)
7
+ applications. Start with simple single-turn evals for model selection and prompt
8
+ engineering, then scale up to complex multi-turn reinforcement learning (RL) for
9
+ agents using Model Context Protocol (MCP). EP ensures consistent patterns for
10
+ writing evals, storing traces, and saving results—enabling you to build
11
+ sophisticated agent evaluations that work across real-world scenarios, from
12
+ markdown generation tasks to customer service agents with tool calling
13
+ capabilities.
14
+
15
+ ## Quick Example
16
+
17
+ Here's a simple test function that checks if a model's response contains **bold** text formatting:
18
+
19
+ ```python test_bold_format.py
20
+ from eval_protocol.models import EvaluateResult, EvaluationRow, Message
21
+ from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
22
+
23
+ @evaluation_test(
24
+ input_messages=[
25
+ [
26
+ Message(role="system", content="You are a helpful assistant. Use bold text to highlight important information."),
27
+ Message(role="user", content="Explain why **evaluations** matter for building AI agents. Make it dramatic!"),
28
+ ],
29
+ ],
30
+ model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
31
+ rollout_processor=default_single_turn_rollout_processor,
32
+ mode="pointwise",
33
+ )
34
+ def test_bold_format(row: EvaluationRow) -> EvaluationRow:
35
+ """
36
+ Simple evaluation that checks if the model's response contains bold text.
37
+ """
38
+
39
+ assistant_response = row.messages[-1].content
40
+
41
+ # Check if response contains **bold** text
42
+ has_bold = "**" in assistant_response
43
+
44
+ if has_bold:
45
+ result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
46
+ else:
47
+ result = EvaluateResult(score=0.0, reason="❌ No bold text found")
48
+
49
+ row.evaluation_result = result
50
+ return row
51
+ ```
52
+
53
+ ## Documentation
54
+
55
+ See our [documentation](https://evalprotocol.io) for more details.
56
+
57
+ ## Installation
58
+
59
+ **This library requires Python >= 3.10.**
60
+
61
+ Install with pip:
62
+
63
+ ```
64
+ pip install eval-protocol
65
+ ```
66
+
67
+ ## License
68
+
69
+ [MIT](LICENSE)
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-04T14:28:02-0700",
11
+ "date": "2025-08-04T20:35:33-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "07fda02490d1a09c7ab92595d6397622cb64230d",
15
- "version": "0.2.1"
14
+ "full-revisionid": "52b46a7d3f8455d848d8d5138ec4ca4d6343d3d2",
15
+ "version": "0.2.3"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -1,8 +1,10 @@
1
+ import asyncio
1
2
  import json
2
3
  import os
3
- from typing import Any, List, Optional
4
+ from typing import Any, List, Optional, Union
4
5
 
5
6
  from mcp.types import CallToolResult
7
+ from openai import NOT_GIVEN, NotGiven
6
8
  from openai.types.chat import ChatCompletionMessage, ChatCompletionToolParam
7
9
  from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
8
10
 
@@ -22,27 +24,43 @@ class Agent:
22
24
  self.messages: list[Message] = initial_messages
23
25
  self._policy = LiteLLMPolicy(model_id=model)
24
26
  self.mcp_client = MCPMultiClient(config_path=config_path) if config_path else None
27
+ self.tools: Union[List[ChatCompletionToolParam], NotGiven] = NOT_GIVEN
25
28
 
26
29
  async def setup(self):
27
30
  if self.mcp_client:
28
31
  await self.mcp_client.connect_to_servers()
29
32
 
33
+ async def _get_tools(self) -> Optional[List[ChatCompletionToolParam]]:
34
+ if self.tools is NOT_GIVEN:
35
+ self.tools = await self.mcp_client.get_available_tools() if self.mcp_client else None
36
+ return self.tools
37
+
30
38
  async def call_agent(self) -> str:
31
39
  """
32
40
  Call the assistant with the user query.
33
41
  """
34
- tools = await self.mcp_client.get_available_tools() if self.mcp_client else None
42
+ tools = await self._get_tools() if self.mcp_client else None
35
43
 
36
44
  message = await self._call_model(self.messages, tools)
37
45
  self.messages.append(message)
38
46
  if message["tool_calls"]:
47
+ # Create tasks for all tool calls to run them in parallel
48
+ tool_tasks = []
39
49
  for tool_call in message["tool_calls"]:
40
50
  tool_call_id = tool_call["id"]
41
51
  tool_name = tool_call["function"]["name"]
42
52
  tool_args = tool_call["function"]["arguments"]
43
53
  tool_args_dict = json.loads(tool_args)
44
- tool_result = await self.mcp_client.call_tool(tool_name, tool_args_dict)
45
- content = self._get_content_from_tool_result(tool_result)
54
+
55
+ # Create a task for each tool call
56
+ task = self._execute_tool_call(tool_call_id, tool_name, tool_args_dict)
57
+ tool_tasks.append(task)
58
+
59
+ # Execute all tool calls in parallel
60
+ tool_results = await asyncio.gather(*tool_tasks)
61
+
62
+ # Add all tool results to messages (they will be in the same order as tool_calls)
63
+ for tool_call, (tool_call_id, content) in zip(message["tool_calls"], tool_results):
46
64
  self.messages.append(
47
65
  {
48
66
  "role": "tool",
@@ -50,18 +68,26 @@ class Agent:
50
68
  "tool_call_id": tool_call_id,
51
69
  }
52
70
  )
71
+ return await self.call_agent()
53
72
  return message["content"]
54
73
 
55
74
  async def _call_model(
56
75
  self, messages: list[Message], tools: Optional[list[ChatCompletionToolParam]]
57
76
  ) -> ChatCompletionMessage:
58
77
  messages = [message.model_dump() if hasattr(message, "model_dump") else message for message in messages]
59
- response = await self._policy._make_llm_call(
60
- messages=messages,
61
- tools=tools,
62
- )
78
+ tools = [{"function": tool["function"].model_dump(), "type": "function"} for tool in tools] if tools else []
79
+ response = await self._policy._make_llm_call(messages=messages, tools=tools)
63
80
  return response["choices"][0]["message"]
64
81
 
82
+ async def _execute_tool_call(self, tool_call_id: str, tool_name: str, tool_args_dict: dict) -> tuple[str, str]:
83
+ """
84
+ Execute a single tool call and return the tool_call_id and content.
85
+ This method is designed to be used with asyncio.gather() for parallel execution.
86
+ """
87
+ tool_result = await self.mcp_client.call_tool(tool_name, tool_args_dict)
88
+ content = self._get_content_from_tool_result(tool_result)
89
+ return tool_call_id, content
90
+
65
91
  def _get_content_from_tool_result(self, tool_result: CallToolResult) -> str:
66
92
  if tool_result.structuredContent:
67
93
  return json.dumps(tool_result.structuredContent)
@@ -2,6 +2,7 @@ import asyncio
2
2
  import os
3
3
  import subprocess
4
4
  import time
5
+ import socket
5
6
  from pathlib import Path
6
7
  from typing import List, Optional
7
8
 
@@ -69,11 +70,8 @@ class MCPServerManager:
69
70
  self._log_file = log_file
70
71
  self._log_file_path = log_file_path
71
72
 
72
- # Wait for server to start
73
- time.sleep(3)
74
-
75
- # Check if process is still running
76
- if self.process.poll() is not None:
73
+ # Wait for server to be ready with proper health check
74
+ if not self._wait_for_server_ready(timeout=15):
77
75
  try:
78
76
  with open(self._log_file_path, "r") as f:
79
77
  log_content = f.read()
@@ -82,13 +80,45 @@ class MCPServerManager:
82
80
  print("=" * 50)
83
81
  print(log_content)
84
82
  print("=" * 50)
85
- raise RuntimeError(f"Server failed to start. Check log above for details.")
83
+ raise RuntimeError(f"Server failed to start or become ready. Check log above for details.")
86
84
  except Exception as e:
87
85
  stdout, stderr = self.process.communicate()
88
- raise RuntimeError(f"Server failed to start. stderr: {stderr}, log error: {e}")
86
+ raise RuntimeError(f"Server failed to start or become ready. stderr: {stderr}, log error: {e}")
89
87
 
90
88
  print(f"✅ Server started successfully on port {self.port}")
91
89
 
90
+ def _wait_for_server_ready(self, timeout: int = 15) -> bool:
91
+ """
92
+ Wait for server to be ready by polling socket connection.
93
+ """
94
+ start_time = time.time()
95
+ health_check_failures = 0
96
+
97
+ while time.time() - start_time < timeout:
98
+ # Check if process is still running
99
+ if self.process.poll() is not None:
100
+ print(f"Server process exited early")
101
+ return False
102
+
103
+ try:
104
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
105
+ s.settimeout(1)
106
+ result = s.connect_ex(("localhost", self.port))
107
+ if result == 0:
108
+ time.sleep(0.5)
109
+ return True
110
+ except Exception as e:
111
+ health_check_failures += 1
112
+ # Print first few failures for debugging
113
+ if health_check_failures <= 3:
114
+ print(f"Health check failed: {e}")
115
+
116
+ # Wait before next check
117
+ time.sleep(0.1)
118
+
119
+ print(f"Server failed to become ready within {timeout} seconds")
120
+ return False
121
+
92
122
  def stop(self) -> None:
93
123
  """Stop the MCP server."""
94
124
  if self.process:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: Apache-2.0
@@ -96,8 +96,53 @@ Dynamic: license-file
96
96
 
97
97
  [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
98
98
 
99
- EP is an open protocol that standardizes how developers author evals for large
100
- language model (LLM) applications.
99
+ EP is an open specification, Python SDK, and pytest wrapper that provides a
100
+ standardized way to write evaluations for large language model (LLM)
101
+ applications. Start with simple single-turn evals for model selection and prompt
102
+ engineering, then scale up to complex multi-turn reinforcement learning (RL) for
103
+ agents using Model Context Protocol (MCP). EP ensures consistent patterns for
104
+ writing evals, storing traces, and saving results—enabling you to build
105
+ sophisticated agent evaluations that work across real-world scenarios, from
106
+ markdown generation tasks to customer service agents with tool calling
107
+ capabilities.
108
+
109
+ ## Quick Example
110
+
111
+ Here's a simple test function that checks if a model's response contains **bold** text formatting:
112
+
113
+ ```python test_bold_format.py
114
+ from eval_protocol.models import EvaluateResult, EvaluationRow, Message
115
+ from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
116
+
117
+ @evaluation_test(
118
+ input_messages=[
119
+ [
120
+ Message(role="system", content="You are a helpful assistant. Use bold text to highlight important information."),
121
+ Message(role="user", content="Explain why **evaluations** matter for building AI agents. Make it dramatic!"),
122
+ ],
123
+ ],
124
+ model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
125
+ rollout_processor=default_single_turn_rollout_processor,
126
+ mode="pointwise",
127
+ )
128
+ def test_bold_format(row: EvaluationRow) -> EvaluationRow:
129
+ """
130
+ Simple evaluation that checks if the model's response contains bold text.
131
+ """
132
+
133
+ assistant_response = row.messages[-1].content
134
+
135
+ # Check if response contains **bold** text
136
+ has_bold = "**" in assistant_response
137
+
138
+ if has_bold:
139
+ result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
140
+ else:
141
+ result = EvaluateResult(score=0.0, reason="❌ No bold text found")
142
+
143
+ row.evaluation_result = result
144
+ return row
145
+ ```
101
146
 
102
147
  ## Documentation
103
148
 
@@ -140,6 +140,7 @@ tau2 = { git = "https://github.com/sierra-research/tau2-bench.git" }
140
140
 
141
141
  [dependency-groups]
142
142
  dev = [
143
+ "fastmcp>=2.10.6",
143
144
  "haikus==0.3.8",
144
145
  "pytest>=8.4.1",
145
146
  ]
@@ -1,24 +0,0 @@
1
- # Eval Protocol (EP)
2
-
3
- [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
4
-
5
- EP is an open protocol that standardizes how developers author evals for large
6
- language model (LLM) applications.
7
-
8
- ## Documentation
9
-
10
- See our [documentation](https://evalprotocol.io) for more details.
11
-
12
- ## Installation
13
-
14
- **This library requires Python >= 3.10.**
15
-
16
- Install with pip:
17
-
18
- ```
19
- pip install eval-protocol
20
- ```
21
-
22
- ## License
23
-
24
- [MIT](LICENSE)
File without changes