eval-protocol 0.2.4__tar.gz → 0.2.5.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. {eval_protocol-0.2.4/eval_protocol.egg-info → eval_protocol-0.2.5.dev1}/PKG-INFO +2 -3
  2. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/client/connection.py +19 -1
  4. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/manager.py +3 -38
  5. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/mcpgym.py +25 -2
  6. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/session/manager.py +7 -9
  7. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_env.py +25 -9
  8. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +17 -19
  9. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/types/types.py +4 -2
  10. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1/eval_protocol.egg-info}/PKG-INFO +2 -3
  11. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/requires.txt +1 -2
  12. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/pyproject.toml +1 -2
  13. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_parallel_rollouts.py +2 -2
  14. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_rollout_control_plane_integration.py +10 -2
  15. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_url_handling.py +26 -12
  16. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/LICENSE +0 -0
  17. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/README.md +0 -0
  18. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/__init__.py +0 -0
  19. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/normalize_sandbox_fusion.py +0 -0
  20. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/utils/__init__.py +0 -0
  21. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/utils/generate_api_key.py +0 -0
  22. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/utils/subprocess_manager.py +0 -0
  23. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/__init__.py +0 -0
  24. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/__main__.py +0 -0
  25. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/__init__.py +0 -0
  26. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/braintrust.py +0 -0
  27. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/huggingface.py +0 -0
  28. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/langfuse.py +0 -0
  29. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/trl.py +0 -0
  30. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/__init__.py +0 -0
  31. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/models.py +0 -0
  32. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/orchestrator.py +0 -0
  33. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resource_abc.py +0 -0
  34. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resource_pool.py +0 -0
  35. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
  36. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  37. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  38. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  39. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  40. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  41. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/docker_resource.py +0 -0
  42. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  43. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  44. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  45. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  46. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
  47. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/task_manager.py +0 -0
  48. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/tool_registry.py +0 -0
  49. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/auth.py +0 -0
  50. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli.py +0 -0
  51. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
  52. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  53. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/common.py +0 -0
  54. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/deploy.py +0 -0
  55. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  56. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/logs.py +0 -0
  57. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/preview.py +0 -0
  58. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  59. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/common_utils.py +0 -0
  60. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/config.py +0 -0
  61. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/dataset_logger/__init__.py +0 -0
  62. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  63. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  64. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/datasets/__init__.py +0 -0
  65. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/datasets/loader.py +0 -0
  66. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/evaluation.py +0 -0
  67. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/execution/__init__.py +0 -0
  68. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/execution/pipeline.py +0 -0
  69. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/gcp_tools.py +0 -0
  70. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generation/cache.py +0 -0
  71. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generation/clients/base.py +0 -0
  72. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generation/clients.py +0 -0
  73. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generic_server.py +0 -0
  74. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/human_id/__init__.py +0 -0
  75. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/human_id/dictionary.py +0 -0
  76. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/__init__.py +0 -0
  77. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/braintrust.py +0 -0
  78. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/deepeval.py +0 -0
  79. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/openeval.py +0 -0
  80. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/trl.py +0 -0
  81. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/__init__.py +0 -0
  82. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/adapter.py +0 -0
  83. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
  84. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/clients.py +0 -0
  85. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
  86. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/base_policy.py +0 -0
  87. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/policy.py +0 -0
  88. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
  89. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  90. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/process_manager.py +0 -0
  91. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
  92. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/simple_process_manager.py +0 -0
  93. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/simulation_server.py +0 -0
  94. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
  95. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/config.py +0 -0
  96. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  97. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/main.py +0 -0
  98. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  99. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  100. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  101. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  102. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  103. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/session.py +0 -0
  104. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/models.py +0 -0
  105. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/packaging.py +0 -0
  106. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/platform_api.py +0 -0
  107. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/playback_policy.py +0 -0
  108. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/__init__.py +0 -0
  109. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  110. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  111. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  112. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  113. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/evaluation_test.py +0 -0
  114. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/types.py +0 -0
  115. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/utils.py +0 -0
  116. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/resources.py +0 -0
  117. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/reward_function.py +0 -0
  118. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/__init__.py +0 -0
  119. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/accuracy.py +0 -0
  120. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/accuracy_length.py +0 -0
  121. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  122. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  123. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/apps_testing_util.py +0 -0
  124. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/bfcl_reward.py +0 -0
  125. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/code_execution.py +0 -0
  126. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
  127. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/cpp_code.py +0 -0
  128. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  129. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/format.py +0 -0
  130. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/function_calling.py +0 -0
  131. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/json_schema.py +0 -0
  132. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/language_consistency.py +0 -0
  133. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/lean_prover.py +0 -0
  134. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/length.py +0 -0
  135. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  136. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/math.py +0 -0
  137. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  138. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/reasoning_steps.py +0 -0
  139. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/repetition.py +0 -0
  140. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/tag_count.py +0 -0
  141. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rl_processing.py +0 -0
  142. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/server.py +0 -0
  143. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/typed_interface.py +0 -0
  144. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/types/__init__.py +0 -0
  145. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/__init__.py +0 -0
  146. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
  147. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/batch_transformation.py +0 -0
  148. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
  149. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/logs_server.py +0 -0
  150. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/module_loader.py +0 -0
  151. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
  152. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/static_policy.py +0 -0
  153. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/vite_server.py +0 -0
  154. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/SOURCES.txt +0 -0
  155. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
  156. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
  157. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
  158. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/setup.cfg +0 -0
  159. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/setup.py +0 -0
  160. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_accuracy.py +0 -0
  161. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_accuracy_length.py +0 -0
  162. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_adapters_e2e.py +0 -0
  163. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_agent_orchestrator.py +0 -0
  164. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_agent_resources.py +0 -0
  165. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_auth.py +0 -0
  166. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_batch_evaluation.py +0 -0
  167. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_braintrust_adapter.py +0 -0
  168. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_braintrust_example.py +0 -0
  169. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cli.py +0 -0
  170. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cli_agent.py +0 -0
  171. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cli_args.py +0 -0
  172. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_code_execution.py +0 -0
  173. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_config.py +0 -0
  174. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_control_plane_separation.py +0 -0
  175. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cpp_code.py +0 -0
  176. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_data_driven_task_manager.py +0 -0
  177. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_deepcoder_reward.py +0 -0
  178. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_deepeval_integration.py +0 -0
  179. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_deploy_integration.py +0 -0
  180. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_e2b_integration.py +0 -0
  181. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_e2b_js_integration.py +0 -0
  182. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_edge_cases.py +0 -0
  183. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_eval_protocol_import.py +0 -0
  184. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_evaluation.py +0 -0
  185. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_evaluation_integration.py +0 -0
  186. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_evaluation_preview_integration.py +0 -0
  187. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_examples_end_to_end.py +0 -0
  188. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_fireworks_api.py +0 -0
  189. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_format.py +0 -0
  190. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_fractional_code.py +0 -0
  191. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_frozen_lake_http_server.py +0 -0
  192. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  193. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_function_calling.py +0 -0
  194. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_gcp_tools.py +0 -0
  195. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_generic_server.py +0 -0
  196. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_integration.py +0 -0
  197. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_json_schema.py +0 -0
  198. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_kwargs_validation.py +0 -0
  199. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_language_consistency.py +0 -0
  200. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_lean_prover.py +0 -0
  201. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_lean_prover_runner.py +0 -0
  202. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_length.py +0 -0
  203. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_list_comparison_math_reward.py +0 -0
  204. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_math.py +0 -0
  205. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_minimal.py +0 -0
  206. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_models.py +0 -0
  207. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_models_rl.py +0 -0
  208. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
  209. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_n_variant_batch_integration.py +0 -0
  210. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_n_variant_integration.py +0 -0
  211. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_openai_compatibility.py +0 -0
  212. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_openeval_integration.py +0 -0
  213. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_packaging.py +0 -0
  214. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_platform_api.py +0 -0
  215. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_readiness.py +0 -0
  216. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_reasoning_steps.py +0 -0
  217. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_repetition.py +0 -0
  218. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_repetition_debug.py +0 -0
  219. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_reward_function.py +0 -0
  220. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_reward_protocol_import.py +0 -0
  221. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_rl_processing.py +0 -0
  222. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_server.py +0 -0
  223. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_tag_count.py +0 -0
  224. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_typed_interface.py +0 -0
  225. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_typed_interface_rl.py +0 -0
  226. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/__init__.py +0 -0
  227. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/agent/__init__.py +0 -0
  228. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/agent/base.py +0 -0
  229. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/agent/llm_agent.py +0 -0
  230. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/__init__.py +0 -0
  231. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/api_config.py +0 -0
  232. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/data_model.py +0 -0
  233. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
  234. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/cli.py +0 -0
  235. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/config.py +0 -0
  236. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/__init__.py +0 -0
  237. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/message.py +0 -0
  238. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/simulation.py +0 -0
  239. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/tasks.py +0 -0
  240. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/__init__.py +0 -0
  241. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
  242. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/data_model.py +0 -0
  243. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
  244. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/tools.py +0 -0
  245. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
  246. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
  247. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/data_model.py +0 -0
  248. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/environment.py +0 -0
  249. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/tools.py +0 -0
  250. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
  251. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
  252. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/data_model.py +0 -0
  253. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
  254. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/tools.py +0 -0
  255. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
  256. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
  257. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/data_model.py +0 -0
  258. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/environment.py +0 -0
  259. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  260. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  261. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  262. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  263. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  264. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  265. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  266. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  267. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tools.py +0 -0
  268. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  269. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  270. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
  271. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/__init__.py +0 -0
  272. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/db.py +0 -0
  273. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/environment.py +0 -0
  274. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/server.py +0 -0
  275. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/tool.py +0 -0
  276. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/toolkit.py +0 -0
  277. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  278. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/__init__.py +0 -0
  279. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator.py +0 -0
  280. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  281. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  282. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  283. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  284. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  285. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/metrics/__init__.py +0 -0
  286. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/metrics/agent_metrics.py +0 -0
  287. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  288. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/__init__.py +0 -0
  289. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  290. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  291. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/utils.py +0 -0
  292. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/registry.py +0 -0
  293. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/run.py +0 -0
  294. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/__init__.py +0 -0
  295. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/check_data.py +0 -0
  296. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  297. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/start_servers.py +0 -0
  298. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/view_simulations.py +0 -0
  299. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/user/__init__.py +0 -0
  300. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/user/base.py +0 -0
  301. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/user/user_simulator.py +0 -0
  302. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/__init__.py +0 -0
  303. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/display.py +0 -0
  304. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/io_utils.py +0 -0
  305. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/llm_utils.py +0 -0
  306. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/pydantic_utils.py +0 -0
  307. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/utils.py +0 -0
  308. {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/versioneer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.4
3
+ Version: 0.2.5.dev1
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -40,7 +40,6 @@ Requires-Dist: deepdiff>=6.0.0
40
40
  Requires-Dist: pandas>=1.5.0
41
41
  Requires-Dist: watchdog>=2.1.0
42
42
  Requires-Dist: websockets>=15.0.1
43
- Requires-Dist: fireworks-ai>=0.19.12
44
43
  Requires-Dist: fastapi>=0.116.1
45
44
  Provides-Extra: dev
46
45
  Requires-Dist: build; extra == "dev"
@@ -79,7 +78,7 @@ Requires-Dist: accelerate>=0.28.0; extra == "trl"
79
78
  Provides-Extra: openevals
80
79
  Requires-Dist: openevals>=0.1.0; extra == "openevals"
81
80
  Provides-Extra: fireworks
82
- Requires-Dist: fireworks-ai>=0.19.10; extra == "fireworks"
81
+ Requires-Dist: fireworks-ai>=0.19.12; extra == "fireworks"
83
82
  Provides-Extra: box2d
84
83
  Requires-Dist: swig; extra == "box2d"
85
84
  Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-05T23:22:49-0700",
11
+ "date": "2025-08-06T17:51:29-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "4dbac4d9116bdb2888bd145e779eba9086c59096",
15
- "version": "0.2.4"
14
+ "full-revisionid": "a807140937b9002c71ee42a6afef594ea6377c2d",
15
+ "version": "0.2.5-dev1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -101,7 +101,7 @@ class MCPConnectionManager:
101
101
 
102
102
  # Update the session ID to match what the server generated
103
103
  session.session_id = server_session_id
104
- logger.debug(f"Updated session ID to match server: {server_session_id}")
104
+ logger.info(f"Updated session ID to match server: {server_session_id}")
105
105
 
106
106
  # PRE-WARM: Discover and cache tools immediately after session initialization
107
107
  # This prevents concurrent list_tools() calls later
@@ -133,6 +133,24 @@ class MCPConnectionManager:
133
133
  self._tools_cache[cache_key] = tool_schemas
134
134
  logger.debug(f"✅ PRE-WARMED {len(tool_schemas)} tools for{cache_key}")
135
135
 
136
+ async def reset_session(self, session: MCPSession) -> None:
137
+ """
138
+ Clean session data in remote mcp server for the given session
139
+ """
140
+ import httpx
141
+
142
+ base_url = session.base_url.rstrip("/").removesuffix("/mcp")
143
+ url = f"{base_url}/control/reset_session"
144
+
145
+ headers = {"mcp-session-id": session.session_id}
146
+ body = {"seed": session.seed}
147
+
148
+ timeout = httpx.Timeout(3.0)
149
+ async with httpx.AsyncClient(timeout=timeout) as client:
150
+ resp = await client.post(url, headers=headers, json=body)
151
+ resp.raise_for_status()
152
+ logger.debug(f"Session {session.session_id}: reset_session -> {resp.json()}")
153
+
136
154
  async def discover_tools(self, session: MCPSession) -> List[Dict]:
137
155
  """
138
156
  Discover available tools from an MCP session.
@@ -22,7 +22,6 @@ from vendor.tau2.user.user_simulator import UserSimulator
22
22
 
23
23
  from ...models import CompletionParams, EvaluationRow, InputMetadata, Message
24
24
  from ...types import MCPSession, MCPToolCall, TerminationReason, Trajectory
25
- from ..client.connection import MCPConnectionManager
26
25
 
27
26
  if TYPE_CHECKING:
28
27
  from ..session.manager import GeneralMCPVectorEnv
@@ -33,43 +32,9 @@ logger = logging.getLogger(__name__)
33
32
 
34
33
  class ExecutionManager:
35
34
  """
36
- Unified manager that handles both MCP session lifecycle and rollout execution.
37
-
38
- Combines the functionality of SessionManager and RolloutManager for better
39
- organization and reduced complexity.
35
+ Manage rollout for MCP environments.
40
36
  """
41
37
 
42
- def __init__(self):
43
- """Initialize the execution manager."""
44
- self.connection_manager = MCPConnectionManager()
45
-
46
- async def initialize_sessions(self, sessions: List[MCPSession]) -> None:
47
- """
48
- Initialize multiple MCP sessions in parallel.
49
-
50
- Args:
51
- sessions: List of MCPSessions to initialize
52
- """
53
- tasks = [self.connection_manager.initialize_session(session) for session in sessions]
54
- await asyncio.gather(*tasks)
55
-
56
- async def close_sessions(self, sessions: List[MCPSession]) -> None:
57
- """
58
- Close multiple MCP sessions in parallel.
59
-
60
- Args:
61
- sessions: List of MCPSessions to close
62
- """
63
- tasks = [asyncio.create_task(self.connection_manager.close_session(session)) for session in sessions]
64
-
65
- if tasks:
66
- try:
67
- # Wait for all close operations to complete
68
- await asyncio.gather(*tasks, return_exceptions=True)
69
- except asyncio.CancelledError:
70
- # Handle cancellation gracefully (especially important for Python 3.12)
71
- logger.debug("Close operation was cancelled, but sessions are marked as closed")
72
-
73
38
  async def execute_rollouts(
74
39
  self,
75
40
  envs: "GeneralMCPVectorEnv",
@@ -178,7 +143,7 @@ class ExecutionManager:
178
143
  for msg in trajectory.conversation_history:
179
144
  # Create a copy to avoid modifying the original
180
145
  msg_dict = dict(msg)
181
-
146
+
182
147
  # Handle multimodal content (list of content blocks) by extracting text
183
148
  if isinstance(msg_dict.get("content"), list):
184
149
  text_content = None
@@ -187,7 +152,7 @@ class ExecutionManager:
187
152
  text_content = content_block.get("text")
188
153
  break
189
154
  msg_dict["content"] = text_content or ""
190
-
155
+
191
156
  messages.append(Message.model_validate(msg_dict))
192
157
 
193
158
  input_metadata = InputMetadata(
@@ -116,6 +116,7 @@ class McpGym(ABC):
116
116
  # Register tools and control plane endpoints
117
117
  self._register_tools()
118
118
  self._discover_and_register_control_plane_endpoints()
119
+ self._register_session_reset_endpoint()
119
120
 
120
121
  def _get_session_id(self, ctx: Context) -> str:
121
122
  """
@@ -227,6 +228,28 @@ class McpGym(ABC):
227
228
 
228
229
  return self.sessions[session_id]
229
230
 
231
+ def _register_session_reset_endpoint(self):
232
+
233
+ @self.mcp.custom_route("/control/reset_session", methods=["POST"])
234
+ async def reset_session_endpoint(request: Request) -> JSONResponse:
235
+ session_id = request.headers.get("mcp-session-id")
236
+ body = await request.json()
237
+ seed = body.get("seed", None)
238
+ print(f"🔍 _register_session_reset_endpoint: Resetting session, session_id: {session_id}, seed: {seed}")
239
+ if not session_id:
240
+ return JSONResponse({"error": "Missing mcp-session-id header"}, status_code=400)
241
+ with self.session_lock:
242
+ if session_id in self.sessions:
243
+ env, obs, _ = self._new_env(seed=seed)
244
+ self.sessions[session_id] = {
245
+ "env": env,
246
+ "obs": obs,
247
+ "session_data": {},
248
+ "session_id": session_id,
249
+ }
250
+ print(f"🔍 _register_session_reset_endpoint: Finished reset session, session_id: {session_id}")
251
+ return JSONResponse({"message": "Session reset successfully"})
252
+
230
253
  def _discover_and_register_control_plane_endpoints(self):
231
254
  """
232
255
  Discover and register control plane endpoints on the subclass instance.
@@ -323,7 +346,7 @@ class McpGym(ABC):
323
346
 
324
347
  # Log control plane update (for debugging)
325
348
  print(
326
- f"🎛️ Control plane updated: reward={reward}, terminated={terminated}, step={self.control_plane_state['step_count']}"
349
+ f"🎛️ Control plane updated: reward={reward}, terminated={terminated}, step={self.control_plane_state['step_count']}, total_reward={self.control_plane_state['total_reward']}"
327
350
  )
328
351
 
329
352
  def _get_or_create_session_control_plane(self, session_id: str) -> Dict[str, Any]:
@@ -365,7 +388,7 @@ class McpGym(ABC):
365
388
 
366
389
  # Log control plane update
367
390
  print(
368
- f"🎛️ Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}"
391
+ f"🎛️ Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}, total_reward={control_plane['total_reward']}"
369
392
  )
370
393
 
371
394
  def get_control_plane_state(self, session_id: str) -> Optional[Dict[str, Any]]:
@@ -11,7 +11,7 @@ import logging
11
11
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
12
12
 
13
13
  from ...types import DatasetRow, MCPSession, MCPToolCall
14
- from ..execution.manager import ExecutionManager
14
+ from ..client.connection import MCPConnectionManager
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
@@ -44,7 +44,7 @@ class GeneralMCPVectorEnv:
44
44
  self.user_prompt_formatter = user_prompt_formatter or self._default_formatter
45
45
  self.n = len(sessions)
46
46
  self.tool_schemas = [] # Discovered from MCP servers
47
- self.execution_manager = ExecutionManager()
47
+ self.connection_manager = MCPConnectionManager()
48
48
  self.usage_stats = {} # llm usage stats for monitoring
49
49
 
50
50
  if len(sessions) != len(dataset_rows):
@@ -58,17 +58,14 @@ class GeneralMCPVectorEnv:
58
58
 
59
59
  This is thread-safe and can be called from worker threads.
60
60
  """
61
- # Establish a persistent session for each environment.
62
- await self.execution_manager.connection_manager.initialize_session(session)
63
-
64
61
  # Get available tools from MCP server
65
- tool_schemas = await self.execution_manager.connection_manager.discover_tools(session)
62
+ tool_schemas = await self.connection_manager.discover_tools(session)
66
63
 
67
64
  if not self.tool_schemas:
68
65
  self.tool_schemas = tool_schemas
69
66
 
70
67
  # PROPER MCP PATTERN: Get initial state from resources during session establishment
71
- initial_observation = await self.execution_manager.connection_manager.get_initial_state(session)
68
+ initial_observation = await self.connection_manager.get_initial_state(session)
72
69
 
73
70
  # Update session state
74
71
  session.terminated = False
@@ -119,7 +116,7 @@ class GeneralMCPVectorEnv:
119
116
  )
120
117
 
121
118
  # Execute the tool call via MCP protocol
122
- observation, reward, done, info = await self.execution_manager.connection_manager.call_tool(
119
+ observation, reward, done, info = await self.connection_manager.call_tool(
123
120
  session, tool_call.tool_name, tool_call.arguments
124
121
  )
125
122
 
@@ -223,5 +220,6 @@ class GeneralMCPVectorEnv:
223
220
  async def close(self):
224
221
  """Closes all MCP sessions."""
225
222
  print(f"🧹 Closing {self.n} MCP sessions...")
226
- await self.execution_manager.close_sessions(self.sessions)
223
+ tasks = [self.connection_manager.close_session(session) for session in self.sessions]
224
+ await asyncio.gather(*tasks)
227
225
  print(f"✅ All MCP sessions closed.")
@@ -17,7 +17,7 @@ Usage remains the same:
17
17
  policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b")
18
18
 
19
19
  # Create environments with evaluation_rows configuration
20
- envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
20
+ envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
21
21
 
22
22
  # Execute tool-calling rollouts
23
23
  evaluation_rows = await ep.rollout(envs, policy=policy, steps=512)
@@ -51,11 +51,20 @@ from .mcp.execution.policy import AnthropicPolicy, FireworksPolicy, LLMBasePolic
51
51
  from .mcp.session.manager import GeneralMCPVectorEnv
52
52
  from .models import EvaluationRow
53
53
  from .types import DatasetRow, MCPSession, MCPToolCall
54
+ import asyncio
54
55
 
55
56
  logger = logging.getLogger(__name__)
56
57
 
57
58
 
58
- def make(
59
+ async def reset_mcp_sessions(envs: GeneralMCPVectorEnv):
60
+ """
61
+ Reset mcp server sessions
62
+ """
63
+ tasks = [envs.connection_manager.reset_session(session) for session in envs.sessions]
64
+ await asyncio.gather(*tasks)
65
+
66
+
67
+ async def make(
59
68
  env_spec: str,
60
69
  evaluation_rows: Optional[List[EvaluationRow]] = None,
61
70
  dataset: Optional[List[Dict]] = None,
@@ -63,6 +72,7 @@ def make(
63
72
  seeds: Optional[List[int]] = None,
64
73
  model_id: str = "unknown",
65
74
  user_prompt_formatter: Optional[Callable] = None,
75
+ reset_sessions: bool = False,
66
76
  ) -> GeneralMCPVectorEnv:
67
77
  """
68
78
  Create general MCP environments driven by evaluation_rows configuration.
@@ -75,19 +85,20 @@ def make(
75
85
  seeds: List of seeds (for backward compatibility)
76
86
  model_id: Model identifier
77
87
  user_prompt_formatter: Optional callback for formatting user prompts
88
+ reset_sessions: Whether to reset sessions before returning the environment
78
89
 
79
90
  Returns:
80
91
  General MCP environment that works with any MCP server
81
92
 
82
93
  Example:
83
94
  # EvaluationRow approach (preferred)
84
- envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
95
+ envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
85
96
 
86
97
  # Dataset approach (backward compatibility)
87
- envs = ep.make("http://localhost:8000/mcp", dataset=dataset)
98
+ envs = await ep.make("http://localhost:8000/mcp", dataset=dataset)
88
99
 
89
100
  # Legacy approach (backward compatibility)
90
- envs = ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
101
+ envs = await ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
91
102
  """
92
103
  # Parse environment specification - make sure URL format is correct
93
104
  base_url = env_spec
@@ -160,8 +171,6 @@ def make(
160
171
  )
161
172
  sessions.append(session)
162
173
 
163
- return GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
164
-
165
174
  else:
166
175
  # Legacy approach for backward compatibility
167
176
  if n is None:
@@ -198,7 +207,14 @@ def make(
198
207
  )
199
208
  sessions.append(session)
200
209
 
201
- return GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
210
+ mcp_envs = GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
211
+ tasks = [mcp_envs.connection_manager.initialize_session(session) for session in sessions]
212
+ await asyncio.gather(*tasks)
213
+
214
+ if reset_sessions:
215
+ await reset_mcp_sessions(mcp_envs)
216
+
217
+ return mcp_envs
202
218
 
203
219
 
204
220
  async def rollout(
@@ -266,7 +282,7 @@ async def rollout(
266
282
  raise ValueError("Either 'evaluation_rows' or 'dataset' must be provided when envs is a URL")
267
283
 
268
284
  auto_model_id = model_id or getattr(policy, "model_id", "unknown")
269
- envs = make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
285
+ envs = await make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
270
286
 
271
287
  # Use the new ExecutionManager for execution
272
288
  execution_manager = ExecutionManager()
@@ -182,49 +182,47 @@ class MCPServerManager:
182
182
  return False # Don't suppress exceptions
183
183
 
184
184
 
185
-
186
- async def default_mcp_gym_rollout_processor(rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[EvaluationRow]:
185
+ async def default_mcp_gym_rollout_processor(
186
+ rows: List[EvaluationRow], config: RolloutProcessorConfig
187
+ ) -> List[EvaluationRow]:
187
188
  """
188
189
  Rollout processor for tau bench environments.
189
-
190
+
190
191
  This processor starts an MCP server, creates tau bench environments, and runs rollouts
191
192
  using the eval_protocol framework, following the pattern from test_tau2_e2e.py.
192
-
193
+
193
194
  Args:
194
195
  rows: List of EvaluationRow objects containing messages and dataset info in input_metadata
195
196
  config: RolloutProcessorConfig with model and other parameters
196
-
197
+
197
198
  Returns:
198
199
  List of EvaluationRow objects with completed conversations
199
200
  """
200
201
  server = MCPServerManager(config.server_script_path, port=9700)
201
-
202
+
202
203
  try:
203
204
  server.start()
204
-
205
+
205
206
  policy = ep.LiteLLMPolicy(
206
207
  model_id=config.model,
207
- temperature=config.input_params.get('temperature', 0.0),
208
- max_tokens=config.input_params.get('max_tokens', 4096),
208
+ temperature=config.input_params.get("temperature", 0.0),
209
+ max_tokens=config.input_params.get("max_tokens", 4096),
209
210
  )
210
-
211
+
211
212
  # Create MCP environments directly from evaluation_rows
212
- envs = ep.make(
213
- 'http://localhost:9700/mcp/',
213
+ envs = await ep.make(
214
+ "http://localhost:9700/mcp/",
214
215
  evaluation_rows=rows,
215
216
  model_id=policy.model_id,
216
217
  )
217
-
218
+
218
219
  # Run rollout with environments and policy
219
220
  evaluation_rows = await ep.rollout(
220
- envs,
221
- policy=policy,
222
- steps=config.steps,
223
- max_concurrent_rollouts=config.max_concurrent_rollouts
221
+ envs, policy=policy, steps=config.steps, max_concurrent_rollouts=config.max_concurrent_rollouts
224
222
  )
225
-
223
+
226
224
  return evaluation_rows
227
-
225
+
228
226
  finally:
229
227
  # Always clean up the server
230
228
  server.stop()
@@ -1,6 +1,8 @@
1
1
  from dataclasses import dataclass, field
2
2
  from enum import Enum
3
3
  from typing import Any, Dict, List, Optional
4
+ from mcp.client.session import ClientSession
5
+ from contextlib import AsyncExitStack
4
6
 
5
7
 
6
8
  class TerminationReason(str, Enum):
@@ -50,8 +52,8 @@ class MCPSession:
50
52
  last_observation: Any = None
51
53
 
52
54
  # Persistent MCP connection components
53
- _exit_stack: Optional[Any] = None
54
- _mcp_session: Optional[Any] = None
55
+ _exit_stack: Optional[AsyncExitStack] = None
56
+ _mcp_session: Optional[ClientSession] = None
55
57
 
56
58
 
57
59
  @dataclass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.4
3
+ Version: 0.2.5.dev1
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -40,7 +40,6 @@ Requires-Dist: deepdiff>=6.0.0
40
40
  Requires-Dist: pandas>=1.5.0
41
41
  Requires-Dist: watchdog>=2.1.0
42
42
  Requires-Dist: websockets>=15.0.1
43
- Requires-Dist: fireworks-ai>=0.19.12
44
43
  Requires-Dist: fastapi>=0.116.1
45
44
  Provides-Extra: dev
46
45
  Requires-Dist: build; extra == "dev"
@@ -79,7 +78,7 @@ Requires-Dist: accelerate>=0.28.0; extra == "trl"
79
78
  Provides-Extra: openevals
80
79
  Requires-Dist: openevals>=0.1.0; extra == "openevals"
81
80
  Provides-Extra: fireworks
82
- Requires-Dist: fireworks-ai>=0.19.10; extra == "fireworks"
81
+ Requires-Dist: fireworks-ai>=0.19.12; extra == "fireworks"
83
82
  Provides-Extra: box2d
84
83
  Requires-Dist: swig; extra == "box2d"
85
84
  Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
@@ -28,7 +28,6 @@ deepdiff>=6.0.0
28
28
  pandas>=1.5.0
29
29
  watchdog>=2.1.0
30
30
  websockets>=15.0.1
31
- fireworks-ai>=0.19.12
32
31
  fastapi>=0.116.1
33
32
 
34
33
  [adapters]
@@ -71,7 +70,7 @@ pip>=25.1.1
71
70
  haikus==0.3.8
72
71
 
73
72
  [fireworks]
74
- fireworks-ai>=0.19.10
73
+ fireworks-ai>=0.19.12
75
74
 
76
75
  [huggingface]
77
76
  datasets>=2.0.0
@@ -48,7 +48,6 @@ dependencies = [
48
48
  "pandas>=1.5.0",
49
49
  "watchdog>=2.1.0",
50
50
  "websockets>=15.0.1",
51
- "fireworks-ai>=0.19.12",
52
51
  "fastapi>=0.116.1",
53
52
  ]
54
53
 
@@ -96,7 +95,7 @@ openevals = [
96
95
  "openevals>=0.1.0",
97
96
  ]
98
97
  fireworks = [
99
- "fireworks-ai>=0.19.10",
98
+ "fireworks-ai>=0.19.12",
100
99
  ]
101
100
  box2d = [
102
101
  "swig",
@@ -138,7 +138,7 @@ async def _test_seed_handling_and_type_compatibility_impl():
138
138
  )
139
139
 
140
140
  # 3. Test that environments are created with proper seed isolation
141
- envs = ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
141
+ envs = await ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
142
142
 
143
143
  # Verify we have the right number of environments
144
144
  assert len(envs.sessions) == len(test_seeds), f"Expected {len(test_seeds)} sessions, got {len(envs.sessions)}"
@@ -273,7 +273,7 @@ async def _run_simplified_compatibility_test():
273
273
  )
274
274
 
275
275
  # This should work even without a server (just creates session objects)
276
- envs = ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
276
+ envs = await ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
277
277
  assert len(envs.sessions) == len(test_seeds)
278
278
  print("✅ Environment creation works")
279
279
 
@@ -489,7 +489,7 @@ class TestRolloutControlPlaneIntegration:
489
489
  policy = MockPolicy(["right"])
490
490
 
491
491
  with (
492
- patch("eval_protocol.mcp_env.make") as mock_make,
492
+ patch("eval_protocol.mcp_env.make", new_callable=AsyncMock) as mock_make,
493
493
  patch("eval_protocol.mcp_env.ExecutionManager") as MockManager,
494
494
  ):
495
495
  mock_env = MagicMock()
@@ -512,7 +512,15 @@ class TestRolloutControlPlaneIntegration:
512
512
  dataset=dataset,
513
513
  model_id="test_model",
514
514
  )
515
- manager_instance.execute_rollouts.assert_called_once_with(mock_env, policy, 5, None, 8)
515
+
516
+ manager_instance.execute_rollouts.assert_called_once_with(
517
+ mock_make.return_value,
518
+ policy,
519
+ 5,
520
+ None,
521
+ 8,
522
+ )
523
+
516
524
  assert result == ["ok"]
517
525
 
518
526
  def test_control_plane_trajectory_serialization(self):
@@ -1,5 +1,4 @@
1
- import asyncio
2
-
1
+ from unittest.mock import AsyncMock, patch
3
2
  import httpx
4
3
  import pytest
5
4
  from werkzeug.wrappers import Response
@@ -7,31 +6,46 @@ from werkzeug.wrappers import Response
7
6
  import eval_protocol as ep
8
7
 
9
8
 
10
- # Sync tests for the ep.make() function
11
- def test_mcp_env_make_appends_trailing_slash():
9
+ # Sync tests for the await ep.make() function
10
+ @pytest.mark.asyncio
11
+ async def test_mcp_env_make_appends_trailing_slash():
12
12
  """
13
- Verify that ep.make() appends a trailing slash to the MCP server URL if it's missing.
13
+ Verify that await ep.make() appends a trailing slash to the MCP server URL if it's missing.
14
14
  This prevents 307 redirects that can break HTTP clients.
15
15
  """
16
16
  base_url = "http://localhost:8000/mcp"
17
17
  corrected_url = "http://localhost:8000/mcp/"
18
18
 
19
- # Use n and seeds to avoid needing a full dataset
20
- envs = ep.make(base_url, n=1, seeds=[42])
19
+ with patch(
20
+ "eval_protocol.mcp.client.connection.MCPConnectionManager.initialize_session",
21
+ new_callable=AsyncMock,
22
+ ) as mock_init:
23
+ mock_init.return_value = None
24
+
25
+ envs = await ep.make(base_url, n=1, seeds=[42])
26
+
27
+ mock_init.assert_awaited_once()
21
28
 
22
29
  assert len(envs.sessions) == 1
23
- # The session's base_url should have the trailing slash
24
30
  assert envs.sessions[0].base_url == corrected_url
25
31
 
26
32
 
27
- def test_mcp_env_make_keeps_existing_trailing_slash():
33
+ @pytest.mark.asyncio
34
+ async def test_mcp_env_make_keeps_existing_trailing_slash():
28
35
  """
29
- Verify that ep.make() does not add an extra slash if one is already present.
36
+ Verify that await ep.make() does not add an extra slash if one is already present.
30
37
  """
31
38
  base_url = "http://localhost:8000/mcp/"
32
39
 
33
- # Use n and seeds to avoid needing a full dataset
34
- envs = ep.make(base_url, n=1, seeds=[42])
40
+ with patch(
41
+ "eval_protocol.mcp.client.connection.MCPConnectionManager.initialize_session",
42
+ new_callable=AsyncMock,
43
+ ) as mock_init:
44
+ mock_init.return_value = None
45
+
46
+ envs = await ep.make(base_url, n=1, seeds=[42])
47
+
48
+ mock_init.assert_awaited_once()
35
49
 
36
50
  assert len(envs.sessions) == 1
37
51
  # The session's base_url should remain unchanged