eval-protocol 0.2.6.dev1__tar.gz → 0.2.6.dev2__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (332)
  1. {eval_protocol-0.2.6.dev1/eval_protocol.egg-info → eval_protocol-0.2.6.dev2}/PKG-INFO +2 -2
  2. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/_version.py +3 -3
  3. eval_protocol-0.2.6.dev2/eval_protocol/common_utils.py +55 -0
  4. eval_protocol-0.2.6.dev2/eval_protocol/dataset_logger/__init__.py +3 -0
  5. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/dataset_logger/dataset_logger.py +2 -0
  6. eval_protocol-0.2.6.dev2/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +39 -0
  7. eval_protocol-0.2.6.dev2/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +57 -0
  8. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/__init__.py +5 -0
  9. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/event_bus.py +50 -0
  10. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/logger.py +3 -0
  11. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/sqlite_event_bus.py +109 -0
  12. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/sqlite_event_bus_database.py +95 -0
  13. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/generation/clients.py +4 -1
  14. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/logging_utils.py +1 -1
  15. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/client/connection.py +6 -11
  16. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/execution/base_policy.py +1 -1
  17. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/execution/manager.py +40 -25
  18. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/mcpgym.py +61 -101
  19. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/session/manager.py +1 -0
  20. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_env.py +6 -14
  21. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/models.py +22 -2
  22. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +14 -1
  23. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +37 -5
  24. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/evaluation_test.py +310 -48
  25. eval_protocol-0.2.6.dev2/eval_protocol/pytest/plugin.py +144 -0
  26. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/utils.py +33 -1
  27. eval_protocol-0.2.6.dev2/eval_protocol/stats/__init__.py +5 -0
  28. eval_protocol-0.2.6.dev2/eval_protocol/stats/confidence_intervals.py +116 -0
  29. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/types/types.py +3 -0
  30. eval_protocol-0.2.6.dev2/eval_protocol/utils/logs_server.py +338 -0
  31. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2/eval_protocol.egg-info}/PKG-INFO +2 -2
  32. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/SOURCES.txt +15 -4
  33. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/entry_points.txt +3 -0
  34. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/requires.txt +1 -1
  35. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/pyproject.toml +13 -1
  36. eval_protocol-0.2.6.dev2/tests/test_event_bus.py +265 -0
  37. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_parallel_rollouts.py +2 -2
  38. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_url_handling.py +8 -26
  39. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/agent/llm_agent.py +22 -36
  40. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/user/user_simulator.py +9 -5
  41. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/llm_utils.py +18 -3
  42. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-D9iVTBbF.css +1 -0
  43. eval_protocol-0.2.6.dev1/vite-app/dist/assets/index-BMc_e8JT.js → eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js +19 -19
  44. eval_protocol-0.2.6.dev1/vite-app/dist/assets/index-BMc_e8JT.js.map → eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js.map +1 -1
  45. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vite-app/dist/index.html +3 -3
  46. eval_protocol-0.2.6.dev1/eval_protocol/common_utils.py +0 -30
  47. eval_protocol-0.2.6.dev1/eval_protocol/dataset_logger/__init__.py +0 -3
  48. eval_protocol-0.2.6.dev1/eval_protocol/utils/logs_server.py +0 -299
  49. eval_protocol-0.2.6.dev1/vite-app/dist/assets/index-Dp7ms4NJ.css +0 -1
  50. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/LICENSE +0 -0
  51. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/README.md +0 -0
  52. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/development/__init__.py +0 -0
  53. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/development/normalize_sandbox_fusion.py +0 -0
  54. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/development/utils/__init__.py +0 -0
  55. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/development/utils/generate_api_key.py +0 -0
  56. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/development/utils/subprocess_manager.py +0 -0
  57. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/__init__.py +0 -0
  58. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/__main__.py +0 -0
  59. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/__init__.py +0 -0
  60. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/braintrust.py +0 -0
  61. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/huggingface.py +0 -0
  62. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/langfuse.py +0 -0
  63. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/trl.py +0 -0
  64. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/__init__.py +0 -0
  65. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/models.py +0 -0
  66. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/orchestrator.py +0 -0
  67. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resource_abc.py +0 -0
  68. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resource_pool.py +0 -0
  69. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
  70. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  71. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  72. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  73. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  74. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  75. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
  76. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  77. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  78. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  79. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  80. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
  81. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/task_manager.py +0 -0
  82. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/tool_registry.py +0 -0
  83. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/auth.py +0 -0
  84. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli.py +0 -0
  85. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
  86. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  87. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/common.py +0 -0
  88. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
  89. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  90. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/logs.py +0 -0
  91. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/preview.py +0 -0
  92. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  93. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/config.py +0 -0
  94. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  95. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/datasets/__init__.py +0 -0
  96. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/datasets/loader.py +0 -0
  97. {eval_protocol-0.2.6.dev1/eval_protocol/dataset_logger → eval_protocol-0.2.6.dev2/eval_protocol}/directory_utils.py +0 -0
  98. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/evaluation.py +0 -0
  99. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/execution/__init__.py +0 -0
  100. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/execution/pipeline.py +0 -0
  101. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/gcp_tools.py +0 -0
  102. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/generation/cache.py +0 -0
  103. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/generation/clients/base.py +0 -0
  104. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/generic_server.py +0 -0
  105. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/get_pep440_version.py +0 -0
  106. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/human_id/__init__.py +0 -0
  107. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/human_id/dictionary.py +0 -0
  108. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/__init__.py +0 -0
  109. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/braintrust.py +0 -0
  110. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/deepeval.py +0 -0
  111. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/openeval.py +0 -0
  112. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/trl.py +0 -0
  113. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/__init__.py +0 -0
  114. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/adapter.py +0 -0
  115. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
  116. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/clients.py +0 -0
  117. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
  118. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
  119. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
  120. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  121. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/process_manager.py +0 -0
  122. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
  123. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
  124. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
  125. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
  126. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/config.py +0 -0
  127. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  128. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/main.py +0 -0
  129. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  130. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  131. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  132. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  133. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  134. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/session.py +0 -0
  135. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/packaging.py +0 -0
  136. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/platform_api.py +0 -0
  137. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/playback_policy.py +0 -0
  138. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/__init__.py +0 -0
  139. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  140. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  141. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  142. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/types.py +0 -0
  143. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/resources.py +0 -0
  144. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/reward_function.py +0 -0
  145. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/__init__.py +0 -0
  146. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/accuracy.py +0 -0
  147. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
  148. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  149. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  150. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
  151. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
  152. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/code_execution.py +0 -0
  153. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
  154. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
  155. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  156. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/format.py +0 -0
  157. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/function_calling.py +0 -0
  158. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/json_schema.py +0 -0
  159. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
  160. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
  161. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/length.py +0 -0
  162. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  163. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/math.py +0 -0
  164. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  165. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
  166. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/repetition.py +0 -0
  167. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/tag_count.py +0 -0
  168. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/rl_processing.py +0 -0
  169. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/server.py +0 -0
  170. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/typed_interface.py +0 -0
  171. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/types/__init__.py +0 -0
  172. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/__init__.py +0 -0
  173. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
  174. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
  175. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
  176. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/module_loader.py +0 -0
  177. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
  178. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/static_policy.py +0 -0
  179. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/vite_server.py +0 -0
  180. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
  181. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
  182. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/setup.cfg +0 -0
  183. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/setup.py +0 -0
  184. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_accuracy.py +0 -0
  185. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_accuracy_length.py +0 -0
  186. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_adapters_e2e.py +0 -0
  187. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_agent_orchestrator.py +0 -0
  188. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_agent_resources.py +0 -0
  189. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_auth.py +0 -0
  190. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_batch_evaluation.py +0 -0
  191. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_braintrust_adapter.py +0 -0
  192. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_braintrust_example.py +0 -0
  193. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_cli.py +0 -0
  194. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_cli_agent.py +0 -0
  195. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_cli_args.py +0 -0
  196. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_code_execution.py +0 -0
  197. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_config.py +0 -0
  198. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_control_plane_separation.py +0 -0
  199. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_cpp_code.py +0 -0
  200. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_data_driven_task_manager.py +0 -0
  201. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_deepcoder_reward.py +0 -0
  202. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_deepeval_integration.py +0 -0
  203. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_deploy_integration.py +0 -0
  204. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_e2b_integration.py +0 -0
  205. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_e2b_js_integration.py +0 -0
  206. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_edge_cases.py +0 -0
  207. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_eval_protocol_import.py +0 -0
  208. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_evaluation.py +0 -0
  209. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_evaluation_integration.py +0 -0
  210. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_evaluation_preview_integration.py +0 -0
  211. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_examples_end_to_end.py +0 -0
  212. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_fireworks_api.py +0 -0
  213. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_format.py +0 -0
  214. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_fractional_code.py +0 -0
  215. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_frozen_lake_http_server.py +0 -0
  216. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  217. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_function_calling.py +0 -0
  218. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_gcp_tools.py +0 -0
  219. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_generic_server.py +0 -0
  220. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_integration.py +0 -0
  221. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_json_schema.py +0 -0
  222. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_kwargs_validation.py +0 -0
  223. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_language_consistency.py +0 -0
  224. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_lean_prover.py +0 -0
  225. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_lean_prover_runner.py +0 -0
  226. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_length.py +0 -0
  227. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_list_comparison_math_reward.py +0 -0
  228. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_math.py +0 -0
  229. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_minimal.py +0 -0
  230. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_models.py +0 -0
  231. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_models_rl.py +0 -0
  232. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
  233. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_n_variant_batch_integration.py +0 -0
  234. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_n_variant_integration.py +0 -0
  235. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_openai_compatibility.py +0 -0
  236. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_openeval_integration.py +0 -0
  237. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_packaging.py +0 -0
  238. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_platform_api.py +0 -0
  239. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_readiness.py +0 -0
  240. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_reasoning_steps.py +0 -0
  241. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_repetition.py +0 -0
  242. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_repetition_debug.py +0 -0
  243. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_reward_function.py +0 -0
  244. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_reward_protocol_import.py +0 -0
  245. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_rl_processing.py +0 -0
  246. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
  247. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_server.py +0 -0
  248. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_tag_count.py +0 -0
  249. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_typed_interface.py +0 -0
  250. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/tests/test_typed_interface_rl.py +0 -0
  251. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/__init__.py +0 -0
  252. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/agent/__init__.py +0 -0
  253. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/agent/base.py +0 -0
  254. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/__init__.py +0 -0
  255. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/api_config.py +0 -0
  256. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/data_model.py +0 -0
  257. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
  258. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/cli.py +0 -0
  259. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/config.py +0 -0
  260. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/__init__.py +0 -0
  261. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/message.py +0 -0
  262. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/simulation.py +0 -0
  263. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/tasks.py +0 -0
  264. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/__init__.py +0 -0
  265. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
  266. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
  267. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
  268. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
  269. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
  270. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
  271. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
  272. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
  273. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
  274. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
  275. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
  276. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
  277. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
  278. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
  279. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
  280. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
  281. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
  282. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
  283. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  284. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  285. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  286. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  287. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  288. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  289. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  290. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  291. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
  292. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  293. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  294. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
  295. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/__init__.py +0 -0
  296. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/db.py +0 -0
  297. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/environment.py +0 -0
  298. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/server.py +0 -0
  299. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/tool.py +0 -0
  300. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/toolkit.py +0 -0
  301. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  302. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
  303. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
  304. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  305. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  306. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  307. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  308. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  309. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/metrics/__init__.py +0 -0
  310. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
  311. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  312. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
  313. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  314. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  315. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
  316. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/registry.py +0 -0
  317. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/run.py +0 -0
  318. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/__init__.py +0 -0
  319. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/check_data.py +0 -0
  320. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  321. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
  322. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
  323. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/user/__init__.py +0 -0
  324. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/user/base.py +0 -0
  325. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/__init__.py +0 -0
  326. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/display.py +0 -0
  327. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/io_utils.py +0 -0
  328. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
  329. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/utils.py +0 -0
  330. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/versioneer.py +0 -0
  331. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  332. {eval_protocol-0.2.6.dev1 → eval_protocol-0.2.6.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.6.dev1
3
+ Version: 0.2.6.dev2
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -38,10 +38,10 @@ Requires-Dist: litellm>=1.0.0
38
38
  Requires-Dist: addict>=2.4.0
39
39
  Requires-Dist: deepdiff>=6.0.0
40
40
  Requires-Dist: pandas>=1.5.0
41
- Requires-Dist: watchdog>=2.1.0
42
41
  Requires-Dist: websockets>=15.0.1
43
42
  Requires-Dist: fastapi>=0.116.1
44
43
  Requires-Dist: pytest>=6.0.0
44
+ Requires-Dist: peewee>=3.18.2
45
45
  Provides-Extra: dev
46
46
  Requires-Dist: build; extra == "dev"
47
47
  Requires-Dist: twine; extra == "dev"
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-08T10:51:54-0700",
11
+ "date": "2025-08-10T19:39:17-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "986452fd04442f9a0d1ba902753a83480e413d43",
15
- "version": "0.2.6-dev1"
14
+ "full-revisionid": "a50c3f62fdb5be7347741446338d8c1771e92547",
15
+ "version": "0.2.6-dev2"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,55 @@
1
+ import json
2
+ import re
3
+ from typing import Any, Dict, List
4
+
5
+ import requests
6
+
7
+
8
def _parse_jsonl_line(stripped: str, line_number: int, source_kind: str, source: str) -> Any:
    """Parse one non-blank JSONL line, raising a descriptive error on failure.

    Args:
        stripped: The line content with surrounding whitespace removed.
        line_number: 1-based line number, used only for error reporting.
        source_kind: Either "URL" or "file"; used only in the printed diagnostic.
        source: The URL or path the line came from.

    Raises:
        ValueError: If the line is invalid JSON and mentions ``row_id``; the
            message ends with the extracted row id (or ``None``) in parentheses.
        json.JSONDecodeError: If the line is invalid JSON and has no ``row_id``.
    """
    try:
        return json.loads(stripped)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON line for {source_kind} {source} at line {line_number}")
        if "row_id" in stripped:
            # Search the whole line and capture either a quoted string or a bare
            # token, so the message carries the actual id rather than a Match object.
            match = re.search(r'"row_id"\s*:\s*("(?:[^"\\]|\\.)*"|[^,}\s]+)', stripped)
            row_id = match.group(1) if match else None
            raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e
        raise


def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Reads a JSONL source where each non-blank line is a JSON value and returns the parsed values.

    Supports HTTP(S) URLs and local file paths. Blank and whitespace-only lines are skipped.

    Args:
        file_path: Path to a local JSONL file, or an ``http://`` / ``https://`` URL.

    Returns:
        A list with one parsed JSON value per non-blank line, in source order.

    Raises:
        ValueError: If a line is invalid JSON and contains ``row_id``.
        json.JSONDecodeError: If a line is invalid JSON and has no ``row_id``.
        OSError: If the local file cannot be opened.
        requests.RequestException: If the HTTP request fails or returns an error status.
    """
    data: List[Dict[str, Any]] = []
    if file_path.startswith(("http://", "https://")):
        # The context manager releases the streamed connection even when parsing fails.
        with requests.get(file_path, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            for line_number, raw in enumerate(resp.iter_lines(decode_unicode=True), start=1):
                if raw is None:
                    continue
                if isinstance(raw, bytes):
                    # iter_lines yields bytes when the response declares no encoding.
                    raw = raw.decode("utf-8")
                stripped = raw.strip()
                if stripped:
                    data.append(_parse_jsonl_line(stripped, line_number, "URL", file_path))
    else:
        with open(file_path, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                # Skip blank lines so trailing newlines do not produce parse errors.
                stripped = line.strip()
                if stripped:
                    data.append(_parse_jsonl_line(stripped, line_number, "file", file_path))
    return data
@@ -0,0 +1,3 @@
1
+ from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter
2
+
3
+ default_logger = SqliteDatasetLoggerAdapter()
@@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, List, Optional
4
4
  if TYPE_CHECKING:
5
5
  from eval_protocol.models import EvaluationRow
6
6
 
7
+ LOG_EVENT_TYPE = "log"
8
+
7
9
 
8
10
  class DatasetLogger(ABC):
9
11
  """
@@ -0,0 +1,39 @@
1
+ import os
2
+ from typing import List, Optional
3
+
4
+ from eval_protocol.dataset_logger.dataset_logger import LOG_EVENT_TYPE, DatasetLogger
5
+ from eval_protocol.dataset_logger.sqlite_evaluation_row_store import SqliteEvaluationRowStore
6
+ from eval_protocol.directory_utils import find_eval_protocol_dir
7
+ from eval_protocol.event_bus import event_bus
8
+ from eval_protocol.event_bus.logger import logger
9
+ from eval_protocol.models import EvaluationRow
10
+
11
+
12
class SqliteDatasetLoggerAdapter(DatasetLogger):
    """Dataset logger that stores rows in SQLite and emits an event for every logged row."""

    def __init__(self, db_path: Optional[str] = None, store: Optional[SqliteEvaluationRowStore] = None):
        """Bind the adapter to an existing store or open one at ``db_path``.

        Args:
            db_path: Database file to open. When omitted (and no store is given),
                ``logs.db`` inside the eval-protocol directory is used.
            store: An already constructed row store to reuse.

        Raises:
            ValueError: If both ``db_path`` and ``store`` are supplied.
        """
        # Resolved before validation, matching the original evaluation order.
        base_dir = find_eval_protocol_dir()
        if store is not None and db_path is not None:
            raise ValueError("Provide only one of db_path or store, not both.")
        if store is None:
            self.db_path = os.path.join(base_dir, "logs.db") if db_path is None else db_path
            self._store = SqliteEvaluationRowStore(self.db_path)
            return
        self.db_path = store.db_path
        self._store = store

    def log(self, row: "EvaluationRow") -> None:
        """Upsert ``row`` into the store, then emit it on the event bus.

        Emission failures are logged and swallowed so they never undo or block storage.
        """
        key = row.input_metadata.row_id
        payload = row.model_dump(exclude_none=True, mode="json")
        self._store.upsert_row(row_id=key, data=payload)
        try:
            event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**payload))
        except Exception as e:
            # Avoid breaking storage due to event emission issues
            logger.error(f"Failed to emit row_upserted event: {e}")

    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """Return stored rows as ``EvaluationRow`` objects; all rows when ``row_id`` is None."""
        stored = self._store.read_rows(row_id=row_id)
        return [EvaluationRow(**record) for record in stored]
@@ -0,0 +1,57 @@
1
+ import os
2
+ from typing import List, Optional
3
+
4
+ from peewee import CharField, Model, SqliteDatabase
5
+ from playhouse.sqlite_ext import JSONField
6
+
7
+ from eval_protocol.models import EvaluationRow
8
+
9
+
10
class SqliteEvaluationRowStore:
    """
    Lightweight reusable SQLite store for evaluation rows.

    Stores arbitrary row data as JSON keyed by a unique string `row_id`.
    """

    def __init__(self, db_path: str):
        """Open (creating if needed) the database at ``db_path`` and ensure the table exists.

        Args:
            db_path: Path of the SQLite database file. A bare filename or
                ``:memory:`` is accepted; a parent directory is created only
                when the path actually has one.
        """
        # os.makedirs("") raises FileNotFoundError, so skip it when there is no directory part.
        parent_dir = os.path.dirname(db_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        self._db_path = db_path
        self._db = SqliteDatabase(self._db_path)

        class BaseModel(Model):
            class Meta:
                database = self._db

        # NOTE: the class name determines the table name; do not rename it.
        class EvaluationRow(BaseModel):  # type: ignore
            row_id = CharField(unique=True)
            data = JSONField()

        self._EvaluationRow = EvaluationRow

        self._db.connect()
        self._db.create_tables([EvaluationRow])

    @property
    def db_path(self) -> str:
        """Path of the underlying SQLite database file."""
        return self._db_path

    def upsert_row(self, row_id: str, data: dict) -> None:
        """Insert ``data`` under ``row_id``, replacing the stored data if the id already exists.

        Uses a single ``INSERT ... ON CONFLICT DO UPDATE`` statement so concurrent
        writers cannot both observe "missing" and then collide on the unique index.
        Requires SQLite 3.24 or newer.
        """
        row_model = self._EvaluationRow
        (
            row_model.insert(row_id=row_id, data=data)
            .on_conflict(conflict_target=[row_model.row_id], preserve=[row_model.data])
            .execute()
        )

    def read_rows(self, row_id: Optional[str] = None) -> List[dict]:
        """Return the stored data dicts; all rows when ``row_id`` is None, else only that row."""
        row_model = self._EvaluationRow
        query = row_model.select().dicts()
        if row_id is not None:
            query = query.where(row_model.row_id == row_id)
        return [record["data"] for record in query]

    def delete_row(self, row_id: str) -> int:
        """Delete the row with ``row_id`` and return the number of rows removed."""
        return self._EvaluationRow.delete().where(self._EvaluationRow.row_id == row_id).execute()

    def delete_all_rows(self) -> int:
        """Delete every row and return the number of rows removed."""
        return self._EvaluationRow.delete().execute()
@@ -0,0 +1,5 @@
1
+ # Global event bus instance - uses SqliteEventBus for cross-process functionality
2
+ from eval_protocol.event_bus.event_bus import EventBus
3
+ from eval_protocol.event_bus.sqlite_event_bus import SqliteEventBus
4
+
5
+ event_bus: EventBus = SqliteEventBus()
@@ -0,0 +1,50 @@
1
+ from typing import Any, Callable, List
2
+
3
+ from eval_protocol.event_bus.logger import logger
4
+
5
+
6
class EventBus:
    """Core event bus interface for decoupling components in the evaluation system."""

    def __init__(self):
        # Callbacks are invoked in subscription order.
        self._listeners: List[Callable[[str, Any], None]] = []

    def subscribe(self, callback: Callable[[str, Any], None]) -> None:
        """Subscribe to events.

        Args:
            callback: Function that takes (event_type, data) parameters
        """
        self._listeners.append(callback)

    def unsubscribe(self, callback: Callable[[str, Any], None]) -> None:
        """Unsubscribe from events.

        Unsubscribing a callback that was never subscribed is a no-op.

        Args:
            callback: The callback function to remove
        """
        try:
            self._listeners.remove(callback)
        except ValueError:
            pass  # Callback wasn't subscribed

    def emit(self, event_type: str, data: Any) -> None:
        """Emit an event to all subscribers.

        A failing listener is logged at debug level and does not prevent the
        remaining listeners from being called.

        Args:
            event_type: Type of event (e.g., "log")
            data: Event data
        """
        # Iterate over a snapshot: a listener may subscribe or unsubscribe during
        # dispatch, and mutating the live list mid-iteration would skip callbacks.
        for listener in list(self._listeners):
            try:
                listener(event_type, data)
            except Exception as e:
                logger.debug(f"Event listener failed for {event_type}: {e}")

    def start_listening(self) -> None:
        """Start listening for cross-process events. Override in subclasses."""
        pass

    def stop_listening(self) -> None:
        """Stop listening for cross-process events. Override in subclasses."""
        pass
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger(__name__)
@@ -0,0 +1,109 @@
1
+ import threading
2
+ import time
3
+ from typing import Any, Optional
4
+ from uuid import uuid4
5
+
6
+ from eval_protocol.event_bus.event_bus import EventBus
7
+ from eval_protocol.event_bus.logger import logger
8
+ from eval_protocol.event_bus.sqlite_event_bus_database import SqliteEventBusDatabase
9
+
10
+
11
class SqliteEventBus(EventBus):
    """Event bus that also relays events between processes through a SQLite table."""

    def __init__(self, db_path: Optional[str] = None):
        """Create the bus.

        Args:
            db_path: SQLite file used for cross-process events. Defaults to
                ``logs.db`` inside the eval-protocol directory.
        """
        super().__init__()

        # Use the same database as the evaluation row store
        if db_path is None:
            import os

            from eval_protocol.directory_utils import find_eval_protocol_dir

            db_path = os.path.join(find_eval_protocol_dir(), "logs.db")

        self._db = SqliteEventBusDatabase(db_path)
        self._running = False
        self._listener_thread: Optional[threading.Thread] = None
        # Random id written on published events and used to filter out our own when polling.
        self._process_id = str(uuid4())

    def emit(self, event_type: str, data: Any) -> None:
        """Deliver an event to local subscribers, then publish it for other processes.

        Args:
            event_type: Type of event (e.g., "log")
            data: Event data
        """
        # Local listeners first, so in-process delivery never waits on the database.
        super().emit(event_type, data)
        self._publish_cross_process(event_type, data)

    def _publish_cross_process(self, event_type: str, data: Any) -> None:
        """Write the event to the database under this bus's id."""
        self._db.publish_event(event_type, data, self._process_id)

    def start_listening(self) -> None:
        """Start the background poller; a no-op when it is already running."""
        if not self._running:
            self._running = True
            self._start_database_listener()

    def stop_listening(self) -> None:
        """Signal the poller to stop and wait up to one second for it to exit."""
        self._running = False
        worker = self._listener_thread
        if worker and worker.is_alive():
            worker.join(timeout=1)

    def _start_database_listener(self) -> None:
        """Spawn the daemon thread that polls the database for events."""
        self._listener_thread = threading.Thread(target=self._poll_database, daemon=True)
        self._listener_thread.start()

    def _poll_database(self) -> None:
        """Thread body: dispatch unprocessed events until ``_running`` is cleared."""
        cleaned_at = time.time()

        while self._running:
            try:
                # Get unprocessed events from other processes
                for pending in self._db.get_unprocessed_events(self._process_id):
                    if not self._running:
                        break

                    try:
                        self._handle_cross_process_event(pending["event_type"], pending["data"])
                        self._db.mark_event_processed(pending["event_id"])
                    except Exception as e:
                        logger.debug(f"Failed to process event {pending['event_id']}: {e}")

                # Clean up old events every hour
                now = time.time()
                if now - cleaned_at >= 3600:
                    self._db.cleanup_old_events()
                    cleaned_at = now

                # Small sleep to prevent busy waiting
                time.sleep(0.1)

            except Exception as e:
                logger.debug(f"Database listener error: {e}")
                time.sleep(1)

    def _handle_cross_process_event(self, event_type: str, data: Any) -> None:
        """Forward an event read from the database to every local subscriber."""
        for callback in self._listeners:
            try:
                callback(event_type, data)
            except Exception as e:
                logger.debug(f"Cross-process event listener failed for {event_type}: {e}")
@@ -0,0 +1,95 @@
1
+ import time
2
+ from typing import Any, List
3
+ from uuid import uuid4
4
+
5
+ from peewee import CharField, DateTimeField, Model, SqliteDatabase
6
+ from playhouse.sqlite_ext import JSONField
7
+
8
+ from eval_protocol.event_bus.logger import logger
9
+
10
+
11
class SqliteEventBusDatabase:
    """SQLite-backed table of events used to pass messages between processes."""

    def __init__(self, db_path: str):
        """Connect to ``db_path`` and create the event table if it does not exist."""
        self._db_path = db_path
        self._db = SqliteDatabase(db_path)

        class BaseModel(Model):
            class Meta:
                database = self._db

        # NOTE: the class name determines the table name; do not rename it.
        class Event(BaseModel):  # type: ignore
            event_id = CharField(unique=True)
            event_type = CharField()
            data = JSONField()
            # NOTE(review): declared as a datetime column but written and compared
            # as a time.time() float elsewhere in this class.
            timestamp = DateTimeField()
            process_id = CharField()
            processed = CharField(default="false")  # Track if event has been processed

        self._Event = Event
        self._db.connect()
        self._db.create_tables([Event])

    def publish_event(self, event_type: str, data: Any, process_id: str) -> None:
        """Store a new unprocessed event; failures are logged as warnings, not raised."""
        try:
            # Pydantic-style objects are dumped to JSON-compatible data; anything else is stored as given.
            payload = data.model_dump(mode="json", exclude_none=True) if hasattr(data, "model_dump") else data

            self._Event.create(
                event_id=str(uuid4()),
                event_type=event_type,
                data=payload,
                timestamp=time.time(),
                process_id=process_id,
                processed="false",
            )
        except Exception as e:
            logger.warning(f"Failed to publish event to database: {e}")

    def get_unprocessed_events(self, process_id: str) -> List[dict]:
        """Return unprocessed events published under a different ``process_id``, oldest first.

        Returns an empty list (after logging a warning) if the query fails.
        """
        event_model = self._Event
        try:
            from_others = event_model.process_id != process_id
            still_pending = event_model.processed == "false"
            pending = event_model.select().where(from_others & still_pending).order_by(event_model.timestamp)

            return [
                {
                    "event_id": record.event_id,
                    "event_type": record.event_type,
                    "data": record.data,
                    "timestamp": record.timestamp,
                    "process_id": record.process_id,
                }
                for record in pending
            ]
        except Exception as e:
            logger.warning(f"Failed to get unprocessed events: {e}")
            return []

    def mark_event_processed(self, event_id: str) -> None:
        """Flag the event with ``event_id`` as processed; failures are logged at debug level."""
        try:
            self._Event.update(processed="true").where(self._Event.event_id == event_id).execute()
        except Exception as e:
            logger.debug(f"Failed to mark event as processed: {e}")

    def cleanup_old_events(self, max_age_hours: int = 24) -> None:
        """Delete processed events older than ``max_age_hours``; failures are logged at debug level."""
        try:
            oldest_kept = time.time() - (max_age_hours * 3600)
            is_processed = self._Event.processed == "true"
            is_expired = self._Event.timestamp < oldest_kept
            self._Event.delete().where(is_processed & is_expired).execute()
        except Exception as e:
            logger.debug(f"Failed to cleanup old events: {e}")
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
 
12
12
  import aiohttp
13
13
  from omegaconf import DictConfig
14
- from pydantic import BaseModel, Field # Added for new models
14
+ from pydantic import BaseModel # Added for new models
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
@@ -83,6 +83,9 @@ class FireworksModelClient(ModelClient):
83
83
  }
84
84
  if self.top_p is not None:
85
85
  payload["top_p"] = self.top_p
86
+ # Include reasoning settings if configured (for reasoning-capable models)
87
+ if self.reasoning_effort:
88
+ payload["reasoning_effort"] = self.reasoning_effort
86
89
 
87
90
  if tools:
88
91
  payload["tools"] = tools
@@ -12,7 +12,7 @@ import sys
12
12
  from pathlib import Path
13
13
  from typing import Optional
14
14
 
15
- from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_dir
15
+ from eval_protocol.directory_utils import find_eval_protocol_dir
16
16
 
17
17
 
18
18
  def setup_logger(
@@ -9,14 +9,16 @@ import asyncio
9
9
  import hashlib
10
10
  import json
11
11
  import logging
12
+ import time
12
13
  from contextlib import AsyncExitStack
13
14
  from typing import Any, Dict, List, Optional, Tuple
14
15
 
16
+ import httpx
15
17
  from mcp.client.session import ClientSession
16
18
  from mcp.client.streamable_http import streamablehttp_client
19
+ from mcp.types import Implementation
17
20
 
18
21
  from ...types import MCPSession
19
- from mcp.types import Implementation
20
22
 
21
23
  logger = logging.getLogger(__name__)
22
24
 
@@ -109,15 +111,13 @@ class MCPConnectionManager:
109
111
  """
110
112
  Clean session data in remote mcp server for the given session
111
113
  """
112
- import httpx
113
-
114
114
  base_url = session.base_url.rstrip("/").removesuffix("/mcp")
115
115
  url = f"{base_url}/control/reset_session"
116
116
 
117
117
  headers = {"mcp-session-id": session.session_id}
118
118
  body = {"seed": session.seed}
119
119
 
120
- timeout = httpx.Timeout(3.0)
120
+ timeout = httpx.Timeout(15.0)
121
121
  async with httpx.AsyncClient(timeout=timeout) as client:
122
122
  resp = await client.post(url, headers=headers, json=body)
123
123
  resp.raise_for_status()
@@ -202,8 +202,6 @@ class MCPConnectionManager:
202
202
  initial_observation = None
203
203
 
204
204
  try:
205
- import httpx
206
-
207
205
  # Extract base URL and session ID from the MCP session
208
206
  base_url = session.base_url.rstrip("/").removesuffix("/mcp")
209
207
  session_id = session.session_id
@@ -459,9 +457,6 @@ class MCPConnectionManager:
459
457
  control_plane_info = {}
460
458
 
461
459
  try:
462
- # Query control plane endpoints following the new architecture
463
- import httpx
464
-
465
460
  # Extract base URL and session ID from the MCP session
466
461
  base_url = session.base_url.rstrip("/").removesuffix("/mcp")
467
462
  # Use the session ID from the established MCP session
@@ -544,10 +539,10 @@ class MCPConnectionManager:
544
539
  await session._exit_stack.aclose()
545
540
  except asyncio.CancelledError:
546
541
  # Handle cancellation gracefully (especially important for Python 3.12)
547
- logger.debug(f"Session {session.session_id} close was cancelled")
542
+ logger.error(f"Session {session.session_id} close was cancelled")
548
543
  except Exception as e:
549
544
  # Hitting this error, probably because of use of threads: "Attempted to exit cancel scope in a different task than it was entered in"
550
- logger.debug(f"Error closing session {session.session_id}: {e}")
545
+ logger.error(f"Error closing session {session.session_id}: {e}")
551
546
  finally:
552
547
  session._exit_stack = None
553
548
  session._mcp_session = None
@@ -220,7 +220,7 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
220
220
  return mcp_tool_calls, usage_stats
221
221
  else:
222
222
  # No tool calls in response - this is normal when episode ends or LLM provides only text
223
- logger.info(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
223
+ logger.debug(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
224
224
  return [
225
225
  MCPToolCall(
226
226
  tool_name="_no_tool_call",