eval-protocol 0.2.6__tar.gz → 0.2.6.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (335) hide show
  1. {eval_protocol-0.2.6/eval_protocol.egg-info → eval_protocol-0.2.6.dev2}/PKG-INFO +3 -3
  2. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/__init__.py +4 -3
  3. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/_version.py +3 -3
  4. eval_protocol-0.2.6.dev2/eval_protocol/common_utils.py +55 -0
  5. eval_protocol-0.2.6.dev2/eval_protocol/dataset_logger/__init__.py +3 -0
  6. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/dataset_logger/dataset_logger.py +2 -0
  7. eval_protocol-0.2.6.dev2/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +98 -0
  8. eval_protocol-0.2.6.dev2/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +39 -0
  9. eval_protocol-0.2.6.dev2/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +57 -0
  10. eval_protocol-0.2.6.dev2/eval_protocol/directory_utils.py +55 -0
  11. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/__init__.py +5 -0
  12. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/event_bus.py +50 -0
  13. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/logger.py +3 -0
  14. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/sqlite_event_bus.py +109 -0
  15. eval_protocol-0.2.6.dev2/eval_protocol/event_bus/sqlite_event_bus_database.py +95 -0
  16. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/generation/clients.py +4 -1
  17. eval_protocol-0.2.6.dev2/eval_protocol/get_pep440_version.py +133 -0
  18. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/human_id/__init__.py +3 -2
  19. eval_protocol-0.2.6.dev2/eval_protocol/logging_utils.py +175 -0
  20. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/client/connection.py +16 -49
  21. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/execution/base_policy.py +1 -1
  22. eval_protocol-0.2.6.dev2/eval_protocol/mcp/execution/manager.py +562 -0
  23. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/mcpgym.py +67 -102
  24. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/session/manager.py +4 -0
  25. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_env.py +35 -16
  26. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/models.py +54 -4
  27. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +14 -1
  28. eval_protocol-0.2.6.dev2/eval_protocol/pytest/default_single_turn_rollout_process.py +96 -0
  29. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/evaluation_test.py +316 -51
  30. eval_protocol-0.2.6.dev2/eval_protocol/pytest/plugin.py +144 -0
  31. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/utils.py +32 -2
  32. eval_protocol-0.2.6.dev2/eval_protocol/stats/__init__.py +5 -0
  33. eval_protocol-0.2.6.dev2/eval_protocol/stats/confidence_intervals.py +116 -0
  34. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/types/types.py +4 -0
  35. eval_protocol-0.2.6.dev2/eval_protocol/utils/logs_server.py +338 -0
  36. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2/eval_protocol.egg-info}/PKG-INFO +3 -3
  37. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/SOURCES.txt +17 -3
  38. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/entry_points.txt +4 -0
  39. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/requires.txt +2 -2
  40. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/pyproject.toml +15 -2
  41. eval_protocol-0.2.6.dev2/tests/test_event_bus.py +265 -0
  42. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_parallel_rollouts.py +2 -2
  43. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_rollout_control_plane_integration.py +1 -1
  44. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_url_handling.py +8 -26
  45. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/agent/llm_agent.py +22 -36
  46. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/user/user_simulator.py +9 -5
  47. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/llm_utils.py +18 -3
  48. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-D9iVTBbF.css +1 -0
  49. eval_protocol-0.2.6/vite-app/dist/assets/index-CRkZ6JGL.js → eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js +19 -19
  50. eval_protocol-0.2.6/vite-app/dist/assets/index-CRkZ6JGL.js.map → eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js.map +1 -1
  51. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vite-app/dist/index.html +3 -3
  52. eval_protocol-0.2.6/eval_protocol/common_utils.py +0 -36
  53. eval_protocol-0.2.6/eval_protocol/dataset_logger/__init__.py +0 -3
  54. eval_protocol-0.2.6/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -114
  55. eval_protocol-0.2.6/eval_protocol/mcp/execution/manager.py +0 -518
  56. eval_protocol-0.2.6/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -50
  57. eval_protocol-0.2.6/eval_protocol/utils/logs_server.py +0 -295
  58. eval_protocol-0.2.6/vite-app/dist/assets/index-BySN1scz.css +0 -1
  59. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/LICENSE +0 -0
  60. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/README.md +0 -0
  61. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/development/__init__.py +0 -0
  62. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/development/normalize_sandbox_fusion.py +0 -0
  63. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/development/utils/__init__.py +0 -0
  64. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/development/utils/generate_api_key.py +0 -0
  65. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/development/utils/subprocess_manager.py +0 -0
  66. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/__main__.py +0 -0
  67. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/__init__.py +0 -0
  68. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/braintrust.py +0 -0
  69. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/huggingface.py +0 -0
  70. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/langfuse.py +0 -0
  71. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/adapters/trl.py +0 -0
  72. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/__init__.py +0 -0
  73. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/models.py +0 -0
  74. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/orchestrator.py +0 -0
  75. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resource_abc.py +0 -0
  76. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resource_pool.py +0 -0
  77. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
  78. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  79. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  80. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  81. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  82. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  83. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
  84. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  85. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  86. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  87. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  88. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
  89. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/task_manager.py +0 -0
  90. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/agent/tool_registry.py +0 -0
  91. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/auth.py +0 -0
  92. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli.py +0 -0
  93. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
  94. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  95. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/common.py +0 -0
  96. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
  97. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  98. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/logs.py +0 -0
  99. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/preview.py +0 -0
  100. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  101. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/config.py +0 -0
  102. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/datasets/__init__.py +0 -0
  103. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/datasets/loader.py +0 -0
  104. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/evaluation.py +0 -0
  105. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/execution/__init__.py +0 -0
  106. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/execution/pipeline.py +0 -0
  107. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/gcp_tools.py +0 -0
  108. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/generation/cache.py +0 -0
  109. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/generation/clients/base.py +0 -0
  110. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/generic_server.py +0 -0
  111. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/human_id/dictionary.py +0 -0
  112. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/__init__.py +0 -0
  113. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/braintrust.py +0 -0
  114. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/deepeval.py +0 -0
  115. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/openeval.py +0 -0
  116. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/integrations/trl.py +0 -0
  117. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/__init__.py +0 -0
  118. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/adapter.py +0 -0
  119. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
  120. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/clients.py +0 -0
  121. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
  122. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
  123. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
  124. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  125. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/process_manager.py +0 -0
  126. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
  127. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
  128. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
  129. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
  130. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/config.py +0 -0
  131. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  132. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/main.py +0 -0
  133. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  134. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  135. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  136. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  137. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  138. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/mcp_agent/session.py +0 -0
  139. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/packaging.py +0 -0
  140. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/platform_api.py +0 -0
  141. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/playback_policy.py +0 -0
  142. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/__init__.py +0 -0
  143. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  144. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  145. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  146. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/pytest/types.py +0 -0
  147. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/resources.py +0 -0
  148. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/reward_function.py +0 -0
  149. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/__init__.py +0 -0
  150. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/accuracy.py +0 -0
  151. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
  152. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  153. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  154. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
  155. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
  156. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/code_execution.py +0 -0
  157. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
  158. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
  159. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  160. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/format.py +0 -0
  161. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/function_calling.py +0 -0
  162. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/json_schema.py +0 -0
  163. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
  164. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
  165. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/length.py +0 -0
  166. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  167. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/math.py +0 -0
  168. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  169. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
  170. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/repetition.py +0 -0
  171. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rewards/tag_count.py +0 -0
  172. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/rl_processing.py +0 -0
  173. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/server.py +0 -0
  174. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/typed_interface.py +0 -0
  175. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/types/__init__.py +0 -0
  176. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/__init__.py +0 -0
  177. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
  178. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
  179. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
  180. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/module_loader.py +0 -0
  181. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
  182. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/static_policy.py +0 -0
  183. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol/utils/vite_server.py +0 -0
  184. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
  185. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
  186. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/setup.cfg +0 -0
  187. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/setup.py +0 -0
  188. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_accuracy.py +0 -0
  189. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_accuracy_length.py +0 -0
  190. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_adapters_e2e.py +0 -0
  191. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_agent_orchestrator.py +0 -0
  192. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_agent_resources.py +0 -0
  193. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_auth.py +0 -0
  194. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_batch_evaluation.py +0 -0
  195. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_braintrust_adapter.py +0 -0
  196. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_braintrust_example.py +0 -0
  197. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_cli.py +0 -0
  198. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_cli_agent.py +0 -0
  199. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_cli_args.py +0 -0
  200. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_code_execution.py +0 -0
  201. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_config.py +0 -0
  202. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_control_plane_separation.py +0 -0
  203. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_cpp_code.py +0 -0
  204. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_data_driven_task_manager.py +0 -0
  205. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_deepcoder_reward.py +0 -0
  206. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_deepeval_integration.py +0 -0
  207. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_deploy_integration.py +0 -0
  208. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_e2b_integration.py +0 -0
  209. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_e2b_js_integration.py +0 -0
  210. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_edge_cases.py +0 -0
  211. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_eval_protocol_import.py +0 -0
  212. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_evaluation.py +0 -0
  213. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_evaluation_integration.py +0 -0
  214. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_evaluation_preview_integration.py +0 -0
  215. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_examples_end_to_end.py +0 -0
  216. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_fireworks_api.py +0 -0
  217. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_format.py +0 -0
  218. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_fractional_code.py +0 -0
  219. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_frozen_lake_http_server.py +0 -0
  220. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  221. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_function_calling.py +0 -0
  222. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_gcp_tools.py +0 -0
  223. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_generic_server.py +0 -0
  224. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_integration.py +0 -0
  225. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_json_schema.py +0 -0
  226. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_kwargs_validation.py +0 -0
  227. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_language_consistency.py +0 -0
  228. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_lean_prover.py +0 -0
  229. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_lean_prover_runner.py +0 -0
  230. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_length.py +0 -0
  231. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_list_comparison_math_reward.py +0 -0
  232. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_math.py +0 -0
  233. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_minimal.py +0 -0
  234. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_models.py +0 -0
  235. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_models_rl.py +0 -0
  236. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
  237. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_n_variant_batch_integration.py +0 -0
  238. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_n_variant_integration.py +0 -0
  239. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_openai_compatibility.py +0 -0
  240. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_openeval_integration.py +0 -0
  241. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_packaging.py +0 -0
  242. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_platform_api.py +0 -0
  243. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_readiness.py +0 -0
  244. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_reasoning_steps.py +0 -0
  245. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_repetition.py +0 -0
  246. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_repetition_debug.py +0 -0
  247. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_reward_function.py +0 -0
  248. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_reward_protocol_import.py +0 -0
  249. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_rl_processing.py +0 -0
  250. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_server.py +0 -0
  251. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_tag_count.py +0 -0
  252. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_typed_interface.py +0 -0
  253. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/tests/test_typed_interface_rl.py +0 -0
  254. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/__init__.py +0 -0
  255. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/agent/__init__.py +0 -0
  256. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/agent/base.py +0 -0
  257. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/__init__.py +0 -0
  258. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/api_config.py +0 -0
  259. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/data_model.py +0 -0
  260. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
  261. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/cli.py +0 -0
  262. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/config.py +0 -0
  263. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/__init__.py +0 -0
  264. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/message.py +0 -0
  265. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/simulation.py +0 -0
  266. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/data_model/tasks.py +0 -0
  267. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/__init__.py +0 -0
  268. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
  269. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
  270. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
  271. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
  272. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
  273. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
  274. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
  275. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
  276. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
  277. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
  278. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
  279. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
  280. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
  281. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
  282. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
  283. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
  284. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
  285. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
  286. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  287. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  288. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  289. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  290. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  291. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  292. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  293. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  294. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
  295. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  296. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  297. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
  298. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/__init__.py +0 -0
  299. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/db.py +0 -0
  300. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/environment.py +0 -0
  301. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/server.py +0 -0
  302. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/tool.py +0 -0
  303. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/toolkit.py +0 -0
  304. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  305. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
  306. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
  307. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  308. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  309. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  310. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  311. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  312. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/metrics/__init__.py +0 -0
  313. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
  314. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  315. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
  316. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  317. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  318. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
  319. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/registry.py +0 -0
  320. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/run.py +0 -0
  321. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/__init__.py +0 -0
  322. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/check_data.py +0 -0
  323. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  324. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
  325. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
  326. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/user/__init__.py +0 -0
  327. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/user/base.py +0 -0
  328. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/__init__.py +0 -0
  329. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/display.py +0 -0
  330. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/io_utils.py +0 -0
  331. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
  332. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vendor/tau2/utils/utils.py +0 -0
  333. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/versioneer.py +0 -0
  334. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  335. {eval_protocol-0.2.6 → eval_protocol-0.2.6.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.6
3
+ Version: 0.2.6.dev2
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -38,13 +38,13 @@ Requires-Dist: litellm>=1.0.0
38
38
  Requires-Dist: addict>=2.4.0
39
39
  Requires-Dist: deepdiff>=6.0.0
40
40
  Requires-Dist: pandas>=1.5.0
41
- Requires-Dist: watchdog>=2.1.0
42
41
  Requires-Dist: websockets>=15.0.1
43
42
  Requires-Dist: fastapi>=0.116.1
43
+ Requires-Dist: pytest>=6.0.0
44
+ Requires-Dist: peewee>=3.18.2
44
45
  Provides-Extra: dev
45
46
  Requires-Dist: build; extra == "dev"
46
47
  Requires-Dist: twine; extra == "dev"
47
- Requires-Dist: pytest>=6.0.0; extra == "dev"
48
48
  Requires-Dist: pytest-asyncio; extra == "dev"
49
49
  Requires-Dist: pytest-httpserver; extra == "dev"
50
50
  Requires-Dist: werkzeug>=2.0.0; extra == "dev"
@@ -10,15 +10,16 @@ tool-augmented models using self-contained task bundles.
10
10
 
11
11
  import warnings
12
12
 
13
- from .adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
13
+ from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
14
+
14
15
  from .auth import get_fireworks_account_id, get_fireworks_api_key
15
16
  from .common_utils import load_jsonl
16
17
  from .config import RewardKitConfig, get_config, load_config
17
18
  from .mcp_env import (
18
19
  AnthropicPolicy,
19
- OpenAIPolicy,
20
- LiteLLMPolicy,
21
20
  FireworksPolicy,
21
+ LiteLLMPolicy,
22
+ OpenAIPolicy,
22
23
  make,
23
24
  rollout,
24
25
  test_mcp,
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-06T23:10:26-0700",
11
+ "date": "2025-08-10T19:39:17-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "fffd75c146b297cbce37f768ca9850e2ee05e4b5",
15
- "version": "0.2.6"
14
+ "full-revisionid": "a50c3f62fdb5be7347741446338d8c1771e92547",
15
+ "version": "0.2.6-dev2"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,55 @@
1
+ import json
2
+ import re
3
+ from typing import Any, Dict, List
4
+
5
+ import requests
6
+
7
+
8
+ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
9
+ """
10
+ Reads a JSONL file where each line is a valid JSON object and returns a list of these objects.
11
+
12
+ Args:
13
+ file_path: Path to the JSONL file.
14
+
15
+ Returns:
16
+ A list of dictionaries, where each dictionary is a parsed JSON object from a line.
17
+ Returns an empty list if the file is not found or if errors occur during parsing. Supports HTTP urls and local file paths.
18
+ """
19
+ data: List[Dict[str, Any]] = []
20
+ if file_path.startswith("http://") or file_path.startswith("https://"):
21
+ resp = requests.get(file_path, stream=True, timeout=30)
22
+ resp.raise_for_status()
23
+ for line_number, raw in enumerate(resp.iter_lines(decode_unicode=True), start=1):
24
+ if raw is None:
25
+ continue
26
+ stripped = raw.strip()
27
+ if not stripped:
28
+ continue
29
+ try:
30
+ data.append(json.loads(stripped))
31
+ except json.JSONDecodeError as e:
32
+ print(f"Error parsing JSON line for URL {file_path} at line {line_number}")
33
+ row_id_index = stripped.find("row_id")
34
+ if row_id_index != -1:
35
+ row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:])
36
+ raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e
37
+ raise e
38
+ else:
39
+ with open(file_path, "r", encoding="utf-8") as f:
40
+ for line_number, line in enumerate(f, start=1):
41
+ # Skip entirely blank or whitespace-only lines to be robust to trailing newlines
42
+ stripped = line.strip()
43
+ if not stripped:
44
+ continue
45
+ try:
46
+ data.append(json.loads(stripped))
47
+ except json.JSONDecodeError as e:
48
+ print(f"Error parsing JSON line for file {file_path} at line {line_number}")
49
+ # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),`
50
+ row_id_index = line.find("row_id")
51
+ if row_id_index != -1:
52
+ row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
53
+ raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
54
+ raise e
55
+ return data
@@ -0,0 +1,3 @@
1
+ from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter
2
+
3
+ default_logger = SqliteDatasetLoggerAdapter()
@@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, List, Optional
4
4
  if TYPE_CHECKING:
5
5
  from eval_protocol.models import EvaluationRow
6
6
 
7
+ LOG_EVENT_TYPE = "log"
8
+
7
9
 
8
10
  class DatasetLogger(ABC):
9
11
  """
@@ -0,0 +1,98 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from datetime import datetime, timezone
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, List, Optional
7
+
8
+ from eval_protocol.common_utils import load_jsonl
9
+ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
10
+ from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir
11
+
12
+ if TYPE_CHECKING:
13
+ from eval_protocol.models import EvaluationRow
14
+
15
+
16
+ class LocalFSDatasetLoggerAdapter(DatasetLogger):
17
+ """
18
+ Logger that stores logs in the local filesystem with file locking to prevent race conditions.
19
+ """
20
+
21
+ def __init__(self):
22
+ self.log_dir = os.path.dirname(find_eval_protocol_datasets_dir())
23
+ self.datasets_dir = find_eval_protocol_datasets_dir()
24
+
25
+ # ensure that log file exists
26
+ if not os.path.exists(self.current_jsonl_path):
27
+ with open(self.current_jsonl_path, "w") as f:
28
+ f.write("")
29
+
30
+ @property
31
+ def current_date(self) -> str:
32
+ # Use UTC timezone to be consistent across local device/locations/CI
33
+ return datetime.now(timezone.utc).strftime("%Y-%m-%d")
34
+
35
+ @property
36
+ def current_jsonl_path(self) -> str:
37
+ """
38
+ The current JSONL file path. Based on the current date.
39
+ """
40
+ return os.path.join(self.datasets_dir, f"{self.current_date}.jsonl")
41
+
42
+ def log(self, row: "EvaluationRow") -> None:
43
+ """Log a row, updating existing row with same ID or appending new row."""
44
+ row_id = row.input_metadata.row_id
45
+
46
+ # Check if row with this ID already exists in any JSONL file
47
+ if os.path.exists(self.datasets_dir):
48
+ for filename in os.listdir(self.datasets_dir):
49
+ if filename.endswith(".jsonl"):
50
+ file_path = os.path.join(self.datasets_dir, filename)
51
+ if os.path.exists(file_path):
52
+ with open(file_path, "r") as f:
53
+ lines = f.readlines()
54
+
55
+ # Find the line with matching ID
56
+ for i, line in enumerate(lines):
57
+ try:
58
+ line_data = json.loads(line.strip())
59
+ if line_data["input_metadata"]["row_id"] == row_id:
60
+ # Update existing row
61
+ lines[i] = row.model_dump_json(exclude_none=True) + os.linesep
62
+ with open(file_path, "w") as f:
63
+ f.writelines(lines)
64
+ return
65
+ except json.JSONDecodeError:
66
+ continue
67
+
68
+ # If no existing row found, append new row to current file
69
+ with open(self.current_jsonl_path, "a") as f:
70
+ f.write(row.model_dump_json(exclude_none=True) + os.linesep)
71
+
72
+ def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
73
+ """Read rows from all JSONL files in the datasets directory. Also
74
+ ensures that there are no duplicate row IDs."""
75
+ from eval_protocol.models import EvaluationRow
76
+
77
+ if not os.path.exists(self.datasets_dir):
78
+ return []
79
+
80
+ all_rows = []
81
+ existing_row_ids = set()
82
+ for filename in os.listdir(self.datasets_dir):
83
+ if filename.endswith(".jsonl"):
84
+ file_path = os.path.join(self.datasets_dir, filename)
85
+ data = load_jsonl(file_path)
86
+ for r in data:
87
+ row = EvaluationRow(**r)
88
+ if row.input_metadata.row_id not in existing_row_ids:
89
+ existing_row_ids.add(row.input_metadata.row_id)
90
+ else:
91
+ raise ValueError(f"Duplicate Row ID {row.input_metadata.row_id} already exists")
92
+ all_rows.append(row)
93
+
94
+ if row_id:
95
+ # Filter by row_id if specified
96
+ return [row for row in all_rows if getattr(row.input_metadata, "row_id", None) == row_id]
97
+ else:
98
+ return all_rows
@@ -0,0 +1,39 @@
1
+ import os
2
+ from typing import List, Optional
3
+
4
+ from eval_protocol.dataset_logger.dataset_logger import LOG_EVENT_TYPE, DatasetLogger
5
+ from eval_protocol.dataset_logger.sqlite_evaluation_row_store import SqliteEvaluationRowStore
6
+ from eval_protocol.directory_utils import find_eval_protocol_dir
7
+ from eval_protocol.event_bus import event_bus
8
+ from eval_protocol.event_bus.logger import logger
9
+ from eval_protocol.models import EvaluationRow
10
+
11
+
12
+ class SqliteDatasetLoggerAdapter(DatasetLogger):
13
+ def __init__(self, db_path: Optional[str] = None, store: Optional[SqliteEvaluationRowStore] = None):
14
+ eval_protocol_dir = find_eval_protocol_dir()
15
+ if db_path is not None and store is not None:
16
+ raise ValueError("Provide only one of db_path or store, not both.")
17
+ if store is not None:
18
+ self.db_path = store.db_path
19
+ self._store = store
20
+ else:
21
+ self.db_path = db_path if db_path is not None else os.path.join(eval_protocol_dir, "logs.db")
22
+ self._store = SqliteEvaluationRowStore(self.db_path)
23
+
24
+ def log(self, row: "EvaluationRow") -> None:
25
+ row_id = row.input_metadata.row_id
26
+ data = row.model_dump(exclude_none=True, mode="json")
27
+ self._store.upsert_row(row_id=row_id, data=data)
28
+ try:
29
+ event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
30
+ except Exception as e:
31
+ # Avoid breaking storage due to event emission issues
32
+ logger.error(f"Failed to emit row_upserted event: {e}")
33
+ pass
34
+
35
+ def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
36
+ from eval_protocol.models import EvaluationRow
37
+
38
+ results = self._store.read_rows(row_id=row_id)
39
+ return [EvaluationRow(**data) for data in results]
@@ -0,0 +1,57 @@
1
+ import os
2
+ from typing import List, Optional
3
+
4
+ from peewee import CharField, Model, SqliteDatabase
5
+ from playhouse.sqlite_ext import JSONField
6
+
7
+ from eval_protocol.models import EvaluationRow
8
+
9
+
10
+ class SqliteEvaluationRowStore:
11
+ """
12
+ Lightweight reusable SQLite store for evaluation rows.
13
+
14
+ Stores arbitrary row data as JSON keyed by a unique string `row_id`.
15
+ """
16
+
17
+ def __init__(self, db_path: str):
18
+ os.makedirs(os.path.dirname(db_path), exist_ok=True)
19
+ self._db_path = db_path
20
+ self._db = SqliteDatabase(self._db_path)
21
+
22
+ class BaseModel(Model):
23
+ class Meta:
24
+ database = self._db
25
+
26
+ class EvaluationRow(BaseModel): # type: ignore
27
+ row_id = CharField(unique=True)
28
+ data = JSONField()
29
+
30
+ self._EvaluationRow = EvaluationRow
31
+
32
+ self._db.connect()
33
+ self._db.create_tables([EvaluationRow])
34
+
35
+ @property
36
+ def db_path(self) -> str:
37
+ return self._db_path
38
+
39
+ def upsert_row(self, row_id: str, data: dict) -> None:
40
+ if self._EvaluationRow.select().where(self._EvaluationRow.row_id == row_id).exists():
41
+ self._EvaluationRow.update(data=data).where(self._EvaluationRow.row_id == row_id).execute()
42
+ else:
43
+ self._EvaluationRow.create(row_id=row_id, data=data)
44
+
45
+ def read_rows(self, row_id: Optional[str] = None) -> List[dict]:
46
+ if row_id is None:
47
+ query = self._EvaluationRow.select().dicts()
48
+ else:
49
+ query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.row_id == row_id)
50
+ results = list(query)
51
+ return [result["data"] for result in results]
52
+
53
+ def delete_row(self, row_id: str) -> int:
54
+ return self._EvaluationRow.delete().where(self._EvaluationRow.row_id == row_id).execute()
55
+
56
+ def delete_all_rows(self) -> int:
57
+ return self._EvaluationRow.delete().execute()
@@ -0,0 +1,55 @@
1
+ import os
2
+ from typing import Optional
3
+
4
+ # Shared constants for directory discovery
5
+ EVAL_PROTOCOL_DIR = ".eval_protocol"
6
+ PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
7
+ DATASETS_DIR = "datasets"
8
+
9
+
10
+ def find_eval_protocol_dir() -> str:
11
+ """
12
+ Find the .eval_protocol directory by looking up the directory tree.
13
+
14
+ Returns:
15
+ Path to the .eval_protocol directory
16
+ """
17
+ # recursively look up for a .eval_protocol directory
18
+ current_dir = os.path.dirname(os.path.abspath(__file__))
19
+ while current_dir != "/":
20
+ if os.path.exists(os.path.join(current_dir, EVAL_PROTOCOL_DIR)):
21
+ log_dir = os.path.join(current_dir, EVAL_PROTOCOL_DIR)
22
+ break
23
+ current_dir = os.path.dirname(current_dir)
24
+ else:
25
+ # if not found, recursively look up until a pyproject.toml or requirements.txt is found
26
+ current_dir = os.path.dirname(os.path.abspath(__file__))
27
+ while current_dir != "/":
28
+ if any(os.path.exists(os.path.join(current_dir, f)) for f in PYTHON_FILES):
29
+ log_dir = os.path.join(current_dir, EVAL_PROTOCOL_DIR)
30
+ break
31
+ current_dir = os.path.dirname(current_dir)
32
+ else:
33
+ # get the PWD that this python process is running in
34
+ log_dir = os.path.join(os.getcwd(), EVAL_PROTOCOL_DIR)
35
+
36
+ # create the .eval_protocol directory if it doesn't exist
37
+ os.makedirs(log_dir, exist_ok=True)
38
+
39
+ return log_dir
40
+
41
+
42
+ def find_eval_protocol_datasets_dir() -> str:
43
+ """
44
+ Find the .eval_protocol/datasets directory by looking up the directory tree.
45
+
46
+ Returns:
47
+ Path to the .eval_protocol/datasets directory
48
+ """
49
+ log_dir = find_eval_protocol_dir()
50
+
51
+ # create the datasets subdirectory
52
+ datasets_dir = os.path.join(log_dir, DATASETS_DIR)
53
+ os.makedirs(datasets_dir, exist_ok=True)
54
+
55
+ return datasets_dir
@@ -0,0 +1,5 @@
1
+ # Global event bus instance - uses SqliteEventBus for cross-process functionality
2
+ from eval_protocol.event_bus.event_bus import EventBus
3
+ from eval_protocol.event_bus.sqlite_event_bus import SqliteEventBus
4
+
5
+ event_bus: EventBus = SqliteEventBus()
@@ -0,0 +1,50 @@
1
+ from typing import Any, Callable, List
2
+
3
+ from eval_protocol.event_bus.logger import logger
4
+
5
+
6
+ class EventBus:
7
+ """Core event bus interface for decoupling components in the evaluation system."""
8
+
9
+ def __init__(self):
10
+ self._listeners: List[Callable[[str, Any], None]] = []
11
+
12
+ def subscribe(self, callback: Callable[[str, Any], None]) -> None:
13
+ """Subscribe to events.
14
+
15
+ Args:
16
+ callback: Function that takes (event_type, data) parameters
17
+ """
18
+ self._listeners.append(callback)
19
+
20
+ def unsubscribe(self, callback: Callable[[str, Any], None]) -> None:
21
+ """Unsubscribe from events.
22
+
23
+ Args:
24
+ callback: The callback function to remove
25
+ """
26
+ try:
27
+ self._listeners.remove(callback)
28
+ except ValueError:
29
+ pass # Callback wasn't subscribed
30
+
31
+ def emit(self, event_type: str, data: Any) -> None:
32
+ """Emit an event to all subscribers.
33
+
34
+ Args:
35
+ event_type: Type of event (e.g., "row_upserted")
36
+ data: Event data
37
+ """
38
+ for listener in self._listeners:
39
+ try:
40
+ listener(event_type, data)
41
+ except Exception as e:
42
+ logger.debug(f"Event listener failed for {event_type}: {e}")
43
+
44
+ def start_listening(self) -> None:
45
+ """Start listening for cross-process events. Override in subclasses."""
46
+ pass
47
+
48
+ def stop_listening(self) -> None:
49
+ """Stop listening for cross-process events. Override in subclasses."""
50
+ pass
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger(__name__)
@@ -0,0 +1,109 @@
1
+ import threading
2
+ import time
3
+ from typing import Any, Optional
4
+ from uuid import uuid4
5
+
6
+ from eval_protocol.event_bus.event_bus import EventBus
7
+ from eval_protocol.event_bus.logger import logger
8
+ from eval_protocol.event_bus.sqlite_event_bus_database import SqliteEventBusDatabase
9
+
10
+
11
+ class SqliteEventBus(EventBus):
12
+ """SQLite-based event bus implementation that supports cross-process communication."""
13
+
14
+ def __init__(self, db_path: Optional[str] = None):
15
+ super().__init__()
16
+
17
+ # Use the same database as the evaluation row store
18
+ if db_path is None:
19
+ import os
20
+
21
+ from eval_protocol.directory_utils import find_eval_protocol_dir
22
+
23
+ eval_protocol_dir = find_eval_protocol_dir()
24
+ db_path = os.path.join(eval_protocol_dir, "logs.db")
25
+
26
+ self._db = SqliteEventBusDatabase(db_path)
27
+ self._running = False
28
+ self._listener_thread: Optional[threading.Thread] = None
29
+ self._process_id = str(uuid4())
30
+
31
+ def emit(self, event_type: str, data: Any) -> None:
32
+ """Emit an event to all subscribers.
33
+
34
+ Args:
35
+ event_type: Type of event (e.g., "log")
36
+ data: Event data
37
+ """
38
+ # Call local listeners immediately
39
+ super().emit(event_type, data)
40
+
41
+ # Publish to cross-process subscribers
42
+ self._publish_cross_process(event_type, data)
43
+
44
+ def _publish_cross_process(self, event_type: str, data: Any) -> None:
45
+ """Publish event to cross-process subscribers via database."""
46
+ self._db.publish_event(event_type, data, self._process_id)
47
+
48
+ def start_listening(self) -> None:
49
+ """Start listening for cross-process events."""
50
+ if self._running:
51
+ return
52
+
53
+ self._running = True
54
+ self._start_database_listener()
55
+
56
+ def stop_listening(self) -> None:
57
+ """Stop listening for cross-process events."""
58
+ self._running = False
59
+ if self._listener_thread and self._listener_thread.is_alive():
60
+ self._listener_thread.join(timeout=1)
61
+
62
+ def _start_database_listener(self) -> None:
63
+ """Start database-based event listener."""
64
+
65
+ def database_listener():
66
+ last_cleanup = time.time()
67
+
68
+ while self._running:
69
+ try:
70
+ # Get unprocessed events from other processes
71
+ events = self._db.get_unprocessed_events(self._process_id)
72
+
73
+ for event in events:
74
+ if not self._running:
75
+ break
76
+
77
+ try:
78
+ # Handle the event
79
+ self._handle_cross_process_event(event["event_type"], event["data"])
80
+
81
+ # Mark as processed
82
+ self._db.mark_event_processed(event["event_id"])
83
+
84
+ except Exception as e:
85
+ logger.debug(f"Failed to process event {event['event_id']}: {e}")
86
+
87
+ # Clean up old events every hour
88
+ current_time = time.time()
89
+ if current_time - last_cleanup >= 3600:
90
+ self._db.cleanup_old_events()
91
+ last_cleanup = current_time
92
+
93
+ # Small sleep to prevent busy waiting
94
+ time.sleep(0.1)
95
+
96
+ except Exception as e:
97
+ logger.debug(f"Database listener error: {e}")
98
+ time.sleep(1)
99
+
100
+ self._listener_thread = threading.Thread(target=database_listener, daemon=True)
101
+ self._listener_thread.start()
102
+
103
+ def _handle_cross_process_event(self, event_type: str, data: Any) -> None:
104
+ """Handle events received from other processes."""
105
+ for listener in self._listeners:
106
+ try:
107
+ listener(event_type, data)
108
+ except Exception as e:
109
+ logger.debug(f"Cross-process event listener failed for {event_type}: {e}")
@@ -0,0 +1,95 @@
1
+ import time
2
+ from typing import Any, List
3
+ from uuid import uuid4
4
+
5
+ from peewee import CharField, DateTimeField, Model, SqliteDatabase
6
+ from playhouse.sqlite_ext import JSONField
7
+
8
+ from eval_protocol.event_bus.logger import logger
9
+
10
+
11
+ class SqliteEventBusDatabase:
12
+ """SQLite database for cross-process event communication."""
13
+
14
+ def __init__(self, db_path: str):
15
+ self._db_path = db_path
16
+ self._db = SqliteDatabase(db_path)
17
+
18
+ class BaseModel(Model):
19
+ class Meta:
20
+ database = self._db
21
+
22
+ class Event(BaseModel): # type: ignore
23
+ event_id = CharField(unique=True)
24
+ event_type = CharField()
25
+ data = JSONField()
26
+ timestamp = DateTimeField()
27
+ process_id = CharField()
28
+ processed = CharField(default="false") # Track if event has been processed
29
+
30
+ self._Event = Event
31
+ self._db.connect()
32
+ self._db.create_tables([Event])
33
+
34
+ def publish_event(self, event_type: str, data: Any, process_id: str) -> None:
35
+ """Publish an event to the database."""
36
+ try:
37
+ # Serialize data, handling pydantic models
38
+ if hasattr(data, "model_dump"):
39
+ serialized_data = data.model_dump(mode="json", exclude_none=True)
40
+ else:
41
+ serialized_data = data
42
+
43
+ self._Event.create(
44
+ event_id=str(uuid4()),
45
+ event_type=event_type,
46
+ data=serialized_data,
47
+ timestamp=time.time(),
48
+ process_id=process_id,
49
+ processed="false",
50
+ )
51
+ except Exception as e:
52
+ logger.warning(f"Failed to publish event to database: {e}")
53
+
54
+ def get_unprocessed_events(self, process_id: str) -> List[dict]:
55
+ """Get unprocessed events from other processes."""
56
+ try:
57
+ query = (
58
+ self._Event.select()
59
+ .where((self._Event.process_id != process_id) & (self._Event.processed == "false"))
60
+ .order_by(self._Event.timestamp)
61
+ )
62
+
63
+ events = []
64
+ for event in query:
65
+ events.append(
66
+ {
67
+ "event_id": event.event_id,
68
+ "event_type": event.event_type,
69
+ "data": event.data,
70
+ "timestamp": event.timestamp,
71
+ "process_id": event.process_id,
72
+ }
73
+ )
74
+
75
+ return events
76
+ except Exception as e:
77
+ logger.warning(f"Failed to get unprocessed events: {e}")
78
+ return []
79
+
80
+ def mark_event_processed(self, event_id: str) -> None:
81
+ """Mark an event as processed."""
82
+ try:
83
+ self._Event.update(processed="true").where(self._Event.event_id == event_id).execute()
84
+ except Exception as e:
85
+ logger.debug(f"Failed to mark event as processed: {e}")
86
+
87
+ def cleanup_old_events(self, max_age_hours: int = 24) -> None:
88
+ """Clean up old processed events."""
89
+ try:
90
+ cutoff_time = time.time() - (max_age_hours * 3600)
91
+ self._Event.delete().where(
92
+ (self._Event.processed == "true") & (self._Event.timestamp < cutoff_time)
93
+ ).execute()
94
+ except Exception as e:
95
+ logger.debug(f"Failed to cleanup old events: {e}")
@@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional
11
11
 
12
12
  import aiohttp
13
13
  from omegaconf import DictConfig
14
- from pydantic import BaseModel, Field # Added for new models
14
+ from pydantic import BaseModel # Added for new models
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
@@ -83,6 +83,9 @@ class FireworksModelClient(ModelClient):
83
83
  }
84
84
  if self.top_p is not None:
85
85
  payload["top_p"] = self.top_p
86
+ # Include reasoning settings if configured (for reasoning-capable models)
87
+ if self.reasoning_effort:
88
+ payload["reasoning_effort"] = self.reasoning_effort
86
89
 
87
90
  if tools:
88
91
  payload["tools"] = tools