eval-protocol 0.2.34__tar.gz → 0.2.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (401)
  1. {eval_protocol-0.2.34/eval_protocol.egg-info → eval_protocol-0.2.35}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/_version.py +3 -3
  3. eval_protocol-0.2.35/eval_protocol/logging/elasticsearch_direct_http_handler.py +91 -0
  4. eval_protocol-0.2.35/eval_protocol/logging/elasticsearch_index_manager.py +187 -0
  5. eval_protocol-0.2.35/eval_protocol/pytest/elasticsearch_setup.py +167 -0
  6. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/evaluation_test.py +2 -0
  7. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/remote_rollout_processor.py +40 -2
  8. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/rollout_processor.py +4 -0
  9. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/remote_rollout_processor.py +11 -0
  10. eval_protocol-0.2.35/eval_protocol/utils/subprocess_utils.py +118 -0
  11. {eval_protocol-0.2.34 → eval_protocol-0.2.35/eval_protocol.egg-info}/PKG-INFO +1 -1
  12. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/SOURCES.txt +4 -0
  13. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/LICENSE +0 -0
  14. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/README.md +0 -0
  15. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/__init__.py +0 -0
  16. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/normalize_sandbox_fusion.py +0 -0
  17. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/utils/__init__.py +0 -0
  18. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/utils/generate_api_key.py +0 -0
  19. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/development/utils/subprocess_manager.py +0 -0
  20. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/__init__.py +0 -0
  21. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/__main__.py +0 -0
  22. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/__init__.py +0 -0
  23. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/base.py +0 -0
  24. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/bigquery.py +0 -0
  25. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/braintrust.py +0 -0
  26. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/huggingface.py +0 -0
  27. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/langchain.py +0 -0
  28. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/langfuse.py +0 -0
  29. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/langsmith.py +0 -0
  30. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/openai_responses.py +0 -0
  31. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/trl.py +0 -0
  32. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/adapters/utils.py +0 -0
  33. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/__init__.py +0 -0
  34. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/models.py +0 -0
  35. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/orchestrator.py +0 -0
  36. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resource_abc.py +0 -0
  37. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resource_pool.py +0 -0
  38. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/__init__.py +0 -0
  39. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  40. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  41. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  42. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  43. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  44. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/docker_resource.py +0 -0
  45. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  46. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  47. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/resources/sql_resource.py +0 -0
  48. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/task_manager.py +0 -0
  49. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/agent/tool_registry.py +0 -0
  50. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/auth.py +0 -0
  51. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/__init__.py +0 -0
  52. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  53. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  54. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_aime25.py +0 -0
  55. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  56. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  57. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  58. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  59. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli.py +0 -0
  60. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/__init__.py +0 -0
  61. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  62. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/common.py +0 -0
  63. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/deploy.py +0 -0
  64. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  65. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/logs.py +0 -0
  66. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/preview.py +0 -0
  67. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  68. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/common_utils.py +0 -0
  69. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/config.py +0 -0
  70. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/__init__.py +0 -0
  71. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  72. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  73. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  74. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/data_loader/models.py +0 -0
  75. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/__init__.py +0 -0
  76. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  77. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  78. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  79. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  80. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/datasets/__init__.py +0 -0
  81. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/datasets/loader.py +0 -0
  82. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/directory_utils.py +0 -0
  83. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/evaluation.py +0 -0
  84. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/__init__.py +0 -0
  85. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/event_bus.py +0 -0
  86. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/logger.py +0 -0
  87. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  88. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  89. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/execution/__init__.py +0 -0
  90. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/execution/pipeline.py +0 -0
  91. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/gcp_tools.py +0 -0
  92. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generation/cache.py +0 -0
  93. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generation/clients/base.py +0 -0
  94. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generation/clients.py +0 -0
  95. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/generic_server.py +0 -0
  96. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/get_pep440_version.py +0 -0
  97. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/human_id/__init__.py +0 -0
  98. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/human_id/dictionary.py +0 -0
  99. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/__init__.py +0 -0
  100. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/deepeval.py +0 -0
  101. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/openeval.py +0 -0
  102. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/integrations/trl.py +0 -0
  103. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/logging_utils.py +0 -0
  104. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/__init__.py +0 -0
  105. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/adapter.py +0 -0
  106. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/client/__init__.py +0 -0
  107. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/client/connection.py +0 -0
  108. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/clients.py +0 -0
  109. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/__init__.py +0 -0
  110. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/base_policy.py +0 -0
  111. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/manager.py +0 -0
  112. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/execution/policy.py +0 -0
  113. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/grid_renderer.py +0 -0
  114. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  115. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/mcpgym.py +0 -0
  116. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/process_manager.py +0 -0
  117. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/session/__init__.py +0 -0
  118. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/session/manager.py +0 -0
  119. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/simple_process_manager.py +0 -0
  120. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp/simulation_server.py +0 -0
  121. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/__init__.py +0 -0
  122. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/config.py +0 -0
  123. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/main.py +0 -0
  124. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  125. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  126. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  127. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  128. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_env.py +0 -0
  129. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/__init__.py +0 -0
  130. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  131. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  132. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  133. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  134. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  135. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  136. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  137. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  138. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  139. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  140. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  141. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/models.py +0 -0
  142. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/packaging.py +0 -0
  143. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/platform_api.py +0 -0
  144. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/playback_policy.py +0 -0
  145. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/__init__.py +0 -0
  146. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  147. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  148. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  149. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  150. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  151. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  152. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  153. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  154. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  155. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/exception_config.py +0 -0
  156. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/execution.py +0 -0
  157. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  158. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  159. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/parameterize.py +0 -0
  160. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/plugin.py +0 -0
  161. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/store_experiment_link.py +0 -0
  162. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/store_results_url.py +0 -0
  163. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/types.py +0 -0
  164. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/utils.py +0 -0
  165. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/pytest/validate_signature.py +0 -0
  166. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/__init__.py +0 -0
  167. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge.py +0 -0
  168. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  169. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
  170. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
  171. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
  172. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/quickstart/utils.py +0 -0
  173. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/resources.py +0 -0
  174. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/reward_function.py +0 -0
  175. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/__init__.py +0 -0
  176. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/accuracy.py +0 -0
  177. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/accuracy_length.py +0 -0
  178. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  179. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  180. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/apps_testing_util.py +0 -0
  181. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/bfcl_reward.py +0 -0
  182. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/code_execution.py +0 -0
  183. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/code_execution_utils.py +0 -0
  184. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/cpp_code.py +0 -0
  185. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  186. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/format.py +0 -0
  187. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/function_calling.py +0 -0
  188. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/json_schema.py +0 -0
  189. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/language_consistency.py +0 -0
  190. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/lean_prover.py +0 -0
  191. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/length.py +0 -0
  192. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  193. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/math.py +0 -0
  194. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  195. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/reasoning_steps.py +0 -0
  196. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/repetition.py +0 -0
  197. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rewards/tag_count.py +0 -0
  198. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/rl_processing.py +0 -0
  199. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/server.py +0 -0
  200. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/stats/__init__.py +0 -0
  201. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/stats/confidence_intervals.py +0 -0
  202. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/typed_interface.py +0 -0
  203. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/__init__.py +0 -0
  204. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/errors.py +0 -0
  205. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/types/types.py +0 -0
  206. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/__init__.py +0 -0
  207. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/batch_evaluation.py +0 -0
  208. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/batch_transformation.py +0 -0
  209. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/check_server_status.py +0 -0
  210. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/dataset_helpers.py +0 -0
  211. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/logs_server.py +0 -0
  212. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/module_loader.py +0 -0
  213. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/packaging_utils.py +0 -0
  214. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/show_results_url.py +0 -0
  215. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/static_policy.py +0 -0
  216. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol/utils/vite_server.py +0 -0
  217. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/dependency_links.txt +0 -0
  218. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/entry_points.txt +0 -0
  219. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/requires.txt +0 -0
  220. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/eval_protocol.egg-info/top_level.txt +0 -0
  221. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/pyproject.toml +0 -0
  222. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/setup.cfg +0 -0
  223. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/setup.py +0 -0
  224. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_accuracy.py +0 -0
  225. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_accuracy_length.py +0 -0
  226. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_adapters_e2e.py +0 -0
  227. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_agent_orchestrator.py +0 -0
  228. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_agent_resources.py +0 -0
  229. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_auth.py +0 -0
  230. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_batch_evaluation.py +0 -0
  231. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cli.py +0 -0
  232. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cli_agent.py +0 -0
  233. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cli_args.py +0 -0
  234. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_code_execution.py +0 -0
  235. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_config.py +0 -0
  236. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_control_plane_separation.py +0 -0
  237. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_cpp_code.py +0 -0
  238. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_data_driven_task_manager.py +0 -0
  239. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_deepcoder_reward.py +0 -0
  240. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_deepeval_integration.py +0 -0
  241. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_deploy_integration.py +0 -0
  242. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_e2b_integration.py +0 -0
  243. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_e2b_js_integration.py +0 -0
  244. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_edge_cases.py +0 -0
  245. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_eval_protocol_import.py +0 -0
  246. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation.py +0 -0
  247. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation_integration.py +0 -0
  248. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation_postprocess.py +0 -0
  249. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_evaluation_preview_integration.py +0 -0
  250. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_event_bus.py +0 -0
  251. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_examples_end_to_end.py +0 -0
  252. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_fireworks_api.py +0 -0
  253. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_format.py +0 -0
  254. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_fractional_code.py +0 -0
  255. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_function_calling.py +0 -0
  256. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_gcp_tools.py +0 -0
  257. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_generic_server.py +0 -0
  258. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_human_id.py +0 -0
  259. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_integration.py +0 -0
  260. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_json_schema.py +0 -0
  261. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_kwargs_validation.py +0 -0
  262. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_language_consistency.py +0 -0
  263. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_lean_prover.py +0 -0
  264. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_lean_prover_runner.py +0 -0
  265. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_length.py +0 -0
  266. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_list_comparison_math_reward.py +0 -0
  267. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_logs_server.py +0 -0
  268. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_logs_server_simple.py +0 -0
  269. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_math.py +0 -0
  270. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_minimal.py +0 -0
  271. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_models.py +0 -0
  272. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_models_rl.py +0 -0
  273. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_multiple_choice_math_reward.py +0 -0
  274. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_n_variant_batch_integration.py +0 -0
  275. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_n_variant_integration.py +0 -0
  276. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_openai_compatibility.py +0 -0
  277. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_openeval_integration.py +0 -0
  278. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_packaging.py +0 -0
  279. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_parallel_rollouts.py +0 -0
  280. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_platform_api.py +0 -0
  281. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_quickstart_utils.py +0 -0
  282. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_readiness.py +0 -0
  283. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_reasoning_steps.py +0 -0
  284. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_repetition.py +0 -0
  285. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_repetition_debug.py +0 -0
  286. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_retry_mechanism.py +0 -0
  287. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_reward_function.py +0 -0
  288. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_reward_protocol_import.py +0 -0
  289. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_rl_processing.py +0 -0
  290. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_rollout_control_plane_integration.py +0 -0
  291. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_server.py +0 -0
  292. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_show_results_url.py +0 -0
  293. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_status_migration_changes.py +0 -0
  294. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_status_migration_integration.py +0 -0
  295. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_status_model.py +0 -0
  296. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_tag_count.py +0 -0
  297. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_tau_bench_airline_smoke.py +0 -0
  298. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_typed_interface.py +0 -0
  299. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_typed_interface_rl.py +0 -0
  300. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_url_handling.py +0 -0
  301. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/tests/test_vite_server.py +0 -0
  302. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/__init__.py +0 -0
  303. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/agent/__init__.py +0 -0
  304. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/agent/base.py +0 -0
  305. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/agent/llm_agent.py +0 -0
  306. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/__init__.py +0 -0
  307. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/api_config.py +0 -0
  308. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/data_model.py +0 -0
  309. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/api_service/simulation_service.py +0 -0
  310. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/cli.py +0 -0
  311. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/config.py +0 -0
  312. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/airline/policy.md +0 -0
  313. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/mock/policy.md +0 -0
  314. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  315. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/retail/policy.md +0 -0
  316. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  317. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  318. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  319. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  320. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  321. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  322. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  323. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/__init__.py +0 -0
  324. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/message.py +0 -0
  325. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/simulation.py +0 -0
  326. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/data_model/tasks.py +0 -0
  327. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/__init__.py +0 -0
  328. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/__init__.py +0 -0
  329. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/data_model.py +0 -0
  330. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/environment.py +0 -0
  331. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/tools.py +0 -0
  332. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/airline/utils.py +0 -0
  333. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/__init__.py +0 -0
  334. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/data_model.py +0 -0
  335. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/environment.py +0 -0
  336. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/tools.py +0 -0
  337. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/mock/utils.py +0 -0
  338. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/__init__.py +0 -0
  339. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/data_model.py +0 -0
  340. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/environment.py +0 -0
  341. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/tools.py +0 -0
  342. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/retail/utils.py +0 -0
  343. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/__init__.py +0 -0
  344. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/data_model.py +0 -0
  345. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/environment.py +0 -0
  346. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  347. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  348. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  349. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  350. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  351. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  352. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  353. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  354. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/tools.py +0 -0
  355. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  356. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  357. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/domains/telecom/utils.py +0 -0
  358. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/__init__.py +0 -0
  359. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/db.py +0 -0
  360. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/environment.py +0 -0
  361. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/server.py +0 -0
  362. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/tool.py +0 -0
  363. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/toolkit.py +0 -0
  364. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  365. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/__init__.py +0 -0
  366. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator.py +0 -0
  367. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  368. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  369. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  370. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  371. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  372. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/metrics/__init__.py +0 -0
  373. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/metrics/agent_metrics.py +0 -0
  374. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  375. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/__init__.py +0 -0
  376. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  377. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  378. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/orchestrator/utils.py +0 -0
  379. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/registry.py +0 -0
  380. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/run.py +0 -0
  381. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/__init__.py +0 -0
  382. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/check_data.py +0 -0
  383. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  384. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/start_servers.py +0 -0
  385. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/scripts/view_simulations.py +0 -0
  386. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/user/__init__.py +0 -0
  387. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/user/base.py +0 -0
  388. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/user/user_simulator.py +0 -0
  389. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/__init__.py +0 -0
  390. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/display.py +0 -0
  391. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/io_utils.py +0 -0
  392. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/llm_utils.py +0 -0
  393. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/pydantic_utils.py +0 -0
  394. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vendor/tau2/utils/utils.py +0 -0
  395. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/versioneer.py +0 -0
  396. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  397. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/index-C8woq7EO.js +0 -0
  398. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/index-C8woq7EO.js.map +0 -0
  399. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/index-CSKGq1w7.css +0 -0
  400. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  401. {eval_protocol-0.2.34 → eval_protocol-0.2.35}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.34
3
+ Version: 0.2.35
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-09-30T15:39:15-0700",
11
+ "date": "2025-10-01T13:28:59-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "c09755b30386c03c95bd79d7b142ed614419c7c4",
15
- "version": "0.2.34"
14
+ "full-revisionid": "43ea8eaa8329931e6a9e61aa23a7aeca359f1d1c",
15
+ "version": "0.2.35"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,91 @@
1
+ import json
2
+ import logging
3
+ import asyncio
4
+ import threading
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from typing import Optional, Tuple, Any, Dict
7
+ from datetime import datetime
8
+ from urllib.parse import urlparse
9
+ import requests
10
+
11
+ from eval_protocol.types.remote_rollout_processor import ElasticSearchConfig
12
+
13
+
14
class ElasticsearchDirectHttpHandler(logging.Handler):
    """Logging handler that posts each record to Elasticsearch over HTTP.

    Every record is turned into a small JSON document and POSTed to
    ``<url>/<index_name>/_doc``. The POST runs on a single-worker thread
    pool so that ``emit`` returns without waiting for the HTTP round trip.
    """

    # Upper bound (seconds) for one POST. Without it a stalled server would
    # block the only worker thread, and close() waits for that worker.
    REQUEST_TIMEOUT_SECONDS: float = 10.0

    def __init__(self, elasticsearch_config: "ElasticSearchConfig") -> None:
        """Store connection settings taken from ``elasticsearch_config``.

        Args:
            elasticsearch_config: Object providing ``url``, ``index_name``
                and ``api_key``.
        """
        super().__init__()
        self.base_url: str = elasticsearch_config.url.rstrip("/")
        self.index_name: str = elasticsearch_config.index_name
        self.api_key: str = elasticsearch_config.api_key
        self.url: str = f"{self.base_url}/{self.index_name}/_doc"
        self.formatter: logging.Formatter = logging.Formatter()
        # Created lazily on the first emitted record.
        self._executor: Optional[ThreadPoolExecutor] = None

        # Certificate verification is only requested for https URLs.
        parsed_url = urlparse(elasticsearch_config.url)
        self.verify_ssl = parsed_url.scheme == "https"

    def emit(self, record: logging.LogRecord) -> None:
        """Emit a log record by scheduling it for async transmission."""
        try:
            data = self._build_document(record)
            # Schedule the HTTP request to run asynchronously
            self._schedule_async_send(data, record)
        except Exception as e:
            self.handleError(record)
            # print() rather than logging, to avoid re-entering this handler.
            print(f"Error preparing log for Elasticsearch: {e}")

    def _build_document(self, record: logging.LogRecord) -> Dict[str, Any]:
        """Build the JSON-serialisable document for ``record``.

        The timestamp is rendered in UTC: the format string ends in a
        literal ``Z``, so formatting local time would mislabel it.
        """
        from datetime import timezone

        timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime(
            "%Y-%m-%dT%H:%M:%S.%fZ"
        )
        return {
            "@timestamp": timestamp,
            "level": record.levelname,
            "message": record.getMessage(),
            "logger_name": record.name,
            # Add other relevant record attributes if needed
        }

    def _schedule_async_send(self, data: Dict[str, Any], record: logging.LogRecord) -> None:
        """Submit the send to the worker pool and attach the error callback."""
        if self._executor is None:
            self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="elasticsearch-logger")

        future = self._executor.submit(self._send_to_elasticsearch, data, record)
        future.add_done_callback(lambda f: self._handle_async_result(f, record))

    def _send_to_elasticsearch(self, data: Dict[str, Any], record: logging.LogRecord) -> None:
        """POST ``data`` to Elasticsearch (runs in the thread pool).

        Any exception, including the one raised for an HTTP error status,
        propagates to the future and is reported by ``_handle_async_result``.
        """
        response: requests.Response = requests.post(
            self.url,
            headers={"Content-Type": "application/json", "Authorization": f"ApiKey {self.api_key}"},
            data=json.dumps(data),
            verify=self.verify_ssl,  # If using HTTPS, verify SSL certificate
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()  # Raise an exception for HTTP errors

    def _handle_async_result(self, future, record: logging.LogRecord) -> None:
        """Report a failed send; does nothing when the send succeeded."""
        try:
            future.result()  # This will raise any exception that occurred
        except Exception as e:
            self.handleError(record)
            # Reported with print() instead of logging to prevent a logging loop.
            print(f"Error sending log to Elasticsearch: {e}")
            response = getattr(e, "response", None)
            if response is not None:
                print(f"Response content: {response.text}")

    def close(self) -> None:
        """Clean up resources when the handler is closed."""
        super().close()
        if self._executor:
            # Waits for queued sends; each one is bounded by the request timeout.
            self._executor.shutdown(wait=True)
@@ -0,0 +1,187 @@
1
+ import requests
2
+ from typing import Dict, Any, Optional
3
+ from urllib.parse import urlparse
4
+
5
+
6
class ElasticsearchIndexManager:
    """Manages Elasticsearch index creation and mapping configuration."""

    def __init__(self, base_url: str, index_name: str, api_key: str, timeout: float = 10.0) -> None:
        """Initialize the Elasticsearch index manager.

        Args:
            base_url: Elasticsearch base URL (e.g., "https://localhost:9200")
            index_name: Name of the index to manage
            api_key: API key for authentication
            timeout: Seconds to wait on each HTTP request before giving up
        """
        self.base_url: str = base_url.rstrip("/")
        self.index_name: str = index_name
        self.api_key: str = api_key
        self.index_url: str = f"{self.base_url}/{self.index_name}"
        self.timeout: float = timeout
        self._mapping_created: bool = False

        # Certificate verification is only requested for https URLs.
        parsed_url = urlparse(base_url)
        self.verify_ssl = parsed_url.scheme == "https"

    def _request(self, method: str, url: str, json_body: Optional[Dict[str, Any]] = None) -> "requests.Response":
        """Send one authenticated request with the shared verify/timeout settings.

        Args:
            method: HTTP method name (e.g. "HEAD", "GET", "PUT", "DELETE")
            url: Absolute URL to call
            json_body: Optional JSON payload; when given, a JSON content type is sent

        Returns:
            The ``requests`` response object.
        """
        headers = {"Authorization": f"ApiKey {self.api_key}"}
        extra: Dict[str, Any] = {}
        if json_body is not None:
            headers["Content-Type"] = "application/json"
            extra["json"] = json_body
        return requests.request(
            method,
            url,
            headers=headers,
            verify=self.verify_ssl,
            timeout=self.timeout,
            **extra,
        )

    def create_logging_index_mapping(self) -> bool:
        """Create index with proper mapping for logging data.

        An existing index whose ``@timestamp`` mapping is not a date is
        deleted and recreated, which discards its documents.

        Returns:
            bool: True if mapping was created successfully, False otherwise.
        """
        if self._mapping_created:
            return True

        try:
            # Nothing to do when the index is already set up correctly
            if self._index_exists_with_correct_mapping():
                self._mapping_created = True
                return True

            # If index exists but has wrong mapping, delete and recreate it
            if self.index_exists():
                print(f"Warning: Index {self.index_name} exists with incorrect mapping. Deleting and recreating...")
                if not self.delete_index():
                    print(f"Warning: Failed to delete existing index {self.index_name}")
                    return False

            response = self._request("PUT", self.index_url, json_body=self._get_logging_mapping())
            if response.status_code in [200, 201]:
                self._mapping_created = True
                return True

            print(f"Warning: Failed to create index mapping: {response.status_code} - {response.text}")
            return False

        except Exception as e:
            print(f"Warning: Failed to create index mapping: {e}")
            return False

    def _index_exists_with_correct_mapping(self) -> bool:
        """Check if index exists and has the correct @timestamp mapping.

        Returns:
            bool: True if index exists with correct mapping, False otherwise
            (including when any request fails).
        """
        try:
            if not self.index_exists():
                return False

            mapping_response = self._request("GET", f"{self.index_url}/_mapping")
            if mapping_response.status_code != 200:
                return False

            return self._has_correct_timestamp_mapping(mapping_response.json())
        except Exception:
            return False

    def _has_correct_timestamp_mapping(self, mapping_data: Dict[str, Any]) -> bool:
        """Check if the mapping has @timestamp as a date field.

        Args:
            mapping_data: Elasticsearch mapping response data

        Returns:
            bool: True if @timestamp is correctly mapped as date field;
            False for any missing key or unexpectedly shaped value.
        """
        try:
            properties = mapping_data[self.index_name]["mappings"]["properties"]
            return properties["@timestamp"].get("type") == "date"
        except (KeyError, TypeError, AttributeError):
            return False

    def _get_logging_mapping(self) -> Dict[str, Any]:
        """Get the standard mapping for logging data.

        Returns:
            Dict containing the index mapping configuration
        """
        return {
            "mappings": {
                "properties": {
                    "@timestamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
                    "level": {"type": "keyword"},
                    "message": {"type": "text"},
                    "logger_name": {"type": "keyword"},
                }
            }
        }

    def delete_index(self) -> bool:
        """Delete the managed index.

        Returns:
            bool: True if index was deleted successfully, False otherwise.
        """
        try:
            response = self._request("DELETE", self.index_url)
            if response.status_code in [200, 404]:  # 404 means index doesn't exist, which is fine
                self._mapping_created = False
                return True

            print(f"Warning: Failed to delete index: {response.status_code} - {response.text}")
            return False
        except Exception as e:
            print(f"Warning: Failed to delete index: {e}")
            return False

    def index_exists(self) -> bool:
        """Check if the index exists.

        Returns:
            bool: True if index exists, False otherwise (also on request errors).
        """
        try:
            return self._request("HEAD", self.index_url).status_code == 200
        except Exception:
            return False

    def get_index_stats(self) -> Optional[Dict[str, Any]]:
        """Get statistics about the index.

        Returns:
            Dict containing index statistics, or None if failed
        """
        try:
            response = self._request("GET", f"{self.index_url}/_stats")
            if response.status_code == 200:
                return response.json()
            return None
        except Exception:
            return None
@@ -0,0 +1,167 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ import logging
5
+ from typing import Optional
6
+
7
+ from dotenv import load_dotenv
8
+ from eval_protocol.directory_utils import find_eval_protocol_dir
9
+ from eval_protocol.types.remote_rollout_processor import ElasticSearchConfig
10
+ from eval_protocol.logging.elasticsearch_index_manager import ElasticsearchIndexManager
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class ElasticsearchSetupError(Exception):
    """Signals that the Elasticsearch environment could not be prepared."""
19
+
20
+
21
class ElasticsearchSetup:
    """Handles Elasticsearch setup with retry logic for existing containers."""

    def __init__(self):
        self.eval_protocol_dir = find_eval_protocol_dir()

    def setup_elasticsearch(self, index_name: str = "default-logs") -> "ElasticSearchConfig":
        """
        Start Elasticsearch and make sure the logging index exists.

        If an ``elastic-start-local`` directory already exists under the
        eval-protocol directory its ``start.sh`` is run; otherwise the
        start-local installer is downloaded and executed.

        Args:
            index_name: Name of the Elasticsearch index to use for logging

        Returns:
            ElasticSearchConfig for the running instance with the specified index name.

        Raises:
            ElasticsearchSetupError: If the instance cannot be started or its
                ``.env`` file cannot be read.
        """
        elastic_start_local_dir = os.path.join(self.eval_protocol_dir, "elastic-start-local")
        env_file_path = self._get_env_file_path()

        if os.path.exists(elastic_start_local_dir):
            config = self._setup_existing_docker_elasticsearch(elastic_start_local_dir, env_file_path)
        else:
            config = self._setup_initialized_docker_elasticsearch(env_file_path)

        # Index creation is best-effort, but a failure should be visible.
        if not self.create_logging_index(index_name):
            logger.warning(f"Logging index {index_name} could not be created or verified")

        return ElasticSearchConfig(url=config.url, api_key=config.api_key, index_name=index_name)

    def _setup_existing_docker_elasticsearch(
        self, elastic_start_local_dir: str, env_file_path: str
    ) -> "ElasticSearchConfig":
        """Set up Elasticsearch using existing Docker start.sh script."""
        from eval_protocol.utils.subprocess_utils import run_script_and_wait

        run_script_and_wait(
            script_name="start.sh",
            working_directory=elastic_start_local_dir,
            inherit_stdout=True,
        )
        return self._parse_elastic_env_file(env_file_path)

    def _setup_initialized_docker_elasticsearch(self, env_file_path: str) -> "ElasticSearchConfig":
        """Set up Elasticsearch by initializing Docker setup from scratch with retry logic.

        Raises:
            ElasticsearchSetupError: If the installer fails for a reason other
                than an already-running container, or all attempts are used up.
        """
        max_retries = 2
        for attempt in range(max_retries):
            succeeded, stdout = self._run_start_local()

            if succeeded:
                return self._parse_elastic_env_file(env_file_path)

            # Check if container is already running and handle it
            if self._handle_existing_elasticsearch_container(stdout):
                logger.info(f"Retrying Elasticsearch setup (attempt {attempt + 1}/{max_retries})")
                continue

            # If we get here, it's a different error
            raise ElasticsearchSetupError(
                f"Failed to start Elasticsearch (attempt {attempt + 1}/{max_retries}): {stdout}"
            )

        raise ElasticsearchSetupError(f"Failed to start Elasticsearch after {max_retries} attempts")

    def _run_start_local(self) -> "tuple[bool, str]":
        """Download and run the start-local installer, echoing and capturing its stdout.

        The curl -> sh pipeline is wired up in Python rather than through
        ``sh -c "set -o pipefail; ..."`` because ``pipefail`` is not accepted
        by every ``/bin/sh``. This also removes the temporary file whose path
        was interpolated into a shell string.

        Returns:
            ``(succeeded, captured_stdout)`` where ``succeeded`` is True only
            when both curl and the installer exited with status 0.

        Raises:
            ElasticsearchSetupError: If curl or sh cannot be launched.
        """
        import sys

        try:
            fetch = subprocess.Popen(
                ["curl", "-fsSL", "https://elastic.co/start-local"],
                stdout=subprocess.PIPE,
                cwd=self.eval_protocol_dir,
            )
        except OSError as e:
            raise ElasticsearchSetupError(f"Failed to launch curl: {e}") from e

        try:
            install = subprocess.Popen(
                ["sh", "-s", "--", "--esonly"],
                stdin=fetch.stdout,
                stdout=subprocess.PIPE,
                text=True,
                cwd=self.eval_protocol_dir,
            )
        except OSError as e:
            fetch.kill()
            fetch.wait()
            raise ElasticsearchSetupError(f"Failed to launch sh: {e}") from e
        finally:
            # Drop the parent's copy of the pipe so curl gets SIGPIPE if sh exits early.
            if fetch.stdout is not None:
                fetch.stdout.close()

        captured = []
        if install.stdout is not None:
            with install.stdout as installer_output:
                for line in installer_output:
                    # Mirror progress to the console while keeping a copy for error analysis.
                    sys.stdout.write(line)
                    sys.stdout.flush()
                    captured.append(line)

        install_returncode = install.wait()
        fetch_returncode = fetch.wait()
        return (fetch_returncode == 0 and install_returncode == 0), "".join(captured)

    def _handle_existing_elasticsearch_container(self, output: str) -> bool:
        """
        Check if the installer output indicates that the Elasticsearch container is already running.
        If so, stop the existing container and return True to indicate a retry is needed.

        Returns False when the output does not mention the container or when
        stopping it fails.
        """
        if "docker stop es-local-dev" not in output:
            return False

        logger.info("Elasticsearch container 'es-local-dev' is already running. Stopping it...")
        try:
            subprocess.run(["docker", "stop", "es-local-dev"], check=True, capture_output=True, text=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a docker binary that is missing or not executable.
            logger.warning(f"Failed to stop existing container: {e}")
            return False

        logger.info("Successfully stopped existing Elasticsearch container")
        return True  # Indicate retry is needed

    def _parse_elastic_env_file(self, env_file_path: str) -> "ElasticSearchConfig":
        """Load the .env file and read ES_LOCAL_API_KEY and ES_LOCAL_URL from the environment.

        Raises:
            ElasticsearchSetupError: If the file cannot be loaded or either
                variable is missing or empty.
        """
        loaded = load_dotenv(env_file_path)
        if not loaded:
            raise ElasticsearchSetupError("Failed to load .env file")

        api_key = os.getenv("ES_LOCAL_API_KEY")
        url = os.getenv("ES_LOCAL_URL")

        if not url or not api_key:
            raise ElasticsearchSetupError("Failed to parse ES_LOCAL_API_KEY and ES_LOCAL_URL from .env file")

        return ElasticSearchConfig(url=url, api_key=api_key, index_name="default-logs")

    def create_logging_index(self, index_name: str) -> bool:
        """Create an Elasticsearch index with proper mapping for logging data.

        Args:
            index_name: Name of the index to create

        Returns:
            bool: True if index was created successfully, False otherwise.
        """
        try:
            # Get the config from the .env file
            config = self._parse_elastic_env_file(self._get_env_file_path())

            # Create index manager and set up mapping
            index_manager = ElasticsearchIndexManager(config.url, index_name, config.api_key)
            return index_manager.create_logging_index_mapping()

        except Exception as e:
            logger.error(f"Failed to create logging index {index_name}: {e}")
            return False

    def _get_env_file_path(self) -> str:
        """Get the path to the .env file."""
        return os.path.join(self.eval_protocol_dir, "elastic-start-local", ".env")
@@ -367,6 +367,8 @@ def evaluation_test(
367
367
  exception_handler_config=exception_handler_config,
368
368
  )
369
369
 
370
+ rollout_processor.setup()
371
+
370
372
  async def execute_run(run_idx: int, config: RolloutProcessorConfig):
371
373
  nonlocal all_results
372
374
 
@@ -6,11 +6,16 @@ import requests
6
6
 
7
7
  from eval_protocol.models import EvaluationRow, Status
8
8
  from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
9
- from eval_protocol.types.remote_rollout_processor import InitRequest, RolloutMetadata
9
+ from eval_protocol.types.remote_rollout_processor import ElasticSearchConfig, InitRequest, RolloutMetadata
10
10
  from .rollout_processor import RolloutProcessor
11
11
  from .types import RolloutProcessorConfig
12
+ from .elasticsearch_setup import ElasticsearchSetup
13
+ import logging
14
+
12
15
  import os
13
16
 
17
+ logger = logging.getLogger(__name__)
18
+
14
19
 
15
20
  class RemoteRolloutProcessor(RolloutProcessor):
16
21
  """
@@ -27,6 +32,8 @@ class RemoteRolloutProcessor(RolloutProcessor):
27
32
  poll_interval: float = 1.0,
28
33
  timeout_seconds: float = 120.0,
29
34
  output_data_loader: Callable[[str], DynamicDataLoader],
35
+ disable_elastic_search: bool = False,
36
+ elastic_search_config: Optional[ElasticSearchConfig] = None,
30
37
  ):
31
38
  # Prefer constructor-provided configuration. These can be overridden via
32
39
  # config.kwargs at call time for backward compatibility.
@@ -37,6 +44,21 @@ class RemoteRolloutProcessor(RolloutProcessor):
37
44
  self._poll_interval = poll_interval
38
45
  self._timeout_seconds = timeout_seconds
39
46
  self._output_data_loader = output_data_loader
47
+ self._disable_elastic_search = disable_elastic_search
48
+ self._elastic_search_config = elastic_search_config
49
+
50
def setup(self) -> None:
    """Make an Elasticsearch config available before rollouts start.

    Nothing is provisioned when Elasticsearch is disabled, or when a
    config was already supplied to the constructor. Previously a supplied
    config was always replaced by a freshly provisioned local instance.
    """
    if self._disable_elastic_search:
        logger.info("Elasticsearch is disabled, skipping setup")
        return
    if self._elastic_search_config is not None:
        logger.info("Elasticsearch config was provided, skipping local setup")
        return
    logger.info("Setting up Elasticsearch")
    self._elastic_search_config = self._setup_elastic_search()
    logger.info("Elasticsearch setup complete")
57
+
58
def _setup_elastic_search(self) -> ElasticSearchConfig:
    """Delegate to ElasticsearchSetup and return the config it produces."""
    return ElasticsearchSetup().setup_elasticsearch()
40
62
 
41
63
  def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
42
64
  tasks: List[asyncio.Task[EvaluationRow]] = []
@@ -113,12 +135,23 @@ class RemoteRolloutProcessor(RolloutProcessor):
113
135
  if row.execution_metadata.rollout_id is None:
114
136
  raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
115
137
 
138
+ final_model_base_url = model_base_url
139
+ if model_base_url and model_base_url.startswith("https://tracing.fireworks.ai/project_id/"):
140
+ final_model_base_url = (
141
+ f"{model_base_url}/rollout_id/{meta.rollout_id}"
142
+ f"/invocation_id/{meta.invocation_id}"
143
+ f"/experiment_id/{meta.experiment_id}"
144
+ f"/run_id/{meta.run_id}"
145
+ f"/row_id/{meta.row_id}"
146
+ )
147
+
116
148
  init_payload: InitRequest = InitRequest(
117
149
  model=model,
118
150
  messages=clean_messages,
119
151
  tools=row.tools,
120
152
  metadata=meta,
121
- model_base_url=model_base_url,
153
+ model_base_url=final_model_base_url,
154
+ elastic_search_config=self._elastic_search_config,
122
155
  )
123
156
 
124
157
  # Fire-and-poll
@@ -197,6 +230,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
197
230
  langfuse_row.input_metadata.dataset_info = row.input_metadata.dataset_info
198
231
  langfuse_row.eval_metadata = row.eval_metadata
199
232
  langfuse_row.ground_truth = row.ground_truth
233
+
234
+ # this is useful to detect stopped evaluations so we can update
235
+ # the status in the logs server
236
+ langfuse_row.pid = row.pid
237
+
200
238
  return langfuse_row
201
239
  else:
202
240
  raise ValueError("RemoteRolloutProcessor's output_data_loader should return exactly one row.")
@@ -10,6 +10,10 @@ class RolloutProcessor(ABC):
10
10
  Abstract base class for all rollout processor strategies.
11
11
  """
12
12
 
13
+ def setup(self) -> None:
14
+ """Setup resources. Override in subclasses if setup is needed. Executed once per invocation."""
15
+ pass
16
+
13
17
  @abstractmethod
14
18
  def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> list[asyncio.Task[EvaluationRow]]:
15
19
  """Process evaluation rows and return async tasks. Must be implemented by subclasses."""
@@ -7,6 +7,16 @@ from pydantic import BaseModel, Field
7
7
  from eval_protocol.models import Message, Status
8
8
 
9
9
 
10
class ElasticSearchConfig(BaseModel):
    """Connection settings for an Elasticsearch instance."""

    # Base URL of the Elasticsearch server
    url: str
    # API key used to authenticate requests
    api_key: str
    # Name of the index that log documents are written to
    index_name: str
18
+
19
+
10
20
  class RolloutMetadata(BaseModel):
11
21
  """Metadata for rollout execution."""
12
22
 
@@ -21,6 +31,7 @@ class InitRequest(BaseModel):
21
31
  """Request model for POST /init endpoint."""
22
32
 
23
33
  model: str
34
+ elastic_search_config: Optional[ElasticSearchConfig] = None
24
35
  messages: Optional[List[Message]] = None
25
36
  tools: Optional[List[Dict[str, Any]]] = None
26
37