eval-protocol 0.2.43__tar.gz → 0.2.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (416)
  1. {eval_protocol-0.2.43/eval_protocol.egg-info → eval_protocol-0.2.45}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/__init__.py +2 -0
  3. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/fireworks_tracing.py +29 -50
  5. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli.py +1 -0
  6. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/logs.py +2 -1
  7. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +8 -1
  8. eval_protocol-0.2.45/eval_protocol/event_bus/sqlite_event_bus.py +126 -0
  9. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/sqlite_event_bus_database.py +6 -8
  10. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/elasticsearch_client.py +19 -0
  11. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +3 -3
  12. eval_protocol-0.2.45/eval_protocol/log_utils/util.py +22 -0
  13. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/remote_rollout_processor.py +1 -1
  14. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/logs_server.py +126 -22
  15. {eval_protocol-0.2.43 → eval_protocol-0.2.45/eval_protocol.egg-info}/PKG-INFO +1 -1
  16. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/SOURCES.txt +2 -0
  17. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_event_bus.py +74 -38
  18. eval_protocol-0.2.45/tests/test_event_bus_helper.py +74 -0
  19. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_logs_server.py +2 -2
  20. eval_protocol-0.2.43/eval_protocol/event_bus/sqlite_event_bus.py +0 -109
  21. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/LICENSE +0 -0
  22. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/README.md +0 -0
  23. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/__init__.py +0 -0
  24. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/normalize_sandbox_fusion.py +0 -0
  25. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/utils/__init__.py +0 -0
  26. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/utils/generate_api_key.py +0 -0
  27. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/development/utils/subprocess_manager.py +0 -0
  28. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/__main__.py +0 -0
  29. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/__init__.py +0 -0
  30. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/base.py +0 -0
  31. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/bigquery.py +0 -0
  32. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/braintrust.py +0 -0
  33. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/huggingface.py +0 -0
  34. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/langchain.py +0 -0
  35. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/langfuse.py +0 -0
  36. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/langsmith.py +0 -0
  37. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/openai_responses.py +0 -0
  38. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/trl.py +0 -0
  39. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/adapters/utils.py +0 -0
  40. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/__init__.py +0 -0
  41. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/models.py +0 -0
  42. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/orchestrator.py +0 -0
  43. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resource_abc.py +0 -0
  44. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resource_pool.py +0 -0
  45. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/__init__.py +0 -0
  46. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  47. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  48. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  49. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  50. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  51. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/docker_resource.py +0 -0
  52. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  53. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  54. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/resources/sql_resource.py +0 -0
  55. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/task_manager.py +0 -0
  56. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/agent/tool_registry.py +0 -0
  57. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/auth.py +0 -0
  58. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/__init__.py +0 -0
  59. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  60. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  61. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_aime25.py +0 -0
  62. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  63. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  64. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  65. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  66. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  67. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/__init__.py +0 -0
  68. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  69. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/common.py +0 -0
  70. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/deploy.py +0 -0
  71. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  72. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/preview.py +0 -0
  73. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  74. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/cli_commands/upload.py +0 -0
  75. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/common_utils.py +0 -0
  76. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/config.py +0 -0
  77. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/__init__.py +0 -0
  78. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  79. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  80. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  81. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/data_loader/models.py +0 -0
  82. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/__init__.py +0 -0
  83. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  84. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  85. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  86. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/datasets/__init__.py +0 -0
  87. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/datasets/loader.py +0 -0
  88. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/directory_utils.py +0 -0
  89. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/evaluation.py +0 -0
  90. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/__init__.py +0 -0
  91. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/event_bus.py +0 -0
  92. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/event_bus/logger.py +0 -0
  93. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/execution/__init__.py +0 -0
  94. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/execution/pipeline.py +0 -0
  95. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/gcp_tools.py +0 -0
  96. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generation/cache.py +0 -0
  97. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generation/clients/base.py +0 -0
  98. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generation/clients.py +0 -0
  99. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/generic_server.py +0 -0
  100. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/get_pep440_version.py +0 -0
  101. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/human_id/__init__.py +0 -0
  102. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/human_id/dictionary.py +0 -0
  103. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/__init__.py +0 -0
  104. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/deepeval.py +0 -0
  105. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/openeval.py +0 -0
  106. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/integrations/trl.py +0 -0
  107. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/__init__.py +0 -0
  108. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  109. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  110. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/logging_utils.py +0 -0
  111. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/__init__.py +0 -0
  112. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/adapter.py +0 -0
  113. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/client/__init__.py +0 -0
  114. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/client/connection.py +0 -0
  115. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/clients.py +0 -0
  116. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/__init__.py +0 -0
  117. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/base_policy.py +0 -0
  118. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/manager.py +0 -0
  119. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/execution/policy.py +0 -0
  120. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/grid_renderer.py +0 -0
  121. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  122. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/mcpgym.py +0 -0
  123. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/process_manager.py +0 -0
  124. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/session/__init__.py +0 -0
  125. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/session/manager.py +0 -0
  126. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/simple_process_manager.py +0 -0
  127. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp/simulation_server.py +0 -0
  128. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/__init__.py +0 -0
  129. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/config.py +0 -0
  130. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/main.py +0 -0
  131. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  132. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  133. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  134. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  135. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_env.py +0 -0
  136. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/__init__.py +0 -0
  137. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  138. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  139. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  140. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  141. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  142. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  143. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  144. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  145. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  146. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  147. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  148. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  149. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  150. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  151. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/models.py +0 -0
  152. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/packaging.py +0 -0
  153. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/platform_api.py +0 -0
  154. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/playback_policy.py +0 -0
  155. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/__init__.py +0 -0
  156. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  157. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  158. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  159. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  160. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  161. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  162. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  163. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  164. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  165. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/evaluation_test.py +0 -0
  166. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  167. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/exception_config.py +0 -0
  168. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/execution.py +0 -0
  169. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  170. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  171. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/parameterize.py +0 -0
  172. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/plugin.py +0 -0
  173. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/rollout_processor.py +0 -0
  174. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/store_experiment_link.py +0 -0
  175. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/store_results_url.py +0 -0
  176. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/types.py +0 -0
  177. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/utils.py +0 -0
  178. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/pytest/validate_signature.py +0 -0
  179. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/__init__.py +0 -0
  180. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge.py +0 -0
  181. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  182. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
  183. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
  184. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
  185. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/quickstart/utils.py +0 -0
  186. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/resources.py +0 -0
  187. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/reward_function.py +0 -0
  188. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/__init__.py +0 -0
  189. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/accuracy.py +0 -0
  190. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/accuracy_length.py +0 -0
  191. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  192. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  193. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/apps_testing_util.py +0 -0
  194. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/bfcl_reward.py +0 -0
  195. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/code_execution.py +0 -0
  196. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/code_execution_utils.py +0 -0
  197. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/cpp_code.py +0 -0
  198. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  199. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/format.py +0 -0
  200. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/function_calling.py +0 -0
  201. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/json_schema.py +0 -0
  202. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/language_consistency.py +0 -0
  203. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/lean_prover.py +0 -0
  204. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/length.py +0 -0
  205. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  206. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/math.py +0 -0
  207. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  208. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/reasoning_steps.py +0 -0
  209. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/repetition.py +0 -0
  210. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rewards/tag_count.py +0 -0
  211. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/rl_processing.py +0 -0
  212. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/server.py +0 -0
  213. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/stats/__init__.py +0 -0
  214. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/stats/confidence_intervals.py +0 -0
  215. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/typed_interface.py +0 -0
  216. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/__init__.py +0 -0
  217. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/errors.py +0 -0
  218. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/remote_rollout_processor.py +0 -0
  219. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/types/types.py +0 -0
  220. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/__init__.py +0 -0
  221. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/batch_evaluation.py +0 -0
  222. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/batch_transformation.py +0 -0
  223. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/check_server_status.py +0 -0
  224. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/dataset_helpers.py +0 -0
  225. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/logs_models.py +0 -0
  226. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/module_loader.py +0 -0
  227. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/packaging_utils.py +0 -0
  228. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/show_results_url.py +0 -0
  229. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/static_policy.py +0 -0
  230. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/subprocess_utils.py +0 -0
  231. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol/utils/vite_server.py +0 -0
  232. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/dependency_links.txt +0 -0
  233. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/entry_points.txt +0 -0
  234. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/requires.txt +0 -0
  235. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/eval_protocol.egg-info/top_level.txt +0 -0
  236. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/pyproject.toml +0 -0
  237. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/setup.cfg +0 -0
  238. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/setup.py +0 -0
  239. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_accuracy.py +0 -0
  240. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_accuracy_length.py +0 -0
  241. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_adapters_e2e.py +0 -0
  242. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_agent_orchestrator.py +0 -0
  243. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_agent_resources.py +0 -0
  244. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_auth.py +0 -0
  245. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_batch_evaluation.py +0 -0
  246. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cli.py +0 -0
  247. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cli_agent.py +0 -0
  248. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cli_args.py +0 -0
  249. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_code_execution.py +0 -0
  250. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_config.py +0 -0
  251. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_control_plane_separation.py +0 -0
  252. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_cpp_code.py +0 -0
  253. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_data_driven_task_manager.py +0 -0
  254. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_deepcoder_reward.py +0 -0
  255. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_deepeval_integration.py +0 -0
  256. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_deploy_integration.py +0 -0
  257. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_directory_utils.py +0 -0
  258. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_e2b_integration.py +0 -0
  259. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_e2b_js_integration.py +0 -0
  260. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_edge_cases.py +0 -0
  261. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_eval_protocol_import.py +0 -0
  262. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation.py +0 -0
  263. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation_integration.py +0 -0
  264. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation_postprocess.py +0 -0
  265. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_evaluation_preview_integration.py +0 -0
  266. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_examples_end_to_end.py +0 -0
  267. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_fireworks_api.py +0 -0
  268. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_format.py +0 -0
  269. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_fractional_code.py +0 -0
  270. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_function_calling.py +0 -0
  271. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_gcp_tools.py +0 -0
  272. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_generic_server.py +0 -0
  273. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_human_id.py +0 -0
  274. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_integration.py +0 -0
  275. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_json_schema.py +0 -0
  276. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_kwargs_validation.py +0 -0
  277. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_language_consistency.py +0 -0
  278. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_lean_prover.py +0 -0
  279. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_lean_prover_runner.py +0 -0
  280. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_length.py +0 -0
  281. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_list_comparison_math_reward.py +0 -0
  282. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_logs_server_simple.py +0 -0
  283. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_math.py +0 -0
  284. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_minimal.py +0 -0
  285. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_models.py +0 -0
  286. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_models_rl.py +0 -0
  287. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_multiple_choice_math_reward.py +0 -0
  288. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_n_variant_batch_integration.py +0 -0
  289. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_n_variant_integration.py +0 -0
  290. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_openai_compatibility.py +0 -0
  291. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_openeval_integration.py +0 -0
  292. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_packaging.py +0 -0
  293. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_parallel_rollouts.py +0 -0
  294. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_platform_api.py +0 -0
  295. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_quickstart_utils.py +0 -0
  296. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_readiness.py +0 -0
  297. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_reasoning_steps.py +0 -0
  298. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_repetition.py +0 -0
  299. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_repetition_debug.py +0 -0
  300. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_retry_mechanism.py +0 -0
  301. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_reward_function.py +0 -0
  302. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_reward_protocol_import.py +0 -0
  303. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_rl_processing.py +0 -0
  304. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_rollout_control_plane_integration.py +0 -0
  305. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_server.py +0 -0
  306. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_show_results_url.py +0 -0
  307. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_status_migration_changes.py +0 -0
  308. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_status_migration_integration.py +0 -0
  309. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_status_model.py +0 -0
  310. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_tag_count.py +0 -0
  311. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_tau_bench_airline_smoke.py +0 -0
  312. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_typed_interface.py +0 -0
  313. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_typed_interface_rl.py +0 -0
  314. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_upload_entrypoint.py +0 -0
  315. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_url_handling.py +0 -0
  316. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/tests/test_vite_server.py +0 -0
  317. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/__init__.py +0 -0
  318. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/agent/__init__.py +0 -0
  319. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/agent/base.py +0 -0
  320. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/agent/llm_agent.py +0 -0
  321. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/__init__.py +0 -0
  322. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/api_config.py +0 -0
  323. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/data_model.py +0 -0
  324. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/api_service/simulation_service.py +0 -0
  325. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/cli.py +0 -0
  326. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/config.py +0 -0
  327. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/airline/policy.md +0 -0
  328. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/mock/policy.md +0 -0
  329. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  330. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/retail/policy.md +0 -0
  331. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  332. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  333. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  334. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  335. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  336. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  337. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  338. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/__init__.py +0 -0
  339. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/message.py +0 -0
  340. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/simulation.py +0 -0
  341. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/data_model/tasks.py +0 -0
  342. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/__init__.py +0 -0
  343. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/__init__.py +0 -0
  344. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/data_model.py +0 -0
  345. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/environment.py +0 -0
  346. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/tools.py +0 -0
  347. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/airline/utils.py +0 -0
  348. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/__init__.py +0 -0
  349. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/data_model.py +0 -0
  350. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/environment.py +0 -0
  351. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/tools.py +0 -0
  352. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/mock/utils.py +0 -0
  353. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/__init__.py +0 -0
  354. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/data_model.py +0 -0
  355. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/environment.py +0 -0
  356. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/tools.py +0 -0
  357. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/retail/utils.py +0 -0
  358. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/__init__.py +0 -0
  359. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/data_model.py +0 -0
  360. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/environment.py +0 -0
  361. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  362. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  363. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  364. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  365. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  366. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  367. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  368. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  369. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/tools.py +0 -0
  370. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  371. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  372. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/domains/telecom/utils.py +0 -0
  373. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/__init__.py +0 -0
  374. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/db.py +0 -0
  375. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/environment.py +0 -0
  376. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/server.py +0 -0
  377. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/tool.py +0 -0
  378. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/toolkit.py +0 -0
  379. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  380. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/__init__.py +0 -0
  381. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator.py +0 -0
  382. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  383. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  384. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  385. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  386. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  387. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/metrics/__init__.py +0 -0
  388. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/metrics/agent_metrics.py +0 -0
  389. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  390. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/__init__.py +0 -0
  391. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  392. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  393. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/orchestrator/utils.py +0 -0
  394. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/registry.py +0 -0
  395. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/run.py +0 -0
  396. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/__init__.py +0 -0
  397. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/check_data.py +0 -0
  398. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  399. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/start_servers.py +0 -0
  400. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/scripts/view_simulations.py +0 -0
  401. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/user/__init__.py +0 -0
  402. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/user/base.py +0 -0
  403. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/user/user_simulator.py +0 -0
  404. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/__init__.py +0 -0
  405. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/display.py +0 -0
  406. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/io_utils.py +0 -0
  407. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/llm_utils.py +0 -0
  408. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/pydantic_utils.py +0 -0
  409. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vendor/tau2/utils/utils.py +0 -0
  410. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/versioneer.py +0 -0
  411. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  412. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/index-C81y9r9l.js +0 -0
  413. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/index-C81y9r9l.js.map +0 -0
  414. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/index-DpYZaoAr.css +0 -0
  415. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  416. {eval_protocol-0.2.43 → eval_protocol-0.2.45}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.43
3
+ Version: 0.2.45
4
4
  Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -34,6 +34,7 @@ from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutPr
34
34
  from .pytest.parameterize import DefaultParameterIdGenerator
35
35
  from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
36
36
  from .log_utils.rollout_id_filter import RolloutIdFilter
37
+ from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
37
38
 
38
39
  from .types.remote_rollout_processor import (
39
40
  InitRequest,
@@ -68,6 +69,7 @@ warnings.filterwarnings("default", category=DeprecationWarning, module="eval_pro
68
69
  __all__ = [
69
70
  "ElasticsearchDirectHttpHandler",
70
71
  "RolloutIdFilter",
72
+ "setup_rollout_logging_for_elasticsearch_handler",
71
73
  "DataLoaderConfig",
72
74
  "Status",
73
75
  "RemoteRolloutProcessor",
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-10-08T08:52:41-0700",
11
+ "date": "2025-10-08T14:59:37-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "535169e7193e6500d8d323e7dbc31c14dca98b96",
15
- "version": "0.2.43"
14
+ "full-revisionid": "b120611112b84df8476cefcc02660c542e61b2a9",
15
+ "version": "0.2.45"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -281,9 +281,8 @@ class FireworksTracingAdapter(BaseAdapter):
281
281
  from_timestamp: Optional[datetime] = None,
282
282
  to_timestamp: Optional[datetime] = None,
283
283
  include_tool_calls: bool = True,
284
- backend_sleep_between_gets: float = 0.1,
285
- backend_max_retries: int = 3,
286
- proxy_max_retries: int = 3,
284
+ sleep_between_gets: float = 0.1,
285
+ max_retries: int = 3,
287
286
  span_name: Optional[str] = None,
288
287
  converter: Optional[TraceDictConverter] = None,
289
288
  ) -> List[EvaluationRow]:
@@ -305,10 +304,8 @@ class FireworksTracingAdapter(BaseAdapter):
305
304
  from_timestamp: Explicit start time (ISO format)
306
305
  to_timestamp: Explicit end time (ISO format)
307
306
  include_tool_calls: Whether to include tool calling traces
308
- backend_sleep_between_gets: Sleep time between backend trace fetches (passed to proxy)
309
- backend_max_retries: Maximum retries for backend operations (passed to proxy)
310
- proxy_max_retries: Maximum retries when proxy returns 404 (client-side retries with exponential backoff)
311
- span_name: If provided, extract messages from generations within this named span
307
+ sleep_between_gets: Sleep time between polling attempts (default: 0.1s)
308
+ max_retries: Max retry attempts used by proxy (default: 3)
312
309
  converter: Optional custom converter implementing TraceDictConverter protocol.
313
310
  If provided, this will be used instead of the default conversion logic.
314
311
 
@@ -318,9 +315,9 @@ class FireworksTracingAdapter(BaseAdapter):
318
315
  Raises:
319
316
  ValueError: If tags list is empty
320
317
  """
321
- # Validate that tags are provided (security requirement)
318
+ # Validate that tags are provided
322
319
  if not tags or len(tags) == 0:
323
- raise ValueError("At least one tag is required to fetch traces (security: prevents fetching all traces)")
320
+ raise ValueError("At least one tag is required to fetch traces")
324
321
 
325
322
  eval_rows = []
326
323
 
@@ -339,58 +336,40 @@ class FireworksTracingAdapter(BaseAdapter):
339
336
  "hours_back": hours_back,
340
337
  "from_timestamp": from_timestamp.isoformat() if from_timestamp else None,
341
338
  "to_timestamp": to_timestamp.isoformat() if to_timestamp else None,
342
- "sleep_between_gets": backend_sleep_between_gets,
343
- "max_retries": backend_max_retries,
339
+ "sleep_between_gets": sleep_between_gets,
340
+ "max_retries": max_retries,
344
341
  }
345
342
 
346
343
  # Remove None values
347
344
  params = {k: v for k, v in params.items() if v is not None}
348
345
 
349
- # Make request to proxy with retry logic
346
+ # Make request to proxy
350
347
  if self.project_id:
351
348
  url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
352
349
  else:
353
350
  url = f"{self.base_url}/v1/traces"
354
351
 
355
- # Retry loop for handling backend indexing delays (proxy returns 404)
356
352
  result = None
357
- for attempt in range(proxy_max_retries):
358
- try:
359
- response = requests.get(url, params=params, timeout=self.timeout)
360
- response.raise_for_status()
361
- result = response.json()
362
- break # Success, exit retry loop
363
- except requests.exceptions.HTTPError as e:
364
- error_msg = str(e)
365
- should_retry = False
366
-
367
- # Try to extract detail message from response
368
- if e.response is not None:
369
- try:
370
- error_detail = e.response.json().get("detail", "")
371
- error_msg = error_detail or e.response.text
372
-
373
- # Retry on 404 if it's due to incomplete/missing traces (backend still indexing)
374
- if e.response.status_code == 404:
375
- should_retry = True
376
- except Exception:
377
- error_msg = e.response.text
378
-
379
- if should_retry and attempt < proxy_max_retries - 1:
380
- sleep_time = 2 ** (attempt + 1)
381
- logger.warning(error_msg)
382
- time.sleep(sleep_time)
383
- else:
384
- # Final retry or non-retryable error
385
- logger.error("Failed to fetch traces from proxy: %s", error_msg)
386
- return eval_rows
387
- except requests.exceptions.RequestException as e:
388
- # Non-HTTP errors (network issues, timeouts, etc.)
389
- logger.error("Failed to fetch traces from proxy: %s", str(e))
390
- return eval_rows
391
-
392
- if result is None:
393
- logger.error("Failed to fetch traces after %d retries", proxy_max_retries)
353
+ try:
354
+ response = requests.get(url, params=params, timeout=self.timeout)
355
+ response.raise_for_status()
356
+ result = response.json()
357
+ except requests.exceptions.HTTPError as e:
358
+ error_msg = str(e)
359
+
360
+ # Try to extract detail message from response
361
+ if e.response is not None:
362
+ try:
363
+ error_detail = e.response.json().get("detail", {})
364
+ error_msg = error_detail or e.response.text
365
+ except Exception: # In case e.response.json() fails
366
+ error_msg = f"Proxy error: {e.response.text}"
367
+
368
+ logger.error("Failed to fetch traces from proxy: %s", error_msg)
369
+ return eval_rows
370
+ except requests.exceptions.RequestException as e:
371
+ # Non-HTTP errors (network issues, timeouts, etc.)
372
+ logger.error("Failed to fetch traces from proxy: %s", str(e))
394
373
  return eval_rows
395
374
 
396
375
  # Extract traces from response
@@ -300,6 +300,7 @@ def parse_args(args=None):
300
300
  # Logs command
301
301
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
302
302
  logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
303
+ logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
303
304
 
304
305
  # Upload command
305
306
  upload_parser = subparsers.add_parser(
@@ -16,6 +16,7 @@ def logs_command(args):
16
16
  print(f"🌐 URL: http://localhost:{port}")
17
17
  print(f"🔌 WebSocket: ws://localhost:{port}/ws")
18
18
  print(f"👀 Watching paths: {['current directory']}")
19
+ print(f"🔍 Debug mode: {args.debug}")
19
20
  print("Press Ctrl+C to stop the server")
20
21
  print("-" * 50)
21
22
 
@@ -25,7 +26,7 @@ def logs_command(args):
25
26
  elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
26
27
 
27
28
  try:
28
- serve_logs(port=args.port, elasticsearch_config=elasticsearch_config)
29
+ serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
29
30
  return 0
30
31
  except KeyboardInterrupt:
31
32
  print("\n🛑 Server stopped by user")
@@ -23,12 +23,19 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
23
23
 
24
24
  def log(self, row: "EvaluationRow") -> None:
25
25
  data = row.model_dump(exclude_none=True, mode="json")
26
+ rollout_id = data.get("execution_metadata", {}).get("rollout_id", "unknown")
27
+ logger.debug(f"[EVENT_BUS_EMIT] Starting to log row with rollout_id: {rollout_id}")
28
+
26
29
  self._store.upsert_row(data=data)
30
+ logger.debug(f"[EVENT_BUS_EMIT] Successfully stored row in database for rollout_id: {rollout_id}")
31
+
27
32
  try:
33
+ logger.debug(f"[EVENT_BUS_EMIT] Emitting event '{LOG_EVENT_TYPE}' for rollout_id: {rollout_id}")
28
34
  event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
35
+ logger.debug(f"[EVENT_BUS_EMIT] Successfully emitted event for rollout_id: {rollout_id}")
29
36
  except Exception as e:
30
37
  # Avoid breaking storage due to event emission issues
31
- logger.error(f"Failed to emit row_upserted event: {e}")
38
+ logger.error(f"[EVENT_BUS_EMIT] Failed to emit row_upserted event for rollout_id {rollout_id}: {e}")
32
39
  pass
33
40
 
34
41
  def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
@@ -0,0 +1,126 @@
1
+ import asyncio
2
+ import os
3
+ import threading
4
+ import time
5
+ from typing import Any, Optional
6
+ from uuid import uuid4
7
+
8
+ from eval_protocol.event_bus.event_bus import EventBus
9
+ from eval_protocol.event_bus.logger import logger
10
+ from eval_protocol.event_bus.sqlite_event_bus_database import SqliteEventBusDatabase
11
+
12
+
13
+ class SqliteEventBus(EventBus):
14
+ """SQLite-based event bus implementation that supports cross-process communication."""
15
+
16
+ def __init__(self, db_path: Optional[str] = None):
17
+ super().__init__()
18
+
19
+ # Use the same database as the evaluation row store
20
+ if db_path is None:
21
+ from eval_protocol.directory_utils import find_eval_protocol_dir
22
+
23
+ eval_protocol_dir = find_eval_protocol_dir()
24
+ db_path = os.path.join(eval_protocol_dir, "logs.db")
25
+
26
+ self._db: SqliteEventBusDatabase = SqliteEventBusDatabase(db_path)
27
+ self._running = False
28
+ self._process_id = str(os.getpid())
29
+
30
+ def emit(self, event_type: str, data: Any) -> None:
31
+ """Emit an event to all subscribers.
32
+
33
+ Args:
34
+ event_type: Type of event (e.g., "log")
35
+ data: Event data
36
+ """
37
+ logger.debug(f"[CROSS_PROCESS_EMIT] Emitting event type: {event_type}")
38
+
39
+ # Call local listeners immediately
40
+ logger.debug(f"[CROSS_PROCESS_EMIT] Calling {len(self._listeners)} local listeners")
41
+ super().emit(event_type, data)
42
+ logger.debug("[CROSS_PROCESS_EMIT] Completed local listener calls")
43
+
44
+ # Publish to cross-process subscribers
45
+ logger.debug("[CROSS_PROCESS_EMIT] Publishing to cross-process subscribers")
46
+ self._publish_cross_process(event_type, data)
47
+ logger.debug("[CROSS_PROCESS_EMIT] Completed cross-process publish")
48
+
49
+ def _publish_cross_process(self, event_type: str, data: Any) -> None:
50
+ """Publish event to cross-process subscribers via database."""
51
+ logger.debug(f"[CROSS_PROCESS_PUBLISH] Publishing event {event_type} to database")
52
+ try:
53
+ self._db.publish_event(event_type, data, self._process_id)
54
+ logger.debug(f"[CROSS_PROCESS_PUBLISH] Successfully published event {event_type} to database")
55
+ except Exception as e:
56
+ logger.error(f"[CROSS_PROCESS_PUBLISH] Failed to publish event {event_type} to database: {e}")
57
+
58
+ def start_listening(self) -> None:
59
+ """Start listening for cross-process events."""
60
+ if self._running:
61
+ logger.debug("[CROSS_PROCESS_LISTEN] Already listening, skipping start")
62
+ return
63
+
64
+ logger.debug("[CROSS_PROCESS_LISTEN] Starting cross-process event listening")
65
+ self._running = True
66
+ loop = asyncio.get_running_loop()
67
+ loop.create_task(self._database_listener_task())
68
+ logger.debug("[CROSS_PROCESS_LISTEN] Started async database listener task")
69
+
70
+ def stop_listening(self) -> None:
71
+ """Stop listening for cross-process events."""
72
+ logger.debug("[CROSS_PROCESS_LISTEN] Stopping cross-process event listening")
73
+ self._running = False
74
+
75
+ async def _database_listener_task(self) -> None:
76
+ """Single database listener task that processes events and recreates itself."""
77
+ if not self._running:
78
+ # this should end the task loop
79
+ logger.debug("[CROSS_PROCESS_LISTENER] Stopping database listener task")
80
+ return
81
+
82
+ # Get unprocessed events from other processes
83
+ events = self._db.get_unprocessed_events(str(self._process_id))
84
+ if events:
85
+ logger.debug(f"[CROSS_PROCESS_LISTENER] Found {len(events)} unprocessed events")
86
+ else:
87
+ logger.debug(f"[CROSS_PROCESS_LISTENER] No unprocessed events found for process {self._process_id}")
88
+
89
+ for event in events:
90
+ logger.debug(
91
+ f"[CROSS_PROCESS_LISTENER] Processing event {event['event_id']} of type {event['event_type']}"
92
+ )
93
+ # Handle the event
94
+ self._handle_cross_process_event(event["event_type"], event["data"])
95
+ logger.debug(f"[CROSS_PROCESS_LISTENER] Successfully processed event {event['event_id']}")
96
+
97
+ # Mark as processed
98
+ self._db.mark_event_processed(event["event_id"])
99
+ logger.debug(f"[CROSS_PROCESS_LISTENER] Marked event {event['event_id']} as processed")
100
+
101
+ # Clean up old events every hour
102
+ current_time = time.time()
103
+ if not hasattr(self, "_last_cleanup"):
104
+ self._last_cleanup = current_time
105
+ elif current_time - self._last_cleanup >= 3600:
106
+ logger.debug("[CROSS_PROCESS_LISTENER] Cleaning up old events")
107
+ self._db.cleanup_old_events()
108
+ self._last_cleanup = current_time
109
+
110
+ # Wait, then schedule the next poll; the new task returns on entry if no longer running
111
+ await asyncio.sleep(1.0)
112
+ loop = asyncio.get_running_loop()
113
+ loop.create_task(self._database_listener_task())
114
+
115
+ def _handle_cross_process_event(self, event_type: str, data: Any) -> None:
116
+ """Handle events received from other processes."""
117
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Handling cross-process event type: {event_type}")
118
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Calling {len(self._listeners)} listeners")
119
+
120
+ for i, listener in enumerate(self._listeners):
121
+ try:
122
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Calling listener {i}")
123
+ listener(event_type, data)
124
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Successfully called listener {i}")
125
+ except Exception as e:
126
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Cross-process event listener {i} failed for {event_type}: {e}")
@@ -2,7 +2,7 @@ import time
2
2
  from typing import Any, List
3
3
  from uuid import uuid4
4
4
 
5
- from peewee import CharField, DateTimeField, Model, SqliteDatabase
5
+ from peewee import BooleanField, CharField, DateTimeField, Model, SqliteDatabase
6
6
  from playhouse.sqlite_ext import JSONField
7
7
 
8
8
  from eval_protocol.event_bus.logger import logger
@@ -25,7 +25,7 @@ class SqliteEventBusDatabase:
25
25
  data = JSONField()
26
26
  timestamp = DateTimeField()
27
27
  process_id = CharField()
28
- processed = CharField(default="false") # Track if event has been processed
28
+ processed = BooleanField(default=False) # Track if event has been processed
29
29
 
30
30
  self._Event = Event
31
31
  self._db.connect()
@@ -46,7 +46,7 @@ class SqliteEventBusDatabase:
46
46
  data=serialized_data,
47
47
  timestamp=time.time(),
48
48
  process_id=process_id,
49
- processed="false",
49
+ processed=False,
50
50
  )
51
51
  except Exception as e:
52
52
  logger.warning(f"Failed to publish event to database: {e}")
@@ -56,7 +56,7 @@ class SqliteEventBusDatabase:
56
56
  try:
57
57
  query = (
58
58
  self._Event.select()
59
- .where((self._Event.process_id != process_id) & (self._Event.processed == "false"))
59
+ .where((self._Event.process_id != process_id) & (~self._Event.processed))
60
60
  .order_by(self._Event.timestamp)
61
61
  )
62
62
 
@@ -80,7 +80,7 @@ class SqliteEventBusDatabase:
80
80
  def mark_event_processed(self, event_id: str) -> None:
81
81
  """Mark an event as processed."""
82
82
  try:
83
- self._Event.update(processed="true").where(self._Event.event_id == event_id).execute()
83
+ self._Event.update(processed=True).where(self._Event.event_id == event_id).execute()
84
84
  except Exception as e:
85
85
  logger.debug(f"Failed to mark event as processed: {e}")
86
86
 
@@ -88,8 +88,6 @@ class SqliteEventBusDatabase:
88
88
  """Clean up old processed events."""
89
89
  try:
90
90
  cutoff_time = time.time() - (max_age_hours * 3600)
91
- self._Event.delete().where(
92
- (self._Event.processed == "true") & (self._Event.timestamp < cutoff_time)
93
- ).execute()
91
+ self._Event.delete().where((self._Event.processed) & (self._Event.timestamp < cutoff_time)).execute()
94
92
  except Exception as e:
95
93
  logger.debug(f"Failed to cleanup old events: {e}")
@@ -100,6 +100,25 @@ class ElasticsearchClient:
100
100
  except Exception:
101
101
  return False
102
102
 
103
+ def clear_index(self) -> bool:
104
+ """Clear all documents from the index.
105
+
106
+ Returns:
107
+ bool: True if successful, False otherwise
108
+ """
109
+ try:
110
+ # Delete all documents by query
111
+ response = self._make_request(
112
+ "POST", f"{self.index_url}/_delete_by_query", json_data={"query": {"match_all": {}}}
113
+ )
114
+ if response.status_code == 200:
115
+ # Refresh the index to ensure changes are visible
116
+ refresh_response = self._make_request("POST", f"{self.index_url}/_refresh")
117
+ return refresh_response.status_code == 200
118
+ return False
119
+ except Exception:
120
+ return False
121
+
103
122
  def get_mapping(self) -> Optional[Dict[str, Any]]:
104
123
  """Get the index mapping.
105
124
 
@@ -2,7 +2,7 @@ import logging
2
2
  import os
3
3
  from concurrent.futures import ThreadPoolExecutor
4
4
  from typing import Optional, Any, Dict
5
- from datetime import datetime
5
+ from datetime import datetime, timezone
6
6
 
7
7
  from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
8
8
  from .elasticsearch_client import ElasticsearchClient
@@ -36,8 +36,8 @@ class ElasticsearchDirectHttpHandler(logging.Handler):
36
36
  def emit(self, record: logging.LogRecord) -> None:
37
37
  """Emit a log record by scheduling it for async transmission."""
38
38
  try:
39
- # Create proper ISO 8601 timestamp
40
- timestamp = datetime.fromtimestamp(record.created).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
39
+ # Create proper ISO 8601 timestamp in UTC
40
+ timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
41
41
 
42
42
  rollout_id = self._get_rollout_id(record)
43
43
  logger.debug(f"Emitting log record: {record.getMessage()} with rollout_id: {rollout_id}")
@@ -0,0 +1,22 @@
1
+ import os
2
+ from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
3
+ from .elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
4
+
5
+
6
+ def setup_rollout_logging_for_elasticsearch_handler(
7
+ handler: ElasticsearchDirectHttpHandler, rollout_id: str, elastic_search_config: ElasticsearchConfig
8
+ ) -> None:
9
+ """
10
+ Whenever a new subprocess is created, we need to set up the rollout context
11
+ for the subprocess. This is useful when implementing your own remote server
12
+ for rollout processing.
13
+
14
+ 1. Set the EP_ROLLOUT_ID environment variable
15
+ 2. Configure the Elasticsearch handler with the Elasticsearch config
16
+ """
17
+
18
+ # this should only affect this subprocess so logs from this subprocess can
19
+ # be correlated to the rollout
20
+ os.environ["EP_ROLLOUT_ID"] = rollout_id
21
+
22
+ handler.configure(elasticsearch_config=elastic_search_config)
@@ -70,7 +70,7 @@ def _default_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader:
70
70
  def fetch_traces() -> List[EvaluationRow]:
71
71
  base_url = config.model_base_url or "https://tracing.fireworks.ai"
72
72
  adapter = FireworksTracingAdapter(base_url=base_url)
73
- return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"], proxy_max_retries=5)
73
+ return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"], max_retries=5)
74
74
 
75
75
  return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=filter_longest_conversation)
76
76