eval-protocol 0.2.43__tar.gz → 0.2.44__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (415)
  1. {eval_protocol-0.2.43/eval_protocol.egg-info → eval_protocol-0.2.44}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli.py +1 -0
  4. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/logs.py +2 -1
  5. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +8 -1
  6. eval_protocol-0.2.44/eval_protocol/event_bus/sqlite_event_bus.py +126 -0
  7. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/sqlite_event_bus_database.py +6 -8
  8. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/elasticsearch_client.py +19 -0
  9. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +3 -3
  10. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/logs_server.py +126 -22
  11. {eval_protocol-0.2.43 → eval_protocol-0.2.44/eval_protocol.egg-info}/PKG-INFO +1 -1
  12. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/SOURCES.txt +1 -0
  13. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_event_bus.py +74 -38
  14. eval_protocol-0.2.44/tests/test_event_bus_helper.py +74 -0
  15. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_logs_server.py +2 -2
  16. eval_protocol-0.2.43/eval_protocol/event_bus/sqlite_event_bus.py +0 -109
  17. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/LICENSE +0 -0
  18. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/README.md +0 -0
  19. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/__init__.py +0 -0
  20. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/normalize_sandbox_fusion.py +0 -0
  21. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/utils/__init__.py +0 -0
  22. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/utils/generate_api_key.py +0 -0
  23. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/development/utils/subprocess_manager.py +0 -0
  24. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/__init__.py +0 -0
  25. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/__main__.py +0 -0
  26. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/__init__.py +0 -0
  27. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/base.py +0 -0
  28. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/bigquery.py +0 -0
  29. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/braintrust.py +0 -0
  30. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  31. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/huggingface.py +0 -0
  32. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/langchain.py +0 -0
  33. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/langfuse.py +0 -0
  34. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/langsmith.py +0 -0
  35. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/openai_responses.py +0 -0
  36. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/trl.py +0 -0
  37. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/adapters/utils.py +0 -0
  38. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/__init__.py +0 -0
  39. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/models.py +0 -0
  40. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/orchestrator.py +0 -0
  41. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resource_abc.py +0 -0
  42. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resource_pool.py +0 -0
  43. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/__init__.py +0 -0
  44. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  45. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  46. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  47. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  48. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  49. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/docker_resource.py +0 -0
  50. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  51. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  52. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/resources/sql_resource.py +0 -0
  53. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/task_manager.py +0 -0
  54. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/agent/tool_registry.py +0 -0
  55. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/auth.py +0 -0
  56. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/__init__.py +0 -0
  57. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  58. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  59. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_aime25.py +0 -0
  60. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  61. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  62. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  63. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  64. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  65. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/__init__.py +0 -0
  66. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  67. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/common.py +0 -0
  68. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/deploy.py +0 -0
  69. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  70. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/preview.py +0 -0
  71. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  72. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/cli_commands/upload.py +0 -0
  73. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/common_utils.py +0 -0
  74. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/config.py +0 -0
  75. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/__init__.py +0 -0
  76. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  77. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  78. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  79. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/data_loader/models.py +0 -0
  80. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/__init__.py +0 -0
  81. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  82. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  83. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  84. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/datasets/__init__.py +0 -0
  85. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/datasets/loader.py +0 -0
  86. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/directory_utils.py +0 -0
  87. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/evaluation.py +0 -0
  88. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/__init__.py +0 -0
  89. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/event_bus.py +0 -0
  90. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/event_bus/logger.py +0 -0
  91. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/execution/__init__.py +0 -0
  92. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/execution/pipeline.py +0 -0
  93. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/gcp_tools.py +0 -0
  94. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generation/cache.py +0 -0
  95. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generation/clients/base.py +0 -0
  96. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generation/clients.py +0 -0
  97. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/generic_server.py +0 -0
  98. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/get_pep440_version.py +0 -0
  99. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/human_id/__init__.py +0 -0
  100. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/human_id/dictionary.py +0 -0
  101. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/__init__.py +0 -0
  102. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/deepeval.py +0 -0
  103. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/openeval.py +0 -0
  104. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/integrations/trl.py +0 -0
  105. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/__init__.py +0 -0
  106. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  107. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  108. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/logging_utils.py +0 -0
  109. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/__init__.py +0 -0
  110. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/adapter.py +0 -0
  111. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/client/__init__.py +0 -0
  112. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/client/connection.py +0 -0
  113. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/clients.py +0 -0
  114. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/__init__.py +0 -0
  115. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/base_policy.py +0 -0
  116. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/manager.py +0 -0
  117. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/execution/policy.py +0 -0
  118. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/grid_renderer.py +0 -0
  119. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  120. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/mcpgym.py +0 -0
  121. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/process_manager.py +0 -0
  122. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/session/__init__.py +0 -0
  123. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/session/manager.py +0 -0
  124. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/simple_process_manager.py +0 -0
  125. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp/simulation_server.py +0 -0
  126. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/__init__.py +0 -0
  127. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/config.py +0 -0
  128. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/main.py +0 -0
  129. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  130. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  131. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  132. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  133. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_env.py +0 -0
  134. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/__init__.py +0 -0
  135. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  136. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  137. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  138. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  139. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  140. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  141. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  142. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  143. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  144. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  145. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  146. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  147. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  148. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  149. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/models.py +0 -0
  150. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/packaging.py +0 -0
  151. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/platform_api.py +0 -0
  152. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/playback_policy.py +0 -0
  153. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/__init__.py +0 -0
  154. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  155. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  156. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  157. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  158. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  159. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  160. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  161. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  162. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  163. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/evaluation_test.py +0 -0
  164. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  165. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/exception_config.py +0 -0
  166. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/execution.py +0 -0
  167. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  168. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  169. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/parameterize.py +0 -0
  170. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/plugin.py +0 -0
  171. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  172. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/rollout_processor.py +0 -0
  173. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/store_experiment_link.py +0 -0
  174. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/store_results_url.py +0 -0
  175. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/types.py +0 -0
  176. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/utils.py +0 -0
  177. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/pytest/validate_signature.py +0 -0
  178. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/__init__.py +0 -0
  179. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge.py +0 -0
  180. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  181. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
  182. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
  183. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
  184. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/quickstart/utils.py +0 -0
  185. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/resources.py +0 -0
  186. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/reward_function.py +0 -0
  187. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/__init__.py +0 -0
  188. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/accuracy.py +0 -0
  189. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/accuracy_length.py +0 -0
  190. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  191. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  192. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/apps_testing_util.py +0 -0
  193. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/bfcl_reward.py +0 -0
  194. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/code_execution.py +0 -0
  195. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/code_execution_utils.py +0 -0
  196. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/cpp_code.py +0 -0
  197. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  198. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/format.py +0 -0
  199. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/function_calling.py +0 -0
  200. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/json_schema.py +0 -0
  201. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/language_consistency.py +0 -0
  202. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/lean_prover.py +0 -0
  203. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/length.py +0 -0
  204. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  205. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/math.py +0 -0
  206. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  207. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/reasoning_steps.py +0 -0
  208. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/repetition.py +0 -0
  209. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rewards/tag_count.py +0 -0
  210. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/rl_processing.py +0 -0
  211. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/server.py +0 -0
  212. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/stats/__init__.py +0 -0
  213. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/stats/confidence_intervals.py +0 -0
  214. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/typed_interface.py +0 -0
  215. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/__init__.py +0 -0
  216. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/errors.py +0 -0
  217. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/remote_rollout_processor.py +0 -0
  218. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/types/types.py +0 -0
  219. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/__init__.py +0 -0
  220. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/batch_evaluation.py +0 -0
  221. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/batch_transformation.py +0 -0
  222. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/check_server_status.py +0 -0
  223. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/dataset_helpers.py +0 -0
  224. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/logs_models.py +0 -0
  225. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/module_loader.py +0 -0
  226. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/packaging_utils.py +0 -0
  227. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/show_results_url.py +0 -0
  228. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/static_policy.py +0 -0
  229. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/subprocess_utils.py +0 -0
  230. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol/utils/vite_server.py +0 -0
  231. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/dependency_links.txt +0 -0
  232. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/entry_points.txt +0 -0
  233. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/requires.txt +0 -0
  234. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/eval_protocol.egg-info/top_level.txt +0 -0
  235. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/pyproject.toml +0 -0
  236. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/setup.cfg +0 -0
  237. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/setup.py +0 -0
  238. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_accuracy.py +0 -0
  239. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_accuracy_length.py +0 -0
  240. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_adapters_e2e.py +0 -0
  241. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_agent_orchestrator.py +0 -0
  242. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_agent_resources.py +0 -0
  243. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_auth.py +0 -0
  244. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_batch_evaluation.py +0 -0
  245. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cli.py +0 -0
  246. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cli_agent.py +0 -0
  247. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cli_args.py +0 -0
  248. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_code_execution.py +0 -0
  249. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_config.py +0 -0
  250. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_control_plane_separation.py +0 -0
  251. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_cpp_code.py +0 -0
  252. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_data_driven_task_manager.py +0 -0
  253. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_deepcoder_reward.py +0 -0
  254. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_deepeval_integration.py +0 -0
  255. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_deploy_integration.py +0 -0
  256. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_directory_utils.py +0 -0
  257. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_e2b_integration.py +0 -0
  258. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_e2b_js_integration.py +0 -0
  259. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_edge_cases.py +0 -0
  260. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_eval_protocol_import.py +0 -0
  261. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation.py +0 -0
  262. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation_integration.py +0 -0
  263. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation_postprocess.py +0 -0
  264. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_evaluation_preview_integration.py +0 -0
  265. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_examples_end_to_end.py +0 -0
  266. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_fireworks_api.py +0 -0
  267. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_format.py +0 -0
  268. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_fractional_code.py +0 -0
  269. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_function_calling.py +0 -0
  270. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_gcp_tools.py +0 -0
  271. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_generic_server.py +0 -0
  272. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_human_id.py +0 -0
  273. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_integration.py +0 -0
  274. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_json_schema.py +0 -0
  275. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_kwargs_validation.py +0 -0
  276. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_language_consistency.py +0 -0
  277. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_lean_prover.py +0 -0
  278. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_lean_prover_runner.py +0 -0
  279. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_length.py +0 -0
  280. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_list_comparison_math_reward.py +0 -0
  281. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_logs_server_simple.py +0 -0
  282. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_math.py +0 -0
  283. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_minimal.py +0 -0
  284. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_models.py +0 -0
  285. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_models_rl.py +0 -0
  286. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_multiple_choice_math_reward.py +0 -0
  287. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_n_variant_batch_integration.py +0 -0
  288. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_n_variant_integration.py +0 -0
  289. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_openai_compatibility.py +0 -0
  290. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_openeval_integration.py +0 -0
  291. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_packaging.py +0 -0
  292. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_parallel_rollouts.py +0 -0
  293. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_platform_api.py +0 -0
  294. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_quickstart_utils.py +0 -0
  295. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_readiness.py +0 -0
  296. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_reasoning_steps.py +0 -0
  297. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_repetition.py +0 -0
  298. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_repetition_debug.py +0 -0
  299. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_retry_mechanism.py +0 -0
  300. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_reward_function.py +0 -0
  301. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_reward_protocol_import.py +0 -0
  302. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_rl_processing.py +0 -0
  303. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_rollout_control_plane_integration.py +0 -0
  304. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_server.py +0 -0
  305. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_show_results_url.py +0 -0
  306. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_status_migration_changes.py +0 -0
  307. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_status_migration_integration.py +0 -0
  308. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_status_model.py +0 -0
  309. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_tag_count.py +0 -0
  310. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_tau_bench_airline_smoke.py +0 -0
  311. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_typed_interface.py +0 -0
  312. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_typed_interface_rl.py +0 -0
  313. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_upload_entrypoint.py +0 -0
  314. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_url_handling.py +0 -0
  315. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/tests/test_vite_server.py +0 -0
  316. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/__init__.py +0 -0
  317. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/agent/__init__.py +0 -0
  318. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/agent/base.py +0 -0
  319. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/agent/llm_agent.py +0 -0
  320. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/__init__.py +0 -0
  321. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/api_config.py +0 -0
  322. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/data_model.py +0 -0
  323. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/api_service/simulation_service.py +0 -0
  324. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/cli.py +0 -0
  325. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/config.py +0 -0
  326. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/airline/policy.md +0 -0
  327. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/mock/policy.md +0 -0
  328. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  329. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/retail/policy.md +0 -0
  330. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  331. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  332. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  333. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  334. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  335. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  336. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  337. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/__init__.py +0 -0
  338. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/message.py +0 -0
  339. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/simulation.py +0 -0
  340. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/data_model/tasks.py +0 -0
  341. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/__init__.py +0 -0
  342. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/__init__.py +0 -0
  343. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/data_model.py +0 -0
  344. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/environment.py +0 -0
  345. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/tools.py +0 -0
  346. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/airline/utils.py +0 -0
  347. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/__init__.py +0 -0
  348. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/data_model.py +0 -0
  349. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/environment.py +0 -0
  350. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/tools.py +0 -0
  351. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/mock/utils.py +0 -0
  352. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/__init__.py +0 -0
  353. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/data_model.py +0 -0
  354. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/environment.py +0 -0
  355. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/tools.py +0 -0
  356. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/retail/utils.py +0 -0
  357. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/__init__.py +0 -0
  358. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/data_model.py +0 -0
  359. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/environment.py +0 -0
  360. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  361. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  362. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  363. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  364. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  365. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  366. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  367. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  368. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/tools.py +0 -0
  369. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  370. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  371. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/domains/telecom/utils.py +0 -0
  372. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/__init__.py +0 -0
  373. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/db.py +0 -0
  374. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/environment.py +0 -0
  375. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/server.py +0 -0
  376. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/tool.py +0 -0
  377. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/toolkit.py +0 -0
  378. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  379. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/__init__.py +0 -0
  380. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator.py +0 -0
  381. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  382. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  383. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  384. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  385. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  386. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/metrics/__init__.py +0 -0
  387. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/metrics/agent_metrics.py +0 -0
  388. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  389. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/__init__.py +0 -0
  390. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  391. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  392. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/orchestrator/utils.py +0 -0
  393. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/registry.py +0 -0
  394. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/run.py +0 -0
  395. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/__init__.py +0 -0
  396. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/check_data.py +0 -0
  397. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  398. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/start_servers.py +0 -0
  399. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/scripts/view_simulations.py +0 -0
  400. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/user/__init__.py +0 -0
  401. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/user/base.py +0 -0
  402. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/user/user_simulator.py +0 -0
  403. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/__init__.py +0 -0
  404. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/display.py +0 -0
  405. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/io_utils.py +0 -0
  406. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/llm_utils.py +0 -0
  407. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/pydantic_utils.py +0 -0
  408. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vendor/tau2/utils/utils.py +0 -0
  409. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/versioneer.py +0 -0
  410. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  411. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/index-C81y9r9l.js +0 -0
  412. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/index-C81y9r9l.js.map +0 -0
  413. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/index-DpYZaoAr.css +0 -0
  414. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  415. {eval_protocol-0.2.43 → eval_protocol-0.2.44}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.43
3
+ Version: 0.2.44
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-10-08T08:52:41-0700",
11
+ "date": "2025-10-08T11:55:20-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "535169e7193e6500d8d323e7dbc31c14dca98b96",
15
- "version": "0.2.43"
14
+ "full-revisionid": "e5883aeb569de1af057de3eae81aaf7790f468f1",
15
+ "version": "0.2.44"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -300,6 +300,7 @@ def parse_args(args=None):
300
300
  # Logs command
301
301
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
302
302
  logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
303
+ logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
303
304
 
304
305
  # Upload command
305
306
  upload_parser = subparsers.add_parser(
@@ -16,6 +16,7 @@ def logs_command(args):
16
16
  print(f"🌐 URL: http://localhost:{port}")
17
17
  print(f"🔌 WebSocket: ws://localhost:{port}/ws")
18
18
  print(f"👀 Watching paths: {['current directory']}")
19
+ print(f"🔍 Debug mode: {args.debug}")
19
20
  print("Press Ctrl+C to stop the server")
20
21
  print("-" * 50)
21
22
 
@@ -25,7 +26,7 @@ def logs_command(args):
25
26
  elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
26
27
 
27
28
  try:
28
- serve_logs(port=args.port, elasticsearch_config=elasticsearch_config)
29
+ serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
29
30
  return 0
30
31
  except KeyboardInterrupt:
31
32
  print("\n🛑 Server stopped by user")
@@ -23,12 +23,19 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
23
23
 
24
24
  def log(self, row: "EvaluationRow") -> None:
25
25
  data = row.model_dump(exclude_none=True, mode="json")
26
+ rollout_id = data.get("execution_metadata", {}).get("rollout_id", "unknown")
27
+ logger.debug(f"[EVENT_BUS_EMIT] Starting to log row with rollout_id: {rollout_id}")
28
+
26
29
  self._store.upsert_row(data=data)
30
+ logger.debug(f"[EVENT_BUS_EMIT] Successfully stored row in database for rollout_id: {rollout_id}")
31
+
27
32
  try:
33
+ logger.debug(f"[EVENT_BUS_EMIT] Emitting event '{LOG_EVENT_TYPE}' for rollout_id: {rollout_id}")
28
34
  event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
35
+ logger.debug(f"[EVENT_BUS_EMIT] Successfully emitted event for rollout_id: {rollout_id}")
29
36
  except Exception as e:
30
37
  # Avoid breaking storage due to event emission issues
31
- logger.error(f"Failed to emit row_upserted event: {e}")
38
+ logger.error(f"[EVENT_BUS_EMIT] Failed to emit row_upserted event for rollout_id {rollout_id}: {e}")
32
39
  pass
33
40
 
34
41
  def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
@@ -0,0 +1,126 @@
1
+ import asyncio
2
+ import os
3
+ import threading
4
+ import time
5
+ from typing import Any, Optional
6
+ from uuid import uuid4
7
+
8
+ from eval_protocol.event_bus.event_bus import EventBus
9
+ from eval_protocol.event_bus.logger import logger
10
+ from eval_protocol.event_bus.sqlite_event_bus_database import SqliteEventBusDatabase
11
+
12
+
13
+ class SqliteEventBus(EventBus):
14
+ """SQLite-based event bus implementation that supports cross-process communication."""
15
+
16
+ def __init__(self, db_path: Optional[str] = None):
17
+ super().__init__()
18
+
19
+ # Use the same database as the evaluation row store
20
+ if db_path is None:
21
+ from eval_protocol.directory_utils import find_eval_protocol_dir
22
+
23
+ eval_protocol_dir = find_eval_protocol_dir()
24
+ db_path = os.path.join(eval_protocol_dir, "logs.db")
25
+
26
+ self._db: SqliteEventBusDatabase = SqliteEventBusDatabase(db_path)
27
+ self._running = False
28
+ self._process_id = str(os.getpid())
29
+
30
+ def emit(self, event_type: str, data: Any) -> None:
31
+ """Emit an event to all subscribers.
32
+
33
+ Args:
34
+ event_type: Type of event (e.g., "log")
35
+ data: Event data
36
+ """
37
+ logger.debug(f"[CROSS_PROCESS_EMIT] Emitting event type: {event_type}")
38
+
39
+ # Call local listeners immediately
40
+ logger.debug(f"[CROSS_PROCESS_EMIT] Calling {len(self._listeners)} local listeners")
41
+ super().emit(event_type, data)
42
+ logger.debug("[CROSS_PROCESS_EMIT] Completed local listener calls")
43
+
44
+ # Publish to cross-process subscribers
45
+ logger.debug("[CROSS_PROCESS_EMIT] Publishing to cross-process subscribers")
46
+ self._publish_cross_process(event_type, data)
47
+ logger.debug("[CROSS_PROCESS_EMIT] Completed cross-process publish")
48
+
49
+ def _publish_cross_process(self, event_type: str, data: Any) -> None:
50
+ """Publish event to cross-process subscribers via database."""
51
+ logger.debug(f"[CROSS_PROCESS_PUBLISH] Publishing event {event_type} to database")
52
+ try:
53
+ self._db.publish_event(event_type, data, self._process_id)
54
+ logger.debug(f"[CROSS_PROCESS_PUBLISH] Successfully published event {event_type} to database")
55
+ except Exception as e:
56
+ logger.error(f"[CROSS_PROCESS_PUBLISH] Failed to publish event {event_type} to database: {e}")
57
+
58
+ def start_listening(self) -> None:
59
+ """Start listening for cross-process events."""
60
+ if self._running:
61
+ logger.debug("[CROSS_PROCESS_LISTEN] Already listening, skipping start")
62
+ return
63
+
64
+ logger.debug("[CROSS_PROCESS_LISTEN] Starting cross-process event listening")
65
+ self._running = True
66
+ loop = asyncio.get_running_loop()
67
+ loop.create_task(self._database_listener_task())
68
+ logger.debug("[CROSS_PROCESS_LISTEN] Started async database listener task")
69
+
70
+ def stop_listening(self) -> None:
71
+ """Stop listening for cross-process events."""
72
+ logger.debug("[CROSS_PROCESS_LISTEN] Stopping cross-process event listening")
73
+ self._running = False
74
+
75
+ async def _database_listener_task(self) -> None:
76
+ """Single database listener task that processes events and recreates itself."""
77
+ if not self._running:
78
+ # this should end the task loop
79
+ logger.debug("[CROSS_PROCESS_LISTENER] Stopping database listener task")
80
+ return
81
+
82
+ # Get unprocessed events from other processes
83
+ events = self._db.get_unprocessed_events(str(self._process_id))
84
+ if events:
85
+ logger.debug(f"[CROSS_PROCESS_LISTENER] Found {len(events)} unprocessed events")
86
+ else:
87
+ logger.debug(f"[CROSS_PROCESS_LISTENER] No unprocessed events found for process {self._process_id}")
88
+
89
+ for event in events:
90
+ logger.debug(
91
+ f"[CROSS_PROCESS_LISTENER] Processing event {event['event_id']} of type {event['event_type']}"
92
+ )
93
+ # Handle the event
94
+ self._handle_cross_process_event(event["event_type"], event["data"])
95
+ logger.debug(f"[CROSS_PROCESS_LISTENER] Successfully processed event {event['event_id']}")
96
+
97
+ # Mark as processed
98
+ self._db.mark_event_processed(event["event_id"])
99
+ logger.debug(f"[CROSS_PROCESS_LISTENER] Marked event {event['event_id']} as processed")
100
+
101
+ # Clean up old events every hour
102
+ current_time = time.time()
103
+ if not hasattr(self, "_last_cleanup"):
104
+ self._last_cleanup = current_time
105
+ elif current_time - self._last_cleanup >= 3600:
106
+ logger.debug("[CROSS_PROCESS_LISTENER] Cleaning up old events")
107
+ self._db.cleanup_old_events()
108
+ self._last_cleanup = current_time
109
+
110
+ # Schedule the next task if still running
111
+ await asyncio.sleep(1.0)
112
+ loop = asyncio.get_running_loop()
113
+ loop.create_task(self._database_listener_task())
114
+
115
+ def _handle_cross_process_event(self, event_type: str, data: Any) -> None:
116
+ """Handle events received from other processes."""
117
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Handling cross-process event type: {event_type}")
118
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Calling {len(self._listeners)} listeners")
119
+
120
+ for i, listener in enumerate(self._listeners):
121
+ try:
122
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Calling listener {i}")
123
+ listener(event_type, data)
124
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Successfully called listener {i}")
125
+ except Exception as e:
126
+ logger.debug(f"[CROSS_PROCESS_HANDLE] Cross-process event listener {i} failed for {event_type}: {e}")
@@ -2,7 +2,7 @@ import time
2
2
  from typing import Any, List
3
3
  from uuid import uuid4
4
4
 
5
- from peewee import CharField, DateTimeField, Model, SqliteDatabase
5
+ from peewee import BooleanField, CharField, DateTimeField, Model, SqliteDatabase
6
6
  from playhouse.sqlite_ext import JSONField
7
7
 
8
8
  from eval_protocol.event_bus.logger import logger
@@ -25,7 +25,7 @@ class SqliteEventBusDatabase:
25
25
  data = JSONField()
26
26
  timestamp = DateTimeField()
27
27
  process_id = CharField()
28
- processed = CharField(default="false") # Track if event has been processed
28
+ processed = BooleanField(default=False) # Track if event has been processed
29
29
 
30
30
  self._Event = Event
31
31
  self._db.connect()
@@ -46,7 +46,7 @@ class SqliteEventBusDatabase:
46
46
  data=serialized_data,
47
47
  timestamp=time.time(),
48
48
  process_id=process_id,
49
- processed="false",
49
+ processed=False,
50
50
  )
51
51
  except Exception as e:
52
52
  logger.warning(f"Failed to publish event to database: {e}")
@@ -56,7 +56,7 @@ class SqliteEventBusDatabase:
56
56
  try:
57
57
  query = (
58
58
  self._Event.select()
59
- .where((self._Event.process_id != process_id) & (self._Event.processed == "false"))
59
+ .where((self._Event.process_id != process_id) & (~self._Event.processed))
60
60
  .order_by(self._Event.timestamp)
61
61
  )
62
62
 
@@ -80,7 +80,7 @@ class SqliteEventBusDatabase:
80
80
  def mark_event_processed(self, event_id: str) -> None:
81
81
  """Mark an event as processed."""
82
82
  try:
83
- self._Event.update(processed="true").where(self._Event.event_id == event_id).execute()
83
+ self._Event.update(processed=True).where(self._Event.event_id == event_id).execute()
84
84
  except Exception as e:
85
85
  logger.debug(f"Failed to mark event as processed: {e}")
86
86
 
@@ -88,8 +88,6 @@ class SqliteEventBusDatabase:
88
88
  """Clean up old processed events."""
89
89
  try:
90
90
  cutoff_time = time.time() - (max_age_hours * 3600)
91
- self._Event.delete().where(
92
- (self._Event.processed == "true") & (self._Event.timestamp < cutoff_time)
93
- ).execute()
91
+ self._Event.delete().where((self._Event.processed) & (self._Event.timestamp < cutoff_time)).execute()
94
92
  except Exception as e:
95
93
  logger.debug(f"Failed to cleanup old events: {e}")
@@ -100,6 +100,25 @@ class ElasticsearchClient:
100
100
  except Exception:
101
101
  return False
102
102
 
103
+ def clear_index(self) -> bool:
104
+ """Clear all documents from the index.
105
+
106
+ Returns:
107
+ bool: True if successful, False otherwise
108
+ """
109
+ try:
110
+ # Delete all documents by query
111
+ response = self._make_request(
112
+ "POST", f"{self.index_url}/_delete_by_query", json_data={"query": {"match_all": {}}}
113
+ )
114
+ if response.status_code == 200:
115
+ # Refresh the index to ensure changes are visible
116
+ refresh_response = self._make_request("POST", f"{self.index_url}/_refresh")
117
+ return refresh_response.status_code == 200
118
+ return False
119
+ except Exception:
120
+ return False
121
+
103
122
  def get_mapping(self) -> Optional[Dict[str, Any]]:
104
123
  """Get the index mapping.
105
124
 
@@ -2,7 +2,7 @@ import logging
2
2
  import os
3
3
  from concurrent.futures import ThreadPoolExecutor
4
4
  from typing import Optional, Any, Dict
5
- from datetime import datetime
5
+ from datetime import datetime, timezone
6
6
 
7
7
  from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
8
8
  from .elasticsearch_client import ElasticsearchClient
@@ -36,8 +36,8 @@ class ElasticsearchDirectHttpHandler(logging.Handler):
36
36
  def emit(self, record: logging.LogRecord) -> None:
37
37
  """Emit a log record by scheduling it for async transmission."""
38
38
  try:
39
- # Create proper ISO 8601 timestamp
40
- timestamp = datetime.fromtimestamp(record.created).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
39
+ # Create proper ISO 8601 timestamp in UTC
40
+ timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
41
41
 
42
42
  rollout_id = self._get_rollout_id(record)
43
43
  logger.debug(f"Emitting log record: {record.getMessage()} with rollout_id: {rollout_id}")
@@ -30,6 +30,19 @@ if TYPE_CHECKING:
30
30
  logger = logging.getLogger(__name__)
31
31
 
32
32
 
33
+ def enable_debug_mode():
34
+ """Enable debug mode for all relevant loggers in the logs server system."""
35
+ # Set debug level for all relevant loggers
36
+ logger.setLevel(logging.DEBUG)
37
+
38
+ # Set debug level for event bus logger
39
+ from eval_protocol.event_bus.logger import logger as event_bus_logger
40
+
41
+ event_bus_logger.setLevel(logging.DEBUG)
42
+
43
+ print("Debug mode enabled for all relevant loggers")
44
+
45
+
33
46
  class WebSocketManager:
34
47
  """Manages WebSocket connections and broadcasts messages."""
35
48
 
@@ -40,100 +53,152 @@ class WebSocketManager:
40
53
  self._lock = threading.Lock()
41
54
 
42
55
  async def connect(self, websocket: WebSocket):
56
+ logger.debug("[WEBSOCKET_CONNECT] New websocket connection attempt")
43
57
  await websocket.accept()
44
58
  with self._lock:
45
59
  self.active_connections.append(websocket)
46
60
  connection_count = len(self.active_connections)
47
- logger.info(f"WebSocket connected. Total connections: {connection_count}")
61
+ logger.info(f"[WEBSOCKET_CONNECT] WebSocket connected. Total connections: {connection_count}")
62
+
63
+ logger.debug("[WEBSOCKET_CONNECT] Reading logs for initialization")
48
64
  logs = default_logger.read()
65
+ logger.debug(f"[WEBSOCKET_CONNECT] Found {len(logs)} logs to send")
66
+
49
67
  data = {
50
68
  "type": "initialize_logs",
51
69
  "logs": [log.model_dump(exclude_none=True, mode="json") for log in logs],
52
70
  }
71
+ logger.debug("[WEBSOCKET_CONNECT] Sending initialization data")
53
72
  await websocket.send_text(json.dumps(data))
73
+ logger.debug("[WEBSOCKET_CONNECT] Successfully sent initialization data")
54
74
 
55
75
  def disconnect(self, websocket: WebSocket):
76
+ logger.debug("[WEBSOCKET_DISCONNECT] WebSocket disconnection")
56
77
  with self._lock:
57
78
  if websocket in self.active_connections:
58
79
  self.active_connections.remove(websocket)
80
+ logger.debug("[WEBSOCKET_DISCONNECT] Removed websocket from active connections")
81
+ else:
82
+ logger.debug("[WEBSOCKET_DISCONNECT] Websocket was not in active connections")
59
83
  connection_count = len(self.active_connections)
60
- logger.info(f"WebSocket disconnected. Total connections: {connection_count}")
84
+ logger.info(f"[WEBSOCKET_DISCONNECT] WebSocket disconnected. Total connections: {connection_count}")
61
85
 
62
86
  def broadcast_row_upserted(self, row: "EvaluationRow"):
63
87
  """Broadcast a row-upsert event to all connected clients.
64
88
 
65
89
  Safe no-op if server loop is not running or there are no connections.
66
90
  """
91
+ rollout_id = row.execution_metadata.rollout_id if row.execution_metadata else "unknown"
92
+ logger.debug(f"[WEBSOCKET_BROADCAST] Starting broadcast for rollout_id: {rollout_id}")
93
+
94
+ with self._lock:
95
+ active_connections_count = len(self.active_connections)
96
+ logger.debug(f"[WEBSOCKET_BROADCAST] Active connections: {active_connections_count}")
97
+
67
98
  try:
68
99
  # Serialize pydantic model
100
+ logger.debug(f"[WEBSOCKET_BROADCAST] Serializing row for rollout_id: {rollout_id}")
69
101
  json_message = json.dumps({"type": "log", "row": row.model_dump(exclude_none=True, mode="json")})
102
+ logger.debug(
103
+ f"[WEBSOCKET_BROADCAST] Successfully serialized message (length: {len(json_message)}) for rollout_id: {rollout_id}"
104
+ )
105
+
70
106
  # Queue the message for broadcasting in the main event loop
107
+ logger.debug(f"[WEBSOCKET_BROADCAST] Queuing message for broadcast for rollout_id: {rollout_id}")
71
108
  self._broadcast_queue.put(json_message)
109
+ logger.debug(f"[WEBSOCKET_BROADCAST] Successfully queued message for rollout_id: {rollout_id}")
72
110
  except Exception as e:
73
- logger.error(f"Failed to serialize row for broadcast: {e}")
111
+ logger.error(
112
+ f"[WEBSOCKET_BROADCAST] Failed to serialize row for broadcast for rollout_id {rollout_id}: {e}"
113
+ )
74
114
 
75
115
  async def _start_broadcast_loop(self):
76
116
  """Start the broadcast loop that processes queued messages."""
117
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Starting broadcast loop")
77
118
  while True:
78
119
  try:
79
120
  # Wait for a message to be queued
121
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Waiting for message from queue")
80
122
  message_data = await asyncio.get_event_loop().run_in_executor(None, self._broadcast_queue.get)
123
+ logger.debug(
124
+ f"[WEBSOCKET_BROADCAST_LOOP] Retrieved message from queue (length: {len(str(message_data))})"
125
+ )
81
126
 
82
127
  # Regular string message for all connections
128
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Sending message to all connections")
83
129
  await self._send_text_to_all_connections(str(message_data))
130
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Successfully sent message to all connections")
84
131
 
85
132
  except Exception as e:
86
- logger.error(f"Error in broadcast loop: {e}")
133
+ logger.error(f"[WEBSOCKET_BROADCAST_LOOP] Error in broadcast loop: {e}")
87
134
  await asyncio.sleep(0.1)
88
135
  except asyncio.CancelledError:
89
- logger.info("Broadcast loop cancelled")
136
+ logger.info("[WEBSOCKET_BROADCAST_LOOP] Broadcast loop cancelled")
90
137
  break
91
138
 
92
139
  async def _send_text_to_all_connections(self, text: str):
93
140
  with self._lock:
94
141
  connections = list(self.active_connections)
95
142
 
143
+ logger.debug(f"[WEBSOCKET_SEND] Attempting to send to {len(connections)} connections")
144
+
96
145
  if not connections:
146
+ logger.debug("[WEBSOCKET_SEND] No connections available, skipping send")
97
147
  return
98
148
 
99
149
  tasks = []
100
150
  failed_connections = []
101
151
 
102
- for connection in connections:
152
+ for i, connection in enumerate(connections):
103
153
  try:
154
+ logger.debug(f"[WEBSOCKET_SEND] Preparing to send to connection {i}")
104
155
  tasks.append(connection.send_text(text))
105
156
  except Exception as e:
106
- logger.error(f"Failed to send text to WebSocket: {e}")
157
+ logger.error(f"[WEBSOCKET_SEND] Failed to prepare send to WebSocket {i}: {e}")
107
158
  failed_connections.append(connection)
108
159
 
109
160
  # Execute all sends in parallel
110
161
  if tasks:
162
+ logger.debug(f"[WEBSOCKET_SEND] Executing {len(tasks)} parallel sends")
111
163
  results = await asyncio.gather(*tasks, return_exceptions=True)
164
+ logger.debug("[WEBSOCKET_SEND] Completed parallel sends")
112
165
 
113
166
  # Check for any exceptions that occurred during execution
114
167
  for i, result in enumerate(results):
115
168
  if isinstance(result, Exception):
116
- logger.error(f"Failed to send text to WebSocket: {result}")
169
+ logger.error(f"[WEBSOCKET_SEND] Failed to send text to WebSocket {i}: {result}")
117
170
  failed_connections.append(connections[i])
171
+ else:
172
+ logger.debug(f"[WEBSOCKET_SEND] Successfully sent to connection {i}")
118
173
 
119
174
  # Remove all failed connections
120
- with self._lock:
121
- for connection in failed_connections:
122
- try:
123
- self.active_connections.remove(connection)
124
- except ValueError:
125
- pass
175
+ if failed_connections:
176
+ logger.debug(f"[WEBSOCKET_SEND] Removing {len(failed_connections)} failed connections")
177
+ with self._lock:
178
+ for connection in failed_connections:
179
+ try:
180
+ self.active_connections.remove(connection)
181
+ except ValueError:
182
+ pass
126
183
 
127
184
  def start_broadcast_loop(self):
128
185
  """Start the broadcast loop in the current event loop."""
129
186
  if self._broadcast_task is None or self._broadcast_task.done():
187
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Creating new broadcast task")
130
188
  self._broadcast_task = asyncio.create_task(self._start_broadcast_loop())
189
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Broadcast task created")
190
+ else:
191
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Broadcast task already running")
131
192
 
132
193
  def stop_broadcast_loop(self):
133
194
  """Stop the broadcast loop."""
134
195
  if self._broadcast_task and not self._broadcast_task.done():
196
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Cancelling broadcast task")
135
197
  self._broadcast_task.cancel()
136
198
  self._broadcast_task = None
199
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] Broadcast task cancelled")
200
+ else:
201
+ logger.debug("[WEBSOCKET_BROADCAST_LOOP] No active broadcast task to stop")
137
202
 
138
203
 
139
204
  class EvaluationWatcher:
@@ -260,7 +325,12 @@ class LogsServer(ViteServer):
260
325
  port: Optional[int] = 8000,
261
326
  index_file: str = "index.html",
262
327
  elasticsearch_config: Optional[ElasticsearchConfig] = None,
328
+ debug: bool = False,
263
329
  ):
330
+ # Enable debug mode if requested
331
+ if debug:
332
+ enable_debug_mode()
333
+
264
334
  # Initialize WebSocket manager
265
335
  self.websocket_manager = WebSocketManager()
266
336
 
@@ -304,9 +374,11 @@ class LogsServer(ViteServer):
304
374
  logger.info(f" {methods} {path}")
305
375
 
306
376
  # Subscribe to events and start listening for cross-process events
377
+ logger.debug("[LOGS_SERVER_INIT] Subscribing to event bus")
307
378
  event_bus.subscribe(self._handle_event)
379
+ logger.debug("[LOGS_SERVER_INIT] Successfully subscribed to event bus")
308
380
 
309
- logger.info(f"LogsServer initialized on {host}:{port}")
381
+ logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {host}:{port}")
310
382
 
311
383
  def _setup_websocket_routes(self):
312
384
  """Set up WebSocket routes for real-time communication."""
@@ -418,17 +490,34 @@ class LogsServer(ViteServer):
418
490
 
419
491
  def _handle_event(self, event_type: str, data: Any) -> None:
420
492
  """Handle events from the event bus."""
493
+ logger.debug(f"[EVENT_BUS_RECEIVE] Received event type: {event_type}")
494
+
421
495
  if event_type in [LOG_EVENT_TYPE]:
422
496
  from eval_protocol.models import EvaluationRow
423
497
 
424
- data = EvaluationRow(**data)
425
- self.websocket_manager.broadcast_row_upserted(data)
498
+ try:
499
+ logger.debug("[EVENT_BUS_RECEIVE] Processing LOG_EVENT_TYPE event")
500
+ data = EvaluationRow(**data)
501
+ rollout_id = data.execution_metadata.rollout_id if data.execution_metadata else "unknown"
502
+ logger.debug(f"[EVENT_BUS_RECEIVE] Successfully parsed EvaluationRow for rollout_id: {rollout_id}")
503
+
504
+ logger.debug("[EVENT_BUS_RECEIVE] Broadcasting row_upserted to websocket manager")
505
+ self.websocket_manager.broadcast_row_upserted(data)
506
+ logger.debug(f"[EVENT_BUS_RECEIVE] Successfully queued broadcast for rollout_id: {rollout_id}")
507
+ except Exception as e:
508
+ logger.error(f"[EVENT_BUS_RECEIVE] Failed to process LOG_EVENT_TYPE event: {e}")
509
+ else:
510
+ logger.debug(f"[EVENT_BUS_RECEIVE] Ignoring event type: {event_type} (not LOG_EVENT_TYPE)")
426
511
 
427
512
  def start_loops(self):
428
513
  """Start the broadcast loop and evaluation watcher."""
514
+ logger.debug("[LOGS_SERVER_LOOPS] Starting all loops")
429
515
  self.websocket_manager.start_broadcast_loop()
516
+ logger.debug("[LOGS_SERVER_LOOPS] Started websocket broadcast loop")
430
517
  self.evaluation_watcher.start()
518
+ logger.debug("[LOGS_SERVER_LOOPS] Started evaluation watcher")
431
519
  event_bus.start_listening()
520
+ logger.debug("[LOGS_SERVER_LOOPS] Started event bus listening")
432
521
 
433
522
  async def run_async(self):
434
523
  """
@@ -477,6 +566,7 @@ def create_app(
477
566
  port: int = 8000,
478
567
  build_dir: Optional[str] = None,
479
568
  elasticsearch_config: Optional[ElasticsearchConfig] = None,
569
+ debug: bool = False,
480
570
  ) -> FastAPI:
481
571
  """
482
572
  Factory function to create a FastAPI app instance and start the server with async loops.
@@ -498,17 +588,21 @@ def create_app(
498
588
  os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "vite-app", "dist")
499
589
  )
500
590
 
501
- server = LogsServer(host=host, port=port, build_dir=build_dir, elasticsearch_config=elasticsearch_config)
591
+ server = LogsServer(
592
+ host=host, port=port, build_dir=build_dir, elasticsearch_config=elasticsearch_config, debug=debug
593
+ )
502
594
  server.start_loops()
503
595
  return server.app
504
596
 
505
597
 
506
598
  # For backward compatibility and direct usage
507
- def serve_logs(port: Optional[int] = None, elasticsearch_config: Optional[ElasticsearchConfig] = None):
599
+ def serve_logs(
600
+ port: Optional[int] = None, elasticsearch_config: Optional[ElasticsearchConfig] = None, debug: bool = False
601
+ ):
508
602
  """
509
603
  Convenience function to create and run a LogsServer.
510
604
  """
511
- server = LogsServer(port=port, elasticsearch_config=elasticsearch_config)
605
+ server = LogsServer(port=port, elasticsearch_config=elasticsearch_config, debug=debug)
512
606
  server.run()
513
607
 
514
608
 
@@ -519,17 +613,27 @@ if __name__ == "__main__":
519
613
  parser.add_argument("--host", default="localhost", help="Host to bind to (default: localhost)")
520
614
  parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
521
615
  parser.add_argument("--build-dir", help="Path to Vite build directory")
616
+ parser.add_argument("--debug", help="Set logger level to DEBUG")
522
617
 
523
618
  args = parser.parse_args()
524
619
 
620
+ if args.debug:
621
+ enable_debug_mode()
622
+
525
623
  elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
526
624
 
527
625
  # Create server with command line arguments
528
626
  if args.build_dir:
529
627
  server = LogsServer(
530
- host=args.host, port=args.port, build_dir=args.build_dir, elasticsearch_config=elasticsearch_config
628
+ host=args.host,
629
+ port=args.port,
630
+ build_dir=args.build_dir,
631
+ elasticsearch_config=elasticsearch_config,
632
+ debug=bool(args.debug),
531
633
  )
532
634
  else:
533
- server = LogsServer(host=args.host, port=args.port, elasticsearch_config=elasticsearch_config)
635
+ server = LogsServer(
636
+ host=args.host, port=args.port, elasticsearch_config=elasticsearch_config, debug=bool(args.debug)
637
+ )
534
638
 
535
639
  server.run()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.43
3
+ Version: 0.2.44
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -265,6 +265,7 @@ tests/test_evaluation_integration.py
265
265
  tests/test_evaluation_postprocess.py
266
266
  tests/test_evaluation_preview_integration.py
267
267
  tests/test_event_bus.py
268
+ tests/test_event_bus_helper.py
268
269
  tests/test_examples_end_to_end.py
269
270
  tests/test_fireworks_api.py
270
271
  tests/test_format.py