eval-protocol 0.2.46.dev3__tar.gz → 0.2.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429):
  1. {eval_protocol-0.2.46.dev3/eval_protocol.egg-info → eval_protocol-0.2.48}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/__init__.py +5 -10
  3. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/fireworks_tracing.py +6 -8
  5. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/openai_responses.py +29 -1
  6. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli.py +6 -0
  7. eval_protocol-0.2.48/eval_protocol/cli_commands/logs.py +54 -0
  8. eval_protocol-0.2.48/eval_protocol/log_utils/fireworks_tracing_http_handler.py +63 -0
  9. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/evaluation_test.py +22 -1
  10. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/remote_rollout_processor.py +22 -3
  11. eval_protocol-0.2.48/eval_protocol/utils/browser_utils.py +114 -0
  12. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/logs_server.py +9 -1
  13. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48/eval_protocol.egg-info}/PKG-INFO +1 -1
  14. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/SOURCES.txt +5 -12
  15. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_show_results_url.py +141 -0
  16. eval_protocol-0.2.48/vite-app/dist/assets/index-34WaHH5W.css +1 -0
  17. eval_protocol-0.2.46.dev3/vite-app/dist/assets/index-C81y9r9l.js → eval_protocol-0.2.48/vite-app/dist/assets/index-DOPsfOMT.js +4 -4
  18. eval_protocol-0.2.46.dev3/vite-app/dist/assets/index-C81y9r9l.js.map → eval_protocol-0.2.48/vite-app/dist/assets/index-DOPsfOMT.js.map +1 -1
  19. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vite-app/dist/index.html +2 -2
  20. eval_protocol-0.2.46.dev3/eval_protocol/cli_commands/logs.py +0 -36
  21. eval_protocol-0.2.46.dev3/eval_protocol/proxy/__init__.py +0 -17
  22. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/__init__.py +0 -12
  23. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/app.py +0 -305
  24. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/auth.py +0 -18
  25. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/langfuse.py +0 -526
  26. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/litellm.py +0 -171
  27. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/main.py +0 -10
  28. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/models.py +0 -92
  29. eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/redis_utils.py +0 -48
  30. eval_protocol-0.2.46.dev3/vite-app/dist/assets/index-DpYZaoAr.css +0 -1
  31. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/LICENSE +0 -0
  32. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/README.md +0 -0
  33. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/__init__.py +0 -0
  34. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/normalize_sandbox_fusion.py +0 -0
  35. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/utils/__init__.py +0 -0
  36. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/utils/generate_api_key.py +0 -0
  37. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/utils/subprocess_manager.py +0 -0
  38. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/__main__.py +0 -0
  39. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/__init__.py +0 -0
  40. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/base.py +0 -0
  41. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/bigquery.py +0 -0
  42. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/braintrust.py +0 -0
  43. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/huggingface.py +0 -0
  44. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/langchain.py +0 -0
  45. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/langfuse.py +0 -0
  46. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/langsmith.py +0 -0
  47. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/trl.py +0 -0
  48. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/utils.py +0 -0
  49. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/weave.py +0 -0
  50. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/__init__.py +0 -0
  51. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/models.py +0 -0
  52. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/orchestrator.py +0 -0
  53. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resource_abc.py +0 -0
  54. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resource_pool.py +0 -0
  55. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/__init__.py +0 -0
  56. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  57. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  58. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  59. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  60. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  61. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/docker_resource.py +0 -0
  62. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  63. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  64. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/sql_resource.py +0 -0
  65. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/task_manager.py +0 -0
  66. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/tool_registry.py +0 -0
  67. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/auth.py +0 -0
  68. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/__init__.py +0 -0
  69. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  70. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  71. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_aime25.py +0 -0
  72. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  73. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  74. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  75. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  76. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  77. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/__init__.py +0 -0
  78. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  79. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/common.py +0 -0
  80. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/deploy.py +0 -0
  81. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  82. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/preview.py +0 -0
  83. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  84. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/upload.py +0 -0
  85. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/common_utils.py +0 -0
  86. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/config.py +0 -0
  87. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/__init__.py +0 -0
  88. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  89. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  90. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  91. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/models.py +0 -0
  92. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/__init__.py +0 -0
  93. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  94. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  95. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  96. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  97. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/datasets/__init__.py +0 -0
  98. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/datasets/loader.py +0 -0
  99. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/directory_utils.py +0 -0
  100. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/evaluation.py +0 -0
  101. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/__init__.py +0 -0
  102. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/event_bus.py +0 -0
  103. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/logger.py +0 -0
  104. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  105. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  106. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/execution/__init__.py +0 -0
  107. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/execution/pipeline.py +0 -0
  108. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/gcp_tools.py +0 -0
  109. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generation/cache.py +0 -0
  110. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generation/clients/base.py +0 -0
  111. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generation/clients.py +0 -0
  112. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generic_server.py +0 -0
  113. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/get_pep440_version.py +0 -0
  114. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/human_id/__init__.py +0 -0
  115. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/human_id/dictionary.py +0 -0
  116. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/__init__.py +0 -0
  117. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/deepeval.py +0 -0
  118. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/openeval.py +0 -0
  119. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/trl.py +0 -0
  120. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/__init__.py +0 -0
  121. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  122. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  123. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  124. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  125. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/util.py +0 -0
  126. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/logging_utils.py +0 -0
  127. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/__init__.py +0 -0
  128. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/adapter.py +0 -0
  129. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/client/__init__.py +0 -0
  130. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/client/connection.py +0 -0
  131. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/clients.py +0 -0
  132. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/__init__.py +0 -0
  133. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/base_policy.py +0 -0
  134. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/manager.py +0 -0
  135. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/policy.py +0 -0
  136. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/grid_renderer.py +0 -0
  137. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  138. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/mcpgym.py +0 -0
  139. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/process_manager.py +0 -0
  140. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/session/__init__.py +0 -0
  141. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/session/manager.py +0 -0
  142. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/simple_process_manager.py +0 -0
  143. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/simulation_server.py +0 -0
  144. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/__init__.py +0 -0
  145. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/config.py +0 -0
  146. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/main.py +0 -0
  147. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  148. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  149. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  150. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  151. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_env.py +0 -0
  152. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/__init__.py +0 -0
  153. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  154. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  155. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  156. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  157. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  158. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  159. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  160. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  161. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  162. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  163. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  166. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  167. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/models.py +0 -0
  168. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/packaging.py +0 -0
  169. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/platform_api.py +0 -0
  170. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/playback_policy.py +0 -0
  171. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/__init__.py +0 -0
  172. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  173. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  174. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  175. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  176. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  177. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  178. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  179. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  180. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  181. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  182. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/exception_config.py +0 -0
  183. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/execution.py +0 -0
  184. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  185. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  186. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/parameterize.py +0 -0
  187. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/plugin.py +0 -0
  188. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/rollout_processor.py +0 -0
  189. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/store_experiment_link.py +0 -0
  190. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/store_results_url.py +0 -0
  191. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/types.py +0 -0
  192. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/utils.py +0 -0
  193. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/validate_signature.py +0 -0
  194. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/__init__.py +0 -0
  195. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge.py +0 -0
  196. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  197. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
  198. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
  199. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
  200. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/utils.py +0 -0
  201. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/resources.py +0 -0
  202. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/reward_function.py +0 -0
  203. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/__init__.py +0 -0
  204. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/accuracy.py +0 -0
  205. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/accuracy_length.py +0 -0
  206. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  207. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  208. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/apps_testing_util.py +0 -0
  209. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/bfcl_reward.py +0 -0
  210. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/code_execution.py +0 -0
  211. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/code_execution_utils.py +0 -0
  212. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/cpp_code.py +0 -0
  213. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  214. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/format.py +0 -0
  215. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/function_calling.py +0 -0
  216. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/json_schema.py +0 -0
  217. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/language_consistency.py +0 -0
  218. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/lean_prover.py +0 -0
  219. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/length.py +0 -0
  220. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  221. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/math.py +0 -0
  222. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  223. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/reasoning_steps.py +0 -0
  224. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/repetition.py +0 -0
  225. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/tag_count.py +0 -0
  226. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rl_processing.py +0 -0
  227. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/server.py +0 -0
  228. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/stats/__init__.py +0 -0
  229. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/stats/confidence_intervals.py +0 -0
  230. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/typed_interface.py +0 -0
  231. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/__init__.py +0 -0
  232. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/errors.py +0 -0
  233. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/remote_rollout_processor.py +0 -0
  234. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/types.py +0 -0
  235. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/__init__.py +0 -0
  236. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/batch_evaluation.py +0 -0
  237. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/batch_transformation.py +0 -0
  238. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/check_server_status.py +0 -0
  239. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/dataset_helpers.py +0 -0
  240. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/logs_models.py +0 -0
  241. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/module_loader.py +0 -0
  242. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/packaging_utils.py +0 -0
  243. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/show_results_url.py +0 -0
  244. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/static_policy.py +0 -0
  245. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/subprocess_utils.py +0 -0
  246. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/vite_server.py +0 -0
  247. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/dependency_links.txt +0 -0
  248. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/entry_points.txt +0 -0
  249. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/requires.txt +0 -0
  250. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/top_level.txt +0 -0
  251. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/pyproject.toml +0 -0
  252. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/setup.cfg +0 -0
  253. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/setup.py +0 -0
  254. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_accuracy.py +0 -0
  255. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_accuracy_length.py +0 -0
  256. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_adapters_e2e.py +0 -0
  257. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_agent_orchestrator.py +0 -0
  258. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_agent_resources.py +0 -0
  259. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_auth.py +0 -0
  260. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_batch_evaluation.py +0 -0
  261. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cli.py +0 -0
  262. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cli_agent.py +0 -0
  263. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cli_args.py +0 -0
  264. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_code_execution.py +0 -0
  265. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_config.py +0 -0
  266. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_control_plane_separation.py +0 -0
  267. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cpp_code.py +0 -0
  268. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_data_driven_task_manager.py +0 -0
  269. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_deepcoder_reward.py +0 -0
  270. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_deepeval_integration.py +0 -0
  271. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_deploy_integration.py +0 -0
  272. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_directory_utils.py +0 -0
  273. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_e2b_integration.py +0 -0
  274. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_e2b_js_integration.py +0 -0
  275. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_edge_cases.py +0 -0
  276. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_eval_protocol_import.py +0 -0
  277. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation.py +0 -0
  278. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation_integration.py +0 -0
  279. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation_postprocess.py +0 -0
  280. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation_preview_integration.py +0 -0
  281. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_event_bus.py +0 -0
  282. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_event_bus_helper.py +0 -0
  283. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_examples_end_to_end.py +0 -0
  284. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_fireworks_api.py +0 -0
  285. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_format.py +0 -0
  286. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_fractional_code.py +0 -0
  287. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_function_calling.py +0 -0
  288. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_gcp_tools.py +0 -0
  289. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_generic_server.py +0 -0
  290. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_human_id.py +0 -0
  291. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_integration.py +0 -0
  292. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_json_schema.py +0 -0
  293. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_kwargs_validation.py +0 -0
  294. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_language_consistency.py +0 -0
  295. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_lean_prover.py +0 -0
  296. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_lean_prover_runner.py +0 -0
  297. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_length.py +0 -0
  298. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_list_comparison_math_reward.py +0 -0
  299. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_logs_server.py +0 -0
  300. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_logs_server_simple.py +0 -0
  301. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_math.py +0 -0
  302. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_minimal.py +0 -0
  303. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_models.py +0 -0
  304. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_models_rl.py +0 -0
  305. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_multiple_choice_math_reward.py +0 -0
  306. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_n_variant_batch_integration.py +0 -0
  307. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_n_variant_integration.py +0 -0
  308. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_openai_compatibility.py +0 -0
  309. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_openeval_integration.py +0 -0
  310. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_packaging.py +0 -0
  311. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_parallel_rollouts.py +0 -0
  312. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_platform_api.py +0 -0
  313. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_quickstart_utils.py +0 -0
  314. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_readiness.py +0 -0
  315. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_reasoning_steps.py +0 -0
  316. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_repetition.py +0 -0
  317. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_repetition_debug.py +0 -0
  318. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_retry_mechanism.py +0 -0
  319. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_reward_function.py +0 -0
  320. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_reward_protocol_import.py +0 -0
  321. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_rl_processing.py +0 -0
  322. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_rollout_control_plane_integration.py +0 -0
  323. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_server.py +0 -0
  324. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_status_migration_changes.py +0 -0
  325. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_status_migration_integration.py +0 -0
  326. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_status_model.py +0 -0
  327. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_tag_count.py +0 -0
  328. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_tau_bench_airline_smoke.py +0 -0
  329. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_typed_interface.py +0 -0
  330. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_typed_interface_rl.py +0 -0
  331. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_upload_entrypoint.py +0 -0
  332. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_url_handling.py +0 -0
  333. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_vite_server.py +0 -0
  334. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/__init__.py +0 -0
  335. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/agent/__init__.py +0 -0
  336. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/agent/base.py +0 -0
  337. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/agent/llm_agent.py +0 -0
  338. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/__init__.py +0 -0
  339. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/api_config.py +0 -0
  340. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/data_model.py +0 -0
  341. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/simulation_service.py +0 -0
  342. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/cli.py +0 -0
  343. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/config.py +0 -0
  344. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/airline/policy.md +0 -0
  345. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/mock/policy.md +0 -0
  346. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  347. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/retail/policy.md +0 -0
  348. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  349. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  350. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  351. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  352. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  353. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  354. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  355. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/__init__.py +0 -0
  356. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/message.py +0 -0
  357. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/simulation.py +0 -0
  358. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/tasks.py +0 -0
  359. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/__init__.py +0 -0
  360. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/__init__.py +0 -0
  361. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/data_model.py +0 -0
  362. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/environment.py +0 -0
  363. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/tools.py +0 -0
  364. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/utils.py +0 -0
  365. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/__init__.py +0 -0
  366. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/data_model.py +0 -0
  367. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/environment.py +0 -0
  368. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/tools.py +0 -0
  369. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/utils.py +0 -0
  370. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/__init__.py +0 -0
  371. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/data_model.py +0 -0
  372. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/environment.py +0 -0
  373. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/tools.py +0 -0
  374. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/utils.py +0 -0
  375. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/__init__.py +0 -0
  376. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/data_model.py +0 -0
  377. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/environment.py +0 -0
  378. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  379. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  380. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  381. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  382. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  383. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  384. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  385. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  386. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tools.py +0 -0
  387. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  388. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  389. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/utils.py +0 -0
  390. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/__init__.py +0 -0
  391. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/db.py +0 -0
  392. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/environment.py +0 -0
  393. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/server.py +0 -0
  394. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/tool.py +0 -0
  395. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/toolkit.py +0 -0
  396. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  397. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/__init__.py +0 -0
  398. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator.py +0 -0
  399. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  400. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  401. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  402. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  403. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  404. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/metrics/__init__.py +0 -0
  405. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/metrics/agent_metrics.py +0 -0
  406. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  407. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/__init__.py +0 -0
  408. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  409. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  410. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/utils.py +0 -0
  411. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/registry.py +0 -0
  412. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/run.py +0 -0
  413. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/__init__.py +0 -0
  414. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/check_data.py +0 -0
  415. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  416. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/start_servers.py +0 -0
  417. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/view_simulations.py +0 -0
  418. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/user/__init__.py +0 -0
  419. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/user/base.py +0 -0
  420. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/user/user_simulator.py +0 -0
  421. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/__init__.py +0 -0
  422. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/display.py +0 -0
  423. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/io_utils.py +0 -0
  424. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/llm_utils.py +0 -0
  425. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/pydantic_utils.py +0 -0
  426. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/utils.py +0 -0
  427. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/versioneer.py +0 -0
  428. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  429. {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.46.dev3
3
+ Version: 0.2.48
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -35,6 +35,9 @@ from .pytest.parameterize import DefaultParameterIdGenerator
35
35
  from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
36
36
  from .log_utils.rollout_id_filter import RolloutIdFilter
37
37
  from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
38
+ from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
39
+ from .log_utils.elasticsearch_client import ElasticsearchConfig
40
+
38
41
 
39
42
  from .types.remote_rollout_processor import (
40
43
  InitRequest,
@@ -70,16 +73,10 @@ try:
70
73
  except ImportError:
71
74
  WeaveAdapter = None
72
75
 
73
- try:
74
- from .proxy import create_app, AuthProvider
75
- except ImportError:
76
- create_app = None
77
- AuthProvider = None
78
-
79
-
80
76
  warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
81
77
 
82
78
  __all__ = [
79
+ "ElasticsearchConfig",
83
80
  "ElasticsearchDirectHttpHandler",
84
81
  "RolloutIdFilter",
85
82
  "setup_rollout_logging_for_elasticsearch_handler",
@@ -102,6 +99,7 @@ __all__ = [
102
99
  "BraintrustAdapter",
103
100
  "create_braintrust_adapter",
104
101
  "LangSmithAdapter",
102
+ "FireworksTracingHttpHandler",
105
103
  # Core interfaces
106
104
  "Message",
107
105
  "MetricResult",
@@ -137,9 +135,6 @@ __all__ = [
137
135
  "RolloutMetadata",
138
136
  "StatusResponse",
139
137
  "create_langfuse_config_tags",
140
- # Proxy
141
- "create_app",
142
- "AuthProvider",
143
138
  ]
144
139
 
145
140
  from . import _version
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-10-10T01:45:32-0700",
11
+ "date": "2025-10-10T13:52:05-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "1757548441eb93afd5dc0428b0218637787cdd80",
15
- "version": "0.2.46-dev3"
14
+ "full-revisionid": "8e5d3a5f347613eafe384a726d3598cf58719822",
15
+ "version": "0.2.48"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
7
7
  from __future__ import annotations
8
8
  import logging
9
9
  import requests
10
+ import time
10
11
  from datetime import datetime
11
12
  from typing import Any, Dict, List, Optional, Protocol
12
- import os
13
13
 
14
14
  from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
15
15
  from .base import BaseAdapter
@@ -343,17 +343,15 @@ class FireworksTracingAdapter(BaseAdapter):
343
343
  # Remove None values
344
344
  params = {k: v for k, v in params.items() if v is not None}
345
345
 
346
- # Make request to proxy (using pointwise for efficiency)
346
+ # Make request to proxy
347
347
  if self.project_id:
348
- url = f"{self.base_url}/v1/project_id/{self.project_id}/traces/pointwise"
348
+ url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
349
349
  else:
350
- url = f"{self.base_url}/v1/traces/pointwise"
351
-
352
- headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
350
+ url = f"{self.base_url}/v1/traces"
353
351
 
354
352
  result = None
355
353
  try:
356
- response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
354
+ response = requests.get(url, params=params, timeout=self.timeout)
357
355
  response.raise_for_status()
358
356
  result = response.json()
359
357
  except requests.exceptions.HTTPError as e:
@@ -367,7 +365,7 @@ class FireworksTracingAdapter(BaseAdapter):
367
365
  except Exception: # In case e.response.json() fails
368
366
  error_msg = f"Proxy error: {e.response.text}"
369
367
 
370
- logger.error("Failed to fetch traces from proxy (HTTP %s): %s", e.response.status_code, error_msg)
368
+ logger.error("Failed to fetch traces from proxy: %s", error_msg)
371
369
  return eval_rows
372
370
  except requests.exceptions.RequestException as e:
373
371
  # Non-HTTP errors (network issues, timeouts, etc.)
@@ -169,7 +169,9 @@ class OpenAIResponsesAdapter(BaseAdapter):
169
169
  raise NotImplementedError(f"Unsupported content type: {content_item.type}")
170
170
  elif item.type == "function_call_output":
171
171
  # Collect tool call outputs to add before assistant message
172
- tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
172
+ tool_call_outputs.append(
173
+ Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
174
+ )
173
175
  elif item.type == "function_call":
174
176
  tool_call = ChatCompletionMessageToolCall(
175
177
  id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
@@ -186,3 +188,29 @@ class OpenAIResponsesAdapter(BaseAdapter):
186
188
  messages.append(Message(role="assistant", tool_calls=current_tool_calls))
187
189
 
188
190
  return reversed(messages)
191
+
192
+ def _coerce_tool_output(self, output: Any) -> str:
193
+ """Coerce OpenAI Responses tool output into a string for Message.content.
194
+
195
+ The Responses API may return structured content lists. For our purposes,
196
+ we stringify non-string outputs to satisfy the Message.content type.
197
+ """
198
+ if isinstance(output, str):
199
+ return output
200
+ try:
201
+ # Attempt to join list of objects with any 'text' fields
202
+ if isinstance(output, list):
203
+ parts: list[str] = []
204
+ for part in output:
205
+ text = None
206
+ if isinstance(part, dict):
207
+ text = part.get("text")
208
+ if text:
209
+ parts.append(str(text))
210
+ else:
211
+ parts.append(str(part))
212
+ return "\n".join(parts)
213
+ # Fallback to string conversion
214
+ return str(output)
215
+ except Exception:
216
+ return str(output)
@@ -301,6 +301,12 @@ def parse_args(args=None):
301
301
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
302
302
  logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
303
303
  logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
304
+ logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
305
+ logs_parser.add_argument(
306
+ "--use-env-elasticsearch-confi",
307
+ action="store_true",
308
+ help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
309
+ )
304
310
 
305
311
  # Upload command
306
312
  upload_parser = subparsers.add_parser(
@@ -0,0 +1,54 @@
1
+ """
2
+ CLI command for serving logs with file watching and real-time updates.
3
+ """
4
+
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from ..utils.logs_server import serve_logs
9
+
10
+
11
+ def logs_command(args):
12
+ """Serve logs with file watching and real-time updates"""
13
+
14
+ port = args.port
15
+ print("🚀 Starting Eval Protocol Logs Server")
16
+ print(f"🌐 URL: http://localhost:{port}")
17
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
18
+ print(f"👀 Watching paths: {['current directory']}")
19
+ print(f"🔍 Debug mode: {args.debug}")
20
+ print("Press Ctrl+C to stop the server")
21
+ print("-" * 50)
22
+
23
+ # Setup Elasticsearch based on flags
24
+ elasticsearch_config = None
25
+ try:
26
+ if getattr(args, "use_env_elasticsearch_config", False):
27
+ # Use environment variables for configuration
28
+ print("⚙️ Using environment variables for Elasticsearch config")
29
+ from eval_protocol.pytest.remote_rollout_processor import (
30
+ create_elasticsearch_config_from_env,
31
+ )
32
+
33
+ elasticsearch_config = create_elasticsearch_config_from_env()
34
+ elif not getattr(args, "disable_elasticsearch_setup", False):
35
+ # Default behavior: start or connect to local Elasticsearch via Docker helper
36
+ from eval_protocol.pytest.elasticsearch_setup import ElasticsearchSetup
37
+
38
+ print("🧰 Auto-configuring local Elasticsearch (Docker)")
39
+ elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
40
+ else:
41
+ print("🚫 Elasticsearch setup disabled; running without Elasticsearch integration")
42
+ except Exception as e:
43
+ print(f"❌ Failed to configure Elasticsearch: {e}")
44
+ return 1
45
+
46
+ try:
47
+ serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
48
+ return 0
49
+ except KeyboardInterrupt:
50
+ print("\n🛑 Server stopped by user")
51
+ return 0
52
+ except Exception as e:
53
+ print(f"❌ Error starting server: {e}")
54
+ return 1
@@ -0,0 +1,63 @@
1
+ import logging
2
+ import os
3
+ import threading
4
+ from datetime import datetime, timezone
5
+ from typing import Optional, Any, Dict, List, cast
6
+
7
+ import requests
8
+
9
+
10
+ class FireworksTracingHttpHandler(logging.Handler):
11
+ """Logging handler that posts structured logs to tracing.fireworks gateway /logs endpoint."""
12
+
13
+ def __init__(self, gateway_base_url: Optional[str] = None, rollout_id_env: str = "EP_ROLLOUT_ID") -> None:
14
+ super().__init__()
15
+ self.gateway_base_url = gateway_base_url or os.getenv("FW_TRACING_GATEWAY_BASE_URL")
16
+ self.rollout_id_env = rollout_id_env
17
+ self._session = requests.Session()
18
+ self._lock = threading.Lock()
19
+
20
+ def emit(self, record: logging.LogRecord) -> None:
21
+ try:
22
+ if not self.gateway_base_url:
23
+ return
24
+ rollout_id = self._get_rollout_id(record)
25
+ if not rollout_id:
26
+ return
27
+ payload = self._build_payload(record, rollout_id)
28
+ url = f"{self.gateway_base_url.rstrip('/')}/logs"
29
+ with self._lock:
30
+ self._session.post(url, json=payload, timeout=5)
31
+ except Exception:
32
+ # Avoid raising exceptions from logging
33
+ self.handleError(record)
34
+
35
+ def _get_rollout_id(self, record: logging.LogRecord) -> Optional[str]:
36
+ if hasattr(record, "rollout_id") and cast(Any, getattr(record, "rollout_id")) is not None:
37
+ return str(cast(Any, getattr(record, "rollout_id")))
38
+ return os.getenv(self.rollout_id_env)
39
+
40
+ def _build_payload(self, record: logging.LogRecord, rollout_id: str) -> Dict[str, Any]:
41
+ timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
42
+ message = record.getMessage()
43
+ tags: List[str] = [f"rollout_id:{rollout_id}"]
44
+ # Optional additional tags
45
+ if hasattr(record, "experiment_id") and cast(Any, getattr(record, "experiment_id")):
46
+ tags.append(f"experiment_id:{cast(Any, getattr(record, 'experiment_id'))}")
47
+ if hasattr(record, "run_id") and cast(Any, getattr(record, "run_id")):
48
+ tags.append(f"run_id:{cast(Any, getattr(record, 'run_id'))}")
49
+ program = cast(Optional[str], getattr(record, "program", None)) or "eval_protocol"
50
+ status_val = cast(Any, getattr(record, "status", None))
51
+ status = status_val if isinstance(status_val, str) else None
52
+ return {
53
+ "program": program,
54
+ "status": status,
55
+ "message": message,
56
+ "tags": tags,
57
+ "metadata": cast(Any, getattr(record, "metadata", None)),
58
+ "extras": {
59
+ "logger_name": record.name,
60
+ "level": record.levelname,
61
+ "timestamp": timestamp,
62
+ },
63
+ }
@@ -62,7 +62,8 @@ from eval_protocol.pytest.utils import (
62
62
  run_tasks_with_eval_progress,
63
63
  run_tasks_with_run_progress,
64
64
  )
65
- from eval_protocol.utils.show_results_url import store_local_ui_results_url
65
+ from eval_protocol.utils.show_results_url import store_local_ui_results_url, generate_invocation_filter_url
66
+ from eval_protocol.utils.browser_utils import is_logs_server_running, open_browser_tab
66
67
 
67
68
  from ..common_utils import load_jsonl
68
69
 
@@ -80,6 +81,7 @@ def evaluation_test(
80
81
  rollout_processor_kwargs: RolloutProcessorInputParam | None = None,
81
82
  aggregation_method: AggregationMethod = "mean",
82
83
  passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None,
84
+ disable_browser_open: bool = False,
83
85
  num_runs: int = 1,
84
86
  filtered_row_ids: Sequence[str] | None = None,
85
87
  max_dataset_rows: int | None = None,
@@ -246,10 +248,29 @@ def evaluation_test(
246
248
  else:
247
249
  invocation_id = generate_id()
248
250
 
251
+ # Track whether we've opened browser for this invocation
252
+ browser_opened_for_invocation = False
253
+
249
254
  async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None:
255
+ nonlocal browser_opened_for_invocation
256
+
250
257
  # Store URL for viewing results (after all postprocessing is complete)
251
258
  store_local_ui_results_url(invocation_id)
252
259
 
260
+ # Auto-open browser if server is running and not disabled (only once per invocation)
261
+ if (
262
+ not browser_opened_for_invocation
263
+ and not disable_browser_open
264
+ and os.environ.get("EP_DISABLE_AUTO_BROWSER") is None
265
+ ):
266
+ is_running, port = is_logs_server_running()
267
+ if is_running:
268
+ # Generate URL for table view with invocation filter
269
+ base_url = f"http://localhost:{port}" if port else "http://localhost:8000"
270
+ table_url = generate_invocation_filter_url(invocation_id, f"{base_url}/table")
271
+ open_browser_tab(table_url)
272
+ browser_opened_for_invocation = True
273
+
253
274
  eval_metadata = None
254
275
 
255
276
  all_results: list[list[EvaluationRow]] = [[] for _ in range(num_runs)]
@@ -26,6 +26,25 @@ import os
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
28
 
29
+ def create_elasticsearch_config_from_env() -> ElasticsearchConfig:
30
+ """Setup Elasticsearch config from environment variables."""
31
+ url = os.getenv("ELASTICSEARCH_URL")
32
+ api_key = os.getenv("ELASTICSEARCH_API_KEY")
33
+ index_name = os.getenv("ELASTICSEARCH_INDEX_NAME")
34
+
35
+ if url is None:
36
+ raise ValueError("ELASTICSEARCH_URL must be set")
37
+ if api_key is None:
38
+ raise ValueError("ELASTICSEARCH_API_KEY must be set")
39
+ if index_name is None:
40
+ raise ValueError("ELASTICSEARCH_INDEX_NAME must be set")
41
+ return ElasticsearchConfig(
42
+ url=url,
43
+ api_key=api_key,
44
+ index_name=index_name,
45
+ )
46
+
47
+
29
48
  def _build_fireworks_tracing_url(
30
49
  base_url: str, metadata: RolloutMetadata, completion_params_base_url: Optional[str] = None
31
50
  ) -> str:
@@ -93,7 +112,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
93
112
  poll_interval: float = 1.0,
94
113
  timeout_seconds: float = 120.0,
95
114
  output_data_loader: Optional[Callable[[DataLoaderConfig], DynamicDataLoader]] = None,
96
- disable_elastic_search: bool = False,
115
+ disable_elastic_search_setup: bool = False,
97
116
  elastic_search_config: Optional[ElasticsearchConfig] = None,
98
117
  ):
99
118
  # Prefer constructor-provided configuration. These can be overridden via
@@ -108,11 +127,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
108
127
  self._poll_interval = poll_interval
109
128
  self._timeout_seconds = timeout_seconds
110
129
  self._output_data_loader = output_data_loader or _default_output_data_loader
111
- self._disable_elastic_search = disable_elastic_search
130
+ self._disable_elastic_search_setup = disable_elastic_search_setup
112
131
  self._elastic_search_config = elastic_search_config
113
132
 
114
133
  def setup(self) -> None:
115
- if self._disable_elastic_search:
134
+ if self._disable_elastic_search_setup:
116
135
  logger.info("Elasticsearch is disabled, skipping setup")
117
136
  return
118
137
  logger.info("Setting up Elasticsearch")
@@ -0,0 +1,114 @@
1
+ """
2
+ Browser utilities for auto-opening evaluation results in the local UI.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import threading
8
+ import time
9
+ import webbrowser
10
+ from pathlib import Path
11
+ from typing import Tuple, Optional
12
+
13
+ try:
14
+ import psutil
15
+
16
+ PSUTIL_AVAILABLE = True
17
+ except ImportError:
18
+ PSUTIL_AVAILABLE = False
19
+
20
+
21
+ def _get_pid_file_path() -> Path:
22
+ """Get the path to the logs server PID file."""
23
+ from eval_protocol.directory_utils import find_eval_protocol_dir
24
+
25
+ return Path(find_eval_protocol_dir()) / "logs_server.pid"
26
+
27
+
28
+ def write_pid_file(pid: int, port: int) -> None:
29
+ """
30
+ Write the server PID and port to a file for external processes to check.
31
+
32
+ Args:
33
+ pid: The process ID of the logs server
34
+ port: The port the server is running on
35
+ """
36
+ try:
37
+ pid_file = _get_pid_file_path()
38
+
39
+ data = {"pid": pid, "port": port}
40
+
41
+ with open(pid_file, "w") as f:
42
+ json.dump(data, f)
43
+
44
+ # Use print instead of logger to avoid circular imports
45
+ print(f"Wrote PID file: {pid_file} with PID {pid} and port {port}")
46
+ except Exception as e:
47
+ print(f"Warning: Failed to write PID file: {e}")
48
+
49
+
50
+ def is_logs_server_running() -> Tuple[bool, Optional[int]]:
51
+ """
52
+ Check if the logs server is running by reading the PID file and verifying the process.
53
+
54
+ Returns:
55
+ Tuple of (is_running, port) where:
56
+ - is_running: True if server is running, False otherwise
57
+ - port: The port the server is running on, or None if not running
58
+ """
59
+ if not PSUTIL_AVAILABLE:
60
+ return False, None
61
+
62
+ pid_file = _get_pid_file_path()
63
+ if not pid_file.exists():
64
+ return False, None
65
+
66
+ try:
67
+ with open(pid_file, "r") as f:
68
+ data = json.load(f)
69
+ pid = data.get("pid")
70
+ port = data.get("port")
71
+ except (json.JSONDecodeError, KeyError, FileNotFoundError):
72
+ return False, None
73
+
74
+ if pid is None:
75
+ return False, None
76
+
77
+ try:
78
+ # Check if the process is still running
79
+ process = psutil.Process(pid)
80
+ if not process.is_running():
81
+ return False, None
82
+
83
+ # Optionally verify it's listening on the expected port
84
+ if port is not None:
85
+ try:
86
+ connections = process.net_connections()
87
+ for conn in connections:
88
+ if conn.laddr.port == port and conn.status == "LISTEN":
89
+ return True, port
90
+ except (psutil.AccessDenied, psutil.NoSuchProcess):
91
+ # If we can't check connections, assume it's running if process exists
92
+ pass
93
+
94
+ return True, port
95
+ except (psutil.NoSuchProcess, psutil.AccessDenied):
96
+ return False, None
97
+
98
+
99
+ def open_browser_tab(url: str, delay: float = 0.5) -> None:
100
+ """
101
+ Open a URL in a new browser tab with an optional delay.
102
+
103
+ Args:
104
+ url: The URL to open
105
+ delay: Delay in seconds before opening browser (default: 0.5)
106
+ """
107
+
108
+ def _open():
109
+ time.sleep(delay) # Give the server time to start
110
+ webbrowser.open_new_tab(url)
111
+
112
+ thread = threading.Thread(target=_open)
113
+ thread.daemon = True
114
+ thread.start()
@@ -6,6 +6,7 @@ import threading
6
6
  import time
7
7
  from datetime import datetime
8
8
  from contextlib import asynccontextmanager
9
+ from pathlib import Path
9
10
  from queue import Queue
10
11
  from typing import TYPE_CHECKING, Any, Dict, List, Optional
11
12
 
@@ -23,6 +24,7 @@ from eval_protocol.utils.vite_server import ViteServer
23
24
  from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
24
25
  from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
25
26
  from eval_protocol.utils.logs_models import LogEntry, LogsResponse
27
+ from eval_protocol.utils.browser_utils import write_pid_file
26
28
 
27
29
  if TYPE_CHECKING:
28
30
  from eval_protocol.models import EvaluationRow
@@ -378,7 +380,7 @@ class LogsServer(ViteServer):
378
380
  event_bus.subscribe(self._handle_event)
379
381
  logger.debug("[LOGS_SERVER_INIT] Successfully subscribed to event bus")
380
382
 
381
- logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {host}:{port}")
383
+ logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {self.host}:{self.port}")
382
384
 
383
385
  def _setup_websocket_routes(self):
384
386
  """Set up WebSocket routes for real-time communication."""
@@ -541,6 +543,12 @@ class LogsServer(ViteServer):
541
543
  )
542
544
 
543
545
  server = uvicorn.Server(config)
546
+
547
+ # Write PID file after server is configured but before serving
548
+ logger.debug(f"[LOGS_SERVER_RUN_ASYNC] Writing PID file for port {self.port}")
549
+ write_pid_file(os.getpid(), self.port)
550
+ logger.debug(f"[LOGS_SERVER_RUN_ASYNC] Successfully wrote PID file for port {self.port}")
551
+
544
552
  await server.serve()
545
553
 
546
554
  except KeyboardInterrupt:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.46.dev3
3
+ Version: 0.2.48
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -40,9 +40,9 @@ eval_protocol.egg-info/requires.txt
40
40
  eval_protocol.egg-info/top_level.txt
41
41
  eval_protocol/../vite-app/dist/index.html
42
42
  eval_protocol/../vite-app/dist/assets/favicon-BkAAWQga.png
43
- eval_protocol/../vite-app/dist/assets/index-C81y9r9l.js
44
- eval_protocol/../vite-app/dist/assets/index-C81y9r9l.js.map
45
- eval_protocol/../vite-app/dist/assets/index-DpYZaoAr.css
43
+ eval_protocol/../vite-app/dist/assets/index-34WaHH5W.css
44
+ eval_protocol/../vite-app/dist/assets/index-DOPsfOMT.js
45
+ eval_protocol/../vite-app/dist/assets/index-DOPsfOMT.js.map
46
46
  eval_protocol/../vite-app/dist/assets/logo-light-BprIBJQW.png
47
47
  eval_protocol/adapters/__init__.py
48
48
  eval_protocol/adapters/base.py
@@ -124,6 +124,7 @@ eval_protocol/log_utils/__init__.py
124
124
  eval_protocol/log_utils/elasticsearch_client.py
125
125
  eval_protocol/log_utils/elasticsearch_direct_http_handler.py
126
126
  eval_protocol/log_utils/elasticsearch_index_manager.py
127
+ eval_protocol/log_utils/fireworks_tracing_http_handler.py
127
128
  eval_protocol/log_utils/rollout_id_filter.py
128
129
  eval_protocol/log_utils/util.py
129
130
  eval_protocol/mcp/__init__.py
@@ -165,15 +166,6 @@ eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py
165
166
  eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md
166
167
  eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md
167
168
  eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md
168
- eval_protocol/proxy/__init__.py
169
- eval_protocol/proxy/proxy_core/__init__.py
170
- eval_protocol/proxy/proxy_core/app.py
171
- eval_protocol/proxy/proxy_core/auth.py
172
- eval_protocol/proxy/proxy_core/langfuse.py
173
- eval_protocol/proxy/proxy_core/litellm.py
174
- eval_protocol/proxy/proxy_core/main.py
175
- eval_protocol/proxy/proxy_core/models.py
176
- eval_protocol/proxy/proxy_core/redis_utils.py
177
169
  eval_protocol/pytest/__init__.py
178
170
  eval_protocol/pytest/default_agent_rollout_processor.py
179
171
  eval_protocol/pytest/default_dataset_adapter.py
@@ -238,6 +230,7 @@ eval_protocol/types/types.py
238
230
  eval_protocol/utils/__init__.py
239
231
  eval_protocol/utils/batch_evaluation.py
240
232
  eval_protocol/utils/batch_transformation.py
233
+ eval_protocol/utils/browser_utils.py
241
234
  eval_protocol/utils/check_server_status.py
242
235
  eval_protocol/utils/dataset_helpers.py
243
236
  eval_protocol/utils/logs_models.py