eval-protocol 0.2.51.dev0__tar.gz → 0.2.52__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. {eval_protocol-0.2.51.dev0/eval_protocol.egg-info → eval_protocol-0.2.52}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/__init__.py +7 -10
  3. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/fireworks_tracing.py +6 -8
  5. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/openai_responses.py +29 -1
  6. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/auth.py +39 -0
  7. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli.py +6 -0
  8. eval_protocol-0.2.52/eval_protocol/cli_commands/logs.py +76 -0
  9. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/upload.py +27 -54
  10. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/evaluation.py +125 -40
  11. eval_protocol-0.2.52/eval_protocol/log_utils/fireworks_tracing_http_handler.py +63 -0
  12. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/evaluation_test.py +22 -1
  13. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/remote_rollout_processor.py +22 -3
  14. eval_protocol-0.2.52/eval_protocol/utils/browser_utils.py +114 -0
  15. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/logs_server.py +9 -1
  16. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52/eval_protocol.egg-info}/PKG-INFO +1 -1
  17. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/SOURCES.txt +5 -12
  18. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_show_results_url.py +141 -0
  19. eval_protocol-0.2.52/vite-app/dist/assets/index-BGlGI2LH.css +1 -0
  20. eval_protocol-0.2.51.dev0/vite-app/dist/assets/index-C81y9r9l.js → eval_protocol-0.2.52/vite-app/dist/assets/index-zf20-zFD.js +25 -25
  21. eval_protocol-0.2.51.dev0/vite-app/dist/assets/index-C81y9r9l.js.map → eval_protocol-0.2.52/vite-app/dist/assets/index-zf20-zFD.js.map +1 -1
  22. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vite-app/dist/index.html +2 -2
  23. eval_protocol-0.2.51.dev0/eval_protocol/cli_commands/logs.py +0 -36
  24. eval_protocol-0.2.51.dev0/eval_protocol/proxy/__init__.py +0 -18
  25. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/__init__.py +0 -13
  26. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/app.py +0 -305
  27. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/auth.py +0 -17
  28. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/langfuse.py +0 -528
  29. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/litellm.py +0 -170
  30. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/main.py +0 -10
  31. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/models.py +0 -104
  32. eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/redis_utils.py +0 -48
  33. eval_protocol-0.2.51.dev0/vite-app/dist/assets/index-DpYZaoAr.css +0 -1
  34. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/LICENSE +0 -0
  35. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/README.md +0 -0
  36. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/__init__.py +0 -0
  37. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/normalize_sandbox_fusion.py +0 -0
  38. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/utils/__init__.py +0 -0
  39. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/utils/generate_api_key.py +0 -0
  40. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/utils/subprocess_manager.py +0 -0
  41. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/__main__.py +0 -0
  42. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/__init__.py +0 -0
  43. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/base.py +0 -0
  44. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/bigquery.py +0 -0
  45. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/braintrust.py +0 -0
  46. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/huggingface.py +0 -0
  47. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/langchain.py +0 -0
  48. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/langfuse.py +0 -0
  49. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/langsmith.py +0 -0
  50. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/trl.py +0 -0
  51. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/utils.py +0 -0
  52. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/weave.py +0 -0
  53. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/__init__.py +0 -0
  54. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/models.py +0 -0
  55. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/orchestrator.py +0 -0
  56. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resource_abc.py +0 -0
  57. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resource_pool.py +0 -0
  58. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/__init__.py +0 -0
  59. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  60. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  61. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  62. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  63. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  64. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/docker_resource.py +0 -0
  65. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  66. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  67. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/sql_resource.py +0 -0
  68. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/task_manager.py +0 -0
  69. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/tool_registry.py +0 -0
  70. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/__init__.py +0 -0
  71. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  72. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  73. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_aime25.py +0 -0
  74. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  75. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  76. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  77. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  78. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  79. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/__init__.py +0 -0
  80. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  81. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/common.py +0 -0
  82. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/deploy.py +0 -0
  83. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  84. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/preview.py +0 -0
  85. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  86. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/common_utils.py +0 -0
  87. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/config.py +0 -0
  88. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/__init__.py +0 -0
  89. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  90. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  91. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  92. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/models.py +0 -0
  93. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/__init__.py +0 -0
  94. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  95. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  96. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  97. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  98. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/datasets/__init__.py +0 -0
  99. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/datasets/loader.py +0 -0
  100. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/directory_utils.py +0 -0
  101. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/__init__.py +0 -0
  102. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/event_bus.py +0 -0
  103. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/logger.py +0 -0
  104. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  105. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  106. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/execution/__init__.py +0 -0
  107. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/execution/pipeline.py +0 -0
  108. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/gcp_tools.py +0 -0
  109. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generation/cache.py +0 -0
  110. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generation/clients/base.py +0 -0
  111. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generation/clients.py +0 -0
  112. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generic_server.py +0 -0
  113. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/get_pep440_version.py +0 -0
  114. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/human_id/__init__.py +0 -0
  115. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/human_id/dictionary.py +0 -0
  116. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/__init__.py +0 -0
  117. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/deepeval.py +0 -0
  118. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/openeval.py +0 -0
  119. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/trl.py +0 -0
  120. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/__init__.py +0 -0
  121. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  122. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  123. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  124. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  125. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/util.py +0 -0
  126. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/logging_utils.py +0 -0
  127. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/__init__.py +0 -0
  128. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/adapter.py +0 -0
  129. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/client/__init__.py +0 -0
  130. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/client/connection.py +0 -0
  131. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/clients.py +0 -0
  132. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/__init__.py +0 -0
  133. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/base_policy.py +0 -0
  134. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/manager.py +0 -0
  135. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/policy.py +0 -0
  136. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/grid_renderer.py +0 -0
  137. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  138. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/mcpgym.py +0 -0
  139. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/process_manager.py +0 -0
  140. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/session/__init__.py +0 -0
  141. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/session/manager.py +0 -0
  142. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/simple_process_manager.py +0 -0
  143. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/simulation_server.py +0 -0
  144. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/__init__.py +0 -0
  145. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/config.py +0 -0
  146. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/main.py +0 -0
  147. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  148. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  149. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  150. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  151. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_env.py +0 -0
  152. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/__init__.py +0 -0
  153. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  154. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  155. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  156. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  157. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  158. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  159. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  160. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  161. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  162. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  163. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  166. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  167. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/models.py +0 -0
  168. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/packaging.py +0 -0
  169. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/platform_api.py +0 -0
  170. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/playback_policy.py +0 -0
  171. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/__init__.py +0 -0
  172. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  173. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  174. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  175. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  176. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  177. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  178. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  179. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  180. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  181. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  182. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/exception_config.py +0 -0
  183. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/execution.py +0 -0
  184. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  185. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  186. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/parameterize.py +0 -0
  187. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/plugin.py +0 -0
  188. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/rollout_processor.py +0 -0
  189. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/store_experiment_link.py +0 -0
  190. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/store_results_url.py +0 -0
  191. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/types.py +0 -0
  192. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/utils.py +0 -0
  193. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/validate_signature.py +0 -0
  194. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/__init__.py +0 -0
  195. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge.py +0 -0
  196. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  197. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
  198. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
  199. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
  200. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/utils.py +0 -0
  201. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/resources.py +0 -0
  202. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/reward_function.py +0 -0
  203. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/__init__.py +0 -0
  204. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/accuracy.py +0 -0
  205. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/accuracy_length.py +0 -0
  206. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  207. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  208. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/apps_testing_util.py +0 -0
  209. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/bfcl_reward.py +0 -0
  210. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/code_execution.py +0 -0
  211. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/code_execution_utils.py +0 -0
  212. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/cpp_code.py +0 -0
  213. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  214. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/format.py +0 -0
  215. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/function_calling.py +0 -0
  216. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/json_schema.py +0 -0
  217. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/language_consistency.py +0 -0
  218. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/lean_prover.py +0 -0
  219. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/length.py +0 -0
  220. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  221. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/math.py +0 -0
  222. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  223. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/reasoning_steps.py +0 -0
  224. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/repetition.py +0 -0
  225. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/tag_count.py +0 -0
  226. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rl_processing.py +0 -0
  227. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/server.py +0 -0
  228. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/stats/__init__.py +0 -0
  229. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/stats/confidence_intervals.py +0 -0
  230. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/typed_interface.py +0 -0
  231. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/__init__.py +0 -0
  232. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/errors.py +0 -0
  233. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/remote_rollout_processor.py +0 -0
  234. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/types.py +0 -0
  235. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/__init__.py +0 -0
  236. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/batch_evaluation.py +0 -0
  237. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/batch_transformation.py +0 -0
  238. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/check_server_status.py +0 -0
  239. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/dataset_helpers.py +0 -0
  240. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/logs_models.py +0 -0
  241. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/module_loader.py +0 -0
  242. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/packaging_utils.py +0 -0
  243. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/show_results_url.py +0 -0
  244. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/static_policy.py +0 -0
  245. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/subprocess_utils.py +0 -0
  246. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/vite_server.py +0 -0
  247. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/dependency_links.txt +0 -0
  248. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/entry_points.txt +0 -0
  249. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/requires.txt +0 -0
  250. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/top_level.txt +0 -0
  251. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/pyproject.toml +0 -0
  252. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/setup.cfg +0 -0
  253. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/setup.py +0 -0
  254. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_accuracy.py +0 -0
  255. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_accuracy_length.py +0 -0
  256. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_adapters_e2e.py +0 -0
  257. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_agent_orchestrator.py +0 -0
  258. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_agent_resources.py +0 -0
  259. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_auth.py +0 -0
  260. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_batch_evaluation.py +0 -0
  261. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cli.py +0 -0
  262. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cli_agent.py +0 -0
  263. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cli_args.py +0 -0
  264. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_code_execution.py +0 -0
  265. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_config.py +0 -0
  266. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_control_plane_separation.py +0 -0
  267. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cpp_code.py +0 -0
  268. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_data_driven_task_manager.py +0 -0
  269. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_deepcoder_reward.py +0 -0
  270. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_deepeval_integration.py +0 -0
  271. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_deploy_integration.py +0 -0
  272. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_directory_utils.py +0 -0
  273. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_e2b_integration.py +0 -0
  274. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_e2b_js_integration.py +0 -0
  275. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_edge_cases.py +0 -0
  276. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_eval_protocol_import.py +0 -0
  277. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation.py +0 -0
  278. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation_integration.py +0 -0
  279. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation_postprocess.py +0 -0
  280. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation_preview_integration.py +0 -0
  281. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_event_bus.py +0 -0
  282. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_event_bus_helper.py +0 -0
  283. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_examples_end_to_end.py +0 -0
  284. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_fireworks_api.py +0 -0
  285. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_format.py +0 -0
  286. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_fractional_code.py +0 -0
  287. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_function_calling.py +0 -0
  288. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_gcp_tools.py +0 -0
  289. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_generic_server.py +0 -0
  290. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_human_id.py +0 -0
  291. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_integration.py +0 -0
  292. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_json_schema.py +0 -0
  293. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_kwargs_validation.py +0 -0
  294. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_language_consistency.py +0 -0
  295. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_lean_prover.py +0 -0
  296. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_lean_prover_runner.py +0 -0
  297. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_length.py +0 -0
  298. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_list_comparison_math_reward.py +0 -0
  299. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_logs_server.py +0 -0
  300. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_logs_server_simple.py +0 -0
  301. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_math.py +0 -0
  302. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_minimal.py +0 -0
  303. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_models.py +0 -0
  304. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_models_rl.py +0 -0
  305. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_multiple_choice_math_reward.py +0 -0
  306. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_n_variant_batch_integration.py +0 -0
  307. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_n_variant_integration.py +0 -0
  308. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_openai_compatibility.py +0 -0
  309. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_openeval_integration.py +0 -0
  310. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_packaging.py +0 -0
  311. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_parallel_rollouts.py +0 -0
  312. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_platform_api.py +0 -0
  313. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_quickstart_utils.py +0 -0
  314. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_readiness.py +0 -0
  315. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_reasoning_steps.py +0 -0
  316. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_repetition.py +0 -0
  317. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_repetition_debug.py +0 -0
  318. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_retry_mechanism.py +0 -0
  319. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_reward_function.py +0 -0
  320. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_reward_protocol_import.py +0 -0
  321. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_rl_processing.py +0 -0
  322. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_rollout_control_plane_integration.py +0 -0
  323. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_server.py +0 -0
  324. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_status_migration_changes.py +0 -0
  325. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_status_migration_integration.py +0 -0
  326. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_status_model.py +0 -0
  327. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_tag_count.py +0 -0
  328. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_tau_bench_airline_smoke.py +0 -0
  329. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_typed_interface.py +0 -0
  330. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_typed_interface_rl.py +0 -0
  331. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_upload_entrypoint.py +0 -0
  332. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_url_handling.py +0 -0
  333. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_vite_server.py +0 -0
  334. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/__init__.py +0 -0
  335. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/agent/__init__.py +0 -0
  336. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/agent/base.py +0 -0
  337. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/agent/llm_agent.py +0 -0
  338. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/__init__.py +0 -0
  339. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/api_config.py +0 -0
  340. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/data_model.py +0 -0
  341. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/simulation_service.py +0 -0
  342. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/cli.py +0 -0
  343. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/config.py +0 -0
  344. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/airline/policy.md +0 -0
  345. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/mock/policy.md +0 -0
  346. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  347. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/retail/policy.md +0 -0
  348. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  349. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  350. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  351. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  352. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  353. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  354. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  355. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/__init__.py +0 -0
  356. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/message.py +0 -0
  357. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/simulation.py +0 -0
  358. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/tasks.py +0 -0
  359. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/__init__.py +0 -0
  360. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/__init__.py +0 -0
  361. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/data_model.py +0 -0
  362. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/environment.py +0 -0
  363. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/tools.py +0 -0
  364. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/utils.py +0 -0
  365. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/__init__.py +0 -0
  366. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/data_model.py +0 -0
  367. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/environment.py +0 -0
  368. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/tools.py +0 -0
  369. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/utils.py +0 -0
  370. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/__init__.py +0 -0
  371. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/data_model.py +0 -0
  372. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/environment.py +0 -0
  373. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/tools.py +0 -0
  374. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/utils.py +0 -0
  375. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/__init__.py +0 -0
  376. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/data_model.py +0 -0
  377. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/environment.py +0 -0
  378. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  379. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  380. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  381. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  382. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  383. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  384. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  385. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  386. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tools.py +0 -0
  387. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  388. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  389. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/utils.py +0 -0
  390. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/__init__.py +0 -0
  391. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/db.py +0 -0
  392. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/environment.py +0 -0
  393. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/server.py +0 -0
  394. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/tool.py +0 -0
  395. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/toolkit.py +0 -0
  396. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  397. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/__init__.py +0 -0
  398. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator.py +0 -0
  399. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  400. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  401. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  402. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  403. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  404. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/metrics/__init__.py +0 -0
  405. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/metrics/agent_metrics.py +0 -0
  406. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  407. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/__init__.py +0 -0
  408. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  409. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  410. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/utils.py +0 -0
  411. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/registry.py +0 -0
  412. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/run.py +0 -0
  413. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/__init__.py +0 -0
  414. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/check_data.py +0 -0
  415. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  416. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/start_servers.py +0 -0
  417. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/view_simulations.py +0 -0
  418. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/user/__init__.py +0 -0
  419. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/user/base.py +0 -0
  420. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/user/user_simulator.py +0 -0
  421. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/__init__.py +0 -0
  422. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/display.py +0 -0
  423. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/io_utils.py +0 -0
  424. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/llm_utils.py +0 -0
  425. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/pydantic_utils.py +0 -0
  426. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/utils.py +0 -0
  427. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/versioneer.py +0 -0
  428. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  429. {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.51.dev0
3
+ Version: 0.2.52
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -31,10 +31,14 @@ from .reward_function import RewardFunction
31
31
  from .typed_interface import reward_function
32
32
  from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
33
33
  from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
34
+ from .pytest.remote_rollout_processor import create_elasticsearch_config_from_env
34
35
  from .pytest.parameterize import DefaultParameterIdGenerator
35
36
  from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
36
37
  from .log_utils.rollout_id_filter import RolloutIdFilter
37
38
  from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
39
+ from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
40
+ from .log_utils.elasticsearch_client import ElasticsearchConfig
41
+
38
42
 
39
43
  from .types.remote_rollout_processor import (
40
44
  InitRequest,
@@ -70,16 +74,11 @@ try:
70
74
  except ImportError:
71
75
  WeaveAdapter = None
72
76
 
73
- try:
74
- from .proxy import create_app, AuthProvider
75
- except ImportError:
76
- create_app = None
77
- AuthProvider = None
78
-
79
-
80
77
  warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
81
78
 
82
79
  __all__ = [
80
+ "create_elasticsearch_config_from_env",
81
+ "ElasticsearchConfig",
83
82
  "ElasticsearchDirectHttpHandler",
84
83
  "RolloutIdFilter",
85
84
  "setup_rollout_logging_for_elasticsearch_handler",
@@ -102,6 +101,7 @@ __all__ = [
102
101
  "BraintrustAdapter",
103
102
  "create_braintrust_adapter",
104
103
  "LangSmithAdapter",
104
+ "FireworksTracingHttpHandler",
105
105
  # Core interfaces
106
106
  "Message",
107
107
  "MetricResult",
@@ -137,9 +137,6 @@ __all__ = [
137
137
  "RolloutMetadata",
138
138
  "StatusResponse",
139
139
  "create_langfuse_config_tags",
140
- # Proxy
141
- "create_app",
142
- "AuthProvider",
143
140
  ]
144
141
 
145
142
  from . import _version
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-10-10T21:44:48-0700",
11
+ "date": "2025-10-13T00:31:45-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "e753a0be05950541cdf51cd66ad22190ec1b1571",
15
- "version": "0.2.51-dev"
14
+ "full-revisionid": "270a91e21f730169bd3ff7f94c44f8c0502ace33",
15
+ "version": "0.2.52"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
7
7
  from __future__ import annotations
8
8
  import logging
9
9
  import requests
10
+ import time
10
11
  from datetime import datetime
11
12
  from typing import Any, Dict, List, Optional, Protocol
12
- import os
13
13
 
14
14
  from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
15
15
  from .base import BaseAdapter
@@ -343,17 +343,15 @@ class FireworksTracingAdapter(BaseAdapter):
343
343
  # Remove None values
344
344
  params = {k: v for k, v in params.items() if v is not None}
345
345
 
346
- # Make request to proxy (using pointwise for efficiency)
346
+ # Make request to proxy
347
347
  if self.project_id:
348
- url = f"{self.base_url}/v1/project_id/{self.project_id}/traces/pointwise"
348
+ url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
349
349
  else:
350
- url = f"{self.base_url}/v1/traces/pointwise"
351
-
352
- headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
350
+ url = f"{self.base_url}/v1/traces"
353
351
 
354
352
  result = None
355
353
  try:
356
- response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
354
+ response = requests.get(url, params=params, timeout=self.timeout)
357
355
  response.raise_for_status()
358
356
  result = response.json()
359
357
  except requests.exceptions.HTTPError as e:
@@ -367,7 +365,7 @@ class FireworksTracingAdapter(BaseAdapter):
367
365
  except Exception: # In case e.response.json() fails
368
366
  error_msg = f"Proxy error: {e.response.text}"
369
367
 
370
- logger.error("Failed to fetch traces from proxy (HTTP %s): %s", e.response.status_code, error_msg)
368
+ logger.error("Failed to fetch traces from proxy: %s", error_msg)
371
369
  return eval_rows
372
370
  except requests.exceptions.RequestException as e:
373
371
  # Non-HTTP errors (network issues, timeouts, etc.)
@@ -169,7 +169,9 @@ class OpenAIResponsesAdapter(BaseAdapter):
169
169
  raise NotImplementedError(f"Unsupported content type: {content_item.type}")
170
170
  elif item.type == "function_call_output":
171
171
  # Collect tool call outputs to add before assistant message
172
- tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
172
+ tool_call_outputs.append(
173
+ Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
174
+ )
173
175
  elif item.type == "function_call":
174
176
  tool_call = ChatCompletionMessageToolCall(
175
177
  id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
@@ -186,3 +188,29 @@ class OpenAIResponsesAdapter(BaseAdapter):
186
188
  messages.append(Message(role="assistant", tool_calls=current_tool_calls))
187
189
 
188
190
  return reversed(messages)
191
+
192
+ def _coerce_tool_output(self, output: Any) -> str:
193
+ """Coerce OpenAI Responses tool output into a string for Message.content.
194
+
195
+ The Responses API may return structured content lists. For our purposes,
196
+ we stringify non-string outputs to satisfy the Message.content type.
197
+ """
198
+ if isinstance(output, str):
199
+ return output
200
+ try:
201
+ # Attempt to join list of objects with any 'text' fields
202
+ if isinstance(output, list):
203
+ parts: list[str] = []
204
+ for part in output:
205
+ text = None
206
+ if isinstance(part, dict):
207
+ text = part.get("text")
208
+ if text:
209
+ parts.append(str(text))
210
+ else:
211
+ parts.append(str(part))
212
+ return "\n".join(parts)
213
+ # Fallback to string conversion
214
+ return str(output)
215
+ except Exception:
216
+ return str(output)
@@ -4,6 +4,8 @@ import os
4
4
  from pathlib import Path
5
5
  from typing import Dict, Optional # Added Dict
6
6
 
7
+ import requests
8
+
7
9
  logger = logging.getLogger(__name__)
8
10
 
9
11
  # Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
@@ -218,3 +220,40 @@ def get_fireworks_api_base() -> str:
218
220
  else:
219
221
  logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
220
222
  return api_base
223
+
224
+
225
+ def verify_api_key_and_get_account_id(
226
+ api_key: Optional[str] = None,
227
+ api_base: Optional[str] = None,
228
+ ) -> Optional[str]:
229
+ """
230
+ Calls the Fireworks API verify endpoint to validate the API key and returns the
231
+ account id from response headers when available.
232
+
233
+ Args:
234
+ api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
235
+ api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
236
+
237
+ Returns:
238
+ The resolved account id if verification succeeds and the header is present; otherwise None.
239
+ """
240
+ try:
241
+ resolved_key = api_key or get_fireworks_api_key()
242
+ if not resolved_key:
243
+ return None
244
+ resolved_base = api_base or get_fireworks_api_base()
245
+ url = f"{resolved_base.rstrip('/')}/verifyApiKey"
246
+ headers = {"Authorization": f"Bearer {resolved_key}"}
247
+ resp = requests.get(url, headers=headers, timeout=10)
248
+ if resp.status_code != 200:
249
+ logger.debug("verifyApiKey returned status %s", resp.status_code)
250
+ return None
251
+ # Header keys could vary in case; requests provides case-insensitive dict
252
+ account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
253
+ if account_id and account_id.strip():
254
+ logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
255
+ return account_id.strip()
256
+ return None
257
+ except Exception as e:
258
+ logger.debug("Failed to verify API key for account id resolution: %s", e)
259
+ return None
@@ -301,6 +301,12 @@ def parse_args(args=None):
301
301
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
302
302
  logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
303
303
  logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
304
+ logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
305
+ logs_parser.add_argument(
306
+ "--use-env-elasticsearch-config",
307
+ action="store_true",
308
+ help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
309
+ )
304
310
 
305
311
  # Upload command
306
312
  upload_parser = subparsers.add_parser(
@@ -0,0 +1,76 @@
1
+ """
2
+ CLI command for serving logs with file watching and real-time updates.
3
+ """
4
+
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from ..utils.logs_server import serve_logs
9
+
10
+
11
+ def logs_command(args):
12
+ """Serve logs with file watching and real-time updates"""
13
+
14
+ port = args.port
15
+ print("🚀 Starting Eval Protocol Logs Server")
16
+ print(f"🌐 URL: http://localhost:{port}")
17
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
18
+ print(f"👀 Watching paths: {['current directory']}")
19
+ print(f"🔍 Debug mode: {args.debug}")
20
+ print("Press Ctrl+C to stop the server")
21
+ print("-" * 50)
22
+
23
+ # Setup Elasticsearch based on flags
24
+ elasticsearch_config = None
25
+ try:
26
+ if getattr(args, "use_env_elasticsearch_config", False):
27
+ # Use environment variables for configuration
28
+ print("⚙️ Using environment variables for Elasticsearch config")
29
+ from eval_protocol.pytest.remote_rollout_processor import (
30
+ create_elasticsearch_config_from_env,
31
+ )
32
+
33
+ elasticsearch_config = create_elasticsearch_config_from_env()
34
+ # Ensure index exists with correct mapping, mirroring Docker setup path
35
+ try:
36
+ from eval_protocol.log_utils.elasticsearch_index_manager import (
37
+ ElasticsearchIndexManager,
38
+ )
39
+
40
+ index_manager = ElasticsearchIndexManager(
41
+ elasticsearch_config.url,
42
+ elasticsearch_config.index_name,
43
+ elasticsearch_config.api_key,
44
+ )
45
+ created = index_manager.create_logging_index_mapping()
46
+ if created:
47
+ print(
48
+ f"🧭 Verified Elasticsearch index '{elasticsearch_config.index_name}' mapping (created or already correct)"
49
+ )
50
+ else:
51
+ print(
52
+ f"⚠️ Could not verify/create mapping for index '{elasticsearch_config.index_name}'. Searches may behave unexpectedly."
53
+ )
54
+ except Exception as e:
55
+ print(f"⚠️ Failed to ensure index mapping via IndexManager: {e}")
56
+ elif not getattr(args, "disable_elasticsearch_setup", False):
57
+ # Default behavior: start or connect to local Elasticsearch via Docker helper
58
+ from eval_protocol.pytest.elasticsearch_setup import ElasticsearchSetup
59
+
60
+ print("🧰 Auto-configuring local Elasticsearch (Docker)")
61
+ elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
62
+ else:
63
+ print("🚫 Elasticsearch setup disabled; running without Elasticsearch integration")
64
+ except Exception as e:
65
+ print(f"❌ Failed to configure Elasticsearch: {e}")
66
+ return 1
67
+
68
+ try:
69
+ serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
70
+ return 0
71
+ except KeyboardInterrupt:
72
+ print("\n🛑 Server stopped by user")
73
+ return 0
74
+ except Exception as e:
75
+ print(f"❌ Error starting server: {e}")
76
+ return 1
@@ -12,7 +12,12 @@ from pathlib import Path
12
12
  from typing import Any, Callable, Iterable, Optional
13
13
 
14
14
  import pytest
15
- from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
15
+ from eval_protocol.auth import (
16
+ get_fireworks_account_id,
17
+ get_fireworks_api_key,
18
+ get_fireworks_api_base,
19
+ verify_api_key_and_get_account_id,
20
+ )
16
21
  from eval_protocol.platform_api import create_or_update_fireworks_secret
17
22
 
18
23
  from eval_protocol.evaluation import create_evaluation
@@ -259,7 +264,7 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
259
264
  raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")
260
265
 
261
266
 
262
- def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str, str]:
267
+ def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
263
268
  target, func = _parse_entry(entry, cwd)
264
269
 
265
270
  # Check if target looks like a file path
@@ -293,47 +298,12 @@ def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, s
293
298
  raise ValueError(f"Function '{func}' not found in module '{module_name}'")
294
299
 
295
300
  qualname = f"{module_name}.{func}"
296
- code, file_name = _generate_ts_mode_code(
297
- DiscoveredTest(
298
- module_path=module_name,
299
- module_name=module_name,
300
- qualname=qualname,
301
- file_path=getattr(module, "__file__", module_name),
302
- lineno=None,
303
- has_parametrize=False,
304
- param_count=0,
305
- nodeids=[],
306
- )
307
- )
308
- return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
301
+ return qualname, os.path.abspath(source_file_path) if source_file_path else ""
309
302
 
310
303
 
311
304
  def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
312
- # Generate a minimal main.py that imports the test module and calls the function
313
- module = test.module_name
314
- func = test.qualname.split(".")[-1]
315
- code = f"""
316
- from typing import Any, Dict, List, Optional, Union
317
-
318
- from eval_protocol.models import EvaluationRow, Message
319
- from {module} import {func} as _ep_test
320
-
321
- def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
322
- row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
323
- result = _ep_test(row) # Supports sync/async via decorator's dual-mode
324
- if hasattr(result, "__await__"):
325
- import asyncio
326
- result = asyncio.get_event_loop().run_until_complete(result)
327
- if result.evaluation_result is None:
328
- return {{"score": 0.0, "reason": "No evaluation_result set"}}
329
- out = {{
330
- "score": float(result.evaluation_result.score or 0.0),
331
- "reason": result.evaluation_result.reason,
332
- "metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
333
- }}
334
- return out
335
- """
336
- return (code, "main.py")
305
+ # Deprecated: we no longer generate a shim; keep stub for import compatibility
306
+ return ("", "main.py")
337
307
 
338
308
 
339
309
  def _normalize_evaluator_id(evaluator_id: str) -> str:
@@ -522,10 +492,10 @@ def upload_command(args: argparse.Namespace) -> int:
522
492
  entries_arg = getattr(args, "entry", None)
523
493
  if entries_arg:
524
494
  entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
525
- selected_specs: list[tuple[str, str, str, str]] = []
495
+ selected_specs: list[tuple[str, str]] = []
526
496
  for e in entries:
527
- code, file_name, qualname, resolved_path = _generate_ts_mode_code_from_entry(e, root)
528
- selected_specs.append((code, file_name, qualname, resolved_path))
497
+ qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
498
+ selected_specs.append((qualname, resolved_path))
529
499
  else:
530
500
  print("Scanning for evaluation tests...")
531
501
  tests = _discover_tests(root)
@@ -545,11 +515,7 @@ def upload_command(args: argparse.Namespace) -> int:
545
515
  print(" handles all parameter combinations. The evaluator will work with")
546
516
  print(" the same logic regardless of which model/parameters are used.")
547
517
 
548
- selected_specs = []
549
- for t in selected_tests:
550
- code, file_name = _generate_ts_mode_code(t)
551
- # Store test info for better ID generation
552
- selected_specs.append((code, file_name, t.qualname, t.file_path))
518
+ selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
553
519
 
554
520
  base_id = getattr(args, "id", None)
555
521
  display_name = getattr(args, "display_name", None)
@@ -560,6 +526,14 @@ def upload_command(args: argparse.Namespace) -> int:
560
526
  try:
561
527
  fw_account_id = get_fireworks_account_id()
562
528
  fw_api_key_value = get_fireworks_api_key()
529
+ if not fw_account_id and fw_api_key_value:
530
+ # Attempt to verify and resolve account id from server headers
531
+ resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
532
+ if resolved:
533
+ fw_account_id = resolved
534
+ # Propagate to environment so downstream calls use it if needed
535
+ os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
536
+ print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
563
537
  if fw_account_id and fw_api_key_value:
564
538
  print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
565
539
  if create_or_update_fireworks_secret(
@@ -579,8 +553,7 @@ def upload_command(args: argparse.Namespace) -> int:
579
553
  print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
580
554
 
581
555
  exit_code = 0
582
- for i, (code, file_name, qualname, source_file_path) in enumerate(selected_specs):
583
- # Use ts_mode to upload evaluator
556
+ for i, (qualname, source_file_path) in enumerate(selected_specs):
584
557
  # Generate a short default ID from just the test function name
585
558
  if base_id:
586
559
  evaluator_id = base_id
@@ -618,12 +591,12 @@ def upload_command(args: argparse.Namespace) -> int:
618
591
 
619
592
  print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
620
593
  try:
594
+ # Always treat as a single evaluator (single-metric) even if folder has helper modules
595
+ test_dir = os.path.dirname(source_file_path) if source_file_path else root
596
+ metric_name = os.path.basename(test_dir) or "metric"
621
597
  result = create_evaluation(
622
598
  evaluator_id=evaluator_id,
623
- python_code_to_evaluate=code,
624
- python_file_name_for_code=file_name,
625
- criterion_name_for_code=qualname,
626
- criterion_description_for_code=description or f"Evaluator for {qualname}",
599
+ metric_folders=[f"{metric_name}={test_dir}"],
627
600
  display_name=display_name or evaluator_id,
628
601
  description=description or f"Evaluator for {qualname}",
629
602
  force=force,