eval-protocol 0.2.55.dev1__tar.gz → 0.2.57.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (443)
  1. {eval_protocol-0.2.55.dev1/eval_protocol.egg-info → eval_protocol-0.2.57.dev2}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/__init__.py +14 -2
  3. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/fireworks_tracing.py +49 -0
  5. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/openai_responses.py +29 -1
  6. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/auth.py +39 -0
  7. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli.py +16 -0
  8. eval_protocol-0.2.57.dev2/eval_protocol/cli_commands/logs.py +57 -0
  9. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/upload.py +40 -71
  10. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/evaluation.py +125 -40
  11. eval_protocol-0.2.57.dev2/eval_protocol/event_bus/__init__.py +25 -0
  12. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +11 -0
  13. eval_protocol-0.2.57.dev2/eval_protocol/log_utils/fireworks_tracing_http_handler.py +138 -0
  14. eval_protocol-0.2.57.dev2/eval_protocol/log_utils/init.py +69 -0
  15. eval_protocol-0.2.57.dev2/eval_protocol/log_utils/rollout_context.py +84 -0
  16. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/policy.py +18 -6
  17. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/models.py +3 -1
  18. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +11 -2
  19. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/__init__.py +2 -0
  20. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +19 -6
  21. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +23 -11
  22. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/evaluation_test.py +71 -16
  23. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +6 -1
  24. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +1 -1
  25. eval_protocol-0.2.57.dev2/eval_protocol/pytest/github_action_rollout_processor.py +223 -0
  26. eval_protocol-0.2.57.dev2/eval_protocol/pytest/remote_rollout_processor.py +207 -0
  27. eval_protocol-0.2.57.dev2/eval_protocol/pytest/tracing_utils.py +165 -0
  28. eval_protocol-0.2.57.dev2/eval_protocol/quickstart/__init__.py +8 -0
  29. eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/__init__.py +4 -0
  30. eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/llm_judge.py +90 -0
  31. eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +63 -0
  32. {eval_protocol-0.2.55.dev1/eval_protocol/quickstart → eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge}/llm_judge_langfuse.py +1 -3
  33. eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/utils.py +133 -0
  34. eval_protocol-0.2.57.dev2/eval_protocol/utils/browser_utils.py +114 -0
  35. eval_protocol-0.2.57.dev2/eval_protocol/utils/evaluation_row_utils.py +136 -0
  36. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/logs_server.py +87 -6
  37. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
  38. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/SOURCES.txt +18 -7
  39. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_postprocess.py +1 -2
  40. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_quickstart_utils.py +1 -1
  41. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_show_results_url.py +141 -0
  42. eval_protocol-0.2.57.dev2/vite-app/dist/assets/index-BnDJont9.css +1 -0
  43. eval_protocol-0.2.57.dev2/vite-app/dist/assets/index-Cu9t0G5i.js +137 -0
  44. eval_protocol-0.2.57.dev2/vite-app/dist/assets/index-Cu9t0G5i.js.map +1 -0
  45. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vite-app/dist/index.html +2 -2
  46. eval_protocol-0.2.55.dev1/eval_protocol/cli_commands/logs.py +0 -36
  47. eval_protocol-0.2.55.dev1/eval_protocol/event_bus/__init__.py +0 -5
  48. eval_protocol-0.2.55.dev1/eval_protocol/pytest/remote_rollout_processor.py +0 -364
  49. eval_protocol-0.2.55.dev1/eval_protocol/quickstart/__init__.py +0 -4
  50. eval_protocol-0.2.55.dev1/vite-app/dist/assets/index-C81y9r9l.js +0 -136
  51. eval_protocol-0.2.55.dev1/vite-app/dist/assets/index-C81y9r9l.js.map +0 -1
  52. eval_protocol-0.2.55.dev1/vite-app/dist/assets/index-DpYZaoAr.css +0 -1
  53. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/LICENSE +0 -0
  54. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/README.md +0 -0
  55. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/__init__.py +0 -0
  56. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/normalize_sandbox_fusion.py +0 -0
  57. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/utils/__init__.py +0 -0
  58. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/utils/generate_api_key.py +0 -0
  59. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/utils/subprocess_manager.py +0 -0
  60. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/__main__.py +0 -0
  61. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/__init__.py +0 -0
  62. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/base.py +0 -0
  63. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/bigquery.py +0 -0
  64. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/braintrust.py +0 -0
  65. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/huggingface.py +0 -0
  66. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langchain.py +0 -0
  67. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langfuse.py +0 -0
  68. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langsmith.py +0 -0
  69. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/trl.py +0 -0
  70. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/utils.py +0 -0
  71. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/weave.py +0 -0
  72. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/__init__.py +0 -0
  73. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/models.py +0 -0
  74. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/orchestrator.py +0 -0
  75. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resource_abc.py +0 -0
  76. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resource_pool.py +0 -0
  77. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
  78. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  79. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  80. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  81. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  82. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  83. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
  84. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  85. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  86. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
  87. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/task_manager.py +0 -0
  88. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/tool_registry.py +0 -0
  89. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
  90. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  91. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  92. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
  93. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  94. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  95. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  96. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  97. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  98. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
  99. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  100. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/common.py +0 -0
  101. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
  102. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  103. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/preview.py +0 -0
  104. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  105. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/common_utils.py +0 -0
  106. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/config.py +0 -0
  107. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/__init__.py +0 -0
  108. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  109. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  110. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  111. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/models.py +0 -0
  112. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
  113. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  114. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  115. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  116. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  117. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/datasets/__init__.py +0 -0
  118. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/datasets/loader.py +0 -0
  119. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/directory_utils.py +0 -0
  120. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
  121. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/logger.py +0 -0
  122. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  123. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  124. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/execution/__init__.py +0 -0
  125. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/execution/pipeline.py +0 -0
  126. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/gcp_tools.py +0 -0
  127. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/cache.py +0 -0
  128. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/clients/base.py +0 -0
  129. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/clients.py +0 -0
  130. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generic_server.py +0 -0
  131. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/get_pep440_version.py +0 -0
  132. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/human_id/__init__.py +0 -0
  133. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/human_id/dictionary.py +0 -0
  134. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/__init__.py +0 -0
  135. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/deepeval.py +0 -0
  136. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/openeval.py +0 -0
  137. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/trl.py +0 -0
  138. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/__init__.py +0 -0
  139. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  140. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  141. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  142. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/util.py +0 -0
  143. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/logging_utils.py +0 -0
  144. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/__init__.py +0 -0
  145. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/adapter.py +0 -0
  146. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
  147. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/client/connection.py +0 -0
  148. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/clients.py +0 -0
  149. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
  150. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
  151. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/manager.py +0 -0
  152. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
  153. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  154. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
  155. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/process_manager.py +0 -0
  156. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
  157. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/session/manager.py +0 -0
  158. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
  159. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
  160. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
  161. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/config.py +0 -0
  162. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/main.py +0 -0
  163. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  164. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  165. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  166. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  167. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_env.py +0 -0
  168. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
  169. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  170. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  171. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  172. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  173. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  174. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  175. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  176. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  177. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  178. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  179. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  180. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  181. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  182. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  183. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/packaging.py +0 -0
  184. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/platform_api.py +0 -0
  185. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/playback_policy.py +0 -0
  186. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/__init__.py +0 -0
  187. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  188. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
  189. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  190. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  191. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  192. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
  193. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
  194. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  195. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  196. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  197. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  198. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  199. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  200. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  201. eval_protocol-0.2.55.dev1/eval_protocol/pytest/utils.py → eval_protocol-0.2.57.dev2/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  202. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/exception_config.py +0 -0
  203. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/execution.py +0 -0
  204. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  205. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/parameterize.py +0 -0
  206. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/plugin.py +0 -0
  207. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
  208. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
  209. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
  210. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/types.py +0 -0
  211. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
  212. {eval_protocol-0.2.55.dev1/eval_protocol/quickstart → eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge}/llm_judge_langsmith.py +1 -1
  213. {eval_protocol-0.2.55.dev1/eval_protocol/quickstart → eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge}/llm_judge_openai_responses.py +1 -1
  214. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
  215. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  216. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/utils.py +0 -0
  217. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/resources.py +0 -0
  218. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/reward_function.py +0 -0
  219. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/__init__.py +0 -0
  220. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/accuracy.py +0 -0
  221. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
  222. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  223. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  224. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
  225. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
  226. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/code_execution.py +0 -0
  227. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
  228. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
  229. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  230. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/format.py +0 -0
  231. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/function_calling.py +0 -0
  232. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/json_schema.py +0 -0
  233. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
  234. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
  235. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/length.py +0 -0
  236. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  237. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/math.py +0 -0
  238. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  239. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
  240. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/repetition.py +0 -0
  241. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/tag_count.py +0 -0
  242. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rl_processing.py +0 -0
  243. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/server.py +0 -0
  244. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/stats/__init__.py +0 -0
  245. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
  246. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/typed_interface.py +0 -0
  247. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/__init__.py +0 -0
  248. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/errors.py +0 -0
  249. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
  250. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/types.py +0 -0
  251. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/__init__.py +0 -0
  252. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
  253. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
  254. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/check_server_status.py +0 -0
  255. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
  256. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/logs_models.py +0 -0
  257. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/module_loader.py +0 -0
  258. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
  259. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/show_results_url.py +0 -0
  260. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/static_policy.py +0 -0
  261. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
  262. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/vite_server.py +0 -0
  263. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
  264. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
  265. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/requires.txt +0 -0
  266. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
  267. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/pyproject.toml +0 -0
  268. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/setup.cfg +0 -0
  269. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/setup.py +0 -0
  270. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_accuracy.py +0 -0
  271. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_accuracy_length.py +0 -0
  272. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_adapters_e2e.py +0 -0
  273. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_agent_orchestrator.py +0 -0
  274. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_agent_resources.py +0 -0
  275. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_auth.py +0 -0
  276. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_batch_evaluation.py +0 -0
  277. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cli.py +0 -0
  278. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cli_agent.py +0 -0
  279. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cli_args.py +0 -0
  280. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_code_execution.py +0 -0
  281. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_config.py +0 -0
  282. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_control_plane_separation.py +0 -0
  283. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cpp_code.py +0 -0
  284. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_data_driven_task_manager.py +0 -0
  285. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_deepcoder_reward.py +0 -0
  286. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_deepeval_integration.py +0 -0
  287. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_deploy_integration.py +0 -0
  288. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_directory_utils.py +0 -0
  289. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_e2b_integration.py +0 -0
  290. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_e2b_js_integration.py +0 -0
  291. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_edge_cases.py +0 -0
  292. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_eval_protocol_import.py +0 -0
  293. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation.py +0 -0
  294. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_integration.py +0 -0
  295. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_preview_integration.py +0 -0
  296. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_event_bus.py +0 -0
  297. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_event_bus_helper.py +0 -0
  298. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_examples_end_to_end.py +0 -0
  299. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_fireworks_api.py +0 -0
  300. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_format.py +0 -0
  301. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_fractional_code.py +0 -0
  302. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_function_calling.py +0 -0
  303. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_gcp_tools.py +0 -0
  304. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_generic_server.py +0 -0
  305. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_human_id.py +0 -0
  306. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_integration.py +0 -0
  307. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_json_schema.py +0 -0
  308. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_kwargs_validation.py +0 -0
  309. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_language_consistency.py +0 -0
  310. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_lean_prover.py +0 -0
  311. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_lean_prover_runner.py +0 -0
  312. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_length.py +0 -0
  313. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_list_comparison_math_reward.py +0 -0
  314. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_logs_server.py +0 -0
  315. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_logs_server_simple.py +0 -0
  316. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_math.py +0 -0
  317. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_minimal.py +0 -0
  318. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_models.py +0 -0
  319. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_models_rl.py +0 -0
  320. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
  321. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_n_variant_batch_integration.py +0 -0
  322. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_n_variant_integration.py +0 -0
  323. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_openai_compatibility.py +0 -0
  324. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_openeval_integration.py +0 -0
  325. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_packaging.py +0 -0
  326. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_parallel_rollouts.py +0 -0
  327. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_platform_api.py +0 -0
  328. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_readiness.py +0 -0
  329. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_reasoning_steps.py +0 -0
  330. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_repetition.py +0 -0
  331. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_repetition_debug.py +0 -0
  332. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_retry_mechanism.py +0 -0
  333. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_reward_function.py +0 -0
  334. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_reward_protocol_import.py +0 -0
  335. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_rl_processing.py +0 -0
  336. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
  337. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_server.py +0 -0
  338. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_status_migration_changes.py +0 -0
  339. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_status_migration_integration.py +0 -0
  340. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_status_model.py +0 -0
  341. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_tag_count.py +0 -0
  342. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
  343. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_typed_interface.py +0 -0
  344. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_typed_interface_rl.py +0 -0
  345. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_upload_entrypoint.py +0 -0
  346. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_url_handling.py +0 -0
  347. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_vite_server.py +0 -0
  348. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/__init__.py +0 -0
  349. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/__init__.py +0 -0
  350. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/base.py +0 -0
  351. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
  352. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/__init__.py +0 -0
  353. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/api_config.py +0 -0
  354. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/data_model.py +0 -0
  355. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
  356. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/cli.py +0 -0
  357. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/config.py +0 -0
  358. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
  359. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
  360. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  361. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
  362. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  363. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  364. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  365. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  366. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  367. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  368. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  369. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/__init__.py +0 -0
  370. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/message.py +0 -0
  371. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/simulation.py +0 -0
  372. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/tasks.py +0 -0
  373. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/__init__.py +0 -0
  374. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
  375. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
  376. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
  377. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
  378. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
  379. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
  380. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
  381. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
  382. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
  383. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
  384. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
  385. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
  386. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
  387. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
  388. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
  389. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
  390. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
  391. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
  392. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  393. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  394. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  395. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  396. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  397. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  398. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  399. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  400. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
  401. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  402. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  403. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
  404. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/__init__.py +0 -0
  405. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/db.py +0 -0
  406. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/environment.py +0 -0
  407. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/server.py +0 -0
  408. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/tool.py +0 -0
  409. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/toolkit.py +0 -0
  410. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  411. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
  412. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
  413. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  414. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  415. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  416. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  417. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  418. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/__init__.py +0 -0
  419. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
  420. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  421. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
  422. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  423. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  424. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
  425. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/registry.py +0 -0
  426. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/run.py +0 -0
  427. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/__init__.py +0 -0
  428. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/check_data.py +0 -0
  429. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  430. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
  431. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
  432. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/__init__.py +0 -0
  433. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/base.py +0 -0
  434. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/user_simulator.py +0 -0
  435. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/__init__.py +0 -0
  436. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/display.py +0 -0
  437. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/io_utils.py +0 -0
  438. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
  439. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
  440. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/utils.py +0 -0
  441. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/versioneer.py +0 -0
  442. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  443. {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.55.dev1
3
+ Version: 0.2.57.dev2
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -29,12 +29,20 @@ from .playback_policy import PlaybackPolicyBase
29
29
  from .resources import create_llm_resource
30
30
  from .reward_function import RewardFunction
31
31
  from .typed_interface import reward_function
32
- from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
33
- from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
32
+ from .quickstart.aha_judge import aha_judge
33
+ from .utils.evaluation_row_utils import (
34
+ multi_turn_assistant_to_ground_truth,
35
+ assistant_to_ground_truth,
36
+ filter_longest_conversation,
37
+ )
38
+ from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor, GithubActionRolloutProcessor
34
39
  from .pytest.parameterize import DefaultParameterIdGenerator
35
40
  from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
36
41
  from .log_utils.rollout_id_filter import RolloutIdFilter
37
42
  from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
43
+ from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
44
+ from .log_utils.elasticsearch_client import ElasticsearchConfig
45
+
38
46
 
39
47
  from .types.remote_rollout_processor import (
40
48
  InitRequest,
@@ -81,12 +89,14 @@ except ImportError:
81
89
  warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
82
90
 
83
91
  __all__ = [
92
+ "ElasticsearchConfig",
84
93
  "ElasticsearchDirectHttpHandler",
85
94
  "RolloutIdFilter",
86
95
  "setup_rollout_logging_for_elasticsearch_handler",
87
96
  "DataLoaderConfig",
88
97
  "Status",
89
98
  "RemoteRolloutProcessor",
99
+ "GithubActionRolloutProcessor",
90
100
  "InputMetadata",
91
101
  "EvaluationRow",
92
102
  "DefaultParameterIdGenerator",
@@ -95,6 +105,7 @@ __all__ = [
95
105
  "aha_judge",
96
106
  "multi_turn_assistant_to_ground_truth",
97
107
  "assistant_to_ground_truth",
108
+ "filter_longest_conversation",
98
109
  "evaluation_test",
99
110
  "SingleTurnRolloutProcessor",
100
111
  "OpenAIResponsesAdapter",
@@ -103,6 +114,7 @@ __all__ = [
103
114
  "BraintrustAdapter",
104
115
  "create_braintrust_adapter",
105
116
  "LangSmithAdapter",
117
+ "FireworksTracingHttpHandler",
106
118
  # Core interfaces
107
119
  "Message",
108
120
  "MetricResult",
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-10-13T20:26:15-0700",
11
+ "date": "2025-10-21T14:44:45-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "3c516e0d466d1a1a2d501f7ca0ac6ee7f10cf017",
15
- "version": "0.2.55-dev1"
14
+ "full-revisionid": "5a0eb89e557f1362bc17acd8a02c25a072dc3092",
15
+ "version": "0.2.57-dev2"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -265,6 +265,55 @@ class FireworksTracingAdapter(BaseAdapter):
265
265
  self.base_url = base_url.rstrip("/")
266
266
  self.timeout = timeout
267
267
 
268
+ def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
269
+ """Fetch logs from Fireworks tracing gateway /logs endpoint.
270
+
271
+ Returns entries with keys: timestamp, message, severity, tags.
272
+ """
273
+ if not tags:
274
+ raise ValueError("At least one tag is required to fetch logs")
275
+
276
+ headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
277
+ params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
278
+
279
+ # Try /logs first, fall back to /v1/logs if not found
280
+ urls_to_try = [f"{self.base_url}/logs", f"{self.base_url}/v1/logs"]
281
+ data: Dict[str, Any] = {}
282
+ last_error: Optional[str] = None
283
+ for url in urls_to_try:
284
+ try:
285
+ response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
286
+ if response.status_code == 404:
287
+ # Try next variant
288
+ last_error = f"404 for {url}"
289
+ continue
290
+ response.raise_for_status()
291
+ data = response.json() or {}
292
+ break
293
+ except requests.exceptions.RequestException as e:
294
+ last_error = str(e)
295
+ continue
296
+ else:
297
+ # All attempts failed
298
+ if last_error:
299
+ logger.error("Failed to fetch logs from Fireworks (tried %s): %s", urls_to_try, last_error)
300
+ return []
301
+
302
+ entries: List[Dict[str, Any]] = data.get("entries", []) or []
303
+ # Normalize minimal shape
304
+ results: List[Dict[str, Any]] = []
305
+ for e in entries:
306
+ results.append(
307
+ {
308
+ "timestamp": e.get("timestamp"),
309
+ "message": e.get("message"),
310
+ "severity": e.get("severity", "INFO"),
311
+ "tags": e.get("tags", []),
312
+ "status": e.get("status"),
313
+ }
314
+ )
315
+ return results
316
+
268
317
  def get_evaluation_rows(
269
318
  self,
270
319
  tags: List[str],
@@ -169,7 +169,9 @@ class OpenAIResponsesAdapter(BaseAdapter):
169
169
  raise NotImplementedError(f"Unsupported content type: {content_item.type}")
170
170
  elif item.type == "function_call_output":
171
171
  # Collect tool call outputs to add before assistant message
172
- tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
172
+ tool_call_outputs.append(
173
+ Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
174
+ )
173
175
  elif item.type == "function_call":
174
176
  tool_call = ChatCompletionMessageToolCall(
175
177
  id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
@@ -186,3 +188,29 @@ class OpenAIResponsesAdapter(BaseAdapter):
186
188
  messages.append(Message(role="assistant", tool_calls=current_tool_calls))
187
189
 
188
190
  return reversed(messages)
191
+
192
+ def _coerce_tool_output(self, output: Any) -> str:
193
+ """Coerce OpenAI Responses tool output into a string for Message.content.
194
+
195
+ The Responses API may return structured content lists. For our purposes,
196
+ we stringify non-string outputs to satisfy the Message.content type.
197
+ """
198
+ if isinstance(output, str):
199
+ return output
200
+ try:
201
+ # Attempt to join list of objects with any 'text' fields
202
+ if isinstance(output, list):
203
+ parts: list[str] = []
204
+ for part in output:
205
+ text = None
206
+ if isinstance(part, dict):
207
+ text = part.get("text")
208
+ if text:
209
+ parts.append(str(text))
210
+ else:
211
+ parts.append(str(part))
212
+ return "\n".join(parts)
213
+ # Fallback to string conversion
214
+ return str(output)
215
+ except Exception:
216
+ return str(output)
@@ -4,6 +4,8 @@ import os
4
4
  from pathlib import Path
5
5
  from typing import Dict, Optional # Added Dict
6
6
 
7
+ import requests
8
+
7
9
  logger = logging.getLogger(__name__)
8
10
 
9
11
  # Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
@@ -218,3 +220,40 @@ def get_fireworks_api_base() -> str:
218
220
  else:
219
221
  logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
220
222
  return api_base
223
+
224
+
225
+ def verify_api_key_and_get_account_id(
226
+ api_key: Optional[str] = None,
227
+ api_base: Optional[str] = None,
228
+ ) -> Optional[str]:
229
+ """
230
+ Calls the Fireworks API verify endpoint to validate the API key and returns the
231
+ account id from response headers when available.
232
+
233
+ Args:
234
+ api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
235
+ api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
236
+
237
+ Returns:
238
+ The resolved account id if verification succeeds and the header is present; otherwise None.
239
+ """
240
+ try:
241
+ resolved_key = api_key or get_fireworks_api_key()
242
+ if not resolved_key:
243
+ return None
244
+ resolved_base = api_base or get_fireworks_api_base()
245
+ url = f"{resolved_base.rstrip('/')}/verifyApiKey"
246
+ headers = {"Authorization": f"Bearer {resolved_key}"}
247
+ resp = requests.get(url, headers=headers, timeout=10)
248
+ if resp.status_code != 200:
249
+ logger.debug("verifyApiKey returned status %s", resp.status_code)
250
+ return None
251
+ # Header keys could vary in case; requests provides case-insensitive dict
252
+ account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
253
+ if account_id and account_id.strip():
254
+ logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
255
+ return account_id.strip()
256
+ return None
257
+ except Exception as e:
258
+ logger.debug("Failed to verify API key for account id resolution: %s", e)
259
+ return None
@@ -301,6 +301,22 @@ def parse_args(args=None):
301
301
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
302
302
  logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
303
303
  logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
304
+ logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
305
+ logs_parser.add_argument(
306
+ "--use-env-elasticsearch-config",
307
+ action="store_true",
308
+ help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
309
+ )
310
+ logs_parser.add_argument(
311
+ "--use-fireworks",
312
+ action="store_true",
313
+ help="Force Fireworks tracing backend for logs UI (overrides env auto-detection)",
314
+ )
315
+ logs_parser.add_argument(
316
+ "--use-elasticsearch",
317
+ action="store_true",
318
+ help="Force Elasticsearch backend for logs UI (overrides env auto-detection)",
319
+ )
304
320
 
305
321
  # Upload command
306
322
  upload_parser = subparsers.add_parser(
@@ -0,0 +1,57 @@
1
+ """
2
+ CLI command for serving logs with file watching and real-time updates.
3
+ """
4
+
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import os
9
+ from ..utils.logs_server import serve_logs
10
+
11
+
12
+ def logs_command(args):
13
+ """Serve logs with file watching and real-time updates"""
14
+
15
+ port = args.port
16
+ print("🚀 Starting Eval Protocol Logs Server")
17
+ print(f"🌐 URL: http://localhost:{port}")
18
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
19
+ print(f"👀 Watching paths: {['current directory']}")
20
+ print(f"🔍 Debug mode: {args.debug}")
21
+ print("Press Ctrl+C to stop the server")
22
+ print("-" * 50)
23
+
24
+ # Backend selection: Fireworks first when API key present, unless overridden
25
+ use_fireworks = False
26
+ if getattr(args, "use_fireworks", False):
27
+ use_fireworks = True
28
+ elif getattr(args, "use_elasticsearch", False):
29
+ use_fireworks = False
30
+ else:
31
+ use_fireworks = bool(os.environ.get("FIREWORKS_API_KEY"))
32
+
33
+ # Setup backend configs
34
+ elasticsearch_config = None
35
+ # Prefer explicit FW_TRACING_GATEWAY_BASE_URL, then GATEWAY_URL from env (remote validation),
36
+ # finally default to public tracing.fireworks.ai
37
+ fireworks_base_url = (
38
+ os.environ.get("FW_TRACING_GATEWAY_BASE_URL")
39
+ or os.environ.get("GATEWAY_URL")
40
+ or "https://tracing.fireworks.ai"
41
+ )
42
+
43
+ try:
44
+ serve_logs(
45
+ port=args.port,
46
+ elasticsearch_config=elasticsearch_config,
47
+ debug=args.debug,
48
+ backend="fireworks" if use_fireworks else "elasticsearch",
49
+ fireworks_base_url=fireworks_base_url if use_fireworks else None,
50
+ )
51
+ return 0
52
+ except KeyboardInterrupt:
53
+ print("\n🛑 Server stopped by user")
54
+ return 0
55
+ except Exception as e:
56
+ print(f"❌ Error starting server: {e}")
57
+ return 1
@@ -12,7 +12,12 @@ from pathlib import Path
12
12
  from typing import Any, Callable, Iterable, Optional
13
13
 
14
14
  import pytest
15
- from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
15
+ from eval_protocol.auth import (
16
+ get_fireworks_account_id,
17
+ get_fireworks_api_key,
18
+ get_fireworks_api_base,
19
+ verify_api_key_and_get_account_id,
20
+ )
16
21
  from eval_protocol.platform_api import create_or_update_fireworks_secret
17
22
 
18
23
  from eval_protocol.evaluation import create_evaluation
@@ -259,81 +264,43 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
259
264
  raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")
260
265
 
261
266
 
262
- def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str, str]:
267
+ def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
263
268
  target, func = _parse_entry(entry, cwd)
264
269
 
265
- # Check if target looks like a file path
270
+ # Determine the file path to load
266
271
  if "/" in target or "\\" in target or os.path.exists(target):
267
- # It's a file path - convert to absolute and load as module
272
+ # It's a file path - convert to absolute
268
273
  if not os.path.isabs(target):
269
274
  target = os.path.abspath(os.path.join(cwd, target))
270
-
271
275
  if not target.endswith(".py"):
272
276
  target = target + ".py"
273
-
274
277
  if not os.path.isfile(target):
275
278
  raise ValueError(f"File not found: {target}")
276
-
277
- # Import module from file path
278
- spec = importlib.util.spec_from_file_location(Path(target).stem, target)
279
- if not spec or not spec.loader:
280
- raise ValueError(f"Unable to load module from path: {target}")
281
- module = importlib.util.module_from_spec(spec)
282
- sys.modules[spec.name] = module
283
- spec.loader.exec_module(module) # type: ignore[attr-defined]
284
- module_name = spec.name
285
279
  source_file_path = target
286
280
  else:
287
- # Treat as module path (e.g., "my_package.my_module")
288
- module_name = target
289
- module = importlib.import_module(module_name)
290
- source_file_path = getattr(module, "__file__", "") or ""
281
+ # Treat dotted name as a file path
282
+ dotted_as_path = target.replace(".", "/") + ".py"
283
+ source_file_path = os.path.join(cwd, dotted_as_path)
284
+
285
+ # Load the module from the file path
286
+ spec = importlib.util.spec_from_file_location(Path(source_file_path).stem, source_file_path)
287
+ if not spec or not spec.loader:
288
+ raise ValueError(f"Unable to load module from path: {source_file_path}")
289
+ module = importlib.util.module_from_spec(spec)
290
+ sys.modules[spec.name] = module
291
+ spec.loader.exec_module(module) # type: ignore[attr-defined]
292
+ module_name = spec.name
291
293
 
292
294
  if not hasattr(module, func):
293
295
  raise ValueError(f"Function '{func}' not found in module '{module_name}'")
294
296
 
295
297
  qualname = f"{module_name}.{func}"
296
- code, file_name = _generate_ts_mode_code(
297
- DiscoveredTest(
298
- module_path=module_name,
299
- module_name=module_name,
300
- qualname=qualname,
301
- file_path=getattr(module, "__file__", module_name),
302
- lineno=None,
303
- has_parametrize=False,
304
- param_count=0,
305
- nodeids=[],
306
- )
307
- )
308
- return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
298
+ return qualname, os.path.abspath(source_file_path) if source_file_path else ""
309
299
 
310
300
 
311
301
  def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
312
- # Generate a minimal main.py that imports the test module and calls the function
313
- module = test.module_name
314
- func = test.qualname.split(".")[-1]
315
- code = f"""
316
- from typing import Any, Dict, List, Optional, Union
317
-
318
- from eval_protocol.models import EvaluationRow, Message
319
- from {module} import {func} as _ep_test
320
-
321
- def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
322
- row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
323
- result = _ep_test(row) # Supports sync/async via decorator's dual-mode
324
- if hasattr(result, "__await__"):
325
- import asyncio
326
- result = asyncio.get_event_loop().run_until_complete(result)
327
- if result.evaluation_result is None:
328
- return {{"score": 0.0, "reason": "No evaluation_result set"}}
329
- out = {{
330
- "score": float(result.evaluation_result.score or 0.0),
331
- "reason": result.evaluation_result.reason,
332
- "metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
333
- }}
334
- return out
335
- """
336
- return (code, "main.py")
302
+ # Deprecated: we no longer generate a shim; keep stub for import compatibility
303
+ return ("", "main.py")
337
304
 
338
305
 
339
306
  def _normalize_evaluator_id(evaluator_id: str) -> str:
@@ -522,10 +489,10 @@ def upload_command(args: argparse.Namespace) -> int:
522
489
  entries_arg = getattr(args, "entry", None)
523
490
  if entries_arg:
524
491
  entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
525
- selected_specs: list[tuple[str, str, str, str]] = []
492
+ selected_specs: list[tuple[str, str]] = []
526
493
  for e in entries:
527
- code, file_name, qualname, resolved_path = _generate_ts_mode_code_from_entry(e, root)
528
- selected_specs.append((code, file_name, qualname, resolved_path))
494
+ qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
495
+ selected_specs.append((qualname, resolved_path))
529
496
  else:
530
497
  print("Scanning for evaluation tests...")
531
498
  tests = _discover_tests(root)
@@ -545,11 +512,7 @@ def upload_command(args: argparse.Namespace) -> int:
545
512
  print(" handles all parameter combinations. The evaluator will work with")
546
513
  print(" the same logic regardless of which model/parameters are used.")
547
514
 
548
- selected_specs = []
549
- for t in selected_tests:
550
- code, file_name = _generate_ts_mode_code(t)
551
- # Store test info for better ID generation
552
- selected_specs.append((code, file_name, t.qualname, t.file_path))
515
+ selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
553
516
 
554
517
  base_id = getattr(args, "id", None)
555
518
  display_name = getattr(args, "display_name", None)
@@ -560,6 +523,14 @@ def upload_command(args: argparse.Namespace) -> int:
560
523
  try:
561
524
  fw_account_id = get_fireworks_account_id()
562
525
  fw_api_key_value = get_fireworks_api_key()
526
+ if not fw_account_id and fw_api_key_value:
527
+ # Attempt to verify and resolve account id from server headers
528
+ resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
529
+ if resolved:
530
+ fw_account_id = resolved
531
+ # Propagate to environment so downstream calls use it if needed
532
+ os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
533
+ print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
563
534
  if fw_account_id and fw_api_key_value:
564
535
  print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
565
536
  if create_or_update_fireworks_secret(
@@ -579,8 +550,7 @@ def upload_command(args: argparse.Namespace) -> int:
579
550
  print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
580
551
 
581
552
  exit_code = 0
582
- for i, (code, file_name, qualname, source_file_path) in enumerate(selected_specs):
583
- # Use ts_mode to upload evaluator
553
+ for i, (qualname, source_file_path) in enumerate(selected_specs):
584
554
  # Generate a short default ID from just the test function name
585
555
  if base_id:
586
556
  evaluator_id = base_id
@@ -618,12 +588,11 @@ def upload_command(args: argparse.Namespace) -> int:
618
588
 
619
589
  print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
620
590
  try:
591
+ test_dir = root
592
+ metric_name = os.path.basename(test_dir) or "metric"
621
593
  result = create_evaluation(
622
594
  evaluator_id=evaluator_id,
623
- python_code_to_evaluate=code,
624
- python_file_name_for_code=file_name,
625
- criterion_name_for_code=qualname,
626
- criterion_description_for_code=description or f"Evaluator for {qualname}",
595
+ metric_folders=[f"{metric_name}={test_dir}"],
627
596
  display_name=display_name or evaluator_id,
628
597
  description=description or f"Evaluator for {qualname}",
629
598
  force=force,