eval-protocol 0.2.53__tar.gz → 0.2.54.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. {eval_protocol-0.2.53/eval_protocol.egg-info → eval_protocol-0.2.54.dev0}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/__init__.py +10 -7
  3. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/fireworks_tracing.py +8 -6
  5. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/openai_responses.py +1 -29
  6. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/auth.py +0 -39
  7. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli.py +0 -6
  8. eval_protocol-0.2.54.dev0/eval_protocol/cli_commands/logs.py +36 -0
  9. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/upload.py +54 -27
  10. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/evaluation.py +40 -125
  11. eval_protocol-0.2.54.dev0/eval_protocol/proxy/__init__.py +18 -0
  12. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/__init__.py +13 -0
  13. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/app.py +305 -0
  14. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/auth.py +17 -0
  15. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/langfuse.py +528 -0
  16. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/litellm.py +170 -0
  17. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/main.py +10 -0
  18. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/models.py +98 -0
  19. eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/redis_utils.py +48 -0
  20. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/evaluation_test.py +1 -22
  21. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/remote_rollout_processor.py +3 -22
  22. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/logs_server.py +1 -9
  23. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0/eval_protocol.egg-info}/PKG-INFO +1 -1
  24. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/SOURCES.txt +12 -5
  25. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_show_results_url.py +0 -141
  26. eval_protocol-0.2.53/vite-app/dist/assets/index-zf20-zFD.js → eval_protocol-0.2.54.dev0/vite-app/dist/assets/index-C81y9r9l.js +25 -25
  27. eval_protocol-0.2.53/vite-app/dist/assets/index-zf20-zFD.js.map → eval_protocol-0.2.54.dev0/vite-app/dist/assets/index-C81y9r9l.js.map +1 -1
  28. eval_protocol-0.2.54.dev0/vite-app/dist/assets/index-DpYZaoAr.css +1 -0
  29. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vite-app/dist/index.html +2 -2
  30. eval_protocol-0.2.53/eval_protocol/cli_commands/logs.py +0 -76
  31. eval_protocol-0.2.53/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -63
  32. eval_protocol-0.2.53/eval_protocol/utils/browser_utils.py +0 -114
  33. eval_protocol-0.2.53/vite-app/dist/assets/index-BGlGI2LH.css +0 -1
  34. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/LICENSE +0 -0
  35. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/README.md +0 -0
  36. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/__init__.py +0 -0
  37. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/normalize_sandbox_fusion.py +0 -0
  38. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/utils/__init__.py +0 -0
  39. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/utils/generate_api_key.py +0 -0
  40. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/utils/subprocess_manager.py +0 -0
  41. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/__main__.py +0 -0
  42. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/__init__.py +0 -0
  43. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/base.py +0 -0
  44. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/bigquery.py +0 -0
  45. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/braintrust.py +0 -0
  46. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/huggingface.py +0 -0
  47. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/langchain.py +0 -0
  48. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/langfuse.py +0 -0
  49. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/langsmith.py +0 -0
  50. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/trl.py +0 -0
  51. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/utils.py +0 -0
  52. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/weave.py +0 -0
  53. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/__init__.py +0 -0
  54. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/models.py +0 -0
  55. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/orchestrator.py +0 -0
  56. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resource_abc.py +0 -0
  57. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resource_pool.py +0 -0
  58. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/__init__.py +0 -0
  59. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  60. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  61. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  62. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  63. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  64. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/docker_resource.py +0 -0
  65. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  66. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  67. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/sql_resource.py +0 -0
  68. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/task_manager.py +0 -0
  69. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/tool_registry.py +0 -0
  70. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/__init__.py +0 -0
  71. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  72. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  73. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_aime25.py +0 -0
  74. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  75. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  76. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  77. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  78. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  79. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/__init__.py +0 -0
  80. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  81. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/common.py +0 -0
  82. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/deploy.py +0 -0
  83. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  84. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/preview.py +0 -0
  85. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  86. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/common_utils.py +0 -0
  87. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/config.py +0 -0
  88. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/__init__.py +0 -0
  89. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  90. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  91. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  92. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/models.py +0 -0
  93. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/__init__.py +0 -0
  94. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  95. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  96. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  97. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  98. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/datasets/__init__.py +0 -0
  99. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/datasets/loader.py +0 -0
  100. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/directory_utils.py +0 -0
  101. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/__init__.py +0 -0
  102. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/event_bus.py +0 -0
  103. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/logger.py +0 -0
  104. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  105. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  106. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/execution/__init__.py +0 -0
  107. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/execution/pipeline.py +0 -0
  108. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/gcp_tools.py +0 -0
  109. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generation/cache.py +0 -0
  110. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generation/clients/base.py +0 -0
  111. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generation/clients.py +0 -0
  112. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generic_server.py +0 -0
  113. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/get_pep440_version.py +0 -0
  114. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/human_id/__init__.py +0 -0
  115. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/human_id/dictionary.py +0 -0
  116. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/__init__.py +0 -0
  117. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/deepeval.py +0 -0
  118. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/openeval.py +0 -0
  119. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/trl.py +0 -0
  120. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/__init__.py +0 -0
  121. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  122. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  123. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  124. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  125. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/util.py +0 -0
  126. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/logging_utils.py +0 -0
  127. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/__init__.py +0 -0
  128. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/adapter.py +0 -0
  129. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/client/__init__.py +0 -0
  130. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/client/connection.py +0 -0
  131. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/clients.py +0 -0
  132. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/__init__.py +0 -0
  133. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/base_policy.py +0 -0
  134. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/manager.py +0 -0
  135. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/policy.py +0 -0
  136. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/grid_renderer.py +0 -0
  137. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  138. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/mcpgym.py +0 -0
  139. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/process_manager.py +0 -0
  140. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/session/__init__.py +0 -0
  141. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/session/manager.py +0 -0
  142. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/simple_process_manager.py +0 -0
  143. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/simulation_server.py +0 -0
  144. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/__init__.py +0 -0
  145. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/config.py +0 -0
  146. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/main.py +0 -0
  147. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  148. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  149. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  150. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  151. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_env.py +0 -0
  152. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/__init__.py +0 -0
  153. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  154. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  155. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  156. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  157. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  158. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  159. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  160. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  161. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  162. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  163. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  166. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  167. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/models.py +0 -0
  168. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/packaging.py +0 -0
  169. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/platform_api.py +0 -0
  170. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/playback_policy.py +0 -0
  171. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/__init__.py +0 -0
  172. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  173. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  174. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  175. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  176. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  177. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  178. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  179. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  180. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  181. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  182. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/exception_config.py +0 -0
  183. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/execution.py +0 -0
  184. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  185. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  186. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/parameterize.py +0 -0
  187. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/plugin.py +0 -0
  188. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/rollout_processor.py +0 -0
  189. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/store_experiment_link.py +0 -0
  190. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/store_results_url.py +0 -0
  191. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/types.py +0 -0
  192. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/utils.py +0 -0
  193. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/validate_signature.py +0 -0
  194. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/__init__.py +0 -0
  195. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge.py +0 -0
  196. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  197. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
  198. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
  199. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
  200. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/utils.py +0 -0
  201. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/resources.py +0 -0
  202. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/reward_function.py +0 -0
  203. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/__init__.py +0 -0
  204. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/accuracy.py +0 -0
  205. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/accuracy_length.py +0 -0
  206. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  207. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  208. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/apps_testing_util.py +0 -0
  209. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/bfcl_reward.py +0 -0
  210. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/code_execution.py +0 -0
  211. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/code_execution_utils.py +0 -0
  212. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/cpp_code.py +0 -0
  213. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  214. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/format.py +0 -0
  215. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/function_calling.py +0 -0
  216. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/json_schema.py +0 -0
  217. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/language_consistency.py +0 -0
  218. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/lean_prover.py +0 -0
  219. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/length.py +0 -0
  220. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  221. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/math.py +0 -0
  222. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  223. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/reasoning_steps.py +0 -0
  224. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/repetition.py +0 -0
  225. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/tag_count.py +0 -0
  226. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rl_processing.py +0 -0
  227. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/server.py +0 -0
  228. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/stats/__init__.py +0 -0
  229. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/stats/confidence_intervals.py +0 -0
  230. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/typed_interface.py +0 -0
  231. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/__init__.py +0 -0
  232. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/errors.py +0 -0
  233. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/remote_rollout_processor.py +0 -0
  234. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/types.py +0 -0
  235. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/__init__.py +0 -0
  236. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/batch_evaluation.py +0 -0
  237. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/batch_transformation.py +0 -0
  238. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/check_server_status.py +0 -0
  239. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/dataset_helpers.py +0 -0
  240. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/logs_models.py +0 -0
  241. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/module_loader.py +0 -0
  242. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/packaging_utils.py +0 -0
  243. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/show_results_url.py +0 -0
  244. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/static_policy.py +0 -0
  245. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/subprocess_utils.py +0 -0
  246. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/vite_server.py +0 -0
  247. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/dependency_links.txt +0 -0
  248. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/entry_points.txt +0 -0
  249. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/requires.txt +0 -0
  250. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/top_level.txt +0 -0
  251. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/pyproject.toml +0 -0
  252. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/setup.cfg +0 -0
  253. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/setup.py +0 -0
  254. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_accuracy.py +0 -0
  255. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_accuracy_length.py +0 -0
  256. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_adapters_e2e.py +0 -0
  257. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_agent_orchestrator.py +0 -0
  258. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_agent_resources.py +0 -0
  259. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_auth.py +0 -0
  260. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_batch_evaluation.py +0 -0
  261. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cli.py +0 -0
  262. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cli_agent.py +0 -0
  263. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cli_args.py +0 -0
  264. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_code_execution.py +0 -0
  265. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_config.py +0 -0
  266. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_control_plane_separation.py +0 -0
  267. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cpp_code.py +0 -0
  268. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_data_driven_task_manager.py +0 -0
  269. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_deepcoder_reward.py +0 -0
  270. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_deepeval_integration.py +0 -0
  271. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_deploy_integration.py +0 -0
  272. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_directory_utils.py +0 -0
  273. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_e2b_integration.py +0 -0
  274. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_e2b_js_integration.py +0 -0
  275. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_edge_cases.py +0 -0
  276. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_eval_protocol_import.py +0 -0
  277. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation.py +0 -0
  278. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation_integration.py +0 -0
  279. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation_postprocess.py +0 -0
  280. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation_preview_integration.py +0 -0
  281. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_event_bus.py +0 -0
  282. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_event_bus_helper.py +0 -0
  283. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_examples_end_to_end.py +0 -0
  284. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_fireworks_api.py +0 -0
  285. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_format.py +0 -0
  286. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_fractional_code.py +0 -0
  287. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_function_calling.py +0 -0
  288. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_gcp_tools.py +0 -0
  289. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_generic_server.py +0 -0
  290. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_human_id.py +0 -0
  291. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_integration.py +0 -0
  292. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_json_schema.py +0 -0
  293. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_kwargs_validation.py +0 -0
  294. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_language_consistency.py +0 -0
  295. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_lean_prover.py +0 -0
  296. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_lean_prover_runner.py +0 -0
  297. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_length.py +0 -0
  298. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_list_comparison_math_reward.py +0 -0
  299. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_logs_server.py +0 -0
  300. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_logs_server_simple.py +0 -0
  301. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_math.py +0 -0
  302. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_minimal.py +0 -0
  303. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_models.py +0 -0
  304. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_models_rl.py +0 -0
  305. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_multiple_choice_math_reward.py +0 -0
  306. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_n_variant_batch_integration.py +0 -0
  307. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_n_variant_integration.py +0 -0
  308. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_openai_compatibility.py +0 -0
  309. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_openeval_integration.py +0 -0
  310. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_packaging.py +0 -0
  311. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_parallel_rollouts.py +0 -0
  312. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_platform_api.py +0 -0
  313. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_quickstart_utils.py +0 -0
  314. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_readiness.py +0 -0
  315. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_reasoning_steps.py +0 -0
  316. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_repetition.py +0 -0
  317. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_repetition_debug.py +0 -0
  318. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_retry_mechanism.py +0 -0
  319. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_reward_function.py +0 -0
  320. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_reward_protocol_import.py +0 -0
  321. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_rl_processing.py +0 -0
  322. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_rollout_control_plane_integration.py +0 -0
  323. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_server.py +0 -0
  324. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_status_migration_changes.py +0 -0
  325. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_status_migration_integration.py +0 -0
  326. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_status_model.py +0 -0
  327. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_tag_count.py +0 -0
  328. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_tau_bench_airline_smoke.py +0 -0
  329. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_typed_interface.py +0 -0
  330. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_typed_interface_rl.py +0 -0
  331. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_upload_entrypoint.py +0 -0
  332. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_url_handling.py +0 -0
  333. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_vite_server.py +0 -0
  334. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/__init__.py +0 -0
  335. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/agent/__init__.py +0 -0
  336. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/agent/base.py +0 -0
  337. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/agent/llm_agent.py +0 -0
  338. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/__init__.py +0 -0
  339. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/api_config.py +0 -0
  340. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/data_model.py +0 -0
  341. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/simulation_service.py +0 -0
  342. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/cli.py +0 -0
  343. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/config.py +0 -0
  344. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/airline/policy.md +0 -0
  345. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/mock/policy.md +0 -0
  346. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  347. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/retail/policy.md +0 -0
  348. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  349. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  350. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  351. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  352. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  353. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  354. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  355. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/__init__.py +0 -0
  356. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/message.py +0 -0
  357. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/simulation.py +0 -0
  358. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/tasks.py +0 -0
  359. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/__init__.py +0 -0
  360. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/__init__.py +0 -0
  361. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/data_model.py +0 -0
  362. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/environment.py +0 -0
  363. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/tools.py +0 -0
  364. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/utils.py +0 -0
  365. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/__init__.py +0 -0
  366. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/data_model.py +0 -0
  367. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/environment.py +0 -0
  368. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/tools.py +0 -0
  369. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/utils.py +0 -0
  370. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/__init__.py +0 -0
  371. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/data_model.py +0 -0
  372. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/environment.py +0 -0
  373. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/tools.py +0 -0
  374. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/utils.py +0 -0
  375. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/__init__.py +0 -0
  376. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/data_model.py +0 -0
  377. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/environment.py +0 -0
  378. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  379. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  380. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  381. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  382. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  383. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  384. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  385. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  386. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tools.py +0 -0
  387. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  388. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  389. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/utils.py +0 -0
  390. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/__init__.py +0 -0
  391. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/db.py +0 -0
  392. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/environment.py +0 -0
  393. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/server.py +0 -0
  394. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/tool.py +0 -0
  395. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/toolkit.py +0 -0
  396. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  397. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/__init__.py +0 -0
  398. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator.py +0 -0
  399. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  400. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  401. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  402. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  403. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  404. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/metrics/__init__.py +0 -0
  405. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/metrics/agent_metrics.py +0 -0
  406. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  407. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/__init__.py +0 -0
  408. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  409. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  410. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/utils.py +0 -0
  411. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/registry.py +0 -0
  412. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/run.py +0 -0
  413. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/__init__.py +0 -0
  414. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/check_data.py +0 -0
  415. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  416. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/start_servers.py +0 -0
  417. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/view_simulations.py +0 -0
  418. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/user/__init__.py +0 -0
  419. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/user/base.py +0 -0
  420. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/user/user_simulator.py +0 -0
  421. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/__init__.py +0 -0
  422. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/display.py +0 -0
  423. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/io_utils.py +0 -0
  424. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/llm_utils.py +0 -0
  425. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/pydantic_utils.py +0 -0
  426. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/utils.py +0 -0
  427. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/versioneer.py +0 -0
  428. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  429. {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.53
3
+ Version: 0.2.54.dev0
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -31,14 +31,10 @@ from .reward_function import RewardFunction
31
31
  from .typed_interface import reward_function
32
32
  from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
33
33
  from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
34
- from .pytest.remote_rollout_processor import create_elasticsearch_config_from_env
35
34
  from .pytest.parameterize import DefaultParameterIdGenerator
36
35
  from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
37
36
  from .log_utils.rollout_id_filter import RolloutIdFilter
38
37
  from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
39
- from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
40
- from .log_utils.elasticsearch_client import ElasticsearchConfig
41
-
42
38
 
43
39
  from .types.remote_rollout_processor import (
44
40
  InitRequest,
@@ -74,11 +70,16 @@ try:
74
70
  except ImportError:
75
71
  WeaveAdapter = None
76
72
 
73
+ try:
74
+ from .proxy import create_app, AuthProvider
75
+ except ImportError:
76
+ create_app = None
77
+ AuthProvider = None
78
+
79
+
77
80
  warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
78
81
 
79
82
  __all__ = [
80
- "create_elasticsearch_config_from_env",
81
- "ElasticsearchConfig",
82
83
  "ElasticsearchDirectHttpHandler",
83
84
  "RolloutIdFilter",
84
85
  "setup_rollout_logging_for_elasticsearch_handler",
@@ -101,7 +102,6 @@ __all__ = [
101
102
  "BraintrustAdapter",
102
103
  "create_braintrust_adapter",
103
104
  "LangSmithAdapter",
104
- "FireworksTracingHttpHandler",
105
105
  # Core interfaces
106
106
  "Message",
107
107
  "MetricResult",
@@ -137,6 +137,9 @@ __all__ = [
137
137
  "RolloutMetadata",
138
138
  "StatusResponse",
139
139
  "create_langfuse_config_tags",
140
+ # Proxy
141
+ "create_app",
142
+ "AuthProvider",
140
143
  ]
141
144
 
142
145
  from . import _version
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-10-13T11:21:58-0700",
11
+ "date": "2025-10-13T17:28:46-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "3b326f2fcc52573154c9952ad882880de8096c6e",
15
- "version": "0.2.53"
14
+ "full-revisionid": "bfe8e3146c3971cadf5c7e43d259b40e7e26163a",
15
+ "version": "0.2.54-dev"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
7
7
  from __future__ import annotations
8
8
  import logging
9
9
  import requests
10
- import time
11
10
  from datetime import datetime
12
11
  from typing import Any, Dict, List, Optional, Protocol
12
+ import os
13
13
 
14
14
  from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
15
15
  from .base import BaseAdapter
@@ -343,15 +343,17 @@ class FireworksTracingAdapter(BaseAdapter):
343
343
  # Remove None values
344
344
  params = {k: v for k, v in params.items() if v is not None}
345
345
 
346
- # Make request to proxy
346
+ # Make request to proxy (using pointwise for efficiency)
347
347
  if self.project_id:
348
- url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
348
+ url = f"{self.base_url}/v1/project_id/{self.project_id}/traces/pointwise"
349
349
  else:
350
- url = f"{self.base_url}/v1/traces"
350
+ url = f"{self.base_url}/v1/traces/pointwise"
351
+
352
+ headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
351
353
 
352
354
  result = None
353
355
  try:
354
- response = requests.get(url, params=params, timeout=self.timeout)
356
+ response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
355
357
  response.raise_for_status()
356
358
  result = response.json()
357
359
  except requests.exceptions.HTTPError as e:
@@ -365,7 +367,7 @@ class FireworksTracingAdapter(BaseAdapter):
365
367
  except Exception: # In case e.response.json() fails
366
368
  error_msg = f"Proxy error: {e.response.text}"
367
369
 
368
- logger.error("Failed to fetch traces from proxy: %s", error_msg)
370
+ logger.error("Failed to fetch traces from proxy (HTTP %s): %s", e.response.status_code, error_msg)
369
371
  return eval_rows
370
372
  except requests.exceptions.RequestException as e:
371
373
  # Non-HTTP errors (network issues, timeouts, etc.)
@@ -169,9 +169,7 @@ class OpenAIResponsesAdapter(BaseAdapter):
169
169
  raise NotImplementedError(f"Unsupported content type: {content_item.type}")
170
170
  elif item.type == "function_call_output":
171
171
  # Collect tool call outputs to add before assistant message
172
- tool_call_outputs.append(
173
- Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
174
- )
172
+ tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
175
173
  elif item.type == "function_call":
176
174
  tool_call = ChatCompletionMessageToolCall(
177
175
  id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
@@ -188,29 +186,3 @@ class OpenAIResponsesAdapter(BaseAdapter):
188
186
  messages.append(Message(role="assistant", tool_calls=current_tool_calls))
189
187
 
190
188
  return reversed(messages)
191
-
192
- def _coerce_tool_output(self, output: Any) -> str:
193
- """Coerce OpenAI Responses tool output into a string for Message.content.
194
-
195
- The Responses API may return structured content lists. For our purposes,
196
- we stringify non-string outputs to satisfy the Message.content type.
197
- """
198
- if isinstance(output, str):
199
- return output
200
- try:
201
- # Attempt to join list of objects with any 'text' fields
202
- if isinstance(output, list):
203
- parts: list[str] = []
204
- for part in output:
205
- text = None
206
- if isinstance(part, dict):
207
- text = part.get("text")
208
- if text:
209
- parts.append(str(text))
210
- else:
211
- parts.append(str(part))
212
- return "\n".join(parts)
213
- # Fallback to string conversion
214
- return str(output)
215
- except Exception:
216
- return str(output)
@@ -4,8 +4,6 @@ import os
4
4
  from pathlib import Path
5
5
  from typing import Dict, Optional # Added Dict
6
6
 
7
- import requests
8
-
9
7
  logger = logging.getLogger(__name__)
10
8
 
11
9
  # Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
@@ -220,40 +218,3 @@ def get_fireworks_api_base() -> str:
220
218
  else:
221
219
  logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
222
220
  return api_base
223
-
224
-
225
- def verify_api_key_and_get_account_id(
226
- api_key: Optional[str] = None,
227
- api_base: Optional[str] = None,
228
- ) -> Optional[str]:
229
- """
230
- Calls the Fireworks API verify endpoint to validate the API key and returns the
231
- account id from response headers when available.
232
-
233
- Args:
234
- api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
235
- api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
236
-
237
- Returns:
238
- The resolved account id if verification succeeds and the header is present; otherwise None.
239
- """
240
- try:
241
- resolved_key = api_key or get_fireworks_api_key()
242
- if not resolved_key:
243
- return None
244
- resolved_base = api_base or get_fireworks_api_base()
245
- url = f"{resolved_base.rstrip('/')}/verifyApiKey"
246
- headers = {"Authorization": f"Bearer {resolved_key}"}
247
- resp = requests.get(url, headers=headers, timeout=10)
248
- if resp.status_code != 200:
249
- logger.debug("verifyApiKey returned status %s", resp.status_code)
250
- return None
251
- # Header keys could vary in case; requests provides case-insensitive dict
252
- account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
253
- if account_id and account_id.strip():
254
- logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
255
- return account_id.strip()
256
- return None
257
- except Exception as e:
258
- logger.debug("Failed to verify API key for account id resolution: %s", e)
259
- return None
@@ -301,12 +301,6 @@ def parse_args(args=None):
301
301
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
302
302
  logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
303
303
  logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
304
- logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
305
- logs_parser.add_argument(
306
- "--use-env-elasticsearch-config",
307
- action="store_true",
308
- help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
309
- )
310
304
 
311
305
  # Upload command
312
306
  upload_parser = subparsers.add_parser(
@@ -0,0 +1,36 @@
1
+ """
2
+ CLI command for serving logs with file watching and real-time updates.
3
+ """
4
+
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from ..utils.logs_server import serve_logs
9
+
10
+
11
+ def logs_command(args):
12
+ """Serve logs with file watching and real-time updates"""
13
+
14
+ port = args.port
15
+ print("🚀 Starting Eval Protocol Logs Server")
16
+ print(f"🌐 URL: http://localhost:{port}")
17
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
18
+ print(f"👀 Watching paths: {['current directory']}")
19
+ print(f"🔍 Debug mode: {args.debug}")
20
+ print("Press Ctrl+C to stop the server")
21
+ print("-" * 50)
22
+
23
+ # setup Elasticsearch
24
+ from eval_protocol.pytest.elasticsearch_setup import ElasticsearchSetup
25
+
26
+ elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
27
+
28
+ try:
29
+ serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
30
+ return 0
31
+ except KeyboardInterrupt:
32
+ print("\n🛑 Server stopped by user")
33
+ return 0
34
+ except Exception as e:
35
+ print(f"❌ Error starting server: {e}")
36
+ return 1
@@ -12,12 +12,7 @@ from pathlib import Path
12
12
  from typing import Any, Callable, Iterable, Optional
13
13
 
14
14
  import pytest
15
- from eval_protocol.auth import (
16
- get_fireworks_account_id,
17
- get_fireworks_api_key,
18
- get_fireworks_api_base,
19
- verify_api_key_and_get_account_id,
20
- )
15
+ from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
21
16
  from eval_protocol.platform_api import create_or_update_fireworks_secret
22
17
 
23
18
  from eval_protocol.evaluation import create_evaluation
@@ -264,7 +259,7 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
264
259
  raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")
265
260
 
266
261
 
267
- def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
262
+ def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str, str]:
268
263
  target, func = _parse_entry(entry, cwd)
269
264
 
270
265
  # Check if target looks like a file path
@@ -298,12 +293,47 @@ def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
298
293
  raise ValueError(f"Function '{func}' not found in module '{module_name}'")
299
294
 
300
295
  qualname = f"{module_name}.{func}"
301
- return qualname, os.path.abspath(source_file_path) if source_file_path else ""
296
+ code, file_name = _generate_ts_mode_code(
297
+ DiscoveredTest(
298
+ module_path=module_name,
299
+ module_name=module_name,
300
+ qualname=qualname,
301
+ file_path=getattr(module, "__file__", module_name),
302
+ lineno=None,
303
+ has_parametrize=False,
304
+ param_count=0,
305
+ nodeids=[],
306
+ )
307
+ )
308
+ return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
302
309
 
303
310
 
304
311
  def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
305
- # Deprecated: we no longer generate a shim; keep stub for import compatibility
306
- return ("", "main.py")
312
+ # Generate a minimal main.py that imports the test module and calls the function
313
+ module = test.module_name
314
+ func = test.qualname.split(".")[-1]
315
+ code = f"""
316
+ from typing import Any, Dict, List, Optional, Union
317
+
318
+ from eval_protocol.models import EvaluationRow, Message
319
+ from {module} import {func} as _ep_test
320
+
321
+ def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
322
+ row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
323
+ result = _ep_test(row) # Supports sync/async via decorator's dual-mode
324
+ if hasattr(result, "__await__"):
325
+ import asyncio
326
+ result = asyncio.get_event_loop().run_until_complete(result)
327
+ if result.evaluation_result is None:
328
+ return {{"score": 0.0, "reason": "No evaluation_result set"}}
329
+ out = {{
330
+ "score": float(result.evaluation_result.score or 0.0),
331
+ "reason": result.evaluation_result.reason,
332
+ "metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
333
+ }}
334
+ return out
335
+ """
336
+ return (code, "main.py")
307
337
 
308
338
 
309
339
  def _normalize_evaluator_id(evaluator_id: str) -> str:
@@ -492,10 +522,10 @@ def upload_command(args: argparse.Namespace) -> int:
492
522
  entries_arg = getattr(args, "entry", None)
493
523
  if entries_arg:
494
524
  entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
495
- selected_specs: list[tuple[str, str]] = []
525
+ selected_specs: list[tuple[str, str, str, str]] = []
496
526
  for e in entries:
497
- qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
498
- selected_specs.append((qualname, resolved_path))
527
+ code, file_name, qualname, resolved_path = _generate_ts_mode_code_from_entry(e, root)
528
+ selected_specs.append((code, file_name, qualname, resolved_path))
499
529
  else:
500
530
  print("Scanning for evaluation tests...")
501
531
  tests = _discover_tests(root)
@@ -515,7 +545,11 @@ def upload_command(args: argparse.Namespace) -> int:
515
545
  print(" handles all parameter combinations. The evaluator will work with")
516
546
  print(" the same logic regardless of which model/parameters are used.")
517
547
 
518
- selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
548
+ selected_specs = []
549
+ for t in selected_tests:
550
+ code, file_name = _generate_ts_mode_code(t)
551
+ # Store test info for better ID generation
552
+ selected_specs.append((code, file_name, t.qualname, t.file_path))
519
553
 
520
554
  base_id = getattr(args, "id", None)
521
555
  display_name = getattr(args, "display_name", None)
@@ -526,14 +560,6 @@ def upload_command(args: argparse.Namespace) -> int:
526
560
  try:
527
561
  fw_account_id = get_fireworks_account_id()
528
562
  fw_api_key_value = get_fireworks_api_key()
529
- if not fw_account_id and fw_api_key_value:
530
- # Attempt to verify and resolve account id from server headers
531
- resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
532
- if resolved:
533
- fw_account_id = resolved
534
- # Propagate to environment so downstream calls use it if needed
535
- os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
536
- print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
537
563
  if fw_account_id and fw_api_key_value:
538
564
  print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
539
565
  if create_or_update_fireworks_secret(
@@ -553,7 +579,8 @@ def upload_command(args: argparse.Namespace) -> int:
553
579
  print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
554
580
 
555
581
  exit_code = 0
556
- for i, (qualname, source_file_path) in enumerate(selected_specs):
582
+ for i, (code, file_name, qualname, source_file_path) in enumerate(selected_specs):
583
+ # Use ts_mode to upload evaluator
557
584
  # Generate a short default ID from just the test function name
558
585
  if base_id:
559
586
  evaluator_id = base_id
@@ -591,12 +618,12 @@ def upload_command(args: argparse.Namespace) -> int:
591
618
 
592
619
  print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
593
620
  try:
594
- # Always treat as a single evaluator (single-metric) even if folder has helper modules
595
- test_dir = os.path.dirname(source_file_path) if source_file_path else root
596
- metric_name = os.path.basename(test_dir) or "metric"
597
621
  result = create_evaluation(
598
622
  evaluator_id=evaluator_id,
599
- metric_folders=[f"{metric_name}={test_dir}"],
623
+ python_code_to_evaluate=code,
624
+ python_file_name_for_code=file_name,
625
+ criterion_name_for_code=qualname,
626
+ criterion_description_for_code=description or f"Evaluator for {qualname}",
600
627
  display_name=display_name or evaluator_id,
601
628
  description=description or f"Evaluator for {qualname}",
602
629
  force=force,