eval-protocol 0.3.15.dev1__tar.gz → 0.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (472) hide show
  1. {eval_protocol-0.3.15.dev1/eval_protocol.egg-info → eval_protocol-0.3.16}/PKG-INFO +4 -1
  2. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/__init__.py +4 -0
  3. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/fireworks_tracing.py +50 -3
  5. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/execution/policy.py +0 -3
  6. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/proxy_core/app.py +11 -15
  7. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/proxy_core/langfuse.py +1 -0
  8. eval_protocol-0.3.16/eval_protocol/proxy/proxy_core/litellm.py +150 -0
  9. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/proxy_core/models.py +0 -1
  10. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/handle_persist_flow.py +1 -1
  11. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/reward_function.py +0 -1
  12. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16/eval_protocol.egg-info}/PKG-INFO +4 -1
  13. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol.egg-info/requires.txt +3 -0
  14. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/pyproject.toml +3 -0
  15. eval_protocol-0.3.15.dev1/eval_protocol/proxy/proxy_core/litellm.py +0 -197
  16. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/LICENSE +0 -0
  17. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/README.md +0 -0
  18. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/development/__init__.py +0 -0
  19. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/development/normalize_sandbox_fusion.py +0 -0
  20. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/development/utils/__init__.py +0 -0
  21. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/development/utils/generate_api_key.py +0 -0
  22. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/development/utils/subprocess_manager.py +0 -0
  23. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/__main__.py +0 -0
  24. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/__init__.py +0 -0
  25. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/base.py +0 -0
  26. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/bigquery.py +0 -0
  27. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/braintrust.py +0 -0
  28. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/dataframe.py +0 -0
  29. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/huggingface.py +0 -0
  30. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/langchain.py +0 -0
  31. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/langfuse.py +0 -0
  32. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/langsmith.py +0 -0
  33. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/openai_responses.py +0 -0
  34. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/trl.py +0 -0
  35. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/utils.py +0 -0
  36. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/adapters/weave.py +0 -0
  37. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/__init__.py +0 -0
  38. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/models.py +0 -0
  39. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/orchestrator.py +0 -0
  40. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resource_abc.py +0 -0
  41. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resource_pool.py +0 -0
  42. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/__init__.py +0 -0
  43. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  44. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  45. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  46. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  47. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  48. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/docker_resource.py +0 -0
  49. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  50. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  51. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/resources/sql_resource.py +0 -0
  52. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/task_manager.py +0 -0
  53. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/agent/tool_registry.py +0 -0
  54. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/auth.py +0 -0
  55. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/__init__.py +0 -0
  56. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  57. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  58. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/test_aime25.py +0 -0
  59. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  60. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  61. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  62. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  63. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  64. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  65. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli.py +0 -0
  66. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/__init__.py +0 -0
  67. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  68. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/common.py +0 -0
  69. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/create_rft.py +0 -0
  70. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/export_docs.py +0 -0
  71. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/local_test.py +0 -0
  72. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/logs.py +0 -0
  73. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  74. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/upload.py +0 -0
  75. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/cli_commands/utils.py +0 -0
  76. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/common_utils.py +0 -0
  77. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/config.py +0 -0
  78. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/data_loader/__init__.py +0 -0
  79. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  80. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  81. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  82. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  83. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/data_loader/models.py +0 -0
  84. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/dataset_logger/__init__.py +0 -0
  85. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  86. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  87. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  88. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  89. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/datasets/__init__.py +0 -0
  90. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/datasets/loader.py +0 -0
  91. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/directory_utils.py +0 -0
  92. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/evaluation.py +0 -0
  93. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/event_bus/__init__.py +0 -0
  94. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/event_bus/event_bus.py +0 -0
  95. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/event_bus/logger.py +0 -0
  96. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  97. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  98. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/exceptions.py +0 -0
  99. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/execution/__init__.py +0 -0
  100. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/execution/pipeline.py +0 -0
  101. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/fireworks_rft.py +0 -0
  102. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/gcp_tools.py +0 -0
  103. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/generation/cache.py +0 -0
  104. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/generation/clients/base.py +0 -0
  105. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/generation/clients.py +0 -0
  106. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/generic_server.py +0 -0
  107. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/get_pep440_version.py +0 -0
  108. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/human_id/__init__.py +0 -0
  109. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/human_id/dictionary.py +0 -0
  110. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/integrations/__init__.py +0 -0
  111. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/integrations/deepeval.py +0 -0
  112. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/integrations/openai_rft.py +0 -0
  113. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/integrations/openeval.py +0 -0
  114. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  115. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  116. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/integrations/trl.py +0 -0
  117. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/__init__.py +0 -0
  118. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  119. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  120. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  121. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  122. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/init.py +0 -0
  123. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/rollout_context.py +0 -0
  124. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  125. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/log_utils/util.py +0 -0
  126. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/logging_utils.py +0 -0
  127. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/__init__.py +0 -0
  128. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/adapter.py +0 -0
  129. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/client/__init__.py +0 -0
  130. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/client/connection.py +0 -0
  131. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/clients.py +0 -0
  132. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/execution/__init__.py +0 -0
  133. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/execution/base_policy.py +0 -0
  134. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/execution/manager.py +0 -0
  135. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  136. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/grid_renderer.py +0 -0
  137. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  138. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/mcpgym.py +0 -0
  139. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/process_manager.py +0 -0
  140. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/session/__init__.py +0 -0
  141. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/session/manager.py +0 -0
  142. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/simple_process_manager.py +0 -0
  143. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp/simulation_server.py +0 -0
  144. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_agent/__init__.py +0 -0
  145. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_agent/config.py +0 -0
  146. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_agent/main.py +0 -0
  147. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  148. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  149. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  150. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  151. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_env.py +0 -0
  152. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/__init__.py +0 -0
  153. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  154. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  155. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  156. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  157. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  158. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  159. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  160. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  161. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  162. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  163. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  166. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  167. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/models.py +0 -0
  168. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/packaging.py +0 -0
  169. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/platform_api.py +0 -0
  170. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/playback_policy.py +0 -0
  171. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/__init__.py +0 -0
  172. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  173. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  174. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/proxy_core/main.py +0 -0
  175. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  176. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/__init__.py +0 -0
  177. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/buffer.py +0 -0
  178. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  179. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  180. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
  181. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  182. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  183. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  184. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  185. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  186. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  187. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  188. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/evaluation_test.py +0 -0
  189. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  190. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  191. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/exception_config.py +0 -0
  192. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/execution.py +0 -0
  193. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  194. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  195. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  196. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
  197. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/parameterize.py +0 -0
  198. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/plugin.py +0 -0
  199. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/priority_scheduler.py +0 -0
  200. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  201. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/rollout_processor.py +0 -0
  202. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  203. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/store_experiment_link.py +0 -0
  204. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/store_results_url.py +0 -0
  205. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/tracing_utils.py +0 -0
  206. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/types.py +0 -0
  207. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/utils.py +0 -0
  208. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/pytest/validate_signature.py +0 -0
  209. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/__init__.py +0 -0
  210. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  211. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  212. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  213. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  214. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  215. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  216. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  217. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/llm_judge.py +0 -0
  218. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  219. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  220. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  221. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  222. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/quickstart/utils.py +0 -0
  223. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/resources.py +0 -0
  224. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/__init__.py +0 -0
  225. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/accuracy.py +0 -0
  226. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/accuracy_length.py +0 -0
  227. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  228. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  229. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/apps_testing_util.py +0 -0
  230. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/bfcl_reward.py +0 -0
  231. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/code_execution.py +0 -0
  232. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/code_execution_utils.py +0 -0
  233. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/cpp_code.py +0 -0
  234. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  235. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/format.py +0 -0
  236. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/function_calling.py +0 -0
  237. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/json_schema.py +0 -0
  238. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/language_consistency.py +0 -0
  239. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/lean_prover.py +0 -0
  240. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/length.py +0 -0
  241. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  242. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/math.py +0 -0
  243. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  244. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/reasoning_steps.py +0 -0
  245. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/repetition.py +0 -0
  246. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rewards/tag_count.py +0 -0
  247. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/rl_processing.py +0 -0
  248. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/server.py +0 -0
  249. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/stats/__init__.py +0 -0
  250. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/stats/confidence_intervals.py +0 -0
  251. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/training/__init__.py +0 -0
  252. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/training/gepa_trainer.py +0 -0
  253. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/training/gepa_utils.py +0 -0
  254. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/training/trainer.py +0 -0
  255. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/training/utils.py +0 -0
  256. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/typed_interface.py +0 -0
  257. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/types/__init__.py +0 -0
  258. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/types/errors.py +0 -0
  259. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/types/remote_rollout_processor.py +0 -0
  260. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/types/types.py +0 -0
  261. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/__init__.py +0 -0
  262. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/batch_evaluation.py +0 -0
  263. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/batch_transformation.py +0 -0
  264. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/browser_utils.py +0 -0
  265. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/check_server_status.py +0 -0
  266. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/dataset_helpers.py +0 -0
  267. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  268. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/logs_models.py +0 -0
  269. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/logs_server.py +0 -0
  270. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/module_loader.py +0 -0
  271. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/packaging_utils.py +0 -0
  272. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/show_results_url.py +0 -0
  273. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/static_policy.py +0 -0
  274. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/subprocess_utils.py +0 -0
  275. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol/utils/vite_server.py +0 -0
  276. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol.egg-info/SOURCES.txt +0 -0
  277. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol.egg-info/dependency_links.txt +0 -0
  278. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol.egg-info/entry_points.txt +0 -0
  279. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/eval_protocol.egg-info/top_level.txt +0 -0
  280. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/setup.cfg +0 -0
  281. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/setup.py +0 -0
  282. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_accuracy.py +0 -0
  283. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_accuracy_length.py +0 -0
  284. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_adapters_e2e.py +0 -0
  285. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_agent_orchestrator.py +0 -0
  286. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_agent_resources.py +0 -0
  287. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_auth.py +0 -0
  288. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_batch_evaluation.py +0 -0
  289. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_cli_agent.py +0 -0
  290. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_cli_args.py +0 -0
  291. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_cli_create_rft.py +0 -0
  292. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_cli_local_test.py +0 -0
  293. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_cli_startup_benchmark.py +0 -0
  294. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_code_execution.py +0 -0
  295. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_config.py +0 -0
  296. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_control_plane_separation.py +0 -0
  297. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_cpp_code.py +0 -0
  298. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_data_driven_task_manager.py +0 -0
  299. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_deepcoder_reward.py +0 -0
  300. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_deepeval_integration.py +0 -0
  301. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_directory_utils.py +0 -0
  302. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_e2b_integration.py +0 -0
  303. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_e2b_js_integration.py +0 -0
  304. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_edge_cases.py +0 -0
  305. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_ep_upload_e2e.py +0 -0
  306. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_eval_protocol_import.py +0 -0
  307. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_evaluation.py +0 -0
  308. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_evaluation_postprocess.py +0 -0
  309. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_event_bus.py +0 -0
  310. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_event_bus_helper.py +0 -0
  311. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_examples_end_to_end.py +0 -0
  312. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_exception_config.py +0 -0
  313. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_exceptions.py +0 -0
  314. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_fireworks_api.py +0 -0
  315. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_format.py +0 -0
  316. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_fractional_code.py +0 -0
  317. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_function_calling.py +0 -0
  318. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_gcp_tools.py +0 -0
  319. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_generic_server.py +0 -0
  320. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_human_id.py +0 -0
  321. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_integration.py +0 -0
  322. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_json_schema.py +0 -0
  323. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_kwargs_validation.py +0 -0
  324. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_language_consistency.py +0 -0
  325. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_lean_prover.py +0 -0
  326. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_lean_prover_runner.py +0 -0
  327. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_length.py +0 -0
  328. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_list_comparison_math_reward.py +0 -0
  329. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_litellm_policy_provider_fields.py +0 -0
  330. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_logs_server.py +0 -0
  331. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_logs_server_simple.py +0 -0
  332. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_math.py +0 -0
  333. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_message_field_filtering.py +0 -0
  334. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_minimal.py +0 -0
  335. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_models.py +0 -0
  336. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_models_rl.py +0 -0
  337. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_multiple_choice_math_reward.py +0 -0
  338. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_n_variant_batch_integration.py +0 -0
  339. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_n_variant_integration.py +0 -0
  340. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_no_implicit_dotenv.py +0 -0
  341. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_openai_compatibility.py +0 -0
  342. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_openai_rft_integration.py +0 -0
  343. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_openeval_integration.py +0 -0
  344. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_packaging.py +0 -0
  345. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_parallel_rollouts.py +0 -0
  346. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_platform_api.py +0 -0
  347. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_priority_scheduler.py +0 -0
  348. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_quickstart_utils.py +0 -0
  349. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_readiness.py +0 -0
  350. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_reasoning_steps.py +0 -0
  351. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_repetition.py +0 -0
  352. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_repetition_debug.py +0 -0
  353. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_retry_mechanism.py +0 -0
  354. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_reward_function.py +0 -0
  355. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_reward_protocol_import.py +0 -0
  356. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_rl_processing.py +0 -0
  357. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_rollout_control_plane_integration.py +0 -0
  358. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_rollout_logprobs.py +0 -0
  359. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_server.py +0 -0
  360. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_show_results_url.py +0 -0
  361. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_sqlite_hardening.py +0 -0
  362. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_status_migration_changes.py +0 -0
  363. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_status_migration_integration.py +0 -0
  364. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_status_model.py +0 -0
  365. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_tag_count.py +0 -0
  366. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_tau_bench_airline_smoke.py +0 -0
  367. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_training_utils.py +0 -0
  368. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_typed_interface.py +0 -0
  369. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_typed_interface_rl.py +0 -0
  370. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_upload_entrypoint.py +0 -0
  371. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_url_handling.py +0 -0
  372. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/tests/test_vite_server.py +0 -0
  373. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/__init__.py +0 -0
  374. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/agent/__init__.py +0 -0
  375. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/agent/base.py +0 -0
  376. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/agent/llm_agent.py +0 -0
  377. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/api_service/__init__.py +0 -0
  378. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/api_service/api_config.py +0 -0
  379. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/api_service/data_model.py +0 -0
  380. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/api_service/simulation_service.py +0 -0
  381. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/cli.py +0 -0
  382. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/config.py +0 -0
  383. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/airline/policy.md +0 -0
  384. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/mock/policy.md +0 -0
  385. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  386. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/retail/policy.md +0 -0
  387. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  388. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  389. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  390. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  391. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  392. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  393. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  394. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data_model/__init__.py +0 -0
  395. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data_model/message.py +0 -0
  396. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data_model/simulation.py +0 -0
  397. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/data_model/tasks.py +0 -0
  398. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/__init__.py +0 -0
  399. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/airline/__init__.py +0 -0
  400. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/airline/data_model.py +0 -0
  401. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/airline/environment.py +0 -0
  402. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/airline/tools.py +0 -0
  403. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/airline/utils.py +0 -0
  404. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/mock/__init__.py +0 -0
  405. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/mock/data_model.py +0 -0
  406. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/mock/environment.py +0 -0
  407. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/mock/tools.py +0 -0
  408. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/mock/utils.py +0 -0
  409. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/retail/__init__.py +0 -0
  410. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/retail/data_model.py +0 -0
  411. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/retail/environment.py +0 -0
  412. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/retail/tools.py +0 -0
  413. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/retail/utils.py +0 -0
  414. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/__init__.py +0 -0
  415. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/data_model.py +0 -0
  416. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/environment.py +0 -0
  417. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  418. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  419. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  420. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  421. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  422. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  423. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  424. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  425. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/tools.py +0 -0
  426. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  427. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  428. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/domains/telecom/utils.py +0 -0
  429. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/environment/__init__.py +0 -0
  430. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/environment/db.py +0 -0
  431. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/environment/environment.py +0 -0
  432. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/environment/server.py +0 -0
  433. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/environment/tool.py +0 -0
  434. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/environment/toolkit.py +0 -0
  435. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  436. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/evaluator/__init__.py +0 -0
  437. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/evaluator/evaluator.py +0 -0
  438. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  439. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  440. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  441. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  442. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  443. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/metrics/__init__.py +0 -0
  444. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/metrics/agent_metrics.py +0 -0
  445. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  446. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/orchestrator/__init__.py +0 -0
  447. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  448. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  449. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/orchestrator/utils.py +0 -0
  450. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/registry.py +0 -0
  451. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/run.py +0 -0
  452. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/scripts/__init__.py +0 -0
  453. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/scripts/check_data.py +0 -0
  454. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  455. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/scripts/start_servers.py +0 -0
  456. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/scripts/view_simulations.py +0 -0
  457. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/user/__init__.py +0 -0
  458. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/user/base.py +0 -0
  459. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/user/user_simulator.py +0 -0
  460. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/utils/__init__.py +0 -0
  461. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/utils/display.py +0 -0
  462. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/utils/io_utils.py +0 -0
  463. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/utils/llm_utils.py +0 -0
  464. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/utils/pydantic_utils.py +0 -0
  465. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vendor/tau2/utils/utils.py +0 -0
  466. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/versioneer.py +0 -0
  467. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  468. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vite-app/dist/assets/index-10cZ11iB.js +0 -0
  469. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vite-app/dist/assets/index-10cZ11iB.js.map +0 -0
  470. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vite-app/dist/assets/index-DOD73Wyg.css +0 -0
  471. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  472. {eval_protocol-0.3.15.dev1 → eval_protocol-0.3.16}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.15.dev1
3
+ Version: 0.3.16
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -116,6 +116,9 @@ Provides-Extra: proxy
116
116
  Requires-Dist: redis>=5.0.0; extra == "proxy"
117
117
  Requires-Dist: langfuse>=2.0.0; extra == "proxy"
118
118
  Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
119
+ Requires-Dist: opentelemetry-api>=1.20.0; extra == "proxy"
120
+ Requires-Dist: opentelemetry-sdk>=1.20.0; extra == "proxy"
121
+ Requires-Dist: opentelemetry-exporter-otlp>=1.20.0; extra == "proxy"
119
122
  Dynamic: license-file
120
123
 
121
124
  # Eval Protocol
@@ -13,6 +13,10 @@ import sys
13
13
  import warnings
14
14
  from typing import TYPE_CHECKING
15
15
 
16
+ import litellm
17
+
18
+ litellm.disable_add_transform_inline_image_block = True
19
+
16
20
  warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
17
21
 
18
22
  # Eager imports for symbols that conflict with module names - ONLY when pytest is running.
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-01-23T23:15:42-0800",
11
+ "date": "2026-01-26T17:57:51-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "154394f68fbe20a91f663a4fc87f9d0b3d98ec68",
15
- "version": "0.3.15.dev.1"
14
+ "full-revisionid": "d6acf244b2072a305511451cc98f2dd784e98bf5",
15
+ "version": "0.3.16"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -8,8 +8,10 @@ from __future__ import annotations
8
8
  import logging
9
9
  import requests
10
10
  from datetime import datetime
11
- from typing import Any, Dict, List, Optional, Protocol
11
+ import ast
12
+ import json
12
13
  import os
14
+ from typing import Any, Dict, List, Optional, Protocol
13
15
 
14
16
  from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
15
17
  from .base import BaseAdapter
@@ -44,6 +46,43 @@ class TraceDictConverter(Protocol):
44
46
  ...
45
47
 
46
48
 
49
+ def extract_openai_response(observations: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
50
+ """Attempt to extract and parse attributes from raw_gen_ai_request observation. This only works when stored in OTEL format.
51
+
52
+ Args:
53
+ observations: List of observation dictionaries from the trace
54
+
55
+ Returns:
56
+ Dict with all attributes parsed. Or None if not found.
57
+ """
58
+ for obs in observations:
59
+ if obs.get("name") == "raw_gen_ai_request" and obs.get("type") == "SPAN":
60
+ metadata = obs.get("metadata") or {}
61
+ attributes = metadata.get("attributes") or {}
62
+
63
+ result: Dict[str, Any] = {}
64
+
65
+ for key, value in attributes.items():
66
+ # Try to parse stringified objects (could be Python repr or JSON)
67
+ if isinstance(value, str) and value.startswith(("[", "{")):
68
+ try:
69
+ result[key] = ast.literal_eval(value)
70
+ except Exception as e:
71
+ logger.debug("Failed to parse %s with ast.literal_eval: %s", key, e)
72
+ try:
73
+ result[key] = json.loads(value)
74
+ except Exception as e:
75
+ logger.debug("Failed to parse %s with json.loads: %s", key, e)
76
+ result[key] = value
77
+ else:
78
+ result[key] = value
79
+
80
+ if result:
81
+ return result
82
+
83
+ return None
84
+
85
+
47
86
  def convert_trace_dict_to_evaluation_row(
48
87
  trace: Dict[str, Any], include_tool_calls: bool = True, span_name: Optional[str] = None
49
88
  ) -> Optional[EvaluationRow]:
@@ -96,6 +135,14 @@ def convert_trace_dict_to_evaluation_row(
96
135
  ):
97
136
  break # Break early if we've found all the metadata we need
98
137
 
138
+ observations = trace.get("observations") or []
139
+ # We can only extract when stored in OTEL format.
140
+ openai_response = extract_openai_response(observations)
141
+ if openai_response:
142
+ choices = openai_response.get("llm.openai.choices")
143
+ if choices and len(choices) > 0:
144
+ execution_metadata.finish_reason = choices[0].get("finish_reason")
145
+
99
146
  return EvaluationRow(
100
147
  messages=messages,
101
148
  tools=tools,
@@ -160,7 +207,7 @@ def extract_messages_from_trace_dict(
160
207
  # Fallback: use the last GENERATION observation which typically contains full chat history
161
208
  if not messages:
162
209
  try:
163
- all_observations = trace.get("observations", [])
210
+ all_observations = trace.get("observations") or []
164
211
  gens = [obs for obs in all_observations if obs.get("type") == "GENERATION"]
165
212
  if gens:
166
213
  gens.sort(key=lambda x: x.get("start_time", ""))
@@ -186,7 +233,7 @@ def get_final_generation_in_span_dict(trace: Dict[str, Any], span_name: str) ->
186
233
  The final generation dictionary, or None if not found
187
234
  """
188
235
  # Get all observations from the trace
189
- all_observations = trace.get("observations", [])
236
+ all_observations = trace.get("observations") or []
190
237
 
191
238
  # Find a span with the given name that has generation children
192
239
  parent_span = None
@@ -22,9 +22,6 @@ from .base_policy import LLMBasePolicy
22
22
 
23
23
  logger = logging.getLogger(__name__)
24
24
 
25
- # Disable LiteLLM's document inlining feature that appends #transform=inline
26
- # to base64 image URLs, which breaks base64 decoding
27
- litellm.disable_add_transform_inline_image_block = True
28
25
 
29
26
  class LiteLLMPolicy(LLMBasePolicy):
30
27
  """
@@ -15,7 +15,7 @@ from contextlib import asynccontextmanager
15
15
 
16
16
  from .models import ProxyConfig, LangfuseTracesResponse, TracesParams, ChatParams, ChatRequestHook, TracesRequestHook
17
17
  from .auth import AuthProvider, NoAuthProvider
18
- from .litellm import handle_chat_completion, proxy_to_litellm
18
+ from .litellm import handle_chat_completion
19
19
  from .langfuse import fetch_langfuse_traces, pointwise_fetch_langfuse_trace
20
20
 
21
21
  # Configure logging before any other imports (so all modules inherit this config)
@@ -35,10 +35,6 @@ def build_proxy_config(
35
35
  preprocess_traces_request: Optional[TracesRequestHook] = None,
36
36
  ) -> ProxyConfig:
37
37
  """Load environment and secrets, and build ProxyConfig"""
38
- # Env
39
- litellm_url = os.getenv("LITELLM_URL")
40
- if not litellm_url:
41
- raise ValueError("LITELLM_URL environment variable must be set")
42
38
  request_timeout = float(os.getenv("REQUEST_TIMEOUT", "300.0"))
43
39
  langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
44
40
 
@@ -66,7 +62,6 @@ def build_proxy_config(
66
62
  raise ValueError(f"Invalid format in secrets file {secrets_path.name}: {e}")
67
63
 
68
64
  return ProxyConfig(
69
- litellm_url=litellm_url,
70
65
  request_timeout=request_timeout,
71
66
  langfuse_host=langfuse_host,
72
67
  langfuse_keys=langfuse_keys,
@@ -113,6 +108,16 @@ def create_app(
113
108
  app.state.config = build_proxy_config(preprocess_chat_request, preprocess_traces_request)
114
109
  app.state.redis = init_redis()
115
110
 
111
+ config = app.state.config
112
+ default_keys = config.langfuse_keys[config.default_project_id]
113
+ os.environ["LANGFUSE_PUBLIC_KEY"] = default_keys["public_key"]
114
+ os.environ["LANGFUSE_SECRET_KEY"] = default_keys["secret_key"]
115
+ os.environ.setdefault("LANGFUSE_HOST", config.langfuse_host)
116
+
117
+ import litellm
118
+
119
+ litellm.callbacks = ["langfuse_otel"]
120
+
116
121
  try:
117
122
  yield
118
123
  finally:
@@ -297,13 +302,4 @@ def create_app(
297
302
  async def health():
298
303
  return {"status": "healthy", "service": "metadata-proxy"}
299
304
 
300
- # Catch-all
301
- @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
302
- async def catch_all_proxy(
303
- path: str,
304
- request: Request,
305
- config: ProxyConfig = Depends(get_config),
306
- ):
307
- return await proxy_to_litellm(config, path, request)
308
-
309
305
  return app
@@ -50,6 +50,7 @@ def _serialize_trace_to_dict(trace_full: Any) -> Dict[str, Any]:
50
50
  "input": getattr(obs, "input", None),
51
51
  "output": getattr(obs, "output", None),
52
52
  "parent_observation_id": getattr(obs, "parent_observation_id", None),
53
+ "metadata": getattr(obs, "metadata", None),
53
54
  }
54
55
  for obs in getattr(trace_full, "observations", [])
55
56
  ]
@@ -0,0 +1,150 @@
1
+ """
2
+ LiteLLM client - handles all LLM calls directly via LiteLLM SDK with Langfuse OTEL integration.
3
+ """
4
+
5
+ import json
6
+ import base64
7
+ import logging
8
+ from uuid6 import uuid7
9
+ from fastapi import Request, Response, HTTPException
10
+ from fastapi.responses import StreamingResponse
11
+ import redis
12
+ import openai
13
+ from litellm import acompletion
14
+
15
+ from .redis_utils import register_insertion_id
16
+ from .models import ProxyConfig, ChatParams
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ async def handle_chat_completion(
22
+ config: ProxyConfig,
23
+ redis_client: redis.Redis,
24
+ request: Request,
25
+ params: ChatParams,
26
+ ) -> Response:
27
+ """
28
+ Handle chat completion requests using LiteLLM SDK directly with Langfuse OTEL.
29
+
30
+ If metadata IDs (rollout_id, etc.) are provided, they'll be added as tags
31
+ and the assistant message count will be tracked in Redis.
32
+
33
+ If encoded_base_url is provided, it will be decoded and used as api_base.
34
+ """
35
+ body = await request.body()
36
+ data = json.loads(body) if body else {}
37
+
38
+ if config.preprocess_chat_request:
39
+ data, params = config.preprocess_chat_request(data, request, params)
40
+
41
+ project_id = params.project_id
42
+ rollout_id = params.rollout_id
43
+ invocation_id = params.invocation_id
44
+ experiment_id = params.experiment_id
45
+ run_id = params.run_id
46
+ row_id = params.row_id
47
+ encoded_base_url = params.encoded_base_url
48
+
49
+ # Use default project if not specified
50
+ if project_id is None:
51
+ project_id = config.default_project_id
52
+
53
+ # Decode and add base_url if provided
54
+ if encoded_base_url:
55
+ try:
56
+ decoded_bytes = base64.urlsafe_b64decode(encoded_base_url)
57
+ data["base_url"] = decoded_bytes.decode("utf-8")
58
+ logger.debug(f"Decoded base_url: {data['base_url']}")
59
+ except Exception as e:
60
+ logger.error(f"Failed to decode base_url: {e}")
61
+ raise HTTPException(status_code=400, detail=f"Invalid encoded_base_url: {str(e)}")
62
+
63
+ # Extract API key from Authorization header and add to data
64
+ auth_header = request.headers.get("authorization", "")
65
+ if auth_header.startswith("Bearer "):
66
+ data["api_key"] = auth_header.replace("Bearer ", "").strip()
67
+
68
+ # Build metadata with tags for Langfuse
69
+ insertion_id = None
70
+ metadata = data.pop("metadata", {}) or {}
71
+ tags = list(metadata.pop("tags", []) or [])
72
+
73
+ if rollout_id is not None:
74
+ insertion_id = str(uuid7())
75
+ tags.extend(
76
+ [
77
+ f"rollout_id:{rollout_id}",
78
+ f"insertion_id:{insertion_id}",
79
+ f"invocation_id:{invocation_id}",
80
+ f"experiment_id:{experiment_id}",
81
+ f"run_id:{run_id}",
82
+ f"row_id:{row_id}",
83
+ ]
84
+ )
85
+
86
+ # Build Langfuse metadata (tags)
87
+ litellm_metadata = {"tags": tags, **metadata}
88
+
89
+ langfuse_keys = config.langfuse_keys[project_id]
90
+
91
+ # Check if streaming is requested
92
+ is_streaming = data.get("stream", False)
93
+
94
+ # Pop fields that we pass explicitly to avoid duplicate kwarg errors
95
+ request_timeout = data.pop("timeout", None) or config.request_timeout
96
+ data.pop("langfuse_public_key", None)
97
+ data.pop("langfuse_secret_key", None)
98
+
99
+ try:
100
+ # Make the completion call - pass all params through
101
+ # Note: langfuse_host is set via LANGFUSE_HOST env var at startup; OTEL doesn't support per-request host override
102
+ response = await acompletion(
103
+ **data,
104
+ metadata=litellm_metadata,
105
+ timeout=request_timeout,
106
+ langfuse_public_key=langfuse_keys["public_key"],
107
+ langfuse_secret_key=langfuse_keys["secret_key"],
108
+ )
109
+
110
+ if is_streaming:
111
+ # For streaming, return a StreamingResponse with SSE format
112
+ # Register insertion_id only after stream completes successfully
113
+ async def stream_generator():
114
+ async for chunk in response: # type: ignore[union-attr]
115
+ yield f"data: {chunk.model_dump_json()}\n\n"
116
+ yield "data: [DONE]\n\n"
117
+ # Stream completed successfully - now register
118
+ if insertion_id is not None and rollout_id is not None:
119
+ register_insertion_id(redis_client, rollout_id, insertion_id)
120
+
121
+ return StreamingResponse(
122
+ stream_generator(),
123
+ media_type="text/event-stream",
124
+ headers={
125
+ "Cache-Control": "no-cache",
126
+ "Connection": "keep-alive",
127
+ },
128
+ )
129
+ else:
130
+ # Non-streaming: register insertion_id on success
131
+ if insertion_id is not None and rollout_id is not None:
132
+ register_insertion_id(redis_client, rollout_id, insertion_id)
133
+
134
+ return Response(
135
+ content=response.model_dump_json(),
136
+ status_code=200,
137
+ media_type="application/json",
138
+ )
139
+
140
+ except HTTPException:
141
+ raise
142
+ except openai.APIError as e:
143
+ # Convert to HTTPException and let FastAPI handle it
144
+ raise HTTPException(
145
+ status_code=getattr(e, "status_code", 500),
146
+ detail=str(e),
147
+ )
148
+ except Exception as e:
149
+ logger.error(f"Unexpected error: {e}", exc_info=True)
150
+ raise HTTPException(status_code=500, detail=str(e))
@@ -53,7 +53,6 @@ class TracesParams(BaseModel):
53
53
  class ProxyConfig(BaseModel):
54
54
  """Configuration model for the LiteLLM Metadata Proxy"""
55
55
 
56
- litellm_url: str
57
56
  request_timeout: float = 300.0
58
57
  langfuse_host: str
59
58
  langfuse_keys: Dict[str, Dict[str, str]]
@@ -209,7 +209,7 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
209
209
 
210
210
  store_experiment_link(
211
211
  experiment_id,
212
- f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}",
212
+ f"https://app.fireworks.ai/dashboard/evaluation-jobs/{job_id}?account={fireworks_account_id}",
213
213
  "success",
214
214
  )
215
215
  else:
@@ -12,7 +12,6 @@ import requests
12
12
  from .models import EvaluateResult, MetricResult
13
13
  from .typed_interface import reward_function
14
14
 
15
- logging.basicConfig(level=logging.INFO)
16
15
  logger = logging.getLogger(__name__)
17
16
 
18
17
  T = TypeVar("T", bound=Callable[..., EvaluateResult])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.15.dev1
3
+ Version: 0.3.16
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -116,6 +116,9 @@ Provides-Extra: proxy
116
116
  Requires-Dist: redis>=5.0.0; extra == "proxy"
117
117
  Requires-Dist: langfuse>=2.0.0; extra == "proxy"
118
118
  Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
119
+ Requires-Dist: opentelemetry-api>=1.20.0; extra == "proxy"
120
+ Requires-Dist: opentelemetry-sdk>=1.20.0; extra == "proxy"
121
+ Requires-Dist: opentelemetry-exporter-otlp>=1.20.0; extra == "proxy"
119
122
  Dynamic: license-file
120
123
 
121
124
  # Eval Protocol
@@ -108,6 +108,9 @@ openevals>=0.1.0
108
108
  redis>=5.0.0
109
109
  langfuse>=2.0.0
110
110
  uuid6>=2025.0.0
111
+ opentelemetry-api>=1.20.0
112
+ opentelemetry-sdk>=1.20.0
113
+ opentelemetry-exporter-otlp>=1.20.0
111
114
 
112
115
  [pydantic]
113
116
  pydantic-ai>=1.0.2
@@ -153,6 +153,9 @@ proxy = [
153
153
  "redis>=5.0.0",
154
154
  "langfuse>=2.0.0",
155
155
  "uuid6>=2025.0.0",
156
+ "opentelemetry-api>=1.20.0",
157
+ "opentelemetry-sdk>=1.20.0",
158
+ "opentelemetry-exporter-otlp>=1.20.0",
156
159
  ]
157
160
 
158
161
  [project.scripts]
@@ -1,197 +0,0 @@
1
- """
2
- LiteLLM client - handles all communication with LiteLLM service.
3
- """
4
-
5
- import json
6
- import base64
7
- import asyncio
8
- import httpx
9
- import logging
10
- from uuid6 import uuid7
11
- from fastapi import Request, Response, HTTPException
12
- import redis
13
- from .redis_utils import register_insertion_id
14
- from .models import ProxyConfig, ChatParams
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
- # Retry configuration for 404 errors
19
- # 8 retries with exponential backoff (1, 2, 4, 8, 16, 32, 64, 128 seconds)
20
- # Total wait time: ~255 seconds (~4.25 minutes)
21
- MAX_RETRIES_ON_404 = 8
22
- RETRY_BASE_DELAY_SECONDS = 1
23
-
24
-
25
- async def handle_chat_completion(
26
- config: ProxyConfig,
27
- redis_client: redis.Redis,
28
- request: Request,
29
- params: ChatParams,
30
- ) -> Response:
31
- """
32
- Handle chat completion requests and forward to LiteLLM.
33
-
34
- If metadata IDs (rollout_id, etc.) are provided, they'll be added as tags
35
- and the assistant message count will be tracked in Redis.
36
-
37
- If encoded_base_url is provided, it will be decoded and added to the request.
38
- """
39
- body = await request.body()
40
- data = json.loads(body) if body else {}
41
-
42
- if config.preprocess_chat_request:
43
- data, params = config.preprocess_chat_request(data, request, params)
44
-
45
- project_id = params.project_id
46
- rollout_id = params.rollout_id
47
- invocation_id = params.invocation_id
48
- experiment_id = params.experiment_id
49
- run_id = params.run_id
50
- row_id = params.row_id
51
- encoded_base_url = params.encoded_base_url
52
-
53
- # Use default project if not specified
54
- if project_id is None:
55
- project_id = config.default_project_id
56
-
57
- # Decode and add base_url if provided
58
- if encoded_base_url:
59
- try:
60
- # Decode from URL-safe base64
61
- decoded_bytes = base64.urlsafe_b64decode(encoded_base_url)
62
- base_url = decoded_bytes.decode("utf-8")
63
- data["base_url"] = base_url
64
- logger.debug(f"Decoded base_url: {base_url}")
65
- except Exception as e:
66
- logger.error(f"Failed to decode base_url: {e}")
67
- raise HTTPException(status_code=400, detail=f"Invalid encoded_base_url: {str(e)}")
68
-
69
- # Extract API key from Authorization header and inject into request body
70
- auth_header = request.headers.get("authorization", "")
71
- if auth_header.startswith("Bearer "):
72
- api_key = auth_header.replace("Bearer ", "").strip()
73
- # Only inject API key if model is a Fireworks model
74
- model = data.get("model")
75
- if model and isinstance(model, str) and model.startswith("fireworks_ai"):
76
- data["api_key"] = api_key
77
-
78
- # If metadata IDs are provided, add them as tags
79
- insertion_id = None
80
- if rollout_id is not None:
81
- insertion_id = str(uuid7())
82
-
83
- if "metadata" not in data:
84
- data["metadata"] = {}
85
- if "tags" not in data["metadata"]:
86
- data["metadata"]["tags"] = []
87
-
88
- # Add extracted IDs as tags
89
- data["metadata"]["tags"].extend(
90
- [
91
- f"rollout_id:{rollout_id}",
92
- f"insertion_id:{insertion_id}",
93
- f"invocation_id:{invocation_id}",
94
- f"experiment_id:{experiment_id}",
95
- f"run_id:{run_id}",
96
- f"row_id:{row_id}",
97
- ]
98
- )
99
-
100
- # Add Langfuse configuration
101
- data["langfuse_public_key"] = config.langfuse_keys[project_id]["public_key"]
102
- data["langfuse_secret_key"] = config.langfuse_keys[project_id]["secret_key"]
103
- data["langfuse_host"] = config.langfuse_host
104
-
105
- # Forward to LiteLLM's standard /chat/completions endpoint
106
- # Set longer timeout for LLM API calls (LLMs can be slow)
107
- timeout = httpx.Timeout(config.request_timeout)
108
- async with httpx.AsyncClient(timeout=timeout) as client:
109
- # Copy headers from original request but exclude content-length (httpx will set it correctly)
110
- headers = dict(request.headers)
111
- headers.pop("host", None)
112
- headers.pop("content-length", None) # Let httpx calculate the correct length
113
- headers["content-type"] = "application/json"
114
-
115
- # Forward to LiteLLM
116
- litellm_url = f"{config.litellm_url}/chat/completions"
117
-
118
- # Retry loop with exponential backoff for 404 errors
119
- # Initial request
120
- response = await client.post(
121
- litellm_url,
122
- json=data, # httpx will serialize and set correct Content-Length
123
- headers=headers,
124
- )
125
-
126
- for attempt in range(MAX_RETRIES_ON_404):
127
- if response.status_code != 404:
128
- break
129
-
130
- # Wait with exponential backoff before retry
131
- delay = RETRY_BASE_DELAY_SECONDS * (2**attempt)
132
- logger.warning(f"Got 404 from LiteLLM, retrying in {delay}s (attempt {attempt + 1}/{MAX_RETRIES_ON_404})")
133
- await asyncio.sleep(delay)
134
-
135
- response = await client.post(
136
- litellm_url,
137
- json=data,
138
- headers=headers,
139
- )
140
-
141
- # Register insertion_id in Redis only on successful response
142
- if response.status_code == 200 and insertion_id is not None and rollout_id is not None:
143
- register_insertion_id(redis_client, rollout_id, insertion_id)
144
-
145
- # Return the response
146
- return Response(
147
- content=response.content,
148
- status_code=response.status_code,
149
- headers=dict(response.headers),
150
- )
151
-
152
-
153
- async def proxy_to_litellm(config: ProxyConfig, path: str, request: Request) -> Response:
154
- """
155
- Catch-all proxy: Forward any request to LiteLLM, extracting API key from Authorization header.
156
- """
157
- # Set longer timeout for LLM API calls (LLMs can be slow)
158
- timeout = httpx.Timeout(config.request_timeout)
159
- async with httpx.AsyncClient(timeout=timeout) as client:
160
- # Copy headers
161
- headers = dict(request.headers)
162
- headers.pop("host", None)
163
- headers.pop("content-length", None)
164
-
165
- # Get body
166
- body = await request.body()
167
-
168
- # Pass through API key from Authorization header
169
- if request.method in ["POST", "PUT", "PATCH"] and body:
170
- try:
171
- data = json.loads(body)
172
-
173
- auth_header = request.headers.get("authorization", "")
174
- if auth_header.startswith("Bearer "):
175
- api_key = auth_header.replace("Bearer ", "").strip()
176
- data["api_key"] = api_key
177
-
178
- # Re-serialize
179
- body = json.dumps(data).encode()
180
- except json.JSONDecodeError:
181
- pass
182
-
183
- # Forward to LiteLLM
184
- litellm_url = f"{config.litellm_url}/{path}"
185
-
186
- response = await client.request(
187
- method=request.method,
188
- url=litellm_url,
189
- headers=headers,
190
- content=body,
191
- )
192
-
193
- return Response(
194
- content=response.content,
195
- status_code=response.status_code,
196
- headers=dict(response.headers),
197
- )