eval-protocol 0.3.23__tar.gz → 0.3.25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (474)
  1. {eval_protocol-0.3.23/eval_protocol.egg-info → eval_protocol-0.3.25}/PKG-INFO +3 -6
  2. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/fireworks_tracing.py +3 -55
  4. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/app.py +15 -11
  5. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/langfuse.py +0 -1
  6. eval_protocol-0.3.25/eval_protocol/proxy/proxy_core/litellm.py +173 -0
  7. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/models.py +1 -1
  8. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/evaluation_test.py +15 -9
  9. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/evaluation_test_utils.py +6 -2
  10. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/exception_config.py +4 -0
  11. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/remote_rollout_processor.py +3 -2
  12. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/reward_function.py +1 -0
  13. {eval_protocol-0.3.23 → eval_protocol-0.3.25/eval_protocol.egg-info}/PKG-INFO +3 -6
  14. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/requires.txt +2 -5
  15. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/pyproject.toml +2 -6
  16. eval_protocol-0.3.23/eval_protocol/proxy/proxy_core/litellm.py +0 -154
  17. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/LICENSE +0 -0
  18. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/README.md +0 -0
  19. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/__init__.py +0 -0
  20. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/normalize_sandbox_fusion.py +0 -0
  21. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/utils/__init__.py +0 -0
  22. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/utils/generate_api_key.py +0 -0
  23. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/development/utils/subprocess_manager.py +0 -0
  24. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/__init__.py +0 -0
  25. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/__main__.py +0 -0
  26. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/__init__.py +0 -0
  27. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/base.py +0 -0
  28. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/bigquery.py +0 -0
  29. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/braintrust.py +0 -0
  30. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/dataframe.py +0 -0
  31. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/huggingface.py +0 -0
  32. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/langchain.py +0 -0
  33. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/langfuse.py +0 -0
  34. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/langsmith.py +0 -0
  35. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/openai_responses.py +0 -0
  36. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/trl.py +0 -0
  37. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/utils.py +0 -0
  38. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/adapters/weave.py +0 -0
  39. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/__init__.py +0 -0
  40. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/models.py +0 -0
  41. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/orchestrator.py +0 -0
  42. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resource_abc.py +0 -0
  43. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resource_pool.py +0 -0
  44. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/__init__.py +0 -0
  45. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  46. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  47. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  48. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  49. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  50. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/docker_resource.py +0 -0
  51. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  52. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  53. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/resources/sql_resource.py +0 -0
  54. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/task_manager.py +0 -0
  55. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/agent/tool_registry.py +0 -0
  56. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/auth.py +0 -0
  57. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/__init__.py +0 -0
  58. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  59. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  60. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_aime25.py +0 -0
  61. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  62. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  63. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  64. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  65. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  66. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  67. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli.py +0 -0
  68. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/__init__.py +0 -0
  69. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  70. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/common.py +0 -0
  71. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/create_rft.py +0 -0
  72. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/export_docs.py +0 -0
  73. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/local_test.py +0 -0
  74. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/logs.py +0 -0
  75. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  76. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/upload.py +0 -0
  77. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/cli_commands/utils.py +0 -0
  78. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/common_utils.py +0 -0
  79. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/config.py +0 -0
  80. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/__init__.py +0 -0
  81. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  82. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  83. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  84. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  85. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/data_loader/models.py +0 -0
  86. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/__init__.py +0 -0
  87. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  88. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  89. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  90. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  91. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/datasets/__init__.py +0 -0
  92. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/datasets/loader.py +0 -0
  93. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/directory_utils.py +0 -0
  94. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/evaluation.py +0 -0
  95. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/__init__.py +0 -0
  96. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/event_bus.py +0 -0
  97. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/logger.py +0 -0
  98. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  99. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  100. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/exceptions.py +0 -0
  101. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/execution/__init__.py +0 -0
  102. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/execution/pipeline.py +0 -0
  103. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/fireworks_rft.py +0 -0
  104. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/gcp_tools.py +0 -0
  105. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generation/cache.py +0 -0
  106. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generation/clients/base.py +0 -0
  107. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generation/clients.py +0 -0
  108. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/generic_server.py +0 -0
  109. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/get_pep440_version.py +0 -0
  110. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/human_id/__init__.py +0 -0
  111. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/human_id/dictionary.py +0 -0
  112. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/__init__.py +0 -0
  113. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/deepeval.py +0 -0
  114. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/fireworks_v1_completions_client.py +0 -0
  115. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/openai_rft.py +0 -0
  116. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/openeval.py +0 -0
  117. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  118. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  119. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/integrations/trl.py +0 -0
  120. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/__init__.py +0 -0
  121. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  122. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  123. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  124. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  125. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/init.py +0 -0
  126. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/rollout_context.py +0 -0
  127. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  128. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/log_utils/util.py +0 -0
  129. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/logging_utils.py +0 -0
  130. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/__init__.py +0 -0
  131. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/adapter.py +0 -0
  132. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/client/__init__.py +0 -0
  133. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/client/connection.py +0 -0
  134. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/clients.py +0 -0
  135. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/__init__.py +0 -0
  136. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/base_policy.py +0 -0
  137. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/manager.py +0 -0
  138. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/policy.py +0 -0
  139. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  140. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/grid_renderer.py +0 -0
  141. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  142. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/mcpgym.py +0 -0
  143. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/process_manager.py +0 -0
  144. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/session/__init__.py +0 -0
  145. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/session/manager.py +0 -0
  146. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/simple_process_manager.py +0 -0
  147. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp/simulation_server.py +0 -0
  148. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/__init__.py +0 -0
  149. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/config.py +0 -0
  150. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/main.py +0 -0
  151. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  152. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  153. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  154. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  155. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_env.py +0 -0
  156. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/__init__.py +0 -0
  157. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  158. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  159. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  160. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  161. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  162. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  163. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  164. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  165. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  166. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  167. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  168. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  169. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  170. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  171. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/models.py +0 -0
  172. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/packaging.py +0 -0
  173. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/platform_api.py +0 -0
  174. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/playback_policy.py +0 -0
  175. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/__init__.py +0 -0
  176. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  177. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  178. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/main.py +0 -0
  179. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  180. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/__init__.py +0 -0
  181. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/buffer.py +0 -0
  182. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  183. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  184. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
  185. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  186. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  187. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  188. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  189. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  190. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  191. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  192. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  193. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/execution.py +0 -0
  194. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  195. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  196. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  197. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  198. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
  199. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/parameterize.py +0 -0
  200. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/plugin.py +0 -0
  201. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/priority_scheduler.py +0 -0
  202. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/rollout_processor.py +0 -0
  203. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  204. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/store_experiment_link.py +0 -0
  205. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/store_results_url.py +0 -0
  206. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/tracing_utils.py +0 -0
  207. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/types.py +0 -0
  208. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/utils.py +0 -0
  209. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/pytest/validate_signature.py +0 -0
  210. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/__init__.py +0 -0
  211. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  212. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  213. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  214. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  215. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  216. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  217. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  218. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/llm_judge.py +0 -0
  219. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  220. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  221. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  222. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  223. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/quickstart/utils.py +0 -0
  224. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/resources.py +0 -0
  225. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/__init__.py +0 -0
  226. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/accuracy.py +0 -0
  227. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/accuracy_length.py +0 -0
  228. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  229. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  230. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/apps_testing_util.py +0 -0
  231. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/bfcl_reward.py +0 -0
  232. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/code_execution.py +0 -0
  233. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/code_execution_utils.py +0 -0
  234. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/cpp_code.py +0 -0
  235. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  236. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/format.py +0 -0
  237. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/function_calling.py +0 -0
  238. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/json_schema.py +0 -0
  239. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/language_consistency.py +0 -0
  240. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/lean_prover.py +0 -0
  241. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/length.py +0 -0
  242. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  243. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/math.py +0 -0
  244. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  245. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/reasoning_steps.py +0 -0
  246. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/repetition.py +0 -0
  247. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rewards/tag_count.py +0 -0
  248. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/rl_processing.py +0 -0
  249. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/server.py +0 -0
  250. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/stats/__init__.py +0 -0
  251. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/stats/confidence_intervals.py +0 -0
  252. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/__init__.py +0 -0
  253. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/gepa_trainer.py +0 -0
  254. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/gepa_utils.py +0 -0
  255. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/trainer.py +0 -0
  256. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/training/utils.py +0 -0
  257. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/typed_interface.py +0 -0
  258. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/__init__.py +0 -0
  259. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/errors.py +0 -0
  260. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/remote_rollout_processor.py +0 -0
  261. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/types/types.py +0 -0
  262. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/__init__.py +0 -0
  263. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/batch_evaluation.py +0 -0
  264. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/batch_transformation.py +0 -0
  265. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/browser_utils.py +0 -0
  266. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/check_server_status.py +0 -0
  267. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/dataset_helpers.py +0 -0
  268. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  269. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/logs_models.py +0 -0
  270. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/logs_server.py +0 -0
  271. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/module_loader.py +0 -0
  272. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/packaging_utils.py +0 -0
  273. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/show_results_url.py +0 -0
  274. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/static_policy.py +0 -0
  275. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/subprocess_utils.py +0 -0
  276. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol/utils/vite_server.py +0 -0
  277. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/SOURCES.txt +0 -0
  278. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/dependency_links.txt +0 -0
  279. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/entry_points.txt +0 -0
  280. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/eval_protocol.egg-info/top_level.txt +0 -0
  281. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/setup.cfg +0 -0
  282. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/setup.py +0 -0
  283. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_accuracy.py +0 -0
  284. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_accuracy_length.py +0 -0
  285. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_adapters_e2e.py +0 -0
  286. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_agent_orchestrator.py +0 -0
  287. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_agent_resources.py +0 -0
  288. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_auth.py +0 -0
  289. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_batch_evaluation.py +0 -0
  290. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_agent.py +0 -0
  291. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_args.py +0 -0
  292. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_create_rft.py +0 -0
  293. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_local_test.py +0 -0
  294. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cli_startup_benchmark.py +0 -0
  295. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_code_execution.py +0 -0
  296. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_config.py +0 -0
  297. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_control_plane_separation.py +0 -0
  298. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_cpp_code.py +0 -0
  299. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_data_driven_task_manager.py +0 -0
  300. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_deepcoder_reward.py +0 -0
  301. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_deepeval_integration.py +0 -0
  302. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_directory_utils.py +0 -0
  303. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_e2b_integration.py +0 -0
  304. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_e2b_js_integration.py +0 -0
  305. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_edge_cases.py +0 -0
  306. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_ep_upload_e2e.py +0 -0
  307. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_eval_protocol_import.py +0 -0
  308. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_evaluation.py +0 -0
  309. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_evaluation_postprocess.py +0 -0
  310. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_event_bus.py +0 -0
  311. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_event_bus_helper.py +0 -0
  312. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_examples_end_to_end.py +0 -0
  313. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_exception_config.py +0 -0
  314. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_exceptions.py +0 -0
  315. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_fireworks_api.py +0 -0
  316. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_fireworks_v1_completions_client.py +0 -0
  317. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_format.py +0 -0
  318. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_fractional_code.py +0 -0
  319. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_function_calling.py +0 -0
  320. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_gcp_tools.py +0 -0
  321. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_generic_server.py +0 -0
  322. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_human_id.py +0 -0
  323. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_integration.py +0 -0
  324. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_json_schema.py +0 -0
  325. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_kwargs_validation.py +0 -0
  326. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_language_consistency.py +0 -0
  327. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_lean_prover.py +0 -0
  328. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_lean_prover_runner.py +0 -0
  329. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_length.py +0 -0
  330. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_list_comparison_math_reward.py +0 -0
  331. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_litellm_policy_provider_fields.py +0 -0
  332. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_logs_server.py +0 -0
  333. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_logs_server_simple.py +0 -0
  334. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_math.py +0 -0
  335. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_message_field_filtering.py +0 -0
  336. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_minimal.py +0 -0
  337. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_models.py +0 -0
  338. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_models_rl.py +0 -0
  339. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_multiple_choice_math_reward.py +0 -0
  340. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_n_variant_batch_integration.py +0 -0
  341. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_n_variant_integration.py +0 -0
  342. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_no_implicit_dotenv.py +0 -0
  343. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_openai_compatibility.py +0 -0
  344. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_openai_rft_integration.py +0 -0
  345. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_openeval_integration.py +0 -0
  346. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_packaging.py +0 -0
  347. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_parallel_rollouts.py +0 -0
  348. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_platform_api.py +0 -0
  349. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_priority_scheduler.py +0 -0
  350. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_quickstart_utils.py +0 -0
  351. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_readiness.py +0 -0
  352. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_reasoning_steps.py +0 -0
  353. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_repetition.py +0 -0
  354. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_repetition_debug.py +0 -0
  355. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_retry_mechanism.py +0 -0
  356. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_reward_function.py +0 -0
  357. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_reward_protocol_import.py +0 -0
  358. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_rl_processing.py +0 -0
  359. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_rollout_control_plane_integration.py +0 -0
  360. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_rollout_logprobs.py +0 -0
  361. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_server.py +0 -0
  362. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_show_results_url.py +0 -0
  363. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_sqlite_hardening.py +0 -0
  364. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_status_migration_changes.py +0 -0
  365. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_status_migration_integration.py +0 -0
  366. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_status_model.py +0 -0
  367. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_tag_count.py +0 -0
  368. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_tau_bench_airline_smoke.py +0 -0
  369. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_training_utils.py +0 -0
  370. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_typed_interface.py +0 -0
  371. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_typed_interface_rl.py +0 -0
  372. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_upload_entrypoint.py +0 -0
  373. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_url_handling.py +0 -0
  374. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/tests/test_vite_server.py +0 -0
  375. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/__init__.py +0 -0
  376. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/agent/__init__.py +0 -0
  377. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/agent/base.py +0 -0
  378. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/agent/llm_agent.py +0 -0
  379. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/__init__.py +0 -0
  380. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/api_config.py +0 -0
  381. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/data_model.py +0 -0
  382. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/api_service/simulation_service.py +0 -0
  383. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/cli.py +0 -0
  384. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/config.py +0 -0
  385. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/airline/policy.md +0 -0
  386. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/mock/policy.md +0 -0
  387. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  388. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/retail/policy.md +0 -0
  389. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  390. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  391. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  392. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  393. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  394. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  395. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  396. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/__init__.py +0 -0
  397. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/message.py +0 -0
  398. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/simulation.py +0 -0
  399. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/data_model/tasks.py +0 -0
  400. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/__init__.py +0 -0
  401. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/__init__.py +0 -0
  402. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/data_model.py +0 -0
  403. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/environment.py +0 -0
  404. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/tools.py +0 -0
  405. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/airline/utils.py +0 -0
  406. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/__init__.py +0 -0
  407. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/data_model.py +0 -0
  408. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/environment.py +0 -0
  409. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/tools.py +0 -0
  410. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/mock/utils.py +0 -0
  411. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/__init__.py +0 -0
  412. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/data_model.py +0 -0
  413. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/environment.py +0 -0
  414. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/tools.py +0 -0
  415. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/retail/utils.py +0 -0
  416. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/__init__.py +0 -0
  417. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/data_model.py +0 -0
  418. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/environment.py +0 -0
  419. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  420. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  421. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  422. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  423. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  424. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  425. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  426. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  427. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/tools.py +0 -0
  428. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  429. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  430. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/domains/telecom/utils.py +0 -0
  431. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/__init__.py +0 -0
  432. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/db.py +0 -0
  433. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/environment.py +0 -0
  434. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/server.py +0 -0
  435. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/tool.py +0 -0
  436. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/toolkit.py +0 -0
  437. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  438. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/__init__.py +0 -0
  439. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator.py +0 -0
  440. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  441. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  442. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  443. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  444. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  445. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/metrics/__init__.py +0 -0
  446. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/metrics/agent_metrics.py +0 -0
  447. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  448. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/__init__.py +0 -0
  449. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  450. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  451. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/orchestrator/utils.py +0 -0
  452. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/registry.py +0 -0
  453. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/run.py +0 -0
  454. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/__init__.py +0 -0
  455. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/check_data.py +0 -0
  456. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  457. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/start_servers.py +0 -0
  458. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/scripts/view_simulations.py +0 -0
  459. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/user/__init__.py +0 -0
  460. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/user/base.py +0 -0
  461. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/user/user_simulator.py +0 -0
  462. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/__init__.py +0 -0
  463. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/display.py +0 -0
  464. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/io_utils.py +0 -0
  465. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/llm_utils.py +0 -0
  466. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/pydantic_utils.py +0 -0
  467. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vendor/tau2/utils/utils.py +0 -0
  468. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/versioneer.py +0 -0
  469. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  470. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/index-DFeF7AG_.js +0 -0
  471. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/index-DFeF7AG_.js.map +0 -0
  472. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/index-DvKW7FQL.css +0 -0
  473. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  474. {eval_protocol-0.3.23 → eval_protocol-0.3.25}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.23
3
+ Version: 0.3.25
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -24,7 +24,7 @@ Requires-Dist: hydra-core>=1.3.2
24
24
  Requires-Dist: omegaconf>=2.3.0
25
25
  Requires-Dist: httpx>=0.24.0
26
26
  Requires-Dist: anthropic>=0.59.0
27
- Requires-Dist: litellm<1.82.0,>=1.81.0
27
+ Requires-Dist: litellm<1.75.0
28
28
  Requires-Dist: pytest>=6.0.0
29
29
  Requires-Dist: pytest-asyncio>=0.21.0
30
30
  Requires-Dist: peewee>=3.18.2
@@ -111,14 +111,11 @@ Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
111
111
  Provides-Extra: langgraph-tools
112
112
  Requires-Dist: langgraph>=0.6.7; extra == "langgraph-tools"
113
113
  Requires-Dist: langchain>=0.3.0; extra == "langgraph-tools"
114
+ Requires-Dist: langchain-fireworks>=0.3.0; extra == "langgraph-tools"
114
115
  Provides-Extra: proxy
115
116
  Requires-Dist: redis>=5.0.0; extra == "proxy"
116
117
  Requires-Dist: langfuse>=2.0.0; extra == "proxy"
117
118
  Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
118
- Requires-Dist: litellm<1.82.0,>=1.81.0; extra == "proxy"
119
- Requires-Dist: opentelemetry-api>=1.29.0; extra == "proxy"
120
- Requires-Dist: opentelemetry-sdk>=1.29.0; extra == "proxy"
121
- Requires-Dist: opentelemetry-exporter-otlp>=1.29.0; extra == "proxy"
122
119
  Dynamic: license-file
123
120
 
124
121
  # Eval Protocol
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-03-06T17:59:19-0800",
11
+ "date": "2026-03-13T16:09:44-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "5ac0bb46b02200b500ae535dcc5a86c4179c8408",
15
- "version": "0.3.23"
14
+ "full-revisionid": "3c8d8f23f7b301697f246c64e57d08fa1c7af50b",
15
+ "version": "0.3.25"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -8,10 +8,8 @@ from __future__ import annotations
8
8
  import logging
9
9
  import requests
10
10
  from datetime import datetime
11
- import ast
12
- import json
13
- import os
14
11
  from typing import Any, Dict, List, Optional, Protocol
12
+ import os
15
13
 
16
14
  from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
17
15
  from .base import BaseAdapter
@@ -46,43 +44,6 @@ class TraceDictConverter(Protocol):
46
44
  ...
47
45
 
48
46
 
49
- def extract_otel_attributes(observations: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
50
- """Attempt to extract and parse attributes from raw_gen_ai_request observation. This only works when stored in OTEL format.
51
-
52
- Args:
53
- observations: List of observation dictionaries from the trace
54
-
55
- Returns:
56
- Dict with all attributes parsed. Or None if not found.
57
- """
58
- for obs in observations:
59
- if obs.get("name") == "raw_gen_ai_request" and obs.get("type") == "SPAN":
60
- metadata = obs.get("metadata") or {}
61
- attributes = metadata.get("attributes") or {}
62
-
63
- result: Dict[str, Any] = {}
64
-
65
- for key, value in attributes.items():
66
- # Try to parse stringified objects (could be Python repr or JSON)
67
- if isinstance(value, str) and value.startswith(("[", "{")):
68
- try:
69
- result[key] = ast.literal_eval(value)
70
- except Exception as e:
71
- logger.debug("Failed to parse %s with ast.literal_eval: %s", key, e)
72
- try:
73
- result[key] = json.loads(value)
74
- except Exception as e:
75
- logger.debug("Failed to parse %s with json.loads: %s", key, e)
76
- result[key] = value
77
- else:
78
- result[key] = value
79
-
80
- if result:
81
- return result
82
-
83
- return None
84
-
85
-
86
47
  def convert_trace_dict_to_evaluation_row(
87
48
  trace: Dict[str, Any], include_tool_calls: bool = True, span_name: Optional[str] = None
88
49
  ) -> Optional[EvaluationRow]:
@@ -135,19 +96,6 @@ def convert_trace_dict_to_evaluation_row(
135
96
  ):
136
97
  break # Break early if we've found all the metadata we need
137
98
 
138
- observations = trace.get("observations") or []
139
- # We can only extract when stored in OTEL format.
140
- otel_attributes = extract_otel_attributes(observations)
141
- if otel_attributes:
142
- # Find choices from any provider (llm.*.choices pattern)
143
- choices = None
144
- for key, value in otel_attributes.items():
145
- if key.endswith(".choices") and isinstance(value, list):
146
- choices = value
147
- break
148
- if choices and len(choices) > 0:
149
- execution_metadata.finish_reason = choices[0].get("finish_reason")
150
-
151
99
  return EvaluationRow(
152
100
  messages=messages,
153
101
  tools=tools,
@@ -212,7 +160,7 @@ def extract_messages_from_trace_dict(
212
160
  # Fallback: use the last GENERATION observation which typically contains full chat history
213
161
  if not messages:
214
162
  try:
215
- all_observations = trace.get("observations") or []
163
+ all_observations = trace.get("observations", [])
216
164
  gens = [obs for obs in all_observations if obs.get("type") == "GENERATION"]
217
165
  if gens:
218
166
  gens.sort(key=lambda x: x.get("start_time", ""))
@@ -238,7 +186,7 @@ def get_final_generation_in_span_dict(trace: Dict[str, Any], span_name: str) ->
238
186
  The final generation dictionary, or None if not found
239
187
  """
240
188
  # Get all observations from the trace
241
- all_observations = trace.get("observations") or []
189
+ all_observations = trace.get("observations", [])
242
190
 
243
191
  # Find a span with the given name that has generation children
244
192
  parent_span = None
@@ -15,7 +15,7 @@ from contextlib import asynccontextmanager
15
15
 
16
16
  from .models import ProxyConfig, LangfuseTracesResponse, TracesParams, ChatParams, ChatRequestHook, TracesRequestHook
17
17
  from .auth import AuthProvider, NoAuthProvider
18
- from .litellm import handle_chat_completion
18
+ from .litellm import handle_chat_completion, proxy_to_litellm
19
19
  from .langfuse import fetch_langfuse_traces, pointwise_fetch_langfuse_trace
20
20
 
21
21
  # Configure logging before any other imports (so all modules inherit this config)
@@ -35,6 +35,10 @@ def build_proxy_config(
35
35
  preprocess_traces_request: Optional[TracesRequestHook] = None,
36
36
  ) -> ProxyConfig:
37
37
  """Load environment and secrets, and build ProxyConfig"""
38
+ # Env
39
+ litellm_url = os.getenv("LITELLM_URL")
40
+ if not litellm_url:
41
+ raise ValueError("LITELLM_URL environment variable must be set")
38
42
  request_timeout = float(os.getenv("REQUEST_TIMEOUT", "300.0"))
39
43
  langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
40
44
 
@@ -62,6 +66,7 @@ def build_proxy_config(
62
66
  raise ValueError(f"Invalid format in secrets file {secrets_path.name}: {e}")
63
67
 
64
68
  return ProxyConfig(
69
+ litellm_url=litellm_url,
65
70
  request_timeout=request_timeout,
66
71
  langfuse_host=langfuse_host,
67
72
  langfuse_keys=langfuse_keys,
@@ -108,16 +113,6 @@ def create_app(
108
113
  app.state.config = build_proxy_config(preprocess_chat_request, preprocess_traces_request)
109
114
  app.state.redis = init_redis()
110
115
 
111
- config = app.state.config
112
- default_keys = config.langfuse_keys[config.default_project_id]
113
- os.environ["LANGFUSE_PUBLIC_KEY"] = default_keys["public_key"]
114
- os.environ["LANGFUSE_SECRET_KEY"] = default_keys["secret_key"]
115
- os.environ.setdefault("LANGFUSE_HOST", config.langfuse_host)
116
-
117
- import litellm
118
-
119
- litellm.callbacks = ["langfuse_otel"]
120
-
121
116
  try:
122
117
  yield
123
118
  finally:
@@ -302,4 +297,13 @@ def create_app(
302
297
  async def health():
303
298
  return {"status": "healthy", "service": "metadata-proxy"}
304
299
 
300
+ # Catch-all
301
+ @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
302
+ async def catch_all_proxy(
303
+ path: str,
304
+ request: Request,
305
+ config: ProxyConfig = Depends(get_config),
306
+ ):
307
+ return await proxy_to_litellm(config, path, request)
308
+
305
309
  return app
@@ -50,7 +50,6 @@ def _serialize_trace_to_dict(trace_full: Any) -> Dict[str, Any]:
50
50
  "input": getattr(obs, "input", None),
51
51
  "output": getattr(obs, "output", None),
52
52
  "parent_observation_id": getattr(obs, "parent_observation_id", None),
53
- "metadata": getattr(obs, "metadata", None),
54
53
  }
55
54
  for obs in getattr(trace_full, "observations", [])
56
55
  ]
@@ -0,0 +1,173 @@
1
+ """
2
+ LiteLLM client - handles all communication with LiteLLM service.
3
+ """
4
+
5
+ import json
6
+ import base64
7
+ import httpx
8
+ import logging
9
+ from uuid6 import uuid7
10
+ from fastapi import Request, Response, HTTPException
11
+ import redis
12
+ from .redis_utils import register_insertion_id
13
+ from .models import ProxyConfig, ChatParams
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ async def handle_chat_completion(
19
+ config: ProxyConfig,
20
+ redis_client: redis.Redis,
21
+ request: Request,
22
+ params: ChatParams,
23
+ ) -> Response:
24
+ """
25
+ Handle chat completion requests and forward to LiteLLM.
26
+
27
+ If metadata IDs (rollout_id, etc.) are provided, they'll be added as tags
28
+ and the assistant message count will be tracked in Redis.
29
+
30
+ If encoded_base_url is provided, it will be decoded and added to the request.
31
+ """
32
+ body = await request.body()
33
+ data = json.loads(body) if body else {}
34
+
35
+ if config.preprocess_chat_request:
36
+ data, params = config.preprocess_chat_request(data, request, params)
37
+
38
+ project_id = params.project_id
39
+ rollout_id = params.rollout_id
40
+ invocation_id = params.invocation_id
41
+ experiment_id = params.experiment_id
42
+ run_id = params.run_id
43
+ row_id = params.row_id
44
+ encoded_base_url = params.encoded_base_url
45
+
46
+ # Use default project if not specified
47
+ if project_id is None:
48
+ project_id = config.default_project_id
49
+
50
+ # Decode and add base_url if provided
51
+ if encoded_base_url:
52
+ try:
53
+ # Decode from URL-safe base64
54
+ decoded_bytes = base64.urlsafe_b64decode(encoded_base_url)
55
+ base_url = decoded_bytes.decode("utf-8")
56
+ data["base_url"] = base_url
57
+ logger.debug(f"Decoded base_url: {base_url}")
58
+ except Exception as e:
59
+ logger.error(f"Failed to decode base_url: {e}")
60
+ raise HTTPException(status_code=400, detail=f"Invalid encoded_base_url: {str(e)}")
61
+
62
+ # Extract API key from Authorization header and inject into request body
63
+ auth_header = request.headers.get("authorization", "")
64
+ if auth_header.startswith("Bearer "):
65
+ api_key = auth_header.replace("Bearer ", "").strip()
66
+ # Only inject API key if model is a Fireworks model
67
+ model = data.get("model")
68
+ if model and isinstance(model, str) and model.startswith("fireworks_ai"):
69
+ data["api_key"] = api_key
70
+
71
+ # If metadata IDs are provided, add them as tags
72
+ insertion_id = None
73
+ if rollout_id is not None:
74
+ insertion_id = str(uuid7())
75
+
76
+ if "metadata" not in data:
77
+ data["metadata"] = {}
78
+ if "tags" not in data["metadata"]:
79
+ data["metadata"]["tags"] = []
80
+
81
+ # Add extracted IDs as tags
82
+ data["metadata"]["tags"].extend(
83
+ [
84
+ f"rollout_id:{rollout_id}",
85
+ f"insertion_id:{insertion_id}",
86
+ f"invocation_id:{invocation_id}",
87
+ f"experiment_id:{experiment_id}",
88
+ f"run_id:{run_id}",
89
+ f"row_id:{row_id}",
90
+ ]
91
+ )
92
+
93
+ # Add Langfuse configuration
94
+ data["langfuse_public_key"] = config.langfuse_keys[project_id]["public_key"]
95
+ data["langfuse_secret_key"] = config.langfuse_keys[project_id]["secret_key"]
96
+ data["langfuse_host"] = config.langfuse_host
97
+
98
+ # Forward to LiteLLM's standard /chat/completions endpoint
99
+ # Set longer timeout for LLM API calls (LLMs can be slow)
100
+ timeout = httpx.Timeout(config.request_timeout)
101
+ async with httpx.AsyncClient(timeout=timeout) as client:
102
+ # Copy headers from original request but exclude content-length (httpx will set it correctly)
103
+ headers = dict(request.headers)
104
+ headers.pop("host", None)
105
+ headers.pop("content-length", None) # Let httpx calculate the correct length
106
+ headers["content-type"] = "application/json"
107
+
108
+ # Forward to LiteLLM
109
+ litellm_url = f"{config.litellm_url}/chat/completions"
110
+
111
+ response = await client.post(
112
+ litellm_url,
113
+ json=data, # httpx will serialize and set correct Content-Length
114
+ headers=headers,
115
+ )
116
+
117
+ # Register insertion_id in Redis only on successful response
118
+ if response.status_code == 200 and insertion_id is not None and rollout_id is not None:
119
+ register_insertion_id(redis_client, rollout_id, insertion_id)
120
+
121
+ # Return the response
122
+ return Response(
123
+ content=response.content,
124
+ status_code=response.status_code,
125
+ headers=dict(response.headers),
126
+ )
127
+
128
+
129
+ async def proxy_to_litellm(config: ProxyConfig, path: str, request: Request) -> Response:
130
+ """
131
+ Catch-all proxy: Forward any request to LiteLLM, extracting API key from Authorization header.
132
+ """
133
+ # Set longer timeout for LLM API calls (LLMs can be slow)
134
+ timeout = httpx.Timeout(config.request_timeout)
135
+ async with httpx.AsyncClient(timeout=timeout) as client:
136
+ # Copy headers
137
+ headers = dict(request.headers)
138
+ headers.pop("host", None)
139
+ headers.pop("content-length", None)
140
+
141
+ # Get body
142
+ body = await request.body()
143
+
144
+ # Pass through API key from Authorization header
145
+ if request.method in ["POST", "PUT", "PATCH"] and body:
146
+ try:
147
+ data = json.loads(body)
148
+
149
+ auth_header = request.headers.get("authorization", "")
150
+ if auth_header.startswith("Bearer "):
151
+ api_key = auth_header.replace("Bearer ", "").strip()
152
+ data["api_key"] = api_key
153
+
154
+ # Re-serialize
155
+ body = json.dumps(data).encode()
156
+ except json.JSONDecodeError:
157
+ pass
158
+
159
+ # Forward to LiteLLM
160
+ litellm_url = f"{config.litellm_url}/{path}"
161
+
162
+ response = await client.request(
163
+ method=request.method,
164
+ url=litellm_url,
165
+ headers=headers,
166
+ content=body,
167
+ )
168
+
169
+ return Response(
170
+ content=response.content,
171
+ status_code=response.status_code,
172
+ headers=dict(response.headers),
173
+ )
@@ -53,6 +53,7 @@ class TracesParams(BaseModel):
53
53
  class ProxyConfig(BaseModel):
54
54
  """Configuration model for the LiteLLM Metadata Proxy"""
55
55
 
56
+ litellm_url: str
56
57
  request_timeout: float = 300.0
57
58
  langfuse_host: str
58
59
  langfuse_keys: Dict[str, Dict[str, str]]
@@ -72,7 +73,6 @@ class ObservationResponse(BaseModel):
72
73
  input: Optional[Any] = None
73
74
  output: Optional[Any] = None
74
75
  parent_observation_id: Optional[str] = None
75
- metadata: Optional[Dict[str, Any]] = None
76
76
 
77
77
 
78
78
  class TraceResponse(BaseModel):
@@ -449,6 +449,8 @@ def evaluation_test(
449
449
  finally:
450
450
  if output_buffer:
451
451
  await output_buffer.close()
452
+ await rollout_processor.acleanup()
453
+ rollout_processor.cleanup()
452
454
 
453
455
  for res in priority_results:
454
456
  run_idx = (res.execution_metadata.extra or {}).get("run_index", 0)
@@ -697,15 +699,19 @@ def evaluation_test(
697
699
  # Lazy import (cached after first import above)
698
700
  from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
699
701
 
700
- if isinstance(rollout_processor, MCPGymRolloutProcessor):
701
- # For MCPGymRolloutProcessor, create and execute tasks one at a time to avoid port conflicts
702
- for run_idx in range(num_runs):
703
- task = asyncio.create_task(execute_run(run_idx, config))
704
- await task
705
- else:
706
- # For other processors, create all tasks at once and run in parallel
707
- # Concurrency is now controlled by the shared semaphore in each rollout processor
708
- await run_tasks_with_run_progress(execute_run, num_runs, config)
702
+ try:
703
+ if isinstance(rollout_processor, MCPGymRolloutProcessor):
704
+ # For MCPGymRolloutProcessor, create and execute tasks one at a time to avoid port conflicts
705
+ for run_idx in range(num_runs):
706
+ task = asyncio.create_task(execute_run(run_idx, config))
707
+ await task
708
+ else:
709
+ # For other processors, create all tasks at once and run in parallel
710
+ # Concurrency is now controlled by the shared semaphore in each rollout processor
711
+ await run_tasks_with_run_progress(execute_run, num_runs, config)
712
+ finally:
713
+ await rollout_processor.acleanup()
714
+ rollout_processor.cleanup()
709
715
 
710
716
  experiment_duration_seconds = time.perf_counter() - experiment_start_time
711
717
 
@@ -476,8 +476,12 @@ async def rollout_processor_with_retry(
476
476
  yield result
477
477
 
478
478
  finally:
479
- await rollout_processor.acleanup()
480
- rollout_processor.cleanup()
479
+ # Cleanup is intentionally NOT called here. rollout_processor_with_retry
480
+ # is invoked per-run, but the processor (and its session) is shared
481
+ # across parallel runs. Closing per-run would kill in-flight requests
482
+ # in other runs. Cleanup is called once after all runs complete in
483
+ # evaluation_test.py.
484
+ pass
481
485
 
482
486
 
483
487
  def sanitize_filename(text: str) -> str:
@@ -23,6 +23,7 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
23
23
  return _default_retryable_exceptions
24
24
 
25
25
  # Lazy imports (these are expensive)
26
+ import aiohttp
26
27
  import httpx
27
28
  import litellm
28
29
  import requests
@@ -32,6 +33,9 @@ def get_default_retryable_exceptions() -> Set[Type[Exception]]:
32
33
  ConnectionError, # type: ignore[assignment]
33
34
  TimeoutError, # type: ignore[assignment]
34
35
  OSError, # type: ignore[assignment] # Covers network-related OS errors
36
+ # aiohttp library exceptions
37
+ aiohttp.ClientConnectionError,
38
+ aiohttp.ServerDisconnectedError,
35
39
  # Requests library exceptions
36
40
  requests.exceptions.ConnectionError,
37
41
  requests.exceptions.Timeout,
@@ -104,6 +104,9 @@ class RemoteRolloutProcessor(RolloutProcessor):
104
104
  try:
105
105
  session = self._get_or_create_session()
106
106
  async with session.post(init_url, json=init_payload.model_dump(), timeout=timeout_init) as resp:
107
+ if resp.status >= 500:
108
+ body = await resp.text()
109
+ raise ConnectionError(f"Remote /init returned server error (HTTP {resp.status}): {body}")
107
110
  if resp.status >= 400:
108
111
  body = await resp.text()
109
112
  raise RuntimeError(f"Remote /init failed (HTTP {resp.status}): {body}")
@@ -215,8 +218,6 @@ class RemoteRolloutProcessor(RolloutProcessor):
215
218
  loop = asyncio.get_running_loop()
216
219
  loop.create_task(self._session.close())
217
220
  except RuntimeError:
218
- # No running event loop - can't safely close the session.
219
- # The session will be garbage collected eventually, but warn about it.
220
221
  logger.warning(
221
222
  "RemoteRolloutProcessor.cleanup() called outside of async context. "
222
223
  "Session may not be properly closed. Use `await processor.acleanup()` when possible."
@@ -12,6 +12,7 @@ import requests
12
12
  from .models import EvaluateResult, MetricResult
13
13
  from .typed_interface import reward_function
14
14
 
15
+ logging.basicConfig(level=logging.INFO)
15
16
  logger = logging.getLogger(__name__)
16
17
 
17
18
  T = TypeVar("T", bound=Callable[..., EvaluateResult])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.23
3
+ Version: 0.3.25
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -24,7 +24,7 @@ Requires-Dist: hydra-core>=1.3.2
24
24
  Requires-Dist: omegaconf>=2.3.0
25
25
  Requires-Dist: httpx>=0.24.0
26
26
  Requires-Dist: anthropic>=0.59.0
27
- Requires-Dist: litellm<1.82.0,>=1.81.0
27
+ Requires-Dist: litellm<1.75.0
28
28
  Requires-Dist: pytest>=6.0.0
29
29
  Requires-Dist: pytest-asyncio>=0.21.0
30
30
  Requires-Dist: peewee>=3.18.2
@@ -111,14 +111,11 @@ Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
111
111
  Provides-Extra: langgraph-tools
112
112
  Requires-Dist: langgraph>=0.6.7; extra == "langgraph-tools"
113
113
  Requires-Dist: langchain>=0.3.0; extra == "langgraph-tools"
114
+ Requires-Dist: langchain-fireworks>=0.3.0; extra == "langgraph-tools"
114
115
  Provides-Extra: proxy
115
116
  Requires-Dist: redis>=5.0.0; extra == "proxy"
116
117
  Requires-Dist: langfuse>=2.0.0; extra == "proxy"
117
118
  Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
118
- Requires-Dist: litellm<1.82.0,>=1.81.0; extra == "proxy"
119
- Requires-Dist: opentelemetry-api>=1.29.0; extra == "proxy"
120
- Requires-Dist: opentelemetry-sdk>=1.29.0; extra == "proxy"
121
- Requires-Dist: opentelemetry-exporter-otlp>=1.29.0; extra == "proxy"
122
119
  Dynamic: license-file
123
120
 
124
121
  # Eval Protocol
@@ -12,7 +12,7 @@ hydra-core>=1.3.2
12
12
  omegaconf>=2.3.0
13
13
  httpx>=0.24.0
14
14
  anthropic>=0.59.0
15
- litellm<1.82.0,>=1.81.0
15
+ litellm<1.75.0
16
16
  pytest>=6.0.0
17
17
  pytest-asyncio>=0.21.0
18
18
  peewee>=3.18.2
@@ -93,6 +93,7 @@ langchain-core>=0.3.75
93
93
  [langgraph_tools]
94
94
  langgraph>=0.6.7
95
95
  langchain>=0.3.0
96
+ langchain-fireworks>=0.3.0
96
97
 
97
98
  [langsmith]
98
99
  langsmith>=0.1.86
@@ -107,10 +108,6 @@ openevals>=0.1.0
107
108
  redis>=5.0.0
108
109
  langfuse>=2.0.0
109
110
  uuid6>=2025.0.0
110
- litellm<1.82.0,>=1.81.0
111
- opentelemetry-api>=1.29.0
112
- opentelemetry-sdk>=1.29.0
113
- opentelemetry-exporter-otlp>=1.29.0
114
111
 
115
112
  [pydantic]
116
113
  pydantic-ai>=1.0.2
@@ -31,7 +31,7 @@ dependencies = [
31
31
  "omegaconf>=2.3.0",
32
32
  "httpx>=0.24.0",
33
33
  "anthropic>=0.59.0",
34
- "litellm>=1.81.0,<1.82.0",
34
+ "litellm<1.75.0",
35
35
  "pytest>=6.0.0",
36
36
  "pytest-asyncio>=0.21.0",
37
37
  "peewee>=3.18.2",
@@ -146,17 +146,13 @@ langgraph = [
146
146
  langgraph_tools = [
147
147
  "langgraph>=0.6.7",
148
148
  "langchain>=0.3.0",
149
- # langchain-fireworks removed: incompatible with fireworks-ai>=1.0.0
149
+ "langchain-fireworks>=0.3.0",
150
150
  ]
151
151
 
152
152
  proxy = [
153
153
  "redis>=5.0.0",
154
154
  "langfuse>=2.0.0",
155
155
  "uuid6>=2025.0.0",
156
- "litellm>=1.81.0,<1.82.0",
157
- "opentelemetry-api>=1.29.0",
158
- "opentelemetry-sdk>=1.29.0",
159
- "opentelemetry-exporter-otlp>=1.29.0",
160
156
  ]
161
157
 
162
158
  [project.scripts]