eval-protocol 0.3.29__tar.gz → 0.3.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (475)
  1. {eval_protocol-0.3.29/eval_protocol.egg-info → eval_protocol-0.3.30}/PKG-INFO +2 -1
  2. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/fireworks_tracing.py +78 -1
  4. eval_protocol-0.3.30/eval_protocol/adapters/lp_deserializer.py +109 -0
  5. eval_protocol-0.3.30/eval_protocol/adapters/r3_deserializer.py +187 -0
  6. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/remote_rollout_processor.py +44 -37
  7. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/tracing_utils.py +65 -6
  8. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/remote_rollout_processor.py +1 -0
  9. {eval_protocol-0.3.29 → eval_protocol-0.3.30/eval_protocol.egg-info}/PKG-INFO +2 -1
  10. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/SOURCES.txt +2 -0
  11. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/requires.txt +1 -0
  12. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/pyproject.toml +1 -0
  13. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/LICENSE +0 -0
  14. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/README.md +0 -0
  15. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/__init__.py +0 -0
  16. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/normalize_sandbox_fusion.py +0 -0
  17. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/utils/__init__.py +0 -0
  18. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/utils/generate_api_key.py +0 -0
  19. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/development/utils/subprocess_manager.py +0 -0
  20. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/__init__.py +0 -0
  21. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/__main__.py +0 -0
  22. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/__init__.py +0 -0
  23. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/base.py +0 -0
  24. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/bigquery.py +0 -0
  25. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/braintrust.py +0 -0
  26. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/dataframe.py +0 -0
  27. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/huggingface.py +0 -0
  28. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/langchain.py +0 -0
  29. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/langfuse.py +0 -0
  30. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/langsmith.py +0 -0
  31. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/openai_responses.py +0 -0
  32. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/trl.py +0 -0
  33. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/utils.py +0 -0
  34. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/adapters/weave.py +0 -0
  35. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/__init__.py +0 -0
  36. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/models.py +0 -0
  37. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/orchestrator.py +0 -0
  38. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resource_abc.py +0 -0
  39. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resource_pool.py +0 -0
  40. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/__init__.py +0 -0
  41. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  42. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  43. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  44. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  45. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  46. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/docker_resource.py +0 -0
  47. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  48. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  49. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/resources/sql_resource.py +0 -0
  50. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/task_manager.py +0 -0
  51. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/agent/tool_registry.py +0 -0
  52. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/auth.py +0 -0
  53. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/__init__.py +0 -0
  54. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  55. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  56. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_aime25.py +0 -0
  57. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  58. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  59. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  60. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  61. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  62. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  63. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli.py +0 -0
  64. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/__init__.py +0 -0
  65. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  66. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/common.py +0 -0
  67. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/create_rft.py +0 -0
  68. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/export_docs.py +0 -0
  69. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/local_test.py +0 -0
  70. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/logs.py +0 -0
  71. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  72. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/upload.py +0 -0
  73. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/cli_commands/utils.py +0 -0
  74. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/common_utils.py +0 -0
  75. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/config.py +0 -0
  76. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/__init__.py +0 -0
  77. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  78. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  79. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  80. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  81. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/data_loader/models.py +0 -0
  82. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/__init__.py +0 -0
  83. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  84. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  85. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  86. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  87. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/datasets/__init__.py +0 -0
  88. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/datasets/loader.py +0 -0
  89. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/directory_utils.py +0 -0
  90. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/evaluation.py +0 -0
  91. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/__init__.py +0 -0
  92. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/event_bus.py +0 -0
  93. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/logger.py +0 -0
  94. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  95. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  96. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/exceptions.py +0 -0
  97. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/execution/__init__.py +0 -0
  98. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/execution/pipeline.py +0 -0
  99. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/fireworks_rft.py +0 -0
  100. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/gcp_tools.py +0 -0
  101. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generation/cache.py +0 -0
  102. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generation/clients/base.py +0 -0
  103. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generation/clients.py +0 -0
  104. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/generic_server.py +0 -0
  105. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/get_pep440_version.py +0 -0
  106. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/human_id/__init__.py +0 -0
  107. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/human_id/dictionary.py +0 -0
  108. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/__init__.py +0 -0
  109. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/deepeval.py +0 -0
  110. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/fireworks_v1_completions_client.py +0 -0
  111. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/openai_rft.py +0 -0
  112. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/openeval.py +0 -0
  113. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  114. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  115. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/integrations/trl.py +0 -0
  116. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/__init__.py +0 -0
  117. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  118. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  119. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  120. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  121. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/init.py +0 -0
  122. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/rollout_context.py +0 -0
  123. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  124. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/log_utils/util.py +0 -0
  125. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/logging_utils.py +0 -0
  126. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/__init__.py +0 -0
  127. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/adapter.py +0 -0
  128. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/client/__init__.py +0 -0
  129. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/client/connection.py +0 -0
  130. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/clients.py +0 -0
  131. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/__init__.py +0 -0
  132. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/base_policy.py +0 -0
  133. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/manager.py +0 -0
  134. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/policy.py +0 -0
  135. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  136. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/grid_renderer.py +0 -0
  137. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  138. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/mcpgym.py +0 -0
  139. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/process_manager.py +0 -0
  140. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/session/__init__.py +0 -0
  141. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/session/manager.py +0 -0
  142. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/simple_process_manager.py +0 -0
  143. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp/simulation_server.py +0 -0
  144. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/__init__.py +0 -0
  145. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/config.py +0 -0
  146. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/main.py +0 -0
  147. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  148. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  149. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  150. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  151. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_env.py +0 -0
  152. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/__init__.py +0 -0
  153. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  154. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  155. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  156. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  157. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  158. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  159. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  160. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  161. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  162. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  163. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  166. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  167. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/models.py +0 -0
  168. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/packaging.py +0 -0
  169. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/platform_api.py +0 -0
  170. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/playback_policy.py +0 -0
  171. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/__init__.py +0 -0
  172. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  173. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/app.py +0 -0
  174. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  175. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  176. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  177. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/main.py +0 -0
  178. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/models.py +0 -0
  179. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  180. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/__init__.py +0 -0
  181. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/buffer.py +0 -0
  182. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  183. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  184. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
  185. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  186. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  187. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  188. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  189. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  190. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  191. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  192. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test.py +0 -0
  193. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  194. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  195. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/exception_config.py +0 -0
  196. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/execution.py +0 -0
  197. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  198. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  199. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  200. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  201. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
  202. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/parameterize.py +0 -0
  203. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/plugin.py +0 -0
  204. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/priority_scheduler.py +0 -0
  205. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/rollout_processor.py +0 -0
  206. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  207. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/store_experiment_link.py +0 -0
  208. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/store_results_url.py +0 -0
  209. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/types.py +0 -0
  210. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/utils.py +0 -0
  211. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/pytest/validate_signature.py +0 -0
  212. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/__init__.py +0 -0
  213. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  214. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  215. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  216. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  217. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  218. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  219. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  220. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/llm_judge.py +0 -0
  221. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  222. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  223. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  224. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  225. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/quickstart/utils.py +0 -0
  226. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/resources.py +0 -0
  227. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/reward_function.py +0 -0
  228. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/__init__.py +0 -0
  229. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/accuracy.py +0 -0
  230. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/accuracy_length.py +0 -0
  231. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  232. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  233. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_testing_util.py +0 -0
  234. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/bfcl_reward.py +0 -0
  235. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/code_execution.py +0 -0
  236. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/code_execution_utils.py +0 -0
  237. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/cpp_code.py +0 -0
  238. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  239. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/format.py +0 -0
  240. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/function_calling.py +0 -0
  241. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/json_schema.py +0 -0
  242. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/language_consistency.py +0 -0
  243. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/lean_prover.py +0 -0
  244. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/length.py +0 -0
  245. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  246. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/math.py +0 -0
  247. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  248. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/reasoning_steps.py +0 -0
  249. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/repetition.py +0 -0
  250. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rewards/tag_count.py +0 -0
  251. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/rl_processing.py +0 -0
  252. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/server.py +0 -0
  253. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/stats/__init__.py +0 -0
  254. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/stats/confidence_intervals.py +0 -0
  255. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/__init__.py +0 -0
  256. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/gepa_trainer.py +0 -0
  257. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/gepa_utils.py +0 -0
  258. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/trainer.py +0 -0
  259. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/training/utils.py +0 -0
  260. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/typed_interface.py +0 -0
  261. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/__init__.py +0 -0
  262. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/errors.py +0 -0
  263. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/types/types.py +0 -0
  264. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/__init__.py +0 -0
  265. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/batch_evaluation.py +0 -0
  266. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/batch_transformation.py +0 -0
  267. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/browser_utils.py +0 -0
  268. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/check_server_status.py +0 -0
  269. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/dataset_helpers.py +0 -0
  270. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  271. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/logs_models.py +0 -0
  272. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/logs_server.py +0 -0
  273. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/module_loader.py +0 -0
  274. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/packaging_utils.py +0 -0
  275. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/show_results_url.py +0 -0
  276. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/static_policy.py +0 -0
  277. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/subprocess_utils.py +0 -0
  278. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol/utils/vite_server.py +0 -0
  279. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/dependency_links.txt +0 -0
  280. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/entry_points.txt +0 -0
  281. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/eval_protocol.egg-info/top_level.txt +0 -0
  282. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/setup.cfg +0 -0
  283. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/setup.py +0 -0
  284. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_accuracy.py +0 -0
  285. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_accuracy_length.py +0 -0
  286. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_adapters_e2e.py +0 -0
  287. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_agent_orchestrator.py +0 -0
  288. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_agent_resources.py +0 -0
  289. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_auth.py +0 -0
  290. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_batch_evaluation.py +0 -0
  291. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_agent.py +0 -0
  292. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_args.py +0 -0
  293. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_create_rft.py +0 -0
  294. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_local_test.py +0 -0
  295. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cli_startup_benchmark.py +0 -0
  296. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_code_execution.py +0 -0
  297. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_config.py +0 -0
  298. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_control_plane_separation.py +0 -0
  299. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_cpp_code.py +0 -0
  300. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_data_driven_task_manager.py +0 -0
  301. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_deepcoder_reward.py +0 -0
  302. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_deepeval_integration.py +0 -0
  303. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_directory_utils.py +0 -0
  304. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_e2b_integration.py +0 -0
  305. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_e2b_js_integration.py +0 -0
  306. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_edge_cases.py +0 -0
  307. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_ep_upload_e2e.py +0 -0
  308. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_eval_protocol_import.py +0 -0
  309. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_evaluation.py +0 -0
  310. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_evaluation_postprocess.py +0 -0
  311. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_event_bus.py +0 -0
  312. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_event_bus_helper.py +0 -0
  313. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_examples_end_to_end.py +0 -0
  314. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_exception_config.py +0 -0
  315. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_exceptions.py +0 -0
  316. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_fireworks_api.py +0 -0
  317. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_fireworks_v1_completions_client.py +0 -0
  318. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_format.py +0 -0
  319. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_fractional_code.py +0 -0
  320. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_function_calling.py +0 -0
  321. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_gcp_tools.py +0 -0
  322. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_generic_server.py +0 -0
  323. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_human_id.py +0 -0
  324. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_integration.py +0 -0
  325. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_json_schema.py +0 -0
  326. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_kwargs_validation.py +0 -0
  327. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_language_consistency.py +0 -0
  328. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_lean_prover.py +0 -0
  329. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_lean_prover_runner.py +0 -0
  330. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_length.py +0 -0
  331. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_list_comparison_math_reward.py +0 -0
  332. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_litellm_policy_provider_fields.py +0 -0
  333. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_logs_server.py +0 -0
  334. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_logs_server_simple.py +0 -0
  335. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_math.py +0 -0
  336. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_message_field_filtering.py +0 -0
  337. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_minimal.py +0 -0
  338. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_models.py +0 -0
  339. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_models_rl.py +0 -0
  340. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_multiple_choice_math_reward.py +0 -0
  341. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_n_variant_batch_integration.py +0 -0
  342. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_n_variant_integration.py +0 -0
  343. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_no_implicit_dotenv.py +0 -0
  344. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_openai_compatibility.py +0 -0
  345. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_openai_rft_integration.py +0 -0
  346. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_openeval_integration.py +0 -0
  347. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_packaging.py +0 -0
  348. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_parallel_rollouts.py +0 -0
  349. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_platform_api.py +0 -0
  350. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_priority_scheduler.py +0 -0
  351. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_quickstart_utils.py +0 -0
  352. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_readiness.py +0 -0
  353. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_reasoning_steps.py +0 -0
  354. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_repetition.py +0 -0
  355. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_repetition_debug.py +0 -0
  356. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_retry_mechanism.py +0 -0
  357. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_reward_function.py +0 -0
  358. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_reward_protocol_import.py +0 -0
  359. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_rl_processing.py +0 -0
  360. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_rollout_control_plane_integration.py +0 -0
  361. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_rollout_logprobs.py +0 -0
  362. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_server.py +0 -0
  363. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_show_results_url.py +0 -0
  364. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_sqlite_hardening.py +0 -0
  365. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_status_migration_changes.py +0 -0
  366. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_status_migration_integration.py +0 -0
  367. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_status_model.py +0 -0
  368. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_tag_count.py +0 -0
  369. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_tau_bench_airline_smoke.py +0 -0
  370. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_training_utils.py +0 -0
  371. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_typed_interface.py +0 -0
  372. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_typed_interface_rl.py +0 -0
  373. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_upload_entrypoint.py +0 -0
  374. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_url_handling.py +0 -0
  375. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/tests/test_vite_server.py +0 -0
  376. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/__init__.py +0 -0
  377. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/agent/__init__.py +0 -0
  378. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/agent/base.py +0 -0
  379. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/agent/llm_agent.py +0 -0
  380. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/__init__.py +0 -0
  381. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/api_config.py +0 -0
  382. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/data_model.py +0 -0
  383. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/api_service/simulation_service.py +0 -0
  384. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/cli.py +0 -0
  385. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/config.py +0 -0
  386. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/airline/policy.md +0 -0
  387. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/mock/policy.md +0 -0
  388. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  389. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/retail/policy.md +0 -0
  390. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  391. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  392. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  393. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  394. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  395. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  396. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  397. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/__init__.py +0 -0
  398. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/message.py +0 -0
  399. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/simulation.py +0 -0
  400. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/data_model/tasks.py +0 -0
  401. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/__init__.py +0 -0
  402. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/__init__.py +0 -0
  403. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/data_model.py +0 -0
  404. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/environment.py +0 -0
  405. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/tools.py +0 -0
  406. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/utils.py +0 -0
  407. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/__init__.py +0 -0
  408. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/data_model.py +0 -0
  409. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/environment.py +0 -0
  410. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/tools.py +0 -0
  411. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/utils.py +0 -0
  412. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/__init__.py +0 -0
  413. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/data_model.py +0 -0
  414. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/environment.py +0 -0
  415. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/tools.py +0 -0
  416. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/utils.py +0 -0
  417. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/__init__.py +0 -0
  418. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/data_model.py +0 -0
  419. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/environment.py +0 -0
  420. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  421. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  422. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  423. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  424. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  425. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  426. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  427. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  428. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tools.py +0 -0
  429. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  430. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  431. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/utils.py +0 -0
  432. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/__init__.py +0 -0
  433. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/db.py +0 -0
  434. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/environment.py +0 -0
  435. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/server.py +0 -0
  436. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/tool.py +0 -0
  437. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/toolkit.py +0 -0
  438. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  439. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/__init__.py +0 -0
  440. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator.py +0 -0
  441. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  442. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  443. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  444. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  445. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  446. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/metrics/__init__.py +0 -0
  447. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/metrics/agent_metrics.py +0 -0
  448. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  449. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/__init__.py +0 -0
  450. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  451. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  452. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/utils.py +0 -0
  453. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/registry.py +0 -0
  454. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/run.py +0 -0
  455. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/__init__.py +0 -0
  456. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/check_data.py +0 -0
  457. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  458. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/start_servers.py +0 -0
  459. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/scripts/view_simulations.py +0 -0
  460. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/user/__init__.py +0 -0
  461. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/user/base.py +0 -0
  462. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/user/user_simulator.py +0 -0
  463. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/__init__.py +0 -0
  464. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/display.py +0 -0
  465. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/io_utils.py +0 -0
  466. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/llm_utils.py +0 -0
  467. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/pydantic_utils.py +0 -0
  468. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vendor/tau2/utils/utils.py +0 -0
  469. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/versioneer.py +0 -0
  470. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  471. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DFeF7AG_.js +0 -0
  472. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DFeF7AG_.js.map +0 -0
  473. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DvKW7FQL.css +0 -0
  474. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  475. {eval_protocol-0.3.29 → eval_protocol-0.3.30}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.29
3
+ Version: 0.3.30
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ Requires-Dist: addict>=2.4.0
40
40
  Requires-Dist: deepdiff>=6.0.0
41
41
  Requires-Dist: websockets>=15.0.1
42
42
  Requires-Dist: fastapi>=0.116.1
43
+ Requires-Dist: zstandard>=0.19.0
43
44
  Provides-Extra: dev
44
45
  Requires-Dist: build; extra == "dev"
45
46
  Requires-Dist: twine; extra == "dev"
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-04-28T20:17:42-0700",
11
+ "date": "2026-05-29T16:09:24-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "6b9bea9979400c7e0d2eec9f4f167a98d4a2a057",
15
- "version": "0.3.29"
14
+ "full-revisionid": "1bd5447a3afbca3b71e0f0d205ed7cff6c3afe5d",
15
+ "version": "0.3.30"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -16,6 +16,8 @@ import os
16
16
 
17
17
  from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
18
18
  from .base import BaseAdapter
19
+ from .lp_deserializer import decompress_and_parse_lp
20
+ from .r3_deserializer import decompress_and_parse_r3
19
21
  from .utils import extract_messages_from_data
20
22
  from ..common_utils import get_user_agent
21
23
 
@@ -100,13 +102,53 @@ def convert_trace_dict_to_evaluation_row(
100
102
  ):
101
103
  break # Break early if we've found all the metadata we need
102
104
 
105
+ # Extract router replay payloads when present
106
+ payloads = trace.get("payloads")
107
+ if isinstance(payloads, dict):
108
+ router_replay = payloads.get("router_replay")
109
+ if isinstance(router_replay, dict) and router_replay.get("data"):
110
+ try:
111
+ matrices, r3_meta = decompress_and_parse_r3(router_replay["data"])
112
+ if execution_metadata.extra is None:
113
+ execution_metadata.extra = {}
114
+ execution_metadata.extra["routing_matrices"] = matrices
115
+ execution_metadata.extra["routing_metadata"] = r3_meta
116
+ except Exception as e:
117
+ logger.warning("Failed to decompress R3 payload for trace %s: %s", trace.get("id"), e)
118
+
119
+ logprobs_payload = payloads.get("logprobs")
120
+ if isinstance(logprobs_payload, dict) and logprobs_payload.get("data"):
121
+ try:
122
+ logprobs, token_ids, lp_meta = decompress_and_parse_lp(logprobs_payload["data"])
123
+ if execution_metadata.extra is None:
124
+ execution_metadata.extra = {}
125
+ execution_metadata.extra["completion_logprobs"] = logprobs
126
+ if token_ids is not None:
127
+ execution_metadata.extra["completion_token_ids"] = token_ids
128
+ execution_metadata.extra["logprobs_metadata"] = lp_meta
129
+
130
+ for i in range(len(messages) - 1, -1, -1):
131
+ if messages[i].role == "assistant":
132
+ content_entries = [{"logprob": lp} for lp in logprobs]
133
+ if token_ids is not None:
134
+ for entry, tid in zip(content_entries, token_ids):
135
+ entry["token_id"] = tid
136
+ messages[i].logprobs = {"content": content_entries}
137
+ break
138
+ except Exception as e:
139
+ logger.warning(
140
+ "Failed to decompress logprobs payload for trace %s: %s",
141
+ trace.get("id"),
142
+ e,
143
+ )
144
+
103
145
  return EvaluationRow(
104
146
  messages=messages,
105
147
  tools=tools,
106
148
  input_metadata=InputMetadata(
107
149
  row_id=row_id,
108
150
  session_data={
109
- "langfuse_trace_id": trace.get("id"), # Store the trace ID here
151
+ "langfuse_trace_id": trace.get("id"),
110
152
  },
111
153
  ),
112
154
  execution_metadata=execution_metadata,
@@ -375,6 +417,37 @@ class FireworksTracingAdapter(BaseAdapter):
375
417
  )
376
418
  return results
377
419
 
420
async def async_get_status(self, session: aiohttp.ClientSession, rollout_id: str) -> Optional[Dict[str, Any]]:
    """Query the status endpoint for a single rollout.

    The versioned path (``/v1/status``) is tried first and the unversioned
    path (``/status``) second. A 404 or a client/timeout/JSON-decoding error
    on one URL moves on to the next one.

    Args:
        session: aiohttp session used to issue the GET requests.
        rollout_id: Sent as the ``rollout_id`` query parameter.

    Returns:
        The decoded JSON body of the first successful response (an empty dict
        when the body decodes to a falsy value), or ``None`` when every URL
        failed; in that case the last failure is logged at error level.
    """
    request_headers = {
        "Authorization": f"Bearer {self._get_api_key()}",
        "User-Agent": get_user_agent(),
    }
    query: Dict[str, Any] = {"rollout_id": rollout_id}
    request_timeout = aiohttp.ClientTimeout(total=self.timeout)

    candidate_urls = [f"{self.base_url}/v1/status", f"{self.base_url}/status"]
    failure: Optional[str] = None
    for candidate in candidate_urls:
        try:
            async with session.get(
                candidate, params=query, headers=request_headers, timeout=request_timeout
            ) as response:
                if response.status != 404:
                    response.raise_for_status()
                    # content_type=None: decode the body regardless of the
                    # Content-Type header the server sent.
                    decoded = await response.json(content_type=None)
                    return decoded or {}
                # A 404 means this path is not served; fall through to the next URL.
                failure = f"404 for {candidate}"
        except (aiohttp.ClientError, asyncio.TimeoutError, json.JSONDecodeError) as exc:
            failure = str(exc)

    if failure:
        logger.error("Failed to fetch status from Fireworks (tried %s): %s", candidate_urls, failure)
    return None
450
+
378
451
  def get_evaluation_rows(
379
452
  self,
380
453
  tags: List[str],
@@ -395,6 +468,7 @@ class FireworksTracingAdapter(BaseAdapter):
395
468
  max_retries: int = 3,
396
469
  span_name: Optional[str] = None,
397
470
  converter: Optional[TraceDictConverter] = None,
471
+ include_payloads: bool = False,
398
472
  ) -> List[EvaluationRow]:
399
473
  """Pull traces from Langfuse via proxy and convert to EvaluationRow format.
400
474
 
@@ -418,6 +492,8 @@ class FireworksTracingAdapter(BaseAdapter):
418
492
  max_retries: Max retry attempts used by proxy (default: 3)
419
493
  converter: Optional custom converter implementing TraceDictConverter protocol.
420
494
  If provided, this will be used instead of the default conversion logic.
495
+ include_payloads: If True, request payload data (e.g., router replay)
496
+ from the gateway and decompress it into the returned EvaluationRows.
421
497
 
422
498
  Returns:
423
499
  List[EvaluationRow]: Converted evaluation rows
@@ -448,6 +524,7 @@ class FireworksTracingAdapter(BaseAdapter):
448
524
  "to_timestamp": to_timestamp.isoformat() if to_timestamp else None,
449
525
  "sleep_between_gets": sleep_between_gets,
450
526
  "max_retries": max_retries,
527
+ "include_payloads": include_payloads if include_payloads else None,
451
528
  }
452
529
 
453
530
  # Remove None values
@@ -0,0 +1,109 @@
1
+ """LP/v1 binary deserializer for per-token logprobs payloads.
2
+
3
+ Implements the inverse of the tracing gateway's ``logprobs_serializer.serialize_logprobs``.
4
+ See that module for the full header specification.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import struct
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ import zstandard as zstd
14
+
15
# Four-byte tag that must open every LP/v1 payload; checked in _parse_header.
MAGIC = b"LP01"
# The only header version _parse_header accepts.
HEADER_VERSION = 1
# Sentinel wire id: parse_logprobs treats any entry carrying it as lacking a
# valid token id and then returns None instead of the token id list.
MISSING_TOKEN_ID = -1
# One body entry: little-endian int32 wire token id followed by a float32 logprob.
ENTRY_FORMAT = "<if"
ENTRY_SIZE = struct.calcsize(ENTRY_FORMAT)  # 8 bytes
# Header layout (little-endian, unpadded), in the order _parse_header unpacks it:
# magic (4s), version (B), flags (B), reserved_u16 (H), token_count (I),
# body_byte_length (I), reserved_u64 (Q).
HEADER_FORMAT = "<4sBBHIIQ"
HEADER_SIZE = struct.calcsize(HEADER_FORMAT)  # 24 bytes
22
+
23
+
24
def _parse_header(raw: bytes) -> Dict[str, Any]:
    """Decode and validate the fixed-size LP/v1 header at the start of ``raw``.

    Args:
        raw: Uncompressed payload bytes; only the first ``HEADER_SIZE`` bytes
            are read here.

    Returns:
        A dict with the keys ``flags``, ``reserved_u16``, ``token_count``,
        ``body_byte_length`` and ``reserved_u64`` (magic and version are
        checked but not returned).

    Raises:
        ValueError: If ``raw`` is shorter than the header, the magic tag is
            wrong, or the header version is not ``HEADER_VERSION``.
    """
    available = len(raw)
    if available < HEADER_SIZE:
        raise ValueError(f"Payload too short for lp/v1 header: {available} < {HEADER_SIZE}")

    # unpack_from reads exactly HEADER_SIZE bytes from the front of the buffer.
    magic, version, *remaining_fields = struct.unpack_from(HEADER_FORMAT, raw)

    if magic != MAGIC:
        raise ValueError(f"Bad LP/v1 magic: {magic!r}")
    if version != HEADER_VERSION:
        raise ValueError(f"Unsupported lp/v1 header version: {version}")

    field_names = (
        "flags",
        "reserved_u16",
        "token_count",
        "body_byte_length",
        "reserved_u64",
    )
    return dict(zip(field_names, remaining_fields))
50
+
51
+
52
def parse_logprobs(raw: bytes) -> Tuple[List[float], Optional[List[int]], Dict[str, Any]]:
    """Parse uncompressed LP/v1 bytes into logprobs, optional token ids, and metadata.

    Args:
        raw: The complete uncompressed payload: the header immediately
            followed by ``token_count`` entries of ``ENTRY_SIZE`` bytes each.

    Returns:
        ``(logprobs, token_ids, metadata)``. ``logprobs`` has one float per
        entry. ``token_ids`` is the list of wire ids, or ``None`` if any entry
        carried ``MISSING_TOKEN_ID``. ``metadata`` is the parsed header dict
        extended with ``scope``, ``completion_token_count`` and
        ``all_token_ids_valid``.

    Raises:
        ValueError: If the header is invalid, ``token_count`` is zero,
            ``body_byte_length`` disagrees with ``token_count``, or the payload
            length is not exactly header plus body.
    """
    header = _parse_header(raw)
    token_count = header["token_count"]
    body_byte_length = header["body_byte_length"]

    if token_count == 0:
        raise ValueError("LP/v1 token_count must be > 0")
    if body_byte_length != token_count * ENTRY_SIZE:
        raise ValueError(
            f"body_byte_length ({body_byte_length}) != token_count * {ENTRY_SIZE} "
            f"({token_count * ENTRY_SIZE})"
        )

    expected_len = HEADER_SIZE + body_byte_length
    if len(raw) != expected_len:
        raise ValueError(f"LP/v1 payload length mismatch: {len(raw)} != {expected_len}")

    logprobs: List[float] = []
    token_ids: List[int] = []
    # The checks above guarantee the body is exactly token_count entries long,
    # which is what iter_unpack requires. The memoryview avoids copying the body.
    for wire_id, logprob in struct.iter_unpack(ENTRY_FORMAT, memoryview(raw)[HEADER_SIZE:]):
        token_ids.append(wire_id)
        logprobs.append(logprob)

    # A single missing id invalidates the whole id list for the caller.
    all_token_ids_valid = MISSING_TOKEN_ID not in token_ids

    metadata: Dict[str, Any] = {
        "scope": "completion_only",
        "completion_token_count": token_count,
        "all_token_ids_valid": all_token_ids_valid,
    }
    header.update(metadata)
    ids_out: Optional[List[int]] = token_ids if all_token_ids_valid else None
    return logprobs, ids_out, header
92
+
93
+
94
def decompress_and_parse_lp(data_b64: str) -> Tuple[List[float], Optional[List[int]], Dict[str, Any]]:
    """Turn a base64 + zstd LP/v1 blob into completion logprobs and token ids.

    The input is base64-decoded, zstd-decompressed, and the resulting bytes
    are handed to :func:`parse_logprobs`.

    Args:
        data_b64: Base64-encoded zstd-compressed LP binary blob from
            ``payloads.logprobs.data``.

    Returns:
        ``(logprobs, token_ids, metadata)`` exactly as produced by
        :func:`parse_logprobs`: ``token_ids`` is ``None`` if any wire id was
        ``MISSING_TOKEN_ID``, and ``metadata`` includes ``all_token_ids_valid``
        and ``completion_token_count``.
    """
    compressed_blob = base64.b64decode(data_b64)
    uncompressed = zstd.ZstdDecompressor().decompress(compressed_blob)
    return parse_logprobs(uncompressed)
@@ -0,0 +1,187 @@
1
+ """R3/v1 binary deserializer for router-replay payloads.
2
+
3
+ Implements the inverse of the packed binary format produced by the tracing
4
+ gateway's ``r3_serializer.serialize_r3``. See that module for the full
5
+ header specification.
6
+
7
+ The main entry point is :func:`decompress_and_parse_r3`, which accepts the
8
+ base64-encoded compressed blob returned by the gateway's
9
+ ``/v1/traces/pointwise?include_payloads=true`` endpoint and produces
10
+ per-token routing matrices in the same ``List[Optional[str]]`` format used
11
+ by the direct inference path (``DeploymentSampler.sample_with_tokens()``).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import base64
17
+ import struct
18
+ from enum import IntEnum
19
+ from typing import Any, Dict, List, Optional, Tuple
20
+
21
+ import zstandard as zstd
22
+
23
# Four-byte tag that must open every R3/v1 payload; checked in _parse_header.
MAGIC = b"R3V1"
# Header layout (little-endian, unpadded), in the order _parse_header unpacks it:
# magic (4s), version (B), selector_mode (B), routing_dtype (B), flags (B),
# total_token_count (I), replayed_token_count (I), replay_start_token (I),
# selector_byte_length (I), matrix_byte_length (Q).
HEADER_FORMAT = "<4sBBBBIIIIQ"
HEADER_SIZE = struct.calcsize(HEADER_FORMAT)  # 32 bytes
# Number of selector bits _read_bitmap_positions reads from each selector byte.
BITS_PER_BYTE = 8
27
+
28
+
29
class _SelectorMode(IntEnum):
    """Header codes describing which token positions carry a routing matrix."""

    # Every position in range(total_token_count) is replayed.
    ALL = 0
    # A contiguous run of replayed_token_count positions starting at replay_start_token.
    SUFFIX = 1
    # Positions are the set bits of the selector section (see _read_bitmap_positions).
    BITMAP = 2
33
+
34
+
35
class _RoutingDtype(IntEnum):
    """Header codes for the routing matrix element type.

    This module only reports the code by name in the returned metadata.
    NOTE(review): presumably 1-byte and 2-byte unsigned elements; the width
    is not used anywhere here, so confirm against the serializer.
    """

    UINT8 = 1
    UINT16 = 2
38
+
39
+
40
# Lower-cased member names keyed by enum member; decompress_and_parse_r3 uses
# these to report selector mode and routing dtype as strings in its metadata.
_SELECTOR_MODE_NAMES = {v: v.name.lower() for v in _SelectorMode}
_ROUTING_DTYPE_NAMES = {v: v.name.lower() for v in _RoutingDtype}
42
+
43
+
44
def _parse_header(raw: bytes) -> Dict[str, Any]:
    """Decode and validate the fixed-size R3/v1 header at the start of ``raw``.

    Args:
        raw: Uncompressed payload bytes; only the first ``HEADER_SIZE`` bytes
            are read here.

    Returns:
        A dict with the keys ``selector_mode``, ``routing_dtype``, ``flags``,
        ``total_token_count``, ``replayed_token_count``,
        ``replay_start_token``, ``selector_byte_length`` and
        ``matrix_byte_length`` (magic and version are checked, not returned).

    Raises:
        ValueError: If ``raw`` is shorter than the header, the magic tag is
            wrong, or the header version is not 1.
    """
    if len(raw) < HEADER_SIZE:
        raise ValueError(
            f"Payload too short for r3/v1 header: {len(raw)} < {HEADER_SIZE}"
        )

    unpacked = struct.unpack(HEADER_FORMAT, raw[:HEADER_SIZE])
    magic, version = unpacked[0], unpacked[1]

    if magic != MAGIC:
        raise ValueError(f"Bad R3 magic: {magic!r}")
    if version != 1:
        raise ValueError(f"Unsupported R3 header version: {version}")

    # Names for the fields that follow magic and version, in wire order.
    keys = (
        "selector_mode",
        "routing_dtype",
        "flags",
        "total_token_count",
        "replayed_token_count",
        "replay_start_token",
        "selector_byte_length",
        "matrix_byte_length",
    )
    return {key: value for key, value in zip(keys, unpacked[2:])}
78
+
79
+
80
def _read_bitmap_positions(
    selector_bytes: bytes, total_token_count: int
) -> List[int]:
    """Return, in ascending order, the token indices whose bitmap bit is set.

    Bit ``i % BITS_PER_BYTE`` (counting from the least significant bit) of
    byte ``i // BITS_PER_BYTE`` corresponds to token ``i``. Tokens that fall
    beyond the end of ``selector_bytes`` are treated as not set.
    """
    # Only indices backed by an actual selector byte can be set.
    addressable = min(total_token_count, len(selector_bytes) * BITS_PER_BYTE)
    return [
        token_idx
        for token_idx in range(addressable)
        if (selector_bytes[token_idx // BITS_PER_BYTE] >> (token_idx % BITS_PER_BYTE)) & 1
    ]
91
+
92
+
93
def decompress_and_parse_r3(
    data_b64: str,
) -> Tuple[List[Optional[str]], Dict[str, Any]]:
    """Decompress and unpack an R3/v1 payload into per-token routing matrices.

    Args:
        data_b64: Base64-encoded zstd-compressed R3 binary blob, as returned
            by the tracing gateway in ``payloads.router_replay.data``.

    Returns:
        A tuple of ``(routing_matrices, metadata)`` where:

        - ``routing_matrices`` is a ``List[Optional[str]]`` of length
          ``total_token_count``. Each replayed position holds the
          base64-encoded bytes of its routing matrix; all other positions
          are ``None``.
        - ``metadata`` is a dict with keys ``routing_dtype``,
          ``selector_mode``, ``total_token_count``, ``replayed_token_count``,
          ``replay_start_token``.

    Raises:
        ValueError: If the header is invalid, the matrix section cannot be
            split evenly across the replayed tokens, the body is shorter than
            the header claims, the selector mode is unknown, a suffix
            selector reaches past ``total_token_count``, or the selector
            yields a different number of positions than
            ``replayed_token_count``.
    """
    compressed = base64.b64decode(data_b64)

    # ZstdCompressor.compress() embeds the uncompressed size in the frame
    # header by default, so the library can auto-allocate the output buffer.
    decompressor = zstd.ZstdDecompressor()
    raw = decompressor.decompress(compressed)

    header = _parse_header(raw)

    selector_mode = header["selector_mode"]
    routing_dtype = header["routing_dtype"]
    total_token_count = header["total_token_count"]
    replayed_token_count = header["replayed_token_count"]
    replay_start_token = header["replay_start_token"]
    selector_byte_length = header["selector_byte_length"]
    matrix_byte_length = header["matrix_byte_length"]

    # Unknown codes are reported as their numeric value rather than rejected here.
    metadata: Dict[str, Any] = {
        "routing_dtype": _ROUTING_DTYPE_NAMES.get(routing_dtype, str(routing_dtype)),
        "selector_mode": _SELECTOR_MODE_NAMES.get(selector_mode, str(selector_mode)),
        "total_token_count": total_token_count,
        "replayed_token_count": replayed_token_count,
        "replay_start_token": replay_start_token,
    }

    # Nothing was replayed: no matrices to split, every position is absent.
    if replayed_token_count == 0:
        return [None] * total_token_count, metadata

    # Per-token matrix byte size is implicit in the payload: all replayed
    # tokens share the same matrix length, so we can recover it from the
    # matrix section total length divided by the replayed-token count.
    if matrix_byte_length % replayed_token_count != 0:
        raise ValueError(
            f"matrix_byte_length ({matrix_byte_length}) is not a multiple of "
            f"replayed_token_count ({replayed_token_count}); cannot split "
            "into per-token matrices"
        )
    matrix_elem_size = matrix_byte_length // replayed_token_count

    body = raw[HEADER_SIZE:]
    expected_body_length = selector_byte_length + matrix_byte_length
    if len(body) < expected_body_length:
        raise ValueError(
            f"Payload body too short for selector and matrix sections: "
            f"{len(body)} < {expected_body_length}"
        )

    selector_bytes = body[:selector_byte_length]
    matrix_bytes = body[selector_byte_length : selector_byte_length + matrix_byte_length]

    if selector_mode == _SelectorMode.ALL:
        replayed_positions = list(range(total_token_count))
    elif selector_mode == _SelectorMode.SUFFIX:
        suffix_end = replay_start_token + replayed_token_count
        # Reject a run that reaches past the token list up front; otherwise the
        # assignment loop below would fail with a bare IndexError.
        if suffix_end > total_token_count:
            raise ValueError(
                f"Suffix selector covers tokens [{replay_start_token}, {suffix_end}) "
                f"but total_token_count is {total_token_count}"
            )
        replayed_positions = list(range(replay_start_token, suffix_end))
    elif selector_mode == _SelectorMode.BITMAP:
        replayed_positions = _read_bitmap_positions(selector_bytes, total_token_count)
    else:
        raise ValueError(f"Unknown selector_mode: {selector_mode}")

    if len(replayed_positions) != replayed_token_count:
        raise ValueError(
            f"Selector produced {len(replayed_positions)} replayed positions, "
            f"but header replayed_token_count is {replayed_token_count}"
        )

    # Split matrix bytes into per-token chunks and base64-encode each one
    matrices: List[Optional[str]] = [None] * total_token_count
    for idx, pos in enumerate(replayed_positions):
        start = idx * matrix_elem_size
        end = start + matrix_elem_size
        matrices[pos] = base64.b64encode(matrix_bytes[start:end]).decode("ascii")

    return matrices, metadata
@@ -35,11 +35,13 @@ class RemoteRolloutProcessor(RolloutProcessor):
35
35
  model_base_url: str = "https://tracing.fireworks.ai",
36
36
  poll_interval: float = 1.0,
37
37
  timeout_seconds: float = 120.0,
38
+ include_payloads: bool = False,
38
39
  ):
39
40
  # Prefer constructor-provided configuration. These can be overridden via
40
41
  # config.kwargs at call time for backward compatibility.
41
42
  self._remote_base_url = remote_base_url
42
43
  self._model_base_url = model_base_url
44
+ self._include_payloads = include_payloads
43
45
  if os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"):
44
46
  self._remote_base_url = os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL")
45
47
  _ep_model_base_url = os.getenv("EP_MODEL_BASE_URL")
@@ -122,45 +124,46 @@ class RemoteRolloutProcessor(RolloutProcessor):
122
124
 
123
125
  while time.time() < deadline:
124
126
  session = self._get_or_create_session()
125
- completed_logs = await self._tracing_adapter.async_search_logs(
126
- session, tags=[f"rollout_id:{row.execution_metadata.rollout_id}"]
127
+ status_result = await self._tracing_adapter.async_get_status(
128
+ session,
129
+ rollout_id=row.execution_metadata.rollout_id,
127
130
  )
128
- # Filter for logs that actually have status information
129
- status_logs = []
130
- for log in completed_logs:
131
- status_dict = log.get("status")
132
- if status_dict and isinstance(status_dict, dict) and "code" in status_dict:
133
- status_logs.append(log)
134
-
135
- if status_logs:
136
- if len(status_logs) > 1:
137
- logger.warning(
138
- "Found %s status logs for rollout %s; expected at most 1. Using the first one: %s",
139
- len(status_logs),
140
- row.execution_metadata.rollout_id,
141
- status_logs[0],
142
- )
143
- # Use the first log with status information
144
- status_log = status_logs[0]
145
- status_dict = status_log.get("status")
146
- raw_extras = status_log.get("extras") or {}
147
- status_extras = {
148
- k: v for k, v in raw_extras.items() if k not in ("logger_name", "level", "timestamp")
149
- }
131
+ status = (status_result or {}).get("status")
132
+ if isinstance(status, dict) and "code" in status:
133
+ status_code = status["code"]
134
+ if status_code == Status.Code.RUNNING:
135
+ await asyncio.sleep(poll_interval)
136
+ continue
150
137
 
151
138
  logger.info(
152
- f"Found status log for rollout {row.execution_metadata.rollout_id}: {status_log.get('message', '')}"
139
+ "Found status for rollout %s with code %s",
140
+ row.execution_metadata.rollout_id,
141
+ status_code,
153
142
  )
154
143
 
155
- status_code = status_dict.get("code")
156
- status_message = status_dict.get("message", "")
157
- status_details = status_dict.get("details", [])
158
-
159
- logger.info(
160
- f"Found Fireworks log for rollout {row.execution_metadata.rollout_id} with status code {status_code}"
144
+ # /status only returns the code; backfill message/details/extras from Logs once.
145
+ status_message: str = ""
146
+ status_details: list = []
147
+ status_extras: dict = {}
148
+ completed_logs = await self._tracing_adapter.async_search_logs(
149
+ session, tags=[f"rollout_id:{row.execution_metadata.rollout_id}"]
161
150
  )
151
+ # Pick the log row whose status code matches the terminal
152
+ # code from /status, so intermediate RUNNING checkpoints
153
+ # don't poison the backfill.
154
+ for log in completed_logs:
155
+ sd = log.get("status")
156
+ if isinstance(sd, dict) and sd.get("code") == status_code:
157
+ status_message = sd.get("message", "") or ""
158
+ status_details = sd.get("details", []) or []
159
+ raw_extras = log.get("extras") or {}
160
+ status_extras = {
161
+ k: v
162
+ for k, v in raw_extras.items()
163
+ if k not in ("logger_name", "level", "timestamp")
164
+ }
165
+ break
162
166
 
163
- # Create and raise exception if appropriate, preserving original message
164
167
  exception = exception_for_status_code(status_code, status_message)
165
168
  if exception is not None:
166
169
  raise exception
@@ -171,10 +174,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
171
174
  details=status_details,
172
175
  )
173
176
 
174
- if row.execution_metadata.extra:
175
- row.execution_metadata.extra.update(status_extras)
176
- else:
177
- row.execution_metadata.extra = status_extras
177
+ if status_extras:
178
+ if row.execution_metadata.extra:
179
+ row.execution_metadata.extra.update(status_extras)
180
+ else:
181
+ row.execution_metadata.extra = status_extras
178
182
 
179
183
  logger.info("Stopping polling for rollout %s", row.execution_metadata.rollout_id)
180
184
  break
@@ -192,7 +196,10 @@ class RemoteRolloutProcessor(RolloutProcessor):
192
196
  row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
193
197
 
194
198
  def _update_with_trace() -> None:
195
- return update_row_with_remote_trace(row, default_fireworks_output_data_loader, model_base_url)
199
+ return update_row_with_remote_trace(
200
+ row, default_fireworks_output_data_loader, model_base_url,
201
+ include_payloads=self._include_payloads,
202
+ )
196
203
 
197
204
  await asyncio.to_thread(_update_with_trace) # Update row with remote trace in-place
198
205
  return row
@@ -22,9 +22,61 @@ def default_fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDat
22
22
  # Use EP_REMOTE_API_KEY for fetching remote traces, falling back to FIREWORKS_API_KEY
23
23
  api_key = os.environ.get("EP_REMOTE_API_KEY") or os.environ.get("FIREWORKS_API_KEY")
24
24
  adapter = FireworksTracingAdapter(base_url=base_url, api_key=api_key)
25
- return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"], max_retries=5)
25
+ return adapter.get_evaluation_rows(
26
+ tags=[f"rollout_id:{config.rollout_id}"],
27
+ max_retries=5,
28
+ include_payloads=config.include_payloads,
29
+ )
26
30
 
27
- return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=filter_longest_conversation)
31
+ def preprocess_traces(rows: List[EvaluationRow]) -> List[EvaluationRow]:
32
+ filtered_rows = filter_longest_conversation(rows)
33
+ if config.include_payloads and filtered_rows:
34
+ _merge_payloads_into_longest_row(filtered_rows[0], rows)
35
+ return filtered_rows
36
+
37
+ return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=preprocess_traces)
38
+
39
+
40
def _merge_payloads_into_longest_row(longest_row: EvaluationRow, rows: List[EvaluationRow]) -> None:
    """
    Fold per-turn payload data from every trace row into the longest row.

    Each trace row carries payloads for its final assistant turn. Rows are
    visited in ascending order of message count; for each one, the index of
    its final assistant turn is the number of assistant messages it holds
    minus one. Rows whose index falls outside the longest row's assistant
    messages are skipped.

    For every remaining row:
      * if its last assistant message has logprobs and the matching assistant
        message of ``longest_row`` has none, the logprobs are copied over;
      * the payload keys found in ``row.execution_metadata.extra`` are
        gathered into a dict tagged with ``assistant_turn_index``.

    The gathered dicts are stored under
    ``longest_row.execution_metadata.extra["assistant_turn_payloads"]``; other
    entries of that ``extra`` mapping are left as they are. Nothing is written
    when no row contributed a payload.

    Args:
        longest_row: Row that receives the merged data; mutated in place.
        rows: All trace rows to draw payloads from.
    """
    payload_keys = (
        "completion_logprobs",
        "completion_token_ids",
        "logprobs_metadata",
        "routing_matrices",
        "routing_metadata",
    )
    longest_assistants = longest_row.get_assistant_messages()
    collected = []

    for trace_row in sorted(rows, key=lambda candidate: len(candidate.messages)):
        final_assistant = trace_row.last_assistant_message()
        turn = len(trace_row.get_assistant_messages()) - 1
        # Guard clause: no assistant turn at all, or a turn the longest row lacks.
        if not 0 <= turn < len(longest_assistants):
            continue

        # Backfill only; logprobs already present on the target are kept.
        if final_assistant and final_assistant.logprobs:
            destination = longest_assistants[turn]
            if not destination.logprobs:
                destination.logprobs = final_assistant.logprobs

        row_extra = trace_row.execution_metadata.extra or {}
        payload = {}
        for name in payload_keys:
            if name in row_extra:
                payload[name] = row_extra[name]
        if not payload:
            continue

        payload["assistant_turn_index"] = turn
        collected.append(payload)

    if not collected:
        return

    if longest_row.execution_metadata.extra is None:
        longest_row.execution_metadata.extra = {}
    longest_row.execution_metadata.extra["assistant_turn_payloads"] = collected
28
80
 
29
81
 
30
82
  def build_fireworks_tracing_url(
@@ -99,7 +151,7 @@ def build_init_request(
99
151
  if not completion_params_dict.get("model"):
100
152
  raise ValueError("Model must be provided in completion_params")
101
153
 
102
- # Extract base_url from completion_params
154
+ # Extract base_url from completion_params for tracing-gateway URL encoding
103
155
  completion_params_base_url: Optional[str] = completion_params_dict.get("base_url")
104
156
 
105
157
  # Strip non-OpenAI fields from messages
@@ -129,7 +181,7 @@ def build_init_request(
129
181
 
130
182
  # Build final model base URL with tracing metadata
131
183
  final_model_base_url = model_base_url
132
- if model_base_url and ("tracing.fireworks.ai" in model_base_url or model_base_url.startswith("http://localhost")):
184
+ if model_base_url and ("tracing.fireworks.ai" in model_base_url or model_base_url.startswith("http://localhost") or "litellm-gateway" in model_base_url):
133
185
  final_model_base_url = build_fireworks_tracing_url(model_base_url, meta, completion_params_base_url)
134
186
 
135
187
  # Extract API key from environment or completion_params
@@ -148,13 +200,20 @@ def build_init_request(
148
200
 
149
201
 
150
202
  def update_row_with_remote_trace(
151
- row: EvaluationRow, output_data_loader: Callable[[DataLoaderConfig], DynamicDataLoader], model_base_url: str
203
+ row: EvaluationRow,
204
+ output_data_loader: Callable[[DataLoaderConfig], DynamicDataLoader],
205
+ model_base_url: str,
206
+ include_payloads: bool = False,
152
207
  ) -> None:
153
208
  """Update row with remote trace data using output_data_loader (shared logic)."""
154
209
  if not row.execution_metadata.rollout_id:
155
210
  return None
156
211
 
157
- loader_config = DataLoaderConfig(rollout_id=row.execution_metadata.rollout_id, model_base_url=model_base_url)
212
+ loader_config = DataLoaderConfig(
213
+ rollout_id=row.execution_metadata.rollout_id,
214
+ model_base_url=model_base_url,
215
+ include_payloads=include_payloads,
216
+ )
158
217
  data_loader = output_data_loader(loader_config)
159
218
  results = data_loader.load()
160
219
  output_rows: List[EvaluationRow] = [r for result in results for r in result.rows]