eval-protocol 0.3.28__tar.gz → 0.3.30__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (475)
  1. {eval_protocol-0.3.28/eval_protocol.egg-info → eval_protocol-0.3.30}/PKG-INFO +2 -1
  2. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/fireworks_tracing.py +47 -1
  4. eval_protocol-0.3.30/eval_protocol/adapters/lp_deserializer.py +109 -0
  5. eval_protocol-0.3.30/eval_protocol/adapters/r3_deserializer.py +187 -0
  6. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/remote_rollout_processor.py +29 -5
  7. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/tracing_utils.py +65 -6
  8. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/types/remote_rollout_processor.py +1 -0
  9. {eval_protocol-0.3.28 → eval_protocol-0.3.30/eval_protocol.egg-info}/PKG-INFO +2 -1
  10. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol.egg-info/SOURCES.txt +2 -0
  11. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol.egg-info/requires.txt +1 -0
  12. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/pyproject.toml +1 -0
  13. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/LICENSE +0 -0
  14. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/README.md +0 -0
  15. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/development/__init__.py +0 -0
  16. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/development/normalize_sandbox_fusion.py +0 -0
  17. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/development/utils/__init__.py +0 -0
  18. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/development/utils/generate_api_key.py +0 -0
  19. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/development/utils/subprocess_manager.py +0 -0
  20. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/__init__.py +0 -0
  21. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/__main__.py +0 -0
  22. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/__init__.py +0 -0
  23. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/base.py +0 -0
  24. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/bigquery.py +0 -0
  25. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/braintrust.py +0 -0
  26. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/dataframe.py +0 -0
  27. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/huggingface.py +0 -0
  28. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/langchain.py +0 -0
  29. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/langfuse.py +0 -0
  30. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/langsmith.py +0 -0
  31. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/openai_responses.py +0 -0
  32. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/trl.py +0 -0
  33. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/utils.py +0 -0
  34. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/adapters/weave.py +0 -0
  35. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/__init__.py +0 -0
  36. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/models.py +0 -0
  37. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/orchestrator.py +0 -0
  38. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resource_abc.py +0 -0
  39. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resource_pool.py +0 -0
  40. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/__init__.py +0 -0
  41. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  42. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  43. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  44. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  45. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  46. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/docker_resource.py +0 -0
  47. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  48. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  49. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/resources/sql_resource.py +0 -0
  50. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/task_manager.py +0 -0
  51. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/agent/tool_registry.py +0 -0
  52. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/auth.py +0 -0
  53. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/__init__.py +0 -0
  54. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  55. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  56. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_aime25.py +0 -0
  57. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  58. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  59. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  60. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  61. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  62. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  63. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli.py +0 -0
  64. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/__init__.py +0 -0
  65. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  66. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/common.py +0 -0
  67. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/create_rft.py +0 -0
  68. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/export_docs.py +0 -0
  69. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/local_test.py +0 -0
  70. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/logs.py +0 -0
  71. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  72. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/upload.py +0 -0
  73. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/cli_commands/utils.py +0 -0
  74. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/common_utils.py +0 -0
  75. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/config.py +0 -0
  76. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/data_loader/__init__.py +0 -0
  77. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  78. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  79. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  80. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  81. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/data_loader/models.py +0 -0
  82. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/__init__.py +0 -0
  83. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  84. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  85. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  86. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  87. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/datasets/__init__.py +0 -0
  88. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/datasets/loader.py +0 -0
  89. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/directory_utils.py +0 -0
  90. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/evaluation.py +0 -0
  91. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/event_bus/__init__.py +0 -0
  92. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/event_bus/event_bus.py +0 -0
  93. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/event_bus/logger.py +0 -0
  94. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  95. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  96. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/exceptions.py +0 -0
  97. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/execution/__init__.py +0 -0
  98. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/execution/pipeline.py +0 -0
  99. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/fireworks_rft.py +0 -0
  100. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/gcp_tools.py +0 -0
  101. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/generation/cache.py +0 -0
  102. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/generation/clients/base.py +0 -0
  103. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/generation/clients.py +0 -0
  104. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/generic_server.py +0 -0
  105. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/get_pep440_version.py +0 -0
  106. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/human_id/__init__.py +0 -0
  107. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/human_id/dictionary.py +0 -0
  108. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/__init__.py +0 -0
  109. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/deepeval.py +0 -0
  110. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/fireworks_v1_completions_client.py +0 -0
  111. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/openai_rft.py +0 -0
  112. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/openeval.py +0 -0
  113. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  114. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  115. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/integrations/trl.py +0 -0
  116. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/__init__.py +0 -0
  117. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  118. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  119. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  120. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  121. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/init.py +0 -0
  122. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/rollout_context.py +0 -0
  123. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  124. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/log_utils/util.py +0 -0
  125. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/logging_utils.py +0 -0
  126. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/__init__.py +0 -0
  127. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/adapter.py +0 -0
  128. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/client/__init__.py +0 -0
  129. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/client/connection.py +0 -0
  130. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/clients.py +0 -0
  131. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/__init__.py +0 -0
  132. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/base_policy.py +0 -0
  133. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/manager.py +0 -0
  134. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/policy.py +0 -0
  135. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  136. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/grid_renderer.py +0 -0
  137. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  138. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/mcpgym.py +0 -0
  139. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/process_manager.py +0 -0
  140. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/session/__init__.py +0 -0
  141. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/session/manager.py +0 -0
  142. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/simple_process_manager.py +0 -0
  143. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp/simulation_server.py +0 -0
  144. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/__init__.py +0 -0
  145. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/config.py +0 -0
  146. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/main.py +0 -0
  147. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  148. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  149. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  150. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  151. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_env.py +0 -0
  152. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/__init__.py +0 -0
  153. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  154. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  155. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  156. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  157. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  158. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  159. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  160. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  161. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  162. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  163. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  166. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  167. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/models.py +0 -0
  168. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/packaging.py +0 -0
  169. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/platform_api.py +0 -0
  170. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/playback_policy.py +0 -0
  171. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/__init__.py +0 -0
  172. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  173. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/app.py +0 -0
  174. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  175. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  176. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  177. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/main.py +0 -0
  178. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/models.py +0 -0
  179. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  180. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/__init__.py +0 -0
  181. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/buffer.py +0 -0
  182. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  183. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  184. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
  185. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  186. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  187. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  188. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  189. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  190. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  191. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  192. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test.py +0 -0
  193. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  194. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  195. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/exception_config.py +0 -0
  196. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/execution.py +0 -0
  197. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  198. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  199. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  200. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  201. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
  202. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/parameterize.py +0 -0
  203. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/plugin.py +0 -0
  204. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/priority_scheduler.py +0 -0
  205. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/rollout_processor.py +0 -0
  206. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  207. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/store_experiment_link.py +0 -0
  208. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/store_results_url.py +0 -0
  209. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/types.py +0 -0
  210. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/utils.py +0 -0
  211. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/pytest/validate_signature.py +0 -0
  212. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/__init__.py +0 -0
  213. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  214. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  215. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  216. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  217. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  218. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  219. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  220. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/llm_judge.py +0 -0
  221. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  222. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  223. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  224. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  225. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/quickstart/utils.py +0 -0
  226. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/resources.py +0 -0
  227. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/reward_function.py +0 -0
  228. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/__init__.py +0 -0
  229. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/accuracy.py +0 -0
  230. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/accuracy_length.py +0 -0
  231. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  232. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  233. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/apps_testing_util.py +0 -0
  234. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/bfcl_reward.py +0 -0
  235. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/code_execution.py +0 -0
  236. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/code_execution_utils.py +0 -0
  237. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/cpp_code.py +0 -0
  238. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  239. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/format.py +0 -0
  240. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/function_calling.py +0 -0
  241. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/json_schema.py +0 -0
  242. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/language_consistency.py +0 -0
  243. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/lean_prover.py +0 -0
  244. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/length.py +0 -0
  245. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  246. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/math.py +0 -0
  247. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  248. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/reasoning_steps.py +0 -0
  249. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/repetition.py +0 -0
  250. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rewards/tag_count.py +0 -0
  251. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/rl_processing.py +0 -0
  252. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/server.py +0 -0
  253. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/stats/__init__.py +0 -0
  254. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/stats/confidence_intervals.py +0 -0
  255. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/training/__init__.py +0 -0
  256. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/training/gepa_trainer.py +0 -0
  257. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/training/gepa_utils.py +0 -0
  258. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/training/trainer.py +0 -0
  259. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/training/utils.py +0 -0
  260. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/typed_interface.py +0 -0
  261. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/types/__init__.py +0 -0
  262. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/types/errors.py +0 -0
  263. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/types/types.py +0 -0
  264. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/__init__.py +0 -0
  265. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/batch_evaluation.py +0 -0
  266. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/batch_transformation.py +0 -0
  267. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/browser_utils.py +0 -0
  268. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/check_server_status.py +0 -0
  269. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/dataset_helpers.py +0 -0
  270. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  271. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/logs_models.py +0 -0
  272. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/logs_server.py +0 -0
  273. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/module_loader.py +0 -0
  274. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/packaging_utils.py +0 -0
  275. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/show_results_url.py +0 -0
  276. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/static_policy.py +0 -0
  277. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/subprocess_utils.py +0 -0
  278. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol/utils/vite_server.py +0 -0
  279. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol.egg-info/dependency_links.txt +0 -0
  280. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol.egg-info/entry_points.txt +0 -0
  281. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/eval_protocol.egg-info/top_level.txt +0 -0
  282. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/setup.cfg +0 -0
  283. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/setup.py +0 -0
  284. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_accuracy.py +0 -0
  285. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_accuracy_length.py +0 -0
  286. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_adapters_e2e.py +0 -0
  287. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_agent_orchestrator.py +0 -0
  288. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_agent_resources.py +0 -0
  289. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_auth.py +0 -0
  290. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_batch_evaluation.py +0 -0
  291. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_cli_agent.py +0 -0
  292. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_cli_args.py +0 -0
  293. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_cli_create_rft.py +0 -0
  294. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_cli_local_test.py +0 -0
  295. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_cli_startup_benchmark.py +0 -0
  296. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_code_execution.py +0 -0
  297. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_config.py +0 -0
  298. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_control_plane_separation.py +0 -0
  299. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_cpp_code.py +0 -0
  300. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_data_driven_task_manager.py +0 -0
  301. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_deepcoder_reward.py +0 -0
  302. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_deepeval_integration.py +0 -0
  303. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_directory_utils.py +0 -0
  304. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_e2b_integration.py +0 -0
  305. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_e2b_js_integration.py +0 -0
  306. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_edge_cases.py +0 -0
  307. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_ep_upload_e2e.py +0 -0
  308. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_eval_protocol_import.py +0 -0
  309. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_evaluation.py +0 -0
  310. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_evaluation_postprocess.py +0 -0
  311. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_event_bus.py +0 -0
  312. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_event_bus_helper.py +0 -0
  313. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_examples_end_to_end.py +0 -0
  314. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_exception_config.py +0 -0
  315. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_exceptions.py +0 -0
  316. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_fireworks_api.py +0 -0
  317. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_fireworks_v1_completions_client.py +0 -0
  318. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_format.py +0 -0
  319. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_fractional_code.py +0 -0
  320. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_function_calling.py +0 -0
  321. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_gcp_tools.py +0 -0
  322. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_generic_server.py +0 -0
  323. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_human_id.py +0 -0
  324. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_integration.py +0 -0
  325. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_json_schema.py +0 -0
  326. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_kwargs_validation.py +0 -0
  327. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_language_consistency.py +0 -0
  328. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_lean_prover.py +0 -0
  329. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_lean_prover_runner.py +0 -0
  330. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_length.py +0 -0
  331. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_list_comparison_math_reward.py +0 -0
  332. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_litellm_policy_provider_fields.py +0 -0
  333. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_logs_server.py +0 -0
  334. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_logs_server_simple.py +0 -0
  335. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_math.py +0 -0
  336. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_message_field_filtering.py +0 -0
  337. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_minimal.py +0 -0
  338. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_models.py +0 -0
  339. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_models_rl.py +0 -0
  340. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_multiple_choice_math_reward.py +0 -0
  341. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_n_variant_batch_integration.py +0 -0
  342. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_n_variant_integration.py +0 -0
  343. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_no_implicit_dotenv.py +0 -0
  344. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_openai_compatibility.py +0 -0
  345. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_openai_rft_integration.py +0 -0
  346. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_openeval_integration.py +0 -0
  347. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_packaging.py +0 -0
  348. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_parallel_rollouts.py +0 -0
  349. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_platform_api.py +0 -0
  350. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_priority_scheduler.py +0 -0
  351. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_quickstart_utils.py +0 -0
  352. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_readiness.py +0 -0
  353. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_reasoning_steps.py +0 -0
  354. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_repetition.py +0 -0
  355. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_repetition_debug.py +0 -0
  356. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_retry_mechanism.py +0 -0
  357. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_reward_function.py +0 -0
  358. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_reward_protocol_import.py +0 -0
  359. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_rl_processing.py +0 -0
  360. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_rollout_control_plane_integration.py +0 -0
  361. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_rollout_logprobs.py +0 -0
  362. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_server.py +0 -0
  363. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_show_results_url.py +0 -0
  364. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_sqlite_hardening.py +0 -0
  365. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_status_migration_changes.py +0 -0
  366. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_status_migration_integration.py +0 -0
  367. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_status_model.py +0 -0
  368. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_tag_count.py +0 -0
  369. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_tau_bench_airline_smoke.py +0 -0
  370. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_training_utils.py +0 -0
  371. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_typed_interface.py +0 -0
  372. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_typed_interface_rl.py +0 -0
  373. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_upload_entrypoint.py +0 -0
  374. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_url_handling.py +0 -0
  375. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/tests/test_vite_server.py +0 -0
  376. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/__init__.py +0 -0
  377. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/agent/__init__.py +0 -0
  378. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/agent/base.py +0 -0
  379. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/agent/llm_agent.py +0 -0
  380. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/api_service/__init__.py +0 -0
  381. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/api_service/api_config.py +0 -0
  382. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/api_service/data_model.py +0 -0
  383. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/api_service/simulation_service.py +0 -0
  384. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/cli.py +0 -0
  385. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/config.py +0 -0
  386. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/airline/policy.md +0 -0
  387. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/mock/policy.md +0 -0
  388. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  389. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/retail/policy.md +0 -0
  390. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  391. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  392. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  393. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  394. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  395. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  396. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  397. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data_model/__init__.py +0 -0
  398. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data_model/message.py +0 -0
  399. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data_model/simulation.py +0 -0
  400. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/data_model/tasks.py +0 -0
  401. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/__init__.py +0 -0
  402. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/__init__.py +0 -0
  403. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/data_model.py +0 -0
  404. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/environment.py +0 -0
  405. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/tools.py +0 -0
  406. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/airline/utils.py +0 -0
  407. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/__init__.py +0 -0
  408. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/data_model.py +0 -0
  409. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/environment.py +0 -0
  410. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/tools.py +0 -0
  411. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/mock/utils.py +0 -0
  412. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/__init__.py +0 -0
  413. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/data_model.py +0 -0
  414. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/environment.py +0 -0
  415. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/tools.py +0 -0
  416. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/retail/utils.py +0 -0
  417. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/__init__.py +0 -0
  418. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/data_model.py +0 -0
  419. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/environment.py +0 -0
  420. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  421. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  422. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  423. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  424. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  425. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  426. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  427. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  428. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/tools.py +0 -0
  429. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  430. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  431. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/domains/telecom/utils.py +0 -0
  432. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/environment/__init__.py +0 -0
  433. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/environment/db.py +0 -0
  434. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/environment/environment.py +0 -0
  435. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/environment/server.py +0 -0
  436. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/environment/tool.py +0 -0
  437. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/environment/toolkit.py +0 -0
  438. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  439. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/evaluator/__init__.py +0 -0
  440. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator.py +0 -0
  441. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  442. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  443. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  444. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  445. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  446. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/metrics/__init__.py +0 -0
  447. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/metrics/agent_metrics.py +0 -0
  448. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  449. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/__init__.py +0 -0
  450. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  451. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  452. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/orchestrator/utils.py +0 -0
  453. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/registry.py +0 -0
  454. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/run.py +0 -0
  455. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/scripts/__init__.py +0 -0
  456. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/scripts/check_data.py +0 -0
  457. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  458. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/scripts/start_servers.py +0 -0
  459. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/scripts/view_simulations.py +0 -0
  460. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/user/__init__.py +0 -0
  461. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/user/base.py +0 -0
  462. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/user/user_simulator.py +0 -0
  463. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/utils/__init__.py +0 -0
  464. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/utils/display.py +0 -0
  465. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/utils/io_utils.py +0 -0
  466. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/utils/llm_utils.py +0 -0
  467. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/utils/pydantic_utils.py +0 -0
  468. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vendor/tau2/utils/utils.py +0 -0
  469. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/versioneer.py +0 -0
  470. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  471. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DFeF7AG_.js +0 -0
  472. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DFeF7AG_.js.map +0 -0
  473. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vite-app/dist/assets/index-DvKW7FQL.css +0 -0
  474. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  475. {eval_protocol-0.3.28 → eval_protocol-0.3.30}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.28
3
+ Version: 0.3.30
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ Requires-Dist: addict>=2.4.0
40
40
  Requires-Dist: deepdiff>=6.0.0
41
41
  Requires-Dist: websockets>=15.0.1
42
42
  Requires-Dist: fastapi>=0.116.1
43
+ Requires-Dist: zstandard>=0.19.0
43
44
  Provides-Extra: dev
44
45
  Requires-Dist: build; extra == "dev"
45
46
  Requires-Dist: twine; extra == "dev"
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-04-28T20:08:36-0700",
11
+ "date": "2026-05-29T16:09:24-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "fd4358c26f9cbe57bb9c0091f28ae6642194941b",
15
- "version": "0.3.28"
14
+ "full-revisionid": "1bd5447a3afbca3b71e0f0d205ed7cff6c3afe5d",
15
+ "version": "0.3.30"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -16,6 +16,8 @@ import os
16
16
 
17
17
  from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
18
18
  from .base import BaseAdapter
19
+ from .lp_deserializer import decompress_and_parse_lp
20
+ from .r3_deserializer import decompress_and_parse_r3
19
21
  from .utils import extract_messages_from_data
20
22
  from ..common_utils import get_user_agent
21
23
 
@@ -100,13 +102,53 @@ def convert_trace_dict_to_evaluation_row(
100
102
  ):
101
103
  break # Break early if we've found all the metadata we need
102
104
 
105
+ # Extract router replay payloads when present
106
+ payloads = trace.get("payloads")
107
+ if isinstance(payloads, dict):
108
+ router_replay = payloads.get("router_replay")
109
+ if isinstance(router_replay, dict) and router_replay.get("data"):
110
+ try:
111
+ matrices, r3_meta = decompress_and_parse_r3(router_replay["data"])
112
+ if execution_metadata.extra is None:
113
+ execution_metadata.extra = {}
114
+ execution_metadata.extra["routing_matrices"] = matrices
115
+ execution_metadata.extra["routing_metadata"] = r3_meta
116
+ except Exception as e:
117
+ logger.warning("Failed to decompress R3 payload for trace %s: %s", trace.get("id"), e)
118
+
119
+ logprobs_payload = payloads.get("logprobs")
120
+ if isinstance(logprobs_payload, dict) and logprobs_payload.get("data"):
121
+ try:
122
+ logprobs, token_ids, lp_meta = decompress_and_parse_lp(logprobs_payload["data"])
123
+ if execution_metadata.extra is None:
124
+ execution_metadata.extra = {}
125
+ execution_metadata.extra["completion_logprobs"] = logprobs
126
+ if token_ids is not None:
127
+ execution_metadata.extra["completion_token_ids"] = token_ids
128
+ execution_metadata.extra["logprobs_metadata"] = lp_meta
129
+
130
+ for i in range(len(messages) - 1, -1, -1):
131
+ if messages[i].role == "assistant":
132
+ content_entries = [{"logprob": lp} for lp in logprobs]
133
+ if token_ids is not None:
134
+ for entry, tid in zip(content_entries, token_ids):
135
+ entry["token_id"] = tid
136
+ messages[i].logprobs = {"content": content_entries}
137
+ break
138
+ except Exception as e:
139
+ logger.warning(
140
+ "Failed to decompress logprobs payload for trace %s: %s",
141
+ trace.get("id"),
142
+ e,
143
+ )
144
+
103
145
  return EvaluationRow(
104
146
  messages=messages,
105
147
  tools=tools,
106
148
  input_metadata=InputMetadata(
107
149
  row_id=row_id,
108
150
  session_data={
109
- "langfuse_trace_id": trace.get("id"), # Store the trace ID here
151
+ "langfuse_trace_id": trace.get("id"),
110
152
  },
111
153
  ),
112
154
  execution_metadata=execution_metadata,
@@ -426,6 +468,7 @@ class FireworksTracingAdapter(BaseAdapter):
426
468
  max_retries: int = 3,
427
469
  span_name: Optional[str] = None,
428
470
  converter: Optional[TraceDictConverter] = None,
471
+ include_payloads: bool = False,
429
472
  ) -> List[EvaluationRow]:
430
473
  """Pull traces from Langfuse via proxy and convert to EvaluationRow format.
431
474
 
@@ -449,6 +492,8 @@ class FireworksTracingAdapter(BaseAdapter):
449
492
  max_retries: Max retry attempts used by proxy (default: 3)
450
493
  converter: Optional custom converter implementing TraceDictConverter protocol.
451
494
  If provided, this will be used instead of the default conversion logic.
495
+ include_payloads: If True, request payload data (e.g., router replay)
496
+ from the gateway and decompress it into the returned EvaluationRows.
452
497
 
453
498
  Returns:
454
499
  List[EvaluationRow]: Converted evaluation rows
@@ -479,6 +524,7 @@ class FireworksTracingAdapter(BaseAdapter):
479
524
  "to_timestamp": to_timestamp.isoformat() if to_timestamp else None,
480
525
  "sleep_between_gets": sleep_between_gets,
481
526
  "max_retries": max_retries,
527
+ "include_payloads": include_payloads if include_payloads else None,
482
528
  }
483
529
 
484
530
  # Remove None values
@@ -0,0 +1,109 @@
1
+ """LP/v1 binary deserializer for per-token logprobs payloads.
2
+
3
+ Implements the inverse of the tracing gateway's ``logprobs_serializer.serialize_logprobs``.
4
+ See that module for the full header specification.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import struct
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ import zstandard as zstd
14
+
15
+ MAGIC = b"LP01"
16
+ HEADER_VERSION = 1
17
+ MISSING_TOKEN_ID = -1
18
+ ENTRY_FORMAT = "<if"
19
+ ENTRY_SIZE = struct.calcsize(ENTRY_FORMAT) # 8 bytes
20
+ HEADER_FORMAT = "<4sBBHIIQ"
21
+ HEADER_SIZE = struct.calcsize(HEADER_FORMAT) # 24 bytes
22
+
23
+
24
+ def _parse_header(raw: bytes) -> Dict[str, Any]:
25
+ if len(raw) < HEADER_SIZE:
26
+ raise ValueError(f"Payload too short for lp/v1 header: {len(raw)} < {HEADER_SIZE}")
27
+
28
+ (
29
+ magic,
30
+ version,
31
+ flags,
32
+ reserved_u16,
33
+ token_count,
34
+ body_byte_length,
35
+ reserved_u64,
36
+ ) = struct.unpack(HEADER_FORMAT, raw[:HEADER_SIZE])
37
+
38
+ if magic != MAGIC:
39
+ raise ValueError(f"Bad LP/v1 magic: {magic!r}")
40
+ if version != HEADER_VERSION:
41
+ raise ValueError(f"Unsupported lp/v1 header version: {version}")
42
+
43
+ return {
44
+ "flags": flags,
45
+ "reserved_u16": reserved_u16,
46
+ "token_count": token_count,
47
+ "body_byte_length": body_byte_length,
48
+ "reserved_u64": reserved_u64,
49
+ }
50
+
51
+
52
+ def parse_logprobs(raw: bytes) -> Tuple[List[float], Optional[List[int]], Dict[str, Any]]:
53
+ """Parse uncompressed LP/v1 bytes into logprobs, optional token ids, and metadata."""
54
+ header = _parse_header(raw)
55
+ token_count = header["token_count"]
56
+ body_byte_length = header["body_byte_length"]
57
+
58
+ if token_count == 0:
59
+ raise ValueError("LP/v1 token_count must be > 0")
60
+ if body_byte_length != token_count * ENTRY_SIZE:
61
+ raise ValueError(
62
+ f"body_byte_length ({body_byte_length}) != token_count * {ENTRY_SIZE} "
63
+ f"({token_count * ENTRY_SIZE})"
64
+ )
65
+
66
+ expected_len = HEADER_SIZE + body_byte_length
67
+ if len(raw) != expected_len:
68
+ raise ValueError(f"LP/v1 payload length mismatch: {len(raw)} != {expected_len}")
69
+
70
+ logprobs: List[float] = []
71
+ token_ids: List[int] = []
72
+ all_token_ids_valid = True
73
+ offset = HEADER_SIZE
74
+ for _ in range(token_count):
75
+ wire_id, logprob = struct.unpack(ENTRY_FORMAT, raw[offset : offset + ENTRY_SIZE])
76
+ offset += ENTRY_SIZE
77
+ logprobs.append(logprob)
78
+ if wire_id == MISSING_TOKEN_ID:
79
+ all_token_ids_valid = False
80
+ token_ids.append(wire_id)
81
+ else:
82
+ token_ids.append(wire_id)
83
+
84
+ metadata: Dict[str, Any] = {
85
+ "scope": "completion_only",
86
+ "completion_token_count": token_count,
87
+ "all_token_ids_valid": all_token_ids_valid,
88
+ }
89
+ header.update(metadata)
90
+ ids_out: Optional[List[int]] = token_ids if all_token_ids_valid else None
91
+ return logprobs, ids_out, header
92
+
93
+
94
+ def decompress_and_parse_lp(data_b64: str) -> Tuple[List[float], Optional[List[int]], Dict[str, Any]]:
95
+ """Decompress and unpack an LP/v1 payload into completion logprobs and token ids.
96
+
97
+ Args:
98
+ data_b64: Base64-encoded zstd-compressed LP binary blob from
99
+ ``payloads.logprobs.data``.
100
+
101
+ Returns:
102
+ ``(logprobs, token_ids, metadata)`` where ``logprobs`` is per-completion-token
103
+ scalars, ``token_ids`` is ``None`` if any wire id was ``MISSING_TOKEN_ID``,
104
+ and ``metadata`` includes ``all_token_ids_valid`` and ``completion_token_count``.
105
+ """
106
+ compressed = base64.b64decode(data_b64)
107
+ decompressor = zstd.ZstdDecompressor()
108
+ raw = decompressor.decompress(compressed)
109
+ return parse_logprobs(raw)
@@ -0,0 +1,187 @@
1
+ """R3/v1 binary deserializer for router-replay payloads.
2
+
3
+ Implements the inverse of the packed binary format produced by the tracing
4
+ gateway's ``r3_serializer.serialize_r3``. See that module for the full
5
+ header specification.
6
+
7
+ The main entry point is :func:`decompress_and_parse_r3`, which accepts the
8
+ base64-encoded compressed blob returned by the gateway's
9
+ ``/v1/traces/pointwise?include_payloads=true`` endpoint and produces
10
+ per-token routing matrices in the same ``List[Optional[str]]`` format used
11
+ by the direct inference path (``DeploymentSampler.sample_with_tokens()``).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import base64
17
+ import struct
18
+ from enum import IntEnum
19
+ from typing import Any, Dict, List, Optional, Tuple
20
+
21
+ import zstandard as zstd
22
+
23
+ MAGIC = b"R3V1"
24
+ HEADER_FORMAT = "<4sBBBBIIIIQ"
25
+ HEADER_SIZE = struct.calcsize(HEADER_FORMAT) # 32 bytes
26
+ BITS_PER_BYTE = 8
27
+
28
+
29
+ class _SelectorMode(IntEnum):
30
+ ALL = 0
31
+ SUFFIX = 1
32
+ BITMAP = 2
33
+
34
+
35
+ class _RoutingDtype(IntEnum):
36
+ UINT8 = 1
37
+ UINT16 = 2
38
+
39
+
40
+ _SELECTOR_MODE_NAMES = {v: v.name.lower() for v in _SelectorMode}
41
+ _ROUTING_DTYPE_NAMES = {v: v.name.lower() for v in _RoutingDtype}
42
+
43
+
44
+ def _parse_header(raw: bytes) -> Dict[str, Any]:
45
+ if len(raw) < HEADER_SIZE:
46
+ raise ValueError(
47
+ f"Payload too short for r3/v1 header: {len(raw)} < {HEADER_SIZE}"
48
+ )
49
+
50
+ (
51
+ magic,
52
+ version,
53
+ selector_mode,
54
+ routing_dtype,
55
+ flags,
56
+ total_token_count,
57
+ replayed_token_count,
58
+ replay_start_token,
59
+ selector_byte_length,
60
+ matrix_byte_length,
61
+ ) = struct.unpack(HEADER_FORMAT, raw[:HEADER_SIZE])
62
+
63
+ if magic != MAGIC:
64
+ raise ValueError(f"Bad R3 magic: {magic!r}")
65
+ if version != 1:
66
+ raise ValueError(f"Unsupported R3 header version: {version}")
67
+
68
+ return {
69
+ "selector_mode": selector_mode,
70
+ "routing_dtype": routing_dtype,
71
+ "flags": flags,
72
+ "total_token_count": total_token_count,
73
+ "replayed_token_count": replayed_token_count,
74
+ "replay_start_token": replay_start_token,
75
+ "selector_byte_length": selector_byte_length,
76
+ "matrix_byte_length": matrix_byte_length,
77
+ }
78
+
79
+
80
+ def _read_bitmap_positions(
81
+ selector_bytes: bytes, total_token_count: int
82
+ ) -> List[int]:
83
+ """Return sorted token indices where the bitmap bit is set."""
84
+ positions: List[int] = []
85
+ for i in range(total_token_count):
86
+ byte_idx = i // BITS_PER_BYTE
87
+ bit_idx = i % BITS_PER_BYTE
88
+ if byte_idx < len(selector_bytes) and (selector_bytes[byte_idx] >> bit_idx) & 1:
89
+ positions.append(i)
90
+ return positions
91
+
92
+
93
+ def decompress_and_parse_r3(
94
+ data_b64: str,
95
+ ) -> Tuple[List[Optional[str]], Dict[str, Any]]:
96
+ """Decompress and unpack an R3/v1 payload into per-token routing matrices.
97
+
98
+ Args:
99
+ data_b64: Base64-encoded zstd-compressed R3 binary blob, as returned
100
+ by the tracing gateway in ``payloads.router_replay.data``.
101
+
102
+ Returns:
103
+ A tuple of ``(routing_matrices, metadata)`` where:
104
+
105
+ - ``routing_matrices`` is a ``List[Optional[str]]`` of length
106
+ ``total_token_count``. Each present position contains a
107
+ base64-encoded routing matrix (matching the format returned by
108
+ the direct inference path); absent positions are ``None``.
109
+ - ``metadata`` is a dict with keys ``routing_dtype``,
110
+ ``selector_mode``, ``total_token_count``, ``replayed_token_count``,
111
+ ``replay_start_token``.
112
+ """
113
+ compressed = base64.b64decode(data_b64)
114
+
115
+ # ZstdCompressor.compress() embeds the uncompressed size in the frame
116
+ # header by default, so the library can auto-allocate the output buffer.
117
+ decompressor = zstd.ZstdDecompressor()
118
+ raw = decompressor.decompress(compressed)
119
+
120
+ header = _parse_header(raw)
121
+
122
+ selector_mode = header["selector_mode"]
123
+ routing_dtype = header["routing_dtype"]
124
+ total_token_count = header["total_token_count"]
125
+ replayed_token_count = header["replayed_token_count"]
126
+ replay_start_token = header["replay_start_token"]
127
+ selector_byte_length = header["selector_byte_length"]
128
+ matrix_byte_length = header["matrix_byte_length"]
129
+
130
+ metadata: Dict[str, Any] = {
131
+ "routing_dtype": _ROUTING_DTYPE_NAMES.get(routing_dtype, str(routing_dtype)),
132
+ "selector_mode": _SELECTOR_MODE_NAMES.get(selector_mode, str(selector_mode)),
133
+ "total_token_count": total_token_count,
134
+ "replayed_token_count": replayed_token_count,
135
+ "replay_start_token": replay_start_token,
136
+ }
137
+
138
+ if replayed_token_count == 0:
139
+ return [None] * total_token_count, metadata
140
+
141
+ # Per-token matrix byte size is implicit in the payload: all replayed
142
+ # tokens share the same matrix length, so we can recover it from the
143
+ # matrix section total length divided by the replayed-token count.
144
+ if matrix_byte_length % replayed_token_count != 0:
145
+ raise ValueError(
146
+ f"matrix_byte_length ({matrix_byte_length}) is not a multiple of "
147
+ f"replayed_token_count ({replayed_token_count}); cannot split "
148
+ "into per-token matrices"
149
+ )
150
+ matrix_elem_size = matrix_byte_length // replayed_token_count
151
+
152
+ body = raw[HEADER_SIZE:]
153
+ expected_body_length = selector_byte_length + matrix_byte_length
154
+ if len(body) < expected_body_length:
155
+ raise ValueError(
156
+ f"Payload body too short for selector and matrix sections: "
157
+ f"{len(body)} < {expected_body_length}"
158
+ )
159
+
160
+ selector_bytes = body[:selector_byte_length]
161
+ matrix_bytes = body[selector_byte_length : selector_byte_length + matrix_byte_length]
162
+
163
+ if selector_mode == _SelectorMode.ALL:
164
+ replayed_positions = list(range(total_token_count))
165
+ elif selector_mode == _SelectorMode.SUFFIX:
166
+ replayed_positions = list(
167
+ range(replay_start_token, replay_start_token + replayed_token_count)
168
+ )
169
+ elif selector_mode == _SelectorMode.BITMAP:
170
+ replayed_positions = _read_bitmap_positions(selector_bytes, total_token_count)
171
+ else:
172
+ raise ValueError(f"Unknown selector_mode: {selector_mode}")
173
+
174
+ if len(replayed_positions) != replayed_token_count:
175
+ raise ValueError(
176
+ f"Selector produced {len(replayed_positions)} replayed positions, "
177
+ f"but header replayed_token_count is {replayed_token_count}"
178
+ )
179
+
180
+ # Split matrix bytes into per-token chunks and base64-encode each one
181
+ matrices: List[Optional[str]] = [None] * total_token_count
182
+ for idx, pos in enumerate(replayed_positions):
183
+ start = idx * matrix_elem_size
184
+ end = start + matrix_elem_size
185
+ matrices[pos] = base64.b64encode(matrix_bytes[start:end]).decode("ascii")
186
+
187
+ return matrices, metadata
@@ -35,11 +35,13 @@ class RemoteRolloutProcessor(RolloutProcessor):
35
35
  model_base_url: str = "https://tracing.fireworks.ai",
36
36
  poll_interval: float = 1.0,
37
37
  timeout_seconds: float = 120.0,
38
+ include_payloads: bool = False,
38
39
  ):
39
40
  # Prefer constructor-provided configuration. These can be overridden via
40
41
  # config.kwargs at call time for backward compatibility.
41
42
  self._remote_base_url = remote_base_url
42
43
  self._model_base_url = model_base_url
44
+ self._include_payloads = include_payloads
43
45
  if os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL"):
44
46
  self._remote_base_url = os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL")
45
47
  _ep_model_base_url = os.getenv("EP_MODEL_BASE_URL")
@@ -139,8 +141,28 @@ class RemoteRolloutProcessor(RolloutProcessor):
139
141
  status_code,
140
142
  )
141
143
 
142
- status_message = status.get("message", "") or ""
143
- status_details = status.get("details", []) or []
144
+ # /status only returns the code; backfill message/details/extras from Logs once.
145
+ status_message: str = ""
146
+ status_details: list = []
147
+ status_extras: dict = {}
148
+ completed_logs = await self._tracing_adapter.async_search_logs(
149
+ session, tags=[f"rollout_id:{row.execution_metadata.rollout_id}"]
150
+ )
151
+ # Pick the log row whose status code matches the terminal
152
+ # code from /status, so intermediate RUNNING checkpoints
153
+ # don't poison the backfill.
154
+ for log in completed_logs:
155
+ sd = log.get("status")
156
+ if isinstance(sd, dict) and sd.get("code") == status_code:
157
+ status_message = sd.get("message", "") or ""
158
+ status_details = sd.get("details", []) or []
159
+ raw_extras = log.get("extras") or {}
160
+ status_extras = {
161
+ k: v
162
+ for k, v in raw_extras.items()
163
+ if k not in ("logger_name", "level", "timestamp")
164
+ }
165
+ break
144
166
 
145
167
  exception = exception_for_status_code(status_code, status_message)
146
168
  if exception is not None:
@@ -152,8 +174,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
152
174
  details=status_details,
153
175
  )
154
176
 
155
- status_extras = (status_result or {}).get("extras")
156
- if isinstance(status_extras, dict):
177
+ if status_extras:
157
178
  if row.execution_metadata.extra:
158
179
  row.execution_metadata.extra.update(status_extras)
159
180
  else:
@@ -175,7 +196,10 @@ class RemoteRolloutProcessor(RolloutProcessor):
175
196
  row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
176
197
 
177
198
  def _update_with_trace() -> None:
178
- return update_row_with_remote_trace(row, default_fireworks_output_data_loader, model_base_url)
199
+ return update_row_with_remote_trace(
200
+ row, default_fireworks_output_data_loader, model_base_url,
201
+ include_payloads=self._include_payloads,
202
+ )
179
203
 
180
204
  await asyncio.to_thread(_update_with_trace) # Update row with remote trace in-place
181
205
  return row
@@ -22,9 +22,61 @@ def default_fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDat
22
22
  # Use EP_REMOTE_API_KEY for fetching remote traces, falling back to FIREWORKS_API_KEY
23
23
  api_key = os.environ.get("EP_REMOTE_API_KEY") or os.environ.get("FIREWORKS_API_KEY")
24
24
  adapter = FireworksTracingAdapter(base_url=base_url, api_key=api_key)
25
- return adapter.get_evaluation_rows(tags=[f"rollout_id:{config.rollout_id}"], max_retries=5)
25
+ return adapter.get_evaluation_rows(
26
+ tags=[f"rollout_id:{config.rollout_id}"],
27
+ max_retries=5,
28
+ include_payloads=config.include_payloads,
29
+ )
26
30
 
27
- return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=filter_longest_conversation)
31
+ def preprocess_traces(rows: List[EvaluationRow]) -> List[EvaluationRow]:
32
+ filtered_rows = filter_longest_conversation(rows)
33
+ if config.include_payloads and filtered_rows:
34
+ _merge_payloads_into_longest_row(filtered_rows[0], rows)
35
+ return filtered_rows
36
+
37
+ return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=preprocess_traces)
38
+
39
+
40
+ def _merge_payloads_into_longest_row(longest_row: EvaluationRow, rows: List[EvaluationRow]) -> None:
41
+ """
42
+ Preserve per-turn payload-derived metadata after selecting the longest trace row.
43
+
44
+ Each trace row carries payloads for its final assistant turn. The longest row
45
+ keeps the full conversation, while its top-level execution metadata remains
46
+ the payload metadata for the final completion for backward compatibility.
47
+ """
48
+ target_assistants = longest_row.get_assistant_messages()
49
+ assistant_turn_payloads = []
50
+
51
+ for row in sorted(rows, key=lambda item: len(item.messages)):
52
+ source = row.last_assistant_message()
53
+ source_turn_index = len(row.get_assistant_messages()) - 1
54
+ if source_turn_index < 0 or source_turn_index >= len(target_assistants):
55
+ continue
56
+
57
+ if source and source.logprobs and not target_assistants[source_turn_index].logprobs:
58
+ target_assistants[source_turn_index].logprobs = source.logprobs
59
+
60
+ extra = row.execution_metadata.extra or {}
61
+ turn_payload = {
62
+ key: extra[key]
63
+ for key in (
64
+ "completion_logprobs",
65
+ "completion_token_ids",
66
+ "logprobs_metadata",
67
+ "routing_matrices",
68
+ "routing_metadata",
69
+ )
70
+ if key in extra
71
+ }
72
+ if turn_payload:
73
+ turn_payload["assistant_turn_index"] = source_turn_index
74
+ assistant_turn_payloads.append(turn_payload)
75
+
76
+ if assistant_turn_payloads:
77
+ if longest_row.execution_metadata.extra is None:
78
+ longest_row.execution_metadata.extra = {}
79
+ longest_row.execution_metadata.extra["assistant_turn_payloads"] = assistant_turn_payloads
28
80
 
29
81
 
30
82
  def build_fireworks_tracing_url(
@@ -99,7 +151,7 @@ def build_init_request(
99
151
  if not completion_params_dict.get("model"):
100
152
  raise ValueError("Model must be provided in completion_params")
101
153
 
102
- # Extract base_url from completion_params
154
+ # Extract base_url from completion_params for tracing-gateway URL encoding
103
155
  completion_params_base_url: Optional[str] = completion_params_dict.get("base_url")
104
156
 
105
157
  # Strip non-OpenAI fields from messages
@@ -129,7 +181,7 @@ def build_init_request(
129
181
 
130
182
  # Build final model base URL with tracing metadata
131
183
  final_model_base_url = model_base_url
132
- if model_base_url and ("tracing.fireworks.ai" in model_base_url or model_base_url.startswith("http://localhost")):
184
+ if model_base_url and ("tracing.fireworks.ai" in model_base_url or model_base_url.startswith("http://localhost") or "litellm-gateway" in model_base_url):
133
185
  final_model_base_url = build_fireworks_tracing_url(model_base_url, meta, completion_params_base_url)
134
186
 
135
187
  # Extract API key from environment or completion_params
@@ -148,13 +200,20 @@ def build_init_request(
148
200
 
149
201
 
150
202
  def update_row_with_remote_trace(
151
- row: EvaluationRow, output_data_loader: Callable[[DataLoaderConfig], DynamicDataLoader], model_base_url: str
203
+ row: EvaluationRow,
204
+ output_data_loader: Callable[[DataLoaderConfig], DynamicDataLoader],
205
+ model_base_url: str,
206
+ include_payloads: bool = False,
152
207
  ) -> None:
153
208
  """Update row with remote trace data using output_data_loader (shared logic)."""
154
209
  if not row.execution_metadata.rollout_id:
155
210
  return None
156
211
 
157
- loader_config = DataLoaderConfig(rollout_id=row.execution_metadata.rollout_id, model_base_url=model_base_url)
212
+ loader_config = DataLoaderConfig(
213
+ rollout_id=row.execution_metadata.rollout_id,
214
+ model_base_url=model_base_url,
215
+ include_payloads=include_payloads,
216
+ )
158
217
  data_loader = output_data_loader(loader_config)
159
218
  results = data_loader.load()
160
219
  output_rows: List[EvaluationRow] = [r for result in results for r in result.rows]
@@ -39,6 +39,7 @@ class DataLoaderConfig(BaseModel):
39
39
 
40
40
  rollout_id: str
41
41
  model_base_url: Optional[str] = None
42
+ include_payloads: bool = False
42
43
 
43
44
 
44
45
  class InitRequest(BaseModel):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.28
3
+ Version: 0.3.30
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ Requires-Dist: addict>=2.4.0
40
40
  Requires-Dist: deepdiff>=6.0.0
41
41
  Requires-Dist: websockets>=15.0.1
42
42
  Requires-Dist: fastapi>=0.116.1
43
+ Requires-Dist: zstandard>=0.19.0
43
44
  Provides-Extra: dev
44
45
  Requires-Dist: build; extra == "dev"
45
46
  Requires-Dist: twine; extra == "dev"
@@ -56,7 +56,9 @@ eval_protocol/adapters/huggingface.py
56
56
  eval_protocol/adapters/langchain.py
57
57
  eval_protocol/adapters/langfuse.py
58
58
  eval_protocol/adapters/langsmith.py
59
+ eval_protocol/adapters/lp_deserializer.py
59
60
  eval_protocol/adapters/openai_responses.py
61
+ eval_protocol/adapters/r3_deserializer.py
60
62
  eval_protocol/adapters/trl.py
61
63
  eval_protocol/adapters/utils.py
62
64
  eval_protocol/adapters/weave.py
@@ -28,6 +28,7 @@ addict>=2.4.0
28
28
  deepdiff>=6.0.0
29
29
  websockets>=15.0.1
30
30
  fastapi>=0.116.1
31
+ zstandard>=0.19.0
31
32
 
32
33
  [bigquery]
33
34
  google-cloud-bigquery>=3.0.0
@@ -48,6 +48,7 @@ dependencies = [
48
48
  "deepdiff>=6.0.0",
49
49
  "websockets>=15.0.1",
50
50
  "fastapi>=0.116.1",
51
+ "zstandard>=0.19.0",
51
52
  ]
52
53
 
53
54
  [project.urls]
File without changes
File without changes