eval-protocol 0.2.99__tar.gz → 0.2.99.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (472)
  1. {eval_protocol-0.2.99/eval_protocol.egg-info → eval_protocol-0.2.99.dev2}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/create_rft.py +2 -2
  4. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/tinker_rollout_processor.py +1 -1
  5. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/manager.py +1 -1
  6. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/models.py +13 -1
  7. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +1 -1
  8. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +1 -1
  9. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +1 -1
  10. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test_utils.py +9 -2
  11. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/github_action_rollout_processor.py +4 -4
  12. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/openenv_rollout_processor.py +3 -3
  13. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/priority_scheduler.py +178 -60
  14. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/remote_rollout_processor.py +1 -1
  15. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
  16. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_create_rft.py +4 -4
  17. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_priority_scheduler.py +4 -4
  18. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/index-CuQbfdPD.js +1 -1
  19. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/index-CuQbfdPD.js.map +1 -1
  20. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/LICENSE +0 -0
  21. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/README.md +0 -0
  22. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/__init__.py +0 -0
  23. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/normalize_sandbox_fusion.py +0 -0
  24. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/utils/__init__.py +0 -0
  25. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/utils/generate_api_key.py +0 -0
  26. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/utils/subprocess_manager.py +0 -0
  27. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/__init__.py +0 -0
  28. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/__main__.py +0 -0
  29. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/__init__.py +0 -0
  30. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/base.py +0 -0
  31. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/bigquery.py +0 -0
  32. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/braintrust.py +0 -0
  33. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  34. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/huggingface.py +0 -0
  35. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/langchain.py +0 -0
  36. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/langfuse.py +0 -0
  37. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/langsmith.py +0 -0
  38. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/openai_responses.py +0 -0
  39. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/trl.py +0 -0
  40. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/utils.py +0 -0
  41. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/weave.py +0 -0
  42. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/__init__.py +0 -0
  43. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/models.py +0 -0
  44. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/orchestrator.py +0 -0
  45. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resource_abc.py +0 -0
  46. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resource_pool.py +0 -0
  47. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
  48. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  49. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  50. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  51. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  52. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  53. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
  54. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  55. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  56. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
  57. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/task_manager.py +0 -0
  58. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/tool_registry.py +0 -0
  59. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/auth.py +0 -0
  60. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
  61. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  62. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  63. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
  64. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  65. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  66. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  67. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  68. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  69. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  70. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli.py +0 -0
  71. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
  72. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  73. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/common.py +0 -0
  74. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
  75. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  76. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/export_docs.py +0 -0
  77. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/local_test.py +0 -0
  78. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/logs.py +0 -0
  79. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/preview.py +0 -0
  80. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  81. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/upload.py +0 -0
  82. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/utils.py +0 -0
  83. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/common_utils.py +0 -0
  84. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/config.py +0 -0
  85. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/__init__.py +0 -0
  86. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  87. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  88. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  89. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  90. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/models.py +0 -0
  91. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
  92. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  93. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  94. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  95. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  96. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/datasets/__init__.py +0 -0
  97. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/datasets/loader.py +0 -0
  98. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/directory_utils.py +0 -0
  99. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/evaluation.py +0 -0
  100. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/__init__.py +0 -0
  101. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
  102. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/logger.py +0 -0
  103. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  104. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  105. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/exceptions.py +0 -0
  106. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/execution/__init__.py +0 -0
  107. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/execution/pipeline.py +0 -0
  108. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/fireworks_rft.py +0 -0
  109. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/gcp_tools.py +0 -0
  110. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generation/cache.py +0 -0
  111. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generation/clients/base.py +0 -0
  112. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generation/clients.py +0 -0
  113. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generic_server.py +0 -0
  114. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/get_pep440_version.py +0 -0
  115. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/human_id/__init__.py +0 -0
  116. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/human_id/dictionary.py +0 -0
  117. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/__init__.py +0 -0
  118. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/deepeval.py +0 -0
  119. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/openai_rft.py +0 -0
  120. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/openeval.py +0 -0
  121. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  122. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/trl.py +0 -0
  123. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/__init__.py +0 -0
  124. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  125. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  126. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  127. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  128. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/init.py +0 -0
  129. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/rollout_context.py +0 -0
  130. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  131. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/util.py +0 -0
  132. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/logging_utils.py +0 -0
  133. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/__init__.py +0 -0
  134. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/adapter.py +0 -0
  135. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
  136. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/client/connection.py +0 -0
  137. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/clients.py +0 -0
  138. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
  139. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
  140. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
  141. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  142. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
  143. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  144. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
  145. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/process_manager.py +0 -0
  146. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
  147. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/session/manager.py +0 -0
  148. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
  149. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
  150. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
  151. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/config.py +0 -0
  152. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/main.py +0 -0
  153. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  154. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  155. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  156. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  157. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_env.py +0 -0
  158. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
  159. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  160. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  161. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  162. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  163. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  164. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  165. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  166. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  167. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  168. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  169. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  170. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  171. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  172. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  173. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/packaging.py +0 -0
  174. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/platform_api.py +0 -0
  175. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/playback_policy.py +0 -0
  176. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/__init__.py +0 -0
  177. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  178. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
  179. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  180. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  181. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  182. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
  183. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
  184. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  185. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/__init__.py +0 -0
  186. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/buffer.py +0 -0
  187. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  188. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  189. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  190. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  191. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  192. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  193. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test.py +0 -0
  194. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  195. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/exception_config.py +0 -0
  196. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/execution.py +0 -0
  197. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  198. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  199. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  200. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/parameterize.py +0 -0
  201. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/plugin.py +0 -0
  202. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
  203. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  204. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
  205. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
  206. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/tracing_utils.py +0 -0
  207. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/types.py +0 -0
  208. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
  209. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/__init__.py +0 -0
  210. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  211. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  212. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  213. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  214. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  215. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  216. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  217. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
  218. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  219. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  220. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  221. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  222. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/utils.py +0 -0
  223. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/resources.py +0 -0
  224. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/reward_function.py +0 -0
  225. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/__init__.py +0 -0
  226. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/accuracy.py +0 -0
  227. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
  228. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  229. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  230. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
  231. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
  232. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/code_execution.py +0 -0
  233. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
  234. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
  235. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  236. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/format.py +0 -0
  237. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/function_calling.py +0 -0
  238. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/json_schema.py +0 -0
  239. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
  240. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
  241. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/length.py +0 -0
  242. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  243. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/math.py +0 -0
  244. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  245. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
  246. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/repetition.py +0 -0
  247. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/tag_count.py +0 -0
  248. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rl_processing.py +0 -0
  249. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/server.py +0 -0
  250. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/stats/__init__.py +0 -0
  251. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
  252. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/__init__.py +0 -0
  253. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/gepa_trainer.py +0 -0
  254. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/gepa_utils.py +0 -0
  255. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/trainer.py +0 -0
  256. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/utils.py +0 -0
  257. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/typed_interface.py +0 -0
  258. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/__init__.py +0 -0
  259. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/errors.py +0 -0
  260. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
  261. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/types.py +0 -0
  262. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/__init__.py +0 -0
  263. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
  264. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
  265. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/browser_utils.py +0 -0
  266. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/check_server_status.py +0 -0
  267. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
  268. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  269. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/logs_models.py +0 -0
  270. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/logs_server.py +0 -0
  271. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/module_loader.py +0 -0
  272. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
  273. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/show_results_url.py +0 -0
  274. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/static_policy.py +0 -0
  275. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
  276. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/vite_server.py +0 -0
  277. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/SOURCES.txt +0 -0
  278. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
  279. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
  280. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/requires.txt +0 -0
  281. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
  282. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/pyproject.toml +0 -0
  283. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/setup.cfg +0 -0
  284. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/setup.py +0 -0
  285. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_accuracy.py +0 -0
  286. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_accuracy_length.py +0 -0
  287. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_adapters_e2e.py +0 -0
  288. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_agent_orchestrator.py +0 -0
  289. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_agent_resources.py +0 -0
  290. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_auth.py +0 -0
  291. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_batch_evaluation.py +0 -0
  292. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli.py +0 -0
  293. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_agent.py +0 -0
  294. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_args.py +0 -0
  295. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_local_test.py +0 -0
  296. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_code_execution.py +0 -0
  297. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_config.py +0 -0
  298. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_control_plane_separation.py +0 -0
  299. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cpp_code.py +0 -0
  300. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_data_driven_task_manager.py +0 -0
  301. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_deepcoder_reward.py +0 -0
  302. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_deepeval_integration.py +0 -0
  303. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_deploy_integration.py +0 -0
  304. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_directory_utils.py +0 -0
  305. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_e2b_integration.py +0 -0
  306. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_e2b_js_integration.py +0 -0
  307. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_edge_cases.py +0 -0
  308. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_ep_upload_e2e.py +0 -0
  309. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_eval_protocol_import.py +0 -0
  310. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation.py +0 -0
  311. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation_integration.py +0 -0
  312. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation_postprocess.py +0 -0
  313. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation_preview_integration.py +0 -0
  314. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_event_bus.py +0 -0
  315. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_event_bus_helper.py +0 -0
  316. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_examples_end_to_end.py +0 -0
  317. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_exception_config.py +0 -0
  318. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_exceptions.py +0 -0
  319. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_fireworks_api.py +0 -0
  320. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_format.py +0 -0
  321. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_fractional_code.py +0 -0
  322. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_function_calling.py +0 -0
  323. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_gcp_tools.py +0 -0
  324. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_generic_server.py +0 -0
  325. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_human_id.py +0 -0
  326. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_integration.py +0 -0
  327. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_json_schema.py +0 -0
  328. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_kwargs_validation.py +0 -0
  329. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_language_consistency.py +0 -0
  330. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_lean_prover.py +0 -0
  331. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_lean_prover_runner.py +0 -0
  332. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_length.py +0 -0
  333. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_list_comparison_math_reward.py +0 -0
  334. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_litellm_policy_provider_fields.py +0 -0
  335. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_logs_server.py +0 -0
  336. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_logs_server_simple.py +0 -0
  337. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_math.py +0 -0
  338. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_message_field_filtering.py +0 -0
  339. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_minimal.py +0 -0
  340. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_models.py +0 -0
  341. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_models_rl.py +0 -0
  342. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
  343. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_n_variant_batch_integration.py +0 -0
  344. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_n_variant_integration.py +0 -0
  345. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_openai_compatibility.py +0 -0
  346. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_openai_rft_integration.py +0 -0
  347. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_openeval_integration.py +0 -0
  348. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_packaging.py +0 -0
  349. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_parallel_rollouts.py +0 -0
  350. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_platform_api.py +0 -0
  351. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_quickstart_utils.py +0 -0
  352. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_readiness.py +0 -0
  353. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_reasoning_steps.py +0 -0
  354. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_repetition.py +0 -0
  355. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_repetition_debug.py +0 -0
  356. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_retry_mechanism.py +0 -0
  357. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_reward_function.py +0 -0
  358. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_reward_protocol_import.py +0 -0
  359. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_rl_processing.py +0 -0
  360. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
  361. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_server.py +0 -0
  362. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_show_results_url.py +0 -0
  363. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_sqlite_hardening.py +0 -0
  364. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_status_migration_changes.py +0 -0
  365. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_status_migration_integration.py +0 -0
  366. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_status_model.py +0 -0
  367. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_tag_count.py +0 -0
  368. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
  369. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_training_utils.py +0 -0
  370. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_typed_interface.py +0 -0
  371. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_typed_interface_rl.py +0 -0
  372. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_upload_entrypoint.py +0 -0
  373. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_url_handling.py +0 -0
  374. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_vite_server.py +0 -0
  375. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/__init__.py +0 -0
  376. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/agent/__init__.py +0 -0
  377. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/agent/base.py +0 -0
  378. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
  379. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/__init__.py +0 -0
  380. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/api_config.py +0 -0
  381. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/data_model.py +0 -0
  382. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
  383. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/cli.py +0 -0
  384. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/config.py +0 -0
  385. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
  386. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
  387. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  388. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
  389. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  390. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  391. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  392. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  393. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  394. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  395. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  396. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/__init__.py +0 -0
  397. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/message.py +0 -0
  398. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/simulation.py +0 -0
  399. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/tasks.py +0 -0
  400. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/__init__.py +0 -0
  401. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
  402. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
  403. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
  404. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
  405. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
  406. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
  407. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
  408. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
  409. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
  410. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
  411. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
  412. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
  413. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
  414. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
  415. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
  416. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
  417. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
  418. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
  419. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  420. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  421. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  422. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  423. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  424. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  425. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  426. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  427. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
  428. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  429. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  430. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
  431. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/__init__.py +0 -0
  432. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/db.py +0 -0
  433. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/environment.py +0 -0
  434. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/server.py +0 -0
  435. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/tool.py +0 -0
  436. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/toolkit.py +0 -0
  437. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  438. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
  439. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
  440. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  441. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  442. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  443. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  444. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  445. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/metrics/__init__.py +0 -0
  446. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
  447. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  448. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
  449. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  450. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  451. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
  452. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/registry.py +0 -0
  453. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/run.py +0 -0
  454. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/__init__.py +0 -0
  455. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/check_data.py +0 -0
  456. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  457. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
  458. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
  459. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/user/__init__.py +0 -0
  460. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/user/base.py +0 -0
  461. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/user/user_simulator.py +0 -0
  462. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/__init__.py +0 -0
  463. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/display.py +0 -0
  464. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/io_utils.py +0 -0
  465. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
  466. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
  467. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/utils.py +0 -0
  468. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/versioneer.py +0 -0
  469. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  470. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
  471. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  472. {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.99
3
+ Version: 0.2.99.dev2
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-12-16T16:20:44-0800",
11
+ "date": "2025-12-17T19:22:32-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "2b765e03b649eee53bc18f024d5e7f7dbeb2891a",
15
- "version": "0.2.99"
14
+ "full-revisionid": "686ed67e7b83d4451d8fbd613f7d261a41fff9cb",
15
+ "version": "0.2.99.dev.2"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -660,8 +660,8 @@ def _create_rft_job(
660
660
  ("temperature", "temperature"),
661
661
  ("topP", "top_p"),
662
662
  ("topK", "top_k"),
663
- ("maxTokens", "max_output_tokens"),
664
- ("n", "response_candidates_count"),
663
+ ("maxOutputTokens", "max_output_tokens"),
664
+ ("responseCandidatesCount", "response_candidates_count"),
665
665
  ]:
666
666
  val = getattr(args, arg_name, None)
667
667
  if val is not None:
@@ -152,7 +152,7 @@ class TinkerRolloutProcessor(RolloutProcessor):
152
152
  # Update row
153
153
  new_messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
154
154
  row.messages = new_messages
155
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
155
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
156
156
 
157
157
  # Log usage (approximate since Tinker might not return usage stats in same format)
158
158
  # We can count tokens ourselves
@@ -150,7 +150,7 @@ class ExecutionManager:
150
150
  else:
151
151
  evaluation_row.rollout_status = Status.rollout_running()
152
152
 
153
- evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - row_start_time
153
+ evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - row_start_time
154
154
 
155
155
  return evaluation_row
156
156
 
@@ -809,9 +809,21 @@ class ExecutionMetadata(BaseModel):
809
809
 
810
810
  cost_metrics: Optional[CostMetrics] = Field(default=None, description="Cost breakdown for LLM API calls.")
811
811
 
812
+ # deprecated: use rollout_duration_seconds and eval_duration_seconds instead
812
813
  duration_seconds: Optional[float] = Field(
813
814
  default=None,
814
- description="Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
815
+ deprecated=True,
816
+ description="[Deprecated] Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
817
+ )
818
+
819
+ rollout_duration_seconds: Optional[float] = Field(
820
+ default=None,
821
+ description="Processing duration in seconds for the rollout of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
822
+ )
823
+
824
+ eval_duration_seconds: Optional[float] = Field(
825
+ default=None,
826
+ description="Processing duration in seconds for the evaluation of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
815
827
  )
816
828
 
817
829
  experiment_duration_seconds: Optional[float] = Field(
@@ -267,7 +267,7 @@ class AgentRolloutProcessor(RolloutProcessor):
267
267
  total_tokens=agent.usage["total_tokens"],
268
268
  )
269
269
 
270
- agent.evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - start_time
270
+ agent.evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
271
271
 
272
272
  return agent.evaluation_row
273
273
  finally:
@@ -83,7 +83,7 @@ class PydanticAgentRolloutProcessor(RolloutProcessor):
83
83
  # total_tokens=usage_info.total_tokens or 0,
84
84
  # )
85
85
 
86
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
86
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
87
87
 
88
88
  return row
89
89
 
@@ -180,7 +180,7 @@ class SingleTurnRolloutProcessor(RolloutProcessor):
180
180
 
181
181
  row.messages = messages
182
182
 
183
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
183
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
184
184
 
185
185
  default_logger.log(row)
186
186
  return row
@@ -42,7 +42,7 @@ AggregationMethod = Literal["mean", "max", "min", "bootstrap"]
42
42
 
43
43
 
44
44
  async def run_tasks_with_eval_progress(
45
- pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int
45
+ pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int, disable_tqdm: bool = False
46
46
  ) -> list[EvaluationRow]:
47
47
  """
48
48
  Run evaluation tasks with a progress bar and proper cancellation handling.
@@ -66,6 +66,7 @@ async def run_tasks_with_eval_progress(
66
66
  miniters=1,
67
67
  mininterval=0.1,
68
68
  bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
69
+ disable=disable_tqdm,
69
70
  ) as eval_pbar:
70
71
 
71
72
  async def task_with_progress(task: asyncio.Task[EvaluationRow]) -> EvaluationRow:
@@ -88,7 +89,10 @@ async def run_tasks_with_eval_progress(
88
89
 
89
90
 
90
91
  async def run_tasks_with_run_progress(
91
- execute_run_func: Callable[[int, RolloutProcessorConfig], Any], num_runs: int, config: RolloutProcessorConfig
92
+ execute_run_func: Callable[[int, RolloutProcessorConfig], Any],
93
+ num_runs: int,
94
+ config: RolloutProcessorConfig,
95
+ disable_tqdm: bool = False,
92
96
  ) -> None:
93
97
  """
94
98
  Run tasks with a parallel runs progress bar, preserving original logic.
@@ -108,6 +112,7 @@ async def run_tasks_with_run_progress(
108
112
  dynamic_ncols=True,
109
113
  miniters=1,
110
114
  bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
115
+ disable=disable_tqdm,
111
116
  ) as run_pbar:
112
117
 
113
118
  async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig) -> Any:
@@ -330,6 +335,7 @@ async def rollout_processor_with_retry(
330
335
  fresh_dataset: list[EvaluationRow],
331
336
  config: RolloutProcessorConfig,
332
337
  run_idx: int = 0,
338
+ disable_tqdm: bool = False,
333
339
  ) -> AsyncGenerator[EvaluationRow, None]:
334
340
  """
335
341
  Wrapper around rollout_processor that handles retry logic using the Python backoff library.
@@ -449,6 +455,7 @@ async def rollout_processor_with_retry(
449
455
  miniters=1,
450
456
  mininterval=0.1,
451
457
  bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
458
+ disable=disable_tqdm,
452
459
  ) as rollout_pbar:
453
460
  # Yield results as they complete
454
461
  for task in asyncio.as_completed(retry_tasks):
@@ -162,7 +162,7 @@ class GithubActionRolloutProcessor(RolloutProcessor):
162
162
  row.rollout_status = Status.rollout_error(
163
163
  f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
164
164
  )
165
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
165
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
166
166
  return row
167
167
 
168
168
  run_id = run.get("id")
@@ -170,7 +170,7 @@ class GithubActionRolloutProcessor(RolloutProcessor):
170
170
  row.rollout_status = Status.rollout_error(
171
171
  f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
172
172
  )
173
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
173
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
174
174
  return row
175
175
 
176
176
  # Poll the specific run until completion
@@ -194,10 +194,10 @@ class GithubActionRolloutProcessor(RolloutProcessor):
194
194
  row.rollout_status = Status.rollout_error(
195
195
  f"GitHub Actions run timed out after {self.timeout_seconds} seconds"
196
196
  )
197
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
197
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
198
198
  return row
199
199
 
200
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
200
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
201
201
 
202
202
  def _update_with_trace() -> None:
203
203
  return update_row_with_remote_trace(row, self._output_data_loader, self.model_base_url)
@@ -411,7 +411,7 @@ class OpenEnvRolloutProcessor(RolloutProcessor):
411
411
  completion_tokens=usage["completion_tokens"],
412
412
  total_tokens=usage["total_tokens"],
413
413
  )
414
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
414
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
415
415
 
416
416
  # Attach per-step rewards and accumulated token IDs to
417
417
  # execution_metadata.extra for downstream integrations
@@ -436,14 +436,14 @@ class OpenEnvRolloutProcessor(RolloutProcessor):
436
436
  logger.info("[OpenEnvRolloutProcessor] Total reward: %.3f", total_reward)
437
437
  logger.info(
438
438
  "[OpenEnvRolloutProcessor] Duration: %.2fs",
439
- row.execution_metadata.duration_seconds,
439
+ row.execution_metadata.rollout_duration_seconds,
440
440
  )
441
441
  logger.debug("[OpenEnvRolloutProcessor] Messages collected: %d", len(messages))
442
442
 
443
443
  logger.info(
444
444
  f"Rollout complete: {len(step_rewards)} steps, "
445
445
  f"total_reward={total_reward:.2f}, "
446
- f"duration={row.execution_metadata.duration_seconds:.2f}s"
446
+ f"duration={row.execution_metadata.rollout_duration_seconds:.2f}s"
447
447
  )
448
448
  # Final log with complete message history
449
449
  if getattr(config, "logger", None):
@@ -1,9 +1,12 @@
1
1
  import asyncio
2
2
  import logging
3
3
  import os
4
+ import time
4
5
  from collections import defaultdict
5
6
  from dataclasses import dataclass, field
6
- from typing import Any, Callable, List, Dict, Optional, Union, Awaitable
7
+ from typing import Any, List, Dict, Optional, Union
8
+
9
+ from tqdm.asyncio import tqdm as async_tqdm
7
10
 
8
11
  from eval_protocol.models import EvaluationRow, Status
9
12
  from eval_protocol.pytest.types import RolloutProcessorConfig, TestFunction
@@ -79,6 +82,18 @@ class PriorityRolloutScheduler:
79
82
  self.rollout_n = rollout_n
80
83
  self.in_group_minibatch_size = in_group_minibatch_size if in_group_minibatch_size > 0 else rollout_n
81
84
  self.evaluation_test_kwargs = evaluation_test_kwargs
85
+
86
+ # Progress bars (initialized in run())
87
+ self.rollout_pbar: Optional[async_tqdm] = None
88
+ self.eval_pbar: Optional[async_tqdm] = None
89
+
90
+ # Track active rollouts: {row_index: set of run_indices currently in progress}
91
+ self.active_rollouts: Dict[int, set] = defaultdict(set)
92
+ self.active_rollouts_lock = asyncio.Lock()
93
+
94
+ # Track active evaluations
95
+ self.active_evals: int = 0
96
+ self.active_evals_lock = asyncio.Lock()
82
97
 
83
98
  async def schedule_dataset(
84
99
  self,
@@ -132,41 +147,68 @@ class PriorityRolloutScheduler:
132
147
  experiment_id = rows_to_eval[0].execution_metadata.experiment_id if isinstance(rows_to_eval, list) else rows_to_eval.execution_metadata.experiment_id
133
148
  run_id = rows_to_eval[0].execution_metadata.run_id if isinstance(rows_to_eval, list) else rows_to_eval.execution_metadata.run_id
134
149
  eval_res = None
150
+
151
+ # Track active eval
152
+ async with self.active_evals_lock:
153
+ self.active_evals += 1
154
+ if self.eval_pbar:
155
+ self.eval_pbar.set_postfix_str(f"active={self.active_evals}")
156
+
157
+ start_time = time.perf_counter()
135
158
 
136
- async with self.eval_sem:
137
- async with rollout_logging_context(
138
- rollout_id or "",
139
- experiment_id=experiment_id,
140
- run_id=run_id,
141
- ):
142
- if isinstance(rows_to_eval, list):
143
- eval_res = await execute_pytest_with_exception_handling(
144
- test_func=self.eval_executor,
145
- evaluation_test_kwargs=self.evaluation_test_kwargs,
146
- processed_dataset=rows_to_eval,
147
- )
148
- else:
149
- eval_res = await execute_pytest_with_exception_handling(
150
- test_func=self.eval_executor,
151
- evaluation_test_kwargs=self.evaluation_test_kwargs,
152
- processed_row=rows_to_eval,
153
- )
154
-
155
- # push result to the output buffer
156
- if self.output_buffer:
159
+ try:
160
+ async with self.eval_sem:
161
+ async with rollout_logging_context(
162
+ rollout_id or "",
163
+ experiment_id=experiment_id,
164
+ run_id=run_id,
165
+ ):
166
+ if isinstance(rows_to_eval, list):
167
+ eval_res = await execute_pytest_with_exception_handling(
168
+ test_func=self.eval_executor,
169
+ evaluation_test_kwargs=self.evaluation_test_kwargs,
170
+ processed_dataset=rows_to_eval,
171
+ )
172
+ else:
173
+ eval_res = await execute_pytest_with_exception_handling(
174
+ test_func=self.eval_executor,
175
+ evaluation_test_kwargs=self.evaluation_test_kwargs,
176
+ processed_row=rows_to_eval,
177
+ )
178
+ eval_duration = time.perf_counter() - start_time
179
+
180
+ # Set eval_duration_seconds BEFORE buffer writes to ensure it's included in serialization
157
181
  if isinstance(eval_res, list):
158
182
  for row in eval_res:
159
- self._post_process_result(row)
160
- await self.output_buffer.add_result(row)
183
+ row.execution_metadata.eval_duration_seconds = eval_duration
161
184
  else:
162
- self._post_process_result(eval_res)
163
- await self.output_buffer.add_result(eval_res)
185
+ eval_res.execution_metadata.eval_duration_seconds = eval_duration
164
186
 
165
- if isinstance(eval_res, list):
166
- self.results.extend(eval_res)
167
- else:
168
- self.results.append(eval_res)
169
- return eval_res
187
+ # push result to the output buffer
188
+ if self.output_buffer:
189
+ if isinstance(eval_res, list):
190
+ for row in eval_res:
191
+ self._post_process_result(row)
192
+ await self.output_buffer.add_result(row)
193
+ else:
194
+ self._post_process_result(eval_res)
195
+ await self.output_buffer.add_result(eval_res)
196
+
197
+ if isinstance(eval_res, list):
198
+ for row in eval_res:
199
+ self.results.append(row)
200
+ else:
201
+ self.results.append(eval_res)
202
+ return eval_res
203
+ finally:
204
+ # Always update progress bar (handles both success and failure cases)
205
+ if self.eval_pbar:
206
+ self.eval_pbar.update(1)
207
+ # Decrement active eval counter
208
+ async with self.active_evals_lock:
209
+ self.active_evals -= 1
210
+ if self.eval_pbar:
211
+ self.eval_pbar.set_postfix_str(f"active={self.active_evals}")
170
212
 
171
213
  # 1. Prepare Config & Row for this micro-batch
172
214
  current_batch_rows = []
@@ -205,15 +247,33 @@ class PriorityRolloutScheduler:
205
247
  batch_results: List[EvaluationRow] = []
206
248
  if current_batch_rows:
207
249
  for idx, row in current_batch_rows:
208
- async for result_row in rollout_processor_with_retry(
209
- self.rollout_processor, [row], task.config, idx
210
- ):
211
- batch_results.append(result_row)
212
- # in pointwise, we start evaluation immediately
213
- if self.mode == "pointwise":
214
- t = asyncio.create_task(_run_eval(result_row))
215
- self.background_tasks.add(t)
216
- t.add_done_callback(self.background_tasks.discard)
250
+ # Track this rollout as active
251
+ async with self.active_rollouts_lock:
252
+ self.active_rollouts[task.row_index].add(idx)
253
+ await self._update_rollout_pbar_postfix()
254
+
255
+ try:
256
+ async for result_row in rollout_processor_with_retry(
257
+ self.rollout_processor, [row], task.config, idx, disable_tqdm=True
258
+ ):
259
+ batch_results.append(result_row)
260
+
261
+ # Update rollout progress bar
262
+ if self.rollout_pbar:
263
+ self.rollout_pbar.update(1)
264
+
265
+ # in pointwise, we start evaluation immediately
266
+ if self.mode == "pointwise":
267
+ t = asyncio.create_task(_run_eval(result_row))
268
+ self.background_tasks.add(t)
269
+ t.add_done_callback(self.background_tasks.discard)
270
+ finally:
271
+ # Remove from active tracking
272
+ async with self.active_rollouts_lock:
273
+ self.active_rollouts[task.row_index].discard(idx)
274
+ if not self.active_rollouts[task.row_index]:
275
+ del self.active_rollouts[task.row_index]
276
+ await self._update_rollout_pbar_postfix()
217
277
 
218
278
  # 3. Evaluate and Collect History
219
279
  current_batch_history_updates = []
@@ -257,6 +317,34 @@ class PriorityRolloutScheduler:
257
317
  )
258
318
  self.queue.put_nowait(new_task)
259
319
 
320
+ def _format_active_rollouts(self) -> str:
321
+ """Format active rollouts for display in progress bar."""
322
+ if not self.active_rollouts:
323
+ return ""
324
+
325
+ # Show active rows and their run indices
326
+ parts = []
327
+ for row_idx in sorted(self.active_rollouts.keys())[:5]: # Limit to 5 rows to keep it readable
328
+ runs = sorted(self.active_rollouts[row_idx])
329
+ if runs:
330
+ runs_str = ",".join(str(r) for r in runs[:3]) # Show up to 3 run indices
331
+ if len(runs) > 3:
332
+ runs_str += f"+{len(runs)-3}"
333
+ parts.append(f"r{row_idx}:[{runs_str}]")
334
+
335
+ if len(self.active_rollouts) > 5:
336
+ parts.append(f"+{len(self.active_rollouts)-5} more")
337
+
338
+ return " | ".join(parts)
339
+
340
+ async def _update_rollout_pbar_postfix(self):
341
+ """Update the rollout progress bar postfix with active tasks info."""
342
+ if self.rollout_pbar:
343
+ active_count = sum(len(runs) for runs in self.active_rollouts.values())
344
+ self.rollout_pbar.set_postfix_str(
345
+ f"active={active_count} {self._format_active_rollouts()}"
346
+ )
347
+
260
348
  def _post_process_result(self, res: EvaluationRow):
261
349
  """
262
350
  Process evaluation result: update cost metrics, status, and log.
@@ -294,28 +382,58 @@ class PriorityRolloutScheduler:
294
382
  async def run(self, dataset: List[EvaluationRow], num_runs: int, base_config: RolloutProcessorConfig):
295
383
  self.num_runs = num_runs
296
384
 
297
- # 1. Schedule initial tasks
298
- await self.schedule_dataset(dataset, base_config)
299
-
300
- # 2. Start Workers
301
- # If we have separate limits, we need enough workers to saturate both stages
302
- num_workers = self.max_concurrent_rollouts
303
-
304
- workers = [asyncio.create_task(self.worker()) for _ in range(num_workers)]
305
-
306
- # 3. Wait for completion
307
- await self.queue.join()
308
-
309
- # Wait for background evaluations to finish
310
- if self.background_tasks:
311
- await asyncio.gather(*self.background_tasks, return_exceptions=True)
385
+ # Calculate totals for progress bars
386
+ total_rollouts = len(dataset) * num_runs
387
+ # In pointwise mode: 1 eval per rollout; in groupwise mode: 1 eval per dataset row
388
+ total_evals = total_rollouts if self.mode == "pointwise" else len(dataset)
312
389
 
313
- # 4. Cleanup
314
- for w in workers:
315
- w.cancel()
390
+ # Initialize progress bars
391
+ self.rollout_pbar = async_tqdm(
392
+ total=total_rollouts,
393
+ desc="🚀 Rollouts",
394
+ unit="row",
395
+ position=0,
396
+ leave=True,
397
+ colour="cyan",
398
+ )
399
+ self.eval_pbar = async_tqdm(
400
+ total=total_evals,
401
+ desc="📊 Evals",
402
+ unit="eval",
403
+ position=1,
404
+ leave=True,
405
+ colour="green",
406
+ )
316
407
 
317
- if workers:
318
- await asyncio.gather(*workers, return_exceptions=True)
408
+ try:
409
+ # 1. Schedule initial tasks
410
+ await self.schedule_dataset(dataset, base_config)
411
+
412
+ # 2. Start Workers
413
+ # If we have separate limits, we need enough workers to saturate both stages
414
+ num_workers = self.max_concurrent_rollouts
415
+
416
+ workers = [asyncio.create_task(self.worker()) for _ in range(num_workers)]
417
+
418
+ # 3. Wait for completion
419
+ await self.queue.join()
420
+
421
+ # Wait for background evaluations to finish
422
+ if self.background_tasks:
423
+ await asyncio.gather(*self.background_tasks, return_exceptions=True)
424
+
425
+ # 4. Cleanup
426
+ for w in workers:
427
+ w.cancel()
428
+
429
+ if workers:
430
+ await asyncio.gather(*workers, return_exceptions=True)
431
+ finally:
432
+ # Close progress bars
433
+ if self.rollout_pbar:
434
+ self.rollout_pbar.close()
435
+ if self.eval_pbar:
436
+ self.eval_pbar.close()
319
437
 
320
438
  # Return collected results
321
439
  return self.results
@@ -185,7 +185,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
185
185
  f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds"
186
186
  )
187
187
 
188
- row.execution_metadata.duration_seconds = time.perf_counter() - start_time
188
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
189
189
 
190
190
  def _update_with_trace() -> None:
191
191
  return update_row_with_remote_trace(row, self._output_data_loader, model_base_url)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.99
3
+ Version: 0.2.99.dev2
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -182,8 +182,8 @@ def test_create_rft_passes_all_flags_into_request_body(rft_test_harness, monkeyp
182
182
  assert abs(ip["temperature"] - 0.9) < 1e-12
183
183
  assert abs(ip["topP"] - 0.95) < 1e-12
184
184
  assert ip["topK"] == 50
185
- assert ip["maxTokens"] == 4096
186
- assert ip["n"] == 6
185
+ assert ip["maxOutputTokens"] == 4096
186
+ assert ip["responseCandidatesCount"] == 6
187
187
  assert ip["extraBody"] == '{"foo":"bar"}'
188
188
 
189
189
  # W&B mapping
@@ -1126,8 +1126,8 @@ def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatc
1126
1126
 
1127
1127
  # Inference params mapping
1128
1128
  ip = body["inferenceParameters"]
1129
- assert ip["n"] == 4
1130
- assert ip["maxTokens"] == 32768
1129
+ assert ip["responseCandidatesCount"] == 4
1130
+ assert ip["maxOutputTokens"] == 32768
1131
1131
 
1132
1132
  # Other top-level
1133
1133
  assert body["chunkSize"] == 50
@@ -57,7 +57,7 @@ async def test_scheduler_basic_execution(
57
57
  micro_batch_size = 1
58
58
 
59
59
  # Mock rollout processor with delay
60
- async def delayed_rollout(processor, rows, config, run_idx):
60
+ async def delayed_rollout(processor, rows, config, run_idx, **kwargs):
61
61
  await asyncio.sleep(0.01)
62
62
  for row in rows:
63
63
  yield row
@@ -110,7 +110,7 @@ async def test_concurrency_control(
110
110
  rollout_lock = asyncio.Lock()
111
111
  eval_lock = asyncio.Lock()
112
112
 
113
- async def mock_rollout_gen(processor, rows, config, run_idx):
113
+ async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
114
114
  nonlocal active_rollouts, max_active_rollouts_seen
115
115
  async with rollout_lock:
116
116
  active_rollouts += 1
@@ -177,7 +177,7 @@ async def test_priority_scheduling(
177
177
 
178
178
  execution_order = []
179
179
 
180
- async def mock_rollout_gen(processor, rows, config, run_idx):
180
+ async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
181
181
  row_id = rows[0].input_metadata.row_id
182
182
  execution_order.append(f"{row_id}_run_{run_idx}")
183
183
  for row in rows:
@@ -290,7 +290,7 @@ async def test_groupwise_mode(
290
290
  eval_calls.append(rows)
291
291
  return rows # Pass through
292
292
 
293
- async def mock_rollout_gen(processor, rows, config, run_idx):
293
+ async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
294
294
  for row in rows:
295
295
  yield row
296
296