eval-protocol 0.2.96__tar.gz → 0.2.97__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (464) hide show
  1. {eval_protocol-0.2.96/eval_protocol.egg-info → eval_protocol-0.2.97}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +61 -135
  4. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/create_rft.py +8 -2
  5. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/local_test.py +25 -2
  6. eval_protocol-0.2.97/eval_protocol/pytest/buffer.py +82 -0
  7. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/evaluation_test.py +284 -226
  8. eval_protocol-0.2.97/eval_protocol/pytest/priority_scheduler.py +348 -0
  9. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/validate_signature.py +0 -2
  10. {eval_protocol-0.2.96 → eval_protocol-0.2.97/eval_protocol.egg-info}/PKG-INFO +1 -1
  11. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/SOURCES.txt +3 -0
  12. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation_postprocess.py +60 -1
  13. eval_protocol-0.2.97/tests/test_priority_scheduler.py +322 -0
  14. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/LICENSE +0 -0
  15. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/README.md +0 -0
  16. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/__init__.py +0 -0
  17. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/normalize_sandbox_fusion.py +0 -0
  18. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/utils/__init__.py +0 -0
  19. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/utils/generate_api_key.py +0 -0
  20. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/utils/subprocess_manager.py +0 -0
  21. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/__init__.py +0 -0
  22. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/__main__.py +0 -0
  23. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/__init__.py +0 -0
  24. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/base.py +0 -0
  25. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/bigquery.py +0 -0
  26. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/braintrust.py +0 -0
  27. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  28. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/huggingface.py +0 -0
  29. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/langchain.py +0 -0
  30. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/langfuse.py +0 -0
  31. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/langsmith.py +0 -0
  32. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/openai_responses.py +0 -0
  33. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/trl.py +0 -0
  34. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/utils.py +0 -0
  35. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/weave.py +0 -0
  36. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/__init__.py +0 -0
  37. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/models.py +0 -0
  38. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/orchestrator.py +0 -0
  39. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resource_abc.py +0 -0
  40. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resource_pool.py +0 -0
  41. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/__init__.py +0 -0
  42. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  43. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  44. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  45. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  46. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  47. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/docker_resource.py +0 -0
  48. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  49. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  50. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/sql_resource.py +0 -0
  51. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/task_manager.py +0 -0
  52. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/tool_registry.py +0 -0
  53. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/auth.py +0 -0
  54. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/__init__.py +0 -0
  55. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  56. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  57. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_aime25.py +0 -0
  58. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  59. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  60. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  61. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  62. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  63. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli.py +0 -0
  64. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/__init__.py +0 -0
  65. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  66. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/common.py +0 -0
  67. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/deploy.py +0 -0
  68. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  69. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/logs.py +0 -0
  70. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/preview.py +0 -0
  71. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  72. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/upload.py +0 -0
  73. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/utils.py +0 -0
  74. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/common_utils.py +0 -0
  75. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/config.py +0 -0
  76. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/__init__.py +0 -0
  77. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  78. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  79. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  80. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  81. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/models.py +0 -0
  82. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/__init__.py +0 -0
  83. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  84. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  85. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  86. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  87. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/datasets/__init__.py +0 -0
  88. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/datasets/loader.py +0 -0
  89. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/directory_utils.py +0 -0
  90. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/evaluation.py +0 -0
  91. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/__init__.py +0 -0
  92. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/event_bus.py +0 -0
  93. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/logger.py +0 -0
  94. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  95. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  96. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/exceptions.py +0 -0
  97. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/execution/__init__.py +0 -0
  98. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/execution/pipeline.py +0 -0
  99. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/fireworks_rft.py +0 -0
  100. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/gcp_tools.py +0 -0
  101. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generation/cache.py +0 -0
  102. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generation/clients/base.py +0 -0
  103. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generation/clients.py +0 -0
  104. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generic_server.py +0 -0
  105. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/get_pep440_version.py +0 -0
  106. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/human_id/__init__.py +0 -0
  107. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/human_id/dictionary.py +0 -0
  108. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/__init__.py +0 -0
  109. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/deepeval.py +0 -0
  110. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/openai_rft.py +0 -0
  111. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/openeval.py +0 -0
  112. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  113. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  114. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/trl.py +0 -0
  115. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/__init__.py +0 -0
  116. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  117. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  118. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  119. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  120. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/init.py +0 -0
  121. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/rollout_context.py +0 -0
  122. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  123. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/util.py +0 -0
  124. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/logging_utils.py +0 -0
  125. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/__init__.py +0 -0
  126. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/adapter.py +0 -0
  127. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/client/__init__.py +0 -0
  128. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/client/connection.py +0 -0
  129. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/clients.py +0 -0
  130. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/__init__.py +0 -0
  131. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/base_policy.py +0 -0
  132. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/manager.py +0 -0
  133. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/policy.py +0 -0
  134. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  135. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/grid_renderer.py +0 -0
  136. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  137. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/mcpgym.py +0 -0
  138. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/process_manager.py +0 -0
  139. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/session/__init__.py +0 -0
  140. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/session/manager.py +0 -0
  141. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/simple_process_manager.py +0 -0
  142. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/simulation_server.py +0 -0
  143. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/__init__.py +0 -0
  144. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/config.py +0 -0
  145. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/main.py +0 -0
  146. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  147. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  148. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  149. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  150. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_env.py +0 -0
  151. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/__init__.py +0 -0
  152. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  153. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  154. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  155. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  156. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  157. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  158. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  159. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  160. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  161. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  162. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  163. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  166. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/models.py +0 -0
  167. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/packaging.py +0 -0
  168. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/platform_api.py +0 -0
  169. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/playback_policy.py +0 -0
  170. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/__init__.py +0 -0
  171. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  172. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/app.py +0 -0
  173. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  174. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  175. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  176. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/main.py +0 -0
  177. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/models.py +0 -0
  178. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  179. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/__init__.py +0 -0
  180. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  181. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  182. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  183. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  184. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  185. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  186. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  187. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  188. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  189. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  190. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  191. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/exception_config.py +0 -0
  192. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/execution.py +0 -0
  193. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  194. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  195. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  196. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  197. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
  198. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/parameterize.py +0 -0
  199. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/plugin.py +0 -0
  200. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  201. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/rollout_processor.py +0 -0
  202. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  203. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/store_experiment_link.py +0 -0
  204. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/store_results_url.py +0 -0
  205. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/tracing_utils.py +0 -0
  206. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/types.py +0 -0
  207. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/__init__.py +0 -0
  208. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  209. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  210. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  211. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  212. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  213. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  214. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  215. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/llm_judge.py +0 -0
  216. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  217. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  218. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  219. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  220. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/utils.py +0 -0
  221. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/resources.py +0 -0
  222. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/reward_function.py +0 -0
  223. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/__init__.py +0 -0
  224. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/accuracy.py +0 -0
  225. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/accuracy_length.py +0 -0
  226. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  227. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  228. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/apps_testing_util.py +0 -0
  229. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/bfcl_reward.py +0 -0
  230. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/code_execution.py +0 -0
  231. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/code_execution_utils.py +0 -0
  232. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/cpp_code.py +0 -0
  233. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  234. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/format.py +0 -0
  235. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/function_calling.py +0 -0
  236. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/json_schema.py +0 -0
  237. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/language_consistency.py +0 -0
  238. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/lean_prover.py +0 -0
  239. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/length.py +0 -0
  240. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  241. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/math.py +0 -0
  242. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  243. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/reasoning_steps.py +0 -0
  244. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/repetition.py +0 -0
  245. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/tag_count.py +0 -0
  246. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rl_processing.py +0 -0
  247. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/server.py +0 -0
  248. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/stats/__init__.py +0 -0
  249. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/stats/confidence_intervals.py +0 -0
  250. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/typed_interface.py +0 -0
  251. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/__init__.py +0 -0
  252. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/errors.py +0 -0
  253. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/remote_rollout_processor.py +0 -0
  254. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/types.py +0 -0
  255. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/__init__.py +0 -0
  256. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/batch_evaluation.py +0 -0
  257. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/batch_transformation.py +0 -0
  258. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/browser_utils.py +0 -0
  259. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/check_server_status.py +0 -0
  260. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/dataset_helpers.py +0 -0
  261. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  262. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/logs_models.py +0 -0
  263. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/logs_server.py +0 -0
  264. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/module_loader.py +0 -0
  265. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/packaging_utils.py +0 -0
  266. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/show_results_url.py +0 -0
  267. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/static_policy.py +0 -0
  268. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/subprocess_utils.py +0 -0
  269. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/vite_server.py +0 -0
  270. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/dependency_links.txt +0 -0
  271. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/entry_points.txt +0 -0
  272. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/requires.txt +0 -0
  273. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/top_level.txt +0 -0
  274. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/pyproject.toml +0 -0
  275. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/setup.cfg +0 -0
  276. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/setup.py +0 -0
  277. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_accuracy.py +0 -0
  278. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_accuracy_length.py +0 -0
  279. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_adapters_e2e.py +0 -0
  280. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_agent_orchestrator.py +0 -0
  281. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_agent_resources.py +0 -0
  282. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_auth.py +0 -0
  283. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_batch_evaluation.py +0 -0
  284. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli.py +0 -0
  285. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_agent.py +0 -0
  286. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_args.py +0 -0
  287. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_create_rft.py +0 -0
  288. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_local_test.py +0 -0
  289. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_code_execution.py +0 -0
  290. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_config.py +0 -0
  291. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_control_plane_separation.py +0 -0
  292. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cpp_code.py +0 -0
  293. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_data_driven_task_manager.py +0 -0
  294. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_deepcoder_reward.py +0 -0
  295. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_deepeval_integration.py +0 -0
  296. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_deploy_integration.py +0 -0
  297. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_directory_utils.py +0 -0
  298. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_e2b_integration.py +0 -0
  299. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_e2b_js_integration.py +0 -0
  300. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_edge_cases.py +0 -0
  301. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_ep_upload_e2e.py +0 -0
  302. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_eval_protocol_import.py +0 -0
  303. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation.py +0 -0
  304. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation_integration.py +0 -0
  305. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation_preview_integration.py +0 -0
  306. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_event_bus.py +0 -0
  307. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_event_bus_helper.py +0 -0
  308. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_examples_end_to_end.py +0 -0
  309. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_exception_config.py +0 -0
  310. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_exceptions.py +0 -0
  311. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_fireworks_api.py +0 -0
  312. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_format.py +0 -0
  313. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_fractional_code.py +0 -0
  314. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_function_calling.py +0 -0
  315. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_gcp_tools.py +0 -0
  316. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_generic_server.py +0 -0
  317. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_human_id.py +0 -0
  318. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_integration.py +0 -0
  319. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_json_schema.py +0 -0
  320. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_kwargs_validation.py +0 -0
  321. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_language_consistency.py +0 -0
  322. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_lean_prover.py +0 -0
  323. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_lean_prover_runner.py +0 -0
  324. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_length.py +0 -0
  325. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_list_comparison_math_reward.py +0 -0
  326. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_litellm_policy_provider_fields.py +0 -0
  327. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_logs_server.py +0 -0
  328. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_logs_server_simple.py +0 -0
  329. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_math.py +0 -0
  330. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_message_field_filtering.py +0 -0
  331. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_minimal.py +0 -0
  332. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_models.py +0 -0
  333. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_models_rl.py +0 -0
  334. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_multiple_choice_math_reward.py +0 -0
  335. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_n_variant_batch_integration.py +0 -0
  336. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_n_variant_integration.py +0 -0
  337. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_openai_compatibility.py +0 -0
  338. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_openai_rft_integration.py +0 -0
  339. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_openeval_integration.py +0 -0
  340. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_packaging.py +0 -0
  341. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_parallel_rollouts.py +0 -0
  342. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_platform_api.py +0 -0
  343. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_quickstart_utils.py +0 -0
  344. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_readiness.py +0 -0
  345. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_reasoning_steps.py +0 -0
  346. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_repetition.py +0 -0
  347. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_repetition_debug.py +0 -0
  348. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_retry_mechanism.py +0 -0
  349. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_reward_function.py +0 -0
  350. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_reward_protocol_import.py +0 -0
  351. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_rl_processing.py +0 -0
  352. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_rollout_control_plane_integration.py +0 -0
  353. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_server.py +0 -0
  354. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_show_results_url.py +0 -0
  355. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_status_migration_changes.py +0 -0
  356. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_status_migration_integration.py +0 -0
  357. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_status_model.py +0 -0
  358. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_tag_count.py +0 -0
  359. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_tau_bench_airline_smoke.py +0 -0
  360. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_typed_interface.py +0 -0
  361. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_typed_interface_rl.py +0 -0
  362. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_upload_entrypoint.py +0 -0
  363. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_url_handling.py +0 -0
  364. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_vite_server.py +0 -0
  365. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/__init__.py +0 -0
  366. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/agent/__init__.py +0 -0
  367. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/agent/base.py +0 -0
  368. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/agent/llm_agent.py +0 -0
  369. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/__init__.py +0 -0
  370. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/api_config.py +0 -0
  371. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/data_model.py +0 -0
  372. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/simulation_service.py +0 -0
  373. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/cli.py +0 -0
  374. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/config.py +0 -0
  375. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/airline/policy.md +0 -0
  376. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/mock/policy.md +0 -0
  377. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  378. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/retail/policy.md +0 -0
  379. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  380. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  381. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  382. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  383. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  384. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  385. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  386. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/__init__.py +0 -0
  387. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/message.py +0 -0
  388. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/simulation.py +0 -0
  389. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/tasks.py +0 -0
  390. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/__init__.py +0 -0
  391. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/__init__.py +0 -0
  392. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/data_model.py +0 -0
  393. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/environment.py +0 -0
  394. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/tools.py +0 -0
  395. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/utils.py +0 -0
  396. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/__init__.py +0 -0
  397. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/data_model.py +0 -0
  398. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/environment.py +0 -0
  399. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/tools.py +0 -0
  400. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/utils.py +0 -0
  401. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/__init__.py +0 -0
  402. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/data_model.py +0 -0
  403. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/environment.py +0 -0
  404. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/tools.py +0 -0
  405. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/utils.py +0 -0
  406. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/__init__.py +0 -0
  407. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/data_model.py +0 -0
  408. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/environment.py +0 -0
  409. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  410. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  411. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  412. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  413. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  414. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  415. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  416. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  417. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tools.py +0 -0
  418. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  419. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  420. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/utils.py +0 -0
  421. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/__init__.py +0 -0
  422. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/db.py +0 -0
  423. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/environment.py +0 -0
  424. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/server.py +0 -0
  425. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/tool.py +0 -0
  426. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/toolkit.py +0 -0
  427. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  428. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/__init__.py +0 -0
  429. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator.py +0 -0
  430. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  431. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  432. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  433. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  434. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  435. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/metrics/__init__.py +0 -0
  436. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/metrics/agent_metrics.py +0 -0
  437. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  438. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/__init__.py +0 -0
  439. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  440. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  441. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/utils.py +0 -0
  442. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/registry.py +0 -0
  443. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/run.py +0 -0
  444. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/__init__.py +0 -0
  445. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/check_data.py +0 -0
  446. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  447. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/start_servers.py +0 -0
  448. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/view_simulations.py +0 -0
  449. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/user/__init__.py +0 -0
  450. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/user/base.py +0 -0
  451. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/user/user_simulator.py +0 -0
  452. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/__init__.py +0 -0
  453. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/display.py +0 -0
  454. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/io_utils.py +0 -0
  455. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/llm_utils.py +0 -0
  456. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/pydantic_utils.py +0 -0
  457. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/utils.py +0 -0
  458. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/versioneer.py +0 -0
  459. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  460. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/index-CuQbfdPD.js +0 -0
  461. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/index-CuQbfdPD.js.map +0 -0
  462. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
  463. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  464. {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.96
3
+ Version: 0.2.97
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-12-08T17:48:51-0800",
11
+ "date": "2025-12-09T23:27:24-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "0902602fdca3f7d80e259fd29bd293758a79c44a",
15
- "version": "0.2.96"
14
+ "full-revisionid": "a8914717e39825c126682e1686e036c0e7aa8960",
15
+ "version": "0.2.97"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -10,7 +10,6 @@ from eval_protocol.models import (
10
10
  EvaluationRow,
11
11
  Message,
12
12
  MetricResult,
13
- ChatCompletionContentPartTextParam,
14
13
  )
15
14
  from eval_protocol.pytest.default_single_turn_rollout_process import (
16
15
  SingleTurnRolloutProcessor,
@@ -18,12 +17,12 @@ from eval_protocol.pytest.default_single_turn_rollout_process import (
18
17
  from eval_protocol.pytest.evaluation_test import evaluation_test
19
18
 
20
19
 
21
- DEFAULT_MODEL_ID = "fireworks_ai/accounts/fireworks/models/glm-4p6"
20
+ DEFAULT_MODEL_ID = "fireworks_ai/accounts/pyroworks/deployedModels/minimax-m2-zmi4qk9f"
22
21
  DEFAULT_MAX_TOKENS = 10000
23
22
 
24
23
 
25
24
  def _coerce_content_to_str(
26
- content: str | list[ChatCompletionContentPartTextParam] | None,
25
+ content: str | list[Any] | None,
27
26
  ) -> str:
28
27
  if isinstance(content, list):
29
28
  texts: list[str] = []
@@ -153,7 +152,34 @@ PEER_TOOL_BRACE_PAYLOAD = {
153
152
  "content": "Call test_brace_bug with param1='test_value', param2=42, and param3=true",
154
153
  }
155
154
  ],
156
- "tools": WEATHER_TOOL_DEFINITION,
155
+ "tools": [
156
+ {
157
+ "type": "function",
158
+ "function": {
159
+ "name": "test_brace_bug",
160
+ "description": "A test function to validate JSON brace handling in tool arguments",
161
+ "parameters": {
162
+ "type": "object",
163
+ "properties": {
164
+ "param1": {
165
+ "type": "string",
166
+ "description": "A string parameter",
167
+ },
168
+ "param2": {
169
+ "type": "integer",
170
+ "description": "An integer parameter",
171
+ },
172
+ "param3": {
173
+ "type": "boolean",
174
+ "description": "A boolean parameter",
175
+ },
176
+ },
177
+ "required": ["param1", "param2", "param3"],
178
+ "additionalProperties": False,
179
+ },
180
+ },
181
+ }
182
+ ],
157
183
  "temperature": 0.1,
158
184
  "top_p": 1,
159
185
  }
@@ -468,48 +494,6 @@ PEER_TOOL_PARAMETER_FORMAT_ERRORS_PAYLOAD = {
468
494
  "stream": True,
469
495
  }
470
496
 
471
- PEER_TOOL_RECOVERY_FAILURE_PAYLOAD = {
472
- "messages": [
473
- {
474
- "role": "user",
475
- "content": (
476
- "View the file at /tmp/test.txt. If that fails, try again with the correct parameters. "
477
- "Keep retrying until it works."
478
- ),
479
- }
480
- ],
481
- "tools": [
482
- {
483
- "type": "function",
484
- "function": {
485
- "name": "view",
486
- "description": "View a file or directory",
487
- "strict": True,
488
- "parameters": {
489
- "type": "object",
490
- "properties": {
491
- "path": {
492
- "type": "string",
493
- "description": "Path to the file or directory to view",
494
- },
495
- "type": {
496
- "type": "string",
497
- "enum": ["file", "directory"],
498
- "description": "Type of the path (file or directory)",
499
- },
500
- },
501
- "required": ["path", "type"],
502
- "additionalProperties": False,
503
- },
504
- },
505
- }
506
- ],
507
- "tool_choice": "required",
508
- "temperature": 0.1,
509
- "max_tokens": 4000,
510
- "stream": True,
511
- }
512
-
513
497
 
514
498
  def _build_row_from_payload(case: str, payload: dict[str, Any]) -> EvaluationRow:
515
499
  messages = [
@@ -1329,13 +1313,13 @@ def test_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
1329
1313
  return row
1330
1314
 
1331
1315
 
1332
- _PEER_TOOL_MISSING_REQUIRED_ROW = _build_row_from_payload(
1333
- "peer-tool-missing-required-param", PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD
1316
+ _PEER_TOOL_REQUIRED_PARAMS_ROW = _build_row_from_payload(
1317
+ "peer-tool-required-params", PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD
1334
1318
  )
1335
1319
 
1336
1320
 
1337
1321
  @evaluation_test(
1338
- input_rows=[[_PEER_TOOL_MISSING_REQUIRED_ROW]],
1322
+ input_rows=[[_PEER_TOOL_REQUIRED_PARAMS_ROW]],
1339
1323
  completion_params=[_build_completion_params_from_payload(PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD)],
1340
1324
  rollout_processor=SingleTurnRolloutProcessor(),
1341
1325
  aggregation_method="mean",
@@ -1343,22 +1327,23 @@ _PEER_TOOL_MISSING_REQUIRED_ROW = _build_row_from_payload(
1343
1327
  num_runs=1,
1344
1328
  mode="pointwise",
1345
1329
  )
1346
- def test_streaming_tool_missing_required_param(row: EvaluationRow) -> EvaluationRow:
1347
- """Detect whether required parameters are omitted during streaming."""
1330
+ def test_streaming_tool_required_params_present(row: EvaluationRow) -> EvaluationRow:
1331
+ """Verify that tool calls include all required parameters during streaming."""
1348
1332
 
1349
1333
  assistant_msg = row.last_assistant_message()
1350
1334
  finish_reason = row.execution_metadata.finish_reason
1351
- _debug_log_assistant_message("tool_missing_required_param", assistant_msg, finish_reason)
1335
+ _debug_log_assistant_message("tool_required_params", assistant_msg, finish_reason)
1352
1336
  content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
1353
1337
  reasoning_str = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
1354
1338
  calls = _collect_tool_calls(assistant_msg.tool_calls if assistant_msg else [])
1355
1339
 
1356
- missing_required = False
1340
+ required_params_present = False
1357
1341
  arguments = None
1358
1342
  for _, args in calls:
1359
1343
  if args:
1360
1344
  arguments = args
1361
- missing_required = "type" not in args or args.get("type") not in {"file", "directory"}
1345
+ # Check that required 'type' param is present and valid
1346
+ required_params_present = "type" in args and args.get("type") in {"file", "directory"}
1362
1347
 
1363
1348
  metrics = {
1364
1349
  "tool_call_emitted": MetricResult(
@@ -1366,10 +1351,12 @@ def test_streaming_tool_missing_required_param(row: EvaluationRow) -> Evaluation
1366
1351
  is_score_valid=True,
1367
1352
  reason="Tool call emitted" if calls else "No tool call emitted",
1368
1353
  ),
1369
- "missing_required_param": MetricResult(
1370
- score=1.0 if missing_required else 0.0,
1354
+ "required_params_present": MetricResult(
1355
+ score=1.0 if required_params_present else 0.0,
1371
1356
  is_score_valid=bool(calls),
1372
- reason="Required parameter missing or invalid" if missing_required else "All required parameters present",
1357
+ reason="All required parameters present"
1358
+ if required_params_present
1359
+ else "Required parameter missing or invalid",
1373
1360
  data={"arguments": arguments},
1374
1361
  ),
1375
1362
  "finish_reason": MetricResult(
@@ -1386,15 +1373,19 @@ def test_streaming_tool_missing_required_param(row: EvaluationRow) -> Evaluation
1386
1373
  )
1387
1374
 
1388
1375
  all_checks_passed = (
1389
- missing_required and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage
1376
+ required_params_present
1377
+ and finish_reason_present
1378
+ and no_forbidden_tags
1379
+ and no_xml_tags
1380
+ and no_reasoning_leakage
1390
1381
  )
1391
1382
 
1392
1383
  row.evaluation_result = EvaluateResult(
1393
1384
  score=1.0 if all_checks_passed else 0.0,
1394
1385
  is_score_valid=True,
1395
- reason="Detected missing required parameter"
1386
+ reason="All required parameters included in tool call"
1396
1387
  if all_checks_passed
1397
- else "Required parameters satisfied or response invalid",
1388
+ else "Required parameters missing or response invalid",
1398
1389
  metrics=metrics,
1399
1390
  )
1400
1391
  return row
@@ -1674,71 +1665,6 @@ def test_streaming_tool_parameter_types(row: EvaluationRow) -> EvaluationRow:
1674
1665
  return row
1675
1666
 
1676
1667
 
1677
- _PEER_TOOL_RECOVERY_ROW = _build_row_from_payload("peer-tool-recovery-failure", PEER_TOOL_RECOVERY_FAILURE_PAYLOAD)
1678
-
1679
-
1680
- @evaluation_test(
1681
- input_rows=[[_PEER_TOOL_RECOVERY_ROW]],
1682
- completion_params=[_build_completion_params_from_payload(PEER_TOOL_RECOVERY_FAILURE_PAYLOAD)],
1683
- rollout_processor=SingleTurnRolloutProcessor(),
1684
- aggregation_method="mean",
1685
- passed_threshold=0.0,
1686
- num_runs=1,
1687
- mode="pointwise",
1688
- )
1689
- def test_streaming_tool_retry_behavior(row: EvaluationRow) -> EvaluationRow:
1690
- """Check whether the assistant retries tool calls when instructed to recover."""
1691
-
1692
- assistant_msg = row.last_assistant_message()
1693
- print(f"assistant_msg: {assistant_msg}")
1694
- finish_reason = row.execution_metadata.finish_reason
1695
- _debug_log_assistant_message("tool_recovery", assistant_msg, finish_reason)
1696
- content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
1697
- calls = _collect_tool_calls(assistant_msg.tool_calls if assistant_msg else [])
1698
- reasoning = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
1699
-
1700
- multiple_attempts = len(calls) >= 2
1701
- metrics = {
1702
- "tool_call_attempts": MetricResult(
1703
- score=1.0 if multiple_attempts else 0.0,
1704
- is_score_valid=True,
1705
- reason="Multiple tool call attempts" if multiple_attempts else "Single/no tool call attempt",
1706
- data={"tool_call_count": len(calls)},
1707
- ),
1708
- "reasoning_present": MetricResult(
1709
- score=1.0 if reasoning else 0.0,
1710
- is_score_valid=True,
1711
- reason="Reasoning present" if reasoning else "No reasoning provided",
1712
- data={"reasoning": reasoning[:160]},
1713
- ),
1714
- "finish_reason": MetricResult(
1715
- score=1.0 if finish_reason in {"tool_calls", "stop"} else 0.0,
1716
- is_score_valid=True,
1717
- reason="finish_reason acceptable"
1718
- if finish_reason in {"tool_calls", "stop"}
1719
- else f"Unexpected finish_reason: {finish_reason}",
1720
- ),
1721
- }
1722
-
1723
- finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
1724
- metrics, finish_reason, content_str, reasoning
1725
- )
1726
-
1727
- all_checks_passed = (
1728
- multiple_attempts and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage
1729
- )
1730
-
1731
- row.evaluation_result = EvaluateResult(
1732
- score=1.0 if all_checks_passed else 0.0,
1733
- is_score_valid=True,
1734
- reason="Multiple recovery attempts observed"
1735
- if all_checks_passed
1736
- else "Recovery attempts missing or response invalid",
1737
- metrics=metrics,
1738
- )
1739
- return row
1740
-
1741
-
1742
1668
  # ============================================================================
1743
1669
  # Reasoning Effort Tests
1744
1670
  # ============================================================================
@@ -1759,7 +1685,7 @@ REASONING_DISABLED_ROW.input_metadata.dataset_info = {
1759
1685
  input_rows=[[REASONING_DISABLED_ROW]],
1760
1686
  completion_params=[
1761
1687
  {
1762
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model
1688
+ "model": DEFAULT_MODEL_ID, # Reasoning-capable model
1763
1689
  "reasoning_effort": "none", # Explicitly disable reasoning
1764
1690
  "max_tokens": DEFAULT_MAX_TOKENS,
1765
1691
  "temperature": 0.0,
@@ -1869,7 +1795,7 @@ REASONING_ENABLED_ROW.input_metadata.dataset_info = {
1869
1795
  input_rows=[[REASONING_ENABLED_ROW]],
1870
1796
  completion_params=[
1871
1797
  {
1872
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model
1798
+ "model": DEFAULT_MODEL_ID, # Reasoning-capable model
1873
1799
  "reasoning_effort": "low", # Enable reasoning
1874
1800
  "max_tokens": DEFAULT_MAX_TOKENS,
1875
1801
  "temperature": 0.0,
@@ -2004,7 +1930,7 @@ TOOLS_WITH_REASONING_ROW.input_metadata.dataset_info = {
2004
1930
  input_rows=[[TOOLS_WITH_REASONING_ROW]],
2005
1931
  completion_params=[
2006
1932
  {
2007
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model
1933
+ "model": DEFAULT_MODEL_ID, # Reasoning-capable model
2008
1934
  "reasoning_effort": "low", # Enable reasoning
2009
1935
  "max_tokens": DEFAULT_MAX_TOKENS,
2010
1936
  "temperature": 0.0,
@@ -2727,7 +2653,7 @@ REASONING_DISABLED_NON_STREAM_ROW.input_metadata.dataset_info = {
2727
2653
  input_rows=[[REASONING_DISABLED_NON_STREAM_ROW]],
2728
2654
  completion_params=[
2729
2655
  {
2730
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
2656
+ "model": DEFAULT_MODEL_ID,
2731
2657
  "reasoning_effort": "none",
2732
2658
  "max_tokens": DEFAULT_MAX_TOKENS,
2733
2659
  "temperature": 0.0,
@@ -2834,7 +2760,7 @@ REASONING_ENABLED_NON_STREAM_ROW.input_metadata.dataset_info = {
2834
2760
  input_rows=[[REASONING_ENABLED_NON_STREAM_ROW]],
2835
2761
  completion_params=[
2836
2762
  {
2837
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
2763
+ "model": DEFAULT_MODEL_ID,
2838
2764
  "reasoning_effort": "low",
2839
2765
  "max_tokens": DEFAULT_MAX_TOKENS,
2840
2766
  "temperature": 0.0,
@@ -2962,7 +2888,7 @@ TOOLS_WITH_REASONING_NON_STREAM_ROW.input_metadata.dataset_info = {
2962
2888
  input_rows=[[TOOLS_WITH_REASONING_NON_STREAM_ROW]],
2963
2889
  completion_params=[
2964
2890
  {
2965
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
2891
+ "model": DEFAULT_MODEL_ID,
2966
2892
  "reasoning_effort": "low",
2967
2893
  "max_tokens": DEFAULT_MAX_TOKENS,
2968
2894
  "temperature": 0.0,
@@ -3108,7 +3034,7 @@ STRUCTURED_JSON_SCHEMA = {
3108
3034
  input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_ROW]],
3109
3035
  completion_params=[
3110
3036
  {
3111
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3037
+ "model": DEFAULT_MODEL_ID,
3112
3038
  "stream": True,
3113
3039
  "reasoning_effort": "low",
3114
3040
  "response_format": STRUCTURED_JSON_SCHEMA,
@@ -3211,7 +3137,7 @@ STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW.input_metadata.dataset_info = {
3211
3137
  input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW]],
3212
3138
  completion_params=[
3213
3139
  {
3214
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3140
+ "model": DEFAULT_MODEL_ID,
3215
3141
  "stream": False,
3216
3142
  "reasoning_effort": "low",
3217
3143
  "response_format": STRUCTURED_JSON_SCHEMA,
@@ -3334,7 +3260,7 @@ MULTIPLE_TOOLS_WITH_REASONING_ROW.input_metadata.dataset_info = {
3334
3260
  input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_ROW]],
3335
3261
  completion_params=[
3336
3262
  {
3337
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3263
+ "model": DEFAULT_MODEL_ID,
3338
3264
  "stream": True,
3339
3265
  "reasoning_effort": "low",
3340
3266
  "temperature": 0.0,
@@ -3461,7 +3387,7 @@ MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW.input_metadata.dataset_info = {
3461
3387
  input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW]],
3462
3388
  completion_params=[
3463
3389
  {
3464
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3390
+ "model": DEFAULT_MODEL_ID,
3465
3391
  "stream": False,
3466
3392
  "reasoning_effort": "low",
3467
3393
  "temperature": 0.0,
@@ -279,7 +279,13 @@ def _validate_evaluator_locally(
279
279
  docker_build_extra: str,
280
280
  docker_run_extra: str,
281
281
  ) -> bool:
282
- """Run pytest locally for the selected evaluation test to validate the evaluator."""
282
+ """Run pytest locally for the selected evaluation test to validate the evaluator.
283
+
284
+ The pytest helpers always enforce a small success threshold (0.001) for
285
+ evaluation_test-based suites so that an evaluation run where all scores are
286
+ 0.0 will naturally fail with a non-zero pytest exit code, which we then treat
287
+ as a failed validator.
288
+ """
283
289
  if not selected_test_file or not selected_test_func:
284
290
  # No local test associated; skip validation but warn the user.
285
291
  print("Warning: Could not resolve a local evaluation test for this evaluator; skipping local validation.")
@@ -702,7 +708,7 @@ def _create_rft_job(
702
708
  print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
703
709
  if getattr(args, "evaluation_dataset", None):
704
710
  body["evaluationDataset"] = args.evaluation_dataset
705
-
711
+
706
712
  output_model_arg = getattr(args, "output_model", None)
707
713
  if output_model_arg:
708
714
  if len(output_model_arg) > 63:
@@ -38,7 +38,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
38
38
  def _run_pytest_host(pytest_target: str) -> int:
39
39
  """Run pytest against a target on the host and return its exit code."""
40
40
  print(f"Running locally: pytest {pytest_target} -vs")
41
- proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"])
41
+ # Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
42
+ cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
43
+ proc = subprocess.run(cmd)
42
44
  return proc.returncode
43
45
 
44
46
 
@@ -69,6 +71,22 @@ def _run_pytest_in_docker(
69
71
  "-w",
70
72
  workdir,
71
73
  ]
74
+
75
+ # If EP_SUMMARY_JSON is set on the host, mirror it into the container so that
76
+ # pytest evaluation tests can write summary artifacts that are visible to the
77
+ # host. We map paths under the host logs directory (~/.eval_protocol) into the
78
+ # mounted container home directory.
79
+ host_summary_path = os.environ.get("EP_SUMMARY_JSON")
80
+ if host_summary_path:
81
+ try:
82
+ rel_path = os.path.relpath(host_summary_path, host_logs_dir)
83
+ # Only forward the variable when the summary path is inside the logs dir.
84
+ if not rel_path.startswith(os.pardir):
85
+ container_summary_path = os.path.join("/container_home/.eval_protocol", rel_path)
86
+ cmd += ["-e", f"EP_SUMMARY_JSON={container_summary_path}"]
87
+ except Exception:
88
+ # Best-effort only; do not fail docker execution if we can't map the path.
89
+ pass
72
90
  # Try to match host user to avoid permission problems on mounted volume
73
91
  try:
74
92
  uid = os.getuid() # type: ignore[attr-defined]
@@ -78,7 +96,12 @@ def _run_pytest_in_docker(
78
96
  pass
79
97
  if run_extras:
80
98
  cmd += run_extras
81
- cmd += [image_tag, "pytest", pytest_target, "-vs"]
99
+
100
+ # Build pytest command, always enforcing the same small success threshold as
101
+ # the host runner so that all-zero score runs fail consistently.
102
+ pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
103
+
104
+ cmd += [image_tag] + pytest_cmd
82
105
  print("Running in Docker:", " ".join(cmd))
83
106
  try:
84
107
  proc = subprocess.run(cmd)
@@ -0,0 +1,82 @@
1
+ import asyncio
2
+ import os
3
+ from collections import defaultdict
4
+ from typing import List, Dict
5
+
6
+ from eval_protocol.models import EvaluationRow
7
+
8
+ class MicroBatchDataBuffer:
9
+ """
10
+ Buffers evaluation results and writes them to disk in minibatches.
11
+ Waits for all runs of a sample to complete before considering it ready and flush to disk.
12
+ """
13
+ def __init__(self, num_runs: int, batch_size: int, output_path_template: str):
14
+ self.num_runs = num_runs
15
+ self.batch_size = batch_size
16
+ self.output_path_template = output_path_template
17
+ self.pending_samples: Dict[str, List[EvaluationRow]] = defaultdict(list) # row_id -> list[EvaluationRow]
18
+ self.completed_samples_buffer: List[List[EvaluationRow]] = [] # List[List[EvaluationRow]]
19
+ self.batch_index = 0
20
+ self.lock = asyncio.Lock()
21
+
22
+ async def add_result(self, row: EvaluationRow):
23
+ """
24
+ Add a single evaluation result.
25
+ Thread-safe/Coroutine-safe.
26
+ """
27
+ async with self.lock:
28
+ row_id = row.input_metadata.row_id
29
+ if not row_id:
30
+ # Should not happen in valid EP workflow, unique row_id is required to group things together properly
31
+ return
32
+
33
+ self.pending_samples[row_id].append(row)
34
+
35
+ if len(self.pending_samples[row_id]) >= self.num_runs:
36
+ # Sample completed (all runs finished)
37
+ completed_rows = self.pending_samples.pop(row_id)
38
+ self.completed_samples_buffer.append(completed_rows)
39
+
40
+ if len(self.completed_samples_buffer) >= self.batch_size:
41
+ await self._flush_unsafe()
42
+
43
+ async def _flush_unsafe(self):
44
+ """
45
+ not thread safe, assumes lock is held by called
46
+ """
47
+ if not self.completed_samples_buffer:
48
+ return
49
+
50
+ if "{index}" in self.output_path_template:
51
+ output_path = self.output_path_template.format(index=self.batch_index)
52
+ mode = "w"
53
+ else:
54
+ output_path = self.output_path_template
55
+ mode = "a" # Append if no index placeholder
56
+
57
+ # Ensure directory exists
58
+ os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
59
+
60
+ # Write flattened rows
61
+ with open(output_path, mode) as f:
62
+ for sample_rows in self.completed_samples_buffer:
63
+ for row in sample_rows:
64
+ f.write(row.model_dump_json() + "\n")
65
+
66
+ self.completed_samples_buffer = []
67
+ self.batch_index += 1
68
+
69
+ async def close(self):
70
+ """
71
+ Flush any remaining samples in the buffer.
72
+ """
73
+ async with self.lock:
74
+ # Also flush pending (incomplete) samples to avoid data loss
75
+ if self.pending_samples:
76
+ for rows in self.pending_samples.values():
77
+ self.completed_samples_buffer.append(rows)
78
+ self.pending_samples.clear()
79
+
80
+ if self.completed_samples_buffer:
81
+ await self._flush_unsafe()
82
+