eval-protocol 0.2.30__tar.gz → 0.2.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. {eval_protocol-0.2.30/eval_protocol.egg-info → eval_protocol-0.2.32}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/evaluation_test.py +8 -0
  4. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/parameterize.py +87 -0
  5. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/remote_rollout_processor.py +39 -5
  6. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/types/remote_rollout_processor.py +8 -2
  7. {eval_protocol-0.2.30 → eval_protocol-0.2.32/eval_protocol.egg-info}/PKG-INFO +1 -1
  8. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/LICENSE +0 -0
  9. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/README.md +0 -0
  10. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/development/__init__.py +0 -0
  11. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/development/normalize_sandbox_fusion.py +0 -0
  12. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/development/utils/__init__.py +0 -0
  13. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/development/utils/generate_api_key.py +0 -0
  14. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/development/utils/subprocess_manager.py +0 -0
  15. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/__init__.py +0 -0
  16. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/__main__.py +0 -0
  17. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/__init__.py +0 -0
  18. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/base.py +0 -0
  19. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/bigquery.py +0 -0
  20. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/braintrust.py +0 -0
  21. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/huggingface.py +0 -0
  22. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/langchain.py +0 -0
  23. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/langfuse.py +0 -0
  24. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/langsmith.py +0 -0
  25. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/openai_responses.py +0 -0
  26. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/trl.py +0 -0
  27. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/adapters/utils.py +0 -0
  28. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/__init__.py +0 -0
  29. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/models.py +0 -0
  30. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/orchestrator.py +0 -0
  31. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resource_abc.py +0 -0
  32. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resource_pool.py +0 -0
  33. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/__init__.py +0 -0
  34. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  35. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  36. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  37. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  38. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  39. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/docker_resource.py +0 -0
  40. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  41. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  42. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/resources/sql_resource.py +0 -0
  43. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/task_manager.py +0 -0
  44. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/agent/tool_registry.py +0 -0
  45. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/auth.py +0 -0
  46. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/__init__.py +0 -0
  47. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  48. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  49. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/test_aime25.py +0 -0
  50. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  51. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  52. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  53. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  54. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli.py +0 -0
  55. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/__init__.py +0 -0
  56. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  57. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/common.py +0 -0
  58. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/deploy.py +0 -0
  59. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  60. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/logs.py +0 -0
  61. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/preview.py +0 -0
  62. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  63. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/common_utils.py +0 -0
  64. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/config.py +0 -0
  65. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/data_loader/__init__.py +0 -0
  66. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  67. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  68. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  69. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/data_loader/models.py +0 -0
  70. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/dataset_logger/__init__.py +0 -0
  71. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  72. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  73. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  74. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  75. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/datasets/__init__.py +0 -0
  76. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/datasets/loader.py +0 -0
  77. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/directory_utils.py +0 -0
  78. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/evaluation.py +0 -0
  79. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/event_bus/__init__.py +0 -0
  80. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/event_bus/event_bus.py +0 -0
  81. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/event_bus/logger.py +0 -0
  82. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  83. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  84. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/execution/__init__.py +0 -0
  85. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/execution/pipeline.py +0 -0
  86. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/gcp_tools.py +0 -0
  87. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/generation/cache.py +0 -0
  88. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/generation/clients/base.py +0 -0
  89. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/generation/clients.py +0 -0
  90. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/generic_server.py +0 -0
  91. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/get_pep440_version.py +0 -0
  92. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/human_id/__init__.py +0 -0
  93. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/human_id/dictionary.py +0 -0
  94. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/integrations/__init__.py +0 -0
  95. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/integrations/deepeval.py +0 -0
  96. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/integrations/openeval.py +0 -0
  97. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/integrations/trl.py +0 -0
  98. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/logging_utils.py +0 -0
  99. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/__init__.py +0 -0
  100. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/adapter.py +0 -0
  101. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/client/__init__.py +0 -0
  102. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/client/connection.py +0 -0
  103. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/clients.py +0 -0
  104. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/execution/__init__.py +0 -0
  105. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/execution/base_policy.py +0 -0
  106. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/execution/manager.py +0 -0
  107. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/execution/policy.py +0 -0
  108. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/grid_renderer.py +0 -0
  109. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  110. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/mcpgym.py +0 -0
  111. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/process_manager.py +0 -0
  112. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/session/__init__.py +0 -0
  113. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/session/manager.py +0 -0
  114. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/simple_process_manager.py +0 -0
  115. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp/simulation_server.py +0 -0
  116. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_agent/__init__.py +0 -0
  117. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_agent/config.py +0 -0
  118. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_agent/main.py +0 -0
  119. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  120. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  121. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  122. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  123. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_env.py +0 -0
  124. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/__init__.py +0 -0
  125. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  126. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  127. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  128. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  129. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  130. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  131. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  132. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  133. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  134. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  135. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  136. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/models.py +0 -0
  137. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/packaging.py +0 -0
  138. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/platform_api.py +0 -0
  139. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/playback_policy.py +0 -0
  140. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/__init__.py +0 -0
  141. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  142. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  143. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  144. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  145. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  146. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  147. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  148. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  149. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  150. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/exception_config.py +0 -0
  151. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/execution.py +0 -0
  152. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  153. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  154. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/plugin.py +0 -0
  155. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/rollout_processor.py +0 -0
  156. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/store_experiment_link.py +0 -0
  157. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/store_results_url.py +0 -0
  158. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/types.py +0 -0
  159. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/utils.py +0 -0
  160. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/pytest/validate_signature.py +0 -0
  161. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/quickstart/__init__.py +0 -0
  162. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/quickstart/llm_judge.py +0 -0
  163. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  164. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
  165. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
  166. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
  167. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/quickstart/utils.py +0 -0
  168. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/resources.py +0 -0
  169. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/reward_function.py +0 -0
  170. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/__init__.py +0 -0
  171. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/accuracy.py +0 -0
  172. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/accuracy_length.py +0 -0
  173. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  174. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  175. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/apps_testing_util.py +0 -0
  176. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/bfcl_reward.py +0 -0
  177. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/code_execution.py +0 -0
  178. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/code_execution_utils.py +0 -0
  179. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/cpp_code.py +0 -0
  180. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  181. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/format.py +0 -0
  182. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/function_calling.py +0 -0
  183. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/json_schema.py +0 -0
  184. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/language_consistency.py +0 -0
  185. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/lean_prover.py +0 -0
  186. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/length.py +0 -0
  187. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  188. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/math.py +0 -0
  189. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  190. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/reasoning_steps.py +0 -0
  191. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/repetition.py +0 -0
  192. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rewards/tag_count.py +0 -0
  193. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/rl_processing.py +0 -0
  194. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/server.py +0 -0
  195. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/stats/__init__.py +0 -0
  196. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/stats/confidence_intervals.py +0 -0
  197. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/typed_interface.py +0 -0
  198. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/types/__init__.py +0 -0
  199. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/types/errors.py +0 -0
  200. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/types/types.py +0 -0
  201. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/__init__.py +0 -0
  202. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/batch_evaluation.py +0 -0
  203. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/batch_transformation.py +0 -0
  204. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/check_server_status.py +0 -0
  205. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/dataset_helpers.py +0 -0
  206. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/logs_server.py +0 -0
  207. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/module_loader.py +0 -0
  208. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/packaging_utils.py +0 -0
  209. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/show_results_url.py +0 -0
  210. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/static_policy.py +0 -0
  211. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol/utils/vite_server.py +0 -0
  212. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol.egg-info/SOURCES.txt +0 -0
  213. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol.egg-info/dependency_links.txt +0 -0
  214. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol.egg-info/entry_points.txt +0 -0
  215. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol.egg-info/requires.txt +0 -0
  216. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/eval_protocol.egg-info/top_level.txt +0 -0
  217. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/pyproject.toml +0 -0
  218. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/setup.cfg +0 -0
  219. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/setup.py +0 -0
  220. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_accuracy.py +0 -0
  221. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_accuracy_length.py +0 -0
  222. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_adapters_e2e.py +0 -0
  223. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_agent_orchestrator.py +0 -0
  224. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_agent_resources.py +0 -0
  225. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_auth.py +0 -0
  226. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_batch_evaluation.py +0 -0
  227. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_cli.py +0 -0
  228. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_cli_agent.py +0 -0
  229. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_cli_args.py +0 -0
  230. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_code_execution.py +0 -0
  231. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_config.py +0 -0
  232. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_control_plane_separation.py +0 -0
  233. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_cpp_code.py +0 -0
  234. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_data_driven_task_manager.py +0 -0
  235. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_deepcoder_reward.py +0 -0
  236. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_deepeval_integration.py +0 -0
  237. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_deploy_integration.py +0 -0
  238. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_e2b_integration.py +0 -0
  239. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_e2b_js_integration.py +0 -0
  240. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_edge_cases.py +0 -0
  241. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_eval_protocol_import.py +0 -0
  242. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_evaluation.py +0 -0
  243. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_evaluation_integration.py +0 -0
  244. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_evaluation_postprocess.py +0 -0
  245. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_evaluation_preview_integration.py +0 -0
  246. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_event_bus.py +0 -0
  247. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_examples_end_to_end.py +0 -0
  248. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_fireworks_api.py +0 -0
  249. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_format.py +0 -0
  250. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_fractional_code.py +0 -0
  251. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_function_calling.py +0 -0
  252. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_gcp_tools.py +0 -0
  253. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_generic_server.py +0 -0
  254. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_human_id.py +0 -0
  255. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_integration.py +0 -0
  256. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_json_schema.py +0 -0
  257. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_kwargs_validation.py +0 -0
  258. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_language_consistency.py +0 -0
  259. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_lean_prover.py +0 -0
  260. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_lean_prover_runner.py +0 -0
  261. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_length.py +0 -0
  262. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_list_comparison_math_reward.py +0 -0
  263. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_logs_server.py +0 -0
  264. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_logs_server_simple.py +0 -0
  265. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_math.py +0 -0
  266. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_minimal.py +0 -0
  267. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_models.py +0 -0
  268. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_models_rl.py +0 -0
  269. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_multiple_choice_math_reward.py +0 -0
  270. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_n_variant_batch_integration.py +0 -0
  271. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_n_variant_integration.py +0 -0
  272. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_openai_compatibility.py +0 -0
  273. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_openeval_integration.py +0 -0
  274. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_packaging.py +0 -0
  275. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_parallel_rollouts.py +0 -0
  276. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_platform_api.py +0 -0
  277. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_quickstart_utils.py +0 -0
  278. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_readiness.py +0 -0
  279. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_reasoning_steps.py +0 -0
  280. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_repetition.py +0 -0
  281. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_repetition_debug.py +0 -0
  282. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_retry_mechanism.py +0 -0
  283. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_reward_function.py +0 -0
  284. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_reward_protocol_import.py +0 -0
  285. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_rl_processing.py +0 -0
  286. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_rollout_control_plane_integration.py +0 -0
  287. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_server.py +0 -0
  288. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_show_results_url.py +0 -0
  289. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_status_migration_changes.py +0 -0
  290. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_status_migration_integration.py +0 -0
  291. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_status_model.py +0 -0
  292. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_tag_count.py +0 -0
  293. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_tau_bench_airline_smoke.py +0 -0
  294. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_typed_interface.py +0 -0
  295. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_typed_interface_rl.py +0 -0
  296. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_url_handling.py +0 -0
  297. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/tests/test_vite_server.py +0 -0
  298. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/__init__.py +0 -0
  299. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/agent/__init__.py +0 -0
  300. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/agent/base.py +0 -0
  301. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/agent/llm_agent.py +0 -0
  302. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/api_service/__init__.py +0 -0
  303. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/api_service/api_config.py +0 -0
  304. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/api_service/data_model.py +0 -0
  305. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/api_service/simulation_service.py +0 -0
  306. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/cli.py +0 -0
  307. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/config.py +0 -0
  308. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/airline/policy.md +0 -0
  309. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/mock/policy.md +0 -0
  310. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  311. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/retail/policy.md +0 -0
  312. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  313. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  314. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  315. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  316. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  317. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  318. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  319. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data_model/__init__.py +0 -0
  320. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data_model/message.py +0 -0
  321. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data_model/simulation.py +0 -0
  322. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/data_model/tasks.py +0 -0
  323. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/__init__.py +0 -0
  324. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/airline/__init__.py +0 -0
  325. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/airline/data_model.py +0 -0
  326. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/airline/environment.py +0 -0
  327. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/airline/tools.py +0 -0
  328. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/airline/utils.py +0 -0
  329. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/mock/__init__.py +0 -0
  330. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/mock/data_model.py +0 -0
  331. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/mock/environment.py +0 -0
  332. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/mock/tools.py +0 -0
  333. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/mock/utils.py +0 -0
  334. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/retail/__init__.py +0 -0
  335. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/retail/data_model.py +0 -0
  336. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/retail/environment.py +0 -0
  337. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/retail/tools.py +0 -0
  338. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/retail/utils.py +0 -0
  339. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/__init__.py +0 -0
  340. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/data_model.py +0 -0
  341. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/environment.py +0 -0
  342. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  343. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  344. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  345. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  346. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  347. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  348. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  349. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  350. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/tools.py +0 -0
  351. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  352. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  353. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/domains/telecom/utils.py +0 -0
  354. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/environment/__init__.py +0 -0
  355. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/environment/db.py +0 -0
  356. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/environment/environment.py +0 -0
  357. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/environment/server.py +0 -0
  358. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/environment/tool.py +0 -0
  359. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/environment/toolkit.py +0 -0
  360. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  361. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/evaluator/__init__.py +0 -0
  362. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/evaluator/evaluator.py +0 -0
  363. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  364. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  365. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  366. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  367. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  368. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/metrics/__init__.py +0 -0
  369. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/metrics/agent_metrics.py +0 -0
  370. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  371. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/orchestrator/__init__.py +0 -0
  372. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  373. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  374. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/orchestrator/utils.py +0 -0
  375. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/registry.py +0 -0
  376. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/run.py +0 -0
  377. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/scripts/__init__.py +0 -0
  378. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/scripts/check_data.py +0 -0
  379. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  380. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/scripts/start_servers.py +0 -0
  381. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/scripts/view_simulations.py +0 -0
  382. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/user/__init__.py +0 -0
  383. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/user/base.py +0 -0
  384. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/user/user_simulator.py +0 -0
  385. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/utils/__init__.py +0 -0
  386. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/utils/display.py +0 -0
  387. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/utils/io_utils.py +0 -0
  388. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/utils/llm_utils.py +0 -0
  389. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/utils/pydantic_utils.py +0 -0
  390. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vendor/tau2/utils/utils.py +0 -0
  391. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/versioneer.py +0 -0
  392. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  393. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vite-app/dist/assets/index-C8woq7EO.js +0 -0
  394. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vite-app/dist/assets/index-C8woq7EO.js.map +0 -0
  395. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vite-app/dist/assets/index-CSKGq1w7.css +0 -0
  396. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  397. {eval_protocol-0.2.30 → eval_protocol-0.2.32}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.30
3
+ Version: 0.2.32
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-09-26T14:10:14-0700",
11
+ "date": "2025-09-29T16:40:31-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "626a125899fb42ed135a1f223b3e827f37e44ae0",
15
- "version": "0.2.30"
14
+ "full-revisionid": "1744b558ceb39f3a910a898ed6dd3df6a4576691",
15
+ "version": "0.2.32"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -79,6 +79,7 @@ def evaluation_test(
79
79
  aggregation_method: AggregationMethod = "mean",
80
80
  passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None,
81
81
  num_runs: int = 1,
82
+ filtered_row_ids: Sequence[str] | None = None,
82
83
  max_dataset_rows: int | None = None,
83
84
  mcp_config_path: str | None = None,
84
85
  max_concurrent_rollouts: int = 8,
@@ -146,6 +147,7 @@ def evaluation_test(
146
147
  Success rate must be above success, and if set, standard error must be below standard_error.
147
148
  Success rate +/- one standard_error is equivalent to 68% confidence interval.
148
149
  num_runs: Number of times to repeat the rollout and evaluations.
150
+ filtered_row_ids: List of row_ids to filter for the evaluation. If provided, only the rows with the given row_ids will be evaluated.
149
151
  max_dataset_rows: Limit dataset to the first N rows.
150
152
  mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
151
153
  max_concurrent_rollouts: Maximum number of concurrent rollouts to run in parallel.
@@ -262,6 +264,9 @@ def evaluation_test(
262
264
  results = data_loader.load()
263
265
  for result in results:
264
266
  data.extend(result.rows)
267
+ # Apply max_dataset_rows limit to data from data loaders
268
+ if max_dataset_rows is not None:
269
+ data = data[:max_dataset_rows]
265
270
  elif "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
266
271
  ds_arg: list[str] = kwargs["dataset_path"]
267
272
  # Support either a single path or a list of paths; if a list is provided,
@@ -283,6 +288,9 @@ def evaluation_test(
283
288
  else:
284
289
  raise ValueError("No input dataset, input messages, or input rows provided")
285
290
 
291
+ if filtered_row_ids is not None:
292
+ data = [row for row in data if row.input_metadata.row_id in filtered_row_ids]
293
+
286
294
  """
287
295
  data_loaders handles preprocess_fn internally so we want
288
296
  to specially handle data_loaders here so we don't double
@@ -73,6 +73,9 @@ def _is_pytest_parametrize_with_completion_params(decorator: ast.expr) -> bool:
73
73
  and decorator.func.value.attr == "mark"
74
74
  and decorator.func.attr == "parametrize"
75
75
  ):
76
+ # Validate argvalues if present
77
+ _validate_parametrize_argvalues(decorator)
78
+
76
79
  # Check positional arguments first (argnames is typically the first positional arg)
77
80
  if len(decorator.args) > 0:
78
81
  argnames_arg = decorator.args[0]
@@ -88,6 +91,90 @@ def _is_pytest_parametrize_with_completion_params(decorator: ast.expr) -> bool:
88
91
  return False
89
92
 
90
93
 
94
+ def _ast_dict_to_string(dict_node: ast.Dict) -> str:
95
+ """
96
+ Convert an AST Dict node to its string representation.
97
+
98
+ Args:
99
+ dict_node: AST node representing a dictionary
100
+
101
+ Returns:
102
+ String representation of the dictionary
103
+ """
104
+ if not dict_node.keys:
105
+ return "{}"
106
+
107
+ pairs = []
108
+ for key, value in zip(dict_node.keys, dict_node.values):
109
+ if key is not None:
110
+ key_str = _ast_node_to_string(key)
111
+ value_str = _ast_node_to_string(value)
112
+ pairs.append(f"{key_str}: {value_str}")
113
+
114
+ return "{" + ", ".join(pairs) + "}"
115
+
116
+
117
+ def _ast_node_to_string(node: ast.expr) -> str:
118
+ """
119
+ Convert an AST node to its string representation.
120
+
121
+ Args:
122
+ node: AST node to convert
123
+
124
+ Returns:
125
+ String representation of the node
126
+ """
127
+ if isinstance(node, ast.Constant):
128
+ if isinstance(node.value, str):
129
+ return repr(node.value)
130
+ else:
131
+ return str(node.value)
132
+ elif isinstance(node, ast.Name):
133
+ return node.id
134
+ elif isinstance(node, ast.Dict):
135
+ return _ast_dict_to_string(node)
136
+ elif isinstance(node, ast.List):
137
+ elements = [_ast_node_to_string(elt) for elt in node.elts]
138
+ return "[" + ", ".join(elements) + "]"
139
+ elif isinstance(node, ast.Tuple):
140
+ elements = [_ast_node_to_string(elt) for elt in node.elts]
141
+ return "(" + ", ".join(elements) + ")"
142
+ else:
143
+ # For complex expressions, return a simplified representation
144
+ return "<complex expression>"
145
+
146
+
147
+ def _validate_parametrize_argvalues(decorator: ast.Call) -> None:
148
+ """
149
+ Validate that pytest.mark.parametrize argvalues is a list/tuple, not a dict.
150
+
151
+ Args:
152
+ decorator: AST node representing the pytest.mark.parametrize decorator call
153
+
154
+ Raises:
155
+ ValueError: If argvalues is a dict instead of a list/tuple
156
+ """
157
+ # Check positional arguments (argvalues is typically the second positional arg)
158
+ if len(decorator.args) > 1:
159
+ argvalues_arg = decorator.args[1]
160
+ if isinstance(argvalues_arg, ast.Dict):
161
+ dict_repr = _ast_dict_to_string(argvalues_arg)
162
+ raise ValueError(
163
+ f"For evaluation_test with completion_params, pytest.mark.parametrize argvalues must be a list or tuple, not a dict. "
164
+ f"Use [{dict_repr}] instead of {dict_repr}."
165
+ )
166
+
167
+ # Check keyword arguments for argvalues
168
+ for keyword in decorator.keywords:
169
+ if keyword.arg == "argvalues":
170
+ if isinstance(keyword.value, ast.Dict):
171
+ dict_repr = _ast_dict_to_string(keyword.value)
172
+ raise ValueError(
173
+ f"For evaluation_test with completion_params, pytest.mark.parametrize argvalues must be a list or tuple, not a dict. "
174
+ f"Use [{dict_repr}] instead of {dict_repr}."
175
+ )
176
+
177
+
91
178
  def _check_argnames_for_completion_params(argnames_node: ast.expr) -> bool:
92
179
  """
93
180
  Check if an argnames AST node contains "completion_params".
@@ -22,6 +22,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
22
22
  self,
23
23
  *,
24
24
  remote_base_url: Optional[str] = None,
25
+ model_base_url: Optional[str] = None,
25
26
  poll_interval: float = 1.0,
26
27
  timeout_seconds: float = 120.0,
27
28
  output_data_loader: Callable[[str], DynamicDataLoader],
@@ -29,6 +30,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
29
30
  # Prefer constructor-provided configuration. These can be overridden via
30
31
  # config.kwargs at call time for backward compatibility.
31
32
  self._remote_base_url = remote_base_url
33
+ self._model_base_url = model_base_url
32
34
  self._poll_interval = poll_interval
33
35
  self._timeout_seconds = timeout_seconds
34
36
  self._output_data_loader = output_data_loader
@@ -38,6 +40,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
38
40
 
39
41
  # Start with constructor values
40
42
  remote_base_url: Optional[str] = self._remote_base_url
43
+ model_base_url: Optional[str] = self._model_base_url
41
44
  poll_interval: float = self._poll_interval
42
45
  timeout_seconds: float = self._timeout_seconds
43
46
 
@@ -112,14 +115,25 @@ class RemoteRolloutProcessor(RolloutProcessor):
112
115
  messages=clean_messages,
113
116
  tools=row.tools,
114
117
  metadata=meta,
115
- model_base_url=config.kwargs.get("model_base_url", None),
118
+ model_base_url=model_base_url,
116
119
  )
117
120
 
118
121
  # Fire-and-poll
119
122
  def _post_init() -> None:
120
123
  url = f"{remote_base_url}/init"
121
- r = requests.post(url, json=init_payload.model_dump(), timeout=30)
122
- r.raise_for_status()
124
+ try:
125
+ r = requests.post(url, json=init_payload.model_dump(), timeout=30)
126
+ r.raise_for_status()
127
+ except requests.exceptions.Timeout:
128
+ raise TimeoutError(
129
+ "The /init endpoint timed out after 30 seconds. "
130
+ "CRITICAL: The /init endpoint must return immediately (within 30s) and NOT block on rollout execution. "
131
+ "Your remote server should:\n"
132
+ "1. Accept the /init request and return a 200 response immediately\n"
133
+ "2. Process the actual rollout asynchronously in the background\n"
134
+ "3. Use the /status endpoint to report progress\n"
135
+ "For Python/Node.js: Start a separate process per rollout to avoid blocking the /init response."
136
+ )
123
137
 
124
138
  await asyncio.to_thread(_post_init)
125
139
 
@@ -141,7 +155,13 @@ class RemoteRolloutProcessor(RolloutProcessor):
141
155
  except Exception:
142
156
  # transient errors; continue polling
143
157
  pass
158
+
144
159
  await asyncio.sleep(poll_interval)
160
+ else:
161
+ # Loop completed without breaking, which means we timed out
162
+ row.rollout_status = Status.rollout_error(
163
+ f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds"
164
+ )
145
165
 
146
166
  # Update duration, regardless of termination
147
167
  row.execution_metadata.duration_seconds = time.perf_counter() - start_time
@@ -164,14 +184,28 @@ class RemoteRolloutProcessor(RolloutProcessor):
164
184
  elif len(output_rows) == 1: # Return the Langfuse row
165
185
  langfuse_row = output_rows[0]
166
186
  langfuse_row.input_metadata.completion_params = row.input_metadata.completion_params
187
+ # merge dataset_info dicts on input_metadata
188
+ if langfuse_row.input_metadata.dataset_info and row.input_metadata.dataset_info:
189
+ langfuse_row.input_metadata.dataset_info = {
190
+ **row.input_metadata.dataset_info,
191
+ **langfuse_row.input_metadata.dataset_info,
192
+ }
193
+ elif row.input_metadata.dataset_info:
194
+ langfuse_row.input_metadata.dataset_info = row.input_metadata.dataset_info
167
195
  langfuse_row.eval_metadata = row.eval_metadata
196
+ langfuse_row.ground_truth = row.ground_truth
168
197
  return langfuse_row
169
198
  else:
170
199
  raise ValueError("RemoteRolloutProcessor's output_data_loader should return exactly one row.")
171
200
 
172
- for r in rows:
173
- tasks.append(asyncio.create_task(_process_row(r)))
201
+ semaphore = config.semaphore
202
+
203
+ async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
204
+ async with semaphore:
205
+ result = await _process_row(r)
206
+ return result
174
207
 
208
+ tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
175
209
  return tasks
176
210
 
177
211
  def cleanup(self) -> None:
@@ -4,7 +4,7 @@ Request and response models for remote rollout processor servers.
4
4
 
5
5
  from typing import Any, Dict, List, Optional
6
6
  from pydantic import BaseModel, Field
7
- from eval_protocol.models import Message
7
+ from eval_protocol.models import Message, Status
8
8
 
9
9
 
10
10
  class RolloutMetadata(BaseModel):
@@ -21,7 +21,7 @@ class InitRequest(BaseModel):
21
21
  """Request model for POST /init endpoint."""
22
22
 
23
23
  model: str
24
- messages: List[Message] = Field(min_length=1)
24
+ messages: Optional[List[Message]] = None
25
25
  tools: Optional[List[Dict[str, Any]]] = None
26
26
 
27
27
  model_base_url: Optional[str] = None
@@ -40,6 +40,12 @@ class StatusResponse(BaseModel):
40
40
  terminated: bool
41
41
  info: Optional[Dict[str, Any]] = None
42
42
 
43
+ status: Optional[Status] = None
44
+ """
45
+ Optional status indicator for the rollout to be used by eval-protocol. This
46
+ is useful to distinguish between successful and failed rollouts.
47
+ """
48
+
43
49
 
44
50
  def create_langfuse_config_tags(init_request: InitRequest) -> List[str]:
45
51
  """Create Langfuse tags from InitRequest metadata."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.30
3
+ Version: 0.2.32
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
File without changes
File without changes