eval-protocol 0.2.26__tar.gz → 0.2.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (399)
  1. {eval_protocol-0.2.26/eval_protocol.egg-info → eval_protocol-0.2.27}/PKG-INFO +70 -89
  2. eval_protocol-0.2.27/README.md +115 -0
  3. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/__init__.py +3 -0
  4. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/_version.py +3 -3
  5. eval_protocol-0.2.27/eval_protocol/data_loader/__init__.py +4 -0
  6. eval_protocol-0.2.27/eval_protocol/data_loader/dynamic_data_loader.py +38 -0
  7. eval_protocol-0.2.27/eval_protocol/data_loader/factory_data_loader.py +38 -0
  8. eval_protocol-0.2.27/eval_protocol/data_loader/inline_data_loader.py +68 -0
  9. eval_protocol-0.2.27/eval_protocol/data_loader/models.py +128 -0
  10. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/evaluation_test.py +29 -6
  11. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/evaluation_test_postprocess.py +12 -3
  12. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/generate_parameter_combinations.py +25 -6
  13. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/parameterize.py +14 -2
  14. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/plugin.py +4 -4
  15. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/store_results_url.py +9 -6
  16. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/quickstart/llm_judge_braintrust.py +15 -14
  17. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/quickstart/llm_judge_langfuse.py +14 -10
  18. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/quickstart/llm_judge_langsmith.py +7 -11
  19. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/quickstart/llm_judge_openai_responses.py +15 -11
  20. {eval_protocol-0.2.26 → eval_protocol-0.2.27/eval_protocol.egg-info}/PKG-INFO +70 -89
  21. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol.egg-info/SOURCES.txt +5 -0
  22. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_batch_evaluation.py +0 -2
  23. eval_protocol-0.2.27/tests/test_evaluation_postprocess.py +441 -0
  24. eval_protocol-0.2.27/tests/test_fireworks_api.py +68 -0
  25. eval_protocol-0.2.26/README.md +0 -134
  26. eval_protocol-0.2.26/tests/test_evaluation_postprocess.py +0 -207
  27. eval_protocol-0.2.26/tests/test_fireworks_api.py +0 -66
  28. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/LICENSE +0 -0
  29. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/development/__init__.py +0 -0
  30. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/development/normalize_sandbox_fusion.py +0 -0
  31. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/development/utils/__init__.py +0 -0
  32. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/development/utils/generate_api_key.py +0 -0
  33. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/development/utils/subprocess_manager.py +0 -0
  34. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/__main__.py +0 -0
  35. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/__init__.py +0 -0
  36. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/base.py +0 -0
  37. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/bigquery.py +0 -0
  38. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/braintrust.py +0 -0
  39. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/huggingface.py +0 -0
  40. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/langchain.py +0 -0
  41. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/langfuse.py +0 -0
  42. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/langsmith.py +0 -0
  43. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/openai_responses.py +0 -0
  44. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/trl.py +0 -0
  45. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/adapters/utils.py +0 -0
  46. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/__init__.py +0 -0
  47. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/models.py +0 -0
  48. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/orchestrator.py +0 -0
  49. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resource_abc.py +0 -0
  50. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resource_pool.py +0 -0
  51. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/__init__.py +0 -0
  52. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  53. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  54. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  55. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  56. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  57. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/docker_resource.py +0 -0
  58. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  59. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  60. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/resources/sql_resource.py +0 -0
  61. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/task_manager.py +0 -0
  62. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/agent/tool_registry.py +0 -0
  63. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/auth.py +0 -0
  64. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/__init__.py +0 -0
  65. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  66. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  67. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/test_aime25.py +0 -0
  68. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  69. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  70. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  71. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  72. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli.py +0 -0
  73. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/__init__.py +0 -0
  74. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  75. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/common.py +0 -0
  76. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/deploy.py +0 -0
  77. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  78. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/logs.py +0 -0
  79. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/preview.py +0 -0
  80. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  81. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/common_utils.py +0 -0
  82. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/config.py +0 -0
  83. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/dataset_logger/__init__.py +0 -0
  84. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  85. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  86. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  87. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  88. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/datasets/__init__.py +0 -0
  89. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/datasets/loader.py +0 -0
  90. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/directory_utils.py +0 -0
  91. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/evaluation.py +0 -0
  92. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/event_bus/__init__.py +0 -0
  93. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/event_bus/event_bus.py +0 -0
  94. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/event_bus/logger.py +0 -0
  95. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  96. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  97. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/execution/__init__.py +0 -0
  98. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/execution/pipeline.py +0 -0
  99. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/gcp_tools.py +0 -0
  100. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/generation/cache.py +0 -0
  101. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/generation/clients/base.py +0 -0
  102. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/generation/clients.py +0 -0
  103. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/generic_server.py +0 -0
  104. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/get_pep440_version.py +0 -0
  105. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/human_id/__init__.py +0 -0
  106. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/human_id/dictionary.py +0 -0
  107. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/integrations/__init__.py +0 -0
  108. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/integrations/deepeval.py +0 -0
  109. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/integrations/openeval.py +0 -0
  110. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/integrations/trl.py +0 -0
  111. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/logging_utils.py +0 -0
  112. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/__init__.py +0 -0
  113. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/adapter.py +0 -0
  114. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/client/__init__.py +0 -0
  115. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/client/connection.py +0 -0
  116. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/clients.py +0 -0
  117. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/execution/__init__.py +0 -0
  118. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/execution/base_policy.py +0 -0
  119. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/execution/manager.py +0 -0
  120. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/execution/policy.py +0 -0
  121. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/grid_renderer.py +0 -0
  122. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  123. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/mcpgym.py +0 -0
  124. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/process_manager.py +0 -0
  125. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/session/__init__.py +0 -0
  126. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/session/manager.py +0 -0
  127. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/simple_process_manager.py +0 -0
  128. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp/simulation_server.py +0 -0
  129. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_agent/__init__.py +0 -0
  130. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_agent/config.py +0 -0
  131. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_agent/main.py +0 -0
  132. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  133. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  134. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  135. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  136. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_env.py +0 -0
  137. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/__init__.py +0 -0
  138. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  139. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  140. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  141. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  142. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  143. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  144. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  145. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  146. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  147. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  148. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  149. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/models.py +0 -0
  150. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/packaging.py +0 -0
  151. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/platform_api.py +0 -0
  152. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/playback_policy.py +0 -0
  153. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/__init__.py +0 -0
  154. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  155. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  156. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  157. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  158. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  159. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  160. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  161. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  162. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/exception_config.py +0 -0
  163. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/execution.py +0 -0
  164. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  165. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  166. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/rollout_processor.py +0 -0
  167. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/store_experiment_link.py +0 -0
  168. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/types.py +0 -0
  169. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/utils.py +0 -0
  170. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/pytest/validate_signature.py +0 -0
  171. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/quickstart/__init__.py +0 -0
  172. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/quickstart/llm_judge.py +0 -0
  173. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/quickstart/utils.py +0 -0
  174. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/resources.py +0 -0
  175. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/reward_function.py +0 -0
  176. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/__init__.py +0 -0
  177. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/accuracy.py +0 -0
  178. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/accuracy_length.py +0 -0
  179. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  180. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  181. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/apps_testing_util.py +0 -0
  182. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/bfcl_reward.py +0 -0
  183. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/code_execution.py +0 -0
  184. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/code_execution_utils.py +0 -0
  185. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/cpp_code.py +0 -0
  186. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  187. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/format.py +0 -0
  188. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/function_calling.py +0 -0
  189. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/json_schema.py +0 -0
  190. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/language_consistency.py +0 -0
  191. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/lean_prover.py +0 -0
  192. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/length.py +0 -0
  193. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  194. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/math.py +0 -0
  195. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  196. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/reasoning_steps.py +0 -0
  197. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/repetition.py +0 -0
  198. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rewards/tag_count.py +0 -0
  199. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/rl_processing.py +0 -0
  200. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/server.py +0 -0
  201. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/stats/__init__.py +0 -0
  202. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/stats/confidence_intervals.py +0 -0
  203. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/typed_interface.py +0 -0
  204. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/types/__init__.py +0 -0
  205. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/types/errors.py +0 -0
  206. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/types/types.py +0 -0
  207. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/__init__.py +0 -0
  208. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/batch_evaluation.py +0 -0
  209. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/batch_transformation.py +0 -0
  210. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/check_server_status.py +0 -0
  211. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/dataset_helpers.py +0 -0
  212. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/logs_server.py +0 -0
  213. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/module_loader.py +0 -0
  214. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/packaging_utils.py +0 -0
  215. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/show_results_url.py +0 -0
  216. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/static_policy.py +0 -0
  217. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol/utils/vite_server.py +0 -0
  218. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol.egg-info/dependency_links.txt +0 -0
  219. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol.egg-info/entry_points.txt +0 -0
  220. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol.egg-info/requires.txt +0 -0
  221. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/eval_protocol.egg-info/top_level.txt +0 -0
  222. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/pyproject.toml +0 -0
  223. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/setup.cfg +0 -0
  224. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/setup.py +0 -0
  225. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_accuracy.py +0 -0
  226. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_accuracy_length.py +0 -0
  227. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_adapters_e2e.py +0 -0
  228. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_agent_orchestrator.py +0 -0
  229. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_agent_resources.py +0 -0
  230. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_auth.py +0 -0
  231. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_cli.py +0 -0
  232. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_cli_agent.py +0 -0
  233. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_cli_args.py +0 -0
  234. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_code_execution.py +0 -0
  235. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_config.py +0 -0
  236. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_control_plane_separation.py +0 -0
  237. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_cpp_code.py +0 -0
  238. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_data_driven_task_manager.py +0 -0
  239. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_deepcoder_reward.py +0 -0
  240. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_deepeval_integration.py +0 -0
  241. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_deploy_integration.py +0 -0
  242. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_e2b_integration.py +0 -0
  243. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_e2b_js_integration.py +0 -0
  244. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_edge_cases.py +0 -0
  245. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_eval_protocol_import.py +0 -0
  246. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_evaluation.py +0 -0
  247. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_evaluation_integration.py +0 -0
  248. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_evaluation_preview_integration.py +0 -0
  249. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_event_bus.py +0 -0
  250. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_examples_end_to_end.py +0 -0
  251. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_format.py +0 -0
  252. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_fractional_code.py +0 -0
  253. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_function_calling.py +0 -0
  254. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_gcp_tools.py +0 -0
  255. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_generic_server.py +0 -0
  256. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_human_id.py +0 -0
  257. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_integration.py +0 -0
  258. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_json_schema.py +0 -0
  259. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_kwargs_validation.py +0 -0
  260. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_language_consistency.py +0 -0
  261. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_lean_prover.py +0 -0
  262. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_lean_prover_runner.py +0 -0
  263. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_length.py +0 -0
  264. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_list_comparison_math_reward.py +0 -0
  265. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_logs_server.py +0 -0
  266. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_logs_server_simple.py +0 -0
  267. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_math.py +0 -0
  268. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_minimal.py +0 -0
  269. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_models.py +0 -0
  270. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_models_rl.py +0 -0
  271. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_multiple_choice_math_reward.py +0 -0
  272. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_n_variant_batch_integration.py +0 -0
  273. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_n_variant_integration.py +0 -0
  274. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_openai_compatibility.py +0 -0
  275. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_openeval_integration.py +0 -0
  276. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_packaging.py +0 -0
  277. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_parallel_rollouts.py +0 -0
  278. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_platform_api.py +0 -0
  279. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_quickstart_utils.py +0 -0
  280. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_readiness.py +0 -0
  281. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_reasoning_steps.py +0 -0
  282. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_repetition.py +0 -0
  283. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_repetition_debug.py +0 -0
  284. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_retry_mechanism.py +0 -0
  285. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_reward_function.py +0 -0
  286. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_reward_protocol_import.py +0 -0
  287. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_rl_processing.py +0 -0
  288. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_rollout_control_plane_integration.py +0 -0
  289. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_server.py +0 -0
  290. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_show_results_url.py +0 -0
  291. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_status_migration_changes.py +0 -0
  292. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_status_migration_integration.py +0 -0
  293. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_status_model.py +0 -0
  294. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_tag_count.py +0 -0
  295. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_tau_bench_airline_smoke.py +0 -0
  296. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_typed_interface.py +0 -0
  297. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_typed_interface_rl.py +0 -0
  298. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_url_handling.py +0 -0
  299. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/tests/test_vite_server.py +0 -0
  300. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/__init__.py +0 -0
  301. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/agent/__init__.py +0 -0
  302. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/agent/base.py +0 -0
  303. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/agent/llm_agent.py +0 -0
  304. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/api_service/__init__.py +0 -0
  305. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/api_service/api_config.py +0 -0
  306. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/api_service/data_model.py +0 -0
  307. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/api_service/simulation_service.py +0 -0
  308. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/cli.py +0 -0
  309. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/config.py +0 -0
  310. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/airline/policy.md +0 -0
  311. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/mock/policy.md +0 -0
  312. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  313. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/retail/policy.md +0 -0
  314. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  315. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  316. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  317. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  318. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  319. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  320. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  321. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data_model/__init__.py +0 -0
  322. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data_model/message.py +0 -0
  323. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data_model/simulation.py +0 -0
  324. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/data_model/tasks.py +0 -0
  325. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/__init__.py +0 -0
  326. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/airline/__init__.py +0 -0
  327. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/airline/data_model.py +0 -0
  328. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/airline/environment.py +0 -0
  329. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/airline/tools.py +0 -0
  330. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/airline/utils.py +0 -0
  331. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/mock/__init__.py +0 -0
  332. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/mock/data_model.py +0 -0
  333. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/mock/environment.py +0 -0
  334. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/mock/tools.py +0 -0
  335. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/mock/utils.py +0 -0
  336. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/retail/__init__.py +0 -0
  337. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/retail/data_model.py +0 -0
  338. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/retail/environment.py +0 -0
  339. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/retail/tools.py +0 -0
  340. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/retail/utils.py +0 -0
  341. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/__init__.py +0 -0
  342. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/data_model.py +0 -0
  343. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/environment.py +0 -0
  344. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  345. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  346. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  347. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  348. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  349. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  350. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  351. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  352. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/tools.py +0 -0
  353. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  354. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  355. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/domains/telecom/utils.py +0 -0
  356. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/environment/__init__.py +0 -0
  357. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/environment/db.py +0 -0
  358. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/environment/environment.py +0 -0
  359. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/environment/server.py +0 -0
  360. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/environment/tool.py +0 -0
  361. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/environment/toolkit.py +0 -0
  362. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  363. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/evaluator/__init__.py +0 -0
  364. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/evaluator/evaluator.py +0 -0
  365. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  366. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  367. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  368. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  369. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  370. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/metrics/__init__.py +0 -0
  371. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/metrics/agent_metrics.py +0 -0
  372. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  373. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/orchestrator/__init__.py +0 -0
  374. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  375. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  376. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/orchestrator/utils.py +0 -0
  377. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/registry.py +0 -0
  378. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/run.py +0 -0
  379. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/scripts/__init__.py +0 -0
  380. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/scripts/check_data.py +0 -0
  381. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  382. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/scripts/start_servers.py +0 -0
  383. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/scripts/view_simulations.py +0 -0
  384. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/user/__init__.py +0 -0
  385. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/user/base.py +0 -0
  386. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/user/user_simulator.py +0 -0
  387. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/utils/__init__.py +0 -0
  388. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/utils/display.py +0 -0
  389. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/utils/io_utils.py +0 -0
  390. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/utils/llm_utils.py +0 -0
  391. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/utils/pydantic_utils.py +0 -0
  392. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vendor/tau2/utils/utils.py +0 -0
  393. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/versioneer.py +0 -0
  394. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  395. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vite-app/dist/assets/index-C8woq7EO.js +0 -0
  396. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vite-app/dist/assets/index-C8woq7EO.js.map +0 -0
  397. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vite-app/dist/assets/index-CSKGq1w7.css +0 -0
  398. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  399. {eval_protocol-0.2.26 → eval_protocol-0.2.27}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.26
3
+ Version: 0.2.27
4
4
  Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -119,133 +119,114 @@ Dynamic: license-file
119
119
  # Eval Protocol (EP)
120
120
 
121
121
  [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
122
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
122
123
 
123
- **The open-source toolkit for building your internal model leaderboard.**
124
+ **Stop guessing which AI model to use. Build a data-driven model leaderboard.**
124
125
 
125
- When you have multiple AI models to choose from—different versions, providers,
126
- or configurations—how do you know which one is best for your use case?
126
+ With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
127
127
 
128
128
  ## 🚀 Features
129
129
 
130
- - **Custom Evaluations**: Write evaluations tailored to your specific business needs
131
- - **Auto-Evaluation**: Stack-rank models using LLMs as judges with just model traces using out-of-the-box evaluators
132
- - **RL Environments via MCP**: Build reinforcement learning environments using the Model Context Protocol (MCP) to simulate user interactions and advanced evaluation scenarios
133
- - **Consistent Testing**: Test across various models and configurations with a unified framework
134
- - **Resilient Runtime**: Automatic retries for unstable LLM APIs and concurrent execution for long-running evaluations
135
- - **Rich Visualizations**: Built-in pivot tables and visualizations for result analysis
136
- - **Data-Driven Decisions**: Make informed model deployment decisions based on comprehensive evaluation results
130
+ - **Pytest authoring**: `@evaluation_test` decorator to configure evaluations
131
+ - **Robust rollouts**: Handles flaky LLM APIs and parallel execution
132
+ - **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
133
+ - **Agent support**: LangGraph and Pydantic AI
134
+ - **MCP RL envs**: Build reinforcement learning environments with MCP
135
+ - **Built-in benchmarks**: AIME, tau-bench
136
+ - **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
137
+ - **Local UI**: Pivot/table views for real-time analysis
137
138
 
138
- ## Quick Examples
139
+ ## Quickstart (no labels needed)
139
140
 
140
- ### Basic Model Comparison
141
+ Install with your tracing platform extras and set API keys:
141
142
 
142
- Compare models on a simple formatting task:
143
-
144
- ```python test_bold_format.py
145
- from eval_protocol.models import EvaluateResult, EvaluationRow, Message
146
- from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
143
+ ```bash
144
+ pip install 'eval-protocol[langfuse]'
147
145
 
148
- @evaluation_test(
149
- input_messages=[
150
- [
151
- Message(role="system", content="Use bold text to highlight important information."),
152
- Message(role="user", content="Explain why evaluations matter for AI agents. Make it dramatic!"),
153
- ],
154
- ],
155
- completion_params=[
156
- {"model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct"},
157
- {"model": "openai/gpt-4"},
158
- {"model": "anthropic/claude-3-sonnet"}
159
- ],
160
- rollout_processor=default_single_turn_rollout_processor,
161
- mode="pointwise",
162
- )
163
- def test_bold_format(row: EvaluationRow) -> EvaluationRow:
164
- """Check if the model's response contains bold text."""
165
- assistant_response = row.messages[-1].content
146
+ # Model API keys (set what you need)
147
+ export OPENAI_API_KEY=...
148
+ export FIREWORKS_API_KEY=...
149
+ export GEMINI_API_KEY=...
166
150
 
167
- if assistant_response is None:
168
- row.evaluation_result = EvaluateResult(score=0.0, reason="No response")
169
- return row
151
+ # Platform keys
152
+ export LANGFUSE_PUBLIC_KEY=...
153
+ export LANGFUSE_SECRET_KEY=...
154
+ export LANGFUSE_HOST=https://your-deployment.com # optional
155
+ ```
170
156
 
171
- has_bold = "**" in str(assistant_response)
172
- score = 1.0 if has_bold else 0.0
173
- reason = "Contains bold text" if has_bold else "No bold text found"
157
+ Minimal evaluation using the built-in AHA judge:
174
158
 
175
- row.evaluation_result = EvaluateResult(score=score, reason=reason)
176
- return row
177
- ```
159
+ ```python
160
+ from datetime import datetime
161
+ import pytest
162
+
163
+ from eval_protocol import (
164
+ evaluation_test,
165
+ aha_judge,
166
+ EvaluationRow,
167
+ SingleTurnRolloutProcessor,
168
+ DynamicDataLoader,
169
+ create_langfuse_adapter,
170
+ )
178
171
 
179
- ### Using Datasets
180
172
 
181
- Evaluate models on existing datasets:
173
+ def langfuse_data_generator() -> list[EvaluationRow]:
174
+ adapter = create_langfuse_adapter()
175
+ return adapter.get_evaluation_rows(
176
+ to_timestamp=datetime.utcnow(),
177
+ limit=20,
178
+ sample_size=5,
179
+ )
182
180
 
183
- ```python
184
- from eval_protocol.pytest import evaluation_test
185
- from eval_protocol.adapters.huggingface import create_gsm8k_adapter
186
181
 
187
- @evaluation_test(
188
- input_dataset=["development/gsm8k_sample.jsonl"], # Local JSONL file
189
- dataset_adapter=create_gsm8k_adapter(), # Adapter to convert data
190
- completion_params=[
191
- {"model": "openai/gpt-4"},
192
- {"model": "anthropic/claude-3-sonnet"}
182
+ @pytest.mark.parametrize(
183
+ "completion_params",
184
+ [
185
+ {"model": "openai/gpt-4.1"},
186
+ {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
193
187
  ],
194
- mode="pointwise"
195
188
  )
196
- def test_math_reasoning(row: EvaluationRow) -> EvaluationRow:
197
- # Your evaluation logic here
198
- return row
189
+ @evaluation_test(
190
+ data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
191
+ rollout_processor=SingleTurnRolloutProcessor(),
192
+ )
193
+ async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
194
+ return await aha_judge(row)
199
195
  ```
200
196
 
197
+ Run it:
201
198
 
202
- ## 📚 Resources
199
+ ```bash
200
+ pytest -q -s
201
+ ```
203
202
 
204
- - **[Documentation](https://evalprotocol.io)** - Complete guides and API reference
205
- - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** - Community discussions
206
- - **[GitHub](https://github.com/eval-protocol/python-sdk)** - Source code and examples
203
+ The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
207
204
 
208
205
  ## Installation
209
206
 
210
- **This library requires Python >= 3.10.**
207
+ This library requires Python >= 3.10.
211
208
 
212
- ### Basic Installation
213
-
214
- Install with pip:
209
+ ### pip
215
210
 
216
211
  ```bash
217
212
  pip install eval-protocol
218
213
  ```
219
214
 
220
- ### Recommended Installation with uv
221
-
222
- For better dependency management and faster installs, we recommend using [uv](https://docs.astral.sh/uv/):
215
+ ### uv (recommended)
223
216
 
224
217
  ```bash
225
- # Install uv if you haven't already
218
+ # Install uv (if needed)
226
219
  curl -LsSf https://astral.sh/uv/install.sh | sh
227
220
 
228
- # Install eval-protocol
221
+ # Add to your project
229
222
  uv add eval-protocol
230
223
  ```
231
224
 
232
- ### Optional Dependencies
233
-
234
- Install with additional features:
235
-
236
- ```bash
237
- # For Langfuse integration
238
- pip install 'eval-protocol[langfuse]'
239
-
240
- # For HuggingFace datasets
241
- pip install 'eval-protocol[huggingface]'
242
-
243
- # For all adapters
244
- pip install 'eval-protocol[adapters]'
225
+ ## 📚 Resources
245
226
 
246
- # For development
247
- pip install 'eval-protocol[dev]'
248
- ```
227
+ - **[Documentation](https://evalprotocol.io)** – Guides and API reference
228
+ - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
229
+ - **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
249
230
 
250
231
  ## License
251
232
 
@@ -0,0 +1,115 @@
1
+ # Eval Protocol (EP)
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
4
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
5
+
6
+ **Stop guessing which AI model to use. Build a data-driven model leaderboard.**
7
+
8
+ With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
9
+
10
+ ## 🚀 Features
11
+
12
+ - **Pytest authoring**: `@evaluation_test` decorator to configure evaluations
13
+ - **Robust rollouts**: Handles flaky LLM APIs and parallel execution
14
+ - **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
15
+ - **Agent support**: LangGraph and Pydantic AI
16
+ - **MCP RL envs**: Build reinforcement learning environments with MCP
17
+ - **Built-in benchmarks**: AIME, tau-bench
18
+ - **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
19
+ - **Local UI**: Pivot/table views for real-time analysis
20
+
21
+ ## ⚡ Quickstart (no labels needed)
22
+
23
+ Install with your tracing platform extras and set API keys:
24
+
25
+ ```bash
26
+ pip install 'eval-protocol[langfuse]'
27
+
28
+ # Model API keys (set what you need)
29
+ export OPENAI_API_KEY=...
30
+ export FIREWORKS_API_KEY=...
31
+ export GEMINI_API_KEY=...
32
+
33
+ # Platform keys
34
+ export LANGFUSE_PUBLIC_KEY=...
35
+ export LANGFUSE_SECRET_KEY=...
36
+ export LANGFUSE_HOST=https://your-deployment.com # optional
37
+ ```
38
+
39
+ Minimal evaluation using the built-in AHA judge:
40
+
41
+ ```python
42
+ from datetime import datetime
43
+ import pytest
44
+
45
+ from eval_protocol import (
46
+ evaluation_test,
47
+ aha_judge,
48
+ EvaluationRow,
49
+ SingleTurnRolloutProcessor,
50
+ DynamicDataLoader,
51
+ create_langfuse_adapter,
52
+ )
53
+
54
+
55
+ def langfuse_data_generator() -> list[EvaluationRow]:
56
+ adapter = create_langfuse_adapter()
57
+ return adapter.get_evaluation_rows(
58
+ to_timestamp=datetime.utcnow(),
59
+ limit=20,
60
+ sample_size=5,
61
+ )
62
+
63
+
64
+ @pytest.mark.parametrize(
65
+ "completion_params",
66
+ [
67
+ {"model": "openai/gpt-4.1"},
68
+ {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
69
+ ],
70
+ )
71
+ @evaluation_test(
72
+ data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
73
+ rollout_processor=SingleTurnRolloutProcessor(),
74
+ )
75
+ async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
76
+ return await aha_judge(row)
77
+ ```
78
+
79
+ Run it:
80
+
81
+ ```bash
82
+ pytest -q -s
83
+ ```
84
+
85
+ The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
86
+
87
+ ## Installation
88
+
89
+ This library requires Python >= 3.10.
90
+
91
+ ### pip
92
+
93
+ ```bash
94
+ pip install eval-protocol
95
+ ```
96
+
97
+ ### uv (recommended)
98
+
99
+ ```bash
100
+ # Install uv (if needed)
101
+ curl -LsSf https://astral.sh/uv/install.sh | sh
102
+
103
+ # Add to your project
104
+ uv add eval-protocol
105
+ ```
106
+
107
+ ## 📚 Resources
108
+
109
+ - **[Documentation](https://evalprotocol.io)** – Guides and API reference
110
+ - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
111
+ - **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
112
+
113
+ ## License
114
+
115
+ [MIT](LICENSE)
@@ -22,6 +22,7 @@ from .mcp_env import (
22
22
  rollout,
23
23
  test_mcp,
24
24
  )
25
+ from .data_loader import DynamicDataLoader, InlineDataLoader
25
26
 
26
27
  # Try to import FireworksPolicy if available
27
28
  try:
@@ -66,6 +67,8 @@ warnings.filterwarnings("default", category=DeprecationWarning, module="eval_pro
66
67
 
67
68
  __all__ = [
68
69
  "DefaultParameterIdGenerator",
70
+ "DynamicDataLoader",
71
+ "InlineDataLoader",
69
72
  "aha_judge",
70
73
  "multi_turn_assistant_to_ground_truth",
71
74
  "assistant_to_ground_truth",
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-09-23T16:59:02-0700",
11
+ "date": "2025-09-23T23:32:05-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "bcc2d7c22085ccddf46d952e3481012250245d90",
15
- "version": "0.2.26"
14
+ "full-revisionid": "aadaf82ad89ca3a8941a348ff818684554f5bf0c",
15
+ "version": "0.2.27"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,4 @@
1
+ from .dynamic_data_loader import DynamicDataLoader
2
+ from .inline_data_loader import InlineDataLoader
3
+
4
+ __all__ = ["DynamicDataLoader", "InlineDataLoader"]
@@ -0,0 +1,38 @@
1
+ from collections.abc import Callable, Sequence
2
+ from dataclasses import dataclass
3
+
4
+ from eval_protocol.data_loader.models import (
5
+ DataLoaderResult,
6
+ DataLoaderVariant,
7
+ EvaluationDataLoader,
8
+ )
9
+ from eval_protocol.models import EvaluationRow
10
+
11
+
12
@dataclass(kw_only=True)
class DynamicDataLoader(EvaluationDataLoader):
    """Data loader whose rows are produced on demand by generator callables."""

    generators: Sequence[Callable[[], list[EvaluationRow]]]
    """Dynamic data generation functions. These callables are invoked each time data
    needs to be loaded, allowing for dynamic data generation, lazy loading, or data that
    changes between evaluation runs. Each function should return a list of EvaluationRow
    objects. This is useful for scenarios like generating test data on-the-fly, loading
    data from external sources, or creating data with randomized elements for robust testing."""

    def variants(self) -> Sequence[DataLoaderVariant]:
        """Return one lazily evaluated variant per entry in ``generators``.

        Returns:
            A list of zero-argument callables, in the same order as
            ``generators``. Calling a variant invokes its generator and wraps
            the rows in a ``DataLoaderResult``.
        """
        return [self._make_variant(generator) for generator in self.generators]

    def _make_variant(self, generator: Callable[[], list[EvaluationRow]]) -> DataLoaderVariant:
        """Build the loader closure for a single generator.

        Each closure is created in its own call so that it captures its own
        ``generator``. Defining the closure directly inside the loop would make
        every variant see the loop variable's final value, i.e. all variants
        would call the last generator.
        """

        def _load() -> DataLoaderResult:
            return DataLoaderResult(
                rows=generator(),
                type=self.__class__.__name__,
                # Callables such as functools.partial objects have no __name__;
                # fall back to the callable's type name so variant_id stays non-empty.
                variant_id=getattr(generator, "__name__", type(generator).__name__),
                variant_description=generator.__doc__,
            )

        return _load
@@ -0,0 +1,38 @@
1
+ from collections.abc import Callable, Sequence
2
+ from dataclasses import dataclass
3
+
4
+ from eval_protocol.data_loader.models import (
5
+ DataLoaderResult,
6
+ DataLoaderVariant,
7
+ EvaluationDataLoader,
8
+ )
9
+ from eval_protocol.models import EvaluationRow
10
+
11
+
12
@dataclass(kw_only=True)
class DynamicDataLoader(EvaluationDataLoader):
    """Data loader whose rows are produced on demand by factory callables.

    NOTE(review): this class has the same name as the loader defined in
    ``dynamic_data_loader.py`` although it lives in ``factory_data_loader.py``;
    confirm whether a distinct name was intended. The name is left unchanged
    here so existing imports keep working.
    """

    factory: Sequence[Callable[[], list[EvaluationRow]]]
    """Dynamic data generation functions. These callables are invoked each time data
    needs to be loaded, allowing for dynamic data generation, lazy loading, or data that
    changes between evaluation runs. Each function should return a list of EvaluationRow
    objects. This is useful for scenarios like generating test data on-the-fly, loading
    data from external sources, or creating data with randomized elements for robust testing."""

    def variants(self) -> Sequence[DataLoaderVariant]:
        """Return one lazily evaluated variant per callable in ``factory``.

        Returns:
            A list of zero-argument callables, in the same order as
            ``factory``. Calling a variant invokes its factory callable and
            wraps the rows in a ``DataLoaderResult``.
        """
        return [self._make_variant(factory) for factory in self.factory]

    def _make_variant(self, factory: Callable[[], list[EvaluationRow]]) -> DataLoaderVariant:
        """Build the loader closure for a single factory callable.

        Each closure is created in its own call so that it captures its own
        ``factory``. Defining the closure directly inside the loop would make
        every variant see the loop variable's final value, i.e. all variants
        would call the last factory.
        """

        def _load() -> DataLoaderResult:
            return DataLoaderResult(
                rows=factory(),
                type=self.__class__.__name__,
                # Callables such as functools.partial objects have no __name__;
                # fall back to the callable's type name so variant_id stays non-empty.
                variant_id=getattr(factory, "__name__", type(factory).__name__),
                variant_description=factory.__doc__,
            )

        return _load
@@ -0,0 +1,68 @@
1
+ from collections.abc import Sequence
2
+ from dataclasses import dataclass
3
+
4
+ from eval_protocol.data_loader.models import (
5
+ DataLoaderResult,
6
+ DataLoaderVariant,
7
+ EvaluationDataLoader,
8
+ )
9
+ from eval_protocol.models import EvaluationRow, Message
10
+ from eval_protocol.pytest.types import InputMessagesParam
11
+
12
+
13
+ DEFAULT_VARIANT_ID: str = "inline"
14
+
15
+
16
@dataclass(kw_only=True)
class InlineDataLoader(EvaluationDataLoader):
    """Data loader that serves ``EvaluationRow`` objects or raw message lists given in code."""

    rows: list[EvaluationRow] | None = None
    """Ready-made evaluation rows. Each row is deep-copied when the variant is
    loaded, so the objects passed in here are not shared with the results."""

    messages: Sequence[InputMessagesParam] | None = None
    """Raw conversations. Every inner sequence becomes one ``EvaluationRow``;
    ``Message`` instances are deep-copied and any other item is validated into
    a ``Message``."""

    id: str = DEFAULT_VARIANT_ID
    """Identifier reported as ``variant_id`` on the produced result."""

    description: str | None = None
    """Optional human-readable text reported as ``variant_description`` on the
    produced result."""

    def __post_init__(self) -> None:
        """Reject construction when neither ``rows`` nor ``messages`` was supplied."""
        has_payload = self.rows is not None or self.messages is not None
        if not has_payload:
            raise ValueError("InlineDataLoader requires rows or messages to be provided")

    def variants(self) -> Sequence[DataLoaderVariant]:
        """Return the single variant that materializes the inline payload."""

        def _as_message(item) -> Message:
            # Copy existing Message objects; validate anything else into one.
            if isinstance(item, Message):
                return item.model_copy(deep=True)
            return Message.model_validate(item)

        def _load() -> DataLoaderResult:
            # Pre-built rows come first, followed by rows derived from messages.
            loaded: list[EvaluationRow] = (
                [] if self.rows is None else [item.model_copy(deep=True) for item in self.rows]
            )
            if self.messages is not None:
                loaded.extend(
                    EvaluationRow(messages=[_as_message(entry) for entry in conversation])
                    for conversation in self.messages
                )

            return DataLoaderResult(
                rows=loaded,
                variant_id=self.id,
                variant_description=self.description,
                type=self.__class__.__name__,
            )

        return [_load]
@@ -0,0 +1,128 @@
1
+ """Data loader abstractions"""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Sequence
6
+ from dataclasses import dataclass
7
+ from typing import Callable
8
+ from typing_extensions import Protocol
9
+ from abc import ABC, abstractmethod
10
+
11
+ from pydantic import BaseModel, Field, field_validator
12
+
13
+ from eval_protocol.models import EvaluationRow
14
+
15
+
16
class DataLoaderResult(BaseModel):
    """Rows produced by one loader variant together with identifying metadata."""

    rows: list[EvaluationRow] = Field(
        description="List of evaluation rows loaded from the data source. "
        "These are the processed and ready-to-use evaluation data that will be fed into the evaluation pipeline."
    )

    type: str = Field(
        ...,
        description="Type of the data loader that produced this result. "
        "Used for identification and debugging purposes (e.g., 'InlineDataLoader', 'DynamicDataLoader').",
    )

    variant_id: str = Field(
        ...,
        description="Unique identifier for the data loader variant that produced this result. "
        "Used for tracking and organizing evaluation results from different data sources.",
    )

    variant_description: str | None = Field(
        default=None,
        description="Human-readable description of the data loader variant that produced this result. "
        "Provides context about what this variant represents, its purpose, or any special "
        "characteristics that distinguish it from other variants.",
    )

    preprocessed: bool = Field(
        default=False,
        description="Whether the data has been preprocessed. "
        "This flag indicates if any preprocessing functions have been applied to the data, "
        "helping to avoid duplicate processing and track data transformation state.",
    )

    @field_validator("type")
    @classmethod
    def validate_type(cls, v: str) -> str:
        """Accept ``type`` only when it contains a non-whitespace character."""
        if v.strip():
            return v
        raise ValueError("type must be non-empty")

    @field_validator("variant_id")
    @classmethod
    def validate_variant_id(cls, v: str) -> str:
        """Accept ``variant_id`` only when it contains a non-whitespace character."""
        if v.strip():
            return v
        raise ValueError("variant_id must be non-empty")
63
+
64
+
65
class DataLoaderVariant(Protocol):
    """Structural type for one loadable variant: a zero-argument callable."""

    def __call__(self) -> DataLoaderResult:
        """Load this variant's rows and return them as a ``DataLoaderResult``."""
        ...
71
+
72
+
73
+ @dataclass(kw_only=True)
74
+ class EvaluationDataLoader(ABC):
75
+ """Abstract base class for data loaders that can be consumed by ``evaluation_test``."""
76
+
77
+ preprocess_fn: Callable[[list[EvaluationRow]], list[EvaluationRow]] | None = None
78
+ """Optional preprocessing function for evaluation rows. This function is applied
79
+ to the loaded data before it's returned, allowing for data cleaning, transformation,
80
+ filtering, or other modifications. The function receives a list of EvaluationRow objects
81
+ and should return a modified list of EvaluationRow objects."""
82
+
83
+ @abstractmethod
84
+ def variants(self) -> Sequence[DataLoaderVariant]:
85
+ """Return parameterizable variants emitted by this loader."""
86
+ ...
87
+
88
+ def load(self) -> list[DataLoaderResult]:
89
+ """Loads all variants of this data loader and return a list of DataLoaderResult."""
90
+ results = []
91
+ for variant in self.variants():
92
+ result = variant()
93
+ result = self._process_variant(result)
94
+ results.append(result)
95
+ return results
96
+
97
+ def _process_variant(self, result: DataLoaderResult) -> DataLoaderResult:
98
+ """Process a single variant: preprocess data and apply metadata."""
99
+ # Preprocess data
100
+ original_count = len(result.rows)
101
+ if self.preprocess_fn:
102
+ result.rows = self.preprocess_fn(result.rows)
103
+ result.preprocessed = True
104
+ processed_count = len(result.rows)
105
+ else:
106
+ processed_count = original_count
107
+
108
+ # Apply metadata to rows
109
+ self._apply_metadata(result, original_count, processed_count)
110
+ return result
111
+
112
+ def _apply_metadata(self, result: DataLoaderResult, original_count: int, processed_count: int) -> None:
113
+ """Apply metadata to all rows in the result."""
114
+ for row in result.rows:
115
+ if row.input_metadata.dataset_info is None:
116
+ row.input_metadata.dataset_info = {}
117
+
118
+ # Apply result attributes as metadata
119
+ for attr_name, attr_value in vars(result).items():
120
+ """
121
+ Exclude rows and private attributes from metadata.
122
+ """
123
+ if attr_name != "rows" and not attr_name.startswith("_"):
124
+ row.input_metadata.dataset_info[f"data_loader_{attr_name}"] = attr_value
125
+
126
+ # Apply row counts
127
+ row.input_metadata.dataset_info["data_loader_num_rows"] = original_count
128
+ row.input_metadata.dataset_info["data_loader_num_rows_after_preprocessing"] = processed_count