eval-protocol 0.2.26__tar.gz → 0.2.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (400):
  1. {eval_protocol-0.2.26/eval_protocol.egg-info → eval_protocol-0.2.28}/PKG-INFO +70 -89
  2. eval_protocol-0.2.28/README.md +115 -0
  3. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/__init__.py +20 -2
  4. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/_version.py +3 -3
  5. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/langfuse.py +36 -115
  6. eval_protocol-0.2.28/eval_protocol/data_loader/__init__.py +4 -0
  7. eval_protocol-0.2.28/eval_protocol/data_loader/dynamic_data_loader.py +38 -0
  8. eval_protocol-0.2.28/eval_protocol/data_loader/factory_data_loader.py +38 -0
  9. eval_protocol-0.2.28/eval_protocol/data_loader/inline_data_loader.py +68 -0
  10. eval_protocol-0.2.28/eval_protocol/data_loader/models.py +128 -0
  11. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/evaluation_test.py +52 -26
  12. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/evaluation_test_postprocess.py +12 -3
  13. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/generate_parameter_combinations.py +25 -6
  14. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/parameterize.py +14 -2
  15. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/plugin.py +4 -4
  16. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/remote_rollout_processor.py +28 -9
  17. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/store_results_url.py +9 -6
  18. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/utils.py +17 -11
  19. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/quickstart/llm_judge_braintrust.py +16 -15
  20. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/quickstart/llm_judge_langfuse.py +15 -11
  21. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/quickstart/llm_judge_langsmith.py +8 -11
  22. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/quickstart/llm_judge_openai_responses.py +16 -12
  23. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/quickstart/utils.py +22 -0
  24. eval_protocol-0.2.28/eval_protocol/types/remote_rollout_processor.py +46 -0
  25. {eval_protocol-0.2.26 → eval_protocol-0.2.28/eval_protocol.egg-info}/PKG-INFO +70 -89
  26. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol.egg-info/SOURCES.txt +6 -0
  27. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_batch_evaluation.py +0 -2
  28. eval_protocol-0.2.28/tests/test_evaluation_postprocess.py +441 -0
  29. eval_protocol-0.2.28/tests/test_fireworks_api.py +68 -0
  30. eval_protocol-0.2.26/README.md +0 -134
  31. eval_protocol-0.2.26/tests/test_evaluation_postprocess.py +0 -207
  32. eval_protocol-0.2.26/tests/test_fireworks_api.py +0 -66
  33. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/LICENSE +0 -0
  34. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/development/__init__.py +0 -0
  35. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/development/normalize_sandbox_fusion.py +0 -0
  36. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/development/utils/__init__.py +0 -0
  37. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/development/utils/generate_api_key.py +0 -0
  38. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/development/utils/subprocess_manager.py +0 -0
  39. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/__main__.py +0 -0
  40. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/__init__.py +0 -0
  41. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/base.py +0 -0
  42. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/bigquery.py +0 -0
  43. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/braintrust.py +0 -0
  44. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/huggingface.py +0 -0
  45. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/langchain.py +0 -0
  46. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/langsmith.py +0 -0
  47. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/openai_responses.py +0 -0
  48. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/trl.py +0 -0
  49. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/adapters/utils.py +0 -0
  50. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/__init__.py +0 -0
  51. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/models.py +0 -0
  52. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/orchestrator.py +0 -0
  53. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resource_abc.py +0 -0
  54. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resource_pool.py +0 -0
  55. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/__init__.py +0 -0
  56. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  57. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  58. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  59. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  60. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  61. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/docker_resource.py +0 -0
  62. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  63. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  64. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/resources/sql_resource.py +0 -0
  65. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/task_manager.py +0 -0
  66. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/agent/tool_registry.py +0 -0
  67. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/auth.py +0 -0
  68. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/__init__.py +0 -0
  69. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  70. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  71. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/test_aime25.py +0 -0
  72. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  73. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  74. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  75. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  76. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli.py +0 -0
  77. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/__init__.py +0 -0
  78. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  79. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/common.py +0 -0
  80. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/deploy.py +0 -0
  81. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  82. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/logs.py +0 -0
  83. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/preview.py +0 -0
  84. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  85. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/common_utils.py +0 -0
  86. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/config.py +0 -0
  87. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/dataset_logger/__init__.py +0 -0
  88. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  89. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  90. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  91. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  92. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/datasets/__init__.py +0 -0
  93. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/datasets/loader.py +0 -0
  94. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/directory_utils.py +0 -0
  95. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/evaluation.py +0 -0
  96. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/event_bus/__init__.py +0 -0
  97. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/event_bus/event_bus.py +0 -0
  98. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/event_bus/logger.py +0 -0
  99. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  100. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  101. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/execution/__init__.py +0 -0
  102. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/execution/pipeline.py +0 -0
  103. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/gcp_tools.py +0 -0
  104. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/generation/cache.py +0 -0
  105. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/generation/clients/base.py +0 -0
  106. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/generation/clients.py +0 -0
  107. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/generic_server.py +0 -0
  108. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/get_pep440_version.py +0 -0
  109. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/human_id/__init__.py +0 -0
  110. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/human_id/dictionary.py +0 -0
  111. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/integrations/__init__.py +0 -0
  112. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/integrations/deepeval.py +0 -0
  113. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/integrations/openeval.py +0 -0
  114. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/integrations/trl.py +0 -0
  115. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/logging_utils.py +0 -0
  116. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/__init__.py +0 -0
  117. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/adapter.py +0 -0
  118. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/client/__init__.py +0 -0
  119. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/client/connection.py +0 -0
  120. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/clients.py +0 -0
  121. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/execution/__init__.py +0 -0
  122. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/execution/base_policy.py +0 -0
  123. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/execution/manager.py +0 -0
  124. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/execution/policy.py +0 -0
  125. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/grid_renderer.py +0 -0
  126. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  127. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/mcpgym.py +0 -0
  128. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/process_manager.py +0 -0
  129. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/session/__init__.py +0 -0
  130. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/session/manager.py +0 -0
  131. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/simple_process_manager.py +0 -0
  132. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp/simulation_server.py +0 -0
  133. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_agent/__init__.py +0 -0
  134. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_agent/config.py +0 -0
  135. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_agent/main.py +0 -0
  136. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  137. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  138. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  139. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  140. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_env.py +0 -0
  141. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/__init__.py +0 -0
  142. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  143. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  144. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  145. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  146. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  147. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  148. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  149. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  150. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  151. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  152. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  153. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/models.py +0 -0
  154. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/packaging.py +0 -0
  155. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/platform_api.py +0 -0
  156. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/playback_policy.py +0 -0
  157. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/__init__.py +0 -0
  158. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  159. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  160. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  161. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  162. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  163. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  164. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  165. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  166. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/exception_config.py +0 -0
  167. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/execution.py +0 -0
  168. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  169. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/rollout_processor.py +0 -0
  170. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/store_experiment_link.py +0 -0
  171. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/types.py +0 -0
  172. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/pytest/validate_signature.py +0 -0
  173. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/quickstart/__init__.py +0 -0
  174. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/quickstart/llm_judge.py +0 -0
  175. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/resources.py +0 -0
  176. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/reward_function.py +0 -0
  177. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/__init__.py +0 -0
  178. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/accuracy.py +0 -0
  179. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/accuracy_length.py +0 -0
  180. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  181. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  182. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/apps_testing_util.py +0 -0
  183. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/bfcl_reward.py +0 -0
  184. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/code_execution.py +0 -0
  185. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/code_execution_utils.py +0 -0
  186. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/cpp_code.py +0 -0
  187. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  188. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/format.py +0 -0
  189. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/function_calling.py +0 -0
  190. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/json_schema.py +0 -0
  191. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/language_consistency.py +0 -0
  192. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/lean_prover.py +0 -0
  193. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/length.py +0 -0
  194. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  195. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/math.py +0 -0
  196. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  197. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/reasoning_steps.py +0 -0
  198. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/repetition.py +0 -0
  199. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rewards/tag_count.py +0 -0
  200. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/rl_processing.py +0 -0
  201. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/server.py +0 -0
  202. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/stats/__init__.py +0 -0
  203. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/stats/confidence_intervals.py +0 -0
  204. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/typed_interface.py +0 -0
  205. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/types/__init__.py +0 -0
  206. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/types/errors.py +0 -0
  207. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/types/types.py +0 -0
  208. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/__init__.py +0 -0
  209. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/batch_evaluation.py +0 -0
  210. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/batch_transformation.py +0 -0
  211. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/check_server_status.py +0 -0
  212. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/dataset_helpers.py +0 -0
  213. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/logs_server.py +0 -0
  214. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/module_loader.py +0 -0
  215. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/packaging_utils.py +0 -0
  216. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/show_results_url.py +0 -0
  217. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/static_policy.py +0 -0
  218. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol/utils/vite_server.py +0 -0
  219. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol.egg-info/dependency_links.txt +0 -0
  220. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol.egg-info/entry_points.txt +0 -0
  221. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol.egg-info/requires.txt +0 -0
  222. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/eval_protocol.egg-info/top_level.txt +0 -0
  223. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/pyproject.toml +0 -0
  224. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/setup.cfg +0 -0
  225. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/setup.py +0 -0
  226. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_accuracy.py +0 -0
  227. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_accuracy_length.py +0 -0
  228. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_adapters_e2e.py +0 -0
  229. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_agent_orchestrator.py +0 -0
  230. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_agent_resources.py +0 -0
  231. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_auth.py +0 -0
  232. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_cli.py +0 -0
  233. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_cli_agent.py +0 -0
  234. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_cli_args.py +0 -0
  235. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_code_execution.py +0 -0
  236. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_config.py +0 -0
  237. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_control_plane_separation.py +0 -0
  238. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_cpp_code.py +0 -0
  239. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_data_driven_task_manager.py +0 -0
  240. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_deepcoder_reward.py +0 -0
  241. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_deepeval_integration.py +0 -0
  242. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_deploy_integration.py +0 -0
  243. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_e2b_integration.py +0 -0
  244. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_e2b_js_integration.py +0 -0
  245. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_edge_cases.py +0 -0
  246. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_eval_protocol_import.py +0 -0
  247. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_evaluation.py +0 -0
  248. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_evaluation_integration.py +0 -0
  249. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_evaluation_preview_integration.py +0 -0
  250. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_event_bus.py +0 -0
  251. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_examples_end_to_end.py +0 -0
  252. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_format.py +0 -0
  253. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_fractional_code.py +0 -0
  254. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_function_calling.py +0 -0
  255. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_gcp_tools.py +0 -0
  256. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_generic_server.py +0 -0
  257. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_human_id.py +0 -0
  258. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_integration.py +0 -0
  259. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_json_schema.py +0 -0
  260. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_kwargs_validation.py +0 -0
  261. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_language_consistency.py +0 -0
  262. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_lean_prover.py +0 -0
  263. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_lean_prover_runner.py +0 -0
  264. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_length.py +0 -0
  265. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_list_comparison_math_reward.py +0 -0
  266. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_logs_server.py +0 -0
  267. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_logs_server_simple.py +0 -0
  268. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_math.py +0 -0
  269. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_minimal.py +0 -0
  270. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_models.py +0 -0
  271. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_models_rl.py +0 -0
  272. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_multiple_choice_math_reward.py +0 -0
  273. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_n_variant_batch_integration.py +0 -0
  274. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_n_variant_integration.py +0 -0
  275. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_openai_compatibility.py +0 -0
  276. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_openeval_integration.py +0 -0
  277. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_packaging.py +0 -0
  278. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_parallel_rollouts.py +0 -0
  279. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_platform_api.py +0 -0
  280. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_quickstart_utils.py +0 -0
  281. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_readiness.py +0 -0
  282. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_reasoning_steps.py +0 -0
  283. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_repetition.py +0 -0
  284. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_repetition_debug.py +0 -0
  285. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_retry_mechanism.py +0 -0
  286. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_reward_function.py +0 -0
  287. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_reward_protocol_import.py +0 -0
  288. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_rl_processing.py +0 -0
  289. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_rollout_control_plane_integration.py +0 -0
  290. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_server.py +0 -0
  291. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_show_results_url.py +0 -0
  292. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_status_migration_changes.py +0 -0
  293. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_status_migration_integration.py +0 -0
  294. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_status_model.py +0 -0
  295. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_tag_count.py +0 -0
  296. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_tau_bench_airline_smoke.py +0 -0
  297. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_typed_interface.py +0 -0
  298. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_typed_interface_rl.py +0 -0
  299. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_url_handling.py +0 -0
  300. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/tests/test_vite_server.py +0 -0
  301. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/__init__.py +0 -0
  302. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/agent/__init__.py +0 -0
  303. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/agent/base.py +0 -0
  304. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/agent/llm_agent.py +0 -0
  305. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/api_service/__init__.py +0 -0
  306. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/api_service/api_config.py +0 -0
  307. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/api_service/data_model.py +0 -0
  308. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/api_service/simulation_service.py +0 -0
  309. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/cli.py +0 -0
  310. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/config.py +0 -0
  311. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/airline/policy.md +0 -0
  312. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/mock/policy.md +0 -0
  313. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  314. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/retail/policy.md +0 -0
  315. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  316. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  317. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  318. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  319. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  320. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  321. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  322. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data_model/__init__.py +0 -0
  323. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data_model/message.py +0 -0
  324. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data_model/simulation.py +0 -0
  325. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/data_model/tasks.py +0 -0
  326. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/__init__.py +0 -0
  327. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/airline/__init__.py +0 -0
  328. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/airline/data_model.py +0 -0
  329. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/airline/environment.py +0 -0
  330. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/airline/tools.py +0 -0
  331. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/airline/utils.py +0 -0
  332. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/mock/__init__.py +0 -0
  333. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/mock/data_model.py +0 -0
  334. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/mock/environment.py +0 -0
  335. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/mock/tools.py +0 -0
  336. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/mock/utils.py +0 -0
  337. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/retail/__init__.py +0 -0
  338. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/retail/data_model.py +0 -0
  339. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/retail/environment.py +0 -0
  340. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/retail/tools.py +0 -0
  341. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/retail/utils.py +0 -0
  342. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/__init__.py +0 -0
  343. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/data_model.py +0 -0
  344. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/environment.py +0 -0
  345. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  346. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  347. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  348. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  349. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  350. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  351. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  352. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  353. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/tools.py +0 -0
  354. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  355. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  356. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/domains/telecom/utils.py +0 -0
  357. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/environment/__init__.py +0 -0
  358. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/environment/db.py +0 -0
  359. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/environment/environment.py +0 -0
  360. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/environment/server.py +0 -0
  361. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/environment/tool.py +0 -0
  362. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/environment/toolkit.py +0 -0
  363. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  364. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/evaluator/__init__.py +0 -0
  365. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/evaluator/evaluator.py +0 -0
  366. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  367. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  368. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  369. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  370. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  371. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/metrics/__init__.py +0 -0
  372. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/metrics/agent_metrics.py +0 -0
  373. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  374. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/orchestrator/__init__.py +0 -0
  375. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  376. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  377. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/orchestrator/utils.py +0 -0
  378. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/registry.py +0 -0
  379. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/run.py +0 -0
  380. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/scripts/__init__.py +0 -0
  381. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/scripts/check_data.py +0 -0
  382. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  383. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/scripts/start_servers.py +0 -0
  384. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/scripts/view_simulations.py +0 -0
  385. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/user/__init__.py +0 -0
  386. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/user/base.py +0 -0
  387. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/user/user_simulator.py +0 -0
  388. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/utils/__init__.py +0 -0
  389. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/utils/display.py +0 -0
  390. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/utils/io_utils.py +0 -0
  391. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/utils/llm_utils.py +0 -0
  392. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/utils/pydantic_utils.py +0 -0
  393. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vendor/tau2/utils/utils.py +0 -0
  394. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/versioneer.py +0 -0
  395. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  396. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vite-app/dist/assets/index-C8woq7EO.js +0 -0
  397. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vite-app/dist/assets/index-C8woq7EO.js.map +0 -0
  398. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vite-app/dist/assets/index-CSKGq1w7.css +0 -0
  399. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  400. {eval_protocol-0.2.26 → eval_protocol-0.2.28}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.26
3
+ Version: 0.2.28
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -119,133 +119,114 @@ Dynamic: license-file
119
119
  # Eval Protocol (EP)
120
120
 
121
121
  [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
122
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
122
123
 
123
- **The open-source toolkit for building your internal model leaderboard.**
124
+ **Stop guessing which AI model to use. Build a data-driven model leaderboard.**
124
125
 
125
- When you have multiple AI models to choose from—different versions, providers,
126
- or configurations—how do you know which one is best for your use case?
126
+ With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
127
127
 
128
128
  ## 🚀 Features
129
129
 
130
- - **Custom Evaluations**: Write evaluations tailored to your specific business needs
131
- - **Auto-Evaluation**: Stack-rank models using LLMs as judges with just model traces using out-of-the-box evaluators
132
- - **RL Environments via MCP**: Build reinforcement learning environments using the Model Control Protocol (MCP) to simulate user interactions and advanced evaluation scenarios
133
- - **Consistent Testing**: Test across various models and configurations with a unified framework
134
- - **Resilient Runtime**: Automatic retries for unstable LLM APIs and concurrent execution for long-running evaluations
135
- - **Rich Visualizations**: Built-in pivot tables and visualizations for result analysis
136
- - **Data-Driven Decisions**: Make informed model deployment decisions based on comprehensive evaluation results
130
+ - **Pytest authoring**: `@evaluation_test` decorator to configure evaluations
131
+ - **Robust rollouts**: Handles flaky LLM APIs and parallel execution
132
+ - **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
133
+ - **Agent support**: LangGraph and Pydantic AI
134
+ - **MCP RL envs**: Build reinforcement learning environments with MCP
135
+ - **Built-in benchmarks**: AIME, tau-bench
136
+ - **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
137
+ - **Local UI**: Pivot/table views for real-time analysis
137
138
 
138
- ## Quick Examples
139
+ ## Quickstart (no labels needed)
139
140
 
140
- ### Basic Model Comparison
141
+ Install with your tracing platform extras and set API keys:
141
142
 
142
- Compare models on a simple formatting task:
143
-
144
- ```python test_bold_format.py
145
- from eval_protocol.models import EvaluateResult, EvaluationRow, Message
146
- from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
143
+ ```bash
144
+ pip install 'eval-protocol[langfuse]'
147
145
 
148
- @evaluation_test(
149
- input_messages=[
150
- [
151
- Message(role="system", content="Use bold text to highlight important information."),
152
- Message(role="user", content="Explain why evaluations matter for AI agents. Make it dramatic!"),
153
- ],
154
- ],
155
- completion_params=[
156
- {"model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct"},
157
- {"model": "openai/gpt-4"},
158
- {"model": "anthropic/claude-3-sonnet"}
159
- ],
160
- rollout_processor=default_single_turn_rollout_processor,
161
- mode="pointwise",
162
- )
163
- def test_bold_format(row: EvaluationRow) -> EvaluationRow:
164
- """Check if the model's response contains bold text."""
165
- assistant_response = row.messages[-1].content
146
+ # Model API keys (set what you need)
147
+ export OPENAI_API_KEY=...
148
+ export FIREWORKS_API_KEY=...
149
+ export GEMINI_API_KEY=...
166
150
 
167
- if assistant_response is None:
168
- row.evaluation_result = EvaluateResult(score=0.0, reason="No response")
169
- return row
151
+ # Platform keys
152
+ export LANGFUSE_PUBLIC_KEY=...
153
+ export LANGFUSE_SECRET_KEY=...
154
+ export LANGFUSE_HOST=https://your-deployment.com # optional
155
+ ```
170
156
 
171
- has_bold = "**" in str(assistant_response)
172
- score = 1.0 if has_bold else 0.0
173
- reason = "Contains bold text" if has_bold else "No bold text found"
157
+ Minimal evaluation using the built-in AHA judge:
174
158
 
175
- row.evaluation_result = EvaluateResult(score=score, reason=reason)
176
- return row
177
- ```
159
+ ```python
160
+ from datetime import datetime
161
+ import pytest
162
+
163
+ from eval_protocol import (
164
+ evaluation_test,
165
+ aha_judge,
166
+ EvaluationRow,
167
+ SingleTurnRolloutProcessor,
168
+ DynamicDataLoader,
169
+ create_langfuse_adapter,
170
+ )
178
171
 
179
- ### Using Datasets
180
172
 
181
- Evaluate models on existing datasets:
173
+ def langfuse_data_generator() -> list[EvaluationRow]:
174
+ adapter = create_langfuse_adapter()
175
+ return adapter.get_evaluation_rows(
176
+ to_timestamp=datetime.utcnow(),
177
+ limit=20,
178
+ sample_size=5,
179
+ )
182
180
 
183
- ```python
184
- from eval_protocol.pytest import evaluation_test
185
- from eval_protocol.adapters.huggingface import create_gsm8k_adapter
186
181
 
187
- @evaluation_test(
188
- input_dataset=["development/gsm8k_sample.jsonl"], # Local JSONL file
189
- dataset_adapter=create_gsm8k_adapter(), # Adapter to convert data
190
- completion_params=[
191
- {"model": "openai/gpt-4"},
192
- {"model": "anthropic/claude-3-sonnet"}
182
+ @pytest.mark.parametrize(
183
+ "completion_params",
184
+ [
185
+ {"model": "openai/gpt-4.1"},
186
+ {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
193
187
  ],
194
- mode="pointwise"
195
188
  )
196
- def test_math_reasoning(row: EvaluationRow) -> EvaluationRow:
197
- # Your evaluation logic here
198
- return row
189
+ @evaluation_test(
190
+ data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
191
+ rollout_processor=SingleTurnRolloutProcessor(),
192
+ )
193
+ async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
194
+ return await aha_judge(row)
199
195
  ```
200
196
 
197
+ Run it:
201
198
 
202
- ## 📚 Resources
199
+ ```bash
200
+ pytest -q -s
201
+ ```
203
202
 
204
- - **[Documentation](https://evalprotocol.io)** - Complete guides and API reference
205
- - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** - Community discussions
206
- - **[GitHub](https://github.com/eval-protocol/python-sdk)** - Source code and examples
203
+ The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
207
204
 
208
205
  ## Installation
209
206
 
210
- **This library requires Python >= 3.10.**
207
+ This library requires Python >= 3.10.
211
208
 
212
- ### Basic Installation
213
-
214
- Install with pip:
209
+ ### pip
215
210
 
216
211
  ```bash
217
212
  pip install eval-protocol
218
213
  ```
219
214
 
220
- ### Recommended Installation with uv
221
-
222
- For better dependency management and faster installs, we recommend using [uv](https://docs.astral.sh/uv/):
215
+ ### uv (recommended)
223
216
 
224
217
  ```bash
225
- # Install uv if you haven't already
218
+ # Install uv (if needed)
226
219
  curl -LsSf https://astral.sh/uv/install.sh | sh
227
220
 
228
- # Install eval-protocol
221
+ # Add to your project
229
222
  uv add eval-protocol
230
223
  ```
231
224
 
232
- ### Optional Dependencies
233
-
234
- Install with additional features:
235
-
236
- ```bash
237
- # For Langfuse integration
238
- pip install 'eval-protocol[langfuse]'
239
-
240
- # For HuggingFace datasets
241
- pip install 'eval-protocol[huggingface]'
242
-
243
- # For all adapters
244
- pip install 'eval-protocol[adapters]'
225
+ ## 📚 Resources
245
226
 
246
- # For development
247
- pip install 'eval-protocol[dev]'
248
- ```
227
+ - **[Documentation](https://evalprotocol.io)** – Guides and API reference
228
+ - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
229
+ - **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
249
230
 
250
231
  ## License
251
232
 
@@ -0,0 +1,115 @@
1
+ # Eval Protocol (EP)
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
4
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
5
+
6
+ **Stop guessing which AI model to use. Build a data-driven model leaderboard.**
7
+
8
+ With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
9
+
10
+ ## 🚀 Features
11
+
12
+ - **Pytest authoring**: `@evaluation_test` decorator to configure evaluations
13
+ - **Robust rollouts**: Handles flaky LLM APIs and parallel execution
14
+ - **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
15
+ - **Agent support**: LangGraph and Pydantic AI
16
+ - **MCP RL envs**: Build reinforcement learning environments with MCP
17
+ - **Built-in benchmarks**: AIME, tau-bench
18
+ - **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
19
+ - **Local UI**: Pivot/table views for real-time analysis
20
+
21
+ ## ⚡ Quickstart (no labels needed)
22
+
23
+ Install with your tracing platform extras and set API keys:
24
+
25
+ ```bash
26
+ pip install 'eval-protocol[langfuse]'
27
+
28
+ # Model API keys (set what you need)
29
+ export OPENAI_API_KEY=...
30
+ export FIREWORKS_API_KEY=...
31
+ export GEMINI_API_KEY=...
32
+
33
+ # Platform keys
34
+ export LANGFUSE_PUBLIC_KEY=...
35
+ export LANGFUSE_SECRET_KEY=...
36
+ export LANGFUSE_HOST=https://your-deployment.com # optional
37
+ ```
38
+
39
+ Minimal evaluation using the built-in AHA judge:
40
+
41
+ ```python
42
+ from datetime import datetime
43
+ import pytest
44
+
45
+ from eval_protocol import (
46
+ evaluation_test,
47
+ aha_judge,
48
+ EvaluationRow,
49
+ SingleTurnRolloutProcessor,
50
+ DynamicDataLoader,
51
+ create_langfuse_adapter,
52
+ )
53
+
54
+
55
+ def langfuse_data_generator() -> list[EvaluationRow]:
56
+ adapter = create_langfuse_adapter()
57
+ return adapter.get_evaluation_rows(
58
+ to_timestamp=datetime.utcnow(),
59
+ limit=20,
60
+ sample_size=5,
61
+ )
62
+
63
+
64
+ @pytest.mark.parametrize(
65
+ "completion_params",
66
+ [
67
+ {"model": "openai/gpt-4.1"},
68
+ {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
69
+ ],
70
+ )
71
+ @evaluation_test(
72
+ data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
73
+ rollout_processor=SingleTurnRolloutProcessor(),
74
+ )
75
+ async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
76
+ return await aha_judge(row)
77
+ ```
78
+
79
+ Run it:
80
+
81
+ ```bash
82
+ pytest -q -s
83
+ ```
84
+
85
+ The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
86
+
87
+ ## Installation
88
+
89
+ This library requires Python >= 3.10.
90
+
91
+ ### pip
92
+
93
+ ```bash
94
+ pip install eval-protocol
95
+ ```
96
+
97
+ ### uv (recommended)
98
+
99
+ ```bash
100
+ # Install uv (if needed)
101
+ curl -LsSf https://astral.sh/uv/install.sh | sh
102
+
103
+ # Add to your project
104
+ uv add eval-protocol
105
+ ```
106
+
107
+ ## 📚 Resources
108
+
109
+ - **[Documentation](https://evalprotocol.io)** – Guides and API reference
110
+ - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
111
+ - **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
112
+
113
+ ## License
114
+
115
+ [MIT](LICENSE)
@@ -22,6 +22,7 @@ from .mcp_env import (
22
22
  rollout,
23
23
  test_mcp,
24
24
  )
25
+ from .data_loader import DynamicDataLoader, InlineDataLoader
25
26
 
26
27
  # Try to import FireworksPolicy if available
27
28
  try:
@@ -32,13 +33,13 @@ except (ImportError, AttributeError):
32
33
  _FIREWORKS_AVAILABLE = False
33
34
  # Import submodules to make them available via eval_protocol.rewards, etc.
34
35
  from . import mcp, rewards
35
- from .models import EvaluateResult, Message, MetricResult, EvaluationRow
36
+ from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata
36
37
  from .playback_policy import PlaybackPolicyBase
37
38
  from .resources import create_llm_resource
38
39
  from .reward_function import RewardFunction
39
40
  from .typed_interface import reward_function
40
41
  from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
41
- from .pytest import evaluation_test, SingleTurnRolloutProcessor
42
+ from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
42
43
  from .pytest.parameterize import DefaultParameterIdGenerator
43
44
 
44
45
  try:
@@ -61,11 +62,23 @@ try:
61
62
  except ImportError:
62
63
  LangSmithAdapter = None
63
64
 
65
+ # Remote server types
66
+ from .types.remote_rollout_processor import (
67
+ InitRequest,
68
+ RolloutMetadata,
69
+ StatusResponse,
70
+ create_langfuse_config_tags,
71
+ )
64
72
 
65
73
  warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
66
74
 
67
75
  __all__ = [
76
+ "RemoteRolloutProcessor",
77
+ "InputMetadata",
78
+ "EvaluationRow",
68
79
  "DefaultParameterIdGenerator",
80
+ "DynamicDataLoader",
81
+ "InlineDataLoader",
69
82
  "aha_judge",
70
83
  "multi_turn_assistant_to_ground_truth",
71
84
  "assistant_to_ground_truth",
@@ -107,6 +120,11 @@ __all__ = [
107
120
  # Submodules
108
121
  "rewards",
109
122
  "mcp",
123
+ # Remote server types
124
+ "InitRequest",
125
+ "RolloutMetadata",
126
+ "StatusResponse",
127
+ "create_langfuse_config_tags",
110
128
  ]
111
129
 
112
130
  from . import _version
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-09-23T16:59:02-0700",
11
+ "date": "2025-09-26T11:40:07-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "bcc2d7c22085ccddf46d952e3481012250245d90",
15
- "version": "0.2.26"
14
+ "full-revisionid": "f9c6f1be71edb167cdbb8fc1d57e71ec3df4fe18",
15
+ "version": "0.2.28"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -12,7 +12,7 @@ from datetime import datetime, timedelta
12
12
  from typing import Any, Dict, List, Optional, Protocol, TYPE_CHECKING, cast
13
13
 
14
14
  from langfuse.api.resources.commons.types.observations_view import ObservationsView
15
- from eval_protocol.models import EvaluationRow, InputMetadata, Message
15
+ from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
16
16
  from .base import BaseAdapter
17
17
  from .utils import extract_messages_from_data
18
18
 
@@ -82,14 +82,41 @@ def convert_trace_to_evaluation_row(
82
82
  if not messages:
83
83
  return None
84
84
 
85
+ execution_metadata = ExecutionMetadata()
86
+ row_id = None
87
+
88
+ if trace.tags:
89
+ for tag in trace.tags:
90
+ if tag.startswith("invocation_id:"):
91
+ execution_metadata.invocation_id = tag.split(":", 1)[1]
92
+ elif tag.startswith("experiment_id:"):
93
+ execution_metadata.experiment_id = tag.split(":", 1)[1]
94
+ elif tag.startswith("rollout_id:"):
95
+ execution_metadata.rollout_id = tag.split(":", 1)[1]
96
+ elif tag.startswith("run_id:"):
97
+ execution_metadata.run_id = tag.split(":", 1)[1]
98
+ elif tag.startswith("row_id:"):
99
+ row_id = tag.split(":", 1)[1]
100
+
101
+ if (
102
+ execution_metadata.invocation_id
103
+ and execution_metadata.experiment_id
104
+ and execution_metadata.rollout_id
105
+ and execution_metadata.run_id
106
+ and row_id
107
+ ):
108
+ break # Break early if we've found all the metadata we need
109
+
85
110
  return EvaluationRow(
86
111
  messages=messages,
87
112
  tools=tools,
88
113
  input_metadata=InputMetadata(
114
+ row_id=row_id,
89
115
  session_data={
90
116
  "langfuse_trace_id": trace.id, # Store the trace ID here
91
- }
117
+ },
92
118
  ),
119
+ execution_metadata=execution_metadata,
93
120
  )
94
121
 
95
122
  except (AttributeError, ValueError, KeyError) as e:
@@ -259,9 +286,6 @@ class LangfuseAdapter(BaseAdapter):
259
286
  max_retries: int = 3,
260
287
  span_name: Optional[str] = None,
261
288
  converter: Optional[TraceConverter] = None,
262
- metadata: Optional[Dict[str, Any]] = None,
263
- requester_metadata: Optional[Dict[str, Any]] = None,
264
- requester_metadata_contains: Optional[str] = None,
265
289
  ) -> List[EvaluationRow]:
266
290
  """Pull traces from Langfuse and convert to EvaluationRow format.
267
291
 
@@ -296,10 +320,6 @@ class LangfuseAdapter(BaseAdapter):
296
320
  to_timestamp = datetime.now()
297
321
  from_timestamp = to_timestamp - timedelta(hours=hours_back)
298
322
 
299
- # If filtering by metadata/requester_metadata, prefer fetching metadata fields
300
- if (metadata is not None or requester_metadata is not None or requester_metadata_contains) and not fields:
301
- fields = "core,metadata,observations"
302
-
303
323
  # Collect trace summaries via pagination (up to limit)
304
324
  all_traces = []
305
325
  page = 1
@@ -332,16 +352,18 @@ class LangfuseAdapter(BaseAdapter):
332
352
  to_timestamp=to_timestamp,
333
353
  order_by="timestamp.desc",
334
354
  )
355
+
356
+ # If no results, possibly due to an indexing delay: the remote rollout processor may have only just finished pushing rows to Langfuse
357
+ if traces and traces.meta and traces.meta.total_items == 0 and page == 1:
358
+ raise Exception("Empty results - indexing delay")
359
+
335
360
  break
336
361
  except Exception as e:
337
362
  list_retries += 1
338
- if "429" in str(e) and list_retries < max_retries:
363
+ if list_retries < max_retries and ("429" in str(e) or "Empty results" in str(e)):
339
364
  sleep_time = 2**list_retries # Exponential backoff
340
365
  logger.warning(
341
- "Rate limit hit on trace.list(), retrying in %ds (attempt %d/%d)",
342
- sleep_time,
343
- list_retries,
344
- max_retries,
366
+ "Retrying in %ds (attempt %d/%d): %s", sleep_time, list_retries, max_retries, str(e)
345
367
  )
346
368
  time.sleep(sleep_time)
347
369
  else:
@@ -379,74 +401,6 @@ class LangfuseAdapter(BaseAdapter):
379
401
  selected_traces = all_traces
380
402
  logger.debug("Processing all %d collected traces (no sampling)", len(all_traces))
381
403
 
382
- # Helper to check if a trace matches provided metadata filters. We look in multiple places
383
- # to account for Langfuse moving fields (e.g., metadata vs requester_metadata) and SDK shape.
384
- def _trace_matches_metadata_filters(trace_obj: Any) -> bool:
385
- if metadata is None and requester_metadata is None:
386
- return True
387
-
388
- def _as_dict(val: Any) -> Dict[str, Any]:
389
- if val is None:
390
- return {}
391
- if isinstance(val, dict):
392
- return val
393
- # Some SDK objects expose .model_dump() or behave like pydantic models
394
- dump = getattr(val, "model_dump", None)
395
- if callable(dump):
396
- try:
397
- return dump() # type: ignore[no-any-return]
398
- except Exception:
399
- return {}
400
- return {}
401
-
402
- # Try common locations for metadata on full trace
403
- trace_meta = _as_dict(getattr(trace_obj, "metadata", None))
404
- trace_req_meta = _as_dict(getattr(trace_obj, "requester_metadata", None))
405
- # Some Langfuse deployments nest requester_metadata inside metadata
406
- nested_req_meta = {}
407
- try:
408
- if isinstance(trace_meta, dict) and isinstance(trace_meta.get("requester_metadata"), dict):
409
- nested_req_meta = _as_dict(trace_meta.get("requester_metadata"))
410
- except Exception:
411
- nested_req_meta = {}
412
-
413
- # Fallbacks: sometimes metadata is embedded in input
414
- input_meta = {}
415
- try:
416
- inp = getattr(trace_obj, "input", None)
417
- if isinstance(inp, dict):
418
- input_meta = _as_dict(inp.get("metadata"))
419
- except Exception:
420
- input_meta = {}
421
-
422
- # Combine for matching convenience (later keys override earlier for equality check only)
423
- combined_meta = {**trace_meta, **input_meta}
424
- combined_req_meta = {**trace_req_meta}
425
-
426
- # Also merge nested requester metadata when present
427
- if nested_req_meta:
428
- combined_req_meta = {**combined_req_meta, **nested_req_meta}
429
-
430
- def _is_subset(needle: Dict[str, Any], haystack: Dict[str, Any]) -> bool:
431
- for k, v in needle.items():
432
- if haystack.get(k) != v:
433
- return False
434
- return True
435
-
436
- ok_meta = True
437
- ok_req_meta = True
438
-
439
- if metadata is not None:
440
- # Accept match if found either in metadata or requester_metadata buckets
441
- ok_meta = _is_subset(metadata, combined_meta) or _is_subset(metadata, combined_req_meta)
442
-
443
- if requester_metadata is not None:
444
- ok_req_meta = _is_subset(requester_metadata, combined_req_meta) or _is_subset(
445
- requester_metadata, combined_meta
446
- )
447
-
448
- return ok_meta and ok_req_meta
449
-
450
404
  # Process each selected trace with sleep and retry logic
451
405
  for trace_info in selected_traces:
452
406
  # Sleep between gets to avoid rate limits
@@ -483,39 +437,6 @@ class LangfuseAdapter(BaseAdapter):
483
437
  break # Skip this trace
484
438
 
485
439
  if trace_full:
486
- # If metadata filters are provided, skip non-matching traces early
487
- try:
488
- if not _trace_matches_metadata_filters(trace_full):
489
- continue
490
- except Exception:
491
- # Be permissive on filter errors; treat as non-match
492
- continue
493
-
494
- # If observations carry requester_metadata, allow substring filtering
495
- if requester_metadata_contains:
496
- contains_val = requester_metadata_contains
497
- found_match = False
498
- try:
499
- for obs in getattr(trace_full, "observations", []) or []:
500
- obs_rmd = getattr(obs, "requester_metadata", None)
501
- if isinstance(obs_rmd, dict) and any(
502
- (isinstance(v, str) and contains_val in v) for v in obs_rmd.values()
503
- ):
504
- found_match = True
505
- break
506
- obs_md = getattr(obs, "metadata", None)
507
- if isinstance(obs_md, dict):
508
- nested = obs_md.get("requester_metadata")
509
- if isinstance(nested, dict) and any(
510
- (isinstance(v, str) and contains_val in v) for v in nested.values()
511
- ):
512
- found_match = True
513
- break
514
- except Exception:
515
- found_match = False
516
- if not found_match:
517
- continue
518
-
519
440
  try:
520
441
  if converter:
521
442
  eval_row = converter(trace_full, include_tool_calls, span_name)
@@ -0,0 +1,4 @@
1
+ from .dynamic_data_loader import DynamicDataLoader
2
+ from .inline_data_loader import InlineDataLoader
3
+
4
+ __all__ = ["DynamicDataLoader", "InlineDataLoader"]
@@ -0,0 +1,38 @@
1
+ from collections.abc import Callable, Sequence
2
+ from dataclasses import dataclass
3
+
4
+ from eval_protocol.data_loader.models import (
5
+ DataLoaderResult,
6
+ DataLoaderVariant,
7
+ EvaluationDataLoader,
8
+ )
9
+ from eval_protocol.models import EvaluationRow
10
+
11
+
12
@dataclass(kw_only=True)
class DynamicDataLoader(EvaluationDataLoader):
    """Data loader for dynamic data generation.

    Every callable in ``generators`` is exposed as one variant: a
    zero-argument loader that calls that generator and wraps the rows it
    returns in a ``DataLoaderResult``.
    """

    generators: Sequence[Callable[[], list[EvaluationRow]]]
    """Dynamic data generation functions. These callables are invoked each time data
    needs to be loaded, allowing for dynamic data generation, lazy loading, or data that
    changes between evaluation runs. Each function should return a list of EvaluationRow
    objects. This is useful for scenarios like generating test data on-the-fly, loading
    data from external sources, or creating data with randomized elements for robust testing."""

    def variants(self) -> Sequence[DataLoaderVariant]:
        """Return one loader callable per generator, in the order given.

        Returns:
            A list of zero-argument callables. Calling one invokes its own
            generator and returns a ``DataLoaderResult`` whose ``variant_id``
            and ``variant_description`` come from that generator's
            ``__name__`` and ``__doc__``.
        """

        def _bind(generator: Callable[[], list[EvaluationRow]]) -> DataLoaderVariant:
            # A closure defined directly inside the ``for`` loop would look up
            # the loop variable when it is *called*, so every variant would end
            # up running the last generator. Taking the generator as a
            # parameter gives each closure its own binding.
            def _load() -> DataLoaderResult:
                resolved_rows = generator()
                return DataLoaderResult(
                    rows=resolved_rows,
                    type=self.__class__.__name__,
                    variant_id=generator.__name__,
                    variant_description=generator.__doc__,
                )

            return _load

        return [_bind(generator) for generator in self.generators]