eval-protocol 0.2.11.dev1__tar.gz → 0.2.98.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (508)
  1. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/LICENSE +1 -1
  2. eval_protocol-0.2.98.dev1/PKG-INFO +156 -0
  3. eval_protocol-0.2.98.dev1/README.md +39 -0
  4. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/normalize_sandbox_fusion.py +9 -10
  5. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/utils/subprocess_manager.py +1 -1
  6. eval_protocol-0.2.98.dev1/eval_protocol/__init__.py +178 -0
  7. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/_version.py +3 -3
  8. eval_protocol-0.2.98.dev1/eval_protocol/adapters/__init__.py +101 -0
  9. eval_protocol-0.2.98.dev1/eval_protocol/adapters/base.py +25 -0
  10. eval_protocol-0.2.98.dev1/eval_protocol/adapters/bigquery.py +304 -0
  11. eval_protocol-0.2.98.dev1/eval_protocol/adapters/braintrust.py +315 -0
  12. eval_protocol-0.2.98.dev1/eval_protocol/adapters/fireworks_tracing.py +453 -0
  13. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/adapters/huggingface.py +5 -12
  14. eval_protocol-0.2.98.dev1/eval_protocol/adapters/langchain.py +214 -0
  15. eval_protocol-0.2.98.dev1/eval_protocol/adapters/langfuse.py +552 -0
  16. eval_protocol-0.2.98.dev1/eval_protocol/adapters/langsmith.py +413 -0
  17. eval_protocol-0.2.98.dev1/eval_protocol/adapters/openai_responses.py +216 -0
  18. eval_protocol-0.2.98.dev1/eval_protocol/adapters/utils.py +98 -0
  19. eval_protocol-0.2.98.dev1/eval_protocol/adapters/weave.py +130 -0
  20. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/orchestrator.py +53 -52
  21. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +1 -2
  22. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/docker_resource.py +10 -13
  23. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/task_manager.py +19 -10
  24. eval_protocol-0.2.98.dev1/eval_protocol/auth.py +331 -0
  25. eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/data/airline_dataset.jsonl +50 -0
  26. eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/data/retail_dataset.jsonl +114 -0
  27. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/aime25.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_aime25.py +24 -8
  28. eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_frozen_lake.py +80 -0
  29. eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_glm_streaming_compliance.py +3477 -0
  30. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/gpqa.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_gpqa.py +49 -21
  31. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/livebench_data_analysis.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_livebench_data_analysis.py +76 -33
  32. eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_tau_bench_airline.py +304 -0
  33. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/tau_bench_retail.py → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks/test_tau_bench_retail.py +71 -18
  34. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli.py +306 -25
  35. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +1 -5
  36. eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/create_rft.py +880 -0
  37. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/deploy.py +34 -11
  38. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/deploy_mcp.py +7 -4
  39. eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/local_test.py +212 -0
  40. eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/logs.py +57 -0
  41. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/preview.py +3 -3
  42. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +2 -1
  43. eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/upload.py +306 -0
  44. eval_protocol-0.2.98.dev1/eval_protocol/cli_commands/utils.py +511 -0
  45. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/common_utils.py +17 -0
  46. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/config.py +3 -3
  47. eval_protocol-0.2.98.dev1/eval_protocol/data_loader/__init__.py +5 -0
  48. eval_protocol-0.2.98.dev1/eval_protocol/data_loader/dynamic_data_loader.py +38 -0
  49. eval_protocol-0.2.98.dev1/eval_protocol/data_loader/factory_data_loader.py +38 -0
  50. eval_protocol-0.2.98.dev1/eval_protocol/data_loader/inline_data_loader.py +68 -0
  51. eval_protocol-0.2.98.dev1/eval_protocol/data_loader/jsonl_data_loader.py +42 -0
  52. eval_protocol-0.2.98.dev1/eval_protocol/data_loader/models.py +129 -0
  53. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/__init__.py +9 -2
  54. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +8 -1
  55. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +7 -5
  56. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/datasets/loader.py +3 -5
  57. eval_protocol-0.2.98.dev1/eval_protocol/directory_utils.py +39 -0
  58. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/evaluation.py +499 -73
  59. eval_protocol-0.2.98.dev1/eval_protocol/event_bus/__init__.py +38 -0
  60. eval_protocol-0.2.98.dev1/eval_protocol/event_bus/sqlite_event_bus.py +126 -0
  61. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/event_bus/sqlite_event_bus_database.py +6 -8
  62. eval_protocol-0.2.98.dev1/eval_protocol/exceptions.py +184 -0
  63. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/execution/pipeline.py +51 -17
  64. eval_protocol-0.2.98.dev1/eval_protocol/fireworks_rft.py +249 -0
  65. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/gcp_tools.py +3 -3
  66. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generation/clients.py +5 -2
  67. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generic_server.py +1 -1
  68. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/get_pep440_version.py +9 -1
  69. eval_protocol-0.2.98.dev1/eval_protocol/human_id/__init__.py +77 -0
  70. eval_protocol-0.2.98.dev1/eval_protocol/integrations/__init__.py +7 -0
  71. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/integrations/deepeval.py +11 -2
  72. eval_protocol-0.2.98.dev1/eval_protocol/integrations/openai_rft.py +190 -0
  73. eval_protocol-0.2.98.dev1/eval_protocol/integrations/tinker_cookbook.py +197 -0
  74. eval_protocol-0.2.98.dev1/eval_protocol/integrations/tinker_rollout_processor.py +170 -0
  75. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/integrations/trl.py +1 -1
  76. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/elasticsearch_client.py +338 -0
  77. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +160 -0
  78. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/elasticsearch_index_manager.py +168 -0
  79. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/fireworks_tracing_http_handler.py +138 -0
  80. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/init.py +69 -0
  81. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/rollout_context.py +84 -0
  82. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/rollout_id_filter.py +28 -0
  83. eval_protocol-0.2.98.dev1/eval_protocol/log_utils/util.py +22 -0
  84. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/client/connection.py +46 -29
  85. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/clients.py +7 -5
  86. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/base_policy.py +7 -1
  87. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/manager.py +155 -73
  88. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/policy.py +57 -29
  89. eval_protocol-0.2.98.dev1/eval_protocol/mcp/execution/vllm_policy.py +186 -0
  90. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/mcp_multi_client.py +62 -16
  91. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/mcpgym.py +44 -19
  92. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/session/manager.py +1 -1
  93. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/simple_process_manager.py +2 -2
  94. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/simulation_server.py +30 -8
  95. eval_protocol-0.2.98.dev1/eval_protocol/mcp_agent/main.py +18 -0
  96. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +13 -4
  97. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_env.py +25 -10
  98. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +160 -0
  99. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +102 -0
  100. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/frozen_lake/server.py +57 -0
  101. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/README.md +250 -0
  102. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/__init__.py +61 -0
  103. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +107 -0
  104. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +100 -0
  105. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +112 -0
  106. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/server.py +83 -0
  107. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tau2_mcp.py +767 -0
  108. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +178 -0
  109. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +18 -0
  110. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +147 -0
  111. eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +1689 -0
  112. eval_protocol-0.2.98.dev1/eval_protocol/models.py +1192 -0
  113. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/platform_api.py +35 -16
  114. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/playback_policy.py +14 -38
  115. eval_protocol-0.2.98.dev1/eval_protocol/proxy/__init__.py +18 -0
  116. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/__init__.py +13 -0
  117. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/app.py +305 -0
  118. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/auth.py +17 -0
  119. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/langfuse.py +546 -0
  120. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/litellm.py +173 -0
  121. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/main.py +10 -0
  122. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/models.py +98 -0
  123. eval_protocol-0.2.98.dev1/eval_protocol/proxy/proxy_core/redis_utils.py +57 -0
  124. eval_protocol-0.2.98.dev1/eval_protocol/pytest/__init__.py +55 -0
  125. eval_protocol-0.2.98.dev1/eval_protocol/pytest/buffer.py +82 -0
  126. eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_agent_rollout_processor.py +284 -0
  127. eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_dataset_adapter.py +9 -0
  128. eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_langchain_rollout_processor.py +159 -0
  129. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +110 -43
  130. eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +163 -0
  131. eval_protocol-0.2.98.dev1/eval_protocol/pytest/default_single_turn_rollout_process.py +197 -0
  132. eval_protocol-0.2.98.dev1/eval_protocol/pytest/dual_mode_wrapper.py +78 -0
  133. eval_protocol-0.2.98.dev1/eval_protocol/pytest/elasticsearch_setup.py +167 -0
  134. eval_protocol-0.2.98.dev1/eval_protocol/pytest/evaluation_test.py +778 -0
  135. eval_protocol-0.2.98.dev1/eval_protocol/pytest/evaluation_test_postprocess.py +208 -0
  136. eval_protocol-0.2.98.dev1/eval_protocol/pytest/evaluation_test_utils.py +613 -0
  137. eval_protocol-0.2.98.dev1/eval_protocol/pytest/exception_config.py +151 -0
  138. eval_protocol-0.2.98.dev1/eval_protocol/pytest/execution.py +111 -0
  139. eval_protocol-0.2.98.dev1/eval_protocol/pytest/generate_parameter_combinations.py +145 -0
  140. eval_protocol-0.2.98.dev1/eval_protocol/pytest/github_action_rollout_processor.py +225 -0
  141. eval_protocol-0.2.98.dev1/eval_protocol/pytest/handle_persist_flow.py +225 -0
  142. eval_protocol-0.2.98.dev1/eval_protocol/pytest/integrations/openenv_trl_vllm.py +473 -0
  143. eval_protocol-0.2.98.dev1/eval_protocol/pytest/openenv_rollout_processor.py +585 -0
  144. eval_protocol-0.2.98.dev1/eval_protocol/pytest/parameterize.py +424 -0
  145. eval_protocol-0.2.98.dev1/eval_protocol/pytest/plugin.py +413 -0
  146. eval_protocol-0.2.98.dev1/eval_protocol/pytest/priority_scheduler.py +348 -0
  147. eval_protocol-0.2.98.dev1/eval_protocol/pytest/remote_rollout_processor.py +207 -0
  148. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/pytest/rollout_processor.py +5 -2
  149. eval_protocol-0.2.98.dev1/eval_protocol/pytest/rollout_result_post_processor.py +57 -0
  150. eval_protocol-0.2.98.dev1/eval_protocol/pytest/store_experiment_link.py +41 -0
  151. eval_protocol-0.2.98.dev1/eval_protocol/pytest/store_results_url.py +49 -0
  152. eval_protocol-0.2.98.dev1/eval_protocol/pytest/tracing_utils.py +185 -0
  153. eval_protocol-0.2.98.dev1/eval_protocol/pytest/types.py +79 -0
  154. eval_protocol-0.2.98.dev1/eval_protocol/pytest/validate_signature.py +69 -0
  155. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/__init__.py +8 -0
  156. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/__init__.py +4 -0
  157. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge.py +90 -0
  158. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +63 -0
  159. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +58 -0
  160. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +82 -0
  161. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +66 -0
  162. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/aha_judge/utils.py +133 -0
  163. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/llm_judge.py +90 -0
  164. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/llm_judge_braintrust.py +63 -0
  165. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +223 -0
  166. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/svg_agent/evaluator/utils.py +523 -0
  167. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +202 -0
  168. eval_protocol-0.2.98.dev1/eval_protocol/quickstart/utils.py +251 -0
  169. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/resources.py +2 -2
  170. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/accuracy.py +28 -3
  171. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/accuracy_length.py +19 -6
  172. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/apps_coding_reward.py +2 -2
  173. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/apps_execution_utils.py +1 -1
  174. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/apps_testing_util.py +8 -3
  175. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/bfcl_reward.py +3 -2
  176. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/code_execution.py +20 -6
  177. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/cpp_code.py +2 -2
  178. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/deepcoder_reward.py +8 -3
  179. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/format.py +5 -2
  180. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/function_calling.py +3 -1
  181. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/json_schema.py +36 -6
  182. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/language_consistency.py +25 -10
  183. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/lean_prover.py +14 -11
  184. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/length.py +6 -4
  185. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +6 -1
  186. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/math.py +22 -17
  187. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +12 -2
  188. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/reasoning_steps.py +2 -2
  189. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/repetition.py +28 -4
  190. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/tag_count.py +28 -5
  191. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/server.py +5 -5
  192. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/stats/__init__.py +0 -2
  193. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/stats/confidence_intervals.py +8 -10
  194. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/typed_interface.py +58 -12
  195. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/types/__init__.py +2 -1
  196. eval_protocol-0.2.98.dev1/eval_protocol/types/errors.py +11 -0
  197. eval_protocol-0.2.98.dev1/eval_protocol/types/remote_rollout_processor.py +87 -0
  198. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/types/types.py +8 -4
  199. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/batch_transformation.py +1 -1
  200. eval_protocol-0.2.98.dev1/eval_protocol/utils/browser_utils.py +114 -0
  201. eval_protocol-0.2.98.dev1/eval_protocol/utils/check_server_status.py +77 -0
  202. eval_protocol-0.2.98.dev1/eval_protocol/utils/evaluation_row_utils.py +158 -0
  203. eval_protocol-0.2.98.dev1/eval_protocol/utils/logs_models.py +45 -0
  204. eval_protocol-0.2.98.dev1/eval_protocol/utils/logs_server.py +720 -0
  205. eval_protocol-0.2.98.dev1/eval_protocol/utils/show_results_url.py +74 -0
  206. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/static_policy.py +7 -7
  207. eval_protocol-0.2.98.dev1/eval_protocol/utils/subprocess_utils.py +118 -0
  208. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/vite_server.py +3 -3
  209. eval_protocol-0.2.98.dev1/eval_protocol.egg-info/PKG-INFO +156 -0
  210. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/SOURCES.txt +143 -17
  211. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/requires.txt +53 -24
  212. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/pyproject.toml +96 -36
  213. eval_protocol-0.2.98.dev1/tests/test_adapters_e2e.py +765 -0
  214. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_agent_resources.py +6 -6
  215. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_auth.py +10 -5
  216. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_batch_evaluation.py +6 -12
  217. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cli_agent.py +2 -4
  218. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cli_args.py +0 -1
  219. eval_protocol-0.2.98.dev1/tests/test_cli_create_rft.py +1233 -0
  220. eval_protocol-0.2.98.dev1/tests/test_cli_local_test.py +285 -0
  221. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_code_execution.py +4 -4
  222. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_data_driven_task_manager.py +0 -5
  223. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_deepeval_integration.py +20 -4
  224. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_deploy_integration.py +0 -1
  225. eval_protocol-0.2.98.dev1/tests/test_directory_utils.py +95 -0
  226. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_e2b_integration.py +1 -1
  227. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_e2b_js_integration.py +1 -1
  228. eval_protocol-0.2.98.dev1/tests/test_ep_upload_e2e.py +647 -0
  229. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_eval_protocol_import.py +35 -39
  230. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_evaluation.py +71 -34
  231. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_evaluation_integration.py +123 -29
  232. eval_protocol-0.2.98.dev1/tests/test_evaluation_postprocess.py +526 -0
  233. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_evaluation_preview_integration.py +113 -53
  234. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_event_bus.py +75 -39
  235. eval_protocol-0.2.98.dev1/tests/test_event_bus_helper.py +73 -0
  236. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_examples_end_to_end.py +0 -1
  237. eval_protocol-0.2.98.dev1/tests/test_exception_config.py +114 -0
  238. eval_protocol-0.2.98.dev1/tests/test_exceptions.py +385 -0
  239. eval_protocol-0.2.98.dev1/tests/test_fireworks_api.py +68 -0
  240. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_function_calling.py +6 -6
  241. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_gcp_tools.py +0 -2
  242. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_generic_server.py +1 -2
  243. eval_protocol-0.2.98.dev1/tests/test_human_id.py +94 -0
  244. eval_protocol-0.2.98.dev1/tests/test_litellm_policy_provider_fields.py +95 -0
  245. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_logs_server.py +20 -9
  246. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_math.py +5 -2
  247. eval_protocol-0.2.98.dev1/tests/test_message_field_filtering.py +64 -0
  248. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_models.py +363 -1
  249. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_models_rl.py +1 -4
  250. eval_protocol-0.2.98.dev1/tests/test_openai_rft_integration.py +66 -0
  251. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_packaging.py +0 -1
  252. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_parallel_rollouts.py +2 -2
  253. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_platform_api.py +0 -1
  254. eval_protocol-0.2.98.dev1/tests/test_priority_scheduler.py +322 -0
  255. eval_protocol-0.2.98.dev1/tests/test_quickstart_utils.py +388 -0
  256. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_readiness.py +0 -2
  257. eval_protocol-0.2.98.dev1/tests/test_retry_mechanism.py +485 -0
  258. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_reward_protocol_import.py +35 -39
  259. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_rl_processing.py +1 -4
  260. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_rollout_control_plane_integration.py +14 -14
  261. eval_protocol-0.2.98.dev1/tests/test_show_results_url.py +336 -0
  262. eval_protocol-0.2.98.dev1/tests/test_status_migration_changes.py +440 -0
  263. eval_protocol-0.2.98.dev1/tests/test_status_migration_integration.py +388 -0
  264. eval_protocol-0.2.98.dev1/tests/test_status_model.py +360 -0
  265. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_typed_interface_rl.py +0 -1
  266. eval_protocol-0.2.98.dev1/tests/test_upload_entrypoint.py +227 -0
  267. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_url_handling.py +1 -0
  268. eval_protocol-0.2.98.dev1/vendor/tau2/__init__.py +21 -0
  269. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/agent/base.py +1 -3
  270. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/agent/llm_agent.py +9 -9
  271. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/cli.py +1 -3
  272. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/config.py +1 -1
  273. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/airline/policy.md +167 -0
  274. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/mock/policy.md +7 -0
  275. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/mock/policy_solo.md +6 -0
  276. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/retail/policy.md +136 -0
  277. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/main_policy.md +159 -0
  278. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/main_policy_solo.md +155 -0
  279. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/tech_support_manual.md +206 -0
  280. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/tech_support_workflow.md +303 -0
  281. eval_protocol-0.2.98.dev1/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +299 -0
  282. eval_protocol-0.2.98.dev1/vendor/tau2/data/user_simulator/simulation_guidelines.md +18 -0
  283. eval_protocol-0.2.98.dev1/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +30 -0
  284. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/data_model/message.py +14 -44
  285. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/data_model/simulation.py +14 -44
  286. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/data_model/tasks.py +20 -75
  287. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/data_model.py +22 -66
  288. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/tools.py +19 -60
  289. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/data_model.py +2 -6
  290. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/environment.py +1 -3
  291. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/tools.py +1 -3
  292. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/data_model.py +24 -72
  293. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/tools.py +8 -25
  294. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/data_model.py +26 -78
  295. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/environment.py +3 -9
  296. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/const.py +1 -1
  297. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +3 -3
  298. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +6 -20
  299. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +2 -6
  300. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +1 -3
  301. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tools.py +11 -36
  302. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/user_data_model.py +18 -56
  303. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/user_tools.py +29 -73
  304. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/environment.py +3 -3
  305. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/server.py +7 -11
  306. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/tool.py +3 -9
  307. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/toolkit.py +4 -12
  308. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/utils/interface_agent.py +4 -12
  309. eval_protocol-0.2.98.dev1/vendor/tau2/evaluator/__init__.py +0 -0
  310. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator.py +1 -3
  311. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_action.py +1 -4
  312. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +2 -6
  313. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_env.py +5 -19
  314. eval_protocol-0.2.98.dev1/vendor/tau2/metrics/__init__.py +0 -0
  315. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/metrics/agent_metrics.py +2 -6
  316. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/metrics/break_down_metrics.py +6 -20
  317. eval_protocol-0.2.98.dev1/vendor/tau2/orchestrator/__init__.py +0 -0
  318. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/orchestrator/environment_manager.py +5 -15
  319. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/orchestrator/orchestrator.py +23 -85
  320. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/orchestrator/utils.py +1 -3
  321. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/registry.py +23 -27
  322. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/run.py +23 -33
  323. eval_protocol-0.2.98.dev1/vendor/tau2/scripts/__init__.py +0 -0
  324. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/show_domain_doc.py +1 -3
  325. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/start_servers.py +2 -6
  326. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/view_simulations.py +20 -50
  327. eval_protocol-0.2.98.dev1/vendor/tau2/user/__init__.py +0 -0
  328. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/user/base.py +3 -9
  329. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/user/user_simulator.py +3 -3
  330. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/display.py +13 -45
  331. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/pydantic_utils.py +1 -3
  332. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/utils.py +2 -8
  333. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/versioneer.py +1 -3
  334. eval_protocol-0.2.98.dev1/vite-app/dist/assets/index-CuQbfdPD.js +46 -0
  335. eval_protocol-0.2.98.dev1/vite-app/dist/assets/index-CuQbfdPD.js.map +1 -0
  336. eval_protocol-0.2.98.dev1/vite-app/dist/assets/index-iZp_HgyW.css +1 -0
  337. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vite-app/dist/index.html +2 -2
  338. eval_protocol-0.2.11.dev1/PKG-INFO +0 -173
  339. eval_protocol-0.2.11.dev1/README.md +0 -75
  340. eval_protocol-0.2.11.dev1/eval_protocol/__init__.py +0 -82
  341. eval_protocol-0.2.11.dev1/eval_protocol/adapters/__init__.py +0 -47
  342. eval_protocol-0.2.11.dev1/eval_protocol/adapters/braintrust.py +0 -8
  343. eval_protocol-0.2.11.dev1/eval_protocol/adapters/langfuse.py +0 -392
  344. eval_protocol-0.2.11.dev1/eval_protocol/auth.py +0 -156
  345. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/__init__.py +0 -9
  346. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/registry.py +0 -330
  347. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/run.py +0 -100
  348. eval_protocol-0.2.11.dev1/eval_protocol/benchmarks/suites/__init__.py +0 -3
  349. eval_protocol-0.2.11.dev1/eval_protocol/cli_commands/logs.py +0 -30
  350. eval_protocol-0.2.11.dev1/eval_protocol/directory_utils.py +0 -55
  351. eval_protocol-0.2.11.dev1/eval_protocol/event_bus/__init__.py +0 -5
  352. eval_protocol-0.2.11.dev1/eval_protocol/event_bus/sqlite_event_bus.py +0 -109
  353. eval_protocol-0.2.11.dev1/eval_protocol/human_id/__init__.py +0 -35
  354. eval_protocol-0.2.11.dev1/eval_protocol/integrations/__init__.py +0 -12
  355. eval_protocol-0.2.11.dev1/eval_protocol/integrations/braintrust.py +0 -51
  356. eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/intermediary_server.py +0 -542
  357. eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/main.py +0 -210
  358. eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -304
  359. eval_protocol-0.2.11.dev1/eval_protocol/mcp_agent/session.py +0 -79
  360. eval_protocol-0.2.11.dev1/eval_protocol/models.py +0 -563
  361. eval_protocol-0.2.11.dev1/eval_protocol/pytest/__init__.py +0 -19
  362. eval_protocol-0.2.11.dev1/eval_protocol/pytest/default_agent_rollout_processor.py +0 -158
  363. eval_protocol-0.2.11.dev1/eval_protocol/pytest/default_dataset_adapter.py +0 -10
  364. eval_protocol-0.2.11.dev1/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -118
  365. eval_protocol-0.2.11.dev1/eval_protocol/pytest/evaluation_test.py +0 -962
  366. eval_protocol-0.2.11.dev1/eval_protocol/pytest/plugin.py +0 -161
  367. eval_protocol-0.2.11.dev1/eval_protocol/pytest/types.py +0 -52
  368. eval_protocol-0.2.11.dev1/eval_protocol/pytest/utils.py +0 -363
  369. eval_protocol-0.2.11.dev1/eval_protocol/utils/logs_server.py +0 -388
  370. eval_protocol-0.2.11.dev1/eval_protocol.egg-info/PKG-INFO +0 -173
  371. eval_protocol-0.2.11.dev1/tests/test_adapters_e2e.py +0 -447
  372. eval_protocol-0.2.11.dev1/tests/test_braintrust_adapter.py +0 -34
  373. eval_protocol-0.2.11.dev1/tests/test_braintrust_example.py +0 -49
  374. eval_protocol-0.2.11.dev1/tests/test_fireworks_api.py +0 -66
  375. eval_protocol-0.2.11.dev1/tests/test_retry_mechanism.py +0 -157
  376. eval_protocol-0.2.11.dev1/vendor/tau2/__init__.py +0 -1
  377. eval_protocol-0.2.11.dev1/vendor/tau2/data_model/__init__.py +0 -1
  378. eval_protocol-0.2.11.dev1/vendor/tau2/evaluator/__init__.py +0 -1
  379. eval_protocol-0.2.11.dev1/vite-app/dist/assets/index-D1ErODUS.js +0 -93
  380. eval_protocol-0.2.11.dev1/vite-app/dist/assets/index-D1ErODUS.js.map +0 -1
  381. eval_protocol-0.2.11.dev1/vite-app/dist/assets/index-D5KxcfFQ.css +0 -1
  382. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/__init__.py +0 -0
  383. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/utils/__init__.py +0 -0
  384. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/development/utils/generate_api_key.py +0 -0
  385. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/__main__.py +0 -0
  386. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/adapters/trl.py +0 -0
  387. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/__init__.py +0 -0
  388. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/models.py +0 -0
  389. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resource_abc.py +0 -0
  390. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resource_pool.py +0 -0
  391. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
  392. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  393. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  394. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  395. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  396. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  397. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  398. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
  399. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/agent/tool_registry.py +0 -0
  400. {eval_protocol-0.2.11.dev1/vendor/tau2/agent → eval_protocol-0.2.98.dev1/eval_protocol/benchmarks}/__init__.py +0 -0
  401. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
  402. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/cli_commands/common.py +0 -0
  403. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  404. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  405. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/datasets/__init__.py +0 -0
  406. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/event_bus/event_bus.py +0 -0
  407. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/event_bus/logger.py +0 -0
  408. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/execution/__init__.py +0 -0
  409. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generation/cache.py +0 -0
  410. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/generation/clients/base.py +0 -0
  411. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/human_id/dictionary.py +0 -0
  412. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/integrations/openeval.py +0 -0
  413. {eval_protocol-0.2.11.dev1/vendor/tau2/domains → eval_protocol-0.2.98.dev1/eval_protocol/log_utils}/__init__.py +0 -0
  414. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/logging_utils.py +0 -0
  415. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/__init__.py +0 -0
  416. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/adapter.py +0 -0
  417. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
  418. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
  419. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
  420. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/process_manager.py +0 -0
  421. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
  422. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
  423. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/config.py +0 -0
  424. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  425. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  426. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  427. {eval_protocol-0.2.11.dev1/vendor/tau2/domains/telecom/tasks → eval_protocol-0.2.98.dev1/eval_protocol/mcp_servers}/__init__.py +0 -0
  428. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/packaging.py +0 -0
  429. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  430. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/reward_function.py +0 -0
  431. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/__init__.py +0 -0
  432. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
  433. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/rl_processing.py +0 -0
  434. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/__init__.py +0 -0
  435. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
  436. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
  437. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/module_loader.py +0 -0
  438. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
  439. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
  440. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
  441. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
  442. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/setup.cfg +0 -0
  443. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/setup.py +0 -0
  444. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_accuracy.py +0 -0
  445. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_accuracy_length.py +0 -0
  446. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_agent_orchestrator.py +0 -0
  447. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cli.py +0 -0
  448. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_config.py +0 -0
  449. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_control_plane_separation.py +0 -0
  450. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_cpp_code.py +0 -0
  451. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_deepcoder_reward.py +0 -0
  452. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_edge_cases.py +0 -0
  453. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_format.py +0 -0
  454. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_fractional_code.py +0 -0
  455. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_integration.py +0 -0
  456. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_json_schema.py +0 -0
  457. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_kwargs_validation.py +0 -0
  458. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_language_consistency.py +0 -0
  459. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_lean_prover.py +0 -0
  460. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_lean_prover_runner.py +0 -0
  461. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_length.py +0 -0
  462. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_list_comparison_math_reward.py +0 -0
  463. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_logs_server_simple.py +0 -0
  464. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_minimal.py +0 -0
  465. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
  466. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_n_variant_batch_integration.py +0 -0
  467. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_n_variant_integration.py +0 -0
  468. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_openai_compatibility.py +0 -0
  469. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_openeval_integration.py +0 -0
  470. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_reasoning_steps.py +0 -0
  471. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_repetition.py +0 -0
  472. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_repetition_debug.py +0 -0
  473. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_reward_function.py +0 -0
  474. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_server.py +0 -0
  475. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_tag_count.py +0 -0
  476. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_tau_bench_airline_smoke.py +0 -0
  477. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_typed_interface.py +0 -0
  478. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/tests/test_vite_server.py +0 -0
  479. {eval_protocol-0.2.11.dev1/vendor/tau2/environment → eval_protocol-0.2.98.dev1/vendor/tau2/agent}/__init__.py +0 -0
  480. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/__init__.py +0 -0
  481. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/api_config.py +0 -0
  482. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/data_model.py +0 -0
  483. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
  484. {eval_protocol-0.2.11.dev1/vendor/tau2/metrics → eval_protocol-0.2.98.dev1/vendor/tau2/data_model}/__init__.py +0 -0
  485. {eval_protocol-0.2.11.dev1/vendor/tau2/orchestrator → eval_protocol-0.2.98.dev1/vendor/tau2/domains}/__init__.py +0 -0
  486. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
  487. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
  488. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
  489. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
  490. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
  491. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
  492. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
  493. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
  494. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
  495. {eval_protocol-0.2.11.dev1/vendor/tau2/scripts → eval_protocol-0.2.98.dev1/vendor/tau2/domains/telecom/tasks}/__init__.py +0 -0
  496. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  497. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  498. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
  499. {eval_protocol-0.2.11.dev1/vendor/tau2/user → eval_protocol-0.2.98.dev1/vendor/tau2/environment}/__init__.py +0 -0
  500. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/environment/db.py +0 -0
  501. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  502. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  503. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/scripts/check_data.py +0 -0
  504. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/__init__.py +0 -0
  505. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/io_utils.py +0 -0
  506. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vendor/tau2/utils/llm_utils.py +2 -2
  507. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  508. {eval_protocol-0.2.11.dev1 → eval_protocol-0.2.98.dev1}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
18
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
19
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
20
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
21
+ SOFTWARE.
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: eval-protocol
3
+ Version: 0.2.98.dev1
4
+ Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
+ Author-email: Fireworks AI <info@fireworks.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/fireworks-ai/eval-protocol
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: requests>=2.25.0
14
+ Requires-Dist: pydantic>=2.0.0
15
+ Requires-Dist: dataclasses-json>=0.5.7
16
+ Requires-Dist: uvicorn>=0.15.0
17
+ Requires-Dist: python-dotenv>=0.19.0
18
+ Requires-Dist: openai>=1.78.1
19
+ Requires-Dist: aiosqlite
20
+ Requires-Dist: aiohttp
21
+ Requires-Dist: mcp>=1.9.2
22
+ Requires-Dist: PyYAML>=5.0
23
+ Requires-Dist: hydra-core>=1.3.2
24
+ Requires-Dist: omegaconf>=2.3.0
25
+ Requires-Dist: httpx>=0.24.0
26
+ Requires-Dist: anthropic>=0.59.0
27
+ Requires-Dist: litellm<1.75.0
28
+ Requires-Dist: pytest>=6.0.0
29
+ Requires-Dist: pytest-asyncio>=0.21.0
30
+ Requires-Dist: peewee>=3.18.2
31
+ Requires-Dist: backoff>=2.2.0
32
+ Requires-Dist: questionary>=2.0.0
33
+ Requires-Dist: toml>=0.10.0
34
+ Requires-Dist: loguru>=0.6.0
35
+ Requires-Dist: docstring-parser>=0.15
36
+ Requires-Dist: rich>=12.0.0
37
+ Requires-Dist: psutil>=6.0.0
38
+ Requires-Dist: addict>=2.4.0
39
+ Requires-Dist: deepdiff>=6.0.0
40
+ Requires-Dist: websockets>=15.0.1
41
+ Requires-Dist: fastapi>=0.116.1
42
+ Provides-Extra: dev
43
+ Requires-Dist: build; extra == "dev"
44
+ Requires-Dist: twine; extra == "dev"
45
+ Requires-Dist: pytest-httpserver; extra == "dev"
46
+ Requires-Dist: werkzeug>=2.0.0; extra == "dev"
47
+ Requires-Dist: ruff>=0.5.0; extra == "dev"
48
+ Requires-Dist: transformers>=4.0.0; extra == "dev"
49
+ Requires-Dist: pandas>=1.5.0; extra == "dev"
50
+ Requires-Dist: types-setuptools; extra == "dev"
51
+ Requires-Dist: types-requests; extra == "dev"
52
+ Requires-Dist: types-PyYAML; extra == "dev"
53
+ Requires-Dist: types-docker; extra == "dev"
54
+ Requires-Dist: versioneer>=0.20; extra == "dev"
55
+ Requires-Dist: openai>=1.78.1; extra == "dev"
56
+ Requires-Dist: pre-commit; extra == "dev"
57
+ Requires-Dist: e2b; extra == "dev"
58
+ Requires-Dist: pytest-cov; extra == "dev"
59
+ Requires-Dist: pytest-xdist; extra == "dev"
60
+ Requires-Dist: docker==7.1.0; extra == "dev"
61
+ Requires-Dist: ipykernel>=6.30.0; extra == "dev"
62
+ Requires-Dist: jupyter>=1.1.1; extra == "dev"
63
+ Requires-Dist: pip>=25.1.1; extra == "dev"
64
+ Requires-Dist: haikus==0.3.8; extra == "dev"
65
+ Requires-Dist: syrupy>=4.0.0; extra == "dev"
66
+ Requires-Dist: gymnasium>=1.2.0; extra == "dev"
67
+ Provides-Extra: trl
68
+ Requires-Dist: torch>=1.9; extra == "trl"
69
+ Requires-Dist: trl>=0.7.0; extra == "trl"
70
+ Requires-Dist: peft>=0.7.0; extra == "trl"
71
+ Requires-Dist: transformers>=4.0.0; extra == "trl"
72
+ Requires-Dist: accelerate>=0.28.0; extra == "trl"
73
+ Provides-Extra: openevals
74
+ Requires-Dist: openevals>=0.1.0; extra == "openevals"
75
+ Provides-Extra: fireworks
76
+ Requires-Dist: fireworks-ai>=0.19.19; extra == "fireworks"
77
+ Provides-Extra: box2d
78
+ Requires-Dist: swig; extra == "box2d"
79
+ Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
80
+ Requires-Dist: Pillow; extra == "box2d"
81
+ Provides-Extra: langfuse
82
+ Requires-Dist: langfuse>=2.0.0; extra == "langfuse"
83
+ Provides-Extra: huggingface
84
+ Requires-Dist: datasets>=3.0.0; extra == "huggingface"
85
+ Requires-Dist: transformers>=4.0.0; extra == "huggingface"
86
+ Provides-Extra: langsmith
87
+ Requires-Dist: langsmith>=0.1.86; extra == "langsmith"
88
+ Provides-Extra: bigquery
89
+ Requires-Dist: google-cloud-bigquery>=3.0.0; extra == "bigquery"
90
+ Requires-Dist: google-auth>=2.0.0; extra == "bigquery"
91
+ Provides-Extra: svgbench
92
+ Requires-Dist: selenium>=4.0.0; extra == "svgbench"
93
+ Provides-Extra: pydantic
94
+ Requires-Dist: pydantic-ai>=1.0.2; extra == "pydantic"
95
+ Provides-Extra: supabase
96
+ Requires-Dist: supabase>=2.18.1; extra == "supabase"
97
+ Provides-Extra: chinook
98
+ Requires-Dist: psycopg2-binary>=2.9.10; extra == "chinook"
99
+ Provides-Extra: langchain
100
+ Requires-Dist: langchain-core>=0.3.0; extra == "langchain"
101
+ Provides-Extra: braintrust
102
+ Requires-Dist: braintrust[otel]; extra == "braintrust"
103
+ Provides-Extra: openenv
104
+ Requires-Dist: openenv-core; extra == "openenv"
105
+ Provides-Extra: langgraph
106
+ Requires-Dist: langgraph>=0.6.7; extra == "langgraph"
107
+ Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
108
+ Provides-Extra: langgraph-tools
109
+ Requires-Dist: langgraph>=0.6.7; extra == "langgraph-tools"
110
+ Requires-Dist: langchain>=0.3.0; extra == "langgraph-tools"
111
+ Requires-Dist: langchain-fireworks>=0.3.0; extra == "langgraph-tools"
112
+ Provides-Extra: proxy
113
+ Requires-Dist: redis>=5.0.0; extra == "proxy"
114
+ Requires-Dist: langfuse>=2.0.0; extra == "proxy"
115
+ Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
116
+ Dynamic: license-file
117
+
118
+ # Eval Protocol
119
+
120
+ [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
121
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
122
+
123
+ **Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
124
+
125
+ ![Eval Protocol overview](https://github.com/eval-protocol/python-sdk/raw/main/docs/intro.png)
126
+
127
+ Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
128
+
129
+ Eval Protocol makes this possible in two ways:
130
+
131
+ 1. **Expose your agent through a simple API**
132
+ Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
133
+ 2. **Connect with any trainer**
134
+ Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
135
+
136
+ The result: RL that works out-of-the-box for existing production agents.
137
+
138
+ ## Who This Is For
139
+
140
+ - **Applied AI teams** adding RL to existing production agents.
141
+ - **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
142
+ - **MLOps teams** building reproducible, language-agnostic rollout pipelines.
143
+
144
+ ## Quickstart
145
+
146
+ - See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
147
+
148
+ ## Resources
149
+
150
+ - **[Documentation](https://evalprotocol.io)** – Guides and API reference
151
+ - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
152
+ - **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
153
+
154
+ ## License
155
+
156
+ [MIT](LICENSE)
@@ -0,0 +1,39 @@
1
+ # Eval Protocol
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
4
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
5
+
6
+ **Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
7
+
8
+ ![Eval Protocol overview](https://github.com/eval-protocol/python-sdk/raw/main/docs/intro.png)
9
+
10
+ Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
11
+
12
+ Eval Protocol makes this possible in two ways:
13
+
14
+ 1. **Expose your agent through a simple API**
15
+ Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
16
+ 2. **Connect with any trainer**
17
+ Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
18
+
19
+ The result: RL that works out-of-the-box for existing production agents.
20
+
21
+ ## Who This Is For
22
+
23
+ - **Applied AI teams** adding RL to existing production agents.
24
+ - **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
25
+ - **MLOps teams** building reproducible, language-agnostic rollout pipelines.
26
+
27
+ ## Quickstart
28
+
29
+ - See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
30
+
31
+ ## Resources
32
+
33
+ - **[Documentation](https://evalprotocol.io)** – Guides and API reference
34
+ - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
35
+ - **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
36
+
37
+ ## License
38
+
39
+ [MIT](LICENSE)
@@ -56,7 +56,7 @@ OUTPUT_JSONL_FILE = "./development/CODING_DATASET.jsonl"
56
56
  try:
57
57
  repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2")
58
58
  except OSError:
59
- print("Warning: Could not load gpt2 tokenizer for Repobench-P. " "Falling back to basic split for token counting.")
59
+ print("Warning: Could not load gpt2 tokenizer for Repobench-P. Falling back to basic split for token counting.")
60
60
  repobench_p_tokenizer = None
61
61
 
62
62
 
@@ -108,8 +108,7 @@ def format_aider_prompt(problem_json: dict) -> str:
108
108
  """Format the prompt for Aider benchmark style problems."""
109
109
  question = problem_json.get("content", "")
110
110
  return (
111
- f"{question}\n\nPlease generate the code in the following format:\n"
112
- "```python\n# Your code response here\n```"
111
+ f"{question}\n\nPlease generate the code in the following format:\n```python\n# Your code response here\n```"
113
112
  )
114
113
 
115
114
 
@@ -327,7 +326,7 @@ def normalize_problem_to_openai_format(
327
326
  try:
328
327
  labels = json.loads(labels_data)
329
328
  except json.JSONDecodeError:
330
- print(f"Warning: Skipping ID {problem_id_str} in {filename} " "- malformed JSON in labels.")
329
+ print(f"Warning: Skipping ID {problem_id_str} in {filename} - malformed JSON in labels.")
331
330
  return None
332
331
  elif isinstance(labels_data, dict):
333
332
  labels = labels_data
@@ -426,10 +425,10 @@ def normalize_problem_to_openai_format(
426
425
  )
427
426
  return None
428
427
  if not final_user_content.strip() or not final_assistant_content.strip():
429
- print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "empty processed content.")
428
+ print(f"Warning: Skipping ID {problem_id_str} in {filename} - empty processed content.")
430
429
  return None
431
430
  if final_assistant_content.strip() == "import sys; sys.exit(0)":
432
- print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "placeholder solution.")
431
+ print(f"Warning: Skipping ID {problem_id_str} in {filename} - placeholder solution.")
433
432
  return None
434
433
 
435
434
  return {
@@ -439,7 +438,7 @@ def normalize_problem_to_openai_format(
439
438
  ]
440
439
  }
441
440
  except Exception as e:
442
- print(f"Warning: Skipping ID {problem_id_str} in {filename} - " f"error ({type(e).__name__}: {e}).")
441
+ print(f"Warning: Skipping ID {problem_id_str} in {filename} - error ({type(e).__name__}: {e}).")
443
442
  import traceback
444
443
 
445
444
  traceback.print_exc()
@@ -474,7 +473,7 @@ def main():
474
473
  file_error_count += 1
475
474
  continue
476
475
 
477
- print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: " f"{filename}...")
476
+ print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: {filename}...")
478
477
  lines_in_file = 0
479
478
  processed_in_file = 0
480
479
  skipped_in_file = 0
@@ -488,7 +487,7 @@ def main():
488
487
  try:
489
488
  problem_data = json.loads(stripped_line)
490
489
  except json.JSONDecodeError:
491
- print(f"Warning: Malformed JSON on line {line_number} " f"in {filepath}. Skipping line.")
490
+ print(f"Warning: Malformed JSON on line {line_number} in {filepath}. Skipping line.")
492
491
  skipped_in_file += 1
493
492
  continue
494
493
 
@@ -507,7 +506,7 @@ def main():
507
506
  processed_count += processed_in_file
508
507
  skipped_count += skipped_in_file
509
508
  except Exception as e:
510
- print(f"Error processing file {filepath}: {type(e).__name__}: {e}. " "Skipping rest of file.")
509
+ print(f"Error processing file {filepath}: {type(e).__name__}: {e}. Skipping rest of file.")
511
510
  import traceback
512
511
 
513
512
  traceback.print_exc()
@@ -139,7 +139,7 @@ def start_ngrok_and_get_url(
139
139
  # Or by setting NGROK_AUTHTOKEN environment variable.
140
140
  # Forcing it via command line is also an option but less common for persistent setup.
141
141
  print(
142
- f"Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken <token>') or via NGROK_AUTHTOKEN env var."
142
+ "Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken <token>') or via NGROK_AUTHTOKEN env var."
143
143
  )
144
144
  # Example if passing via env for the subprocess:
145
145
  # ngrok_env = os.environ.copy()
@@ -0,0 +1,178 @@
1
+ """
2
+ Fireworks Eval Protocol - Simplify reward modeling and evaluation for LLM RL fine-tuning.
3
+
4
+ A Python library for defining, testing, deploying, and using reward functions
5
+ for LLM fine-tuning, including launching full RL jobs on the Fireworks platform.
6
+
7
+ The library also provides an agent evaluation framework for testing and evaluating
8
+ tool-augmented models using self-contained task bundles.
9
+ """
10
+
11
+ import warnings
12
+
13
+ from .auth import get_fireworks_account_id, get_fireworks_api_key
14
+ from .common_utils import load_jsonl
15
+ from .config import RewardKitConfig, get_config, load_config
16
+ from .mcp_env import (
17
+ AnthropicPolicy,
18
+ FireworksPolicy,
19
+ LiteLLMPolicy,
20
+ OpenAIPolicy,
21
+ make,
22
+ rollout,
23
+ test_mcp,
24
+ )
25
+ from .data_loader import DynamicDataLoader, InlineDataLoader
26
+ from . import mcp, rewards
27
+ from .models import EvaluateResult, Message, MetricResult, EvaluationRow, InputMetadata, Status
28
+ from .playback_policy import PlaybackPolicyBase
29
+ from .resources import create_llm_resource
30
+ from .reward_function import RewardFunction
31
+ from .typed_interface import reward_function
32
+ from .quickstart.aha_judge import aha_judge
33
+ from .utils.evaluation_row_utils import (
34
+ multi_turn_assistant_to_ground_truth,
35
+ assistant_to_ground_truth,
36
+ filter_longest_conversation,
37
+ )
38
+ from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor, GithubActionRolloutProcessor
39
+ from .pytest.parameterize import DefaultParameterIdGenerator
40
+ from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
41
+ from .log_utils.rollout_id_filter import RolloutIdFilter
42
+ from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
43
+ from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
44
+ from .log_utils.elasticsearch_client import ElasticsearchConfig
45
+
46
+
47
+ from .types.remote_rollout_processor import (
48
+ InitRequest,
49
+ RolloutMetadata,
50
+ StatusResponse,
51
+ create_langfuse_config_tags,
52
+ DataLoaderConfig,
53
+ )
54
+
55
# Optional adapters.
#
# Each adapter depends on an optional third-party package. When the import
# fails, every name that the package-level ``__all__`` advertises for that
# adapter is bound to ``None`` so that ``from eval_protocol import *`` and
# ``eval_protocol.<name>`` lookups keep working; callers can test
# ``<name> is None`` to detect a missing optional dependency.
try:
    from .adapters import OpenAIResponsesAdapter
except ImportError:
    OpenAIResponsesAdapter = None

try:
    from .adapters import LangfuseAdapter, create_langfuse_adapter
except ImportError:
    LangfuseAdapter = None
    # Previously left undefined although it is listed in ``__all__``.
    create_langfuse_adapter = None

try:
    from .adapters import BraintrustAdapter, create_braintrust_adapter
except ImportError:
    BraintrustAdapter = None
    # Previously left undefined although it is listed in ``__all__``.
    create_braintrust_adapter = None

try:
    from .adapters import LangSmithAdapter
except ImportError:
    LangSmithAdapter = None

try:
    from .adapters import WeaveAdapter
except ImportError:
    WeaveAdapter = None
80
+
81
try:
    from .proxy import create_app, AuthProvider, AccountInfo  # pyright: ignore[reportAssignmentType]
except ImportError:
    # The proxy module could not be imported. Bind stand-ins with the same
    # names so importing the package still succeeds; the ImportError is only
    # raised once one of them is actually called or instantiated.
    _PROXY_MISSING_MESSAGE = (
        "Proxy functionality requires additional dependencies. "
        "Please install with: pip install eval-protocol[proxy]"
    )

    def create_app(*args, **kwargs):
        """Stand-in for ``proxy.create_app``; always raises ImportError."""
        raise ImportError(_PROXY_MISSING_MESSAGE)

    class AuthProvider:
        """Stand-in for ``proxy.AuthProvider``; instantiation raises ImportError."""

        def __init__(self, *args, **kwargs):
            raise ImportError(_PROXY_MISSING_MESSAGE)

    class AccountInfo:
        """Stand-in for ``proxy.AccountInfo``; instantiation raises ImportError."""

        def __init__(self, *args, **kwargs):
            raise ImportError(_PROXY_MISSING_MESSAGE)
104
+
105
+
106
# Apply the "default" warning action to DeprecationWarning for modules whose
# name matches "eval_protocol".
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")

# Public API of the package. Every name listed here must be bound at module
# level; otherwise ``from eval_protocol import *`` raises AttributeError.
__all__ = [
    "ElasticsearchConfig",
    "ElasticsearchDirectHttpHandler",
    "RolloutIdFilter",
    "setup_rollout_logging_for_elasticsearch_handler",
    "DataLoaderConfig",
    "Status",
    "RemoteRolloutProcessor",
    "GithubActionRolloutProcessor",
    "InputMetadata",
    "EvaluationRow",
    "DefaultParameterIdGenerator",
    "DynamicDataLoader",
    "InlineDataLoader",
    "aha_judge",
    "multi_turn_assistant_to_ground_truth",
    "assistant_to_ground_truth",
    "filter_longest_conversation",
    "evaluation_test",
    "SingleTurnRolloutProcessor",
    "OpenAIResponsesAdapter",
    "LangfuseAdapter",
    "create_langfuse_adapter",
    "BraintrustAdapter",
    "create_braintrust_adapter",
    "LangSmithAdapter",
    # Imported above with a ``None`` fallback like the other adapters, so it
    # is exported alongside them.
    "WeaveAdapter",
    "FireworksTracingHttpHandler",
    # Core interfaces
    "Message",
    "MetricResult",
    "EvaluateResult",
    "reward_function",
    "RewardFunction",
    # Authentication
    "get_fireworks_api_key",
    "get_fireworks_account_id",
    # Configuration
    "load_config",
    "get_config",
    "RewardKitConfig",
    # Utilities
    "load_jsonl",
    # MCP Environment API
    "make",
    "rollout",
    "LiteLLMPolicy",
    "AnthropicPolicy",
    "FireworksPolicy",
    "OpenAIPolicy",
    "test_mcp",
    # Playback functionality
    "PlaybackPolicyBase",
    # Resource management
    "create_llm_resource",
    # Submodules
    "rewards",
    "mcp",
    # Remote server types
    "InitRequest",
    "RolloutMetadata",
    "StatusResponse",
    "create_langfuse_config_tags",
    # Proxy
    "create_app",
    "AuthProvider",
    "AccountInfo",
]

from . import _version

# Version string as reported by the bundled ``_version`` module.
__version__ = _version.get_versions()["version"]
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-15T00:15:02-0700",
11
+ "date": "2025-12-15T16:40:32-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "58d840995e6ca925da6fa17dc48b0b0d9ad9d2e8",
15
- "version": "0.2.11-dev1"
14
+ "full-revisionid": "438a49431d16626a8e883cfb04afecfb188eb9dc",
15
+ "version": "0.2.98.dev.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -0,0 +1,101 @@
1
"""Data source adapters for Eval Protocol.

This package provides adapters for integrating with various data sources
and converting them to EvaluationRow format for use in evaluation pipelines.

Available adapters:
- BaseAdapter: Abstract base class for all adapters
- LangfuseAdapter: Pull data from Langfuse deployments
- FireworksTracingAdapter: Pull data from Langfuse via Fireworks tracing proxy
- HuggingFaceAdapter: Load datasets from HuggingFace Hub
- BigQueryAdapter: Query data from Google BigQuery
- BraintrustAdapter, LangSmithAdapter, WeaveAdapter, OpenAIResponsesAdapter:
  exported when their modules import successfully
- TRL integration (legacy)

Only BaseAdapter and FireworksTracingAdapter are imported unconditionally.
Every other name is appended to ``__all__`` only if its import succeeds.
"""

# Imported unconditionally.
from .base import BaseAdapter

__all__ = ["BaseAdapter"]

# Optional adapters: an ImportError leaves the names undefined and out of
# ``__all__``.
try:
    from .langfuse import LangfuseAdapter, create_langfuse_adapter
except ImportError:
    pass
else:
    __all__ += ["LangfuseAdapter", "create_langfuse_adapter"]

# Imported unconditionally; an ImportError here propagates to the caller.
from .fireworks_tracing import FireworksTracingAdapter

__all__ += ["FireworksTracingAdapter"]

try:
    from .huggingface import (
        HuggingFaceAdapter,
        create_gsm8k_adapter,
        create_huggingface_adapter,
        create_math_adapter,
    )
except ImportError:
    pass
else:
    __all__ += [
        "HuggingFaceAdapter",
        "create_huggingface_adapter",
        "create_gsm8k_adapter",
        "create_math_adapter",
    ]

try:
    from .bigquery import (
        BigQueryAdapter,
        create_bigquery_adapter,
    )
except ImportError:
    pass
else:
    __all__ += [
        "BigQueryAdapter",
        "create_bigquery_adapter",
    ]

try:
    from .braintrust import BraintrustAdapter, create_braintrust_adapter
except ImportError:
    pass
else:
    __all__ += ["BraintrustAdapter", "create_braintrust_adapter"]

# Legacy and remaining adapters. These are also guarded: each is exported
# only when its import succeeds.
try:
    from .trl import create_trl_adapter
except ImportError:
    pass
else:
    __all__ += ["create_trl_adapter"]

try:
    from .openai_responses import OpenAIResponsesAdapter
except ImportError:
    pass
else:
    __all__ += ["OpenAIResponsesAdapter"]

try:
    from .langsmith import LangSmithAdapter
except ImportError:
    pass
else:
    __all__ += ["LangSmithAdapter"]

try:
    from .weave import WeaveAdapter
except ImportError:
    pass
else:
    __all__ += ["WeaveAdapter"]
@@ -0,0 +1,25 @@
1
+ """
2
+ Base adapter interface for Eval Protocol.
3
+ """
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import List
7
+
8
+ from eval_protocol.models import EvaluationRow
9
+
10
+
11
class BaseAdapter(ABC):
    """Common interface shared by all Eval Protocol adapters.

    Subclasses must implement ``get_evaluation_rows``. The two upload hooks
    are optional: the implementations here do nothing and return ``None``.
    """

    @abstractmethod
    def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
        """Return the evaluation rows obtained from the data source."""

    def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
        """Send scores for a batch of rows back to the data source.

        The base implementation is a no-op; subclasses may override it.
        """
        return None

    def upload_score(self, row: EvaluationRow, model_name: str) -> None:
        """Send the score for one row back to the data source.

        The base implementation is a no-op; subclasses may override it.
        """
        return None