eval-protocol 0.2.85__tar.gz → 0.2.87__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (450)
  1. {eval_protocol-0.2.85/eval_protocol.egg-info → eval_protocol-0.2.87}/PKG-INFO +18 -94
  2. eval_protocol-0.2.87/README.md +39 -0
  3. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli.py +26 -9
  5. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/create_rft.py +45 -18
  6. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/local_test.py +25 -12
  7. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/fireworks_rft.py +9 -0
  8. {eval_protocol-0.2.85 → eval_protocol-0.2.87/eval_protocol.egg-info}/PKG-INFO +18 -94
  9. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_create_rft_infer.py +346 -18
  10. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_local_test.py +115 -4
  11. eval_protocol-0.2.85/README.md +0 -115
  12. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/LICENSE +0 -0
  13. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/__init__.py +0 -0
  14. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/normalize_sandbox_fusion.py +0 -0
  15. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/utils/__init__.py +0 -0
  16. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/utils/generate_api_key.py +0 -0
  17. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/development/utils/subprocess_manager.py +0 -0
  18. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/__init__.py +0 -0
  19. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/__main__.py +0 -0
  20. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/__init__.py +0 -0
  21. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/base.py +0 -0
  22. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/bigquery.py +0 -0
  23. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/braintrust.py +0 -0
  24. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  25. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/huggingface.py +0 -0
  26. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/langchain.py +0 -0
  27. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/langfuse.py +0 -0
  28. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/langsmith.py +0 -0
  29. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/openai_responses.py +0 -0
  30. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/trl.py +0 -0
  31. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/utils.py +0 -0
  32. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/adapters/weave.py +0 -0
  33. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/__init__.py +0 -0
  34. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/models.py +0 -0
  35. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/orchestrator.py +0 -0
  36. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resource_abc.py +0 -0
  37. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resource_pool.py +0 -0
  38. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/__init__.py +0 -0
  39. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  40. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  41. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  42. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  43. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  44. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/docker_resource.py +0 -0
  45. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  46. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  47. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/resources/sql_resource.py +0 -0
  48. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/task_manager.py +0 -0
  49. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/agent/tool_registry.py +0 -0
  50. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/auth.py +0 -0
  51. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/__init__.py +0 -0
  52. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  53. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  54. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_aime25.py +0 -0
  55. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  56. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  57. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  58. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  59. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  60. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/__init__.py +0 -0
  61. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  62. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/common.py +0 -0
  63. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/deploy.py +0 -0
  64. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  65. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/logs.py +0 -0
  66. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/preview.py +0 -0
  67. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  68. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/cli_commands/upload.py +0 -0
  69. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/common_utils.py +0 -0
  70. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/config.py +0 -0
  71. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/__init__.py +0 -0
  72. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  73. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  74. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  75. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  76. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/data_loader/models.py +0 -0
  77. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/__init__.py +0 -0
  78. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  79. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  80. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  81. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  82. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/datasets/__init__.py +0 -0
  83. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/datasets/loader.py +0 -0
  84. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/directory_utils.py +0 -0
  85. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/evaluation.py +0 -0
  86. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/__init__.py +0 -0
  87. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/event_bus.py +0 -0
  88. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/logger.py +0 -0
  89. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  90. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  91. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/exceptions.py +0 -0
  92. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/execution/__init__.py +0 -0
  93. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/execution/pipeline.py +0 -0
  94. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/gcp_tools.py +0 -0
  95. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generation/cache.py +0 -0
  96. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generation/clients/base.py +0 -0
  97. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generation/clients.py +0 -0
  98. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/generic_server.py +0 -0
  99. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/get_pep440_version.py +0 -0
  100. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/human_id/__init__.py +0 -0
  101. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/human_id/dictionary.py +0 -0
  102. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/__init__.py +0 -0
  103. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/deepeval.py +0 -0
  104. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/openeval.py +0 -0
  105. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/integrations/trl.py +0 -0
  106. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/__init__.py +0 -0
  107. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  108. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  109. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  110. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  111. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/init.py +0 -0
  112. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/rollout_context.py +0 -0
  113. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  114. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/log_utils/util.py +0 -0
  115. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/logging_utils.py +0 -0
  116. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/__init__.py +0 -0
  117. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/adapter.py +0 -0
  118. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/client/__init__.py +0 -0
  119. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/client/connection.py +0 -0
  120. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/clients.py +0 -0
  121. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/__init__.py +0 -0
  122. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/base_policy.py +0 -0
  123. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/manager.py +0 -0
  124. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/execution/policy.py +0 -0
  125. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/grid_renderer.py +0 -0
  126. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  127. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/mcpgym.py +0 -0
  128. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/process_manager.py +0 -0
  129. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/session/__init__.py +0 -0
  130. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/session/manager.py +0 -0
  131. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/simple_process_manager.py +0 -0
  132. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp/simulation_server.py +0 -0
  133. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/__init__.py +0 -0
  134. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/config.py +0 -0
  135. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/main.py +0 -0
  136. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  137. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  138. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  139. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  140. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_env.py +0 -0
  141. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/__init__.py +0 -0
  142. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  143. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  144. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  145. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  146. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  147. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  148. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  149. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  150. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  151. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  152. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  153. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  154. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  155. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  156. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/models.py +0 -0
  157. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/packaging.py +0 -0
  158. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/platform_api.py +0 -0
  159. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/playback_policy.py +0 -0
  160. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/__init__.py +0 -0
  161. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  162. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/app.py +0 -0
  163. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  164. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  165. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  166. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/main.py +0 -0
  167. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/models.py +0 -0
  168. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  169. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/__init__.py +0 -0
  170. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  171. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  172. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  173. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  174. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  175. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  176. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  177. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  178. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  179. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test.py +0 -0
  180. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  181. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  182. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/exception_config.py +0 -0
  183. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/execution.py +0 -0
  184. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  185. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  186. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  187. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/parameterize.py +0 -0
  188. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/plugin.py +0 -0
  189. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  190. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/rollout_processor.py +0 -0
  191. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/store_experiment_link.py +0 -0
  192. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/store_results_url.py +0 -0
  193. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/tracing_utils.py +0 -0
  194. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/types.py +0 -0
  195. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/pytest/validate_signature.py +0 -0
  196. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/__init__.py +0 -0
  197. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  198. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  199. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  200. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  201. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  202. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  203. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  204. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/llm_judge.py +0 -0
  205. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  206. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  207. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  208. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  209. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/quickstart/utils.py +0 -0
  210. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/resources.py +0 -0
  211. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/reward_function.py +0 -0
  212. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/__init__.py +0 -0
  213. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/accuracy.py +0 -0
  214. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/accuracy_length.py +0 -0
  215. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  216. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  217. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/apps_testing_util.py +0 -0
  218. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/bfcl_reward.py +0 -0
  219. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/code_execution.py +0 -0
  220. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/code_execution_utils.py +0 -0
  221. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/cpp_code.py +0 -0
  222. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  223. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/format.py +0 -0
  224. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/function_calling.py +0 -0
  225. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/json_schema.py +0 -0
  226. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/language_consistency.py +0 -0
  227. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/lean_prover.py +0 -0
  228. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/length.py +0 -0
  229. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  230. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/math.py +0 -0
  231. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  232. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/reasoning_steps.py +0 -0
  233. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/repetition.py +0 -0
  234. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rewards/tag_count.py +0 -0
  235. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/rl_processing.py +0 -0
  236. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/server.py +0 -0
  237. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/stats/__init__.py +0 -0
  238. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/stats/confidence_intervals.py +0 -0
  239. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/typed_interface.py +0 -0
  240. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/__init__.py +0 -0
  241. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/errors.py +0 -0
  242. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/remote_rollout_processor.py +0 -0
  243. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/types/types.py +0 -0
  244. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/__init__.py +0 -0
  245. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/batch_evaluation.py +0 -0
  246. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/batch_transformation.py +0 -0
  247. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/browser_utils.py +0 -0
  248. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/check_server_status.py +0 -0
  249. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/dataset_helpers.py +0 -0
  250. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  251. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/logs_models.py +0 -0
  252. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/logs_server.py +0 -0
  253. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/module_loader.py +0 -0
  254. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/packaging_utils.py +0 -0
  255. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/show_results_url.py +0 -0
  256. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/static_policy.py +0 -0
  257. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/subprocess_utils.py +0 -0
  258. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol/utils/vite_server.py +0 -0
  259. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/SOURCES.txt +0 -0
  260. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/dependency_links.txt +0 -0
  261. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/entry_points.txt +0 -0
  262. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/requires.txt +0 -0
  263. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/eval_protocol.egg-info/top_level.txt +0 -0
  264. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/pyproject.toml +0 -0
  265. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/setup.cfg +0 -0
  266. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/setup.py +0 -0
  267. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_accuracy.py +0 -0
  268. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_accuracy_length.py +0 -0
  269. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_adapters_e2e.py +0 -0
  270. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_agent_orchestrator.py +0 -0
  271. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_agent_resources.py +0 -0
  272. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_auth.py +0 -0
  273. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_batch_evaluation.py +0 -0
  274. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli.py +0 -0
  275. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_agent.py +0 -0
  276. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cli_args.py +0 -0
  277. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_code_execution.py +0 -0
  278. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_config.py +0 -0
  279. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_control_plane_separation.py +0 -0
  280. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_cpp_code.py +0 -0
  281. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_data_driven_task_manager.py +0 -0
  282. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_deepcoder_reward.py +0 -0
  283. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_deepeval_integration.py +0 -0
  284. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_deploy_integration.py +0 -0
  285. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_directory_utils.py +0 -0
  286. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_e2b_integration.py +0 -0
  287. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_e2b_js_integration.py +0 -0
  288. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_edge_cases.py +0 -0
  289. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_ep_upload_e2e.py +0 -0
  290. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_eval_protocol_import.py +0 -0
  291. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation.py +0 -0
  292. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation_integration.py +0 -0
  293. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation_postprocess.py +0 -0
  294. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_evaluation_preview_integration.py +0 -0
  295. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_event_bus.py +0 -0
  296. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_event_bus_helper.py +0 -0
  297. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_examples_end_to_end.py +0 -0
  298. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_exceptions.py +0 -0
  299. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_fireworks_api.py +0 -0
  300. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_format.py +0 -0
  301. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_fractional_code.py +0 -0
  302. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_function_calling.py +0 -0
  303. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_gcp_tools.py +0 -0
  304. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_generic_server.py +0 -0
  305. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_human_id.py +0 -0
  306. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_integration.py +0 -0
  307. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_json_schema.py +0 -0
  308. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_kwargs_validation.py +0 -0
  309. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_language_consistency.py +0 -0
  310. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_lean_prover.py +0 -0
  311. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_lean_prover_runner.py +0 -0
  312. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_length.py +0 -0
  313. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_list_comparison_math_reward.py +0 -0
  314. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_logs_server.py +0 -0
  315. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_logs_server_simple.py +0 -0
  316. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_math.py +0 -0
  317. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_message_field_filtering.py +0 -0
  318. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_minimal.py +0 -0
  319. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_models.py +0 -0
  320. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_models_rl.py +0 -0
  321. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_multiple_choice_math_reward.py +0 -0
  322. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_n_variant_batch_integration.py +0 -0
  323. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_n_variant_integration.py +0 -0
  324. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_openai_compatibility.py +0 -0
  325. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_openeval_integration.py +0 -0
  326. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_packaging.py +0 -0
  327. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_parallel_rollouts.py +0 -0
  328. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_platform_api.py +0 -0
  329. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_quickstart_utils.py +0 -0
  330. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_readiness.py +0 -0
  331. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_reasoning_steps.py +0 -0
  332. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_repetition.py +0 -0
  333. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_repetition_debug.py +0 -0
  334. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_retry_mechanism.py +0 -0
  335. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_reward_function.py +0 -0
  336. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_reward_protocol_import.py +0 -0
  337. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_rl_processing.py +0 -0
  338. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_rollout_control_plane_integration.py +0 -0
  339. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_server.py +0 -0
  340. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_show_results_url.py +0 -0
  341. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_status_migration_changes.py +0 -0
  342. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_status_migration_integration.py +0 -0
  343. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_status_model.py +0 -0
  344. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_tag_count.py +0 -0
  345. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_tau_bench_airline_smoke.py +0 -0
  346. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_typed_interface.py +0 -0
  347. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_typed_interface_rl.py +0 -0
  348. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_upload_entrypoint.py +0 -0
  349. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_url_handling.py +0 -0
  350. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/tests/test_vite_server.py +0 -0
  351. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/__init__.py +0 -0
  352. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/agent/__init__.py +0 -0
  353. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/agent/base.py +0 -0
  354. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/agent/llm_agent.py +0 -0
  355. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/__init__.py +0 -0
  356. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/api_config.py +0 -0
  357. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/data_model.py +0 -0
  358. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/api_service/simulation_service.py +0 -0
  359. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/cli.py +0 -0
  360. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/config.py +0 -0
  361. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/airline/policy.md +0 -0
  362. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/mock/policy.md +0 -0
  363. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  364. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/retail/policy.md +0 -0
  365. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  366. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  367. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  368. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  369. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  370. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  371. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  372. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/__init__.py +0 -0
  373. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/message.py +0 -0
  374. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/simulation.py +0 -0
  375. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/data_model/tasks.py +0 -0
  376. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/__init__.py +0 -0
  377. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/__init__.py +0 -0
  378. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/data_model.py +0 -0
  379. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/environment.py +0 -0
  380. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/tools.py +0 -0
  381. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/airline/utils.py +0 -0
  382. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/__init__.py +0 -0
  383. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/data_model.py +0 -0
  384. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/environment.py +0 -0
  385. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/tools.py +0 -0
  386. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/mock/utils.py +0 -0
  387. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/__init__.py +0 -0
  388. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/data_model.py +0 -0
  389. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/environment.py +0 -0
  390. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/tools.py +0 -0
  391. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/retail/utils.py +0 -0
  392. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/__init__.py +0 -0
  393. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/data_model.py +0 -0
  394. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/environment.py +0 -0
  395. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  396. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  397. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  398. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  399. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  400. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  401. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  402. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  403. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/tools.py +0 -0
  404. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  405. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  406. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/domains/telecom/utils.py +0 -0
  407. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/__init__.py +0 -0
  408. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/db.py +0 -0
  409. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/environment.py +0 -0
  410. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/server.py +0 -0
  411. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/tool.py +0 -0
  412. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/toolkit.py +0 -0
  413. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  414. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/__init__.py +0 -0
  415. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator.py +0 -0
  416. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  417. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  418. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  419. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  420. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  421. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/metrics/__init__.py +0 -0
  422. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/metrics/agent_metrics.py +0 -0
  423. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  424. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/__init__.py +0 -0
  425. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  426. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  427. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/orchestrator/utils.py +0 -0
  428. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/registry.py +0 -0
  429. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/run.py +0 -0
  430. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/__init__.py +0 -0
  431. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/check_data.py +0 -0
  432. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  433. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/start_servers.py +0 -0
  434. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/scripts/view_simulations.py +0 -0
  435. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/user/__init__.py +0 -0
  436. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/user/base.py +0 -0
  437. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/user/user_simulator.py +0 -0
  438. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/__init__.py +0 -0
  439. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/display.py +0 -0
  440. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/io_utils.py +0 -0
  441. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/llm_utils.py +0 -0
  442. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/pydantic_utils.py +0 -0
  443. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vendor/tau2/utils/utils.py +0 -0
  444. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/versioneer.py +0 -0
  445. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  446. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
  447. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
  448. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
  449. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  450. {eval_protocol-0.2.85 → eval_protocol-0.2.87}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.85
3
+ Version: 0.2.87
4
4
  Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -113,113 +113,37 @@ Requires-Dist: langfuse>=2.0.0; extra == "proxy"
113
113
  Requires-Dist: uuid6>=2025.0.0; extra == "proxy"
114
114
  Dynamic: license-file
115
115
 
116
- # Eval Protocol (EP)
116
+ # Eval Protocol
117
117
 
118
118
  [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
119
119
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
120
120
 
121
- **Stop guessing which AI model to use. Build a data-driven model leaderboard.**
121
+ **Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
122
122
 
123
- With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
123
+ ![Eval Protocol overview](./docs/intro.png)
124
124
 
125
- ## 🚀 Features
125
+ Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
126
126
 
127
- - **Pytest authoring**: `@evaluation_test` decorator to configure evaluations
128
- - **Robust rollouts**: Handles flaky LLM APIs and parallel execution
129
- - **Integrations**: Works with Langfuse, LangSmith, Braintrust, Responses API
130
- - **Agent support**: LangGraph and Pydantic AI
131
- - **MCP RL envs**: Build reinforcement learning environments with MCP
132
- - **Built-in benchmarks**: AIME, tau-bench
133
- - **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
134
- - **Local UI**: Pivot/table views for real-time analysis
127
+ Eval Protocol makes this possible in two ways:
135
128
 
136
- ## Quickstart (no labels needed)
129
+ 1. **Expose your agent through a simple API**
130
+ Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
131
+ 2. **Connect with any trainer**
132
+ Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
137
133
 
138
- Install with your tracing platform extras and set API keys:
134
+ The result: RL that works out-of-the-box for existing production agents.
139
135
 
140
- ```bash
141
- pip install 'eval-protocol[langfuse]'
136
+ ## Who This Is For
142
137
 
143
- # Model API keys (set what you need)
144
- export OPENAI_API_KEY=...
145
- export FIREWORKS_API_KEY=...
146
- export GEMINI_API_KEY=...
138
+ - **Applied AI teams** adding RL to existing production agents.
139
+ - **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
140
+ - **MLOps teams** building reproducible, language-agnostic rollout pipelines.
147
141
 
148
- # Platform keys
149
- export LANGFUSE_PUBLIC_KEY=...
150
- export LANGFUSE_SECRET_KEY=...
151
- export LANGFUSE_HOST=https://your-deployment.com # optional
152
- ```
142
+ ## Quickstart
153
143
 
154
- Minimal evaluation using the built-in AHA judge:
144
+ - See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
155
145
 
156
- ```python
157
- from datetime import datetime
158
- import pytest
159
-
160
- from eval_protocol import (
161
- evaluation_test,
162
- aha_judge,
163
- EvaluationRow,
164
- SingleTurnRolloutProcessor,
165
- DynamicDataLoader,
166
- create_langfuse_adapter,
167
- )
168
-
169
-
170
- def langfuse_data_generator() -> list[EvaluationRow]:
171
- adapter = create_langfuse_adapter()
172
- return adapter.get_evaluation_rows(
173
- to_timestamp=datetime.utcnow(),
174
- limit=20,
175
- sample_size=5,
176
- )
177
-
178
-
179
- @pytest.mark.parametrize(
180
- "completion_params",
181
- [
182
- {"model": "openai/gpt-4.1"},
183
- {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"},
184
- ],
185
- )
186
- @evaluation_test(
187
- data_loaders=DynamicDataLoader(generators=[langfuse_data_generator]),
188
- rollout_processor=SingleTurnRolloutProcessor(),
189
- )
190
- async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
191
- return await aha_judge(row)
192
- ```
193
-
194
- Run it:
195
-
196
- ```bash
197
- pytest -q -s
198
- ```
199
-
200
- The pytest output includes local links for a leaderboard and row-level traces (pivot/table) at `http://localhost:8000`.
201
-
202
- ## Installation
203
-
204
- This library requires Python >= 3.10.
205
-
206
- ### pip
207
-
208
- ```bash
209
- pip install eval-protocol
210
- ```
211
-
212
- ### uv (recommended)
213
-
214
- ```bash
215
- # Install uv (if needed)
216
- curl -LsSf https://astral.sh/uv/install.sh | sh
217
-
218
- # Add to your project
219
- uv add eval-protocol
220
- ```
221
-
222
- ## 📚 Resources
146
+ ## Resources
223
147
 
224
148
  - **[Documentation](https://evalprotocol.io)** – Guides and API reference
225
149
  - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
@@ -0,0 +1,39 @@
1
+ # Eval Protocol
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
4
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/eval-protocol/python-sdk)
5
+
6
+ **Eval Protocol (EP) is an open solution for doing reinforcement learning fine-tuning on existing agents — across any language, container, or framework.**
7
+
8
+ ![Eval Protocol overview](./docs/intro.png)
9
+
10
+ Most teams already have complex agents running in production — often across remote services with heavy dependencies, Docker containers, or TypeScript backends deployed on Vercel. When they try to train or fine-tune these agents with reinforcement learning, connecting them to a trainer quickly becomes painful.
11
+
12
+ Eval Protocol makes this possible in two ways:
13
+
14
+ 1. **Expose your agent through a simple API**
15
+ Wrap your existing agent (Python, TypeScript, Docker, etc.) in a simple HTTP service using EP’s rollout interface. EP handles the rollout orchestration, metadata passing, and trace storage automatically.
16
+ 2. **Connect with any trainer**
17
+ Once your agent speaks the EP standard, it can be fine-tuned or evaluated with any supported trainer — Fireworks RFT, TRL, Unsloth, or your own — with no environment rewrites.
18
+
19
+ The result: RL that works out-of-the-box for existing production agents.
20
+
21
+ ## Who This Is For
22
+
23
+ - **Applied AI teams** adding RL to existing production agents.
24
+ - **Research engineers** experimenting with fine-tuning complex, multi-turn or tool-using agents.
25
+ - **MLOps teams** building reproducible, language-agnostic rollout pipelines.
26
+
27
+ ## Quickstart
28
+
29
+ - See the Quickstart repository: [eval-protocol/quickstart](https://github.com/eval-protocol/quickstart/tree/main)
30
+
31
+ ## Resources
32
+
33
+ - **[Documentation](https://evalprotocol.io)** – Guides and API reference
34
+ - **[Discord](https://discord.com/channels/1137072072808472616/1400975572405850155)** – Community
35
+ - **[GitHub](https://github.com/eval-protocol/python-sdk)** – Source and examples
36
+
37
+ ## License
38
+
39
+ [MIT](LICENSE)
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-11-11T15:13:03-0800",
11
+ "date": "2025-11-12T15:43:06-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "41b79daeafe6bcb53a8a3183738314596874696d",
15
- "version": "0.2.85"
14
+ "full-revisionid": "8ab1c920bb77880deb87f2320c6cf6ea8780458e",
15
+ "version": "0.2.87"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -371,13 +371,13 @@ def parse_args(args=None):
371
371
  help="Create a Reinforcement Fine-tuning Job on Fireworks",
372
372
  )
373
373
  rft_parser.add_argument(
374
- "--evaluator-id",
375
- help="Evaluator ID used during upload; if omitted, derive from local traces or a single discovered test",
374
+ "--evaluator",
375
+ help="Evaluator ID or fully-qualified resource (accounts/{acct}/evaluators/{id}); if omitted, derive from local tests",
376
376
  )
377
377
  # Dataset options
378
378
  rft_parser.add_argument(
379
- "--dataset-id",
380
- help="Use existing Fireworks dataset id (skip local materialization)",
379
+ "--dataset",
380
+ help="Use existing dataset (ID or resource 'accounts/{acct}/datasets/{id}') to skip local materialization",
381
381
  )
382
382
  rft_parser.add_argument(
383
383
  "--dataset-jsonl",
@@ -400,6 +400,8 @@ def parse_args(args=None):
400
400
  rft_parser.add_argument("--learning-rate", type=float, default=3e-5)
401
401
  rft_parser.add_argument("--max-context-length", type=int, default=65536)
402
402
  rft_parser.add_argument("--lora-rank", type=int, default=16)
403
+ rft_parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of gradient accumulation steps")
404
+ rft_parser.add_argument("--learning-rate-warmup-steps", type=int, help="Number of LR warmup steps")
403
405
  rft_parser.add_argument("--accelerator-count", type=int, default=1)
404
406
  rft_parser.add_argument("--region", help="Fireworks region enum value")
405
407
  rft_parser.add_argument("--display-name", help="RFT job display name")
@@ -407,14 +409,19 @@ def parse_args(args=None):
407
409
  rft_parser.add_argument("--eval-auto-carveout", dest="eval_auto_carveout", action="store_true", default=True)
408
410
  rft_parser.add_argument("--no-eval-auto-carveout", dest="eval_auto_carveout", action="store_false")
409
411
  # Rollout chunking
410
- rft_parser.add_argument("--chunk-size", type=int, default=10, help="Data chunk size for rollout batching")
412
+ rft_parser.add_argument("--chunk-size", type=int, default=100, help="Data chunk size for rollout batching")
411
413
  # Inference params
412
414
  rft_parser.add_argument("--temperature", type=float)
413
415
  rft_parser.add_argument("--top-p", type=float)
414
416
  rft_parser.add_argument("--top-k", type=int)
415
- rft_parser.add_argument("--max-tokens", type=int, default=32768)
416
- rft_parser.add_argument("--n", type=int, default=8)
417
- rft_parser.add_argument("--inference-extra-body", help="JSON string for extra inference params")
417
+ rft_parser.add_argument("--max-output-tokens", type=int, default=32768)
418
+ rft_parser.add_argument("--response-candidates-count", type=int, default=8)
419
+ rft_parser.add_argument("--extra-body", help="JSON string for extra inference params")
420
+ # MCP server (optional)
421
+ rft_parser.add_argument(
422
+ "--mcp-server",
423
+ help="The MCP server resource name to use for the reinforcement fine-tuning job.",
424
+ )
418
425
  # Wandb
419
426
  rft_parser.add_argument("--wandb-enabled", action="store_true")
420
427
  rft_parser.add_argument("--wandb-project")
@@ -422,7 +429,7 @@ def parse_args(args=None):
422
429
  rft_parser.add_argument("--wandb-run-id")
423
430
  rft_parser.add_argument("--wandb-api-key")
424
431
  # Misc
425
- rft_parser.add_argument("--rft-job-id", help="Specify an explicit RFT job id")
432
+ rft_parser.add_argument("--job-id", help="Specify an explicit RFT job id")
426
433
  rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
427
434
  rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
428
435
  rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
@@ -447,6 +454,16 @@ def parse_args(args=None):
447
454
  action="store_true",
448
455
  help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
449
456
  )
457
+ local_test_parser.add_argument(
458
+ "--docker-build-extra",
459
+ default="",
460
+ help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")",
461
+ )
462
+ local_test_parser.add_argument(
463
+ "--docker-run-extra",
464
+ default="",
465
+ help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")",
466
+ )
450
467
 
451
468
  # Run command (for Hydra-based evaluations)
452
469
  # This subparser intentionally defines no arguments itself.
@@ -344,7 +344,7 @@ def _poll_evaluator_status(
344
344
 
345
345
 
346
346
  def create_rft_command(args) -> int:
347
- evaluator_id: Optional[str] = getattr(args, "evaluator_id", None)
347
+ evaluator_id: Optional[str] = getattr(args, "evaluator", None)
348
348
  non_interactive: bool = bool(getattr(args, "yes", False))
349
349
  dry_run: bool = bool(getattr(args, "dry_run", False))
350
350
  force: bool = bool(getattr(args, "force", False))
@@ -373,11 +373,11 @@ def create_rft_command(args) -> int:
373
373
  print("No evaluation tests found.")
374
374
  print("\nHint: Make sure your tests use the @evaluation_test decorator.")
375
375
  return 1
376
- # Always interactive selection here (no implicit quiet unless --evaluator-id was provided)
376
+ # Always interactive selection here
377
377
  try:
378
378
  selected_tests = _prompt_select(tests, non_interactive=non_interactive)
379
379
  except Exception:
380
- print("Error: Failed to open selector UI. Please pass --evaluator-id or --entry explicitly.")
380
+ print("Error: Failed to open selector UI. Please pass --evaluator or --entry explicitly.")
381
381
  return 1
382
382
  if not selected_tests:
383
383
  print("No tests selected.")
@@ -385,7 +385,7 @@ def create_rft_command(args) -> int:
385
385
  if len(selected_tests) != 1:
386
386
  if non_interactive and len(selected_tests) > 1:
387
387
  print("Error: Multiple evaluation tests found in --yes (non-interactive) mode.")
388
- print(" Please pass --evaluator-id or --entry to disambiguate.")
388
+ print(" Please pass --evaluator or --entry to disambiguate.")
389
389
  try:
390
390
  # Offer candidate evaluator ids for convenience
391
391
  tests = _discover_tests(project_root)
@@ -410,8 +410,13 @@ def create_rft_command(args) -> int:
410
410
  selected_test_file_path, selected_test_func_name = _resolve_selected_test(
411
411
  project_root, evaluator_id, selected_tests=selected_tests
412
412
  )
413
- # Resolve evaluator resource name to fully-qualified format required by API
414
- evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
413
+ # Resolve evaluator resource name to fully-qualified format required by API.
414
+ # Allow users to pass either short id or fully-qualified resource.
415
+ if evaluator_id and evaluator_id.startswith("accounts/"):
416
+ evaluator_resource_name = evaluator_id
417
+ evaluator_id = _extract_terminal_segment(evaluator_id)
418
+ else:
419
+ evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
415
420
 
416
421
  # Optional short-circuit: if evaluator already exists and not forcing, skip upload path
417
422
  skip_upload = False
@@ -470,10 +475,10 @@ def create_rft_command(args) -> int:
470
475
  # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
471
476
  if selected_entry is None and len(tests) > 1:
472
477
  print(
473
- f"Error: Multiple evaluation tests found, and the selected evaluator_id {evaluator_id} does not match any discovered test.\n"
474
- " Please re-run specifying the evaluator id.\n"
478
+ f"Error: Multiple evaluation tests found, and the selected evaluator {evaluator_id} does not match any discovered test.\n"
479
+ " Please re-run specifying the evaluator.\n"
475
480
  " Hints:\n"
476
- " - eval-protocol create rft --evaluator-id <existing-evaluator-id>\n"
481
+ " - eval-protocol create rft --evaluator <existing-evaluator-id>\n"
477
482
  )
478
483
  return 1
479
484
 
@@ -523,10 +528,15 @@ def create_rft_command(args) -> int:
523
528
  print(f"Warning: Failed to upload evaluator automatically: {e}")
524
529
 
525
530
  # Determine dataset id and materialization path
526
- dataset_id = getattr(args, "dataset_id", None)
531
+ dataset_id = getattr(args, "dataset", None)
527
532
  dataset_jsonl = getattr(args, "dataset_jsonl", None)
528
533
  dataset_display_name = getattr(args, "dataset_display_name", None)
529
534
  dataset_builder = getattr(args, "dataset_builder", None) # accepted but unused in simplified flow
535
+ dataset_resource_override: Optional[str] = None
536
+ if isinstance(dataset_id, str) and dataset_id.startswith("accounts/"):
537
+ # Caller passed a fully-qualified dataset; capture it for body and keep only terminal id for printing
538
+ dataset_resource_override = dataset_id
539
+ dataset_id = _extract_terminal_segment(dataset_id)
530
540
 
531
541
  if not dataset_id:
532
542
  # Prefer explicit --dataset-jsonl, else attempt to extract from the selected test's data loader or input_dataset.
@@ -573,7 +583,7 @@ def create_rft_command(args) -> int:
573
583
  print(f"Warning: dataset builder failed: {e}")
574
584
  if not dataset_jsonl:
575
585
  print(
576
- "Error: Could not determine dataset. Provide --dataset-id or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
586
+ "Error: Could not determine dataset. Provide --dataset or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
577
587
  )
578
588
  return 1
579
589
 
@@ -628,6 +638,8 @@ def create_rft_command(args) -> int:
628
638
  ("learningRate", "learning_rate"),
629
639
  ("maxContextLength", "max_context_length"),
630
640
  ("loraRank", "lora_rank"),
641
+ ("gradientAccumulationSteps", "gradient_accumulation_steps"),
642
+ ("learningRateWarmupSteps", "learning_rate_warmup_steps"),
631
643
  ("acceleratorCount", "accelerator_count"),
632
644
  ("region", "region"),
633
645
  ]:
@@ -640,14 +652,25 @@ def create_rft_command(args) -> int:
640
652
  ("temperature", "temperature"),
641
653
  ("topP", "top_p"),
642
654
  ("topK", "top_k"),
643
- ("maxTokens", "max_tokens"),
644
- ("n", "n"),
655
+ ("maxTokens", "max_output_tokens"),
656
+ ("n", "response_candidates_count"),
645
657
  ]:
646
658
  val = getattr(args, arg_name, None)
647
659
  if val is not None:
648
660
  inference_params[key] = val
649
- if getattr(args, "inference_extra_body", None):
650
- inference_params["extraBody"] = args.inference_extra_body
661
+ if getattr(args, "extra_body", None):
662
+ extra = getattr(args, "extra_body")
663
+ if isinstance(extra, (dict, list)):
664
+ try:
665
+ inference_params["extraBody"] = json.dumps(extra, ensure_ascii=False)
666
+ except (TypeError, ValueError) as e:
667
+ print(f"Error: --extra-body dict/list must be JSON-serializable: {e}")
668
+ return 1
669
+ elif isinstance(extra, str):
670
+ inference_params["extraBody"] = extra
671
+ else:
672
+ print("Error: --extra-body must be a JSON string or a JSON-serializable dict/list.")
673
+ return 1
651
674
 
652
675
  wandb_config: Optional[Dict[str, Any]] = None
653
676
  if getattr(args, "wandb_enabled", False):
@@ -659,9 +682,12 @@ def create_rft_command(args) -> int:
659
682
  "runId": getattr(args, "wandb_run_id", None),
660
683
  }
661
684
 
685
+ # Build dataset resource (prefer override when provided)
686
+ dataset_resource = dataset_resource_override or f"accounts/{account_id}/datasets/{dataset_id}"
687
+
662
688
  body: Dict[str, Any] = {
663
- # "displayName": getattr(args, "display_name", None) or f"{evaluator_id}-rft",
664
- "dataset": f"accounts/{account_id}/datasets/{dataset_id}",
689
+ "displayName": getattr(args, "display_name", None),
690
+ "dataset": dataset_resource,
665
691
  "evaluator": evaluator_resource_name,
666
692
  "evalAutoCarveout": bool(getattr(args, "eval_auto_carveout", True)),
667
693
  "trainingConfig": training_config,
@@ -670,7 +696,8 @@ def create_rft_command(args) -> int:
670
696
  "chunkSize": getattr(args, "chunk_size", None),
671
697
  "outputStats": None,
672
698
  "outputMetrics": None,
673
- "mcpServer": None,
699
+ "mcpServer": getattr(args, "mcp_server", None),
700
+ "jobId": getattr(args, "job_id", None),
674
701
  }
675
702
  # Debug: print minimal summary
676
703
  print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
@@ -2,6 +2,7 @@ import argparse
2
2
  import os
3
3
  import subprocess
4
4
  import sys
5
+ import shlex
5
6
  from typing import List
6
7
 
7
8
  from .upload import _discover_tests, _prompt_select
@@ -24,16 +25,15 @@ def _run_pytest_host(pytest_target: str) -> int:
24
25
  return proc.returncode
25
26
 
26
27
 
27
- def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
28
+ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool:
28
29
  context_dir = os.path.dirname(dockerfile_path)
29
30
  print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
30
31
  try:
31
- proc = subprocess.run(
32
- ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
33
- stdout=subprocess.PIPE,
34
- stderr=subprocess.STDOUT,
35
- text=True,
36
- )
32
+ base_cmd = ["docker", "build"]
33
+ if build_extras:
34
+ base_cmd += build_extras
35
+ base_cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
36
+ proc = subprocess.run(base_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
37
37
  print(proc.stdout)
38
38
  return proc.returncode == 0
39
39
  except FileNotFoundError:
@@ -41,7 +41,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
41
41
  return False
42
42
 
43
43
 
44
- def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
44
+ def _run_pytest_in_docker(
45
+ project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None
46
+ ) -> int:
45
47
  workdir = "/workspace"
46
48
  # Host HOME logs directory to map into container
47
49
  host_home = os.path.expanduser("~")
@@ -73,6 +75,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
73
75
  cmd += ["--user", f"{uid}:{gid}"]
74
76
  except Exception:
75
77
  pass
78
+ if run_extras:
79
+ cmd += run_extras
76
80
  cmd += [image_tag, "pytest", pytest_target, "-vs"]
77
81
  print("Running in Docker:", " ".join(cmd))
78
82
  try:
@@ -91,11 +95,16 @@ def local_test_command(args: argparse.Namespace) -> int:
91
95
  entry = getattr(args, "entry", None)
92
96
  if entry:
93
97
  if "::" in entry:
94
- file_part = entry.split("::", 1)[0]
98
+ file_part, func_part = entry.split("::", 1)
95
99
  file_path = (
96
100
  file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
97
101
  )
98
- pytest_target = entry
102
+ # Convert to project-relative like the non-:: path
103
+ try:
104
+ rel = os.path.relpath(file_path, project_root)
105
+ except Exception:
106
+ rel = file_path
107
+ pytest_target = f"{rel}::{func_part}"
99
108
  else:
100
109
  file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
101
110
  # Use path relative to project_root when possible
@@ -126,6 +135,10 @@ def local_test_command(args: argparse.Namespace) -> int:
126
135
  pytest_target = rel
127
136
 
128
137
  ignore_docker = bool(getattr(args, "ignore_docker", False))
138
+ build_extras_str = getattr(args, "docker_build_extra", "") or ""
139
+ run_extras_str = getattr(args, "docker_run_extra", "") or ""
140
+ build_extras = shlex.split(build_extras_str) if build_extras_str else []
141
+ run_extras = shlex.split(run_extras_str) if run_extras_str else []
129
142
  if ignore_docker:
130
143
  if not pytest_target:
131
144
  print("Error: Failed to resolve a pytest target to run.")
@@ -146,14 +159,14 @@ def local_test_command(args: argparse.Namespace) -> int:
146
159
  except Exception:
147
160
  pass
148
161
  image_tag = "ep-evaluator:local"
149
- ok = _build_docker_image(dockerfiles[0], image_tag)
162
+ ok = _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras)
150
163
  if not ok:
151
164
  print("Docker build failed. See logs above.")
152
165
  return 1
153
166
  if not pytest_target:
154
167
  print("Error: Failed to resolve a pytest target to run.")
155
168
  return 1
156
- return _run_pytest_in_docker(project_root, image_tag, pytest_target)
169
+ return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras)
157
170
 
158
171
  # No Dockerfile: run on host
159
172
  if not pytest_target:
@@ -8,6 +8,7 @@ import time
8
8
  import uuid
9
9
  from pathlib import Path
10
10
  from typing import Any, Callable, Dict, Iterable, Optional, Tuple
11
+ from urllib.parse import urlencode
11
12
 
12
13
  import requests
13
14
 
@@ -186,6 +187,14 @@ def create_reinforcement_fine_tuning_job(
186
187
  body: Dict[str, Any],
187
188
  ) -> Dict[str, Any]:
188
189
  url = f"{api_base.rstrip('/')}/v1/accounts/{account_id}/reinforcementFineTuningJobs"
190
+ # Move optional jobId from body to query parameter if provided
191
+ job_id = body.get("jobId")
192
+ if isinstance(job_id, str):
193
+ job_id = job_id.strip()
194
+ if job_id:
195
+ # Remove from body and append as query param
196
+ body.pop("jobId", None)
197
+ url = f"{url}?{urlencode({'reinforcementFineTuningJobId': job_id})}"
189
198
  headers = {
190
199
  "Authorization": f"Bearer {api_key}",
191
200
  "Content-Type": "application/json",