eval-protocol 0.2.82__tar.gz → 0.2.83__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (448)
  1. {eval_protocol-0.2.82/eval_protocol.egg-info → eval_protocol-0.2.83}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/create_rft.py +25 -120
  4. {eval_protocol-0.2.82 → eval_protocol-0.2.83/eval_protocol.egg-info}/PKG-INFO +1 -1
  5. eval_protocol-0.2.83/tests/test_cli_create_rft_infer.py +388 -0
  6. eval_protocol-0.2.82/tests/test_cli_create_rft_infer.py +0 -314
  7. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/LICENSE +0 -0
  8. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/README.md +0 -0
  9. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/development/__init__.py +0 -0
  10. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/development/normalize_sandbox_fusion.py +0 -0
  11. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/development/utils/__init__.py +0 -0
  12. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/development/utils/generate_api_key.py +0 -0
  13. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/development/utils/subprocess_manager.py +0 -0
  14. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/__init__.py +0 -0
  15. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/__main__.py +0 -0
  16. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/__init__.py +0 -0
  17. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/base.py +0 -0
  18. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/bigquery.py +0 -0
  19. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/braintrust.py +0 -0
  20. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  21. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/huggingface.py +0 -0
  22. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/langchain.py +0 -0
  23. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/langfuse.py +0 -0
  24. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/langsmith.py +0 -0
  25. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/openai_responses.py +0 -0
  26. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/trl.py +0 -0
  27. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/utils.py +0 -0
  28. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/adapters/weave.py +0 -0
  29. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/__init__.py +0 -0
  30. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/models.py +0 -0
  31. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/orchestrator.py +0 -0
  32. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resource_abc.py +0 -0
  33. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resource_pool.py +0 -0
  34. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/__init__.py +0 -0
  35. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  36. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  37. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  38. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  39. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  40. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/docker_resource.py +0 -0
  41. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  42. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  43. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/resources/sql_resource.py +0 -0
  44. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/task_manager.py +0 -0
  45. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/agent/tool_registry.py +0 -0
  46. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/auth.py +0 -0
  47. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/__init__.py +0 -0
  48. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  49. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  50. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/test_aime25.py +0 -0
  51. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  52. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  53. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  54. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  55. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  56. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli.py +0 -0
  57. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/__init__.py +0 -0
  58. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  59. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/common.py +0 -0
  60. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/deploy.py +0 -0
  61. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  62. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/logs.py +0 -0
  63. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/preview.py +0 -0
  64. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  65. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/cli_commands/upload.py +0 -0
  66. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/common_utils.py +0 -0
  67. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/config.py +0 -0
  68. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/data_loader/__init__.py +0 -0
  69. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  70. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  71. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  72. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  73. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/data_loader/models.py +0 -0
  74. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/dataset_logger/__init__.py +0 -0
  75. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  76. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  77. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  78. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  79. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/datasets/__init__.py +0 -0
  80. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/datasets/loader.py +0 -0
  81. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/directory_utils.py +0 -0
  82. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/evaluation.py +0 -0
  83. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/event_bus/__init__.py +0 -0
  84. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/event_bus/event_bus.py +0 -0
  85. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/event_bus/logger.py +0 -0
  86. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  87. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  88. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/exceptions.py +0 -0
  89. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/execution/__init__.py +0 -0
  90. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/execution/pipeline.py +0 -0
  91. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/fireworks_rft.py +0 -0
  92. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/gcp_tools.py +0 -0
  93. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/generation/cache.py +0 -0
  94. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/generation/clients/base.py +0 -0
  95. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/generation/clients.py +0 -0
  96. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/generic_server.py +0 -0
  97. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/get_pep440_version.py +0 -0
  98. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/human_id/__init__.py +0 -0
  99. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/human_id/dictionary.py +0 -0
  100. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/integrations/__init__.py +0 -0
  101. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/integrations/deepeval.py +0 -0
  102. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/integrations/openeval.py +0 -0
  103. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/integrations/trl.py +0 -0
  104. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/__init__.py +0 -0
  105. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  106. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  107. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  108. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  109. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/init.py +0 -0
  110. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/rollout_context.py +0 -0
  111. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  112. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/log_utils/util.py +0 -0
  113. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/logging_utils.py +0 -0
  114. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/__init__.py +0 -0
  115. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/adapter.py +0 -0
  116. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/client/__init__.py +0 -0
  117. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/client/connection.py +0 -0
  118. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/clients.py +0 -0
  119. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/execution/__init__.py +0 -0
  120. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/execution/base_policy.py +0 -0
  121. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/execution/manager.py +0 -0
  122. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/execution/policy.py +0 -0
  123. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/grid_renderer.py +0 -0
  124. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  125. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/mcpgym.py +0 -0
  126. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/process_manager.py +0 -0
  127. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/session/__init__.py +0 -0
  128. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/session/manager.py +0 -0
  129. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/simple_process_manager.py +0 -0
  130. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp/simulation_server.py +0 -0
  131. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_agent/__init__.py +0 -0
  132. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_agent/config.py +0 -0
  133. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_agent/main.py +0 -0
  134. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  135. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  136. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  137. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  138. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_env.py +0 -0
  139. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/__init__.py +0 -0
  140. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  141. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  142. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  143. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  144. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  145. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  146. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  147. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  148. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  149. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  150. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  151. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  152. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  153. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  154. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/models.py +0 -0
  155. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/packaging.py +0 -0
  156. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/platform_api.py +0 -0
  157. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/playback_policy.py +0 -0
  158. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/__init__.py +0 -0
  159. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  160. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/app.py +0 -0
  161. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  162. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  163. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  164. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/main.py +0 -0
  165. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/models.py +0 -0
  166. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  167. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/__init__.py +0 -0
  168. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  169. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  170. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  171. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  172. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  173. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  174. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  175. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  176. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  177. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/evaluation_test.py +0 -0
  178. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  179. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  180. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/exception_config.py +0 -0
  181. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/execution.py +0 -0
  182. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  183. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  184. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  185. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/parameterize.py +0 -0
  186. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/plugin.py +0 -0
  187. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  188. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/rollout_processor.py +0 -0
  189. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/store_experiment_link.py +0 -0
  190. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/store_results_url.py +0 -0
  191. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/tracing_utils.py +0 -0
  192. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/types.py +0 -0
  193. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/pytest/validate_signature.py +0 -0
  194. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/__init__.py +0 -0
  195. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  196. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  197. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  198. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  199. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  200. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  201. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  202. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/llm_judge.py +0 -0
  203. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  204. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  205. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  206. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  207. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/quickstart/utils.py +0 -0
  208. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/resources.py +0 -0
  209. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/reward_function.py +0 -0
  210. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/__init__.py +0 -0
  211. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/accuracy.py +0 -0
  212. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/accuracy_length.py +0 -0
  213. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  214. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  215. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/apps_testing_util.py +0 -0
  216. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/bfcl_reward.py +0 -0
  217. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/code_execution.py +0 -0
  218. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/code_execution_utils.py +0 -0
  219. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/cpp_code.py +0 -0
  220. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  221. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/format.py +0 -0
  222. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/function_calling.py +0 -0
  223. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/json_schema.py +0 -0
  224. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/language_consistency.py +0 -0
  225. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/lean_prover.py +0 -0
  226. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/length.py +0 -0
  227. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  228. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/math.py +0 -0
  229. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  230. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/reasoning_steps.py +0 -0
  231. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/repetition.py +0 -0
  232. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rewards/tag_count.py +0 -0
  233. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/rl_processing.py +0 -0
  234. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/server.py +0 -0
  235. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/stats/__init__.py +0 -0
  236. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/stats/confidence_intervals.py +0 -0
  237. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/typed_interface.py +0 -0
  238. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/types/__init__.py +0 -0
  239. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/types/errors.py +0 -0
  240. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/types/remote_rollout_processor.py +0 -0
  241. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/types/types.py +0 -0
  242. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/__init__.py +0 -0
  243. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/batch_evaluation.py +0 -0
  244. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/batch_transformation.py +0 -0
  245. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/browser_utils.py +0 -0
  246. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/check_server_status.py +0 -0
  247. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/dataset_helpers.py +0 -0
  248. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  249. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/logs_models.py +0 -0
  250. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/logs_server.py +0 -0
  251. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/module_loader.py +0 -0
  252. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/packaging_utils.py +0 -0
  253. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/show_results_url.py +0 -0
  254. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/static_policy.py +0 -0
  255. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/subprocess_utils.py +0 -0
  256. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol/utils/vite_server.py +0 -0
  257. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol.egg-info/SOURCES.txt +0 -0
  258. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol.egg-info/dependency_links.txt +0 -0
  259. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol.egg-info/entry_points.txt +0 -0
  260. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol.egg-info/requires.txt +0 -0
  261. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/eval_protocol.egg-info/top_level.txt +0 -0
  262. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/pyproject.toml +0 -0
  263. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/setup.cfg +0 -0
  264. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/setup.py +0 -0
  265. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_accuracy.py +0 -0
  266. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_accuracy_length.py +0 -0
  267. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_adapters_e2e.py +0 -0
  268. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_agent_orchestrator.py +0 -0
  269. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_agent_resources.py +0 -0
  270. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_auth.py +0 -0
  271. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_batch_evaluation.py +0 -0
  272. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_cli.py +0 -0
  273. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_cli_agent.py +0 -0
  274. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_cli_args.py +0 -0
  275. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_code_execution.py +0 -0
  276. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_config.py +0 -0
  277. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_control_plane_separation.py +0 -0
  278. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_cpp_code.py +0 -0
  279. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_data_driven_task_manager.py +0 -0
  280. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_deepcoder_reward.py +0 -0
  281. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_deepeval_integration.py +0 -0
  282. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_deploy_integration.py +0 -0
  283. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_directory_utils.py +0 -0
  284. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_e2b_integration.py +0 -0
  285. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_e2b_js_integration.py +0 -0
  286. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_edge_cases.py +0 -0
  287. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_ep_upload_e2e.py +0 -0
  288. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_eval_protocol_import.py +0 -0
  289. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_evaluation.py +0 -0
  290. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_evaluation_integration.py +0 -0
  291. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_evaluation_postprocess.py +0 -0
  292. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_evaluation_preview_integration.py +0 -0
  293. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_event_bus.py +0 -0
  294. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_event_bus_helper.py +0 -0
  295. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_examples_end_to_end.py +0 -0
  296. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_exceptions.py +0 -0
  297. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_fireworks_api.py +0 -0
  298. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_format.py +0 -0
  299. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_fractional_code.py +0 -0
  300. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_function_calling.py +0 -0
  301. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_gcp_tools.py +0 -0
  302. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_generic_server.py +0 -0
  303. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_human_id.py +0 -0
  304. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_integration.py +0 -0
  305. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_json_schema.py +0 -0
  306. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_kwargs_validation.py +0 -0
  307. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_language_consistency.py +0 -0
  308. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_lean_prover.py +0 -0
  309. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_lean_prover_runner.py +0 -0
  310. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_length.py +0 -0
  311. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_list_comparison_math_reward.py +0 -0
  312. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_logs_server.py +0 -0
  313. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_logs_server_simple.py +0 -0
  314. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_math.py +0 -0
  315. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_message_field_filtering.py +0 -0
  316. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_minimal.py +0 -0
  317. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_models.py +0 -0
  318. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_models_rl.py +0 -0
  319. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_multiple_choice_math_reward.py +0 -0
  320. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_n_variant_batch_integration.py +0 -0
  321. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_n_variant_integration.py +0 -0
  322. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_openai_compatibility.py +0 -0
  323. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_openeval_integration.py +0 -0
  324. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_packaging.py +0 -0
  325. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_parallel_rollouts.py +0 -0
  326. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_platform_api.py +0 -0
  327. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_quickstart_utils.py +0 -0
  328. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_readiness.py +0 -0
  329. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_reasoning_steps.py +0 -0
  330. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_repetition.py +0 -0
  331. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_repetition_debug.py +0 -0
  332. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_retry_mechanism.py +0 -0
  333. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_reward_function.py +0 -0
  334. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_reward_protocol_import.py +0 -0
  335. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_rl_processing.py +0 -0
  336. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_rollout_control_plane_integration.py +0 -0
  337. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_server.py +0 -0
  338. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_show_results_url.py +0 -0
  339. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_status_migration_changes.py +0 -0
  340. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_status_migration_integration.py +0 -0
  341. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_status_model.py +0 -0
  342. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_tag_count.py +0 -0
  343. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_tau_bench_airline_smoke.py +0 -0
  344. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_typed_interface.py +0 -0
  345. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_typed_interface_rl.py +0 -0
  346. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_upload_entrypoint.py +0 -0
  347. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_url_handling.py +0 -0
  348. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/tests/test_vite_server.py +0 -0
  349. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/__init__.py +0 -0
  350. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/agent/__init__.py +0 -0
  351. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/agent/base.py +0 -0
  352. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/agent/llm_agent.py +0 -0
  353. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/api_service/__init__.py +0 -0
  354. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/api_service/api_config.py +0 -0
  355. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/api_service/data_model.py +0 -0
  356. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/api_service/simulation_service.py +0 -0
  357. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/cli.py +0 -0
  358. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/config.py +0 -0
  359. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/airline/policy.md +0 -0
  360. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/mock/policy.md +0 -0
  361. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  362. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/retail/policy.md +0 -0
  363. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  364. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  365. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  366. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  367. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  368. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  369. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  370. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data_model/__init__.py +0 -0
  371. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data_model/message.py +0 -0
  372. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data_model/simulation.py +0 -0
  373. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/data_model/tasks.py +0 -0
  374. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/__init__.py +0 -0
  375. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/airline/__init__.py +0 -0
  376. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/airline/data_model.py +0 -0
  377. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/airline/environment.py +0 -0
  378. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/airline/tools.py +0 -0
  379. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/airline/utils.py +0 -0
  380. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/mock/__init__.py +0 -0
  381. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/mock/data_model.py +0 -0
  382. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/mock/environment.py +0 -0
  383. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/mock/tools.py +0 -0
  384. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/mock/utils.py +0 -0
  385. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/retail/__init__.py +0 -0
  386. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/retail/data_model.py +0 -0
  387. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/retail/environment.py +0 -0
  388. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/retail/tools.py +0 -0
  389. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/retail/utils.py +0 -0
  390. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/__init__.py +0 -0
  391. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/data_model.py +0 -0
  392. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/environment.py +0 -0
  393. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  394. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  395. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  396. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  397. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  398. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  399. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  400. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  401. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/tools.py +0 -0
  402. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  403. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  404. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/domains/telecom/utils.py +0 -0
  405. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/environment/__init__.py +0 -0
  406. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/environment/db.py +0 -0
  407. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/environment/environment.py +0 -0
  408. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/environment/server.py +0 -0
  409. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/environment/tool.py +0 -0
  410. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/environment/toolkit.py +0 -0
  411. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  412. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/evaluator/__init__.py +0 -0
  413. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/evaluator/evaluator.py +0 -0
  414. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  415. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  416. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  417. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  418. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  419. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/metrics/__init__.py +0 -0
  420. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/metrics/agent_metrics.py +0 -0
  421. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  422. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/orchestrator/__init__.py +0 -0
  423. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  424. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  425. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/orchestrator/utils.py +0 -0
  426. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/registry.py +0 -0
  427. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/run.py +0 -0
  428. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/scripts/__init__.py +0 -0
  429. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/scripts/check_data.py +0 -0
  430. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  431. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/scripts/start_servers.py +0 -0
  432. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/scripts/view_simulations.py +0 -0
  433. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/user/__init__.py +0 -0
  434. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/user/base.py +0 -0
  435. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/user/user_simulator.py +0 -0
  436. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/utils/__init__.py +0 -0
  437. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/utils/display.py +0 -0
  438. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/utils/io_utils.py +0 -0
  439. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/utils/llm_utils.py +0 -0
  440. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/utils/pydantic_utils.py +0 -0
  441. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vendor/tau2/utils/utils.py +0 -0
  442. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/versioneer.py +0 -0
  443. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  444. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
  445. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
  446. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
  447. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  448. {eval_protocol-0.2.82 → eval_protocol-0.2.83}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.82
3
+ Version: 0.2.83
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-11-08T19:39:06-0800",
11
+ "date": "2025-11-09T23:23:12-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "69e53a7d7a70440177cb5545eb23f1b953994da9",
15
- "version": "0.2.82"
14
+ "full-revisionid": "a533dcb232528e3910d94adb922c6ab7df27bc4e",
15
+ "version": "0.2.83"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -20,88 +20,7 @@ from ..fireworks_rft import (
20
20
  create_dataset_from_jsonl,
21
21
  create_reinforcement_fine_tuning_job,
22
22
  )
23
- from .upload import _discover_tests, _normalize_evaluator_id, _resolve_entry_to_qual_and_source
24
-
25
-
26
- def _last_evaluator_paths(cwd: str) -> list[str]:
27
- return [
28
- os.path.join(cwd, ".eval_protocol", "last_evaluator.json"),
29
- os.path.expanduser(os.path.join("~", ".eval_protocol", "last_evaluator.json")),
30
- ]
31
-
32
-
33
- def _load_last_evaluator(cwd: str) -> Optional[str]:
34
- import json
35
-
36
- for p in _last_evaluator_paths(cwd):
37
- try:
38
- if os.path.isfile(p):
39
- with open(p, "r", encoding="utf-8") as f:
40
- data = json.load(f)
41
- if isinstance(data, dict) and data.get("evaluator_id"):
42
- return str(data["evaluator_id"])
43
- except Exception:
44
- # ignore and continue
45
- pass
46
- return None
47
-
48
-
49
- def _save_last_evaluator(cwd: str, evaluator_id: str) -> None:
50
- import json
51
-
52
- base = os.path.join(cwd, ".eval_protocol")
53
- try:
54
- os.makedirs(base, exist_ok=True)
55
- with open(os.path.join(base, "last_evaluator.json"), "w", encoding="utf-8") as f:
56
- json.dump({"evaluator_id": evaluator_id, "ts": time.time()}, f)
57
- except Exception:
58
- # best-effort only
59
- pass
60
-
61
-
62
- def _gather_evaluator_traces(cwd: str) -> list[dict]:
63
- roots = [
64
- os.path.join(cwd, ".eval_protocol", "evaluators"),
65
- os.path.expanduser(os.path.join("~", ".eval_protocol", "evaluators")),
66
- ]
67
- records: list[dict] = []
68
- for root in roots:
69
- if os.path.isdir(root):
70
- for name in os.listdir(root):
71
- if name.endswith(".json"):
72
- full = os.path.join(root, name)
73
- try:
74
- mtime = os.path.getmtime(full)
75
- except Exception:
76
- mtime = 0.0
77
- records.append({"id": name[:-5], "path": full, "mtime": mtime})
78
- # dedupe by id keeping most recent mtime
79
- dedup: dict[str, dict] = {}
80
- for rec in records:
81
- cur = dedup.get(rec["id"])
82
- if not cur or rec["mtime"] > cur["mtime"]:
83
- dedup[rec["id"]] = rec
84
- return list(dedup.values())
85
-
86
-
87
- def _prompt_select_evaluator(candidates: list[dict]) -> Optional[str]:
88
- print("\nMultiple evaluators detected. Select one:")
89
- ordered = sorted(candidates, key=lambda x: -x["mtime"])
90
- for i, c in enumerate(ordered, start=1):
91
- print(f" {i}) {c['id']} (from {c['path']})")
92
- try:
93
- choice = input("Enter a number (or press Enter to cancel): ").strip()
94
- except KeyboardInterrupt:
95
- print("\nCancelled.")
96
- return None
97
- if not choice or not choice.isdigit():
98
- return None
99
- n = int(choice)
100
- if 1 <= n <= len(ordered):
101
- sel = ordered[n - 1]["id"]
102
- print(f"✓ Using evaluator: {sel}")
103
- return sel
104
- return None
23
+ from .upload import _discover_tests, _normalize_evaluator_id, _prompt_select
105
24
 
106
25
 
107
26
  def _ensure_account_id() -> Optional[str]:
@@ -331,37 +250,6 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str:
331
250
  return f"{base}{suffix}"
332
251
 
333
252
 
334
- def _auto_select_evaluator_id(cwd: str, *, non_interactive: bool = False) -> Optional[str]:
335
- # 1) Use last used pointer if available
336
- last = _load_last_evaluator(cwd)
337
- if last:
338
- return last
339
-
340
- # 2) Look for evaluator traces in project and home
341
- traces = _gather_evaluator_traces(cwd)
342
- if len(traces) == 1:
343
- return traces[0]["id"]
344
- if len(traces) > 1:
345
- if non_interactive:
346
- sel = sorted(traces, key=lambda x: -x["mtime"])[0]["id"]
347
- print(f"⚠️ Multiple evaluators found; using most recent: {sel}. Override with --evaluator-id.")
348
- return sel
349
- chosen = _prompt_select_evaluator(traces)
350
- if chosen:
351
- return chosen
352
- return None
353
-
354
- # 3) Fall back to discovering a single evaluation_test
355
- tests = _discover_tests(cwd)
356
- if len(tests) == 1:
357
- qualname, source_file_path = tests[0].qualname, tests[0].file_path
358
- test_func_name = qualname.split(".")[-1]
359
- source_file_name = os.path.splitext(os.path.basename(source_file_path))[0]
360
- evaluator_id = _normalize_evaluator_id(f"{source_file_name}-{test_func_name}")
361
- return evaluator_id
362
- return None
363
-
364
-
365
253
  def _poll_evaluator_status(
366
254
  evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
367
255
  ) -> bool:
@@ -441,13 +329,31 @@ def create_rft_command(args) -> int:
441
329
 
442
330
  api_base = get_fireworks_api_base()
443
331
 
444
- # Resolve evaluator id if omitted
332
+ # Resolve evaluator id/entry if omitted (reuse upload's selector flow)
445
333
  project_root = os.getcwd()
446
334
  if not evaluator_id:
447
- evaluator_id = _auto_select_evaluator_id(project_root, non_interactive=non_interactive)
448
- if not evaluator_id:
449
- print("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first.")
335
+ print("Scanning for evaluation tests...")
336
+ tests = _discover_tests(project_root)
337
+ if not tests:
338
+ print("No evaluation tests found.")
339
+ print("\nHint: Make sure your tests use the @evaluation_test decorator.")
340
+ return 1
341
+ # Always interactive selection here (no implicit quiet unless --evaluator-id was provided)
342
+ try:
343
+ selected_tests = _prompt_select(tests, non_interactive=non_interactive)
344
+ except Exception:
345
+ print("Error: Failed to open selector UI. Please pass --evaluator-id or --entry explicitly.")
346
+ return 1
347
+ if not selected_tests:
348
+ print("No tests selected.")
349
+ return 1
350
+ if len(selected_tests) != 1:
351
+ print("Error: Please select exactly one evaluation test for 'create rft'.")
450
352
  return 1
353
+ chosen = selected_tests[0]
354
+ func_name = chosen.qualname.split(".")[-1]
355
+ source_file_name = os.path.splitext(os.path.basename(chosen.file_path))[0]
356
+ evaluator_id = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
451
357
  # Resolve evaluator resource name to fully-qualified format required by API
452
358
  evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
453
359
 
@@ -479,7 +385,6 @@ def create_rft_command(args) -> int:
479
385
  print(f"📊 Please check the evaluator status at: {dashboard_url}")
480
386
  print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
481
387
  return 1
482
- _save_last_evaluator(project_root, evaluator_id)
483
388
  skip_upload = True
484
389
  except requests.exceptions.RequestException:
485
390
  pass
@@ -561,8 +466,8 @@ def create_rft_command(args) -> int:
561
466
  print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
562
467
  return 1
563
468
  else:
564
- # Only persist last-used evaluator after successful ensure + ACTIVE
565
- _save_last_evaluator(project_root, evaluator_id)
469
+ # Evaluator ACTIVE; proceed
470
+ pass
566
471
  else:
567
472
  print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
568
473
  except Exception as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.82
3
+ Version: 0.2.83
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -0,0 +1,388 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from types import SimpleNamespace
5
+ from unittest.mock import patch
6
+
7
+ import pytest
8
+
9
+ from eval_protocol.cli_commands import create_rft as cr
10
+
11
+
12
+ def _write_json(path: str, data: dict) -> None:
13
+ os.makedirs(os.path.dirname(path), exist_ok=True)
14
+ with open(path, "w", encoding="utf-8") as f:
15
+ json.dump(data, f)
16
+
17
+
18
+ def test_create_rft_picks_most_recent_evaluator_and_dataset_id_follows(tmp_path, monkeypatch):
19
+ # Isolate HOME so expanduser paths remain inside tmp
20
+ monkeypatch.setenv("HOME", str(tmp_path / "home"))
21
+
22
+ # Create a fake project and chdir into it (create_rft uses os.getcwd())
23
+ project = tmp_path / "proj"
24
+ project.mkdir()
25
+ monkeypatch.chdir(project)
26
+
27
+ # Create a dummy dataset jsonl file
28
+ ds_path = project / "evaluator" / "dummy_dataset.jsonl"
29
+ ds_path.parent.mkdir(parents=True, exist_ok=True)
30
+ ds_path.write_text('{"input":"x"}\n', encoding="utf-8")
31
+
32
+ # Env required by create_rft_command
33
+ monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
34
+ monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123")
35
+ monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
36
+
37
+ # Stub out networked/subcommands used by create_rft
38
+ # Patch selector and upload
39
+ import eval_protocol.cli_commands.upload as upload_mod
40
+
41
+ # Simulate exactly one discovered test and selector returning it
42
+ one_file = project / "metric" / "test_single.py"
43
+ one_file.parent.mkdir(parents=True, exist_ok=True)
44
+ one_file.write_text("# single", encoding="utf-8")
45
+ single_disc = SimpleNamespace(qualname="metric.test_single", file_path=str(one_file))
46
+ monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [single_disc])
47
+ monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
48
+ monkeypatch.setattr(upload_mod, "upload_command", lambda args: 0)
49
+ monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
50
+
51
+ captured = {"dataset_id": None}
52
+
53
+ def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
54
+ captured["dataset_id"] = dataset_id
55
+ return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"}
56
+
57
+ monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl)
58
+ monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"})
59
+
60
+ # Build args: non_interactive (yes=True), no explicit evaluator_id, valid warm_start_from
61
+ args = type("Args", (), {})()
62
+ setattr(args, "evaluator_id", None)
63
+ setattr(args, "yes", True)
64
+ setattr(args, "dry_run", False)
65
+ setattr(args, "force", False)
66
+ setattr(args, "env_file", None)
67
+ setattr(args, "dataset_id", None)
68
+ setattr(args, "dataset_jsonl", str(ds_path))
69
+ setattr(args, "dataset_display_name", None)
70
+ setattr(args, "dataset_builder", None)
71
+ setattr(args, "base_model", None)
72
+ setattr(args, "warm_start_from", "accounts/acct123/models/ft-abc123")
73
+ setattr(args, "output_model", None)
74
+ setattr(args, "n", None)
75
+ setattr(args, "max_tokens", None)
76
+ setattr(args, "learning_rate", None)
77
+ setattr(args, "batch_size", None)
78
+ setattr(args, "epochs", None)
79
+ setattr(args, "lora_rank", None)
80
+ setattr(args, "max_context_length", None)
81
+ setattr(args, "chunk_size", None)
82
+ setattr(args, "eval_auto_carveout", None)
83
+
84
+ rc = cr.create_rft_command(args)
85
+ assert rc == 0
86
+
87
+ # Assert dataset id derived from selected test: metric-test_single
88
+ assert captured["dataset_id"] is not None
89
+ assert captured["dataset_id"].startswith("test-single-test-single-dataset-")
90
+
91
+
92
+ def test_create_rft_passes_matching_evaluator_id_and_entry_with_multiple_tests(tmp_path, monkeypatch):
93
+ # Ensure expanduser paths stay under tmp
94
+ monkeypatch.setenv("HOME", str(tmp_path / "home"))
95
+
96
+ # Project structure and CWD
97
+ project = tmp_path / "proj"
98
+ project.mkdir()
99
+ monkeypatch.chdir(project)
100
+
101
+ # Create dummy test files for discovery
102
+ eval_dir = project / "evaluator"
103
+ eval_dir.mkdir(parents=True, exist_ok=True)
104
+ cal_file = eval_dir / "foo_eval.py"
105
+ svg_file = eval_dir / "bar_eval.py"
106
+ cal_file.write_text("# foo", encoding="utf-8")
107
+ svg_file.write_text("# bar", encoding="utf-8")
108
+
109
+ # Fake discovered tests: foo and bar
110
+ cal_disc = SimpleNamespace(qualname="foo_eval.test_bar_evaluation", file_path=str(cal_file))
111
+ svg_disc = SimpleNamespace(qualname="bar_eval.test_baz_evaluation", file_path=str(svg_file))
112
+ monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [cal_disc, svg_disc])
113
+
114
+ # Env for CLI
115
+ monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
116
+ monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123")
117
+ monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
118
+
119
+ # Capture what upload receives (id and entry)
120
+ captured = {"id": None, "entry": None, "dataset_id": None}
121
+
122
+ # Monkeypatch the upload command from the upload module (the function imports it inside)
123
+ import eval_protocol.cli_commands.upload as upload_mod
124
+
125
+ def _fake_upload(ns):
126
+ captured["id"] = getattr(ns, "id", None)
127
+ captured["entry"] = getattr(ns, "entry", None)
128
+ return 0
129
+
130
+ monkeypatch.setattr(upload_mod, "upload_command", _fake_upload)
131
+
132
+ # Avoid network and capture dataset id
133
+ monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
134
+
135
+ def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
136
+ captured["dataset_id"] = dataset_id
137
+ return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"}
138
+
139
+ monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl)
140
+ monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"})
141
+
142
+ # Provide a dataset jsonl so flow proceeds
143
+ ds_path = eval_dir / "dummy_dataset.jsonl"
144
+ ds_path.write_text('{"input":"x"}\n', encoding="utf-8")
145
+
146
+ # Build args: no explicit evaluator id, selector will not be used here; mapping by id
147
+ import argparse
148
+
149
+ args = argparse.Namespace(
150
+ evaluator_id=cr._normalize_evaluator_id("foo_eval-test_bar_evaluation"),
151
+ yes=True,
152
+ dry_run=False,
153
+ force=False,
154
+ env_file=None,
155
+ dataset_id=None,
156
+ dataset_jsonl=str(ds_path),
157
+ dataset_display_name=None,
158
+ dataset_builder=None,
159
+ base_model=None,
160
+ warm_start_from="accounts/acct123/models/ft-abc123",
161
+ output_model=None,
162
+ n=None,
163
+ max_tokens=None,
164
+ learning_rate=None,
165
+ batch_size=None,
166
+ epochs=None,
167
+ lora_rank=None,
168
+ max_context_length=None,
169
+ chunk_size=None,
170
+ eval_auto_carveout=None,
171
+ )
172
+
173
+ rc = cr.create_rft_command(args)
174
+ assert rc == 0
175
+
176
+ # Assert evaluator_id passed to upload matches the provided id
177
+ assert captured["id"] == cr._normalize_evaluator_id("foo_eval-test_bar_evaluation")
178
+ # Assert entry points to the foo test (should map when id matches normalization)
179
+ assert captured["entry"] is not None and captured["entry"].endswith("foo_eval.py::test_bar_evaluation")
180
+ # Assert dataset id is derived from the same evaluator id (trimmed base + '-dataset-<timestamp>')
181
+ assert captured["dataset_id"] is not None
182
+ expected_prefix = (
183
+ cr._build_trimmed_dataset_id(cr._normalize_evaluator_id("foo_eval-test_bar_evaluation")).split("-dataset-")[0]
184
+ + "-dataset-"
185
+ )
186
+ assert captured["dataset_id"].startswith(expected_prefix)
187
+
188
+
189
+ def test_create_rft_interactive_selector_single_test(tmp_path, monkeypatch):
190
+ # Setup project
191
+ project = tmp_path / "proj"
192
+ project.mkdir()
193
+ monkeypatch.chdir(project)
194
+
195
+ # Single discovered test
196
+ test_file = project / "metric" / "test_one.py"
197
+ test_file.parent.mkdir(parents=True, exist_ok=True)
198
+ test_file.write_text("# one", encoding="utf-8")
199
+ single_disc = SimpleNamespace(qualname="metric.test_one", file_path=str(test_file))
200
+ monkeypatch.setattr(cr, "_discover_tests", lambda cwd: [single_disc])
201
+
202
+ # Environment
203
+ monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
204
+ monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "acct123")
205
+ monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai")
206
+
207
+ # Stub selector to return the single test; stub upload and polling
208
+ import eval_protocol.cli_commands.upload as upload_mod
209
+
210
+ monkeypatch.setattr(upload_mod, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
211
+ captured = {"id": None, "entry": None, "dataset_id": None}
212
+
213
+ def _fake_upload(ns):
214
+ captured["id"] = getattr(ns, "id", None)
215
+ captured["entry"] = getattr(ns, "entry", None)
216
+ return 0
217
+
218
+ monkeypatch.setattr(upload_mod, "upload_command", _fake_upload)
219
+ monkeypatch.setattr(cr, "_poll_evaluator_status", lambda **kwargs: True)
220
+
221
+ # Provide dataset jsonl
222
+ ds_path = project / "metric" / "dataset.jsonl"
223
+ ds_path.write_text('{"input":"x"}\n', encoding="utf-8")
224
+ monkeypatch.setattr(
225
+ cr,
226
+ "create_dataset_from_jsonl",
227
+ lambda account_id, api_key, api_base, dataset_id, display_name, jsonl_path: (
228
+ dataset_id,
229
+ {"name": f"accounts/{account_id}/datasets/{dataset_id}"},
230
+ ),
231
+ )
232
+ monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", lambda *a, **k: {"name": "jobs/123"})
233
+
234
+ # Run without evaluator_id; use --yes so selector returns tests directly (no UI)
235
+ import argparse
236
+
237
+ args = argparse.Namespace(
238
+ evaluator_id=None,
239
+ yes=True,
240
+ dry_run=False,
241
+ force=False,
242
+ env_file=None,
243
+ dataset_id=None,
244
+ dataset_jsonl=str(ds_path),
245
+ dataset_display_name=None,
246
+ dataset_builder=None,
247
+ base_model=None,
248
+ warm_start_from="accounts/acct123/models/ft-abc123",
249
+ output_model=None,
250
+ n=None,
251
+ max_tokens=None,
252
+ learning_rate=None,
253
+ batch_size=None,
254
+ epochs=None,
255
+ lora_rank=None,
256
+ max_context_length=None,
257
+ chunk_size=None,
258
+ eval_auto_carveout=None,
259
+ )
260
+
261
+ rc = cr.create_rft_command(args)
262
+ assert rc == 0
263
+ assert captured["id"] is not None
264
+ assert captured["entry"] is not None and captured["entry"].endswith("test_one.py::test_one")
265
+
266
+
267
def test_create_rft_quiet_existing_evaluator_skips_upload(tmp_path, monkeypatch):
    """Run ``create_rft_command`` non-interactively against a stubbed ACTIVE evaluator.

    The evaluator lookup (``cr.requests.get``), dataset creation and job
    creation are all replaced with stubs, and the command is expected to
    return 0. Note: the test name refers to the upload being skipped, but
    only the return code is asserted here.
    """
    import argparse

    workdir = tmp_path / "proj"
    workdir.mkdir()
    monkeypatch.chdir(workdir)

    # Environment the command reads its Fireworks settings from.
    fireworks_env = {
        "FIREWORKS_API_KEY": "fw_dummy",
        "FIREWORKS_ACCOUNT_ID": "acct123",
        "FIREWORKS_API_BASE": "https://api.fireworks.ai",
    }
    for env_name, env_value in fireworks_env.items():
        monkeypatch.setenv(env_name, env_value)

    # Stubbed HTTP response: the evaluator lookup succeeds and reports ACTIVE.
    class _ActiveEvaluatorResponse:
        ok = True

        def json(self):
            return {"state": "ACTIVE"}

        def raise_for_status(self):
            return None

    def _fake_get(*_args, **_kwargs):
        return _ActiveEvaluatorResponse()

    monkeypatch.setattr(cr.requests, "get", _fake_get)

    # Supply the dataset through --dataset-jsonl so no test discovery is needed.
    jsonl_file = workdir / "dataset.jsonl"
    jsonl_file.write_text('{"input":"x"}\n', encoding="utf-8")

    def _fake_create_dataset(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
        # Echo the id back together with a resource-name payload.
        payload = {"name": f"accounts/{account_id}/datasets/{dataset_id}"}
        return dataset_id, payload

    def _fake_create_job(*_args, **_kwargs):
        return {"name": "jobs/123"}

    monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset)
    monkeypatch.setattr(cr, "create_reinforcement_fine_tuning_job", _fake_create_job)

    cli_options = {
        "evaluator_id": "some-eval",
        "yes": True,
        "dry_run": False,
        "force": False,
        "env_file": None,
        "dataset_id": None,
        "dataset_jsonl": str(jsonl_file),
        "dataset_display_name": None,
        "dataset_builder": None,
        "base_model": None,
        "warm_start_from": "accounts/acct123/models/ft-abc123",
        "output_model": None,
        "n": None,
        "max_tokens": None,
        "learning_rate": None,
        "batch_size": None,
        "epochs": None,
        "lora_rank": None,
        "max_context_length": None,
        "chunk_size": None,
        "eval_auto_carveout": None,
    }
    args = argparse.Namespace(**cli_options)

    exit_code = cr.create_rft_command(args)
    assert exit_code == 0
330
+
331
+
332
def test_create_rft_quiet_new_evaluator_ambiguous_without_entry_errors(tmp_path, monkeypatch):
    """Expect return code 1 when the evaluator lookup fails and two tests are discovered.

    ``cr.requests.get`` is stubbed to raise ``RequestException`` and
    ``cr._discover_tests`` to return two candidates; ``create_rft_command``
    is then expected to return 1.
    """
    import argparse

    import requests

    workdir = tmp_path / "proj"
    workdir.mkdir()
    monkeypatch.chdir(workdir)

    # Environment the command reads its Fireworks settings from.
    fireworks_env = {
        "FIREWORKS_API_KEY": "fw_dummy",
        "FIREWORKS_ACCOUNT_ID": "acct123",
        "FIREWORKS_API_BASE": "https://api.fireworks.ai",
    }
    for env_name, env_value in fireworks_env.items():
        monkeypatch.setenv(env_name, env_value)

    # Evaluator does not exist (force path into upload section).
    def _failing_get(*_args, **_kwargs):
        raise requests.exceptions.RequestException("nope")

    monkeypatch.setattr(cr.requests, "get", _failing_get)

    # Two discovered tests make the choice of entry point ambiguous.
    first_file = workdir / "a.py"
    second_file = workdir / "b.py"
    first_file.write_text("# a", encoding="utf-8")
    second_file.write_text("# b", encoding="utf-8")
    discovered = [
        SimpleNamespace(qualname="a.test_one", file_path=str(first_file)),
        SimpleNamespace(qualname="b.test_two", file_path=str(second_file)),
    ]

    def _fake_discover(cwd):
        return discovered

    monkeypatch.setattr(cr, "_discover_tests", _fake_discover)

    # Create the dataset file so the command does not fail earlier on a missing path.
    dataset_file = workdir / "dataset.jsonl"
    dataset_file.write_text('{"input":"x"}\n', encoding="utf-8")

    cli_options = {
        "evaluator_id": "some-eval",
        "yes": True,
        "dry_run": False,
        "force": False,
        "env_file": None,
        "dataset_id": None,
        "dataset_jsonl": str(dataset_file),
        "dataset_display_name": None,
        "dataset_builder": None,
        "base_model": None,
        "warm_start_from": "accounts/acct123/models/ft-abc123",
        "output_model": None,
        "n": None,
        "max_tokens": None,
        "learning_rate": None,
        "batch_size": None,
        "epochs": None,
        "lora_rank": None,
        "max_context_length": None,
        "chunk_size": None,
        "eval_auto_carveout": None,
    }
    args = argparse.Namespace(**cli_options)

    exit_code = cr.create_rft_command(args)
    assert exit_code == 1