eval-protocol 0.2.83__tar.gz → 0.2.84__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (447)
  1. {eval_protocol-0.2.83/eval_protocol.egg-info → eval_protocol-0.2.84}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/create_rft.py +96 -29
  4. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/upload.py +22 -67
  5. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/fireworks_rft.py +0 -21
  6. {eval_protocol-0.2.83 → eval_protocol-0.2.84/eval_protocol.egg-info}/PKG-INFO +1 -1
  7. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli_create_rft_infer.py +322 -0
  8. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/LICENSE +0 -0
  9. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/README.md +0 -0
  10. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/__init__.py +0 -0
  11. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/normalize_sandbox_fusion.py +0 -0
  12. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/utils/__init__.py +0 -0
  13. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/utils/generate_api_key.py +0 -0
  14. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/development/utils/subprocess_manager.py +0 -0
  15. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/__init__.py +0 -0
  16. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/__main__.py +0 -0
  17. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/__init__.py +0 -0
  18. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/base.py +0 -0
  19. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/bigquery.py +0 -0
  20. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/braintrust.py +0 -0
  21. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  22. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/huggingface.py +0 -0
  23. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/langchain.py +0 -0
  24. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/langfuse.py +0 -0
  25. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/langsmith.py +0 -0
  26. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/openai_responses.py +0 -0
  27. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/trl.py +0 -0
  28. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/utils.py +0 -0
  29. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/adapters/weave.py +0 -0
  30. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/__init__.py +0 -0
  31. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/models.py +0 -0
  32. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/orchestrator.py +0 -0
  33. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resource_abc.py +0 -0
  34. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resource_pool.py +0 -0
  35. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/__init__.py +0 -0
  36. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  37. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  38. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  39. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  40. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  41. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/docker_resource.py +0 -0
  42. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  43. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  44. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/resources/sql_resource.py +0 -0
  45. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/task_manager.py +0 -0
  46. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/agent/tool_registry.py +0 -0
  47. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/auth.py +0 -0
  48. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/__init__.py +0 -0
  49. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  50. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  51. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_aime25.py +0 -0
  52. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  53. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  54. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  55. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  56. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  57. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli.py +0 -0
  58. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/__init__.py +0 -0
  59. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  60. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/common.py +0 -0
  61. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/deploy.py +0 -0
  62. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  63. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/logs.py +0 -0
  64. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/preview.py +0 -0
  65. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  66. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/common_utils.py +0 -0
  67. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/config.py +0 -0
  68. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/__init__.py +0 -0
  69. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  70. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  71. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  72. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  73. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/data_loader/models.py +0 -0
  74. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/__init__.py +0 -0
  75. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  76. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  77. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  78. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  79. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/datasets/__init__.py +0 -0
  80. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/datasets/loader.py +0 -0
  81. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/directory_utils.py +0 -0
  82. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/evaluation.py +0 -0
  83. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/__init__.py +0 -0
  84. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/event_bus.py +0 -0
  85. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/logger.py +0 -0
  86. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  87. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  88. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/exceptions.py +0 -0
  89. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/execution/__init__.py +0 -0
  90. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/execution/pipeline.py +0 -0
  91. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/gcp_tools.py +0 -0
  92. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generation/cache.py +0 -0
  93. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generation/clients/base.py +0 -0
  94. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generation/clients.py +0 -0
  95. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/generic_server.py +0 -0
  96. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/get_pep440_version.py +0 -0
  97. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/human_id/__init__.py +0 -0
  98. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/human_id/dictionary.py +0 -0
  99. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/__init__.py +0 -0
  100. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/deepeval.py +0 -0
  101. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/openeval.py +0 -0
  102. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/integrations/trl.py +0 -0
  103. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/__init__.py +0 -0
  104. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  105. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  106. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  107. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  108. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/init.py +0 -0
  109. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/rollout_context.py +0 -0
  110. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  111. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/log_utils/util.py +0 -0
  112. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/logging_utils.py +0 -0
  113. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/__init__.py +0 -0
  114. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/adapter.py +0 -0
  115. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/client/__init__.py +0 -0
  116. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/client/connection.py +0 -0
  117. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/clients.py +0 -0
  118. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/__init__.py +0 -0
  119. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/base_policy.py +0 -0
  120. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/manager.py +0 -0
  121. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/execution/policy.py +0 -0
  122. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/grid_renderer.py +0 -0
  123. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  124. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/mcpgym.py +0 -0
  125. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/process_manager.py +0 -0
  126. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/session/__init__.py +0 -0
  127. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/session/manager.py +0 -0
  128. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/simple_process_manager.py +0 -0
  129. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp/simulation_server.py +0 -0
  130. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/__init__.py +0 -0
  131. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/config.py +0 -0
  132. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/main.py +0 -0
  133. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  134. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  135. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  136. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  137. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_env.py +0 -0
  138. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/__init__.py +0 -0
  139. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  140. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  141. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  142. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  143. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  144. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  145. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  146. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  147. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  148. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  149. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  150. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  151. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  152. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  153. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/models.py +0 -0
  154. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/packaging.py +0 -0
  155. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/platform_api.py +0 -0
  156. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/playback_policy.py +0 -0
  157. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/__init__.py +0 -0
  158. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  159. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/app.py +0 -0
  160. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  161. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  162. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  163. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/main.py +0 -0
  164. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/models.py +0 -0
  165. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  166. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/__init__.py +0 -0
  167. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  168. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  169. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  170. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  171. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  172. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  173. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  174. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  175. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  176. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/evaluation_test.py +0 -0
  177. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  178. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  179. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/exception_config.py +0 -0
  180. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/execution.py +0 -0
  181. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  182. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  183. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  184. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/parameterize.py +0 -0
  185. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/plugin.py +0 -0
  186. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  187. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/rollout_processor.py +0 -0
  188. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/store_experiment_link.py +0 -0
  189. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/store_results_url.py +0 -0
  190. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/tracing_utils.py +0 -0
  191. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/types.py +0 -0
  192. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/pytest/validate_signature.py +0 -0
  193. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/__init__.py +0 -0
  194. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  195. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  196. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  197. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  198. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  199. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  200. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  201. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/llm_judge.py +0 -0
  202. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  203. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  204. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  205. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  206. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/quickstart/utils.py +0 -0
  207. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/resources.py +0 -0
  208. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/reward_function.py +0 -0
  209. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/__init__.py +0 -0
  210. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/accuracy.py +0 -0
  211. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/accuracy_length.py +0 -0
  212. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  213. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  214. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/apps_testing_util.py +0 -0
  215. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/bfcl_reward.py +0 -0
  216. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/code_execution.py +0 -0
  217. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/code_execution_utils.py +0 -0
  218. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/cpp_code.py +0 -0
  219. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  220. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/format.py +0 -0
  221. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/function_calling.py +0 -0
  222. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/json_schema.py +0 -0
  223. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/language_consistency.py +0 -0
  224. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/lean_prover.py +0 -0
  225. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/length.py +0 -0
  226. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  227. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/math.py +0 -0
  228. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  229. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/reasoning_steps.py +0 -0
  230. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/repetition.py +0 -0
  231. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rewards/tag_count.py +0 -0
  232. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/rl_processing.py +0 -0
  233. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/server.py +0 -0
  234. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/stats/__init__.py +0 -0
  235. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/stats/confidence_intervals.py +0 -0
  236. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/typed_interface.py +0 -0
  237. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/__init__.py +0 -0
  238. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/errors.py +0 -0
  239. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/remote_rollout_processor.py +0 -0
  240. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/types/types.py +0 -0
  241. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/__init__.py +0 -0
  242. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/batch_evaluation.py +0 -0
  243. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/batch_transformation.py +0 -0
  244. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/browser_utils.py +0 -0
  245. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/check_server_status.py +0 -0
  246. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/dataset_helpers.py +0 -0
  247. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  248. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/logs_models.py +0 -0
  249. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/logs_server.py +0 -0
  250. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/module_loader.py +0 -0
  251. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/packaging_utils.py +0 -0
  252. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/show_results_url.py +0 -0
  253. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/static_policy.py +0 -0
  254. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/subprocess_utils.py +0 -0
  255. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol/utils/vite_server.py +0 -0
  256. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/SOURCES.txt +0 -0
  257. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/dependency_links.txt +0 -0
  258. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/entry_points.txt +0 -0
  259. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/requires.txt +0 -0
  260. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/eval_protocol.egg-info/top_level.txt +0 -0
  261. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/pyproject.toml +0 -0
  262. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/setup.cfg +0 -0
  263. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/setup.py +0 -0
  264. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_accuracy.py +0 -0
  265. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_accuracy_length.py +0 -0
  266. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_adapters_e2e.py +0 -0
  267. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_agent_orchestrator.py +0 -0
  268. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_agent_resources.py +0 -0
  269. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_auth.py +0 -0
  270. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_batch_evaluation.py +0 -0
  271. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli.py +0 -0
  272. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli_agent.py +0 -0
  273. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cli_args.py +0 -0
  274. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_code_execution.py +0 -0
  275. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_config.py +0 -0
  276. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_control_plane_separation.py +0 -0
  277. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_cpp_code.py +0 -0
  278. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_data_driven_task_manager.py +0 -0
  279. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_deepcoder_reward.py +0 -0
  280. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_deepeval_integration.py +0 -0
  281. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_deploy_integration.py +0 -0
  282. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_directory_utils.py +0 -0
  283. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_e2b_integration.py +0 -0
  284. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_e2b_js_integration.py +0 -0
  285. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_edge_cases.py +0 -0
  286. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_ep_upload_e2e.py +0 -0
  287. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_eval_protocol_import.py +0 -0
  288. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation.py +0 -0
  289. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation_integration.py +0 -0
  290. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation_postprocess.py +0 -0
  291. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_evaluation_preview_integration.py +0 -0
  292. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_event_bus.py +0 -0
  293. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_event_bus_helper.py +0 -0
  294. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_examples_end_to_end.py +0 -0
  295. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_exceptions.py +0 -0
  296. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_fireworks_api.py +0 -0
  297. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_format.py +0 -0
  298. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_fractional_code.py +0 -0
  299. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_function_calling.py +0 -0
  300. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_gcp_tools.py +0 -0
  301. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_generic_server.py +0 -0
  302. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_human_id.py +0 -0
  303. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_integration.py +0 -0
  304. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_json_schema.py +0 -0
  305. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_kwargs_validation.py +0 -0
  306. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_language_consistency.py +0 -0
  307. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_lean_prover.py +0 -0
  308. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_lean_prover_runner.py +0 -0
  309. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_length.py +0 -0
  310. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_list_comparison_math_reward.py +0 -0
  311. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_logs_server.py +0 -0
  312. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_logs_server_simple.py +0 -0
  313. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_math.py +0 -0
  314. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_message_field_filtering.py +0 -0
  315. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_minimal.py +0 -0
  316. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_models.py +0 -0
  317. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_models_rl.py +0 -0
  318. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_multiple_choice_math_reward.py +0 -0
  319. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_n_variant_batch_integration.py +0 -0
  320. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_n_variant_integration.py +0 -0
  321. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_openai_compatibility.py +0 -0
  322. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_openeval_integration.py +0 -0
  323. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_packaging.py +0 -0
  324. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_parallel_rollouts.py +0 -0
  325. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_platform_api.py +0 -0
  326. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_quickstart_utils.py +0 -0
  327. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_readiness.py +0 -0
  328. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_reasoning_steps.py +0 -0
  329. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_repetition.py +0 -0
  330. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_repetition_debug.py +0 -0
  331. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_retry_mechanism.py +0 -0
  332. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_reward_function.py +0 -0
  333. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_reward_protocol_import.py +0 -0
  334. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_rl_processing.py +0 -0
  335. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_rollout_control_plane_integration.py +0 -0
  336. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_server.py +0 -0
  337. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_show_results_url.py +0 -0
  338. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_status_migration_changes.py +0 -0
  339. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_status_migration_integration.py +0 -0
  340. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_status_model.py +0 -0
  341. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_tag_count.py +0 -0
  342. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_tau_bench_airline_smoke.py +0 -0
  343. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_typed_interface.py +0 -0
  344. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_typed_interface_rl.py +0 -0
  345. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_upload_entrypoint.py +0 -0
  346. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_url_handling.py +0 -0
  347. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/tests/test_vite_server.py +0 -0
  348. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/__init__.py +0 -0
  349. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/agent/__init__.py +0 -0
  350. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/agent/base.py +0 -0
  351. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/agent/llm_agent.py +0 -0
  352. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/__init__.py +0 -0
  353. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/api_config.py +0 -0
  354. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/data_model.py +0 -0
  355. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/api_service/simulation_service.py +0 -0
  356. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/cli.py +0 -0
  357. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/config.py +0 -0
  358. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/airline/policy.md +0 -0
  359. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/mock/policy.md +0 -0
  360. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  361. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/retail/policy.md +0 -0
  362. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  363. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  364. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  365. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  366. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  367. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  368. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  369. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/__init__.py +0 -0
  370. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/message.py +0 -0
  371. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/simulation.py +0 -0
  372. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/data_model/tasks.py +0 -0
  373. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/__init__.py +0 -0
  374. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/__init__.py +0 -0
  375. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/data_model.py +0 -0
  376. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/environment.py +0 -0
  377. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/tools.py +0 -0
  378. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/airline/utils.py +0 -0
  379. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/__init__.py +0 -0
  380. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/data_model.py +0 -0
  381. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/environment.py +0 -0
  382. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/tools.py +0 -0
  383. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/mock/utils.py +0 -0
  384. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/__init__.py +0 -0
  385. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/data_model.py +0 -0
  386. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/environment.py +0 -0
  387. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/tools.py +0 -0
  388. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/retail/utils.py +0 -0
  389. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/__init__.py +0 -0
  390. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/data_model.py +0 -0
  391. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/environment.py +0 -0
  392. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  393. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  394. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  395. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  396. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  397. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  398. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  399. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  400. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/tools.py +0 -0
  401. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  402. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  403. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/domains/telecom/utils.py +0 -0
  404. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/__init__.py +0 -0
  405. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/db.py +0 -0
  406. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/environment.py +0 -0
  407. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/server.py +0 -0
  408. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/tool.py +0 -0
  409. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/toolkit.py +0 -0
  410. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  411. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/__init__.py +0 -0
  412. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator.py +0 -0
  413. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  414. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  415. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  416. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  417. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  418. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/metrics/__init__.py +0 -0
  419. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/metrics/agent_metrics.py +0 -0
  420. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  421. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/__init__.py +0 -0
  422. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  423. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  424. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/orchestrator/utils.py +0 -0
  425. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/registry.py +0 -0
  426. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/run.py +0 -0
  427. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/__init__.py +0 -0
  428. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/check_data.py +0 -0
  429. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  430. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/start_servers.py +0 -0
  431. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/scripts/view_simulations.py +0 -0
  432. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/user/__init__.py +0 -0
  433. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/user/base.py +0 -0
  434. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/user/user_simulator.py +0 -0
  435. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/__init__.py +0 -0
  436. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/display.py +0 -0
  437. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/io_utils.py +0 -0
  438. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/llm_utils.py +0 -0
  439. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/pydantic_utils.py +0 -0
  440. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vendor/tau2/utils/utils.py +0 -0
  441. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/versioneer.py +0 -0
  442. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  443. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
  444. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
  445. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
  446. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  447. {eval_protocol-0.2.83 → eval_protocol-0.2.84}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.83
3
+ Version: 0.2.84
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-11-09T23:23:12-0800",
11
+ "date": "2025-11-10T00:30:58-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "a533dcb232528e3910d94adb922c6ab7df27bc4e",
15
- "version": "0.2.83"
14
+ "full-revisionid": "2d75acf5944468856d9f1bea787fce63dcabc16f",
15
+ "version": "0.2.84"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -20,6 +20,7 @@ from ..fireworks_rft import (
20
20
  create_dataset_from_jsonl,
21
21
  create_reinforcement_fine_tuning_job,
22
22
  )
23
+ from ..fireworks_rft import detect_dataset_builder, materialize_dataset_via_builder
23
24
  from .upload import _discover_tests, _normalize_evaluator_id, _prompt_select
24
25
 
25
26
 
@@ -250,6 +251,37 @@ def _build_trimmed_dataset_id(evaluator_id: str) -> str:
250
251
  return f"{base}{suffix}"
251
252
 
252
253
 
254
+ def _resolve_selected_test(
255
+ project_root: str,
256
+ evaluator_id: Optional[str],
257
+ selected_tests: Optional[list] = None,
258
+ ) -> tuple[Optional[str], Optional[str]]:
259
+ """
260
+ Resolve a single test's source file path and function name to use downstream.
261
+ Priority:
262
+ 1) If selected_tests provided and length == 1, use it.
263
+ 2) Else discover tests; if exactly one test, use it.
264
+ 3) Else, if evaluator_id provided, match by normalized '<file-stem>-<func-name>'.
265
+ Returns: (file_path, func_name) or (None, None) if unresolved.
266
+ """
267
+ try:
268
+ tests = selected_tests if selected_tests is not None else _discover_tests(project_root)
269
+ if not tests:
270
+ return None, None
271
+ if len(tests) == 1:
272
+ return tests[0].file_path, tests[0].qualname.split(".")[-1]
273
+ if evaluator_id:
274
+ for t in tests:
275
+ func_name = t.qualname.split(".")[-1]
276
+ source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
277
+ candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
278
+ if candidate == evaluator_id:
279
+ return t.file_path, func_name
280
+ return None, None
281
+ except Exception:
282
+ return None, None
283
+
284
+
253
285
  def _poll_evaluator_status(
254
286
  evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
255
287
  ) -> bool:
@@ -316,6 +348,9 @@ def create_rft_command(args) -> int:
316
348
  non_interactive: bool = bool(getattr(args, "yes", False))
317
349
  dry_run: bool = bool(getattr(args, "dry_run", False))
318
350
  force: bool = bool(getattr(args, "force", False))
351
+ # Track the specifically chosen test (if any) to aid dataset inference later
352
+ selected_test_file_path: Optional[str] = None
353
+ selected_test_func_name: Optional[str] = None
319
354
 
320
355
  api_key = get_fireworks_api_key()
321
356
  if not api_key:
@@ -348,12 +383,33 @@ def create_rft_command(args) -> int:
348
383
  print("No tests selected.")
349
384
  return 1
350
385
  if len(selected_tests) != 1:
351
- print("Error: Please select exactly one evaluation test for 'create rft'.")
386
+ if non_interactive and len(selected_tests) > 1:
387
+ print("Error: Multiple evaluation tests found in --yes (non-interactive) mode.")
388
+ print(" Please pass --evaluator-id or --entry to disambiguate.")
389
+ try:
390
+ # Offer candidate evaluator ids for convenience
391
+ tests = _discover_tests(project_root)
392
+ if tests:
393
+ print(" Candidate evaluator ids:")
394
+ for t in tests:
395
+ func = t.qualname.split(".")[-1]
396
+ stem = os.path.splitext(os.path.basename(t.file_path))[0]
397
+ cand = _normalize_evaluator_id(f"{stem}-{func}")
398
+ print(f" - {cand}")
399
+ except Exception:
400
+ pass
401
+ else:
402
+ print("Error: Please select exactly one evaluation test for 'create rft'.")
352
403
  return 1
404
+ # Derive evaluator_id from user's single selection
353
405
  chosen = selected_tests[0]
354
406
  func_name = chosen.qualname.split(".")[-1]
355
407
  source_file_name = os.path.splitext(os.path.basename(chosen.file_path))[0]
356
408
  evaluator_id = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
409
+ # Resolve selected test once for downstream
410
+ selected_test_file_path, selected_test_func_name = _resolve_selected_test(
411
+ project_root, evaluator_id, selected_tests=selected_tests
412
+ )
357
413
  # Resolve evaluator resource name to fully-qualified format required by API
358
414
  evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
359
415
 
@@ -386,6 +442,11 @@ def create_rft_command(args) -> int:
386
442
  print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
387
443
  return 1
388
444
  skip_upload = True
445
+ # Populate selected test info for dataset inference later
446
+ st_path, st_func = _resolve_selected_test(project_root, evaluator_id)
447
+ if st_path and st_func:
448
+ selected_test_file_path = st_path
449
+ selected_test_func_name = st_func
389
450
  except requests.exceptions.RequestException:
390
451
  pass
391
452
 
@@ -396,28 +457,16 @@ def create_rft_command(args) -> int:
396
457
 
397
458
  tests = _discover_tests(project_root)
398
459
  selected_entry: Optional[str] = None
399
- if len(tests) == 1:
400
- func_name = tests[0].qualname.split(".")[-1]
401
- abs_path = os.path.abspath(tests[0].file_path)
460
+ st_path, st_func = _resolve_selected_test(project_root, evaluator_id, selected_tests=tests)
461
+ if st_path and st_func:
462
+ abs_path = os.path.abspath(st_path)
402
463
  try:
403
464
  rel = os.path.relpath(abs_path, project_root)
404
465
  except Exception:
405
466
  rel = abs_path
406
- selected_entry = f"{rel}::{func_name}"
407
- else:
408
- # Try to match evaluator_id to a discovered test's normalized ID
409
- for t in tests:
410
- func_name = t.qualname.split(".")[-1]
411
- source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
412
- candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
413
- if candidate == evaluator_id:
414
- abs_path = os.path.abspath(t.file_path)
415
- try:
416
- rel = os.path.relpath(abs_path, project_root)
417
- except Exception:
418
- rel = abs_path
419
- selected_entry = f"{rel}::{func_name}"
420
- break
467
+ selected_entry = f"{rel}::{st_func}"
468
+ selected_test_file_path = st_path
469
+ selected_test_func_name = st_func
421
470
  # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
422
471
  if selected_entry is None and len(tests) > 1:
423
472
  print(
@@ -480,30 +529,48 @@ def create_rft_command(args) -> int:
480
529
  dataset_builder = getattr(args, "dataset_builder", None) # accepted but unused in simplified flow
481
530
 
482
531
  if not dataset_id:
483
- # Prefer explicit --dataset-jsonl, else attempt to extract from data loader or input_dataset of the single discovered test
532
+ # Prefer explicit --dataset-jsonl, else attempt to extract from the selected test's data loader or input_dataset.
484
533
  if not dataset_jsonl:
485
- tests = _discover_tests(project_root)
486
- if len(tests) == 1:
487
- func_name = tests[0].qualname.split(".")[-1]
488
- # Try data_loaders first (existing behavior)
489
- dataset_jsonl = _extract_jsonl_from_dataloader(tests[0].file_path, func_name)
534
+ # Use specifically selected test if available; else only infer when exactly one test exists
535
+ test_file_for_infer = None
536
+ func_for_infer = None
537
+ if selected_test_file_path and selected_test_func_name:
538
+ test_file_for_infer = selected_test_file_path
539
+ func_for_infer = selected_test_func_name
540
+ else:
541
+ tests = _discover_tests(project_root)
542
+ if len(tests) == 1:
543
+ test_file_for_infer = tests[0].file_path
544
+ func_for_infer = tests[0].qualname.split(".")[-1]
545
+ if test_file_for_infer and func_for_infer:
546
+ # Try data_loaders first
547
+ dataset_jsonl = _extract_jsonl_from_dataloader(test_file_for_infer, func_for_infer)
490
548
  if dataset_jsonl:
491
- # Display relative path for readability
492
549
  try:
493
550
  rel = os.path.relpath(dataset_jsonl, project_root)
494
551
  except Exception:
495
552
  rel = dataset_jsonl
496
553
  print(f"✓ Using JSONL from data loader: {rel}")
497
- else:
554
+ if not dataset_jsonl:
498
555
  # Fall back to input_dataset (dataset_path)
499
- dataset_jsonl = _extract_jsonl_from_input_dataset(tests[0].file_path, func_name)
556
+ dataset_jsonl = _extract_jsonl_from_input_dataset(test_file_for_infer, func_for_infer)
500
557
  if dataset_jsonl:
501
- # Display relative path for readability
502
558
  try:
503
559
  rel = os.path.relpath(dataset_jsonl, project_root)
504
560
  except Exception:
505
561
  rel = dataset_jsonl
506
562
  print(f"✓ Using JSONL from input_dataset: {rel}")
563
+ if not dataset_jsonl:
564
+ # Last resort: attempt to detect and run a dataset builder in the test's directory
565
+ metric_dir = os.path.dirname(test_file_for_infer)
566
+ builder_spec = detect_dataset_builder(metric_dir)
567
+ if builder_spec:
568
+ try:
569
+ tmp_jsonl, count = materialize_dataset_via_builder(builder_spec)
570
+ dataset_jsonl = tmp_jsonl
571
+ print(f"✓ Materialized {count} rows via dataset builder: {builder_spec}")
572
+ except Exception as e:
573
+ print(f"Warning: dataset builder failed: {e}")
507
574
  if not dataset_jsonl:
508
575
  print(
509
576
  "Error: Could not determine dataset. Provide --dataset-id or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
@@ -21,7 +21,6 @@ from eval_protocol.auth import (
21
21
  from eval_protocol.platform_api import create_or_update_fireworks_secret
22
22
 
23
23
  from eval_protocol.evaluation import create_evaluation
24
- from eval_protocol.fireworks_rft import save_evaluator_trace, detect_dataset_builder
25
24
 
26
25
 
27
26
  @dataclass
@@ -444,49 +443,25 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe
444
443
  else:
445
444
  return []
446
445
 
447
- # Enter-only selection UX with optional multi-select via repeat
448
- remaining_indices = list(range(len(tests)))
449
- selected_indices: list[int] = []
450
-
446
+ # Single-select UX
451
447
  print("\n")
452
- print("Tip: Use ↑/↓ arrows to navigate and press ENTER to select.")
453
- print(" After selecting one, you can choose to add more.\n")
454
-
455
- while remaining_indices:
456
- # Build choices from remaining
457
- choices = []
458
- for idx, test_idx in enumerate(remaining_indices, 1):
459
- t = tests[test_idx]
460
- choice_text = _format_test_choice(t, idx)
461
- choices.append({"name": choice_text, "value": test_idx})
462
-
463
- selected = questionary.select(
464
- "Select an evaluation test to upload:", choices=choices, style=custom_style
465
- ).ask()
466
-
467
- if selected is None: # Ctrl+C
468
- print("\nUpload cancelled.")
469
- return []
448
+ print("Tip: Use ↑/↓ arrows to navigate and press ENTER to select.\n")
470
449
 
471
- if isinstance(selected, int):
472
- selected_indices.append(selected)
473
- # Remove from remaining
474
- if selected in remaining_indices:
475
- remaining_indices.remove(selected)
450
+ choices = []
451
+ for idx, t in enumerate(tests, 1):
452
+ choice_text = _format_test_choice(t, idx)
453
+ choices.append({"name": choice_text, "value": idx - 1})
476
454
 
477
- # Ask whether to add another (ENTER to finish)
478
- add_more = questionary.confirm("Add another?", default=False, style=custom_style).ask()
479
- if not add_more:
480
- break
481
- else:
482
- break
455
+ selected = questionary.select(
456
+ "Select an evaluation test to upload:", choices=choices, style=custom_style
457
+ ).ask()
483
458
 
484
- if not selected_indices:
485
- print("\n⚠️ No tests were selected.")
459
+ if selected is None: # Ctrl+C
460
+ print("\nUpload cancelled.")
486
461
  return []
487
462
 
488
- print(f"\n✓ Selected {len(selected_indices)} test(s)")
489
- return [tests[i] for i in selected_indices]
463
+ print("\n✓ Selected 1 test")
464
+ return [tests[selected]]
490
465
 
491
466
  except ImportError:
492
467
  # Fallback to simpler implementation
@@ -525,22 +500,19 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]
525
500
 
526
501
  print("=" * 80)
527
502
  try:
528
- choice = input("Enter numbers to upload (comma or space-separated), or 'all': ").strip()
503
+ choice = input("Enter the number to upload: ").strip()
529
504
  except KeyboardInterrupt:
530
505
  print("\n\nUpload cancelled.")
531
506
  return []
532
507
 
533
- if choice.lower() in ("all", "a", "*"):
534
- return tests
535
-
536
- indices: list[int] = []
537
- for token in re.split(r"[\s,]+", choice):
538
- if token.isdigit():
539
- n = int(token)
540
- if 1 <= n <= len(tests):
541
- indices.append(n - 1)
542
- indices = sorted(set(indices))
543
- return [tests[i] for i in indices]
508
+ if not choice.isdigit():
509
+ print("\n⚠️ Invalid selection.")
510
+ return []
511
+ n = int(choice)
512
+ if not (1 <= n <= len(tests)):
513
+ print("\n⚠️ Selection out of range.")
514
+ return []
515
+ return [tests[n - 1]]
544
516
 
545
517
 
546
518
  def _prompt_select(tests: list[DiscoveredTest], non_interactive: bool) -> list[DiscoveredTest]:
@@ -718,23 +690,6 @@ def upload_command(args: argparse.Namespace) -> int:
718
690
  )
719
691
  name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
720
692
 
721
- # Persist local evaluator trace for later `create rft`
722
- try:
723
- metric_dir = os.path.dirname(source_file_path) if source_file_path else root
724
- builder_spec = detect_dataset_builder(metric_dir) or None
725
- trace_payload = {
726
- "evaluator_id": evaluator_id,
727
- "evaluator_resource_name": name,
728
- "entry_point": entry_point,
729
- "metric_dir": metric_dir,
730
- "project_root": root,
731
- "dataset_builder": builder_spec,
732
- }
733
- save_evaluator_trace(project_root=root, evaluator_id=evaluator_id, trace=trace_payload)
734
- except Exception:
735
- # Non-fatal; continue
736
- pass
737
-
738
693
  # Print success message with Fireworks dashboard link
739
694
  print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
740
695
  print("📊 View in Fireworks Dashboard:")
@@ -37,25 +37,6 @@ def _map_api_host_to_app_host(api_base: str) -> str:
37
37
  return "https://app.fireworks.ai"
38
38
 
39
39
 
40
- def load_evaluator_trace(project_root: str, evaluator_id: str) -> Optional[Dict[str, Any]]:
41
- trace_path = Path(project_root) / ".eval_protocol" / "evaluators" / f"{evaluator_id}.json"
42
- if not trace_path.exists():
43
- return None
44
- try:
45
- with open(trace_path, "r", encoding="utf-8") as f:
46
- return json.load(f)
47
- except Exception:
48
- return None
49
-
50
-
51
- def save_evaluator_trace(project_root: str, evaluator_id: str, trace: Dict[str, Any]) -> None:
52
- base_dir = Path(project_root) / ".eval_protocol" / "evaluators"
53
- base_dir.mkdir(parents=True, exist_ok=True)
54
- trace_path = base_dir / f"{evaluator_id}.json"
55
- with open(trace_path, "w", encoding="utf-8") as f:
56
- json.dump(trace, f, indent=2, ensure_ascii=False)
57
-
58
-
59
40
  def detect_dataset_builder(metric_dir: str) -> Optional[str]:
60
41
  """
61
42
  Best-effort scan for a dataset builder callable inside the metric directory.
@@ -228,8 +209,6 @@ def build_default_output_model(evaluator_id: str) -> str:
228
209
 
229
210
 
230
211
  __all__ = [
231
- "load_evaluator_trace",
232
- "save_evaluator_trace",
233
212
  "detect_dataset_builder",
234
213
  "materialize_dataset_via_builder",
235
214
  "create_dataset_from_jsonl",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.83
3
+ Version: 0.2.84
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT