eval-protocol 0.3.10.dev1__tar.gz → 0.3.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (471)
  1. {eval_protocol-0.3.10.dev1/eval_protocol.egg-info → eval_protocol-0.3.11}/PKG-INFO +2 -2
  2. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/auth.py +1 -29
  4. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli.py +6 -8
  5. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/create_rft.py +100 -66
  6. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/upload.py +3 -3
  7. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/evaluation.py +32 -53
  8. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/platform_api.py +27 -17
  9. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_agent_rollout_processor.py +5 -1
  10. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +27 -21
  11. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +11 -7
  12. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_single_turn_rollout_process.py +12 -11
  13. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/evaluation_test.py +0 -3
  14. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/evaluation_test_utils.py +0 -19
  15. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/github_action_rollout_processor.py +7 -0
  16. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/openenv_rollout_processor.py +10 -6
  17. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/remote_rollout_processor.py +7 -0
  18. eval_protocol-0.3.11/eval_protocol/pytest/utils.py +24 -0
  19. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11/eval_protocol.egg-info}/PKG-INFO +2 -2
  20. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/SOURCES.txt +1 -2
  21. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/requires.txt +1 -1
  22. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/pyproject.toml +1 -1
  23. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_create_rft.py +61 -17
  24. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_ep_upload_e2e.py +140 -51
  25. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_evaluation.py +7 -22
  26. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_upload_entrypoint.py +12 -10
  27. eval_protocol-0.3.10.dev1/eval_protocol/fireworks_client.py +0 -132
  28. eval_protocol-0.3.10.dev1/tests/test_fireworks_client.py +0 -143
  29. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/LICENSE +0 -0
  30. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/README.md +0 -0
  31. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/__init__.py +0 -0
  32. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/normalize_sandbox_fusion.py +0 -0
  33. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/utils/__init__.py +0 -0
  34. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/utils/generate_api_key.py +0 -0
  35. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/development/utils/subprocess_manager.py +0 -0
  36. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/__init__.py +0 -0
  37. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/__main__.py +0 -0
  38. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/__init__.py +0 -0
  39. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/base.py +0 -0
  40. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/bigquery.py +0 -0
  41. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/braintrust.py +0 -0
  42. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/dataframe.py +0 -0
  43. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  44. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/huggingface.py +0 -0
  45. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/langchain.py +0 -0
  46. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/langfuse.py +0 -0
  47. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/langsmith.py +0 -0
  48. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/openai_responses.py +0 -0
  49. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/trl.py +0 -0
  50. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/utils.py +0 -0
  51. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/adapters/weave.py +0 -0
  52. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/__init__.py +0 -0
  53. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/models.py +0 -0
  54. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/orchestrator.py +0 -0
  55. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resource_abc.py +0 -0
  56. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resource_pool.py +0 -0
  57. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/__init__.py +0 -0
  58. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  59. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  60. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  61. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  62. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  63. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/docker_resource.py +0 -0
  64. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  65. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  66. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/resources/sql_resource.py +0 -0
  67. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/task_manager.py +0 -0
  68. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/agent/tool_registry.py +0 -0
  69. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/__init__.py +0 -0
  70. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  71. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  72. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_aime25.py +0 -0
  73. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  74. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  75. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  76. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  77. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  78. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  79. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/__init__.py +0 -0
  80. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  81. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/common.py +0 -0
  82. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/export_docs.py +0 -0
  83. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/local_test.py +0 -0
  84. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/logs.py +0 -0
  85. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  86. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/cli_commands/utils.py +0 -0
  87. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/common_utils.py +0 -0
  88. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/config.py +0 -0
  89. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/__init__.py +0 -0
  90. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  91. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  92. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  93. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  94. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/data_loader/models.py +0 -0
  95. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/__init__.py +0 -0
  96. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  97. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  98. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  99. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  100. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/datasets/__init__.py +0 -0
  101. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/datasets/loader.py +0 -0
  102. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/directory_utils.py +0 -0
  103. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/__init__.py +0 -0
  104. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/event_bus.py +0 -0
  105. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/logger.py +0 -0
  106. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  107. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  108. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/exceptions.py +0 -0
  109. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/execution/__init__.py +0 -0
  110. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/execution/pipeline.py +0 -0
  111. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/fireworks_rft.py +0 -0
  112. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/gcp_tools.py +0 -0
  113. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generation/cache.py +0 -0
  114. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generation/clients/base.py +0 -0
  115. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generation/clients.py +0 -0
  116. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/generic_server.py +0 -0
  117. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/get_pep440_version.py +0 -0
  118. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/human_id/__init__.py +0 -0
  119. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/human_id/dictionary.py +0 -0
  120. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/__init__.py +0 -0
  121. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/deepeval.py +0 -0
  122. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/openai_rft.py +0 -0
  123. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/openeval.py +0 -0
  124. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  125. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  126. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/integrations/trl.py +0 -0
  127. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/__init__.py +0 -0
  128. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  129. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  130. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  131. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  132. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/init.py +0 -0
  133. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/rollout_context.py +0 -0
  134. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  135. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/log_utils/util.py +0 -0
  136. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/logging_utils.py +0 -0
  137. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/__init__.py +0 -0
  138. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/adapter.py +0 -0
  139. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/client/__init__.py +0 -0
  140. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/client/connection.py +0 -0
  141. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/clients.py +0 -0
  142. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/__init__.py +0 -0
  143. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/base_policy.py +0 -0
  144. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/manager.py +0 -0
  145. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/policy.py +0 -0
  146. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  147. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/grid_renderer.py +0 -0
  148. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  149. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/mcpgym.py +0 -0
  150. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/process_manager.py +0 -0
  151. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/session/__init__.py +0 -0
  152. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/session/manager.py +0 -0
  153. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/simple_process_manager.py +0 -0
  154. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp/simulation_server.py +0 -0
  155. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/__init__.py +0 -0
  156. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/config.py +0 -0
  157. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/main.py +0 -0
  158. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  159. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  160. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  161. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  162. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_env.py +0 -0
  163. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/__init__.py +0 -0
  164. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  165. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  166. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  167. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  168. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  169. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  170. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  171. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  172. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  173. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  174. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  175. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  176. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  177. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  178. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/models.py +0 -0
  179. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/packaging.py +0 -0
  180. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/playback_policy.py +0 -0
  181. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/__init__.py +0 -0
  182. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  183. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/app.py +0 -0
  184. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  185. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  186. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  187. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/main.py +0 -0
  188. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/models.py +0 -0
  189. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  190. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/__init__.py +0 -0
  191. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/buffer.py +0 -0
  192. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  193. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  194. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  195. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  196. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  197. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  198. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  199. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/exception_config.py +0 -0
  200. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/execution.py +0 -0
  201. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  202. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  203. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  204. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/parameterize.py +0 -0
  205. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/plugin.py +0 -0
  206. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/priority_scheduler.py +0 -0
  207. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/rollout_processor.py +0 -0
  208. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  209. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/store_experiment_link.py +0 -0
  210. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/store_results_url.py +0 -0
  211. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/tracing_utils.py +0 -0
  212. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/types.py +0 -0
  213. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/pytest/validate_signature.py +0 -0
  214. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/__init__.py +0 -0
  215. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  216. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  217. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  218. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  219. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  220. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  221. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  222. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/llm_judge.py +0 -0
  223. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  224. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  225. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  226. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  227. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/quickstart/utils.py +0 -0
  228. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/resources.py +0 -0
  229. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/reward_function.py +0 -0
  230. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/__init__.py +0 -0
  231. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/accuracy.py +0 -0
  232. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/accuracy_length.py +0 -0
  233. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  234. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  235. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/apps_testing_util.py +0 -0
  236. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/bfcl_reward.py +0 -0
  237. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/code_execution.py +0 -0
  238. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/code_execution_utils.py +0 -0
  239. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/cpp_code.py +0 -0
  240. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  241. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/format.py +0 -0
  242. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/function_calling.py +0 -0
  243. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/json_schema.py +0 -0
  244. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/language_consistency.py +0 -0
  245. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/lean_prover.py +0 -0
  246. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/length.py +0 -0
  247. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  248. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/math.py +0 -0
  249. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  250. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/reasoning_steps.py +0 -0
  251. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/repetition.py +0 -0
  252. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rewards/tag_count.py +0 -0
  253. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/rl_processing.py +0 -0
  254. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/server.py +0 -0
  255. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/stats/__init__.py +0 -0
  256. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/stats/confidence_intervals.py +0 -0
  257. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/__init__.py +0 -0
  258. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/gepa_trainer.py +0 -0
  259. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/gepa_utils.py +0 -0
  260. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/trainer.py +0 -0
  261. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/training/utils.py +0 -0
  262. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/typed_interface.py +0 -0
  263. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/__init__.py +0 -0
  264. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/errors.py +0 -0
  265. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/remote_rollout_processor.py +0 -0
  266. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/types/types.py +0 -0
  267. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/__init__.py +0 -0
  268. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/batch_evaluation.py +0 -0
  269. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/batch_transformation.py +0 -0
  270. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/browser_utils.py +0 -0
  271. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/check_server_status.py +0 -0
  272. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/dataset_helpers.py +0 -0
  273. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  274. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/logs_models.py +0 -0
  275. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/logs_server.py +0 -0
  276. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/module_loader.py +0 -0
  277. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/packaging_utils.py +0 -0
  278. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/show_results_url.py +0 -0
  279. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/static_policy.py +0 -0
  280. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/subprocess_utils.py +0 -0
  281. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol/utils/vite_server.py +0 -0
  282. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/dependency_links.txt +0 -0
  283. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/entry_points.txt +0 -0
  284. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/eval_protocol.egg-info/top_level.txt +0 -0
  285. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/setup.cfg +0 -0
  286. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/setup.py +0 -0
  287. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_accuracy.py +0 -0
  288. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_accuracy_length.py +0 -0
  289. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_adapters_e2e.py +0 -0
  290. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_agent_orchestrator.py +0 -0
  291. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_agent_resources.py +0 -0
  292. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_auth.py +0 -0
  293. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_batch_evaluation.py +0 -0
  294. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_agent.py +0 -0
  295. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_args.py +0 -0
  296. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cli_local_test.py +0 -0
  297. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_code_execution.py +0 -0
  298. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_config.py +0 -0
  299. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_control_plane_separation.py +0 -0
  300. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_cpp_code.py +0 -0
  301. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_data_driven_task_manager.py +0 -0
  302. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_deepcoder_reward.py +0 -0
  303. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_deepeval_integration.py +0 -0
  304. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_directory_utils.py +0 -0
  305. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_e2b_integration.py +0 -0
  306. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_e2b_js_integration.py +0 -0
  307. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_edge_cases.py +0 -0
  308. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_eval_protocol_import.py +0 -0
  309. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_evaluation_postprocess.py +0 -0
  310. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_event_bus.py +0 -0
  311. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_event_bus_helper.py +0 -0
  312. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_examples_end_to_end.py +0 -0
  313. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_exception_config.py +0 -0
  314. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_exceptions.py +0 -0
  315. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_fireworks_api.py +0 -0
  316. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_format.py +0 -0
  317. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_fractional_code.py +0 -0
  318. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_function_calling.py +0 -0
  319. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_gcp_tools.py +0 -0
  320. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_generic_server.py +0 -0
  321. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_human_id.py +0 -0
  322. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_integration.py +0 -0
  323. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_json_schema.py +0 -0
  324. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_kwargs_validation.py +0 -0
  325. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_language_consistency.py +0 -0
  326. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_lean_prover.py +0 -0
  327. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_lean_prover_runner.py +0 -0
  328. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_length.py +0 -0
  329. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_list_comparison_math_reward.py +0 -0
  330. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_litellm_policy_provider_fields.py +0 -0
  331. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_logs_server.py +0 -0
  332. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_logs_server_simple.py +0 -0
  333. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_math.py +0 -0
  334. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_message_field_filtering.py +0 -0
  335. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_minimal.py +0 -0
  336. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_models.py +0 -0
  337. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_models_rl.py +0 -0
  338. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_multiple_choice_math_reward.py +0 -0
  339. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_n_variant_batch_integration.py +0 -0
  340. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_n_variant_integration.py +0 -0
  341. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_openai_compatibility.py +0 -0
  342. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_openai_rft_integration.py +0 -0
  343. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_openeval_integration.py +0 -0
  344. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_packaging.py +0 -0
  345. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_parallel_rollouts.py +0 -0
  346. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_platform_api.py +0 -0
  347. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_priority_scheduler.py +0 -0
  348. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_quickstart_utils.py +0 -0
  349. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_readiness.py +0 -0
  350. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_reasoning_steps.py +0 -0
  351. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_repetition.py +0 -0
  352. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_repetition_debug.py +0 -0
  353. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_retry_mechanism.py +0 -0
  354. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_reward_function.py +0 -0
  355. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_reward_protocol_import.py +0 -0
  356. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_rl_processing.py +0 -0
  357. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_rollout_control_plane_integration.py +0 -0
  358. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_rollout_logprobs.py +0 -0
  359. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_server.py +0 -0
  360. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_show_results_url.py +0 -0
  361. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_sqlite_hardening.py +0 -0
  362. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_status_migration_changes.py +0 -0
  363. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_status_migration_integration.py +0 -0
  364. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_status_model.py +0 -0
  365. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_tag_count.py +0 -0
  366. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_tau_bench_airline_smoke.py +0 -0
  367. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_training_utils.py +0 -0
  368. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_typed_interface.py +0 -0
  369. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_typed_interface_rl.py +0 -0
  370. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_url_handling.py +0 -0
  371. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/tests/test_vite_server.py +0 -0
  372. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/__init__.py +0 -0
  373. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/agent/__init__.py +0 -0
  374. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/agent/base.py +0 -0
  375. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/agent/llm_agent.py +0 -0
  376. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/__init__.py +0 -0
  377. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/api_config.py +0 -0
  378. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/data_model.py +0 -0
  379. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/api_service/simulation_service.py +0 -0
  380. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/cli.py +0 -0
  381. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/config.py +0 -0
  382. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/airline/policy.md +0 -0
  383. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/mock/policy.md +0 -0
  384. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  385. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/retail/policy.md +0 -0
  386. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  387. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  388. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  389. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  390. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  391. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  392. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  393. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/__init__.py +0 -0
  394. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/message.py +0 -0
  395. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/simulation.py +0 -0
  396. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/data_model/tasks.py +0 -0
  397. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/__init__.py +0 -0
  398. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/__init__.py +0 -0
  399. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/data_model.py +0 -0
  400. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/environment.py +0 -0
  401. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/tools.py +0 -0
  402. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/airline/utils.py +0 -0
  403. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/__init__.py +0 -0
  404. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/data_model.py +0 -0
  405. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/environment.py +0 -0
  406. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/tools.py +0 -0
  407. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/mock/utils.py +0 -0
  408. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/__init__.py +0 -0
  409. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/data_model.py +0 -0
  410. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/environment.py +0 -0
  411. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/tools.py +0 -0
  412. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/retail/utils.py +0 -0
  413. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/__init__.py +0 -0
  414. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/data_model.py +0 -0
  415. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/environment.py +0 -0
  416. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  417. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  418. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  419. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  420. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  421. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  422. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  423. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  424. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/tools.py +0 -0
  425. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  426. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  427. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/domains/telecom/utils.py +0 -0
  428. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/__init__.py +0 -0
  429. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/db.py +0 -0
  430. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/environment.py +0 -0
  431. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/server.py +0 -0
  432. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/tool.py +0 -0
  433. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/toolkit.py +0 -0
  434. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  435. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/__init__.py +0 -0
  436. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator.py +0 -0
  437. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  438. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  439. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  440. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  441. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  442. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/metrics/__init__.py +0 -0
  443. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/metrics/agent_metrics.py +0 -0
  444. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  445. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/__init__.py +0 -0
  446. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  447. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  448. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/orchestrator/utils.py +0 -0
  449. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/registry.py +0 -0
  450. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/run.py +0 -0
  451. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/__init__.py +0 -0
  452. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/check_data.py +0 -0
  453. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  454. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/start_servers.py +0 -0
  455. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/scripts/view_simulations.py +0 -0
  456. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/user/__init__.py +0 -0
  457. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/user/base.py +0 -0
  458. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/user/user_simulator.py +0 -0
  459. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/__init__.py +0 -0
  460. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/display.py +0 -0
  461. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/io_utils.py +0 -0
  462. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/llm_utils.py +0 -0
  463. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/pydantic_utils.py +0 -0
  464. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vendor/tau2/utils/utils.py +0 -0
  465. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/versioneer.py +0 -0
  466. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  467. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/index-10cZ11iB.js +0 -0
  468. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/index-10cZ11iB.js.map +0 -0
  469. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/index-DOD73Wyg.css +0 -0
  470. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  471. {eval_protocol-0.3.10.dev1 → eval_protocol-0.3.11}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.10.dev1
3
+ Version: 0.3.11
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -29,7 +29,7 @@ Requires-Dist: pytest>=6.0.0
29
29
  Requires-Dist: pytest-asyncio>=0.21.0
30
30
  Requires-Dist: peewee>=3.18.2
31
31
  Requires-Dist: backoff>=2.2.0
32
- Requires-Dist: fireworks-ai==1.0.0a22
32
+ Requires-Dist: fireworks-ai==1.0.0a20
33
33
  Requires-Dist: questionary>=2.0.0
34
34
  Requires-Dist: toml>=0.10.0
35
35
  Requires-Dist: loguru>=0.6.0
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-01-13T15:54:22-0800",
11
+ "date": "2026-01-13T17:18:11-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "3314becfcdf35f771c41988a24f38dcb91593203",
15
- "version": "0.3.10.dev.1"
14
+ "full-revisionid": "6702c557e88f2d256fd820770e6ab6b32db72701",
15
+ "version": "0.3.11"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -3,30 +3,9 @@ import os
3
3
  from typing import Optional
4
4
 
5
5
  import requests
6
- from dotenv import find_dotenv, load_dotenv
7
6
 
8
7
  logger = logging.getLogger(__name__)
9
8
 
10
- # --- Load .env files ---
11
- # Attempt to load .env.dev first, then .env as a fallback.
12
- # This happens when the module is imported.
13
- # We use override=False (default) so that existing environment variables
14
- # (e.g., set in the shell) are NOT overridden by .env files.
15
- _ENV_DEV_PATH = find_dotenv(filename=".env.dev", raise_error_if_not_found=False, usecwd=True)
16
- if _ENV_DEV_PATH:
17
- load_dotenv(dotenv_path=_ENV_DEV_PATH, override=False)
18
- logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_DEV_PATH}")
19
- else:
20
- _ENV_PATH = find_dotenv(filename=".env", raise_error_if_not_found=False, usecwd=True)
21
- if _ENV_PATH:
22
- load_dotenv(dotenv_path=_ENV_PATH, override=False)
23
- logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_PATH}")
24
- else:
25
- logger.debug(
26
- "eval_protocol.auth: No .env.dev or .env file found. Relying on shell/existing environment variables."
27
- )
28
- # --- End .env loading ---
29
-
30
9
 
31
10
  def get_fireworks_api_key() -> Optional[str]:
32
11
  """
@@ -94,8 +73,6 @@ def verify_api_key_and_get_account_id(
94
73
  Args:
95
74
  api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
96
75
  api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
97
- If api_base is api.fireworks.ai, it is used directly. Otherwise, defaults to
98
- dev.api.fireworks.ai for the verification call.
99
76
 
100
77
  Returns:
101
78
  The resolved account id if verification succeeds and the header is present; otherwise None.
@@ -104,12 +81,7 @@ def verify_api_key_and_get_account_id(
104
81
  resolved_key = api_key or get_fireworks_api_key()
105
82
  if not resolved_key:
106
83
  return None
107
- provided_base = api_base or get_fireworks_api_base()
108
- # Use api.fireworks.ai if explicitly provided, otherwise fall back to dev
109
- if "api.fireworks.ai" in provided_base:
110
- resolved_base = provided_base
111
- else:
112
- resolved_base = "https://dev.api.fireworks.ai"
84
+ resolved_base = api_base or get_fireworks_api_base()
113
85
 
114
86
  from .common_utils import get_user_agent
115
87
 
@@ -81,12 +81,13 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
81
81
  "--env-file",
82
82
  help="Path to .env file containing secrets to upload (default: .env in current directory)",
83
83
  )
84
+ upload_parser.add_argument(
85
+ "--force",
86
+ action="store_true",
87
+ help="Overwrite existing evaluator with the same ID",
88
+ )
84
89
 
85
90
  # Auto-generate flags from SDK Fireworks().evaluators.create() signature
86
- # Note: We use Fireworks() directly here instead of create_fireworks_client()
87
- # because we only need the method signature for introspection, not a fully
88
- # authenticated client. create_fireworks_client() would trigger an HTTP request
89
- # to verify the API key, causing delays even for --help invocations.
90
91
  create_evaluator_fn = Fireworks().evaluators.create
91
92
 
92
93
  upload_skip_fields = {
@@ -136,6 +137,7 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
136
137
 
137
138
  rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
138
139
  rft_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
140
+ rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
139
141
  rft_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
140
142
  rft_parser.add_argument(
141
143
  "--ignore-docker",
@@ -196,10 +198,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
196
198
  "loss_config.method": "RL loss method for underlying trainers. One of {grpo,dapo}.",
197
199
  }
198
200
 
199
- # Note: We use Fireworks() directly here instead of create_fireworks_client()
200
- # because we only need the method signature for introspection, not a fully
201
- # authenticated client. create_fireworks_client() would trigger an HTTP request
202
- # to verify the API key, causing delays even for --help invocations.
203
201
  create_rft_job_fn = Fireworks().reinforcement_fine_tuning_jobs.create
204
202
 
205
203
  add_args_from_callable_signature(
@@ -7,18 +7,19 @@ import sys
7
7
  import time
8
8
  from typing import Any, Callable, Dict, Optional
9
9
  import inspect
10
+ import requests
10
11
  import tempfile
11
12
  from pydantic import ValidationError
12
13
 
13
14
  from ..auth import get_fireworks_api_base, get_fireworks_api_key
14
- from ..fireworks_client import create_fireworks_client
15
- from ..common_utils import load_jsonl
15
+ from ..common_utils import get_user_agent, load_jsonl
16
16
  from ..fireworks_rft import (
17
17
  create_dataset_from_jsonl,
18
18
  detect_dataset_builder,
19
19
  materialize_dataset_via_builder,
20
20
  )
21
21
  from ..models import EvaluationRow
22
+ from .upload import upload_command
22
23
  from .utils import (
23
24
  _build_entry_point,
24
25
  _build_trimmed_dataset_id,
@@ -34,6 +35,8 @@ from .utils import (
34
35
  )
35
36
  from .local_test import run_evaluator_test
36
37
 
38
+ from fireworks import Fireworks
39
+
37
40
 
38
41
  def _extract_dataset_adapter(
39
42
  test_file_path: str, test_func_name: str
@@ -220,68 +223,64 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
220
223
  return None
221
224
 
222
225
 
223
- def _poll_evaluator_version_status(
224
- evaluator_id: str,
225
- version_id: str,
226
- api_key: str,
227
- api_base: str,
228
- timeout_minutes: int = 10,
226
+ def _poll_evaluator_status(
227
+ evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
229
228
  ) -> bool:
230
229
  """
231
- Poll a specific evaluator version status until it becomes ACTIVE or times out.
232
-
233
- Uses the Fireworks SDK to get the specified version of the evaluator and checks
234
- its build state.
230
+ Poll evaluator status until it becomes ACTIVE or times out.
235
231
 
236
232
  Args:
237
- evaluator_id: The evaluator ID (not full resource name)
238
- version_id: The specific version ID to poll
233
+ evaluator_resource_name: Full evaluator resource name (e.g., accounts/xxx/evaluators/yyy)
239
234
  api_key: Fireworks API key
240
235
  api_base: Fireworks API base URL
241
236
  timeout_minutes: Maximum time to wait in minutes
242
237
 
243
238
  Returns:
244
- True if evaluator version becomes ACTIVE, False if timeout or BUILD_FAILED
239
+ True if evaluator becomes ACTIVE, False if timeout or BUILD_FAILED
245
240
  """
241
+ headers = {
242
+ "Authorization": f"Bearer {api_key}",
243
+ "Content-Type": "application/json",
244
+ "User-Agent": get_user_agent(),
245
+ }
246
+
247
+ check_url = f"{api_base}/v1/{evaluator_resource_name}"
246
248
  timeout_seconds = timeout_minutes * 60
247
249
  poll_interval = 10 # seconds
248
250
  start_time = time.time()
249
251
 
250
- print(
251
- f"Polling evaluator version '{version_id}' status (timeout: {timeout_minutes}m, interval: {poll_interval}s)..."
252
- )
253
-
254
- client = create_fireworks_client(api_key=api_key, base_url=api_base)
252
+ print(f"Polling evaluator status (timeout: {timeout_minutes}m, interval: {poll_interval}s)...")
255
253
 
256
254
  while time.time() - start_time < timeout_seconds:
257
255
  try:
258
- version = client.evaluator_versions.get(version_id, evaluator_id=evaluator_id)
259
- state = version.state or "STATE_UNSPECIFIED"
260
- status_msg = ""
261
- if version.status and version.status.message:
262
- status_msg = version.status.message
256
+ response = requests.get(check_url, headers=headers, timeout=30)
257
+ response.raise_for_status()
258
+
259
+ evaluator_data = response.json()
260
+ state = evaluator_data.get("state", "STATE_UNSPECIFIED")
261
+ status = evaluator_data.get("status", "")
263
262
 
264
263
  if state == "ACTIVE":
265
- print("✅ Evaluator version is ACTIVE and ready!")
264
+ print("✅ Evaluator is ACTIVE and ready!")
266
265
  return True
267
266
  elif state == "BUILD_FAILED":
268
- print(f"❌ Evaluator version build failed. Status: {status_msg}")
267
+ print(f"❌ Evaluator build failed. Status: {status}")
269
268
  return False
270
269
  elif state == "BUILDING":
271
270
  elapsed_minutes = (time.time() - start_time) / 60
272
- print(f"⏳ Evaluator version is still building... ({elapsed_minutes:.1f}m elapsed)")
271
+ print(f"⏳ Evaluator is still building... ({elapsed_minutes:.1f}m elapsed)")
273
272
  else:
274
- print(f"⏳ Evaluator version state: {state}, status: {status_msg}")
273
+ print(f"⏳ Evaluator state: {state}, status: {status}")
275
274
 
276
- except Exception as e:
277
- print(f"Warning: Failed to check evaluator version status: {e}")
275
+ except requests.exceptions.RequestException as e:
276
+ print(f"Warning: Failed to check evaluator status: {e}")
278
277
 
279
278
  # Wait before next poll
280
279
  time.sleep(poll_interval)
281
280
 
282
281
  # Timeout reached
283
282
  elapsed_minutes = (time.time() - start_time) / 60
284
- print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator version is not yet ACTIVE")
283
+ print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator is not yet ACTIVE")
285
284
  return False
286
285
 
287
286
 
@@ -566,16 +565,42 @@ def _upload_dataset(
566
565
  def _upload_and_ensure_evaluator(
567
566
  project_root: str,
568
567
  evaluator_id: str,
568
+ evaluator_resource_name: str,
569
569
  api_key: str,
570
570
  api_base: str,
571
+ force: bool,
571
572
  ) -> bool:
572
- """Upload evaluator and ensure its version becomes ACTIVE.
573
-
574
- Creates/updates the evaluator and uploads the code, then polls the specific
575
- version until it becomes ACTIVE.
576
- """
577
- from eval_protocol.evaluation import create_evaluation
573
+ """Ensure the evaluator exists and is ACTIVE, uploading it if needed."""
574
+ # Optional short-circuit: if evaluator already exists and not forcing, skip upload path
575
+ if not force:
576
+ try:
577
+ headers = {
578
+ "Authorization": f"Bearer {api_key}",
579
+ "Content-Type": "application/json",
580
+ "User-Agent": get_user_agent(),
581
+ }
582
+ resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
583
+ if resp.ok:
584
+ state = resp.json().get("state", "STATE_UNSPECIFIED")
585
+ print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
586
+ # Poll for ACTIVE before proceeding
587
+ print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
588
+ if not _poll_evaluator_status(
589
+ evaluator_resource_name=evaluator_resource_name,
590
+ api_key=api_key,
591
+ api_base=api_base,
592
+ timeout_minutes=10,
593
+ ):
594
+ dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
595
+ print("\n❌ Evaluator is not ready within the timeout period.")
596
+ print(f"📊 Please check the evaluator status at: {dashboard_url}")
597
+ print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
598
+ return False
599
+ return True
600
+ except requests.exceptions.RequestException:
601
+ pass
578
602
 
603
+ # Ensure evaluator exists by invoking the upload flow programmatically
579
604
  try:
580
605
  tests = _discover_tests(project_root)
581
606
  selected_entry: Optional[str] = None
@@ -592,37 +617,43 @@ def _upload_and_ensure_evaluator(
592
617
  )
593
618
  return False
594
619
 
595
- print(f"\nUploading evaluator '{evaluator_id}'...")
596
- result, version_id = create_evaluation(
597
- evaluator_id=evaluator_id,
598
- display_name=evaluator_id,
599
- description=f"Evaluator for {evaluator_id}",
600
- entry_point=selected_entry,
620
+ upload_args = argparse.Namespace(
621
+ path=project_root,
622
+ entry=selected_entry,
623
+ id=evaluator_id,
624
+ display_name=None,
625
+ description=None,
626
+ force=force, # Pass through the --force flag
627
+ yes=True,
628
+ env_file=None, # Add the new env_file parameter
601
629
  )
602
630
 
603
- if not version_id:
604
- print("Warning: Evaluator created but version upload failed.")
605
- return False
631
+ if force:
632
+ print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")
606
633
 
607
- print(f"✓ Uploaded evaluator: {evaluator_id} (version: {version_id})")
634
+ rc = upload_command(upload_args)
635
+ if rc == 0:
636
+ print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
608
637
 
609
- # Poll for the specific evaluator version status
610
- print(f"Waiting for evaluator '{evaluator_id}' version '{version_id}' to become ACTIVE...")
611
- is_active = _poll_evaluator_version_status(
612
- evaluator_id=evaluator_id,
613
- version_id=version_id,
614
- api_key=api_key,
615
- api_base=api_base,
616
- timeout_minutes=10,
617
- )
638
+ # Poll for evaluator status
639
+ print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
640
+ is_active = _poll_evaluator_status(
641
+ evaluator_resource_name=evaluator_resource_name,
642
+ api_key=api_key,
643
+ api_base=api_base,
644
+ timeout_minutes=10,
645
+ )
618
646
 
619
- if not is_active:
620
- dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
621
- print("\n❌ Evaluator version is not ready within the timeout period.")
622
- print(f"📊 Please check the evaluator status at: {dashboard_url}")
623
- print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
647
+ if not is_active:
648
+ dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
649
+ print("\n❌ Evaluator is not ready within the timeout period.")
650
+ print(f"📊 Please check the evaluator status at: {dashboard_url}")
651
+ print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
652
+ return False
653
+ return True
654
+ else:
655
+ print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
624
656
  return False
625
- return True
626
657
  except Exception as e:
627
658
  print(f"Warning: Failed to upload evaluator automatically: {e}")
628
659
  return False
@@ -641,7 +672,7 @@ def _create_rft_job(
641
672
  ) -> int:
642
673
  """Build and submit the RFT job request (via Fireworks SDK)."""
643
674
 
644
- signature = inspect.signature(create_fireworks_client().reinforcement_fine_tuning_jobs.create)
675
+ signature = inspect.signature(Fireworks().reinforcement_fine_tuning_jobs.create)
645
676
 
646
677
  # Build top-level SDK kwargs
647
678
  sdk_kwargs: Dict[str, Any] = {
@@ -680,7 +711,7 @@ def _create_rft_job(
680
711
  return 0
681
712
 
682
713
  try:
683
- fw: Fireworks = create_fireworks_client(api_key=api_key, base_url=api_base)
714
+ fw: Fireworks = Fireworks(api_key=api_key, base_url=api_base)
684
715
  job: ReinforcementFineTuningJob = fw.reinforcement_fine_tuning_jobs.create(account_id=account_id, **sdk_kwargs)
685
716
  job_name = job.name
686
717
  print(f"\n✅ Created Reinforcement Fine-tuning Job: {job_name}")
@@ -708,6 +739,7 @@ def create_rft_command(args) -> int:
708
739
  evaluator_arg: Optional[str] = getattr(args, "evaluator", None)
709
740
  non_interactive: bool = bool(getattr(args, "yes", False))
710
741
  dry_run: bool = bool(getattr(args, "dry_run", False))
742
+ force: bool = bool(getattr(args, "force", False))
711
743
  skip_validation: bool = bool(getattr(args, "skip_validation", False))
712
744
  ignore_docker: bool = bool(getattr(args, "ignore_docker", False))
713
745
  docker_build_extra: str = getattr(args, "docker_build_extra", "") or ""
@@ -778,12 +810,14 @@ def create_rft_command(args) -> int:
778
810
  if not dataset_id or not dataset_resource:
779
811
  return 1
780
812
 
781
- # 5) Ensure evaluator exists and its latest version is ACTIVE (upload + poll if needed)
813
+ # 5) Ensure evaluator exists and is ACTIVE (upload + poll if needed)
782
814
  if not _upload_and_ensure_evaluator(
783
815
  project_root=project_root,
784
816
  evaluator_id=evaluator_id,
817
+ evaluator_resource_name=evaluator_resource_name,
785
818
  api_key=api_key,
786
819
  api_base=api_base,
820
+ force=force,
787
821
  ):
788
822
  return 1
789
823
 
@@ -289,6 +289,7 @@ def upload_command(args: argparse.Namespace) -> int:
289
289
  base_id = getattr(args, "id", None)
290
290
  display_name = getattr(args, "display_name", None)
291
291
  description = getattr(args, "description", None)
292
+ force = bool(getattr(args, "force", False))
292
293
  env_file = getattr(args, "env_file", None)
293
294
 
294
295
  # Load secrets from .env file and ensure they're available on Fireworks
@@ -377,18 +378,17 @@ def upload_command(args: argparse.Namespace) -> int:
377
378
 
378
379
  print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
379
380
  try:
380
- result, version_id = create_evaluation(
381
+ result = create_evaluation(
381
382
  evaluator_id=evaluator_id,
382
383
  display_name=display_name or evaluator_id,
383
384
  description=description or f"Evaluator for {qualname}",
385
+ force=force,
384
386
  entry_point=entry_point,
385
387
  )
386
388
  name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
387
389
 
388
390
  # Print success message with Fireworks dashboard link
389
391
  print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
390
- if version_id:
391
- print(f" Version: {version_id}")
392
392
  print("📊 View in Fireworks Dashboard:")
393
393
  dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
394
394
  print(f" {dashboard_url}\n")
@@ -4,15 +4,14 @@ import time
4
4
  from typing import List, Optional
5
5
 
6
6
  import fireworks
7
- from fireworks.types import EvaluatorVersionParam
8
7
  import requests
8
+ from fireworks import Fireworks
9
9
 
10
10
  from eval_protocol.auth import (
11
11
  get_fireworks_account_id,
12
12
  get_fireworks_api_key,
13
13
  verify_api_key_and_get_account_id,
14
14
  )
15
- from eval_protocol.fireworks_client import create_fireworks_client
16
15
  from eval_protocol.get_pep440_version import get_pep440_version
17
16
 
18
17
  logger = logging.getLogger(__name__)
@@ -154,7 +153,7 @@ class Evaluator:
154
153
  logger.info(f"Created {output_path} ({size_bytes:,} bytes)")
155
154
  return size_bytes
156
155
 
157
- def create(self, evaluator_id, display_name=None, description=None):
156
+ def create(self, evaluator_id, display_name=None, description=None, force=False):
158
157
  auth_token = self.api_key or get_fireworks_api_key()
159
158
  account_id = self.account_id or get_fireworks_account_id()
160
159
  if not account_id and auth_token:
@@ -164,11 +163,7 @@ class Evaluator:
164
163
  logger.error("Authentication error: API credentials appear to be invalid or incomplete.")
165
164
  raise ValueError("Invalid or missing API credentials.")
166
165
 
167
- client = create_fireworks_client(
168
- api_key=auth_token,
169
- base_url=self.api_base,
170
- account_id=account_id,
171
- )
166
+ client = Fireworks(api_key=auth_token, base_url=self.api_base, account_id=account_id)
172
167
 
173
168
  self.display_name = display_name or evaluator_id
174
169
  self.description = description or f"Evaluator created from {evaluator_id}"
@@ -202,20 +197,28 @@ class Evaluator:
202
197
  logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...")
203
198
 
204
199
  try:
205
- # Try to create evaluator using SDK
206
- try:
207
- result = client.evaluators.create(
208
- evaluator_id=evaluator_id,
209
- evaluator=evaluator_params,
210
- )
211
- logger.info(f"Successfully created evaluator '{evaluator_id}'")
212
- except fireworks.APIStatusError as create_error:
213
- if create_error.status_code == 409:
214
- # Evaluator already exists, get the existing one and proceed to create a new version
215
- logger.info(f"Evaluator '{evaluator_id}' already exists, creating new version...")
216
- result = client.evaluators.get(evaluator_id=evaluator_id)
217
- else:
218
- raise
200
+ if force:
201
+ try:
202
+ logger.info("Checking if evaluator exists")
203
+ existing_evaluator = client.evaluators.get(evaluator_id=evaluator_id)
204
+ if existing_evaluator:
205
+ logger.info(f"Evaluator '{evaluator_id}' already exists, deleting and recreating...")
206
+ try:
207
+ client.evaluators.delete(evaluator_id=evaluator_id)
208
+ logger.info(f"Successfully deleted evaluator '{evaluator_id}'")
209
+ except fireworks.NotFoundError:
210
+ logger.info(f"Evaluator '{evaluator_id}' not found, creating...")
211
+ except fireworks.APIError as e:
212
+ logger.warning(f"Error deleting evaluator: {str(e)}")
213
+ except fireworks.NotFoundError:
214
+ logger.info(f"Evaluator '{evaluator_id}' does not exist, creating...")
215
+
216
+ # Create evaluator using SDK
217
+ result = client.evaluators.create(
218
+ evaluator_id=evaluator_id,
219
+ evaluator=evaluator_params,
220
+ )
221
+ logger.info(f"Successfully created evaluator '{evaluator_id}'")
219
222
 
220
223
  # Upload code as tar.gz to GCS
221
224
  evaluator_name = result.name # e.g., "accounts/pyroworks/evaluators/test-123"
@@ -226,25 +229,6 @@ class Evaluator:
226
229
  f"Cannot proceed with code upload. Response: {result}"
227
230
  )
228
231
 
229
- evaluator_version_param: EvaluatorVersionParam = {}
230
- if "commit_hash" in evaluator_params:
231
- evaluator_version_param["commit_hash"] = evaluator_params["commit_hash"]
232
- if "entry_point" in evaluator_params:
233
- evaluator_version_param["entry_point"] = evaluator_params["entry_point"]
234
- if "requirements" in evaluator_params:
235
- evaluator_version_param["requirements"] = evaluator_params["requirements"]
236
-
237
- evaluator_version = client.evaluator_versions.create(
238
- evaluator_id=evaluator_id,
239
- evaluator_version=evaluator_version_param,
240
- )
241
- evaluator_version_id = evaluator_version.name.split("/")[-1] if evaluator_version.name else None
242
- if not evaluator_version_id:
243
- raise ValueError(
244
- "Create evaluator version response missing 'name' field. "
245
- f"Cannot proceed with code upload. Response: {evaluator_version}"
246
- )
247
-
248
232
  try:
249
233
  # Create tar.gz of current directory
250
234
  cwd = os.getcwd()
@@ -256,8 +240,7 @@ class Evaluator:
256
240
 
257
241
  # Call GetEvaluatorUploadEndpoint using SDK
258
242
  logger.info(f"Requesting upload endpoint for {tar_filename}")
259
- upload_response = client.evaluator_versions.get_upload_endpoint(
260
- version_id=evaluator_version_id,
243
+ upload_response = client.evaluators.get_upload_endpoint(
261
244
  evaluator_id=evaluator_id,
262
245
  filename_to_size={tar_filename: str(tar_size)},
263
246
  )
@@ -338,9 +321,9 @@ class Evaluator:
338
321
  raise
339
322
 
340
323
  # Step 3: Validate upload using SDK
341
- client.evaluator_versions.validate_upload(
342
- version_id=evaluator_version_id,
324
+ client.evaluators.validate_upload(
343
325
  evaluator_id=evaluator_id,
326
+ body={},
344
327
  )
345
328
  logger.info("Upload validated successfully")
346
329
 
@@ -351,10 +334,8 @@ class Evaluator:
351
334
  except Exception as upload_error:
352
335
  logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}")
353
336
  # Don't fail - evaluator is created, just code upload failed
354
- # Return None for version_id since upload failed
355
- return result, None
356
337
 
357
- return result, evaluator_version_id # Return evaluator result and version ID
338
+ return result # Return after attempting upload
358
339
  except fireworks.APIStatusError as e:
359
340
  logger.error(f"Error creating evaluator: {str(e)}")
360
341
  logger.error(f"Status code: {e.status_code}, Response: {e.response.text}")
@@ -380,6 +361,7 @@ def create_evaluation(
380
361
  evaluator_id: str,
381
362
  display_name: Optional[str] = None,
382
363
  description: Optional[str] = None,
364
+ force: bool = False,
383
365
  account_id: Optional[str] = None,
384
366
  api_key: Optional[str] = None,
385
367
  entry_point: Optional[str] = None,
@@ -391,13 +373,10 @@ def create_evaluation(
391
373
  evaluator_id: Unique identifier for the evaluator
392
374
  display_name: Display name for the evaluator
393
375
  description: Description for the evaluator
376
+ force: If True, delete and recreate if evaluator exists
394
377
  account_id: Optional Fireworks account ID
395
378
  api_key: Optional Fireworks API key
396
379
  entry_point: Optional entry point (module::function or path::function)
397
-
398
- Returns:
399
- A tuple of (evaluator_result, version_id) where version_id is the ID of the
400
- created evaluator version, or None if upload failed.
401
380
  """
402
381
  evaluator = Evaluator(
403
382
  account_id=account_id,
@@ -405,4 +384,4 @@ def create_evaluation(
405
384
  entry_point=entry_point,
406
385
  )
407
386
 
408
- return evaluator.create(evaluator_id, display_name, description)
387
+ return evaluator.create(evaluator_id, display_name, description, force)