eval-protocol 0.2.84__tar.gz → 0.2.84.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (449)
  1. {eval_protocol-0.2.84/eval_protocol.egg-info → eval_protocol-0.2.84.dev2}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/auth.py +68 -3
  4. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli.py +26 -1
  5. eval_protocol-0.2.84.dev2/eval_protocol/cli_commands/local_test.py +140 -0
  6. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/upload.py +24 -3
  7. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/evaluation.py +3 -1
  8. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/fireworks_rft.py +3 -1
  9. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/handle_persist_flow.py +15 -15
  10. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
  11. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/SOURCES.txt +2 -0
  12. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_auth.py +10 -5
  13. eval_protocol-0.2.84.dev2/tests/test_cli_local_test.py +145 -0
  14. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/LICENSE +0 -0
  15. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/README.md +0 -0
  16. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/__init__.py +0 -0
  17. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/normalize_sandbox_fusion.py +0 -0
  18. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/utils/__init__.py +0 -0
  19. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/utils/generate_api_key.py +0 -0
  20. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/utils/subprocess_manager.py +0 -0
  21. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/__init__.py +0 -0
  22. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/__main__.py +0 -0
  23. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/__init__.py +0 -0
  24. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/base.py +0 -0
  25. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/bigquery.py +0 -0
  26. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/braintrust.py +0 -0
  27. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  28. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/huggingface.py +0 -0
  29. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/langchain.py +0 -0
  30. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/langfuse.py +0 -0
  31. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/langsmith.py +0 -0
  32. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/openai_responses.py +0 -0
  33. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/trl.py +0 -0
  34. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/utils.py +0 -0
  35. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/weave.py +0 -0
  36. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/__init__.py +0 -0
  37. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/models.py +0 -0
  38. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/orchestrator.py +0 -0
  39. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resource_abc.py +0 -0
  40. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resource_pool.py +0 -0
  41. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
  42. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  43. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  44. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  45. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  46. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  47. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
  48. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  49. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  50. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
  51. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/task_manager.py +0 -0
  52. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/tool_registry.py +0 -0
  53. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
  54. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  55. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  56. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
  57. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  58. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  59. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  60. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  61. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  62. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
  63. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  64. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/common.py +0 -0
  65. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/create_rft.py +0 -0
  66. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
  67. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  68. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/logs.py +0 -0
  69. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/preview.py +0 -0
  70. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  71. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/common_utils.py +0 -0
  72. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/config.py +0 -0
  73. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/__init__.py +0 -0
  74. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  75. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  76. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  77. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  78. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/models.py +0 -0
  79. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
  80. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  81. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  82. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  83. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  84. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/datasets/__init__.py +0 -0
  85. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/datasets/loader.py +0 -0
  86. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/directory_utils.py +0 -0
  87. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/__init__.py +0 -0
  88. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
  89. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/logger.py +0 -0
  90. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  91. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  92. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/exceptions.py +0 -0
  93. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/execution/__init__.py +0 -0
  94. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/execution/pipeline.py +0 -0
  95. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/gcp_tools.py +0 -0
  96. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generation/cache.py +0 -0
  97. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generation/clients/base.py +0 -0
  98. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generation/clients.py +0 -0
  99. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generic_server.py +0 -0
  100. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/get_pep440_version.py +0 -0
  101. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/human_id/__init__.py +0 -0
  102. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/human_id/dictionary.py +0 -0
  103. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/__init__.py +0 -0
  104. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/deepeval.py +0 -0
  105. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/openeval.py +0 -0
  106. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/trl.py +0 -0
  107. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/__init__.py +0 -0
  108. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  109. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  110. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  111. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  112. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/init.py +0 -0
  113. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/rollout_context.py +0 -0
  114. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  115. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/util.py +0 -0
  116. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/logging_utils.py +0 -0
  117. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/__init__.py +0 -0
  118. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/adapter.py +0 -0
  119. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
  120. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/client/connection.py +0 -0
  121. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/clients.py +0 -0
  122. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
  123. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
  124. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/manager.py +0 -0
  125. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
  126. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
  127. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  128. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
  129. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/process_manager.py +0 -0
  130. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
  131. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/session/manager.py +0 -0
  132. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
  133. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
  134. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
  135. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/config.py +0 -0
  136. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/main.py +0 -0
  137. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  138. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  139. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  140. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  141. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_env.py +0 -0
  142. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
  143. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  144. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  145. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  146. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  147. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  148. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  149. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  150. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  151. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  152. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  153. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  154. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  155. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  156. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  157. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/models.py +0 -0
  158. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/packaging.py +0 -0
  159. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/platform_api.py +0 -0
  160. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/playback_policy.py +0 -0
  161. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/__init__.py +0 -0
  162. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  163. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
  164. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  165. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  166. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  167. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
  168. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
  169. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  170. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/__init__.py +0 -0
  171. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  172. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  173. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  174. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  175. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  176. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  177. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  178. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  179. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  180. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/evaluation_test.py +0 -0
  181. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  182. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  183. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/exception_config.py +0 -0
  184. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/execution.py +0 -0
  185. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  186. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  187. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/parameterize.py +0 -0
  188. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/plugin.py +0 -0
  189. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  190. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
  191. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
  192. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
  193. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/tracing_utils.py +0 -0
  194. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/types.py +0 -0
  195. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
  196. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/__init__.py +0 -0
  197. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  198. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  199. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  200. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  201. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  202. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  203. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  204. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
  205. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  206. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  207. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  208. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  209. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/utils.py +0 -0
  210. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/resources.py +0 -0
  211. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/reward_function.py +0 -0
  212. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/__init__.py +0 -0
  213. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/accuracy.py +0 -0
  214. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
  215. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  216. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  217. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
  218. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
  219. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/code_execution.py +0 -0
  220. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
  221. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
  222. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  223. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/format.py +0 -0
  224. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/function_calling.py +0 -0
  225. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/json_schema.py +0 -0
  226. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
  227. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
  228. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/length.py +0 -0
  229. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  230. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/math.py +0 -0
  231. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  232. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
  233. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/repetition.py +0 -0
  234. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/tag_count.py +0 -0
  235. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rl_processing.py +0 -0
  236. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/server.py +0 -0
  237. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/stats/__init__.py +0 -0
  238. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
  239. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/typed_interface.py +0 -0
  240. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/__init__.py +0 -0
  241. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/errors.py +0 -0
  242. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
  243. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/types.py +0 -0
  244. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/__init__.py +0 -0
  245. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
  246. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
  247. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/browser_utils.py +0 -0
  248. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/check_server_status.py +0 -0
  249. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
  250. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  251. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/logs_models.py +0 -0
  252. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/logs_server.py +0 -0
  253. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/module_loader.py +0 -0
  254. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
  255. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/show_results_url.py +0 -0
  256. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/static_policy.py +0 -0
  257. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
  258. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/vite_server.py +0 -0
  259. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
  260. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
  261. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/requires.txt +0 -0
  262. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
  263. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/pyproject.toml +0 -0
  264. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/setup.cfg +0 -0
  265. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/setup.py +0 -0
  266. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_accuracy.py +0 -0
  267. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_accuracy_length.py +0 -0
  268. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_adapters_e2e.py +0 -0
  269. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_agent_orchestrator.py +0 -0
  270. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_agent_resources.py +0 -0
  271. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_batch_evaluation.py +0 -0
  272. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli.py +0 -0
  273. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli_agent.py +0 -0
  274. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli_args.py +0 -0
  275. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli_create_rft_infer.py +0 -0
  276. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_code_execution.py +0 -0
  277. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_config.py +0 -0
  278. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_control_plane_separation.py +0 -0
  279. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cpp_code.py +0 -0
  280. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_data_driven_task_manager.py +0 -0
  281. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_deepcoder_reward.py +0 -0
  282. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_deepeval_integration.py +0 -0
  283. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_deploy_integration.py +0 -0
  284. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_directory_utils.py +0 -0
  285. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_e2b_integration.py +0 -0
  286. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_e2b_js_integration.py +0 -0
  287. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_edge_cases.py +0 -0
  288. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_ep_upload_e2e.py +0 -0
  289. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_eval_protocol_import.py +0 -0
  290. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation.py +0 -0
  291. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation_integration.py +0 -0
  292. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation_postprocess.py +0 -0
  293. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation_preview_integration.py +0 -0
  294. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_event_bus.py +0 -0
  295. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_event_bus_helper.py +0 -0
  296. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_examples_end_to_end.py +0 -0
  297. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_exceptions.py +0 -0
  298. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_fireworks_api.py +0 -0
  299. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_format.py +0 -0
  300. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_fractional_code.py +0 -0
  301. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_function_calling.py +0 -0
  302. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_gcp_tools.py +0 -0
  303. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_generic_server.py +0 -0
  304. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_human_id.py +0 -0
  305. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_integration.py +0 -0
  306. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_json_schema.py +0 -0
  307. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_kwargs_validation.py +0 -0
  308. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_language_consistency.py +0 -0
  309. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_lean_prover.py +0 -0
  310. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_lean_prover_runner.py +0 -0
  311. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_length.py +0 -0
  312. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_list_comparison_math_reward.py +0 -0
  313. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_logs_server.py +0 -0
  314. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_logs_server_simple.py +0 -0
  315. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_math.py +0 -0
  316. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_message_field_filtering.py +0 -0
  317. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_minimal.py +0 -0
  318. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_models.py +0 -0
  319. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_models_rl.py +0 -0
  320. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
  321. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_n_variant_batch_integration.py +0 -0
  322. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_n_variant_integration.py +0 -0
  323. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_openai_compatibility.py +0 -0
  324. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_openeval_integration.py +0 -0
  325. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_packaging.py +0 -0
  326. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_parallel_rollouts.py +0 -0
  327. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_platform_api.py +0 -0
  328. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_quickstart_utils.py +0 -0
  329. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_readiness.py +0 -0
  330. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_reasoning_steps.py +0 -0
  331. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_repetition.py +0 -0
  332. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_repetition_debug.py +0 -0
  333. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_retry_mechanism.py +0 -0
  334. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_reward_function.py +0 -0
  335. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_reward_protocol_import.py +0 -0
  336. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_rl_processing.py +0 -0
  337. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
  338. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_server.py +0 -0
  339. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_show_results_url.py +0 -0
  340. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_status_migration_changes.py +0 -0
  341. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_status_migration_integration.py +0 -0
  342. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_status_model.py +0 -0
  343. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_tag_count.py +0 -0
  344. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
  345. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_typed_interface.py +0 -0
  346. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_typed_interface_rl.py +0 -0
  347. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_upload_entrypoint.py +0 -0
  348. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_url_handling.py +0 -0
  349. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_vite_server.py +0 -0
  350. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/__init__.py +0 -0
  351. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/agent/__init__.py +0 -0
  352. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/agent/base.py +0 -0
  353. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
  354. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/__init__.py +0 -0
  355. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/api_config.py +0 -0
  356. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/data_model.py +0 -0
  357. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
  358. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/cli.py +0 -0
  359. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/config.py +0 -0
  360. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
  361. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
  362. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  363. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
  364. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  365. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  366. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  367. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  368. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  369. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  370. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  371. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/__init__.py +0 -0
  372. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/message.py +0 -0
  373. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/simulation.py +0 -0
  374. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/tasks.py +0 -0
  375. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/__init__.py +0 -0
  376. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
  377. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
  378. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
  379. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
  380. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
  381. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
  382. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
  383. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
  384. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
  385. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
  386. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
  387. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
  388. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
  389. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
  390. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
  391. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
  392. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
  393. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
  394. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  395. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  396. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  397. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  398. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  399. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  400. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  401. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  402. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
  403. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  404. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  405. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
  406. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/__init__.py +0 -0
  407. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/db.py +0 -0
  408. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/environment.py +0 -0
  409. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/server.py +0 -0
  410. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/tool.py +0 -0
  411. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/toolkit.py +0 -0
  412. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  413. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
  414. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
  415. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  416. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  417. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  418. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  419. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  420. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/metrics/__init__.py +0 -0
  421. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
  422. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  423. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
  424. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  425. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  426. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
  427. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/registry.py +0 -0
  428. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/run.py +0 -0
  429. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/__init__.py +0 -0
  430. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/check_data.py +0 -0
  431. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  432. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
  433. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
  434. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/user/__init__.py +0 -0
  435. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/user/base.py +0 -0
  436. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/user/user_simulator.py +0 -0
  437. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/__init__.py +0 -0
  438. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/display.py +0 -0
  439. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/io_utils.py +0 -0
  440. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
  441. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
  442. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/utils.py +0 -0
  443. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/versioneer.py +0 -0
  444. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  445. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
  446. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
  447. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
  448. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  449. {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.84
3
+ Version: 0.2.84.dev2
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-11-10T00:30:58-0800",
11
+ "date": "2025-11-10T17:41:27-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "2d75acf5944468856d9f1bea787fce63dcabc16f",
15
- "version": "0.2.84"
14
+ "full-revisionid": "cd9cc91c34f975482fe05b4bf3a60b4a0bcbd746",
15
+ "version": "0.2.84.dev.2"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -136,6 +136,56 @@ def _get_credential_from_config_file(key_name: str) -> Optional[str]:
136
136
  return None
137
137
 
138
138
 
139
+ def _get_credentials_from_config_file() -> Dict[str, Optional[str]]:
140
+ """
141
+ Retrieve both api_key and account_id from auth.ini with a single read/parse.
142
+ Tries simple parsing first for both keys, then falls back to configparser for any missing ones.
143
+ Returns a dict with up to two keys: 'api_key' and 'account_id'.
144
+ """
145
+ results: Dict[str, Optional[str]] = {}
146
+ auth_ini_path = _get_auth_ini_file()
147
+ if not auth_ini_path.exists():
148
+ return results
149
+
150
+ # 1) Simple key=value parsing
151
+ try:
152
+ simple_creds = _parse_simple_auth_file(auth_ini_path)
153
+ if "api_key" in simple_creds and simple_creds["api_key"]:
154
+ results["api_key"] = simple_creds["api_key"]
155
+ if "account_id" in simple_creds and simple_creds["account_id"]:
156
+ results["account_id"] = simple_creds["account_id"]
157
+ if "api_key" in results and "account_id" in results:
158
+ return results
159
+ except Exception as e:
160
+ logger.warning("Error during simple parsing of %s: %s", str(auth_ini_path), e)
161
+
162
+ # 2) ConfigParser for any missing keys
163
+ try:
164
+ config = configparser.ConfigParser()
165
+ config.read(auth_ini_path)
166
+ for key_name in ("api_key", "account_id"):
167
+ if key_name in results and results[key_name]:
168
+ continue
169
+ if "fireworks" in config and config.has_option("fireworks", key_name):
170
+ value_from_file = config.get("fireworks", key_name)
171
+ if value_from_file:
172
+ results[key_name] = value_from_file
173
+ continue
174
+ if config.has_option(config.default_section, key_name):
175
+ value_from_default = config.get(config.default_section, key_name)
176
+ if value_from_default:
177
+ results[key_name] = value_from_default
178
+ except configparser.MissingSectionHeaderError:
179
+ # Purely key=value file without section headers; simple parsing should have handled it already.
180
+ logger.debug("%s has no section headers; falling back to simple parsing results.", str(auth_ini_path))
181
+ except configparser.Error as e_config:
182
+ logger.warning("Configparser error reading %s: %s", str(auth_ini_path), e_config)
183
+ except Exception as e_general:
184
+ logger.warning("Unexpected error reading %s: %s", str(auth_ini_path), e_general)
185
+
186
+ return results
187
+
188
+
139
189
  def get_fireworks_api_key() -> Optional[str]:
140
190
  """
141
191
  Retrieves the Fireworks API key.
@@ -177,13 +227,15 @@ def get_fireworks_account_id() -> Optional[str]:
177
227
  The Account ID is sourced in the following order:
178
228
  1. FIREWORKS_ACCOUNT_ID environment variable.
179
229
  2. 'account_id' from the [fireworks] section of ~/.fireworks/auth.ini.
230
+ 3. If an API key is available (env or auth.ini), resolve via verifyApiKey.
180
231
 
181
232
  Returns:
182
233
  The Account ID if found, otherwise None.
183
234
  """
184
235
  # If a profile is active, prefer profile file first, then env
185
236
  if _is_profile_active():
186
- account_id_from_file = _get_credential_from_config_file("account_id")
237
+ creds = _get_credentials_from_config_file()
238
+ account_id_from_file = creds.get("account_id")
187
239
  if account_id_from_file:
188
240
  return account_id_from_file
189
241
  account_id = os.environ.get("FIREWORKS_ACCOUNT_ID")
@@ -196,11 +248,24 @@ def get_fireworks_account_id() -> Optional[str]:
196
248
  if account_id:
197
249
  logger.debug("Using FIREWORKS_ACCOUNT_ID from environment variable.")
198
250
  return account_id
199
- account_id_from_file = _get_credential_from_config_file("account_id")
251
+ creds = _get_credentials_from_config_file()
252
+ account_id_from_file = creds.get("account_id")
200
253
  if account_id_from_file:
201
254
  return account_id_from_file
202
255
 
203
- logger.debug("Fireworks Account ID not found in environment variables or auth.ini.")
256
+ # 3) Fallback: if API key is present, attempt to resolve via verifyApiKey (env or auth.ini)
257
+ try:
258
+ # Intentionally use get_fireworks_api_key to centralize precedence (env vs file)
259
+ api_key_for_verify = get_fireworks_api_key()
260
+ if api_key_for_verify:
261
+ resolved = verify_api_key_and_get_account_id(api_key=api_key_for_verify, api_base=get_fireworks_api_base())
262
+ if resolved:
263
+ logger.debug("Using FIREWORKS_ACCOUNT_ID resolved via verifyApiKey: %s", resolved)
264
+ return resolved
265
+ except Exception as e:
266
+ logger.debug("Failed to resolve FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", e)
267
+
268
+ logger.debug("Fireworks Account ID not found in environment variables, auth.ini, or via verifyApiKey.")
204
269
  return None
205
270
 
206
271
 
@@ -395,7 +395,7 @@ def parse_args(args=None):
395
395
  rft_parser.add_argument("--base-model", help="Base model resource id")
396
396
  rft_parser.add_argument("--warm-start-from", help="Addon model to warm start from")
397
397
  rft_parser.add_argument("--output-model", help="Output model id (defaults from evaluator)")
398
- rft_parser.add_argument("--epochs", type=int, default=8)
398
+ rft_parser.add_argument("--epochs", type=int, default=1)
399
399
  rft_parser.add_argument("--batch-size", type=int, default=128000)
400
400
  rft_parser.add_argument("--learning-rate", type=float, default=3e-5)
401
401
  rft_parser.add_argument("--max-context-length", type=int, default=65536)
@@ -427,6 +427,27 @@ def parse_args(args=None):
427
427
  rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
428
428
  rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
429
429
 
430
+ # Local test command
431
+ local_test_parser = subparsers.add_parser(
432
+ "local-test",
433
+ help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
434
+ )
435
+ local_test_parser.add_argument(
436
+ "--entry",
437
+ help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
438
+ )
439
+ local_test_parser.add_argument(
440
+ "--ignore-docker",
441
+ action="store_true",
442
+ help="Ignore Dockerfile even if present; run pytest on host",
443
+ )
444
+ local_test_parser.add_argument(
445
+ "--yes",
446
+ "-y",
447
+ action="store_true",
448
+ help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
449
+ )
450
+
430
451
  # Run command (for Hydra-based evaluations)
431
452
  # This subparser intentionally defines no arguments itself.
432
453
  # All arguments after 'run' will be passed to Hydra by parse_known_args.
@@ -559,6 +580,10 @@ def main():
559
580
  return create_rft_command(args)
560
581
  print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
561
582
  return 1
583
+ elif args.command == "local-test":
584
+ from .cli_commands.local_test import local_test_command
585
+
586
+ return local_test_command(args)
562
587
  elif args.command == "run":
563
588
  # For the 'run' command, Hydra takes over argument parsing.
564
589
 
@@ -0,0 +1,140 @@
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ from typing import List
6
+
7
+ from .upload import _discover_tests, _prompt_select
8
+
9
+
10
+ def _find_dockerfiles(root: str) -> List[str]:
11
+ skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
12
+ dockerfiles: List[str] = []
13
+ for dirpath, dirnames, filenames in os.walk(root):
14
+ dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
15
+ for name in filenames:
16
+ if name == "Dockerfile":
17
+ dockerfiles.append(os.path.join(dirpath, name))
18
+ return dockerfiles
19
+
20
+
21
+ def _run_pytest_host(pytest_target: str) -> int:
22
+ print(f"Running locally: pytest {pytest_target} -vs")
23
+ proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"])
24
+ return proc.returncode
25
+
26
+
27
+ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
28
+ context_dir = os.path.dirname(dockerfile_path)
29
+ print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
30
+ try:
31
+ proc = subprocess.run(
32
+ ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
33
+ stdout=subprocess.PIPE,
34
+ stderr=subprocess.STDOUT,
35
+ text=True,
36
+ )
37
+ print(proc.stdout)
38
+ return proc.returncode == 0
39
+ except FileNotFoundError:
40
+ print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
41
+ return False
42
+
43
+
44
+ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
45
+ workdir = "/workspace"
46
+ # Mount read-only is safer; but tests may write artifacts. Use read-write.
47
+ cmd = [
48
+ "docker",
49
+ "run",
50
+ "--rm",
51
+ "-v",
52
+ f"{project_root}:{workdir}",
53
+ "-w",
54
+ workdir,
55
+ image_tag,
56
+ "pytest",
57
+ pytest_target,
58
+ "-vs",
59
+ ]
60
+ print("Running in Docker:", " ".join(cmd))
61
+ try:
62
+ proc = subprocess.run(cmd)
63
+ return proc.returncode
64
+ except FileNotFoundError:
65
+ print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
66
+ return 1
67
+
68
+
69
+ def local_test_command(args: argparse.Namespace) -> int:
70
+ project_root = os.getcwd()
71
+
72
+ # Selection and pytest target resolution
73
+ pytest_target: str = ""
74
+ entry = getattr(args, "entry", None)
75
+ if entry:
76
+ if "::" in entry:
77
+ file_part = entry.split("::", 1)[0]
78
+ file_path = (
79
+ file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
80
+ )
81
+ pytest_target = entry
82
+ else:
83
+ file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
84
+ # Use path relative to project_root when possible
85
+ try:
86
+ rel = os.path.relpath(file_path, project_root)
87
+ except Exception:
88
+ rel = file_path
89
+ pytest_target = rel
90
+ else:
91
+ tests = _discover_tests(project_root)
92
+ if not tests:
93
+ print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
94
+ return 1
95
+ non_interactive = bool(getattr(args, "yes", False))
96
+ selected = _prompt_select(tests, non_interactive=non_interactive)
97
+ if not selected:
98
+ print("No tests selected.")
99
+ return 1
100
+ if len(selected) != 1:
101
+ print("Error: Please select exactly one evaluation test for 'local-test'.")
102
+ return 1
103
+ chosen = selected[0]
104
+ abs_path = os.path.abspath(chosen.file_path)
105
+ try:
106
+ rel = os.path.relpath(abs_path, project_root)
107
+ except Exception:
108
+ rel = abs_path
109
+ pytest_target = rel
110
+
111
+ ignore_docker = bool(getattr(args, "ignore_docker", False))
112
+ if ignore_docker:
113
+ if not pytest_target:
114
+ print("Error: Failed to resolve a pytest target to run.")
115
+ return 1
116
+ return _run_pytest_host(pytest_target)
117
+
118
+ dockerfiles = _find_dockerfiles(project_root)
119
+ if len(dockerfiles) > 1:
120
+ print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
121
+ for df in dockerfiles:
122
+ print(f" - {df}")
123
+ print("Hint: use --ignore-docker to bypass Docker.")
124
+ return 1
125
+ if len(dockerfiles) == 1:
126
+ image_tag = "ep-evaluator:local"
127
+ ok = _build_docker_image(dockerfiles[0], image_tag)
128
+ if not ok:
129
+ print("Docker build failed. See logs above.")
130
+ return 1
131
+ if not pytest_target:
132
+ print("Error: Failed to resolve a pytest target to run.")
133
+ return 1
134
+ return _run_pytest_in_docker(project_root, image_tag, pytest_target)
135
+
136
+ # No Dockerfile: run on host
137
+ if not pytest_target:
138
+ print("Error: Failed to resolve a pytest target to run.")
139
+ return 1
140
+ return _run_pytest_host(pytest_target)
@@ -552,6 +552,23 @@ def _load_secrets_from_env_file(env_file_path: str) -> Dict[str, str]:
552
552
  return secrets
553
553
 
554
554
 
555
+ def _mask_secret_value(value: str) -> str:
556
+ """
557
+ Return a masked representation of a secret showing only a small prefix/suffix.
558
+ Example: fw_3Z*******Xgnk
559
+ """
560
+ try:
561
+ if not isinstance(value, str) or not value:
562
+ return "<empty>"
563
+ prefix_len = 6
564
+ suffix_len = 4
565
+ if len(value) <= prefix_len + suffix_len:
566
+ return value[0] + "***" + value[-1]
567
+ return f"{value[:prefix_len]}***{value[-suffix_len:]}"
568
+ except Exception:
569
+ return "<masked>"
570
+
571
+
555
572
  def upload_command(args: argparse.Namespace) -> int:
556
573
  root = os.path.abspath(getattr(args, "path", "."))
557
574
  entries_arg = getattr(args, "entry", None)
@@ -602,9 +619,9 @@ def upload_command(args: argparse.Namespace) -> int:
602
619
  secrets_from_file = _load_secrets_from_env_file(env_file_path)
603
620
  secrets_from_env_file = secrets_from_file.copy() # Track what came from .env file
604
621
 
605
- # Also ensure FIREWORKS_API_KEY from environment is included
622
+ # Also consider FIREWORKS_API_KEY from environment, but prefer .env value
606
623
  fw_api_key_value = get_fireworks_api_key()
607
- if fw_api_key_value:
624
+ if fw_api_key_value and "FIREWORKS_API_KEY" not in secrets_from_file:
608
625
  secrets_from_file["FIREWORKS_API_KEY"] = fw_api_key_value
609
626
 
610
627
  if not fw_account_id and fw_api_key_value:
@@ -622,7 +639,11 @@ def upload_command(args: argparse.Namespace) -> int:
622
639
  print(f"Loading secrets from: {env_file_path}")
623
640
 
624
641
  for secret_name, secret_value in secrets_from_file.items():
625
- print(f"Ensuring {secret_name} is registered as a secret on Fireworks for rollout...")
642
+ source = ".env" if secret_name in secrets_from_env_file else "environment"
643
+ print(
644
+ f"Ensuring {secret_name} is registered as a secret on Fireworks for rollout... "
645
+ f"({source}: {_mask_secret_value(secret_value)})"
646
+ )
626
647
  if create_or_update_fireworks_secret(
627
648
  account_id=fw_account_id,
628
649
  key_name=secret_name,
@@ -595,7 +595,9 @@ class Evaluator:
595
595
  logger.error("Missing requirements.txt in upload directory: %s", source_dir)
596
596
  raise ValueError(
597
597
  "Upload requires requirements.txt in the project root. "
598
- "Please add requirements.txt and re-run ep upload."
598
+ "Create a requirements.txt (it can be empty) and rerun 'eval-protocol upload' "
599
+ "or 'eval-protocol create rft'. If you're running in a notebook (e.g., Colab), "
600
+ f"create the file in your working directory (e.g., {source_dir}/requirements.txt)."
599
601
  )
600
602
 
601
603
  @staticmethod
@@ -5,6 +5,7 @@ import os
5
5
  import sys
6
6
  import tempfile
7
7
  import time
8
+ import uuid
8
9
  from pathlib import Path
9
10
  from typing import Any, Callable, Dict, Iterable, Optional, Tuple
10
11
 
@@ -205,7 +206,8 @@ def build_default_dataset_id(evaluator_id: str) -> str:
205
206
 
206
207
  def build_default_output_model(evaluator_id: str) -> str:
207
208
  base = evaluator_id.lower().replace("_", "-")
208
- return f"{base}-rft"
209
+ uuid_suffix = str(uuid.uuid4())[:4]
210
+ return f"{base}-rft-{uuid_suffix}"
209
211
 
210
212
 
211
213
  __all__ = [
@@ -11,6 +11,12 @@ from eval_protocol.common_utils import get_user_agent
11
11
  from eval_protocol.directory_utils import find_eval_protocol_dir
12
12
  from eval_protocol.models import EvaluationRow
13
13
  from eval_protocol.pytest.store_experiment_link import store_experiment_link
14
+ from eval_protocol.auth import (
15
+ get_fireworks_api_key,
16
+ get_fireworks_account_id,
17
+ verify_api_key_and_get_account_id,
18
+ get_fireworks_api_base,
19
+ )
14
20
 
15
21
  import requests
16
22
 
@@ -90,22 +96,16 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
90
96
  if not should_upload:
91
97
  continue
92
98
 
93
- def get_auth_value(key: str) -> str | None:
94
- """Get auth value from config file or environment."""
99
+ # Resolve credentials using centralized auth helpers with verification fallback
100
+ fireworks_api_key = get_fireworks_api_key()
101
+ fireworks_account_id = get_fireworks_account_id()
102
+ if not fireworks_account_id and fireworks_api_key:
95
103
  try:
96
- config_path = Path.home() / ".fireworks" / "auth.ini"
97
- if config_path.exists():
98
- config = configparser.ConfigParser() # noqa: F821
99
- config.read(config_path)
100
- for section in ["DEFAULT", "auth"]:
101
- if config.has_section(section) and config.has_option(section, key):
102
- return config.get(section, key)
104
+ fireworks_account_id = verify_api_key_and_get_account_id(
105
+ api_key=fireworks_api_key, api_base=get_fireworks_api_base()
106
+ )
103
107
  except Exception:
104
- pass
105
- return os.getenv(key)
106
-
107
- fireworks_api_key = get_auth_value("FIREWORKS_API_KEY")
108
- fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID")
108
+ fireworks_account_id = None
109
109
 
110
110
  if not fireworks_api_key and not fireworks_account_id:
111
111
  store_experiment_link(
@@ -129,7 +129,7 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
129
129
  )
130
130
  continue
131
131
 
132
- api_base = "https://api.fireworks.ai"
132
+ api_base = get_fireworks_api_base()
133
133
  headers = {
134
134
  "Authorization": f"Bearer {fireworks_api_key}",
135
135
  "Content-Type": "application/json",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.84
3
+ Version: 0.2.84.dev2
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -91,6 +91,7 @@ eval_protocol/cli_commands/common.py
91
91
  eval_protocol/cli_commands/create_rft.py
92
92
  eval_protocol/cli_commands/deploy.py
93
93
  eval_protocol/cli_commands/deploy_mcp.py
94
+ eval_protocol/cli_commands/local_test.py
94
95
  eval_protocol/cli_commands/logs.py
95
96
  eval_protocol/cli_commands/preview.py
96
97
  eval_protocol/cli_commands/run_eval_cmd.py
@@ -277,6 +278,7 @@ tests/test_cli.py
277
278
  tests/test_cli_agent.py
278
279
  tests/test_cli_args.py
279
280
  tests/test_cli_create_rft_infer.py
281
+ tests/test_cli_local_test.py
280
282
  tests/test_code_execution.py
281
283
  tests/test_config.py
282
284
  tests/test_control_plane_separation.py
@@ -255,7 +255,8 @@ def test_get_account_id_not_found(mock_path_exists):
255
255
  with patch("eval_protocol.auth._parse_simple_auth_file", return_value={}) as mock_parse_simple:
256
256
  assert get_fireworks_account_id() is None
257
257
  mock_parse_simple.assert_not_called()
258
- mock_path_exists.assert_called_once_with()
258
+ # With verify fallback using get_fireworks_api_key, exists() may be checked more than once
259
+ assert mock_path_exists.call_count >= 1
259
260
 
260
261
 
261
262
  @patch("pathlib.Path.exists", return_value=True)
@@ -269,7 +270,8 @@ def test_get_account_id_ini_exists_no_section(mock_parse_simple, mock_ConfigPars
269
270
  mock_open(read_data="other_key = some_val_but_no_section_header\nanother=val"),
270
271
  ):
271
272
  assert get_fireworks_account_id() is None
272
- mock_parse_simple.assert_called_once_with(AUTH_INI_FILE)
273
+ # Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
274
+ assert mock_parse_simple.call_count >= 1
273
275
 
274
276
 
275
277
  @patch("pathlib.Path.exists", return_value=True)
@@ -283,7 +285,8 @@ def test_get_account_id_ini_exists_no_id_option(mock_parse_simple, mock_ConfigPa
283
285
 
284
286
  with patch("builtins.open", mock_open(read_data="[fireworks]\nsome_other_key=foo")):
285
287
  assert get_fireworks_account_id() is None
286
- mock_parse_simple.assert_called_once_with(AUTH_INI_FILE)
288
+ # Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
289
+ assert mock_parse_simple.call_count >= 1
287
290
 
288
291
 
289
292
  @patch("pathlib.Path.exists", return_value=True)
@@ -301,7 +304,8 @@ def test_get_account_id_ini_empty_value(mock_parse_simple, mock_ConfigParser_cla
301
304
  )
302
305
  with patch("builtins.open", mock_open(read_data="[fireworks]\naccount_id=")):
303
306
  assert get_fireworks_account_id() is None
304
- mock_parse_simple.assert_called_once_with(AUTH_INI_FILE)
307
+ # Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
308
+ assert mock_parse_simple.call_count >= 1
305
309
 
306
310
 
307
311
  @patch("pathlib.Path.exists", return_value=True)
@@ -372,7 +376,8 @@ def test_get_account_id_ini_parse_error(mock_parse_simple, mock_ConfigParser_cla
372
376
  assert get_fireworks_account_id() is None
373
377
  assert "Configparser error reading" in caplog.text
374
378
  assert "Mocked Parsing Error" in caplog.text
375
- mock_parse_simple.assert_called_once_with(AUTH_INI_FILE)
379
+ # Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
380
+ assert mock_parse_simple.call_count >= 1
376
381
 
377
382
 
378
383
  @patch("pathlib.Path.exists", return_value=True)
@@ -0,0 +1,145 @@
1
+ import os
2
+ from types import SimpleNamespace
3
+
4
+ import pytest
5
+
6
+
7
def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch):
    """With no Dockerfile discovered, an explicit entry is run via host pytest.

    The host runner is stubbed out; the test checks that the command returns 0
    and that the target handed to the runner is the entry path expressed
    relative to the project directory (which is also the current directory).
    """
    workdir = tmp_path / "proj"
    workdir.mkdir()
    monkeypatch.chdir(workdir)

    # Dummy test file that serves as the --entry argument.
    entry_path = workdir / "metric" / "test_one.py"
    entry_path.parent.mkdir(parents=True, exist_ok=True)
    entry_path.write_text("def test_dummy():\n assert True\n", encoding="utf-8")

    # Import the module under test only after the working directory is set up.
    from eval_protocol.cli_commands import local_test as lt

    host_targets = []

    def _no_dockerfiles(root):
        # Keeps the command off the Docker path.
        return []

    def _stub_host(target: str) -> int:
        host_targets.append(target)
        return 0

    monkeypatch.setattr(lt, "_find_dockerfiles", _no_dockerfiles)
    monkeypatch.setattr(lt, "_run_pytest_host", _stub_host)

    cli_args = SimpleNamespace(entry=str(entry_path), ignore_docker=False, yes=True)
    exit_code = lt.local_test_command(cli_args)  # pyright: ignore[reportArgumentType]

    assert exit_code == 0
    # The most recent host invocation must have received the relative path.
    expected_target = os.path.relpath(str(entry_path), str(workdir))
    assert host_targets[-1:] == [expected_target]
36
+
37
+
38
def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch):
    """A discovered Dockerfile is skipped when ``ignore_docker`` is true.

    Dockerfile discovery is stubbed to report one file; the test checks that
    the command still returns 0 and that the (stubbed) host runner was called.
    """
    workdir = tmp_path / "proj"
    workdir.mkdir()
    monkeypatch.chdir(workdir)

    entry_path = workdir / "metric" / "test_two.py"
    entry_path.parent.mkdir(parents=True, exist_ok=True)
    entry_path.write_text("def test_dummy():\n assert True\n", encoding="utf-8")

    from eval_protocol.cli_commands import local_test as lt

    # Pretend a Dockerfile exists; the flag below should make it irrelevant.
    fake_dockerfile = str(workdir / "Dockerfile")
    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [fake_dockerfile])

    host_invoked = False

    def _stub_host(target: str) -> int:
        nonlocal host_invoked
        host_invoked = True
        return 0

    monkeypatch.setattr(lt, "_run_pytest_host", _stub_host)

    cli_args = SimpleNamespace(entry=str(entry_path), ignore_docker=True, yes=True)
    exit_code = lt.local_test_command(cli_args)  # pyright: ignore[reportArgumentType]

    assert exit_code == 0
    assert host_invoked is True
64
+
65
+
66
def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch):
    """The command returns 1 when Dockerfile discovery reports two files."""
    workdir = tmp_path / "proj"
    workdir.mkdir()
    monkeypatch.chdir(workdir)

    entry_path = workdir / "metric" / "test_three.py"
    entry_path.parent.mkdir(parents=True, exist_ok=True)
    entry_path.write_text("def test_dummy():\n assert True\n", encoding="utf-8")

    from eval_protocol.cli_commands import local_test as lt

    # Two candidate Dockerfiles, with Docker not ignored.
    dockerfiles = [
        str(workdir / "Dockerfile"),
        str(workdir / "another" / "Dockerfile"),
    ]

    def _two_dockerfiles(root):
        # Fresh list per call, as the command may mutate what it receives.
        return list(dockerfiles)

    monkeypatch.setattr(lt, "_find_dockerfiles", _two_dockerfiles)

    cli_args = SimpleNamespace(entry=str(entry_path), ignore_docker=False, yes=True)
    exit_code = lt.local_test_command(cli_args)  # pyright: ignore[reportArgumentType]

    assert exit_code == 1
84
+
85
+
86
def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
    """With exactly one Dockerfile, the entry is run through the Docker runner.

    Image building and the Docker runner are stubbed. The test checks the
    return code, the image tag passed to the runner ("ep-evaluator:local"),
    and that the pytest target is the entry path relative to the project dir.
    """
    workdir = tmp_path / "proj"
    workdir.mkdir()
    monkeypatch.chdir(workdir)

    entry_path = workdir / "metric" / "test_four.py"
    entry_path.parent.mkdir(parents=True, exist_ok=True)
    entry_path.write_text("def test_dummy():\n assert True\n", encoding="utf-8")

    from eval_protocol.cli_commands import local_test as lt

    only_dockerfile = str(workdir / "Dockerfile")
    docker_runs = []  # (image_tag, pytest_target) per stubbed invocation

    def _stub_build(dockerfile, tag):
        # Report a successful build without touching Docker.
        return True

    def _stub_run_docker(root: str, image_tag: str, pytest_target: str) -> int:
        docker_runs.append((image_tag, pytest_target))
        return 0

    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [only_dockerfile])
    monkeypatch.setattr(lt, "_build_docker_image", _stub_build)
    monkeypatch.setattr(lt, "_run_pytest_in_docker", _stub_run_docker)

    cli_args = SimpleNamespace(entry=str(entry_path), ignore_docker=False, yes=True)
    exit_code = lt.local_test_command(cli_args)  # pyright: ignore[reportArgumentType]

    assert exit_code == 0
    expected_target = os.path.relpath(str(entry_path), str(workdir))
    assert docker_runs[-1:] == [("ep-evaluator:local", expected_target)]
114
+
115
+
116
def test_local_test_selector_single_test(tmp_path, monkeypatch):
    """Without an entry, a discovered test chosen by the selector runs on host.

    Discovery, the interactive selector, Dockerfile lookup and the host runner
    are all stubbed; the test checks for a 0 return code and that the host
    runner was called.
    """
    workdir = tmp_path / "proj"
    workdir.mkdir()
    monkeypatch.chdir(workdir)

    entry_path = workdir / "metric" / "test_sel.py"
    entry_path.parent.mkdir(parents=True, exist_ok=True)
    entry_path.write_text("def test_dummy():\n assert True\n", encoding="utf-8")

    from eval_protocol.cli_commands import local_test as lt
    from eval_protocol.cli_commands import upload as up

    # No entry is supplied, so the command has to go through discover + select.
    discovered = SimpleNamespace(qualname="metric.test_sel", file_path=str(entry_path))

    def _pick_first(tests, non_interactive=False):
        return tests[:1]

    host_invoked = False

    def _stub_host(target: str) -> int:
        nonlocal host_invoked
        host_invoked = True
        return 0

    monkeypatch.setattr(lt, "_discover_tests", lambda root: [discovered])
    monkeypatch.setattr(up, "_prompt_select", _pick_first)
    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
    monkeypatch.setattr(lt, "_run_pytest_host", _stub_host)

    cli_args = SimpleNamespace(entry=None, ignore_docker=False, yes=True)
    exit_code = lt.local_test_command(cli_args)  # pyright: ignore[reportArgumentType]

    assert exit_code == 0
    assert host_invoked is True