eval-protocol 0.2.84.dev1__tar.gz → 0.2.84.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (449) hide show
  1. {eval_protocol-0.2.84.dev1/eval_protocol.egg-info → eval_protocol-0.2.84.dev3}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli.py +25 -0
  4. eval_protocol-0.2.84.dev3/eval_protocol/cli_commands/local_test.py +151 -0
  5. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/upload.py +2 -2
  6. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3/eval_protocol.egg-info}/PKG-INFO +1 -1
  7. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/SOURCES.txt +2 -0
  8. eval_protocol-0.2.84.dev3/tests/test_cli_local_test.py +145 -0
  9. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/LICENSE +0 -0
  10. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/README.md +0 -0
  11. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/__init__.py +0 -0
  12. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/normalize_sandbox_fusion.py +0 -0
  13. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/__init__.py +0 -0
  14. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/generate_api_key.py +0 -0
  15. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/subprocess_manager.py +0 -0
  16. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/__init__.py +0 -0
  17. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/__main__.py +0 -0
  18. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/__init__.py +0 -0
  19. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/base.py +0 -0
  20. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/bigquery.py +0 -0
  21. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/braintrust.py +0 -0
  22. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  23. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/huggingface.py +0 -0
  24. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/langchain.py +0 -0
  25. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/langfuse.py +0 -0
  26. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/langsmith.py +0 -0
  27. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/openai_responses.py +0 -0
  28. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/trl.py +0 -0
  29. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/utils.py +0 -0
  30. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/weave.py +0 -0
  31. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/__init__.py +0 -0
  32. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/models.py +0 -0
  33. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/orchestrator.py +0 -0
  34. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resource_abc.py +0 -0
  35. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resource_pool.py +0 -0
  36. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/__init__.py +0 -0
  37. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  38. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  39. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  40. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  41. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  42. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/docker_resource.py +0 -0
  43. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  44. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  45. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/sql_resource.py +0 -0
  46. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/task_manager.py +0 -0
  47. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/tool_registry.py +0 -0
  48. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/auth.py +0 -0
  49. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/__init__.py +0 -0
  50. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  51. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  52. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_aime25.py +0 -0
  53. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  54. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  55. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  56. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  57. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  58. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/__init__.py +0 -0
  59. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  60. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/common.py +0 -0
  61. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/create_rft.py +0 -0
  62. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/deploy.py +0 -0
  63. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  64. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/logs.py +0 -0
  65. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/preview.py +0 -0
  66. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  67. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/common_utils.py +0 -0
  68. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/config.py +0 -0
  69. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/__init__.py +0 -0
  70. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  71. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  72. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  73. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  74. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/models.py +0 -0
  75. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/__init__.py +0 -0
  76. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  77. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  78. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  79. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  80. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/datasets/__init__.py +0 -0
  81. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/datasets/loader.py +0 -0
  82. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/directory_utils.py +0 -0
  83. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/evaluation.py +0 -0
  84. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/__init__.py +0 -0
  85. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/event_bus.py +0 -0
  86. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/logger.py +0 -0
  87. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  88. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  89. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/exceptions.py +0 -0
  90. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/execution/__init__.py +0 -0
  91. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/execution/pipeline.py +0 -0
  92. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/fireworks_rft.py +0 -0
  93. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/gcp_tools.py +0 -0
  94. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generation/cache.py +0 -0
  95. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generation/clients/base.py +0 -0
  96. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generation/clients.py +0 -0
  97. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generic_server.py +0 -0
  98. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/get_pep440_version.py +0 -0
  99. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/human_id/__init__.py +0 -0
  100. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/human_id/dictionary.py +0 -0
  101. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/__init__.py +0 -0
  102. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/deepeval.py +0 -0
  103. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/openeval.py +0 -0
  104. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/trl.py +0 -0
  105. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/__init__.py +0 -0
  106. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  107. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  108. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  109. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  110. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/init.py +0 -0
  111. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/rollout_context.py +0 -0
  112. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  113. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/util.py +0 -0
  114. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/logging_utils.py +0 -0
  115. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/__init__.py +0 -0
  116. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/adapter.py +0 -0
  117. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/client/__init__.py +0 -0
  118. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/client/connection.py +0 -0
  119. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/clients.py +0 -0
  120. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/__init__.py +0 -0
  121. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/base_policy.py +0 -0
  122. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/manager.py +0 -0
  123. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/policy.py +0 -0
  124. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/grid_renderer.py +0 -0
  125. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  126. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/mcpgym.py +0 -0
  127. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/process_manager.py +0 -0
  128. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/session/__init__.py +0 -0
  129. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/session/manager.py +0 -0
  130. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/simple_process_manager.py +0 -0
  131. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/simulation_server.py +0 -0
  132. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/__init__.py +0 -0
  133. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/config.py +0 -0
  134. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/main.py +0 -0
  135. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  136. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  137. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  138. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  139. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_env.py +0 -0
  140. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/__init__.py +0 -0
  141. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  142. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  143. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  144. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  145. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  146. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  147. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  148. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  149. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  150. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  151. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  152. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  153. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  154. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  155. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/models.py +0 -0
  156. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/packaging.py +0 -0
  157. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/platform_api.py +0 -0
  158. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/playback_policy.py +0 -0
  159. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/__init__.py +0 -0
  160. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  161. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/app.py +0 -0
  162. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  163. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  164. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  165. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/main.py +0 -0
  166. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/models.py +0 -0
  167. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  168. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/__init__.py +0 -0
  169. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  170. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  171. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  172. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  173. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  174. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  175. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  176. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  177. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  178. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/evaluation_test.py +0 -0
  179. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  180. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  181. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/exception_config.py +0 -0
  182. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/execution.py +0 -0
  183. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  184. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  185. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  186. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/parameterize.py +0 -0
  187. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/plugin.py +0 -0
  188. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  189. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/rollout_processor.py +0 -0
  190. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/store_experiment_link.py +0 -0
  191. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/store_results_url.py +0 -0
  192. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/tracing_utils.py +0 -0
  193. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/types.py +0 -0
  194. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/validate_signature.py +0 -0
  195. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/__init__.py +0 -0
  196. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  197. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  198. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  199. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  200. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  201. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  202. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  203. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/llm_judge.py +0 -0
  204. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  205. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  206. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  207. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  208. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/utils.py +0 -0
  209. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/resources.py +0 -0
  210. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/reward_function.py +0 -0
  211. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/__init__.py +0 -0
  212. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/accuracy.py +0 -0
  213. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/accuracy_length.py +0 -0
  214. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  215. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  216. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/apps_testing_util.py +0 -0
  217. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/bfcl_reward.py +0 -0
  218. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/code_execution.py +0 -0
  219. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/code_execution_utils.py +0 -0
  220. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/cpp_code.py +0 -0
  221. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  222. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/format.py +0 -0
  223. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/function_calling.py +0 -0
  224. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/json_schema.py +0 -0
  225. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/language_consistency.py +0 -0
  226. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/lean_prover.py +0 -0
  227. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/length.py +0 -0
  228. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  229. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/math.py +0 -0
  230. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  231. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/reasoning_steps.py +0 -0
  232. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/repetition.py +0 -0
  233. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/tag_count.py +0 -0
  234. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rl_processing.py +0 -0
  235. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/server.py +0 -0
  236. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/stats/__init__.py +0 -0
  237. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/stats/confidence_intervals.py +0 -0
  238. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/typed_interface.py +0 -0
  239. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/__init__.py +0 -0
  240. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/errors.py +0 -0
  241. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/remote_rollout_processor.py +0 -0
  242. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/types.py +0 -0
  243. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/__init__.py +0 -0
  244. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/batch_evaluation.py +0 -0
  245. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/batch_transformation.py +0 -0
  246. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/browser_utils.py +0 -0
  247. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/check_server_status.py +0 -0
  248. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/dataset_helpers.py +0 -0
  249. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  250. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/logs_models.py +0 -0
  251. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/logs_server.py +0 -0
  252. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/module_loader.py +0 -0
  253. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/packaging_utils.py +0 -0
  254. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/show_results_url.py +0 -0
  255. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/static_policy.py +0 -0
  256. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/subprocess_utils.py +0 -0
  257. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/vite_server.py +0 -0
  258. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/dependency_links.txt +0 -0
  259. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/entry_points.txt +0 -0
  260. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/requires.txt +0 -0
  261. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/top_level.txt +0 -0
  262. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/pyproject.toml +0 -0
  263. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/setup.cfg +0 -0
  264. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/setup.py +0 -0
  265. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_accuracy.py +0 -0
  266. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_accuracy_length.py +0 -0
  267. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_adapters_e2e.py +0 -0
  268. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_agent_orchestrator.py +0 -0
  269. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_agent_resources.py +0 -0
  270. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_auth.py +0 -0
  271. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_batch_evaluation.py +0 -0
  272. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli.py +0 -0
  273. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli_agent.py +0 -0
  274. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli_args.py +0 -0
  275. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli_create_rft_infer.py +0 -0
  276. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_code_execution.py +0 -0
  277. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_config.py +0 -0
  278. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_control_plane_separation.py +0 -0
  279. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cpp_code.py +0 -0
  280. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_data_driven_task_manager.py +0 -0
  281. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_deepcoder_reward.py +0 -0
  282. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_deepeval_integration.py +0 -0
  283. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_deploy_integration.py +0 -0
  284. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_directory_utils.py +0 -0
  285. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_e2b_integration.py +0 -0
  286. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_e2b_js_integration.py +0 -0
  287. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_edge_cases.py +0 -0
  288. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_ep_upload_e2e.py +0 -0
  289. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_eval_protocol_import.py +0 -0
  290. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation.py +0 -0
  291. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation_integration.py +0 -0
  292. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation_postprocess.py +0 -0
  293. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation_preview_integration.py +0 -0
  294. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_event_bus.py +0 -0
  295. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_event_bus_helper.py +0 -0
  296. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_examples_end_to_end.py +0 -0
  297. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_exceptions.py +0 -0
  298. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_fireworks_api.py +0 -0
  299. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_format.py +0 -0
  300. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_fractional_code.py +0 -0
  301. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_function_calling.py +0 -0
  302. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_gcp_tools.py +0 -0
  303. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_generic_server.py +0 -0
  304. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_human_id.py +0 -0
  305. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_integration.py +0 -0
  306. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_json_schema.py +0 -0
  307. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_kwargs_validation.py +0 -0
  308. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_language_consistency.py +0 -0
  309. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_lean_prover.py +0 -0
  310. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_lean_prover_runner.py +0 -0
  311. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_length.py +0 -0
  312. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_list_comparison_math_reward.py +0 -0
  313. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_logs_server.py +0 -0
  314. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_logs_server_simple.py +0 -0
  315. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_math.py +0 -0
  316. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_message_field_filtering.py +0 -0
  317. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_minimal.py +0 -0
  318. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_models.py +0 -0
  319. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_models_rl.py +0 -0
  320. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_multiple_choice_math_reward.py +0 -0
  321. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_n_variant_batch_integration.py +0 -0
  322. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_n_variant_integration.py +0 -0
  323. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_openai_compatibility.py +0 -0
  324. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_openeval_integration.py +0 -0
  325. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_packaging.py +0 -0
  326. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_parallel_rollouts.py +0 -0
  327. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_platform_api.py +0 -0
  328. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_quickstart_utils.py +0 -0
  329. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_readiness.py +0 -0
  330. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_reasoning_steps.py +0 -0
  331. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_repetition.py +0 -0
  332. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_repetition_debug.py +0 -0
  333. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_retry_mechanism.py +0 -0
  334. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_reward_function.py +0 -0
  335. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_reward_protocol_import.py +0 -0
  336. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_rl_processing.py +0 -0
  337. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_rollout_control_plane_integration.py +0 -0
  338. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_server.py +0 -0
  339. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_show_results_url.py +0 -0
  340. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_status_migration_changes.py +0 -0
  341. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_status_migration_integration.py +0 -0
  342. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_status_model.py +0 -0
  343. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_tag_count.py +0 -0
  344. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_tau_bench_airline_smoke.py +0 -0
  345. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_typed_interface.py +0 -0
  346. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_typed_interface_rl.py +0 -0
  347. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_upload_entrypoint.py +0 -0
  348. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_url_handling.py +0 -0
  349. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_vite_server.py +0 -0
  350. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/__init__.py +0 -0
  351. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/agent/__init__.py +0 -0
  352. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/agent/base.py +0 -0
  353. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/agent/llm_agent.py +0 -0
  354. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/__init__.py +0 -0
  355. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/api_config.py +0 -0
  356. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/data_model.py +0 -0
  357. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/simulation_service.py +0 -0
  358. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/cli.py +0 -0
  359. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/config.py +0 -0
  360. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/airline/policy.md +0 -0
  361. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/mock/policy.md +0 -0
  362. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  363. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/retail/policy.md +0 -0
  364. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  365. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  366. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  367. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  368. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  369. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  370. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  371. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/__init__.py +0 -0
  372. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/message.py +0 -0
  373. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/simulation.py +0 -0
  374. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/tasks.py +0 -0
  375. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/__init__.py +0 -0
  376. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/__init__.py +0 -0
  377. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/data_model.py +0 -0
  378. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/environment.py +0 -0
  379. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/tools.py +0 -0
  380. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/utils.py +0 -0
  381. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/__init__.py +0 -0
  382. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/data_model.py +0 -0
  383. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/environment.py +0 -0
  384. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/tools.py +0 -0
  385. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/utils.py +0 -0
  386. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/__init__.py +0 -0
  387. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/data_model.py +0 -0
  388. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/environment.py +0 -0
  389. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/tools.py +0 -0
  390. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/utils.py +0 -0
  391. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/__init__.py +0 -0
  392. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/data_model.py +0 -0
  393. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/environment.py +0 -0
  394. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  395. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  396. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  397. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  398. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  399. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  400. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  401. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  402. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tools.py +0 -0
  403. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  404. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  405. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/utils.py +0 -0
  406. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/__init__.py +0 -0
  407. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/db.py +0 -0
  408. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/environment.py +0 -0
  409. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/server.py +0 -0
  410. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/tool.py +0 -0
  411. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/toolkit.py +0 -0
  412. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  413. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/__init__.py +0 -0
  414. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator.py +0 -0
  415. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  416. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  417. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  418. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  419. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  420. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/metrics/__init__.py +0 -0
  421. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/metrics/agent_metrics.py +0 -0
  422. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  423. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/__init__.py +0 -0
  424. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  425. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  426. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/utils.py +0 -0
  427. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/registry.py +0 -0
  428. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/run.py +0 -0
  429. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/__init__.py +0 -0
  430. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/check_data.py +0 -0
  431. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  432. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/start_servers.py +0 -0
  433. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/view_simulations.py +0 -0
  434. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/user/__init__.py +0 -0
  435. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/user/base.py +0 -0
  436. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/user/user_simulator.py +0 -0
  437. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/__init__.py +0 -0
  438. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/display.py +0 -0
  439. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/io_utils.py +0 -0
  440. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/llm_utils.py +0 -0
  441. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/pydantic_utils.py +0 -0
  442. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/utils.py +0 -0
  443. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/versioneer.py +0 -0
  444. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  445. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
  446. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
  447. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
  448. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  449. {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.84.dev1
3
+ Version: 0.2.84.dev3
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-11-10T16:26:12-0800",
11
+ "date": "2025-11-10T18:00:39-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "66542cf6410b379a35d2aec3de041fb37e18b0e2",
15
- "version": "0.2.84.dev.1"
14
+ "full-revisionid": "e7615d7ec75524b19ed38241d1c6165cf32dd79f",
15
+ "version": "0.2.84.dev.3"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -427,6 +427,27 @@ def parse_args(args=None):
427
427
  rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
428
428
  rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
429
429
 
430
+ # Local test command
431
+ local_test_parser = subparsers.add_parser(
432
+ "local-test",
433
+ help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
434
+ )
435
+ local_test_parser.add_argument(
436
+ "--entry",
437
+ help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
438
+ )
439
+ local_test_parser.add_argument(
440
+ "--ignore-docker",
441
+ action="store_true",
442
+ help="Ignore Dockerfile even if present; run pytest on host",
443
+ )
444
+ local_test_parser.add_argument(
445
+ "--yes",
446
+ "-y",
447
+ action="store_true",
448
+ help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
449
+ )
450
+
430
451
  # Run command (for Hydra-based evaluations)
431
452
  # This subparser intentionally defines no arguments itself.
432
453
  # All arguments after 'run' will be passed to Hydra by parse_known_args.
@@ -559,6 +580,10 @@ def main():
559
580
  return create_rft_command(args)
560
581
  print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
561
582
  return 1
583
+ elif args.command == "local-test":
584
+ from .cli_commands.local_test import local_test_command
585
+
586
+ return local_test_command(args)
562
587
  elif args.command == "run":
563
588
  # For the 'run' command, Hydra takes over argument parsing.
564
589
 
@@ -0,0 +1,151 @@
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ from typing import List
6
+
7
+ from .upload import _discover_tests, _prompt_select
8
+
9
+
10
+ def _find_dockerfiles(root: str) -> List[str]:
11
+ skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
12
+ dockerfiles: List[str] = []
13
+ for dirpath, dirnames, filenames in os.walk(root):
14
+ dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
15
+ for name in filenames:
16
+ if name == "Dockerfile":
17
+ dockerfiles.append(os.path.join(dirpath, name))
18
+ return dockerfiles
19
+
20
+
21
+ def _run_pytest_host(pytest_target: str) -> int:
22
+ print(f"Running locally: pytest {pytest_target} -vs")
23
+ proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"])
24
+ return proc.returncode
25
+
26
+
27
+ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
28
+ context_dir = os.path.dirname(dockerfile_path)
29
+ print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
30
+ try:
31
+ proc = subprocess.run(
32
+ ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
33
+ stdout=subprocess.PIPE,
34
+ stderr=subprocess.STDOUT,
35
+ text=True,
36
+ )
37
+ print(proc.stdout)
38
+ return proc.returncode == 0
39
+ except FileNotFoundError:
40
+ print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
41
+ return False
42
+
43
+
44
+ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
45
+ workdir = "/workspace"
46
+ # Mount read-only is safer; but tests may write artifacts. Use read-write.
47
+ cmd = [
48
+ "docker",
49
+ "run",
50
+ "--rm",
51
+ "-v",
52
+ f"{project_root}:{workdir}",
53
+ "-e",
54
+ f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol",
55
+ "-w",
56
+ workdir,
57
+ ]
58
+ # Try to match host user to avoid permission problems on mounted volume
59
+ try:
60
+ uid = os.getuid() # type: ignore[attr-defined]
61
+ gid = os.getgid() # type: ignore[attr-defined]
62
+ cmd += ["--user", f"{uid}:{gid}"]
63
+ except Exception:
64
+ pass
65
+ cmd += [image_tag, "pytest", pytest_target, "-vs"]
66
+ print("Running in Docker:", " ".join(cmd))
67
+ try:
68
+ proc = subprocess.run(cmd)
69
+ return proc.returncode
70
+ except FileNotFoundError:
71
+ print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
72
+ return 1
73
+
74
+
75
+ def local_test_command(args: argparse.Namespace) -> int:
76
+ project_root = os.getcwd()
77
+
78
+ # Selection and pytest target resolution
79
+ pytest_target: str = ""
80
+ entry = getattr(args, "entry", None)
81
+ if entry:
82
+ if "::" in entry:
83
+ file_part = entry.split("::", 1)[0]
84
+ file_path = (
85
+ file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
86
+ )
87
+ pytest_target = entry
88
+ else:
89
+ file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
90
+ # Use path relative to project_root when possible
91
+ try:
92
+ rel = os.path.relpath(file_path, project_root)
93
+ except Exception:
94
+ rel = file_path
95
+ pytest_target = rel
96
+ else:
97
+ tests = _discover_tests(project_root)
98
+ if not tests:
99
+ print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
100
+ return 1
101
+ non_interactive = bool(getattr(args, "yes", False))
102
+ selected = _prompt_select(tests, non_interactive=non_interactive)
103
+ if not selected:
104
+ print("No tests selected.")
105
+ return 1
106
+ if len(selected) != 1:
107
+ print("Error: Please select exactly one evaluation test for 'local-test'.")
108
+ return 1
109
+ chosen = selected[0]
110
+ abs_path = os.path.abspath(chosen.file_path)
111
+ try:
112
+ rel = os.path.relpath(abs_path, project_root)
113
+ except Exception:
114
+ rel = abs_path
115
+ pytest_target = rel
116
+
117
+ ignore_docker = bool(getattr(args, "ignore_docker", False))
118
+ if ignore_docker:
119
+ if not pytest_target:
120
+ print("Error: Failed to resolve a pytest target to run.")
121
+ return 1
122
+ return _run_pytest_host(pytest_target)
123
+
124
+ dockerfiles = _find_dockerfiles(project_root)
125
+ if len(dockerfiles) > 1:
126
+ print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
127
+ for df in dockerfiles:
128
+ print(f" - {df}")
129
+ print("Hint: use --ignore-docker to bypass Docker.")
130
+ return 1
131
+ if len(dockerfiles) == 1:
132
+ # Ensure shared logs directory exists on host so container writes are visible to host ep logs
133
+ try:
134
+ os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True)
135
+ except Exception:
136
+ pass
137
+ image_tag = "ep-evaluator:local"
138
+ ok = _build_docker_image(dockerfiles[0], image_tag)
139
+ if not ok:
140
+ print("Docker build failed. See logs above.")
141
+ return 1
142
+ if not pytest_target:
143
+ print("Error: Failed to resolve a pytest target to run.")
144
+ return 1
145
+ return _run_pytest_in_docker(project_root, image_tag, pytest_target)
146
+
147
+ # No Dockerfile: run on host
148
+ if not pytest_target:
149
+ print("Error: Failed to resolve a pytest target to run.")
150
+ return 1
151
+ return _run_pytest_host(pytest_target)
@@ -437,7 +437,7 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe
437
437
  # Check if only one test - auto-select it
438
438
  if len(tests) == 1:
439
439
  print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}")
440
- confirm = questionary.confirm("Upload this test?", default=True, style=custom_style).ask()
440
+ confirm = questionary.confirm("Select this test?", default=True, style=custom_style).ask()
441
441
  if confirm:
442
442
  return tests
443
443
  else:
@@ -500,7 +500,7 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]
500
500
 
501
501
  print("=" * 80)
502
502
  try:
503
- choice = input("Enter the number to upload: ").strip()
503
+ choice = input("Enter the number to select: ").strip()
504
504
  except KeyboardInterrupt:
505
505
  print("\n\nUpload cancelled.")
506
506
  return []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.84.dev1
3
+ Version: 0.2.84.dev3
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -91,6 +91,7 @@ eval_protocol/cli_commands/common.py
91
91
  eval_protocol/cli_commands/create_rft.py
92
92
  eval_protocol/cli_commands/deploy.py
93
93
  eval_protocol/cli_commands/deploy_mcp.py
94
+ eval_protocol/cli_commands/local_test.py
94
95
  eval_protocol/cli_commands/logs.py
95
96
  eval_protocol/cli_commands/preview.py
96
97
  eval_protocol/cli_commands/run_eval_cmd.py
@@ -277,6 +278,7 @@ tests/test_cli.py
277
278
  tests/test_cli_agent.py
278
279
  tests/test_cli_args.py
279
280
  tests/test_cli_create_rft_infer.py
281
+ tests/test_cli_local_test.py
280
282
  tests/test_code_execution.py
281
283
  tests/test_config.py
282
284
  tests/test_control_plane_separation.py
@@ -0,0 +1,145 @@
1
+ import os
2
+ from types import SimpleNamespace
3
+
4
+ import pytest
5
+
6
+
7
+ def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch):
8
+ project = tmp_path / "proj"
9
+ project.mkdir()
10
+ monkeypatch.chdir(project)
11
+
12
+ # Create a dummy test file
13
+ test_file = project / "metric" / "test_one.py"
14
+ test_file.parent.mkdir(parents=True, exist_ok=True)
15
+ test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
16
+
17
+ # Import module under test
18
+ from eval_protocol.cli_commands import local_test as lt
19
+
20
+ # Avoid Docker path
21
+ monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
22
+
23
+ captured = {"target": ""}
24
+
25
+ def _fake_host(target: str) -> int:
26
+ captured["target"] = target
27
+ return 0
28
+
29
+ monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
30
+
31
+ args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
32
+ rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
33
+ assert rc == 0
34
+ # Expect relative path target
35
+ assert captured["target"] == os.path.relpath(str(test_file), str(project))
36
+
37
+
38
+ def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch):
39
+ project = tmp_path / "proj"
40
+ project.mkdir()
41
+ monkeypatch.chdir(project)
42
+
43
+ test_file = project / "metric" / "test_two.py"
44
+ test_file.parent.mkdir(parents=True, exist_ok=True)
45
+ test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
46
+
47
+ from eval_protocol.cli_commands import local_test as lt
48
+
49
+ # Pretend we have Dockerfile(s), but ignore_docker=True should skip
50
+ monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
51
+
52
+ called = {"host": False}
53
+
54
+ def _fake_host(target: str) -> int:
55
+ called["host"] = True
56
+ return 0
57
+
58
+ monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
59
+
60
+ args = SimpleNamespace(entry=str(test_file), ignore_docker=True, yes=True)
61
+ rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
62
+ assert rc == 0
63
+ assert called["host"] is True
64
+
65
+
66
+ def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch):
67
+ project = tmp_path / "proj"
68
+ project.mkdir()
69
+ monkeypatch.chdir(project)
70
+
71
+ test_file = project / "metric" / "test_three.py"
72
+ test_file.parent.mkdir(parents=True, exist_ok=True)
73
+ test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
74
+
75
+ from eval_protocol.cli_commands import local_test as lt
76
+
77
+ monkeypatch.setattr(
78
+ lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile"), str(project / "another" / "Dockerfile")]
79
+ )
80
+
81
+ args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
82
+ rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
83
+ assert rc == 1
84
+
85
+
86
+ def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
87
+ project = tmp_path / "proj"
88
+ project.mkdir()
89
+ monkeypatch.chdir(project)
90
+
91
+ test_file = project / "metric" / "test_four.py"
92
+ test_file.parent.mkdir(parents=True, exist_ok=True)
93
+ test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
94
+
95
+ from eval_protocol.cli_commands import local_test as lt
96
+
97
+ monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
98
+ monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True)
99
+
100
+ captured = {"target": "", "image": ""}
101
+
102
+ def _fake_run_docker(root: str, image_tag: str, pytest_target: str) -> int:
103
+ captured["target"] = pytest_target
104
+ captured["image"] = image_tag
105
+ return 0
106
+
107
+ monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
108
+
109
+ args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
110
+ rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
111
+ assert rc == 0
112
+ assert captured["image"] == "ep-evaluator:local"
113
+ assert captured["target"] == os.path.relpath(str(test_file), str(project))
114
+
115
+
116
+ def test_local_test_selector_single_test(tmp_path, monkeypatch):
117
+ project = tmp_path / "proj"
118
+ project.mkdir()
119
+ monkeypatch.chdir(project)
120
+
121
+ test_file = project / "metric" / "test_sel.py"
122
+ test_file.parent.mkdir(parents=True, exist_ok=True)
123
+ test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
124
+
125
+ from eval_protocol.cli_commands import local_test as lt
126
+ from eval_protocol.cli_commands import upload as up
127
+
128
+ # No entry; force discover + selector
129
+ disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
130
+ monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
131
+ monkeypatch.setattr(up, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
132
+ monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
133
+
134
+ called = {"host": False}
135
+
136
+ def _fake_host(target: str) -> int:
137
+ called["host"] = True
138
+ return 0
139
+
140
+ monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
141
+
142
+ args = SimpleNamespace(entry=None, ignore_docker=False, yes=True)
143
+ rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
144
+ assert rc == 0
145
+ assert called["host"] is True