eval-protocol 0.2.72__tar.gz → 0.2.73__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (447)
  1. {eval_protocol-0.2.72/eval_protocol.egg-info → eval_protocol-0.2.73}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli.py +2 -0
  4. eval_protocol-0.2.73/eval_protocol/cli_commands/create_rft.py +492 -0
  5. eval_protocol-0.2.73/eval_protocol/data_loader/__init__.py +5 -0
  6. eval_protocol-0.2.73/eval_protocol/data_loader/jsonl_data_loader.py +42 -0
  7. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/evaluation.py +41 -1
  8. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/fireworks_rft.py +12 -4
  9. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/evaluation_test.py +41 -12
  10. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/evaluation_test_postprocess.py +2 -1
  11. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/exception_config.py +1 -0
  12. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/remote_rollout_processor.py +1 -1
  13. eval_protocol-0.2.73/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +223 -0
  14. eval_protocol-0.2.73/eval_protocol/quickstart/svg_agent/evaluator/utils.py +523 -0
  15. {eval_protocol-0.2.72 → eval_protocol-0.2.73/eval_protocol.egg-info}/PKG-INFO +1 -1
  16. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/SOURCES.txt +3 -0
  17. eval_protocol-0.2.72/eval_protocol/cli_commands/create_rft.py +0 -254
  18. eval_protocol-0.2.72/eval_protocol/data_loader/__init__.py +0 -4
  19. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/LICENSE +0 -0
  20. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/README.md +0 -0
  21. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/__init__.py +0 -0
  22. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/normalize_sandbox_fusion.py +0 -0
  23. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/utils/__init__.py +0 -0
  24. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/utils/generate_api_key.py +0 -0
  25. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/development/utils/subprocess_manager.py +0 -0
  26. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/__init__.py +0 -0
  27. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/__main__.py +0 -0
  28. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/__init__.py +0 -0
  29. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/base.py +0 -0
  30. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/bigquery.py +0 -0
  31. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/braintrust.py +0 -0
  32. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/fireworks_tracing.py +0 -0
  33. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/huggingface.py +0 -0
  34. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/langchain.py +0 -0
  35. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/langfuse.py +0 -0
  36. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/langsmith.py +0 -0
  37. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/openai_responses.py +0 -0
  38. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/trl.py +0 -0
  39. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/utils.py +0 -0
  40. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/adapters/weave.py +0 -0
  41. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/__init__.py +0 -0
  42. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/models.py +0 -0
  43. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/orchestrator.py +0 -0
  44. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resource_abc.py +0 -0
  45. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resource_pool.py +0 -0
  46. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/__init__.py +0 -0
  47. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  48. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  49. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  50. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  51. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  52. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/docker_resource.py +0 -0
  53. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  54. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  55. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/resources/sql_resource.py +0 -0
  56. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/task_manager.py +0 -0
  57. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/agent/tool_registry.py +0 -0
  58. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/auth.py +0 -0
  59. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/__init__.py +0 -0
  60. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  61. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  62. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_aime25.py +0 -0
  63. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  64. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  65. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  66. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  67. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  68. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/__init__.py +0 -0
  69. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  70. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/common.py +0 -0
  71. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/deploy.py +0 -0
  72. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  73. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/logs.py +0 -0
  74. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/preview.py +0 -0
  75. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  76. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/cli_commands/upload.py +0 -0
  77. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/common_utils.py +0 -0
  78. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/config.py +0 -0
  79. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  80. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  81. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  82. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/data_loader/models.py +0 -0
  83. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/__init__.py +0 -0
  84. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  85. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  86. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  87. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  88. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/datasets/__init__.py +0 -0
  89. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/datasets/loader.py +0 -0
  90. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/directory_utils.py +0 -0
  91. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/__init__.py +0 -0
  92. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/event_bus.py +0 -0
  93. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/logger.py +0 -0
  94. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  95. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  96. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/exceptions.py +0 -0
  97. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/execution/__init__.py +0 -0
  98. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/execution/pipeline.py +0 -0
  99. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/gcp_tools.py +0 -0
  100. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generation/cache.py +0 -0
  101. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generation/clients/base.py +0 -0
  102. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generation/clients.py +0 -0
  103. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/generic_server.py +0 -0
  104. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/get_pep440_version.py +0 -0
  105. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/human_id/__init__.py +0 -0
  106. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/human_id/dictionary.py +0 -0
  107. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/__init__.py +0 -0
  108. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/deepeval.py +0 -0
  109. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/openeval.py +0 -0
  110. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/integrations/trl.py +0 -0
  111. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/__init__.py +0 -0
  112. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  113. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  114. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  115. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  116. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/init.py +0 -0
  117. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/rollout_context.py +0 -0
  118. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  119. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/log_utils/util.py +0 -0
  120. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/logging_utils.py +0 -0
  121. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/__init__.py +0 -0
  122. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/adapter.py +0 -0
  123. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/client/__init__.py +0 -0
  124. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/client/connection.py +0 -0
  125. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/clients.py +0 -0
  126. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/__init__.py +0 -0
  127. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/base_policy.py +0 -0
  128. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/manager.py +0 -0
  129. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/execution/policy.py +0 -0
  130. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/grid_renderer.py +0 -0
  131. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  132. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/mcpgym.py +0 -0
  133. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/process_manager.py +0 -0
  134. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/session/__init__.py +0 -0
  135. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/session/manager.py +0 -0
  136. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/simple_process_manager.py +0 -0
  137. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp/simulation_server.py +0 -0
  138. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/__init__.py +0 -0
  139. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/config.py +0 -0
  140. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/main.py +0 -0
  141. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  142. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  143. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  144. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  145. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_env.py +0 -0
  146. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/__init__.py +0 -0
  147. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  148. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  149. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  150. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  151. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  152. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  153. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  154. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  155. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  156. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  157. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  158. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  159. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  160. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  161. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/models.py +0 -0
  162. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/packaging.py +0 -0
  163. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/platform_api.py +0 -0
  164. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/playback_policy.py +0 -0
  165. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/__init__.py +0 -0
  166. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  167. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/app.py +0 -0
  168. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  169. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  170. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  171. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/main.py +0 -0
  172. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/models.py +0 -0
  173. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  174. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/__init__.py +0 -0
  175. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  176. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  177. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  178. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  179. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  180. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  181. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  182. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  183. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  184. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  185. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/execution.py +0 -0
  186. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  187. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  188. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  189. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/parameterize.py +0 -0
  190. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/plugin.py +0 -0
  191. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/rollout_processor.py +0 -0
  192. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/store_experiment_link.py +0 -0
  193. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/store_results_url.py +0 -0
  194. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/tracing_utils.py +0 -0
  195. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/types.py +0 -0
  196. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/pytest/validate_signature.py +0 -0
  197. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/__init__.py +0 -0
  198. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  199. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  200. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  201. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  202. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  203. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  204. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  205. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/llm_judge.py +0 -0
  206. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  207. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  208. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/quickstart/utils.py +0 -0
  209. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/resources.py +0 -0
  210. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/reward_function.py +0 -0
  211. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/__init__.py +0 -0
  212. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/accuracy.py +0 -0
  213. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/accuracy_length.py +0 -0
  214. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  215. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  216. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/apps_testing_util.py +0 -0
  217. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/bfcl_reward.py +0 -0
  218. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/code_execution.py +0 -0
  219. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/code_execution_utils.py +0 -0
  220. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/cpp_code.py +0 -0
  221. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  222. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/format.py +0 -0
  223. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/function_calling.py +0 -0
  224. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/json_schema.py +0 -0
  225. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/language_consistency.py +0 -0
  226. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/lean_prover.py +0 -0
  227. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/length.py +0 -0
  228. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  229. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/math.py +0 -0
  230. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  231. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/reasoning_steps.py +0 -0
  232. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/repetition.py +0 -0
  233. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rewards/tag_count.py +0 -0
  234. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/rl_processing.py +0 -0
  235. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/server.py +0 -0
  236. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/stats/__init__.py +0 -0
  237. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/stats/confidence_intervals.py +0 -0
  238. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/typed_interface.py +0 -0
  239. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/__init__.py +0 -0
  240. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/errors.py +0 -0
  241. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/remote_rollout_processor.py +0 -0
  242. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/types/types.py +0 -0
  243. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/__init__.py +0 -0
  244. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/batch_evaluation.py +0 -0
  245. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/batch_transformation.py +0 -0
  246. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/browser_utils.py +0 -0
  247. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/check_server_status.py +0 -0
  248. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/dataset_helpers.py +0 -0
  249. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  250. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/logs_models.py +0 -0
  251. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/logs_server.py +0 -0
  252. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/module_loader.py +0 -0
  253. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/packaging_utils.py +0 -0
  254. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/show_results_url.py +0 -0
  255. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/static_policy.py +0 -0
  256. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/subprocess_utils.py +0 -0
  257. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol/utils/vite_server.py +0 -0
  258. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/dependency_links.txt +0 -0
  259. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/entry_points.txt +0 -0
  260. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/requires.txt +0 -0
  261. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/eval_protocol.egg-info/top_level.txt +0 -0
  262. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/pyproject.toml +0 -0
  263. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/setup.cfg +0 -0
  264. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/setup.py +0 -0
  265. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_accuracy.py +0 -0
  266. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_accuracy_length.py +0 -0
  267. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_adapters_e2e.py +0 -0
  268. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_agent_orchestrator.py +0 -0
  269. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_agent_resources.py +0 -0
  270. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_auth.py +0 -0
  271. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_batch_evaluation.py +0 -0
  272. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cli.py +0 -0
  273. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cli_agent.py +0 -0
  274. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cli_args.py +0 -0
  275. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_code_execution.py +0 -0
  276. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_config.py +0 -0
  277. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_control_plane_separation.py +0 -0
  278. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_cpp_code.py +0 -0
  279. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_data_driven_task_manager.py +0 -0
  280. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_deepcoder_reward.py +0 -0
  281. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_deepeval_integration.py +0 -0
  282. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_deploy_integration.py +0 -0
  283. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_directory_utils.py +0 -0
  284. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_e2b_integration.py +0 -0
  285. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_e2b_js_integration.py +0 -0
  286. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_edge_cases.py +0 -0
  287. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_ep_upload_e2e.py +0 -0
  288. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_eval_protocol_import.py +0 -0
  289. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation.py +0 -0
  290. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation_integration.py +0 -0
  291. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation_postprocess.py +0 -0
  292. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_evaluation_preview_integration.py +0 -0
  293. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_event_bus.py +0 -0
  294. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_event_bus_helper.py +0 -0
  295. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_examples_end_to_end.py +0 -0
  296. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_exceptions.py +0 -0
  297. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_fireworks_api.py +0 -0
  298. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_format.py +0 -0
  299. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_fractional_code.py +0 -0
  300. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_function_calling.py +0 -0
  301. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_gcp_tools.py +0 -0
  302. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_generic_server.py +0 -0
  303. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_human_id.py +0 -0
  304. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_integration.py +0 -0
  305. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_json_schema.py +0 -0
  306. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_kwargs_validation.py +0 -0
  307. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_language_consistency.py +0 -0
  308. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_lean_prover.py +0 -0
  309. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_lean_prover_runner.py +0 -0
  310. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_length.py +0 -0
  311. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_list_comparison_math_reward.py +0 -0
  312. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_logs_server.py +0 -0
  313. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_logs_server_simple.py +0 -0
  314. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_math.py +0 -0
  315. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_minimal.py +0 -0
  316. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_models.py +0 -0
  317. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_models_rl.py +0 -0
  318. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_multiple_choice_math_reward.py +0 -0
  319. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_n_variant_batch_integration.py +0 -0
  320. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_n_variant_integration.py +0 -0
  321. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_openai_compatibility.py +0 -0
  322. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_openeval_integration.py +0 -0
  323. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_packaging.py +0 -0
  324. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_parallel_rollouts.py +0 -0
  325. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_platform_api.py +0 -0
  326. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_quickstart_utils.py +0 -0
  327. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_readiness.py +0 -0
  328. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_reasoning_steps.py +0 -0
  329. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_repetition.py +0 -0
  330. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_repetition_debug.py +0 -0
  331. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_retry_mechanism.py +0 -0
  332. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_reward_function.py +0 -0
  333. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_reward_protocol_import.py +0 -0
  334. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_rl_processing.py +0 -0
  335. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_rollout_control_plane_integration.py +0 -0
  336. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_server.py +0 -0
  337. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_show_results_url.py +0 -0
  338. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_status_migration_changes.py +0 -0
  339. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_status_migration_integration.py +0 -0
  340. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_status_model.py +0 -0
  341. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_tag_count.py +0 -0
  342. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_tau_bench_airline_smoke.py +0 -0
  343. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_typed_interface.py +0 -0
  344. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_typed_interface_rl.py +0 -0
  345. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_upload_entrypoint.py +0 -0
  346. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_url_handling.py +0 -0
  347. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/tests/test_vite_server.py +0 -0
  348. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/__init__.py +0 -0
  349. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/agent/__init__.py +0 -0
  350. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/agent/base.py +0 -0
  351. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/agent/llm_agent.py +0 -0
  352. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/__init__.py +0 -0
  353. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/api_config.py +0 -0
  354. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/data_model.py +0 -0
  355. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/api_service/simulation_service.py +0 -0
  356. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/cli.py +0 -0
  357. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/config.py +0 -0
  358. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/airline/policy.md +0 -0
  359. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/mock/policy.md +0 -0
  360. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  361. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/retail/policy.md +0 -0
  362. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  363. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  364. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  365. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  366. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  367. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  368. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  369. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/__init__.py +0 -0
  370. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/message.py +0 -0
  371. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/simulation.py +0 -0
  372. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/data_model/tasks.py +0 -0
  373. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/__init__.py +0 -0
  374. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/__init__.py +0 -0
  375. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/data_model.py +0 -0
  376. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/environment.py +0 -0
  377. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/tools.py +0 -0
  378. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/airline/utils.py +0 -0
  379. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/__init__.py +0 -0
  380. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/data_model.py +0 -0
  381. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/environment.py +0 -0
  382. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/tools.py +0 -0
  383. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/mock/utils.py +0 -0
  384. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/__init__.py +0 -0
  385. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/data_model.py +0 -0
  386. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/environment.py +0 -0
  387. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/tools.py +0 -0
  388. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/retail/utils.py +0 -0
  389. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/__init__.py +0 -0
  390. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/data_model.py +0 -0
  391. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/environment.py +0 -0
  392. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  393. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  394. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  395. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  396. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  397. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  398. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  399. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  400. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/tools.py +0 -0
  401. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  402. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  403. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/domains/telecom/utils.py +0 -0
  404. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/__init__.py +0 -0
  405. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/db.py +0 -0
  406. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/environment.py +0 -0
  407. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/server.py +0 -0
  408. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/tool.py +0 -0
  409. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/toolkit.py +0 -0
  410. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  411. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/__init__.py +0 -0
  412. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator.py +0 -0
  413. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  414. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  415. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  416. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  417. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  418. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/metrics/__init__.py +0 -0
  419. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/metrics/agent_metrics.py +0 -0
  420. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  421. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/__init__.py +0 -0
  422. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  423. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  424. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/orchestrator/utils.py +0 -0
  425. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/registry.py +0 -0
  426. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/run.py +0 -0
  427. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/__init__.py +0 -0
  428. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/check_data.py +0 -0
  429. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  430. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/start_servers.py +0 -0
  431. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/scripts/view_simulations.py +0 -0
  432. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/user/__init__.py +0 -0
  433. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/user/base.py +0 -0
  434. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/user/user_simulator.py +0 -0
  435. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/__init__.py +0 -0
  436. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/display.py +0 -0
  437. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/io_utils.py +0 -0
  438. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/llm_utils.py +0 -0
  439. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/pydantic_utils.py +0 -0
  440. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vendor/tau2/utils/utils.py +0 -0
  441. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/versioneer.py +0 -0
  442. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  443. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
  444. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
  445. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
  446. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  447. {eval_protocol-0.2.72 → eval_protocol-0.2.73}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.72
3
+ Version: 0.2.73
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-10-30T03:43:20-0700",
11
+ "date": "2025-11-01T13:56:18-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "a71074ec111c9321e5cb2e8366dbb56504f2fc3a",
15
- "version": "0.2.72"
14
+ "full-revisionid": "d8477be9df5508ec4c7ef53cb7a5e8cb758cec3d",
15
+ "version": "0.2.73"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -402,6 +402,8 @@ def parse_args(args=None):
402
402
  rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id")
403
403
  rft_parser.add_argument("--eval-auto-carveout", dest="eval_auto_carveout", action="store_true", default=True)
404
404
  rft_parser.add_argument("--no-eval-auto-carveout", dest="eval_auto_carveout", action="store_false")
405
+ # Rollout chunking
406
+ rft_parser.add_argument("--chunk-size", type=int, help="Data chunk size for rollout batching")
405
407
  # Inference params
406
408
  rft_parser.add_argument("--temperature", type=float)
407
409
  rft_parser.add_argument("--top-p", type=float)
@@ -0,0 +1,492 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import time
5
+ import argparse
6
+ from typing import Any, Dict, Optional
7
+
8
+ from ..auth import (
9
+ get_fireworks_account_id,
10
+ get_fireworks_api_base,
11
+ get_fireworks_api_key,
12
+ verify_api_key_and_get_account_id,
13
+ )
14
+ from ..fireworks_rft import (
15
+ _map_api_host_to_app_host,
16
+ create_dataset_from_jsonl,
17
+ create_reinforcement_fine_tuning_job,
18
+ )
19
+ from .upload import _discover_tests, _normalize_evaluator_id, _resolve_entry_to_qual_and_source
20
+
21
+
22
def _ensure_account_id() -> Optional[str]:
    """Return the Fireworks account id, resolving it from the API key if needed.

    When no account id is configured but an API key is, the key is verified
    against the API base to look the account id up. A successfully resolved id
    is also stored in ``os.environ["FIREWORKS_ACCOUNT_ID"]``.

    Returns:
        The configured or resolved account id, or the (falsy) configured value
        when nothing could be resolved.
    """
    configured_id = get_fireworks_account_id()
    key = get_fireworks_api_key()
    # Nothing to resolve: either the id is already known or there is no key to resolve it with.
    if configured_id or not key:
        return configured_id

    looked_up = verify_api_key_and_get_account_id(api_key=key, api_base=get_fireworks_api_base())
    if not looked_up:
        return configured_id

    os.environ["FIREWORKS_ACCOUNT_ID"] = looked_up
    return looked_up
31
+
32
+
33
+ def _extract_terminal_segment(resource_name: str) -> str:
34
+ """Return the last path segment if a fully-qualified resource name is provided."""
35
+ try:
36
+ return resource_name.strip("/").split("/")[-1]
37
+ except Exception:
38
+ return resource_name
39
+
40
+
41
def _print_links(evaluator_id: str, dataset_id: str, job_name: Optional[str]) -> None:
    """Print dashboard URLs for the evaluator, dataset and RFT job.

    Args:
        evaluator_id: Evaluator id or fully-qualified resource name; only the
            last path segment is used in the link.
        dataset_id: Dataset id; the dataset link is skipped when falsy.
        job_name: Job resource name; the job link is skipped when falsy, and
            only its last path segment is used.
    """
    dashboard_root = _map_api_host_to_app_host(get_fireworks_api_base())
    print("\n📊 Dashboard Links:")

    slug = _extract_terminal_segment(evaluator_id)
    print(f" Evaluator: {dashboard_root}/dashboard/evaluators/{slug}")

    if dataset_id:
        print(f" Dataset: {dashboard_root}/dashboard/datasets/{dataset_id}")

    if not job_name:
        return
    # job_name likely looks like accounts/{account}/reinforcementFineTuningJobs/{id}
    try:
        job_slug = job_name.strip().split("/")[-1]
        print(f" RFT Job: {dashboard_root}/dashboard/fine-tuning/reinforcement/{job_slug}")
    except Exception:
        # Links are informational only; never fail the command over them.
        pass
56
+
57
+
58
+ def _auto_find_jsonl(cwd: str) -> Optional[str]:
59
+ """Find a reasonable JSONL dataset file in the current project.
60
+
61
+ Priority order:
62
+ - dataset.jsonl in cwd
63
+ - data/dataset.jsonl
64
+ - first *.jsonl under cwd (depth-first, skipping common vendor/venv/build dirs)
65
+ Returns a RELATIVE path from cwd if possible.
66
+ """
67
+ # Direct candidates
68
+ direct_candidates = [
69
+ os.path.join(cwd, "dataset.jsonl"),
70
+ os.path.join(cwd, "data", "dataset.jsonl"),
71
+ ]
72
+ for p in direct_candidates:
73
+ if os.path.isfile(p):
74
+ try:
75
+ return os.path.relpath(p, cwd)
76
+ except Exception:
77
+ return p
78
+
79
+ # Walk and find any .jsonl
80
+ skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
81
+ for dirpath, dirnames, filenames in os.walk(cwd):
82
+ # prune
83
+ dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
84
+ for name in sorted(filenames):
85
+ if name.endswith(".jsonl"):
86
+ candidate = os.path.join(dirpath, name)
87
+ try:
88
+ return os.path.relpath(candidate, cwd)
89
+ except Exception:
90
+ return candidate
91
+ return None
92
+
93
+
94
+ def _extract_jsonl_from_dataloader(test_file_path: str, test_func_name: str) -> Optional[str]:
95
+ """Import the test module and extract a JSONL path from data_loaders param if present.
96
+
97
+ Looks for a pytest.mark.parametrize with argnames containing 'data_loaders' and attempts to
98
+ find an object with attribute 'jsonl_path'. If a relative path is found, it is resolved
99
+ relative to the directory of the test file.
100
+ """
101
+ try:
102
+ import importlib.util
103
+ from pathlib import Path
104
+
105
+ spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
106
+ if not spec or not spec.loader:
107
+ return None
108
+ module = importlib.util.module_from_spec(spec)
109
+ sys.modules[spec.name] = module
110
+ spec.loader.exec_module(module) # type: ignore[attr-defined]
111
+ if not hasattr(module, test_func_name):
112
+ return None
113
+ wrapper = getattr(module, test_func_name)
114
+ marks = getattr(wrapper, "pytestmark", [])
115
+ for m in marks:
116
+ if getattr(m, "name", "") == "parametrize":
117
+ kwargs = getattr(m, "kwargs", {})
118
+ argnames = kwargs.get("argnames", (m.args[0] if m.args else []))
119
+ argvalues = kwargs.get("argvalues", (m.args[1] if len(m.args) > 1 else []))
120
+ # Normalize argnames to list
121
+ if isinstance(argnames, str):
122
+ names_list = [n.strip() for n in argnames.split(",") if n.strip()]
123
+ else:
124
+ names_list = list(argnames)
125
+ if "data_loaders" not in names_list:
126
+ continue
127
+ idx = names_list.index("data_loaders")
128
+ # argvalues is a list of tuples/values aligned with argnames
129
+ for val in argvalues:
130
+ # Normalize to tuple
131
+ if not isinstance(val, (tuple, list)):
132
+ params = (val,)
133
+ else:
134
+ params = tuple(val)
135
+ if idx >= len(params):
136
+ continue
137
+ dataloaders_obj = params[idx]
138
+ # May be a list or single loader
139
+ candidates = (
140
+ list(dataloaders_obj) if isinstance(dataloaders_obj, (list, tuple)) else [dataloaders_obj]
141
+ )
142
+ for dl in candidates:
143
+ jsonl_path = getattr(dl, "jsonl_path", None)
144
+ if isinstance(jsonl_path, str) and jsonl_path:
145
+ if os.path.isabs(jsonl_path):
146
+ return jsonl_path
147
+ base_dir = os.path.dirname(os.path.abspath(test_file_path))
148
+ return os.path.abspath(os.path.join(base_dir, jsonl_path))
149
+ return None
150
+ except Exception:
151
+ return None
152
+
153
+
154
+ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str) -> Optional[str]:
155
+ """Import the test module and extract a JSONL path from input_dataset (dataset_path) param if present.
156
+
157
+ Looks for a pytest.mark.parametrize with argnames containing 'dataset_path' and extracts the
158
+ first dataset path value. If a relative path is found, it is resolved relative to the directory
159
+ of the test file.
160
+ """
161
+ try:
162
+ import importlib.util
163
+ from pathlib import Path
164
+
165
+ spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
166
+ if not spec or not spec.loader:
167
+ return None
168
+ module = importlib.util.module_from_spec(spec)
169
+ sys.modules[spec.name] = module
170
+ spec.loader.exec_module(module) # type: ignore[attr-defined]
171
+ if not hasattr(module, test_func_name):
172
+ return None
173
+ wrapper = getattr(module, test_func_name)
174
+ marks = getattr(wrapper, "pytestmark", [])
175
+ for m in marks:
176
+ if getattr(m, "name", "") == "parametrize":
177
+ kwargs = getattr(m, "kwargs", {})
178
+ argnames = kwargs.get("argnames", (m.args[0] if m.args else []))
179
+ argvalues = kwargs.get("argvalues", (m.args[1] if len(m.args) > 1 else []))
180
+ # Normalize argnames to list
181
+ if isinstance(argnames, str):
182
+ names_list = [n.strip() for n in argnames.split(",") if n.strip()]
183
+ else:
184
+ names_list = list(argnames)
185
+ if "dataset_path" not in names_list:
186
+ continue
187
+ idx = names_list.index("dataset_path")
188
+ # argvalues is a list of tuples/values aligned with argnames
189
+ # Get the first value (first test case)
190
+ if argvalues:
191
+ val = argvalues[0]
192
+ # Normalize to tuple
193
+ if not isinstance(val, (tuple, list)):
194
+ params = (val,)
195
+ else:
196
+ params = tuple(val)
197
+ if idx < len(params):
198
+ dataset_path = params[idx]
199
+ # dataset_path is typically a string, but could be a list if combine_datasets=True
200
+ if isinstance(dataset_path, (list, tuple)) and len(dataset_path) > 0:
201
+ dataset_path = dataset_path[0]
202
+ if isinstance(dataset_path, str) and dataset_path:
203
+ if os.path.isabs(dataset_path):
204
+ return dataset_path
205
+ base_dir = os.path.dirname(os.path.abspath(test_file_path))
206
+ resolved = os.path.abspath(os.path.join(base_dir, dataset_path))
207
+ if os.path.isfile(resolved):
208
+ return resolved
209
+ # Try resolving from project root if relative to test file doesn't work
210
+ if not os.path.isabs(dataset_path):
211
+ # Try resolving from current working directory
212
+ cwd_path = os.path.abspath(os.path.join(os.getcwd(), dataset_path))
213
+ if os.path.isfile(cwd_path):
214
+ return cwd_path
215
+ return None
216
+ except Exception:
217
+ return None
218
+
219
+
220
def _build_trimmed_dataset_id(evaluator_id: str) -> str:
    """Derive a dataset id from ``evaluator_id`` that fits in 63 characters.

    The result has the form ``<normalized-base>-dataset-YYYYMMDDHHMMSS``. The
    base is the normalized evaluator id, shortened as needed so that base plus
    suffix stay within the limit, and prefixed with ``eval-`` when it would
    otherwise not start with a letter.
    """
    # Normalize base similarly to evaluator id rules
    from .upload import _normalize_evaluator_id  # local import to avoid cycle at module import time

    stem = _normalize_evaluator_id(evaluator_id)
    timestamp = time.strftime("%Y%m%d%H%M%S")
    suffix = f"-dataset-{timestamp}"
    # Room left for the base once the suffix is accounted for (never below 1).
    budget = max(63 - len(suffix), 1)

    if len(stem) > budget:
        stem = stem[:budget].rstrip("-")
    stem = stem or "dataset"

    # Ensure first char is a letter
    if not stem[0].isalpha():
        stem = f"eval-{stem}"
        if len(stem) > budget:
            # The prefix may have pushed the base over budget again.
            stem = stem[:budget].rstrip("-") or "dataset"
    return f"{stem}{suffix}"
245
+
246
+
247
+ def _auto_select_evaluator_id(cwd: str) -> Optional[str]:
248
+ # Try local traces
249
+ traces_dir = os.path.join(cwd, ".eval_protocol", "evaluators")
250
+ if os.path.isdir(traces_dir):
251
+ candidates = [f[:-5] for f in os.listdir(traces_dir) if f.endswith(".json")]
252
+ if len(candidates) == 1:
253
+ return candidates[0]
254
+ # Fall back to discovering a single evaluation_test
255
+ tests = _discover_tests(cwd)
256
+ if len(tests) == 1:
257
+ qualname, source_file_path = tests[0].qualname, tests[0].file_path
258
+ test_func_name = qualname.split(".")[-1]
259
+ source_file_name = os.path.splitext(os.path.basename(source_file_path))[0]
260
+ evaluator_id = _normalize_evaluator_id(f"{source_file_name}-{test_func_name}")
261
+ return evaluator_id
262
+ return None
263
+
264
+
265
def create_rft_command(args) -> int:
    """Create a Fireworks reinforcement fine-tuning (RFT) job from CLI arguments.

    Steps, in order:
      1. Require an API key and an account id.
      2. Resolve the evaluator id (``args.evaluator_id`` or auto-selected from
         the current working directory).
      3. Require ``args.base_model`` *before* anything is uploaded, so a
         missing flag cannot leave an uploaded evaluator or dataset behind.
      4. Best-effort upload of the evaluator via ``upload_command``; failures
         only print a warning.
      5. Resolve the dataset: use ``args.dataset_id``, or create one from a
         JSONL file (``args.dataset_jsonl`` or a path inferred from the single
         discovered test). Under ``--dry-run`` the dataset is not created.
      6. Assemble the request body and create the job, or print the body under
         ``--dry-run``.

    Args:
        args: Parsed CLI namespace. Options are looked up with ``getattr`` and
            a default, so absent attributes are tolerated. A
            ``dataset_builder`` attribute, if present, is ignored by this flow.

    Returns:
        ``0`` on success (including dry runs), ``1`` after printing an error.
    """
    evaluator_id: Optional[str] = getattr(args, "evaluator_id", None)
    dry_run: bool = bool(getattr(args, "dry_run", False))

    api_key = get_fireworks_api_key()
    if not api_key:
        print("Error: FIREWORKS_API_KEY not set.")
        return 1

    account_id = _ensure_account_id()
    if not account_id:
        print("Error: FIREWORKS_ACCOUNT_ID not set and could not be resolved.")
        return 1

    api_base = get_fireworks_api_base()

    # Resolve evaluator id if omitted
    project_root = os.getcwd()
    if not evaluator_id:
        evaluator_id = _auto_select_evaluator_id(project_root)
        if not evaluator_id:
            print("Error: Could not infer evaluator id. Provide --evaluator-id or run 'eval-protocol upload' first.")
            return 1

    # Validate required training input up front: everything below this point
    # may create remote resources (evaluator upload, dataset creation).
    if not getattr(args, "base_model", None):
        print(
            "Error: --base-model is required. Please specify the base model resource id (e.g., accounts/{account}/models/<model_id>)."
        )
        return 1

    # Resolve evaluator resource name to fully-qualified format required by API
    evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"

    # Ensure evaluator exists by invoking the upload flow programmatically
    try:
        from .upload import upload_command

        tests = _discover_tests(project_root)
        selected_entry: Optional[str] = None
        if len(tests) == 1:
            func_name = tests[0].qualname.split(".")[-1]
            abs_path = os.path.abspath(tests[0].file_path)
            try:
                rel = os.path.relpath(abs_path, project_root)
            except Exception:
                rel = abs_path
            selected_entry = f"{rel}::{func_name}"
        else:
            # Try to match evaluator_id to a discovered test's normalized ID
            for t in tests:
                func_name = t.qualname.split(".")[-1]
                source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
                candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
                if candidate == evaluator_id:
                    abs_path = os.path.abspath(t.file_path)
                    try:
                        rel = os.path.relpath(abs_path, project_root)
                    except Exception:
                        rel = abs_path
                    selected_entry = f"{rel}::{func_name}"
                    break

        upload_args = argparse.Namespace(
            path=project_root,
            entry=selected_entry,
            id=evaluator_id,
            display_name=None,
            description=None,
            force=False,
            yes=True,
        )
        rc = upload_command(upload_args)
        if rc == 0:
            print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
        else:
            print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
    except Exception as e:
        # Upload is best-effort: the evaluator may already exist remotely.
        print(f"Warning: Failed to upload evaluator automatically: {e}")

    # Determine dataset id and materialization path
    dataset_id = getattr(args, "dataset_id", None)
    dataset_jsonl = getattr(args, "dataset_jsonl", None)
    dataset_display_name = getattr(args, "dataset_display_name", None)

    if not dataset_id:
        # Prefer explicit --dataset-jsonl, else attempt to extract from data loader or
        # input_dataset of the single discovered test
        if not dataset_jsonl:
            tests = _discover_tests(project_root)
            if len(tests) == 1:
                func_name = tests[0].qualname.split(".")[-1]
                # Try data_loaders first (existing behavior)
                dataset_jsonl = _extract_jsonl_from_dataloader(tests[0].file_path, func_name)
                if dataset_jsonl:
                    # Display relative path for readability
                    try:
                        rel = os.path.relpath(dataset_jsonl, project_root)
                    except Exception:
                        rel = dataset_jsonl
                    print(f"✓ Using JSONL from data loader: {rel}")
                else:
                    # Fall back to input_dataset (dataset_path)
                    dataset_jsonl = _extract_jsonl_from_input_dataset(tests[0].file_path, func_name)
                    if dataset_jsonl:
                        # Display relative path for readability
                        try:
                            rel = os.path.relpath(dataset_jsonl, project_root)
                        except Exception:
                            rel = dataset_jsonl
                        print(f"✓ Using JSONL from input_dataset: {rel}")
        if not dataset_jsonl:
            print(
                "Error: Could not determine dataset. Provide --dataset-id or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
            )
            return 1

        inferred_dataset_id = _build_trimmed_dataset_id(evaluator_id)
        if dry_run:
            print("--dry-run: would create dataset and upload JSONL")
            dataset_id = inferred_dataset_id
        else:
            try:
                # Resolve dataset_jsonl path relative to CWD if needed
                jsonl_path_for_upload = (
                    dataset_jsonl
                    if os.path.isabs(dataset_jsonl)
                    else os.path.abspath(os.path.join(project_root, dataset_jsonl))
                )
                dataset_id, _ = create_dataset_from_jsonl(
                    account_id=account_id,
                    api_key=api_key,
                    api_base=api_base,
                    dataset_id=inferred_dataset_id,
                    display_name=dataset_display_name or inferred_dataset_id,
                    jsonl_path=jsonl_path_for_upload,
                )
                print(f"✓ Created and uploaded dataset: {dataset_id}")
            except Exception as e:
                print(f"Error creating/uploading dataset: {e}")
                return 1

    # Build training config/body
    training_config: Dict[str, Any] = {"baseModel": args.base_model}
    if getattr(args, "warm_start_from", None):
        training_config["warmStartFrom"] = args.warm_start_from

    # Optional hyperparameters: (request key, args attribute); only set values are sent.
    for key, arg_name in [
        ("epochs", "epochs"),
        ("batchSize", "batch_size"),
        ("learningRate", "learning_rate"),
        ("maxContextLength", "max_context_length"),
        ("loraRank", "lora_rank"),
        ("acceleratorCount", "accelerator_count"),
        ("region", "region"),
    ]:
        val = getattr(args, arg_name, None)
        if val is not None:
            training_config[key] = val

    inference_params: Dict[str, Any] = {}
    for key, arg_name in [
        ("temperature", "temperature"),
        ("topP", "top_p"),
        ("topK", "top_k"),
        ("maxTokens", "max_tokens"),
        ("n", "n"),
    ]:
        val = getattr(args, arg_name, None)
        if val is not None:
            inference_params[key] = val
    if getattr(args, "inference_extra_body", None):
        inference_params["extraBody"] = args.inference_extra_body

    wandb_config: Optional[Dict[str, Any]] = None
    if getattr(args, "wandb_enabled", False):
        wandb_config = {
            "enabled": True,
            "apiKey": getattr(args, "wandb_api_key", None),
            "project": getattr(args, "wandb_project", None),
            "entity": getattr(args, "wandb_entity", None),
            "runId": getattr(args, "wandb_run_id", None),
        }

    body: Dict[str, Any] = {
        # "displayName": getattr(args, "display_name", None) or f"{evaluator_id}-rft",
        "dataset": f"accounts/{account_id}/datasets/{dataset_id}",
        "evaluator": evaluator_resource_name,
        "evalAutoCarveout": bool(getattr(args, "eval_auto_carveout", True)),
        "trainingConfig": training_config,
        "inferenceParameters": inference_params or None,
        "wandbConfig": wandb_config,
        "chunkSize": getattr(args, "chunk_size", None),
        "outputStats": None,
        "outputMetrics": None,
        "mcpServer": None,
    }
    # Debug: print minimal summary
    print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
    if getattr(args, "evaluation_dataset", None):
        body["evaluationDataset"] = args.evaluation_dataset
    if getattr(args, "output_model", None):
        body.setdefault("trainingConfig", {})["outputModel"] = f"accounts/{account_id}/models/{args.output_model}"

    # Clean None fields to avoid noisy payloads (top-level keys only)
    body = {k: v for k, v in body.items() if v is not None}

    if dry_run:
        print("--dry-run: would create RFT job with body:")
        print(json.dumps(body, indent=2))
        _print_links(evaluator_id, dataset_id, None)
        return 0

    try:
        result = create_reinforcement_fine_tuning_job(
            account_id=account_id, api_key=api_key, api_base=api_base, body=body
        )
        job_name = result.get("name") if isinstance(result, dict) else None
        print("\n✅ Created Reinforcement Fine-tuning Job")
        if job_name:
            print(f" name: {job_name}")
        _print_links(evaluator_id, dataset_id, job_name)
        return 0
    except Exception as e:
        print(f"Error creating RFT job: {e}")
        return 1
@@ -0,0 +1,5 @@
1
+ from .dynamic_data_loader import DynamicDataLoader
2
+ from .inline_data_loader import InlineDataLoader
3
+ from .jsonl_data_loader import EvaluationRowJsonlDataLoader
4
+
5
+ __all__ = ["DynamicDataLoader", "InlineDataLoader", "EvaluationRowJsonlDataLoader"]
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from collections.abc import Sequence
6
+
7
+ from eval_protocol.common_utils import load_jsonl
8
+ from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
9
+ from eval_protocol.data_loader.models import (
10
+ DataLoaderResult,
11
+ DataLoaderVariant,
12
+ EvaluationDataLoader,
13
+ )
14
+
15
+
16
@dataclass(kw_only=True)
class EvaluationRowJsonlDataLoader(EvaluationDataLoader):
    """Load EvaluationRows from a JSONL file on disk.

    Every line of the file is expected to hold one serialized EvaluationRow
    dict. The parsed dicts are handed to the default dataset adapter, which
    builds the EvaluationRow objects.
    """

    # Location of the JSONL file; a relative path is made absolute at load time.
    jsonl_path: str
    # Reported as ``variant_id`` on the load result.
    id: str = "jsonl"
    # Reported as ``variant_description`` on the load result.
    description: str | None = None

    def variants(self) -> Sequence[DataLoaderVariant]:
        """Return a one-element list holding the deferred loader for this file."""

        def _load() -> DataLoaderResult:
            # Nothing is read from disk until this callable is invoked.
            source = self.jsonl_path
            # Absolute paths are used as given; only relative ones are resolved.
            resolved = source if os.path.isabs(source) else os.path.abspath(source)
            return DataLoaderResult(
                rows=default_dataset_adapter(load_jsonl(resolved)),
                type=self.__class__.__name__,
                variant_id=self.id,
                variant_description=self.description,
            )

        return [_load]
@@ -602,7 +602,47 @@ class Evaluator:
602
602
  from pathlib import Path
603
603
  import fnmatch
604
604
 
605
- default_ignores = [".git", "__pycache__", "*.pyc", ".venv", "venv", "node_modules", "*.egg-info"]
605
+ default_ignores = [
606
+ ".git",
607
+ ".github",
608
+ "__pycache__",
609
+ "*.pyc",
610
+ "*.pyo",
611
+ "*.pyd",
612
+ ".venv",
613
+ "venv",
614
+ ".tox",
615
+ ".pytest_cache",
616
+ ".mypy_cache",
617
+ ".ruff_cache",
618
+ ".ipynb_checkpoints",
619
+ ".idea",
620
+ ".vscode",
621
+ ".cache",
622
+ "node_modules",
623
+ "vendor",
624
+ "dist",
625
+ "build",
626
+ "*.egg-info",
627
+ "*.egg",
628
+ "*.whl",
629
+ "*.tar.gz",
630
+ "*.zip",
631
+ "*.log",
632
+ "*.tmp",
633
+ "*.swp",
634
+ ".DS_Store",
635
+ "coverage",
636
+ "htmlcov",
637
+ ".coverage",
638
+ "coverage.xml",
639
+ ".env",
640
+ ".env.*",
641
+ "*.so",
642
+ "*.dylib",
643
+ ".pytest_cache/",
644
+ "env/",
645
+ ]
606
646
  all_patterns = default_ignores + ignore_patterns
607
647
 
608
648
  path_obj = Path(path)
@@ -18,12 +18,20 @@ def _map_api_host_to_app_host(api_base: str) -> str:
18
18
  from urllib.parse import urlparse
19
19
 
20
20
  parsed = urlparse(api_base)
21
- host = parsed.netloc or parsed.path
21
+ host = (parsed.netloc or parsed.path).lower()
22
+ scheme = parsed.scheme or "https"
23
+
24
+ # Explicit mappings first
22
25
  if host.startswith("dev.api.fireworks.ai"):
23
- return f"{parsed.scheme or 'https'}://dev.fireworks.ai"
26
+ return f"{scheme}://dev.fireworks.ai"
27
+ if host == "staging.api.fireworks.ai" or host == "api.fireworks.ai":
28
+ return f"{scheme}://app.fireworks.ai"
29
+
30
+ # Generic mapping: api.<...> → app.<...>
24
31
  if host.startswith("api."):
25
- return f"{parsed.scheme or 'https'}://{host.replace('api.', 'app.', 1)}"
26
- return f"{parsed.scheme or 'https'}://{host}"
32
+ return f"{scheme}://{host.replace('api.', 'app.', 1)}"
33
+
34
+ return f"{scheme}://{host}"
27
35
  except Exception:
28
36
  return "https://app.fireworks.ai"
29
37