eval-protocol 0.3.4__tar.gz → 0.3.5.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (468)
  1. {eval_protocol-0.3.4/eval_protocol.egg-info → eval_protocol-0.3.5.dev1}/PKG-INFO +1 -1
  2. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/fireworks_tracing.py +2 -1
  4. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/create_rft.py +95 -25
  5. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/upload.py +3 -8
  6. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/utils.py +28 -2
  7. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +7 -5
  8. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1/eval_protocol.egg-info}/PKG-INFO +1 -1
  9. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_create_rft.py +92 -0
  10. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/LICENSE +0 -0
  11. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/README.md +0 -0
  12. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/__init__.py +0 -0
  13. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/normalize_sandbox_fusion.py +0 -0
  14. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/utils/__init__.py +0 -0
  15. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/utils/generate_api_key.py +0 -0
  16. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/development/utils/subprocess_manager.py +0 -0
  17. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/__init__.py +0 -0
  18. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/__main__.py +0 -0
  19. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/__init__.py +0 -0
  20. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/base.py +0 -0
  21. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/bigquery.py +0 -0
  22. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/braintrust.py +0 -0
  23. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/dataframe.py +0 -0
  24. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/huggingface.py +0 -0
  25. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/langchain.py +0 -0
  26. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/langfuse.py +0 -0
  27. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/langsmith.py +0 -0
  28. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/openai_responses.py +0 -0
  29. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/trl.py +0 -0
  30. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/utils.py +0 -0
  31. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/adapters/weave.py +0 -0
  32. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/__init__.py +0 -0
  33. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/models.py +0 -0
  34. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/orchestrator.py +0 -0
  35. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resource_abc.py +0 -0
  36. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resource_pool.py +0 -0
  37. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
  38. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  39. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  40. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  41. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  42. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  43. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/docker_resource.py +0 -0
  44. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  45. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  46. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
  47. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/task_manager.py +0 -0
  48. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/agent/tool_registry.py +0 -0
  49. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/auth.py +0 -0
  50. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/__init__.py +0 -0
  51. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  52. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  53. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_aime25.py +0 -0
  54. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  55. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  56. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  57. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  58. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  59. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  60. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli.py +0 -0
  61. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
  62. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  63. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/common.py +0 -0
  64. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/export_docs.py +0 -0
  65. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/local_test.py +0 -0
  66. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/logs.py +0 -0
  67. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  68. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/common_utils.py +0 -0
  69. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/config.py +0 -0
  70. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/__init__.py +0 -0
  71. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  72. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  73. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  74. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  75. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/data_loader/models.py +0 -0
  76. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/__init__.py +0 -0
  77. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  78. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  79. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  80. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  81. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/datasets/__init__.py +0 -0
  82. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/datasets/loader.py +0 -0
  83. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/directory_utils.py +0 -0
  84. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/evaluation.py +0 -0
  85. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/__init__.py +0 -0
  86. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/event_bus.py +0 -0
  87. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/logger.py +0 -0
  88. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  89. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  90. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/exceptions.py +0 -0
  91. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/execution/__init__.py +0 -0
  92. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/execution/pipeline.py +0 -0
  93. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/fireworks_rft.py +0 -0
  94. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/gcp_tools.py +0 -0
  95. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generation/cache.py +0 -0
  96. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generation/clients/base.py +0 -0
  97. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generation/clients.py +0 -0
  98. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/generic_server.py +0 -0
  99. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/get_pep440_version.py +0 -0
  100. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/human_id/__init__.py +0 -0
  101. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/human_id/dictionary.py +0 -0
  102. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/__init__.py +0 -0
  103. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/deepeval.py +0 -0
  104. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/openai_rft.py +0 -0
  105. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/openeval.py +0 -0
  106. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  107. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  108. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/integrations/trl.py +0 -0
  109. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/__init__.py +0 -0
  110. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  111. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  112. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  113. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/init.py +0 -0
  114. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/rollout_context.py +0 -0
  115. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  116. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/log_utils/util.py +0 -0
  117. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/logging_utils.py +0 -0
  118. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/__init__.py +0 -0
  119. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/adapter.py +0 -0
  120. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
  121. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/client/connection.py +0 -0
  122. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/clients.py +0 -0
  123. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
  124. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/base_policy.py +0 -0
  125. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/manager.py +0 -0
  126. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/policy.py +0 -0
  127. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  128. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
  129. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  130. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/mcpgym.py +0 -0
  131. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/process_manager.py +0 -0
  132. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
  133. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/session/manager.py +0 -0
  134. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/simple_process_manager.py +0 -0
  135. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp/simulation_server.py +0 -0
  136. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
  137. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/config.py +0 -0
  138. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/main.py +0 -0
  139. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  140. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  141. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  142. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  143. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_env.py +0 -0
  144. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/__init__.py +0 -0
  145. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  146. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  147. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  148. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  149. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  150. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  151. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  152. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  153. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  154. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  155. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  156. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  157. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  158. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  159. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/models.py +0 -0
  160. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/packaging.py +0 -0
  161. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/platform_api.py +0 -0
  162. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/playback_policy.py +0 -0
  163. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/__init__.py +0 -0
  164. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  165. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/app.py +0 -0
  166. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  167. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  168. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  169. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/main.py +0 -0
  170. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/models.py +0 -0
  171. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  172. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/__init__.py +0 -0
  173. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/buffer.py +0 -0
  174. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  175. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  176. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
  177. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  178. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  179. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  180. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  181. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  182. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  183. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  184. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/evaluation_test.py +0 -0
  185. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  186. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  187. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/exception_config.py +0 -0
  188. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/execution.py +0 -0
  189. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  190. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  191. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  192. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  193. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
  194. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/parameterize.py +0 -0
  195. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/plugin.py +0 -0
  196. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/priority_scheduler.py +0 -0
  197. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
  198. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/rollout_processor.py +0 -0
  199. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  200. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/store_experiment_link.py +0 -0
  201. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/store_results_url.py +0 -0
  202. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/tracing_utils.py +0 -0
  203. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/types.py +0 -0
  204. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/pytest/validate_signature.py +0 -0
  205. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/__init__.py +0 -0
  206. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  207. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  208. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  209. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  210. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  211. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  212. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  213. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/llm_judge.py +0 -0
  214. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  215. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  216. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  217. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  218. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/quickstart/utils.py +0 -0
  219. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/resources.py +0 -0
  220. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/reward_function.py +0 -0
  221. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/__init__.py +0 -0
  222. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/accuracy.py +0 -0
  223. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/accuracy_length.py +0 -0
  224. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  225. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  226. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/apps_testing_util.py +0 -0
  227. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/bfcl_reward.py +0 -0
  228. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/code_execution.py +0 -0
  229. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
  230. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/cpp_code.py +0 -0
  231. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  232. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/format.py +0 -0
  233. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/function_calling.py +0 -0
  234. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/json_schema.py +0 -0
  235. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/language_consistency.py +0 -0
  236. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/lean_prover.py +0 -0
  237. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/length.py +0 -0
  238. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  239. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/math.py +0 -0
  240. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  241. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/reasoning_steps.py +0 -0
  242. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/repetition.py +0 -0
  243. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rewards/tag_count.py +0 -0
  244. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/rl_processing.py +0 -0
  245. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/server.py +0 -0
  246. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/stats/__init__.py +0 -0
  247. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/stats/confidence_intervals.py +0 -0
  248. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/__init__.py +0 -0
  249. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/gepa_trainer.py +0 -0
  250. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/gepa_utils.py +0 -0
  251. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/trainer.py +0 -0
  252. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/training/utils.py +0 -0
  253. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/typed_interface.py +0 -0
  254. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/__init__.py +0 -0
  255. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/errors.py +0 -0
  256. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/remote_rollout_processor.py +0 -0
  257. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/types/types.py +0 -0
  258. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/__init__.py +0 -0
  259. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
  260. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/batch_transformation.py +0 -0
  261. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/browser_utils.py +0 -0
  262. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/check_server_status.py +0 -0
  263. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
  264. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  265. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/logs_models.py +0 -0
  266. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/logs_server.py +0 -0
  267. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/module_loader.py +0 -0
  268. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
  269. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/show_results_url.py +0 -0
  270. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/static_policy.py +0 -0
  271. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/subprocess_utils.py +0 -0
  272. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol/utils/vite_server.py +0 -0
  273. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/SOURCES.txt +0 -0
  274. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
  275. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
  276. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/requires.txt +0 -0
  277. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
  278. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/pyproject.toml +0 -0
  279. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/setup.cfg +0 -0
  280. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/setup.py +0 -0
  281. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_accuracy.py +0 -0
  282. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_accuracy_length.py +0 -0
  283. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_adapters_e2e.py +0 -0
  284. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_agent_orchestrator.py +0 -0
  285. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_agent_resources.py +0 -0
  286. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_auth.py +0 -0
  287. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_batch_evaluation.py +0 -0
  288. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_agent.py +0 -0
  289. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_args.py +0 -0
  290. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cli_local_test.py +0 -0
  291. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_code_execution.py +0 -0
  292. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_config.py +0 -0
  293. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_control_plane_separation.py +0 -0
  294. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_cpp_code.py +0 -0
  295. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_data_driven_task_manager.py +0 -0
  296. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_deepcoder_reward.py +0 -0
  297. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_deepeval_integration.py +0 -0
  298. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_directory_utils.py +0 -0
  299. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_e2b_integration.py +0 -0
  300. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_e2b_js_integration.py +0 -0
  301. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_edge_cases.py +0 -0
  302. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_ep_upload_e2e.py +0 -0
  303. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_eval_protocol_import.py +0 -0
  304. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_evaluation.py +0 -0
  305. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_evaluation_postprocess.py +0 -0
  306. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_event_bus.py +0 -0
  307. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_event_bus_helper.py +0 -0
  308. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_examples_end_to_end.py +0 -0
  309. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_exception_config.py +0 -0
  310. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_exceptions.py +0 -0
  311. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_fireworks_api.py +0 -0
  312. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_format.py +0 -0
  313. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_fractional_code.py +0 -0
  314. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_function_calling.py +0 -0
  315. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_gcp_tools.py +0 -0
  316. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_generic_server.py +0 -0
  317. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_human_id.py +0 -0
  318. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_integration.py +0 -0
  319. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_json_schema.py +0 -0
  320. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_kwargs_validation.py +0 -0
  321. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_language_consistency.py +0 -0
  322. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_lean_prover.py +0 -0
  323. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_lean_prover_runner.py +0 -0
  324. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_length.py +0 -0
  325. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_list_comparison_math_reward.py +0 -0
  326. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_litellm_policy_provider_fields.py +0 -0
  327. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_logs_server.py +0 -0
  328. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_logs_server_simple.py +0 -0
  329. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_math.py +0 -0
  330. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_message_field_filtering.py +0 -0
  331. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_minimal.py +0 -0
  332. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_models.py +0 -0
  333. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_models_rl.py +0 -0
  334. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
  335. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_n_variant_batch_integration.py +0 -0
  336. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_n_variant_integration.py +0 -0
  337. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_openai_compatibility.py +0 -0
  338. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_openai_rft_integration.py +0 -0
  339. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_openeval_integration.py +0 -0
  340. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_packaging.py +0 -0
  341. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_parallel_rollouts.py +0 -0
  342. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_platform_api.py +0 -0
  343. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_priority_scheduler.py +0 -0
  344. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_quickstart_utils.py +0 -0
  345. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_readiness.py +0 -0
  346. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_reasoning_steps.py +0 -0
  347. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_repetition.py +0 -0
  348. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_repetition_debug.py +0 -0
  349. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_retry_mechanism.py +0 -0
  350. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_reward_function.py +0 -0
  351. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_reward_protocol_import.py +0 -0
  352. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_rl_processing.py +0 -0
  353. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_rollout_control_plane_integration.py +0 -0
  354. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_rollout_logprobs.py +0 -0
  355. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_server.py +0 -0
  356. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_show_results_url.py +0 -0
  357. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_sqlite_hardening.py +0 -0
  358. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_status_migration_changes.py +0 -0
  359. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_status_migration_integration.py +0 -0
  360. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_status_model.py +0 -0
  361. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_tag_count.py +0 -0
  362. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_tau_bench_airline_smoke.py +0 -0
  363. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_training_utils.py +0 -0
  364. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_typed_interface.py +0 -0
  365. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_typed_interface_rl.py +0 -0
  366. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_upload_entrypoint.py +0 -0
  367. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_url_handling.py +0 -0
  368. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/tests/test_vite_server.py +0 -0
  369. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/__init__.py +0 -0
  370. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/agent/__init__.py +0 -0
  371. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/agent/base.py +0 -0
  372. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/agent/llm_agent.py +0 -0
  373. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/__init__.py +0 -0
  374. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/api_config.py +0 -0
  375. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/data_model.py +0 -0
  376. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
  377. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/cli.py +0 -0
  378. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/config.py +0 -0
  379. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/airline/policy.md +0 -0
  380. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/mock/policy.md +0 -0
  381. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  382. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/retail/policy.md +0 -0
  383. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  384. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  385. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  386. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  387. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  388. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  389. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  390. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/__init__.py +0 -0
  391. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/message.py +0 -0
  392. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/simulation.py +0 -0
  393. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/data_model/tasks.py +0 -0
  394. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/__init__.py +0 -0
  395. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
  396. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/data_model.py +0 -0
  397. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
  398. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/tools.py +0 -0
  399. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
  400. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
  401. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/data_model.py +0 -0
  402. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/environment.py +0 -0
  403. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/tools.py +0 -0
  404. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
  405. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
  406. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/data_model.py +0 -0
  407. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
  408. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/tools.py +0 -0
  409. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
  410. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
  411. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/data_model.py +0 -0
  412. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/environment.py +0 -0
  413. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  414. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  415. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  416. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  417. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  418. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  419. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  420. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  421. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/tools.py +0 -0
  422. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  423. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  424. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
  425. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/__init__.py +0 -0
  426. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/db.py +0 -0
  427. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/environment.py +0 -0
  428. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/server.py +0 -0
  429. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/tool.py +0 -0
  430. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/toolkit.py +0 -0
  431. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  432. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/__init__.py +0 -0
  433. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator.py +0 -0
  434. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  435. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  436. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  437. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  438. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  439. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/metrics/__init__.py +0 -0
  440. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/metrics/agent_metrics.py +0 -0
  441. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  442. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/__init__.py +0 -0
  443. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  444. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  445. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/orchestrator/utils.py +0 -0
  446. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/registry.py +0 -0
  447. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/run.py +0 -0
  448. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/__init__.py +0 -0
  449. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/check_data.py +0 -0
  450. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  451. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/start_servers.py +0 -0
  452. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/scripts/view_simulations.py +0 -0
  453. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/user/__init__.py +0 -0
  454. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/user/base.py +0 -0
  455. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/user/user_simulator.py +0 -0
  456. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/__init__.py +0 -0
  457. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/display.py +0 -0
  458. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/io_utils.py +0 -0
  459. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/llm_utils.py +0 -0
  460. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/pydantic_utils.py +0 -0
  461. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vendor/tau2/utils/utils.py +0 -0
  462. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/versioneer.py +0 -0
  463. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  464. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/index-CuQbfdPD.js +0 -0
  465. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/index-CuQbfdPD.js.map +0 -0
  466. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
  467. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  468. {eval_protocol-0.3.4 → eval_protocol-0.3.5.dev1}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.4
3
+ Version: 0.3.5.dev1
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2026-01-05T15:49:34-0800",
11
+ "date": "2026-01-06T11:14:00-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "f6c3454405d01b098a1911471f0efe0b952b1b7c",
15
- "version": "0.3.4"
14
+ "full-revisionid": "38c47583e50493262d74915850e7a4a7e594baf3",
15
+ "version": "0.3.5.dev.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -268,7 +268,7 @@ class FireworksTracingAdapter(BaseAdapter):
268
268
  def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
269
269
  """Fetch logs from Fireworks tracing gateway /logs endpoint.
270
270
 
271
- Returns entries with keys: timestamp, message, severity, tags.
271
+ Returns entries with keys: timestamp, message, severity, tags, status, extras.
272
272
  """
273
273
  if not tags:
274
274
  raise ValueError("At least one tag is required to fetch logs")
@@ -315,6 +315,7 @@ class FireworksTracingAdapter(BaseAdapter):
315
315
  "severity": e.get("severity", "INFO"),
316
316
  "tags": e.get("tags", []),
317
317
  "status": e.get("status"),
318
+ "extras": e.get("extras"),
318
319
  }
319
320
  )
320
321
  return results
@@ -5,15 +5,15 @@ import json
5
5
  import os
6
6
  import sys
7
7
  import time
8
- from typing import Any, Dict, Optional
8
+ from typing import Any, Callable, Dict, Optional
9
9
  import inspect
10
10
  import requests
11
+ import tempfile
11
12
  from pydantic import ValidationError
12
13
 
13
14
  from ..auth import get_fireworks_api_base, get_fireworks_api_key
14
- from ..common_utils import get_user_agent
15
+ from ..common_utils import get_user_agent, load_jsonl
15
16
  from ..fireworks_rft import (
16
- build_default_output_model,
17
17
  create_dataset_from_jsonl,
18
18
  detect_dataset_builder,
19
19
  materialize_dataset_via_builder,
@@ -31,12 +31,88 @@ from .utils import (
31
31
  _normalize_evaluator_id,
32
32
  _print_links,
33
33
  _resolve_selected_test,
34
+ load_module_from_file_path,
34
35
  )
35
36
  from .local_test import run_evaluator_test
36
37
 
37
38
  from fireworks import Fireworks
38
39
 
39
40
 
41
+ def _extract_dataset_adapter(
42
+ test_file_path: str, test_func_name: str
43
+ ) -> Optional[Callable[[list[dict[str, Any]]], Any]]:
44
+ """Extract dataset_adapter from an @evaluation_test wrapper via __ep_params__."""
45
+ try:
46
+ module = load_module_from_file_path(test_file_path)
47
+ wrapper = getattr(module, test_func_name, None)
48
+ if wrapper is None:
49
+ return None
50
+ ep_params = getattr(wrapper, "__ep_params__", None)
51
+ if ep_params is None:
52
+ return None
53
+ adapter = getattr(ep_params, "dataset_adapter", None)
54
+ if callable(adapter):
55
+ return adapter
56
+ return None
57
+ except Exception:
58
+ return None
59
+
60
+
61
+ def _maybe_transform_dataset_jsonl_via_adapter(
62
+ project_root: str,
63
+ dataset_jsonl: str,
64
+ test_file_path: Optional[str],
65
+ test_func_name: Optional[str],
66
+ ) -> str:
67
+ """Transform dataset_jsonl via the test's dataset_adapter (when available).
68
+
69
+ For RFT dataset uploads, we want the uploaded dataset to match what evaluation-time
70
+ would run on. If the selected evaluation test provides a dataset_adapter, that
71
+ adapter is treated as the source of truth for constructing EvaluationRows.
72
+ """
73
+ if not dataset_jsonl:
74
+ return dataset_jsonl
75
+
76
+ if not test_file_path or not test_func_name:
77
+ return dataset_jsonl
78
+
79
+ adapter = _extract_dataset_adapter(test_file_path, test_func_name)
80
+ if not adapter:
81
+ return dataset_jsonl
82
+
83
+ raw_rows: list[dict[str, Any]] = load_jsonl(dataset_jsonl) # type: ignore[assignment]
84
+ adapted = adapter(raw_rows)
85
+ if not isinstance(adapted, list):
86
+ raise ValueError("dataset_adapter must return a list of EvaluationRow (or dicts parseable as EvaluationRow).")
87
+
88
+ eval_rows: list[EvaluationRow] = []
89
+ for item in adapted:
90
+ if isinstance(item, EvaluationRow):
91
+ eval_rows.append(item)
92
+ else:
93
+ eval_rows.append(EvaluationRow.model_validate(item))
94
+
95
+ output_dir = os.path.join(project_root, ".ep_tmp")
96
+ os.makedirs(output_dir, exist_ok=True)
97
+ with tempfile.NamedTemporaryFile(
98
+ mode="w",
99
+ encoding="utf-8",
100
+ delete=False,
101
+ suffix=".jsonl",
102
+ prefix="ep_rft_dataset_",
103
+ dir=output_dir,
104
+ ) as f:
105
+ for row in eval_rows:
106
+ f.write(json.dumps(row.model_dump(mode="json"), ensure_ascii=False) + "\n")
107
+ out_path = os.path.abspath(f.name)
108
+ try:
109
+ rel = os.path.relpath(out_path, project_root)
110
+ except Exception:
111
+ rel = out_path
112
+ print(f"✓ Transformed dataset via dataset_adapter into EvaluationRow JSONL: {rel} ({len(eval_rows)} rows)")
113
+ return out_path
114
+
115
+
40
116
  def _extract_jsonl_from_dataloader(test_file_path: str, test_func_name: str) -> Optional[str]:
41
117
  """Import the test module and extract a JSONL path from data_loaders param if present.
42
118
 
@@ -45,18 +121,10 @@ def _extract_jsonl_from_dataloader(test_file_path: str, test_func_name: str) ->
45
121
  relative to the directory of the test file.
46
122
  """
47
123
  try:
48
- import importlib.util
49
- from pathlib import Path
50
-
51
- spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
52
- if not spec or not spec.loader:
124
+ module = load_module_from_file_path(test_file_path)
125
+ wrapper = getattr(module, test_func_name, None)
126
+ if wrapper is None:
53
127
  return None
54
- module = importlib.util.module_from_spec(spec)
55
- sys.modules[spec.name] = module
56
- spec.loader.exec_module(module) # type: ignore[attr-defined]
57
- if not hasattr(module, test_func_name):
58
- return None
59
- wrapper = getattr(module, test_func_name)
60
128
  marks = getattr(wrapper, "pytestmark", [])
61
129
  for m in marks:
62
130
  if getattr(m, "name", "") == "parametrize":
@@ -105,18 +173,10 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
105
173
  of the test file.
106
174
  """
107
175
  try:
108
- import importlib.util
109
- from pathlib import Path
110
-
111
- spec = importlib.util.spec_from_file_location(Path(test_file_path).stem, test_file_path)
112
- if not spec or not spec.loader:
113
- return None
114
- module = importlib.util.module_from_spec(spec)
115
- sys.modules[spec.name] = module
116
- spec.loader.exec_module(module) # type: ignore[attr-defined]
117
- if not hasattr(module, test_func_name):
176
+ module = load_module_from_file_path(test_file_path)
177
+ wrapper = getattr(module, test_func_name, None)
178
+ if wrapper is None:
118
179
  return None
119
- wrapper = getattr(module, test_func_name)
120
180
  marks = getattr(wrapper, "pytestmark", [])
121
181
  for m in marks:
122
182
  if getattr(m, "name", "") == "parametrize":
@@ -719,6 +779,16 @@ def create_rft_command(args) -> int:
719
779
  if dataset_jsonl is None and not dataset_id:
720
780
  return 1
721
781
 
782
+ # 2.5) If the selected evaluation test provides a dataset_adapter, always use it to
783
+ # construct the EvaluationRow dataset that we upload for RFT.
784
+ if dataset_jsonl is not None:
785
+ dataset_jsonl = _maybe_transform_dataset_jsonl_via_adapter(
786
+ project_root=project_root,
787
+ dataset_jsonl=dataset_jsonl,
788
+ test_file_path=selected_test_file_path,
789
+ test_func_name=selected_test_func_name,
790
+ )
791
+
722
792
  # 3) Optional local validation
723
793
  if not skip_validation:
724
794
  # Dataset validation (JSONL must be EvaluationRow-compatible when present)
@@ -1,6 +1,5 @@
1
1
  import argparse
2
2
  from eval_protocol.cli_commands.utils import DiscoveredTest
3
- import importlib.util
4
3
  import os
5
4
  import re
6
5
  import sys
@@ -18,6 +17,7 @@ from .utils import (
18
17
  _discover_tests,
19
18
  _ensure_account_id,
20
19
  _get_questionary_style,
20
+ load_module_from_file_path,
21
21
  _normalize_evaluator_id,
22
22
  _prompt_select,
23
23
  )
@@ -120,13 +120,8 @@ def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
120
120
  source_file_path = os.path.join(cwd, dotted_as_path)
121
121
 
122
122
  # Load the module from the file path
123
- spec = importlib.util.spec_from_file_location(Path(source_file_path).stem, source_file_path)
124
- if not spec or not spec.loader:
125
- raise ValueError(f"Unable to load module from path: {source_file_path}")
126
- module = importlib.util.module_from_spec(spec)
127
- sys.modules[spec.name] = module
128
- spec.loader.exec_module(module) # type: ignore[attr-defined]
129
- module_name = spec.name
123
+ module = load_module_from_file_path(source_file_path)
124
+ module_name = getattr(module, "__name__", Path(source_file_path).stem)
130
125
 
131
126
  if not hasattr(module, func):
132
127
  raise ValueError(f"Function '{func}' not found in module '{module_name}'")
@@ -1,3 +1,6 @@
1
+ from types import ModuleType
2
+
3
+
1
4
  import os
2
5
  import ast
3
6
  import sys
@@ -6,16 +9,16 @@ import inspect
6
9
  import argparse
7
10
  import typing
8
11
  import types
12
+ import importlib.util
9
13
  from dataclasses import dataclass
10
14
  from pathlib import Path
11
- from typing import Any, List, Optional, is_typeddict
15
+ from typing import Any, List, Optional
12
16
  import typing_extensions
13
17
  import inspect
14
18
  from collections.abc import Callable
15
19
  import pytest
16
20
 
17
21
  from ..auth import (
18
- get_fireworks_account_id,
19
22
  get_fireworks_api_base,
20
23
  get_fireworks_api_key,
21
24
  verify_api_key_and_get_account_id,
@@ -23,6 +26,29 @@ from ..auth import (
23
26
  from ..fireworks_rft import _map_api_host_to_app_host
24
27
 
25
28
 
29
+ def load_module_from_file_path(source_file_path: str) -> ModuleType:
30
+ """Load a Python module from an absolute/relative filesystem path.
31
+
32
+ This mirrors the CLI behavior used by `upload.py` and `create_rft.py`:
33
+ - module name is derived from the file stem (e.g. /a/b/foo.py -> foo)
34
+ - the module is inserted into sys.modules under that name before exec
35
+ """
36
+ abs_path = os.path.abspath(source_file_path)
37
+ if not os.path.isfile(abs_path):
38
+ raise ValueError(f"File not found: {abs_path}")
39
+ if not abs_path.endswith(".py"):
40
+ raise ValueError(f"Expected a .py file path, got: {abs_path}")
41
+
42
+ module_name = Path(abs_path).stem
43
+ spec = importlib.util.spec_from_file_location(module_name, abs_path)
44
+ if not spec or not spec.loader:
45
+ raise ValueError(f"Unable to load module from path: {abs_path}")
46
+ module = importlib.util.module_from_spec(spec)
47
+ sys.modules[spec.name] = module
48
+ spec.loader.exec_module(module) # type: ignore[attr-defined]
49
+ return module
50
+
51
+
26
52
  def _get_questionary_style():
27
53
  """Get the shared questionary style for CLI prompts - minimal and clean."""
28
54
  try:
@@ -125,14 +125,16 @@ class FireworksTracingHttpHandler(logging.Handler):
125
125
  pass
126
126
  program = cast(Optional[str], getattr(record, "program", None)) or "eval_protocol"
127
127
 
128
+ extras_input = getattr(record, "extras", None)
129
+ extras: Dict[str, Any] = dict(extras_input) if isinstance(extras_input, dict) else {}
130
+ extras["logger_name"] = record.name
131
+ extras["level"] = record.levelname
132
+ extras["timestamp"] = timestamp
133
+
128
134
  return {
129
135
  "program": program,
130
136
  "status": self._get_status_info(record),
131
137
  "message": message,
132
138
  "tags": tags,
133
- "extras": {
134
- "logger_name": record.name,
135
- "level": record.levelname,
136
- "timestamp": timestamp,
137
- },
139
+ "extras": extras,
138
140
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.4
3
+ Version: 0.3.5.dev1
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -1206,3 +1206,95 @@ def test_create_rft_prefers_explicit_dataset_jsonl_over_input_dataset(rft_test_h
1206
1206
  assert captured["jsonl_path"] != str(inferred_jsonl)
1207
1207
  # And because --dataset-jsonl was provided, we should never call the input_dataset extractor
1208
1208
  assert calls["input_dataset"] == 0
1209
+
1210
+
1211
+ def test_create_rft_transforms_raw_input_dataset_via_dataset_adapter_before_upload(rft_test_harness, monkeypatch):
1212
+ project = rft_test_harness
1213
+
1214
+ # Create a real @evaluation_test-decorated module so create_rft can extract __ep_params__.dataset_adapter
1215
+ metric_dir = project / "metric"
1216
+ metric_dir.mkdir(parents=True, exist_ok=True)
1217
+
1218
+ raw_jsonl = metric_dir / "raw.jsonl"
1219
+ raw_jsonl.write_text('{"q":"hi","a":"ok"}\n{"q":"yo","a":"ok2"}\n', encoding="utf-8")
1220
+
1221
+ test_file = metric_dir / "test_adapt.py"
1222
+ test_file.write_text(
1223
+ """
1224
+ from typing import Any
1225
+ from eval_protocol.models import EvaluationRow, Message
1226
+ from eval_protocol.pytest import evaluation_test
1227
+
1228
+ def my_adapter(rows: list[dict[str, Any]]) -> list[EvaluationRow]:
1229
+ return [
1230
+ EvaluationRow(messages=[Message(role="user", content=r["q"])], ground_truth=r.get("a"))
1231
+ for r in rows
1232
+ ]
1233
+
1234
+ @evaluation_test(
1235
+ input_dataset=["raw.jsonl"],
1236
+ dataset_adapter=my_adapter,
1237
+ num_runs=1,
1238
+ max_dataset_rows=2,
1239
+ mode="pointwise",
1240
+ )
1241
+ def test_adapt(row: EvaluationRow) -> EvaluationRow:
1242
+ return row
1243
+ """.lstrip(),
1244
+ encoding="utf-8",
1245
+ )
1246
+
1247
+ # Discovery: exactly one test, and resolve_selected_test points to our module/function
1248
+ single_disc = SimpleNamespace(qualname="metric.test_adapt.test_adapt", file_path=str(test_file))
1249
+ monkeypatch.setattr(cr, "_discover_and_select_tests", lambda cwd, non_interactive=False: [single_disc])
1250
+ monkeypatch.setattr(
1251
+ cr,
1252
+ "_resolve_selected_test",
1253
+ lambda project_root, evaluator_id, selected_tests=None: (str(test_file), "test_adapt"),
1254
+ )
1255
+
1256
+ captured = {"jsonl_path": None}
1257
+
1258
+ def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, display_name, jsonl_path):
1259
+ captured["jsonl_path"] = jsonl_path
1260
+ return dataset_id, {"name": f"accounts/{account_id}/datasets/{dataset_id}", "state": "UPLOADING"}
1261
+
1262
+ monkeypatch.setattr(cr, "create_dataset_from_jsonl", _fake_create_dataset_from_jsonl)
1263
+
1264
+ # Ensure upload path doesn't touch the network; job creation via stub_fireworks fixture
1265
+ args = argparse.Namespace(
1266
+ evaluator=None,
1267
+ yes=True,
1268
+ dry_run=False,
1269
+ force=False,
1270
+ env_file=None,
1271
+ dataset=None,
1272
+ dataset_jsonl=None,
1273
+ dataset_display_name=None,
1274
+ dataset_builder=None,
1275
+ base_model=None,
1276
+ warm_start_from="accounts/acct123/models/ft-abc123",
1277
+ output_model=None,
1278
+ n=None,
1279
+ max_tokens=None,
1280
+ learning_rate=None,
1281
+ batch_size=None,
1282
+ epochs=None,
1283
+ lora_rank=None,
1284
+ max_context_length=None,
1285
+ chunk_size=None,
1286
+ eval_auto_carveout=None,
1287
+ skip_validation=True,
1288
+ ignore_docker=False,
1289
+ docker_build_extra="",
1290
+ docker_run_extra="",
1291
+ )
1292
+
1293
+ rc = cr.create_rft_command(args)
1294
+ assert rc == 0
1295
+ assert captured["jsonl_path"] is not None
1296
+ # Raw JSONL should NOT be uploaded; transformed EvaluationRow JSONL should be.
1297
+ assert os.path.abspath(captured["jsonl_path"]) != os.path.abspath(str(raw_jsonl))
1298
+ assert os.path.basename(captured["jsonl_path"]).endswith(".jsonl")
1299
+ # The transformed file should validate as EvaluationRow JSONL
1300
+ assert cr._validate_dataset_jsonl(captured["jsonl_path"])