eval-protocol 0.3.2__tar.gz → 0.3.2.dev1__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (473)
  1. {eval_protocol-0.3.2/eval_protocol.egg-info → eval_protocol-0.3.2.dev1}/PKG-INFO +3 -1
  2. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/fireworks_tracing.py +30 -24
  4. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/__init__.py +13 -0
  5. eval_protocol-0.3.2.dev1/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +174 -0
  6. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/remote_rollout_processor.py +14 -5
  7. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1/eval_protocol.egg-info}/PKG-INFO +3 -1
  8. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol.egg-info/SOURCES.txt +1 -0
  9. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol.egg-info/requires.txt +3 -0
  10. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/pyproject.toml +3 -0
  11. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/LICENSE +0 -0
  12. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/README.md +0 -0
  13. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/development/__init__.py +0 -0
  14. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/development/normalize_sandbox_fusion.py +0 -0
  15. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/development/utils/__init__.py +0 -0
  16. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/development/utils/generate_api_key.py +0 -0
  17. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/development/utils/subprocess_manager.py +0 -0
  18. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/__init__.py +0 -0
  19. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/__main__.py +0 -0
  20. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/__init__.py +0 -0
  21. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/base.py +0 -0
  22. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/bigquery.py +0 -0
  23. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/braintrust.py +0 -0
  24. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/huggingface.py +0 -0
  25. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/langchain.py +0 -0
  26. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/langfuse.py +0 -0
  27. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/langsmith.py +0 -0
  28. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/openai_responses.py +0 -0
  29. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/trl.py +0 -0
  30. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/utils.py +0 -0
  31. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/adapters/weave.py +0 -0
  32. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/__init__.py +0 -0
  33. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/models.py +0 -0
  34. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/orchestrator.py +0 -0
  35. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resource_abc.py +0 -0
  36. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resource_pool.py +0 -0
  37. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
  38. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  39. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  40. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  41. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  42. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  43. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/docker_resource.py +0 -0
  44. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  45. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  46. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
  47. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/task_manager.py +0 -0
  48. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/agent/tool_registry.py +0 -0
  49. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/auth.py +0 -0
  50. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/__init__.py +0 -0
  51. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
  52. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
  53. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/test_aime25.py +0 -0
  54. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
  55. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
  56. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/test_gpqa.py +0 -0
  57. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
  58. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
  59. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
  60. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli.py +0 -0
  61. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
  62. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  63. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/common.py +0 -0
  64. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/create_rft.py +0 -0
  65. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/deploy.py +0 -0
  66. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  67. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/export_docs.py +0 -0
  68. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/local_test.py +0 -0
  69. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/logs.py +0 -0
  70. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/preview.py +0 -0
  71. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  72. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/upload.py +0 -0
  73. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/cli_commands/utils.py +0 -0
  74. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/common_utils.py +0 -0
  75. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/config.py +0 -0
  76. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/data_loader/__init__.py +0 -0
  77. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
  78. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/data_loader/factory_data_loader.py +0 -0
  79. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/data_loader/inline_data_loader.py +0 -0
  80. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
  81. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/data_loader/models.py +0 -0
  82. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/dataset_logger/__init__.py +0 -0
  83. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  84. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  85. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  86. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
  87. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/datasets/__init__.py +0 -0
  88. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/datasets/loader.py +0 -0
  89. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/directory_utils.py +0 -0
  90. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/evaluation.py +0 -0
  91. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/event_bus/__init__.py +0 -0
  92. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/event_bus/event_bus.py +0 -0
  93. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/event_bus/logger.py +0 -0
  94. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  95. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  96. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/exceptions.py +0 -0
  97. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/execution/__init__.py +0 -0
  98. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/execution/pipeline.py +0 -0
  99. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/fireworks_rft.py +0 -0
  100. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/gcp_tools.py +0 -0
  101. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/generation/cache.py +0 -0
  102. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/generation/clients/base.py +0 -0
  103. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/generation/clients.py +0 -0
  104. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/generic_server.py +0 -0
  105. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/get_pep440_version.py +0 -0
  106. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/human_id/__init__.py +0 -0
  107. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/human_id/dictionary.py +0 -0
  108. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/integrations/__init__.py +0 -0
  109. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/integrations/deepeval.py +0 -0
  110. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/integrations/openai_rft.py +0 -0
  111. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/integrations/openeval.py +0 -0
  112. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/integrations/tinker_cookbook.py +0 -0
  113. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
  114. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/integrations/trl.py +0 -0
  115. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/__init__.py +0 -0
  116. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
  117. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
  118. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
  119. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
  120. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/init.py +0 -0
  121. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/rollout_context.py +0 -0
  122. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
  123. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/log_utils/util.py +0 -0
  124. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/logging_utils.py +0 -0
  125. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/__init__.py +0 -0
  126. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/adapter.py +0 -0
  127. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
  128. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/client/connection.py +0 -0
  129. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/clients.py +0 -0
  130. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
  131. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/execution/base_policy.py +0 -0
  132. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/execution/manager.py +0 -0
  133. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/execution/policy.py +0 -0
  134. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
  135. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
  136. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  137. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/mcpgym.py +0 -0
  138. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/process_manager.py +0 -0
  139. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
  140. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/session/manager.py +0 -0
  141. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/simple_process_manager.py +0 -0
  142. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp/simulation_server.py +0 -0
  143. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
  144. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_agent/config.py +0 -0
  145. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_agent/main.py +0 -0
  146. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  147. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  148. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  149. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  150. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_env.py +0 -0
  151. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/__init__.py +0 -0
  152. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
  153. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
  154. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
  155. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/README.md +0 -0
  156. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
  157. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
  158. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
  159. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
  160. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/server.py +0 -0
  161. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
  162. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
  163. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
  164. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
  165. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
  166. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/models.py +0 -0
  167. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/packaging.py +0 -0
  168. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/platform_api.py +0 -0
  169. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/playback_policy.py +0 -0
  170. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/__init__.py +0 -0
  171. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
  172. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/app.py +0 -0
  173. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/auth.py +0 -0
  174. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
  175. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
  176. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/main.py +0 -0
  177. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/models.py +0 -0
  178. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
  179. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/buffer.py +0 -0
  180. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  181. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  182. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
  183. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  184. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
  185. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
  186. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  187. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
  188. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
  189. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/evaluation_test.py +0 -0
  190. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
  191. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
  192. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/exception_config.py +0 -0
  193. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/execution.py +0 -0
  194. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
  195. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
  196. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/handle_persist_flow.py +0 -0
  197. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
  198. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
  199. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/parameterize.py +0 -0
  200. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/plugin.py +0 -0
  201. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/priority_scheduler.py +0 -0
  202. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/rollout_processor.py +0 -0
  203. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
  204. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/store_experiment_link.py +0 -0
  205. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/store_results_url.py +0 -0
  206. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/tracing_utils.py +0 -0
  207. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/types.py +0 -0
  208. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/pytest/validate_signature.py +0 -0
  209. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/__init__.py +0 -0
  210. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
  211. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
  212. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
  213. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
  214. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
  215. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
  216. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
  217. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/llm_judge.py +0 -0
  218. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
  219. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
  220. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
  221. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
  222. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/quickstart/utils.py +0 -0
  223. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/resources.py +0 -0
  224. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/reward_function.py +0 -0
  225. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/__init__.py +0 -0
  226. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/accuracy.py +0 -0
  227. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/accuracy_length.py +0 -0
  228. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  229. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  230. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/apps_testing_util.py +0 -0
  231. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/bfcl_reward.py +0 -0
  232. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/code_execution.py +0 -0
  233. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
  234. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/cpp_code.py +0 -0
  235. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  236. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/format.py +0 -0
  237. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/function_calling.py +0 -0
  238. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/json_schema.py +0 -0
  239. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/language_consistency.py +0 -0
  240. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/lean_prover.py +0 -0
  241. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/length.py +0 -0
  242. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  243. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/math.py +0 -0
  244. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  245. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/reasoning_steps.py +0 -0
  246. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/repetition.py +0 -0
  247. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rewards/tag_count.py +0 -0
  248. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/rl_processing.py +0 -0
  249. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/server.py +0 -0
  250. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/stats/__init__.py +0 -0
  251. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/stats/confidence_intervals.py +0 -0
  252. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/training/__init__.py +0 -0
  253. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/training/gepa_trainer.py +0 -0
  254. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/training/gepa_utils.py +0 -0
  255. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/training/trainer.py +0 -0
  256. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/training/utils.py +0 -0
  257. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/typed_interface.py +0 -0
  258. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/types/__init__.py +0 -0
  259. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/types/errors.py +0 -0
  260. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/types/remote_rollout_processor.py +0 -0
  261. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/types/types.py +0 -0
  262. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/__init__.py +0 -0
  263. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
  264. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/batch_transformation.py +0 -0
  265. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/browser_utils.py +0 -0
  266. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/check_server_status.py +0 -0
  267. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
  268. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/evaluation_row_utils.py +0 -0
  269. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/logs_models.py +0 -0
  270. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/logs_server.py +0 -0
  271. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/module_loader.py +0 -0
  272. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
  273. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/show_results_url.py +0 -0
  274. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/static_policy.py +0 -0
  275. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/subprocess_utils.py +0 -0
  276. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol/utils/vite_server.py +0 -0
  277. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
  278. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
  279. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
  280. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/setup.cfg +0 -0
  281. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/setup.py +0 -0
  282. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_accuracy.py +0 -0
  283. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_accuracy_length.py +0 -0
  284. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_adapters_e2e.py +0 -0
  285. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_agent_orchestrator.py +0 -0
  286. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_agent_resources.py +0 -0
  287. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_auth.py +0 -0
  288. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_batch_evaluation.py +0 -0
  289. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_cli.py +0 -0
  290. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_cli_agent.py +0 -0
  291. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_cli_args.py +0 -0
  292. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_cli_create_rft.py +0 -0
  293. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_cli_local_test.py +0 -0
  294. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_code_execution.py +0 -0
  295. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_config.py +0 -0
  296. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_control_plane_separation.py +0 -0
  297. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_cpp_code.py +0 -0
  298. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_data_driven_task_manager.py +0 -0
  299. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_deepcoder_reward.py +0 -0
  300. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_deepeval_integration.py +0 -0
  301. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_deploy_integration.py +0 -0
  302. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_directory_utils.py +0 -0
  303. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_e2b_integration.py +0 -0
  304. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_e2b_js_integration.py +0 -0
  305. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_edge_cases.py +0 -0
  306. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_ep_upload_e2e.py +0 -0
  307. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_eval_protocol_import.py +0 -0
  308. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_evaluation.py +0 -0
  309. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_evaluation_integration.py +0 -0
  310. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_evaluation_postprocess.py +0 -0
  311. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_evaluation_preview_integration.py +0 -0
  312. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_event_bus.py +0 -0
  313. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_event_bus_helper.py +0 -0
  314. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_examples_end_to_end.py +0 -0
  315. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_exception_config.py +0 -0
  316. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_exceptions.py +0 -0
  317. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_fireworks_api.py +0 -0
  318. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_format.py +0 -0
  319. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_fractional_code.py +0 -0
  320. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_function_calling.py +0 -0
  321. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_gcp_tools.py +0 -0
  322. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_generic_server.py +0 -0
  323. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_human_id.py +0 -0
  324. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_integration.py +0 -0
  325. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_json_schema.py +0 -0
  326. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_kwargs_validation.py +0 -0
  327. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_language_consistency.py +0 -0
  328. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_lean_prover.py +0 -0
  329. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_lean_prover_runner.py +0 -0
  330. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_length.py +0 -0
  331. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_list_comparison_math_reward.py +0 -0
  332. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_litellm_policy_provider_fields.py +0 -0
  333. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_logs_server.py +0 -0
  334. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_logs_server_simple.py +0 -0
  335. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_math.py +0 -0
  336. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_message_field_filtering.py +0 -0
  337. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_minimal.py +0 -0
  338. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_models.py +0 -0
  339. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_models_rl.py +0 -0
  340. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
  341. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_n_variant_batch_integration.py +0 -0
  342. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_n_variant_integration.py +0 -0
  343. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_openai_compatibility.py +0 -0
  344. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_openai_rft_integration.py +0 -0
  345. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_openeval_integration.py +0 -0
  346. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_packaging.py +0 -0
  347. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_parallel_rollouts.py +0 -0
  348. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_platform_api.py +0 -0
  349. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_priority_scheduler.py +0 -0
  350. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_quickstart_utils.py +0 -0
  351. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_readiness.py +0 -0
  352. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_reasoning_steps.py +0 -0
  353. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_repetition.py +0 -0
  354. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_repetition_debug.py +0 -0
  355. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_retry_mechanism.py +0 -0
  356. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_reward_function.py +0 -0
  357. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_reward_protocol_import.py +0 -0
  358. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_rl_processing.py +0 -0
  359. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_rollout_control_plane_integration.py +0 -0
  360. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_server.py +0 -0
  361. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_show_results_url.py +0 -0
  362. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_sqlite_hardening.py +0 -0
  363. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_status_migration_changes.py +0 -0
  364. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_status_migration_integration.py +0 -0
  365. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_status_model.py +0 -0
  366. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_tag_count.py +0 -0
  367. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_tau_bench_airline_smoke.py +0 -0
  368. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_training_utils.py +0 -0
  369. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_typed_interface.py +0 -0
  370. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_typed_interface_rl.py +0 -0
  371. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_upload_entrypoint.py +0 -0
  372. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_url_handling.py +0 -0
  373. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/tests/test_vite_server.py +0 -0
  374. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/__init__.py +0 -0
  375. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/agent/__init__.py +0 -0
  376. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/agent/base.py +0 -0
  377. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/agent/llm_agent.py +0 -0
  378. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/api_service/__init__.py +0 -0
  379. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/api_service/api_config.py +0 -0
  380. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/api_service/data_model.py +0 -0
  381. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
  382. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/cli.py +0 -0
  383. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/config.py +0 -0
  384. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/airline/policy.md +0 -0
  385. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/mock/policy.md +0 -0
  386. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
  387. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/retail/policy.md +0 -0
  388. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
  389. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
  390. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
  391. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
  392. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
  393. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
  394. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
  395. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data_model/__init__.py +0 -0
  396. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data_model/message.py +0 -0
  397. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data_model/simulation.py +0 -0
  398. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/data_model/tasks.py +0 -0
  399. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/__init__.py +0 -0
  400. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
  401. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/airline/data_model.py +0 -0
  402. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
  403. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/airline/tools.py +0 -0
  404. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
  405. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
  406. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/mock/data_model.py +0 -0
  407. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/mock/environment.py +0 -0
  408. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/mock/tools.py +0 -0
  409. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
  410. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
  411. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/retail/data_model.py +0 -0
  412. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
  413. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/retail/tools.py +0 -0
  414. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
  415. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
  416. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/data_model.py +0 -0
  417. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/environment.py +0 -0
  418. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  419. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  420. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  421. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  422. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  423. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  424. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  425. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  426. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/tools.py +0 -0
  427. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  428. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  429. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
  430. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/environment/__init__.py +0 -0
  431. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/environment/db.py +0 -0
  432. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/environment/environment.py +0 -0
  433. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/environment/server.py +0 -0
  434. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/environment/tool.py +0 -0
  435. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/environment/toolkit.py +0 -0
  436. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  437. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/evaluator/__init__.py +0 -0
  438. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/evaluator/evaluator.py +0 -0
  439. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  440. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  441. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  442. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  443. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  444. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/metrics/__init__.py +0 -0
  445. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/metrics/agent_metrics.py +0 -0
  446. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  447. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/orchestrator/__init__.py +0 -0
  448. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  449. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  450. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/orchestrator/utils.py +0 -0
  451. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/registry.py +0 -0
  452. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/run.py +0 -0
  453. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/scripts/__init__.py +0 -0
  454. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/scripts/check_data.py +0 -0
  455. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  456. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/scripts/start_servers.py +0 -0
  457. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/scripts/view_simulations.py +0 -0
  458. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/user/__init__.py +0 -0
  459. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/user/base.py +0 -0
  460. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/user/user_simulator.py +0 -0
  461. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/utils/__init__.py +0 -0
  462. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/utils/display.py +0 -0
  463. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/utils/io_utils.py +0 -0
  464. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/utils/llm_utils.py +0 -0
  465. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/utils/pydantic_utils.py +0 -0
  466. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vendor/tau2/utils/utils.py +0 -0
  467. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/versioneer.py +0 -0
  468. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  469. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vite-app/dist/assets/index-CuQbfdPD.js +0 -0
  470. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vite-app/dist/assets/index-CuQbfdPD.js.map +0 -0
  471. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
  472. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
  473. {eval_protocol-0.3.2 → eval_protocol-0.3.2.dev1}/vite-app/dist/index.html +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.2
3
+ Version: 0.3.2.dev1
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -103,6 +103,8 @@ Provides-Extra: openenv
103
103
  Requires-Dist: openenv-core; extra == "openenv"
104
104
  Provides-Extra: dspy
105
105
  Requires-Dist: dspy>=3.0.0; extra == "dspy"
106
+ Provides-Extra: klavis
107
+ Requires-Dist: klavis>=2.18.0; extra == "klavis"
106
108
  Provides-Extra: langgraph
107
109
  Requires-Dist: langgraph>=0.6.7; extra == "langgraph"
108
110
  Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-12-23T14:49:24-0800",
11
+ "date": "2026-01-03T20:47:59-0800",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "bd1be9570f0193a6d8277e62c9dfa3a0d9950652",
15
- "version": "0.3.2"
14
+ "full-revisionid": "aee419378ff01cfd8ff38b185ff5527a6550a83d",
15
+ "version": "0.3.2.dev.1"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -264,6 +264,7 @@ class FireworksTracingAdapter(BaseAdapter):
264
264
  self.project_id = project_id
265
265
  self.base_url = base_url.rstrip("/")
266
266
  self.timeout = timeout
267
+ self._session = requests.Session()
267
268
 
268
269
  def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
269
270
  """Fetch logs from Fireworks tracing gateway /logs endpoint.
@@ -287,14 +288,14 @@ class FireworksTracingAdapter(BaseAdapter):
287
288
  last_error: Optional[str] = None
288
289
  for url in urls_to_try:
289
290
  try:
290
- response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
291
- if response.status_code == 404:
292
- # Try next variant
293
- last_error = f"404 for {url}"
294
- continue
295
- response.raise_for_status()
296
- data = response.json() or {}
297
- break
291
+ with self._session.get(url, params=params, timeout=self.timeout, headers=headers) as response:
292
+ if response.status_code == 404:
293
+ # Try next variant (must close response to release connection)
294
+ last_error = f"404 for {url}"
295
+ continue
296
+ response.raise_for_status()
297
+ data = response.json() or {}
298
+ break
298
299
  except requests.exceptions.RequestException as e:
299
300
  last_error = str(e)
300
301
  continue
@@ -412,22 +413,20 @@ class FireworksTracingAdapter(BaseAdapter):
412
413
 
413
414
  result = None
414
415
  try:
415
- response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
416
- response.raise_for_status()
417
- result = response.json()
418
- except requests.exceptions.HTTPError as e:
419
- error_msg = str(e)
420
-
421
- # Try to extract detail message from response
422
- if e.response is not None:
423
- try:
424
- error_detail = e.response.json().get("detail", {})
425
- error_msg = error_detail or e.response.text
426
- except Exception: # In case e.response.json() fails
427
- error_msg = f"Proxy error: {e.response.text}"
428
-
429
- logger.error("Failed to fetch traces from proxy (HTTP %s): %s", e.response.status_code, error_msg)
430
- return eval_rows
416
+ with self._session.get(url, params=params, timeout=self.timeout, headers=headers) as response:
417
+ if response.status_code >= 400:
418
+ error_msg: str = response.text
419
+ try:
420
+ payload = response.json()
421
+ if isinstance(payload, dict) and "detail" in payload:
422
+ detail = payload.get("detail")
423
+ if detail:
424
+ error_msg = str(detail)
425
+ except Exception:
426
+ pass
427
+ logger.error("Failed to fetch traces from proxy (HTTP %s): %s", response.status_code, error_msg)
428
+ return eval_rows
429
+ result = response.json()
431
430
  except requests.exceptions.RequestException as e:
432
431
  # Non-HTTP errors (network issues, timeouts, etc.)
433
432
  logger.error("Failed to fetch traces from proxy: %s", str(e))
@@ -451,3 +450,10 @@ class FireworksTracingAdapter(BaseAdapter):
451
450
 
452
451
  logger.info("Successfully converted %d traces to evaluation rows", len(eval_rows))
453
452
  return eval_rows
453
+
454
+ def close(self) -> None:
455
+ """Close underlying HTTP resources."""
456
+ try:
457
+ self._session.close()
458
+ except Exception:
459
+ pass
@@ -11,6 +11,15 @@ from .rollout_processor import RolloutProcessor
11
11
  from .rollout_result_post_processor import RolloutResultPostProcessor, NoOpRolloutResultPostProcessor
12
12
  from .types import RolloutProcessorConfig
13
13
 
14
+ # Conditional import for optional Klavis dependency
15
+ try:
16
+ from .default_klavis_sandbox_rollout_processor import KlavisSandboxRolloutProcessor
17
+
18
+ KLAVIS_AVAILABLE = True
19
+ except ImportError:
20
+ KLAVIS_AVAILABLE = False
21
+ KlavisSandboxRolloutProcessor = None
22
+
14
23
  # Conditional import for optional dependencies
15
24
  try:
16
25
  from .default_pydantic_ai_rollout_processor import PydanticAgentRolloutProcessor
@@ -47,6 +56,10 @@ __all__ = [
47
56
  "NoOpRolloutResultPostProcessor",
48
57
  ]
49
58
 
59
+ # Only add to __all__ if available
60
+ if KLAVIS_AVAILABLE:
61
+ __all__.append("KlavisSandboxRolloutProcessor")
62
+
50
63
  # Only add to __all__ if available
51
64
  if PYDANTIC_AI_AVAILABLE:
52
65
  __all__.append("PydanticAgentRolloutProcessor")
@@ -0,0 +1,174 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import tempfile
6
+ import time
7
+ from typing import Any, Callable, Dict, List, Optional
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+ from eval_protocol.models import EvaluationRow
12
+ from eval_protocol.pytest.rollout_processor import RolloutProcessor
13
+ from eval_protocol.pytest.types import RolloutProcessorConfig
14
+
15
+ from eval_protocol.pytest.default_agent_rollout_processor import Agent
16
+ from klavis import Klavis
17
+ from klavis.types import CreateSandboxResponse, SandboxMcpServer
18
+ from openai.types import CompletionUsage
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ class KlavisSandboxRolloutProcessor(RolloutProcessor):
24
+ def __init__(
25
+ self,
26
+ server_name: str,
27
+ initialize_data_factory: Optional[Callable[[EvaluationRow], Dict[str, Any]]] = None,
28
+ ):
29
+ super().__init__()
30
+ self.server_name = server_name
31
+ self.initialize_data_factory = initialize_data_factory
32
+ self.klavis_client = Klavis(api_key=os.environ.get("KLAVIS_API_KEY"))
33
+
34
+ def _init_sandbox(self) -> CreateSandboxResponse:
35
+ try:
36
+ server_name_enum = SandboxMcpServer(self.server_name)
37
+ return self.klavis_client.sandbox.create_sandbox(server_name=server_name_enum)
38
+ except Exception as e:
39
+ logger.error(f"Error creating sandbox: {str(e)}", exc_info=True)
40
+ raise
41
+
42
+ @staticmethod
43
+ def create_mcp_config(server_url: str, server_key: str = "main", auth_token: str | None = None) -> str:
44
+ """Create a temporary MCP config file and return its path."""
45
+ config = {
46
+ "mcpServers": {
47
+ server_key: {
48
+ "url": server_url,
49
+ "transport": "streamable_http",
50
+ **({"authorization": f"Bearer {auth_token}"} if auth_token else {})
51
+ }
52
+ }
53
+ }
54
+
55
+ # Create a temp file that persists for the session
56
+ fd, path = tempfile.mkstemp(suffix=".json", prefix="mcp_config_")
57
+ with os.fdopen(fd, 'w') as f:
58
+ json.dump(config, f)
59
+ return path
60
+
61
+ def __call__(
62
+ self, rows: List[EvaluationRow], config: RolloutProcessorConfig
63
+ ) -> List[asyncio.Task[EvaluationRow]]:
64
+ """Process evaluation rows with Klavis sandbox lifecycle management"""
65
+ semaphore = config.semaphore
66
+
67
+ async def process_row(row: EvaluationRow) -> EvaluationRow:
68
+ """Process a single row with complete sandbox lifecycle"""
69
+
70
+ start_time = time.perf_counter()
71
+ agent: Agent | None = None
72
+ temp_config_path: str | None = None
73
+ sandbox: CreateSandboxResponse | None = None
74
+
75
+ try:
76
+ # Step 0: Create a sandbox for this row
77
+ sandbox = self._init_sandbox()
78
+ logger.info(f"Sandbox created: {sandbox}")
79
+
80
+ # Step 1: Initialize data in the sandbox
81
+ init_data: Dict[str, Any] | None = None
82
+ if self.initialize_data_factory:
83
+ init_data = self.initialize_data_factory(row)
84
+ else:
85
+ # Allow datasets to provide initialization payload directly
86
+ init_data = (
87
+ (row.input_metadata.session_data or {}).get("initialize_data")
88
+ if row.input_metadata is not None
89
+ else None
90
+ )
91
+
92
+ if init_data:
93
+ logger.info(f"Initializing {self.server_name} sandbox {sandbox.sandbox_id}")
94
+ initialize_method = getattr(
95
+ self.klavis_client.sandbox, f"initialize_{sandbox.server_name.value}_sandbox"
96
+ )
97
+ init_response = initialize_method(sandbox_id=sandbox.sandbox_id, **init_data)
98
+ logger.info(f"Initialization response: {init_response}")
99
+
100
+ # Step 2: Create temporary MCP config with sandbox URL
101
+ temp_config_path = self.create_mcp_config(
102
+ server_url=sandbox.server_url, server_key=sandbox.server_name.value
103
+ )
104
+ logger.info(f"MCP config created: {temp_config_path}")
105
+
106
+ # Step 3: Run agent with sandbox MCP server
107
+ logger.info(f"Running agent for row {row.execution_metadata.rollout_id} with {self.server_name} sandbox")
108
+ agent = Agent(
109
+ model=row.input_metadata.completion_params["model"],
110
+ row=row,
111
+ config_path=temp_config_path,
112
+ logger=config.logger,
113
+ )
114
+ await agent.setup()
115
+ await agent.call_agent()
116
+
117
+ # Update usage metadata
118
+ row.execution_metadata.usage = CompletionUsage(
119
+ prompt_tokens=agent.usage.get("prompt_tokens", 0),
120
+ completion_tokens=agent.usage.get("completion_tokens", 0),
121
+ total_tokens=agent.usage.get("total_tokens", 0),
122
+ )
123
+ row = agent.evaluation_row
124
+ logger.info(f"Agent execution completed for row {row.execution_metadata.rollout_id}")
125
+
126
+ # Step 4: Export sandbox data
127
+ dump_method = getattr(self.klavis_client.sandbox, f"dump_{sandbox.server_name.value}_sandbox")
128
+ dump_response = dump_method(sandbox_id=sandbox.sandbox_id)
129
+ sandbox_data = dump_response.data
130
+ logger.info(f"Sandbox data: {sandbox_data}")
131
+
132
+ # Store sandbox data in row metadata for evaluation
133
+ if not row.execution_metadata.extra:
134
+ row.execution_metadata.extra = {}
135
+ row.execution_metadata.extra["sandbox_data"] = sandbox_data
136
+ row.execution_metadata.extra["sandbox_id"] = sandbox.sandbox_id
137
+ row.execution_metadata.extra["server_name"] = self.server_name
138
+
139
+ except Exception as e:
140
+ logger.error(f"Error processing row {row.execution_metadata.rollout_id}: {str(e)}", exc_info=True)
141
+ if not row.execution_metadata.extra:
142
+ row.execution_metadata.extra = {}
143
+ row.execution_metadata.extra["error"] = str(e)
144
+ raise
145
+
146
+ finally:
147
+ # Cleanup agent MCP client and temp config
148
+ if agent and agent.mcp_client:
149
+ await agent.mcp_client.cleanup()
150
+ if temp_config_path and os.path.exists(temp_config_path):
151
+ os.unlink(temp_config_path)
152
+
153
+ # Release sandbox
154
+ if sandbox and sandbox.sandbox_id:
155
+ try:
156
+ self.klavis_client.sandbox.delete_sandbox(
157
+ server_name=sandbox.server_name, sandbox_id=sandbox.sandbox_id
158
+ )
159
+ logger.info(f"Sandbox {sandbox.sandbox_id} released successfully")
160
+ except Exception as e:
161
+ logger.error(f"Error releasing sandbox {sandbox.sandbox_id}: {str(e)}", exc_info=True)
162
+
163
+ row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
164
+
165
+ return row
166
+
167
+ async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
168
+ async with semaphore:
169
+ result = await process_row(r)
170
+ return result
171
+
172
+ # Create and return tasks
173
+ tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
174
+ return tasks
@@ -54,6 +54,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
54
54
  self._timeout_seconds = timeout_seconds
55
55
  self._output_data_loader = output_data_loader or default_fireworks_output_data_loader
56
56
  self._tracing_adapter = FireworksTracingAdapter(base_url=self._model_base_url)
57
+ self._session = requests.Session()
57
58
 
58
59
  def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
59
60
  tasks: List[asyncio.Task[EvaluationRow]] = []
@@ -94,8 +95,8 @@ class RemoteRolloutProcessor(RolloutProcessor):
94
95
  def _post_init() -> None:
95
96
  url = f"{remote_base_url}/init"
96
97
  try:
97
- r = requests.post(url, json=init_payload.model_dump(), timeout=300)
98
- r.raise_for_status()
98
+ with self._session.post(url, json=init_payload.model_dump(), timeout=300) as r:
99
+ r.raise_for_status()
99
100
  except requests.exceptions.Timeout:
100
101
  raise TimeoutError(
101
102
  f"The /init endpoint tried {url} with {init_payload.model_dump()} but timed out after 300 seconds."
@@ -108,9 +109,9 @@ class RemoteRolloutProcessor(RolloutProcessor):
108
109
 
109
110
  def _get_status() -> Dict[str, Any]:
110
111
  url = f"{remote_base_url}/status"
111
- r = requests.get(url, params={"rollout_id": row.execution_metadata.rollout_id}, timeout=15)
112
- r.raise_for_status()
113
- return r.json()
112
+ with self._session.get(url, params={"rollout_id": row.execution_metadata.rollout_id}, timeout=15) as r:
113
+ r.raise_for_status()
114
+ return r.json()
114
115
 
115
116
  continue_polling_status = True
116
117
  while time.time() < deadline:
@@ -204,4 +205,12 @@ class RemoteRolloutProcessor(RolloutProcessor):
204
205
  return tasks
205
206
 
206
207
  def cleanup(self) -> None:
208
+ try:
209
+ self._tracing_adapter.close()
210
+ except Exception:
211
+ pass
212
+ try:
213
+ self._session.close()
214
+ except Exception:
215
+ pass
207
216
  return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.3.2
3
+ Version: 0.3.2.dev1
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -103,6 +103,8 @@ Provides-Extra: openenv
103
103
  Requires-Dist: openenv-core; extra == "openenv"
104
104
  Provides-Extra: dspy
105
105
  Requires-Dist: dspy>=3.0.0; extra == "dspy"
106
+ Provides-Extra: klavis
107
+ Requires-Dist: klavis>=2.18.0; extra == "klavis"
106
108
  Provides-Extra: langgraph
107
109
  Requires-Dist: langgraph>=0.6.7; extra == "langgraph"
108
110
  Requires-Dist: langchain-core>=0.3.75; extra == "langgraph"
@@ -193,6 +193,7 @@ eval_protocol/pytest/__init__.py
193
193
  eval_protocol/pytest/buffer.py
194
194
  eval_protocol/pytest/default_agent_rollout_processor.py
195
195
  eval_protocol/pytest/default_dataset_adapter.py
196
+ eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py
196
197
  eval_protocol/pytest/default_langchain_rollout_processor.py
197
198
  eval_protocol/pytest/default_mcp_gym_rollout_processor.py
198
199
  eval_protocol/pytest/default_no_op_rollout_processor.py
@@ -77,6 +77,9 @@ dspy>=3.0.0
77
77
  datasets>=3.0.0
78
78
  transformers>=4.0.0
79
79
 
80
+ [klavis]
81
+ klavis>=2.18.0
82
+
80
83
  [langchain]
81
84
  langchain-core>=0.3.0
82
85
 
@@ -134,6 +134,9 @@ openenv = [
134
134
  dspy = [
135
135
  "dspy>=3.0.0",
136
136
  ]
137
+ klavis = [
138
+ "klavis>=2.18.0",
139
+ ]
137
140
 
138
141
  # Optional deps for LangGraph example/tests
139
142
  langgraph = [