eval-protocol 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. {eval_protocol-0.2.3/eval_protocol.egg-info → eval_protocol-0.2.5}/PKG-INFO +18 -9
  2. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/README.md +12 -6
  3. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/_version.py +3 -3
  4. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/langfuse.py +120 -135
  5. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli.py +7 -0
  6. eval_protocol-0.2.5/eval_protocol/cli_commands/logs.py +29 -0
  7. eval_protocol-0.2.5/eval_protocol/dataset_logger/__init__.py +3 -0
  8. eval_protocol-0.2.5/eval_protocol/dataset_logger/dataset_logger.py +35 -0
  9. eval_protocol-0.2.5/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +114 -0
  10. eval_protocol-0.2.5/eval_protocol/human_id/__init__.py +34 -0
  11. eval_protocol-0.2.5/eval_protocol/human_id/dictionary.py +507 -0
  12. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/client/connection.py +19 -1
  13. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/manager.py +3 -38
  14. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/mcp_multi_client.py +48 -24
  15. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/mcpgym.py +33 -2
  16. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/session/manager.py +7 -9
  17. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_env.py +28 -12
  18. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/models.py +58 -6
  19. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_agent_rollout_processor.py +48 -34
  20. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +17 -19
  21. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/evaluation_test.py +145 -68
  22. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/types/types.py +4 -18
  23. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/__init__.py +5 -0
  24. eval_protocol-0.2.5/eval_protocol/utils/logs_server.py +295 -0
  25. eval_protocol-0.2.5/eval_protocol/utils/vite_server.py +112 -0
  26. {eval_protocol-0.2.3 → eval_protocol-0.2.5/eval_protocol.egg-info}/PKG-INFO +18 -9
  27. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/SOURCES.txt +8 -0
  28. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/requires.txt +4 -1
  29. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/pyproject.toml +6 -2
  30. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_parallel_rollouts.py +2 -2
  31. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_rollout_control_plane_integration.py +10 -2
  32. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_url_handling.py +26 -12
  33. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/LICENSE +0 -0
  34. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/__init__.py +0 -0
  35. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/normalize_sandbox_fusion.py +0 -0
  36. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/utils/__init__.py +0 -0
  37. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/utils/generate_api_key.py +0 -0
  38. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/development/utils/subprocess_manager.py +0 -0
  39. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/__init__.py +0 -0
  40. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/__main__.py +0 -0
  41. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/__init__.py +0 -0
  42. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/braintrust.py +0 -0
  43. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/huggingface.py +0 -0
  44. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/adapters/trl.py +0 -0
  45. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/__init__.py +0 -0
  46. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/models.py +0 -0
  47. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/orchestrator.py +0 -0
  48. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resource_abc.py +0 -0
  49. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resource_pool.py +0 -0
  50. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/__init__.py +0 -0
  51. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  52. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  53. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  54. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  55. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  56. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/docker_resource.py +0 -0
  57. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  58. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  59. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  60. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  61. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/resources/sql_resource.py +0 -0
  62. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/task_manager.py +0 -0
  63. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/agent/tool_registry.py +0 -0
  64. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/auth.py +0 -0
  65. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/__init__.py +0 -0
  66. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  67. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/common.py +0 -0
  68. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/deploy.py +0 -0
  69. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  70. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/preview.py +0 -0
  71. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  72. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/common_utils.py +0 -0
  73. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/config.py +0 -0
  74. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/datasets/__init__.py +0 -0
  75. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/datasets/loader.py +0 -0
  76. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/evaluation.py +0 -0
  77. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/execution/__init__.py +0 -0
  78. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/execution/pipeline.py +0 -0
  79. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/gcp_tools.py +0 -0
  80. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generation/cache.py +0 -0
  81. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generation/clients/base.py +0 -0
  82. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generation/clients.py +0 -0
  83. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/generic_server.py +0 -0
  84. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/__init__.py +0 -0
  85. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/braintrust.py +0 -0
  86. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/deepeval.py +0 -0
  87. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/openeval.py +0 -0
  88. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/integrations/trl.py +0 -0
  89. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/__init__.py +0 -0
  90. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/adapter.py +0 -0
  91. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/client/__init__.py +0 -0
  92. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/clients.py +0 -0
  93. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/__init__.py +0 -0
  94. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/base_policy.py +0 -0
  95. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/execution/policy.py +0 -0
  96. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/grid_renderer.py +0 -0
  97. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/process_manager.py +0 -0
  98. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/session/__init__.py +0 -0
  99. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/simple_process_manager.py +0 -0
  100. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp/simulation_server.py +0 -0
  101. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/__init__.py +0 -0
  102. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/config.py +0 -0
  103. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  104. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/main.py +0 -0
  105. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  106. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  107. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  108. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  109. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  110. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/mcp_agent/session.py +0 -0
  111. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/packaging.py +0 -0
  112. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/platform_api.py +0 -0
  113. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/playback_policy.py +0 -0
  114. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/__init__.py +0 -0
  115. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  116. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  117. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  118. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/types.py +0 -0
  119. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/pytest/utils.py +0 -0
  120. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/resources.py +0 -0
  121. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/reward_function.py +0 -0
  122. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/__init__.py +0 -0
  123. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/accuracy.py +0 -0
  124. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/accuracy_length.py +0 -0
  125. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  126. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  127. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/apps_testing_util.py +0 -0
  128. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/bfcl_reward.py +0 -0
  129. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/code_execution.py +0 -0
  130. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/code_execution_utils.py +0 -0
  131. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/cpp_code.py +0 -0
  132. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  133. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/format.py +0 -0
  134. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/function_calling.py +0 -0
  135. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/json_schema.py +0 -0
  136. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/language_consistency.py +0 -0
  137. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/lean_prover.py +0 -0
  138. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/length.py +0 -0
  139. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  140. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/math.py +0 -0
  141. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  142. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/reasoning_steps.py +0 -0
  143. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/repetition.py +0 -0
  144. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rewards/tag_count.py +0 -0
  145. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/rl_processing.py +0 -0
  146. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/server.py +0 -0
  147. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/typed_interface.py +0 -0
  148. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/types/__init__.py +0 -0
  149. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/batch_evaluation.py +0 -0
  150. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/batch_transformation.py +0 -0
  151. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/dataset_helpers.py +0 -0
  152. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/module_loader.py +0 -0
  153. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/packaging_utils.py +0 -0
  154. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol/utils/static_policy.py +0 -0
  155. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/dependency_links.txt +0 -0
  156. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/entry_points.txt +0 -0
  157. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/eval_protocol.egg-info/top_level.txt +0 -0
  158. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/setup.cfg +0 -0
  159. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/setup.py +0 -0
  160. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_accuracy.py +0 -0
  161. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_accuracy_length.py +0 -0
  162. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_adapters_e2e.py +0 -0
  163. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_agent_orchestrator.py +0 -0
  164. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_agent_resources.py +0 -0
  165. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_auth.py +0 -0
  166. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_batch_evaluation.py +0 -0
  167. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_braintrust_adapter.py +0 -0
  168. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_braintrust_example.py +0 -0
  169. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cli.py +0 -0
  170. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cli_agent.py +0 -0
  171. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cli_args.py +0 -0
  172. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_code_execution.py +0 -0
  173. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_config.py +0 -0
  174. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_control_plane_separation.py +0 -0
  175. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_cpp_code.py +0 -0
  176. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_data_driven_task_manager.py +0 -0
  177. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_deepcoder_reward.py +0 -0
  178. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_deepeval_integration.py +0 -0
  179. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_deploy_integration.py +0 -0
  180. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_e2b_integration.py +0 -0
  181. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_e2b_js_integration.py +0 -0
  182. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_edge_cases.py +0 -0
  183. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_eval_protocol_import.py +0 -0
  184. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_evaluation.py +0 -0
  185. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_evaluation_integration.py +0 -0
  186. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_evaluation_preview_integration.py +0 -0
  187. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_examples_end_to_end.py +0 -0
  188. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_fireworks_api.py +0 -0
  189. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_format.py +0 -0
  190. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_fractional_code.py +0 -0
  191. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_frozen_lake_http_server.py +0 -0
  192. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  193. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_function_calling.py +0 -0
  194. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_gcp_tools.py +0 -0
  195. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_generic_server.py +0 -0
  196. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_integration.py +0 -0
  197. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_json_schema.py +0 -0
  198. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_kwargs_validation.py +0 -0
  199. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_language_consistency.py +0 -0
  200. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_lean_prover.py +0 -0
  201. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_lean_prover_runner.py +0 -0
  202. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_length.py +0 -0
  203. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_list_comparison_math_reward.py +0 -0
  204. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_math.py +0 -0
  205. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_minimal.py +0 -0
  206. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_models.py +0 -0
  207. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_models_rl.py +0 -0
  208. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_multiple_choice_math_reward.py +0 -0
  209. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_n_variant_batch_integration.py +0 -0
  210. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_n_variant_integration.py +0 -0
  211. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_openai_compatibility.py +0 -0
  212. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_openeval_integration.py +0 -0
  213. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_packaging.py +0 -0
  214. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_platform_api.py +0 -0
  215. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_readiness.py +0 -0
  216. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_reasoning_steps.py +0 -0
  217. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_repetition.py +0 -0
  218. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_repetition_debug.py +0 -0
  219. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_reward_function.py +0 -0
  220. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_reward_protocol_import.py +0 -0
  221. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_rl_processing.py +0 -0
  222. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_server.py +0 -0
  223. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_tag_count.py +0 -0
  224. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_typed_interface.py +0 -0
  225. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/tests/test_typed_interface_rl.py +0 -0
  226. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/__init__.py +0 -0
  227. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/agent/__init__.py +0 -0
  228. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/agent/base.py +0 -0
  229. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/agent/llm_agent.py +0 -0
  230. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/__init__.py +0 -0
  231. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/api_config.py +0 -0
  232. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/data_model.py +0 -0
  233. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/api_service/simulation_service.py +0 -0
  234. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/cli.py +0 -0
  235. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/config.py +0 -0
  236. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/__init__.py +0 -0
  237. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/message.py +0 -0
  238. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/simulation.py +0 -0
  239. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/data_model/tasks.py +0 -0
  240. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/__init__.py +0 -0
  241. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/__init__.py +0 -0
  242. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/data_model.py +0 -0
  243. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/environment.py +0 -0
  244. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/tools.py +0 -0
  245. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/airline/utils.py +0 -0
  246. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/__init__.py +0 -0
  247. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/data_model.py +0 -0
  248. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/environment.py +0 -0
  249. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/tools.py +0 -0
  250. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/mock/utils.py +0 -0
  251. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/__init__.py +0 -0
  252. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/data_model.py +0 -0
  253. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/environment.py +0 -0
  254. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/tools.py +0 -0
  255. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/retail/utils.py +0 -0
  256. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/__init__.py +0 -0
  257. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/data_model.py +0 -0
  258. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/environment.py +0 -0
  259. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  260. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  261. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  262. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  263. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  264. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  265. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  266. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  267. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/tools.py +0 -0
  268. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  269. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  270. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/domains/telecom/utils.py +0 -0
  271. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/__init__.py +0 -0
  272. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/db.py +0 -0
  273. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/environment.py +0 -0
  274. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/server.py +0 -0
  275. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/tool.py +0 -0
  276. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/toolkit.py +0 -0
  277. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  278. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/__init__.py +0 -0
  279. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator.py +0 -0
  280. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  281. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  282. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  283. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  284. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  285. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/metrics/__init__.py +0 -0
  286. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/metrics/agent_metrics.py +0 -0
  287. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  288. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/__init__.py +0 -0
  289. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  290. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  291. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/orchestrator/utils.py +0 -0
  292. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/registry.py +0 -0
  293. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/run.py +0 -0
  294. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/__init__.py +0 -0
  295. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/check_data.py +0 -0
  296. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  297. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/start_servers.py +0 -0
  298. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/scripts/view_simulations.py +0 -0
  299. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/user/__init__.py +0 -0
  300. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/user/base.py +0 -0
  301. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/user/user_simulator.py +0 -0
  302. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/__init__.py +0 -0
  303. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/display.py +0 -0
  304. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/io_utils.py +0 -0
  305. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/llm_utils.py +0 -0
  306. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/pydantic_utils.py +0 -0
  307. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/vendor/tau2/utils/utils.py +0 -0
  308. {eval_protocol-0.2.3 → eval_protocol-0.2.5}/versioneer.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
- License-Expression: Apache-2.0
6
+ License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/fireworks-ai/eval-protocol
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: Operating System :: OS Independent
@@ -13,7 +13,6 @@ License-File: LICENSE
13
13
  Requires-Dist: requests>=2.25.0
14
14
  Requires-Dist: pydantic>=2.0.0
15
15
  Requires-Dist: dataclasses-json>=0.5.7
16
- Requires-Dist: fastapi>=0.68.0
17
16
  Requires-Dist: uvicorn>=0.15.0
18
17
  Requires-Dist: python-dotenv>=0.19.0
19
18
  Requires-Dist: openai==1.78.1
@@ -39,6 +38,10 @@ Requires-Dist: litellm>=1.0.0
39
38
  Requires-Dist: addict>=2.4.0
40
39
  Requires-Dist: deepdiff>=6.0.0
41
40
  Requires-Dist: pandas>=1.5.0
41
+ Requires-Dist: watchdog>=2.1.0
42
+ Requires-Dist: websockets>=15.0.1
43
+ Requires-Dist: fireworks-ai>=0.19.12
44
+ Requires-Dist: fastapi>=0.116.1
42
45
  Provides-Extra: dev
43
46
  Requires-Dist: build; extra == "dev"
44
47
  Requires-Dist: twine; extra == "dev"
@@ -96,8 +99,8 @@ Dynamic: license-file
96
99
 
97
100
  [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
98
101
 
99
- EP is an open specification, Python SDK, and pytest wrapper that provides a
100
- standardized way to write evaluations for large language model (LLM)
102
+ EP is an open specification, Python SDK, pytest wrapper, and suite of tools that
103
+ provides a standardized way to write evaluations for large language model (LLM)
101
104
  applications. Start with simple single-turn evals for model selection and prompt
102
105
  engineering, then scale up to complex multi-turn reinforcement learning (RL) for
103
106
  agents using Model Context Protocol (MCP). EP ensures consistent patterns for
@@ -106,6 +109,12 @@ sophisticated agent evaluations that work across real-world scenarios, from
106
109
  markdown generation tasks to customer service agents with tool calling
107
110
  capabilities.
108
111
 
112
+ <p align="center">
113
+ <img src="./assets/ui.png" alt="UI" />
114
+ <br>
115
+ <sub><b>Log Viewer: Monitor your evaluation rollouts in real time.</b></sub>
116
+ </p>
117
+
109
118
  ## Quick Example
110
119
 
111
120
  Here's a simple test function that checks if a model's response contains **bold** text formatting:
@@ -129,17 +138,17 @@ def test_bold_format(row: EvaluationRow) -> EvaluationRow:
129
138
  """
130
139
  Simple evaluation that checks if the model's response contains bold text.
131
140
  """
132
-
141
+
133
142
  assistant_response = row.messages[-1].content
134
-
143
+
135
144
  # Check if response contains **bold** text
136
145
  has_bold = "**" in assistant_response
137
-
146
+
138
147
  if has_bold:
139
148
  result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
140
149
  else:
141
150
  result = EvaluateResult(score=0.0, reason="❌ No bold text found")
142
-
151
+
143
152
  row.evaluation_result = result
144
153
  return row
145
154
  ```
@@ -2,8 +2,8 @@
2
2
 
3
3
  [![PyPI - Version](https://img.shields.io/pypi/v/eval-protocol)](https://pypi.org/project/eval-protocol/)
4
4
 
5
- EP is an open specification, Python SDK, and pytest wrapper that provides a
6
- standardized way to write evaluations for large language model (LLM)
5
+ EP is an open specification, Python SDK, pytest wrapper, and suite of tools that
6
+ provides a standardized way to write evaluations for large language model (LLM)
7
7
  applications. Start with simple single-turn evals for model selection and prompt
8
8
  engineering, then scale up to complex multi-turn reinforcement learning (RL) for
9
9
  agents using Model Context Protocol (MCP). EP ensures consistent patterns for
@@ -12,6 +12,12 @@ sophisticated agent evaluations that work across real-world scenarios, from
12
12
  markdown generation tasks to customer service agents with tool calling
13
13
  capabilities.
14
14
 
15
+ <p align="center">
16
+ <img src="./assets/ui.png" alt="UI" />
17
+ <br>
18
+ <sub><b>Log Viewer: Monitor your evaluation rollouts in real time.</b></sub>
19
+ </p>
20
+
15
21
  ## Quick Example
16
22
 
17
23
  Here's a simple test function that checks if a model's response contains **bold** text formatting:
@@ -35,17 +41,17 @@ def test_bold_format(row: EvaluationRow) -> EvaluationRow:
35
41
  """
36
42
  Simple evaluation that checks if the model's response contains bold text.
37
43
  """
38
-
44
+
39
45
  assistant_response = row.messages[-1].content
40
-
46
+
41
47
  # Check if response contains **bold** text
42
48
  has_bold = "**" in assistant_response
43
-
49
+
44
50
  if has_bold:
45
51
  result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
46
52
  else:
47
53
  result = EvaluateResult(score=0.0, reason="❌ No bold text found")
48
-
54
+
49
55
  row.evaluation_result = result
50
56
  return row
51
57
  ```
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-04T20:35:33-0700",
11
+ "date": "2025-08-06T01:34:18-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "52b46a7d3f8455d848d8d5138ec4ca4d6343d3d2",
15
- "version": "0.2.3"
14
+ "full-revisionid": "1a37ee141ebe4084654889ace2aba9c1529acf1c",
15
+ "version": "0.2.5"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -14,21 +14,19 @@ logger = logging.getLogger(__name__)
14
14
 
15
15
  try:
16
16
  from langfuse import Langfuse
17
+
17
18
  LANGFUSE_AVAILABLE = True
18
19
  except ImportError:
19
20
  LANGFUSE_AVAILABLE = False
20
- logger.warning(
21
- "Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'"
22
- )
23
21
 
24
22
 
25
23
  class LangfuseAdapter:
26
24
  """Adapter to pull data from Langfuse and convert to EvaluationRow format.
27
-
25
+
28
26
  This adapter can pull both chat conversations and tool calling traces from
29
27
  Langfuse deployments and convert them into the EvaluationRow format expected
30
28
  by the evaluation protocol.
31
-
29
+
32
30
  Examples:
33
31
  Basic usage:
34
32
  >>> adapter = LangfuseAdapter(
@@ -37,7 +35,7 @@ class LangfuseAdapter:
37
35
  ... host="https://your-langfuse-deployment.com"
38
36
  ... )
39
37
  >>> rows = list(adapter.get_evaluation_rows(limit=10))
40
-
38
+
41
39
  Filter by specific criteria:
42
40
  >>> rows = list(adapter.get_evaluation_rows(
43
41
  ... limit=50,
@@ -46,7 +44,7 @@ class LangfuseAdapter:
46
44
  ... from_timestamp=datetime.now() - timedelta(days=7)
47
45
  ... ))
48
46
  """
49
-
47
+
50
48
  def __init__(
51
49
  self,
52
50
  public_key: str,
@@ -55,25 +53,19 @@ class LangfuseAdapter:
55
53
  project_id: Optional[str] = None,
56
54
  ):
57
55
  """Initialize the Langfuse adapter.
58
-
56
+
59
57
  Args:
60
58
  public_key: Langfuse public key
61
- secret_key: Langfuse secret key
59
+ secret_key: Langfuse secret key
62
60
  host: Langfuse host URL (default: https://cloud.langfuse.com)
63
61
  project_id: Optional project ID to filter traces
64
62
  """
65
63
  if not LANGFUSE_AVAILABLE:
66
- raise ImportError(
67
- "Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'"
68
- )
69
-
70
- self.client = Langfuse(
71
- public_key=public_key,
72
- secret_key=secret_key,
73
- host=host
74
- )
64
+ raise ImportError("Langfuse not installed. Install with: pip install 'eval-protocol[langfuse]'")
65
+
66
+ self.client = Langfuse(public_key=public_key, secret_key=secret_key, host=host)
75
67
  self.project_id = project_id
76
-
68
+
77
69
  def get_evaluation_rows(
78
70
  self,
79
71
  limit: int = 100,
@@ -85,16 +77,16 @@ class LangfuseAdapter:
85
77
  include_tool_calls: bool = True,
86
78
  ) -> Iterator[EvaluationRow]:
87
79
  """Pull traces from Langfuse and convert to EvaluationRow format.
88
-
80
+
89
81
  Args:
90
82
  limit: Maximum number of rows to return
91
83
  tags: Filter by specific tags
92
84
  user_id: Filter by user ID
93
- session_id: Filter by session ID
85
+ session_id: Filter by session ID
94
86
  from_timestamp: Filter traces after this timestamp
95
87
  to_timestamp: Filter traces before this timestamp
96
88
  include_tool_calls: Whether to include tool calling traces
97
-
89
+
98
90
  Yields:
99
91
  EvaluationRow: Converted evaluation rows
100
92
  """
@@ -102,12 +94,12 @@ class LangfuseAdapter:
102
94
  traces = self.client.get_traces(
103
95
  limit=limit,
104
96
  tags=tags,
105
- user_id=user_id,
97
+ user_id=user_id,
106
98
  session_id=session_id,
107
99
  from_timestamp=from_timestamp,
108
- to_timestamp=to_timestamp
100
+ to_timestamp=to_timestamp,
109
101
  )
110
-
102
+
111
103
  for trace in traces.data:
112
104
  try:
113
105
  eval_row = self._convert_trace_to_evaluation_row(trace, include_tool_calls)
@@ -116,18 +108,18 @@ class LangfuseAdapter:
116
108
  except (AttributeError, ValueError, KeyError) as e:
117
109
  logger.warning("Failed to convert trace %s: %s", trace.id, e)
118
110
  continue
119
-
111
+
120
112
  def get_evaluation_rows_by_ids(
121
- self,
113
+ self,
122
114
  trace_ids: List[str],
123
115
  include_tool_calls: bool = True,
124
116
  ) -> Iterator[EvaluationRow]:
125
117
  """Get specific traces by their IDs and convert to EvaluationRow format.
126
-
118
+
127
119
  Args:
128
120
  trace_ids: List of trace IDs to fetch
129
121
  include_tool_calls: Whether to include tool calling traces
130
-
122
+
131
123
  Yields:
132
124
  EvaluationRow: Converted evaluation rows
133
125
  """
@@ -140,137 +132,131 @@ class LangfuseAdapter:
140
132
  except (AttributeError, ValueError, KeyError) as e:
141
133
  logger.warning("Failed to fetch/convert trace %s: %s", trace_id, e)
142
134
  continue
143
-
144
- def _convert_trace_to_evaluation_row(
145
- self,
146
- trace: Any,
147
- include_tool_calls: bool = True
148
- ) -> Optional[EvaluationRow]:
135
+
136
+ def _convert_trace_to_evaluation_row(self, trace: Any, include_tool_calls: bool = True) -> Optional[EvaluationRow]:
149
137
  """Convert a Langfuse trace to EvaluationRow format.
150
-
138
+
151
139
  Args:
152
140
  trace: Langfuse trace object
153
141
  include_tool_calls: Whether to include tool calling information
154
-
142
+
155
143
  Returns:
156
144
  EvaluationRow or None if conversion fails
157
145
  """
158
146
  try:
159
147
  # Get observations (generations, spans) from the trace
160
148
  observations = self.client.get_observations(trace_id=trace.id).data
161
-
149
+
162
150
  # Convert observations to messages
163
151
  messages = self._extract_messages_from_observations(observations, include_tool_calls)
164
-
152
+
165
153
  if not messages:
166
154
  return None
167
-
155
+
168
156
  # Extract metadata
169
157
  input_metadata = self._create_input_metadata(trace, observations)
170
-
158
+
171
159
  # Extract ground truth if available (from trace metadata or tags)
172
160
  ground_truth = self._extract_ground_truth(trace)
173
-
161
+
174
162
  # Extract tools if available
175
163
  tools = self._extract_tools(observations) if include_tool_calls else None
176
-
164
+
177
165
  return EvaluationRow(
178
166
  messages=messages,
179
167
  tools=tools,
180
168
  input_metadata=input_metadata,
181
169
  ground_truth=ground_truth,
182
170
  )
183
-
171
+
184
172
  except (AttributeError, ValueError, KeyError) as e:
185
173
  logger.error("Error converting trace %s: %s", trace.id, e)
186
174
  return None
187
-
175
+
188
176
  def _extract_messages_from_observations(
189
- self,
190
- observations: List[Any],
191
- include_tool_calls: bool = True
177
+ self, observations: List[Any], include_tool_calls: bool = True
192
178
  ) -> List[Message]:
193
179
  """Extract messages from Langfuse observations.
194
-
180
+
195
181
  Args:
196
182
  observations: List of Langfuse observation objects
197
183
  include_tool_calls: Whether to include tool calling information
198
-
184
+
199
185
  Returns:
200
186
  List of Message objects
201
187
  """
202
188
  messages = []
203
-
189
+
204
190
  # Sort observations by timestamp
205
191
  sorted_observations = sorted(observations, key=lambda x: x.start_time or datetime.min)
206
-
192
+
207
193
  for obs in sorted_observations:
208
194
  try:
209
- if hasattr(obs, 'input') and obs.input:
195
+ if hasattr(obs, "input") and obs.input:
210
196
  # Handle different input formats
211
197
  if isinstance(obs.input, dict):
212
- if 'messages' in obs.input:
198
+ if "messages" in obs.input:
213
199
  # OpenAI-style messages format
214
- for msg in obs.input['messages']:
200
+ for msg in obs.input["messages"]:
215
201
  messages.append(self._dict_to_message(msg, include_tool_calls))
216
- elif 'role' in obs.input:
202
+ elif "role" in obs.input:
217
203
  # Single message format
218
204
  messages.append(self._dict_to_message(obs.input, include_tool_calls))
219
- elif 'prompt' in obs.input:
205
+ elif "prompt" in obs.input:
220
206
  # Simple prompt format
221
- messages.append(Message(role="user", content=str(obs.input['prompt'])))
207
+ messages.append(Message(role="user", content=str(obs.input["prompt"])))
222
208
  elif isinstance(obs.input, str):
223
209
  # Simple string input
224
210
  messages.append(Message(role="user", content=obs.input))
225
-
226
- if hasattr(obs, 'output') and obs.output:
211
+
212
+ if hasattr(obs, "output") and obs.output:
227
213
  # Handle output
228
214
  if isinstance(obs.output, dict):
229
- if 'content' in obs.output:
230
- messages.append(Message(role="assistant", content=str(obs.output['content'])))
231
- elif 'message' in obs.output:
232
- msg_dict = obs.output['message']
215
+ if "content" in obs.output:
216
+ messages.append(Message(role="assistant", content=str(obs.output["content"])))
217
+ elif "message" in obs.output:
218
+ msg_dict = obs.output["message"]
233
219
  messages.append(self._dict_to_message(msg_dict, include_tool_calls))
234
220
  else:
235
221
  # Fallback: convert entire output to string
236
222
  messages.append(Message(role="assistant", content=str(obs.output)))
237
223
  elif isinstance(obs.output, str):
238
224
  messages.append(Message(role="assistant", content=obs.output))
239
-
225
+
240
226
  except (AttributeError, ValueError, KeyError) as e:
241
227
  logger.warning("Error processing observation %s: %s", obs.id, e)
242
228
  continue
243
-
229
+
244
230
  return messages
245
-
231
+
246
232
  def _dict_to_message(self, msg_dict: Dict[str, Any], include_tool_calls: bool = True) -> Message:
247
233
  """Convert a dictionary to a Message object.
248
-
234
+
249
235
  Args:
250
236
  msg_dict: Dictionary containing message data
251
237
  include_tool_calls: Whether to include tool calling information
252
-
238
+
253
239
  Returns:
254
240
  Message object
255
241
  """
256
242
  # Extract basic message components
257
- role = msg_dict.get('role', 'assistant')
258
- content = msg_dict.get('content')
259
- name = msg_dict.get('name')
260
-
243
+ role = msg_dict.get("role", "assistant")
244
+ content = msg_dict.get("content")
245
+ name = msg_dict.get("name")
246
+
261
247
  # Handle tool calls if enabled
262
248
  tool_calls = None
263
249
  tool_call_id = None
264
250
  function_call = None
265
-
251
+
266
252
  if include_tool_calls:
267
- if 'tool_calls' in msg_dict:
268
- tool_calls = msg_dict['tool_calls']
269
- if 'tool_call_id' in msg_dict:
270
- tool_call_id = msg_dict['tool_call_id']
271
- if 'function_call' in msg_dict:
272
- function_call = msg_dict['function_call']
273
-
253
+ if "tool_calls" in msg_dict:
254
+ tool_calls = msg_dict["tool_calls"]
255
+ if "tool_call_id" in msg_dict:
256
+ tool_call_id = msg_dict["tool_call_id"]
257
+ if "function_call" in msg_dict:
258
+ function_call = msg_dict["function_call"]
259
+
274
260
  return Message(
275
261
  role=role,
276
262
  content=content,
@@ -279,106 +265,105 @@ class LangfuseAdapter:
279
265
  tool_calls=tool_calls,
280
266
  function_call=function_call,
281
267
  )
282
-
268
+
283
269
  def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMetadata:
284
270
  """Create InputMetadata from trace and observations.
285
-
271
+
286
272
  Args:
287
273
  trace: Langfuse trace object
288
274
  observations: List of observation objects
289
-
275
+
290
276
  Returns:
291
277
  InputMetadata object
292
278
  """
293
279
  # Extract completion parameters from observations
294
280
  completion_params = CompletionParams()
295
-
281
+
296
282
  # Look for model parameters in observations
297
283
  for obs in observations:
298
- if hasattr(obs, 'model') and obs.model:
284
+ if hasattr(obs, "model") and obs.model:
299
285
  completion_params.model = obs.model
300
- if hasattr(obs, 'model_parameters') and obs.model_parameters:
286
+ if hasattr(obs, "model_parameters") and obs.model_parameters:
301
287
  params = obs.model_parameters
302
- if 'temperature' in params:
303
- completion_params.temperature = params['temperature']
304
- if 'max_tokens' in params:
305
- completion_params.max_tokens = params['max_tokens']
306
- if 'top_p' in params:
307
- completion_params.top_p = params['top_p']
288
+ if "temperature" in params:
289
+ completion_params.temperature = params["temperature"]
290
+ if "max_tokens" in params:
291
+ completion_params.max_tokens = params["max_tokens"]
292
+ if "top_p" in params:
293
+ completion_params.top_p = params["top_p"]
308
294
  break
309
-
295
+
310
296
  # Create dataset info from trace metadata
311
297
  dataset_info = {
312
- 'trace_id': trace.id,
313
- 'trace_name': getattr(trace, 'name', None),
314
- 'trace_tags': getattr(trace, 'tags', []),
315
- 'langfuse_project_id': self.project_id,
298
+ "trace_id": trace.id,
299
+ "trace_name": getattr(trace, "name", None),
300
+ "trace_tags": getattr(trace, "tags", []),
301
+ "langfuse_project_id": self.project_id,
316
302
  }
317
-
303
+
318
304
  # Add trace metadata if available
319
- if hasattr(trace, 'metadata') and trace.metadata:
320
- dataset_info['trace_metadata'] = trace.metadata
321
-
305
+ if hasattr(trace, "metadata") and trace.metadata:
306
+ dataset_info["trace_metadata"] = trace.metadata
307
+
322
308
  # Create session data
323
309
  session_data = {
324
- 'session_id': getattr(trace, 'session_id', None),
325
- 'user_id': getattr(trace, 'user_id', None),
326
- 'timestamp': getattr(trace, 'timestamp', None),
327
- 'langfuse_trace_url': f"{self.client.host}/project/{self.project_id}/traces/{trace.id}" if self.project_id else None,
310
+ "session_id": getattr(trace, "session_id", None),
311
+ "user_id": getattr(trace, "user_id", None),
312
+ "timestamp": getattr(trace, "timestamp", None),
313
+ "langfuse_trace_url": (
314
+ f"{self.client.host}/project/{self.project_id}/traces/{trace.id}" if self.project_id else None
315
+ ),
328
316
  }
329
-
317
+
330
318
  return InputMetadata(
331
319
  row_id=trace.id,
332
320
  completion_params=completion_params,
333
321
  dataset_info=dataset_info,
334
322
  session_data=session_data,
335
323
  )
336
-
324
+
337
325
  def _extract_ground_truth(self, trace: Any) -> Optional[str]:
338
326
  """Extract ground truth from trace if available.
339
-
327
+
340
328
  Args:
341
329
  trace: Langfuse trace object
342
-
330
+
343
331
  Returns:
344
332
  Ground truth string or None
345
333
  """
346
334
  # Check trace metadata for ground truth
347
- if hasattr(trace, 'metadata') and trace.metadata:
335
+ if hasattr(trace, "metadata") and trace.metadata:
348
336
  if isinstance(trace.metadata, dict):
349
- return trace.metadata.get('ground_truth') or trace.metadata.get('expected_answer')
350
-
337
+ return trace.metadata.get("ground_truth") or trace.metadata.get("expected_answer")
338
+
351
339
  # Check tags for ground truth indicators
352
- if hasattr(trace, 'tags') and trace.tags:
340
+ if hasattr(trace, "tags") and trace.tags:
353
341
  for tag in trace.tags:
354
- if tag.startswith('ground_truth:'):
355
- return tag.replace('ground_truth:', '', 1)
356
-
342
+ if tag.startswith("ground_truth:"):
343
+ return tag.replace("ground_truth:", "", 1)
344
+
357
345
  return None
358
-
346
+
359
347
  def _extract_tools(self, observations: List[Any]) -> Optional[List[Dict[str, Any]]]:
360
348
  """Extract tool definitions from observations.
361
-
349
+
362
350
  Args:
363
351
  observations: List of observation objects
364
-
352
+
365
353
  Returns:
366
354
  List of tool definitions or None
367
355
  """
368
356
  tools = []
369
-
357
+
370
358
  for obs in observations:
371
- if hasattr(obs, 'input') and obs.input and isinstance(obs.input, dict):
372
- if 'tools' in obs.input:
373
- tools.extend(obs.input['tools'])
374
- elif 'functions' in obs.input:
359
+ if hasattr(obs, "input") and obs.input and isinstance(obs.input, dict):
360
+ if "tools" in obs.input:
361
+ tools.extend(obs.input["tools"])
362
+ elif "functions" in obs.input:
375
363
  # Convert functions to tools format
376
- for func in obs.input['functions']:
377
- tools.append({
378
- 'type': 'function',
379
- 'function': func
380
- })
381
-
364
+ for func in obs.input["functions"]:
365
+ tools.append({"type": "function", "function": func})
366
+
382
367
  return tools if tools else None
383
368
 
384
369
 
@@ -389,19 +374,19 @@ def create_langfuse_adapter(
389
374
  project_id: Optional[str] = None,
390
375
  ) -> LangfuseAdapter:
391
376
  """Factory function to create a Langfuse adapter.
392
-
377
+
393
378
  Args:
394
379
  public_key: Langfuse public key
395
380
  secret_key: Langfuse secret key
396
381
  host: Langfuse host URL
397
382
  project_id: Optional project ID
398
-
383
+
399
384
  Returns:
400
385
  LangfuseAdapter instance
401
386
  """
402
387
  return LangfuseAdapter(
403
388
  public_key=public_key,
404
- secret_key=secret_key,
389
+ secret_key=secret_key,
405
390
  host=host,
406
391
  project_id=project_id,
407
- )
392
+ )
@@ -14,6 +14,7 @@ from pathlib import Path
14
14
 
15
15
  logger = logging.getLogger(__name__)
16
16
 
17
+
17
18
  from eval_protocol.evaluation import create_evaluation, preview_evaluation
18
19
 
19
20
  from .cli_commands.agent_eval_cmd import agent_eval_command
@@ -24,6 +25,7 @@ from .cli_commands.common import (
24
25
  )
25
26
  from .cli_commands.deploy import deploy_command
26
27
  from .cli_commands.deploy_mcp import deploy_mcp_command
28
+ from .cli_commands.logs import logs_command
27
29
  from .cli_commands.preview import preview_command
28
30
  from .cli_commands.run_eval_cmd import hydra_cli_entry_point
29
31
 
@@ -285,6 +287,9 @@ def parse_args(args=None):
285
287
  help="Override the number of parallel rollouts to execute for each task.",
286
288
  )
287
289
 
290
+ # Logs command
291
+ logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
292
+
288
293
  # Run command (for Hydra-based evaluations)
289
294
  # This subparser intentionally defines no arguments itself.
290
295
  # All arguments after 'run' will be passed to Hydra by parse_known_args.
@@ -338,6 +343,8 @@ def main():
338
343
  return deploy_mcp_command(args)
339
344
  elif args.command == "agent-eval":
340
345
  return agent_eval_command(args)
346
+ elif args.command == "logs":
347
+ return logs_command(args)
341
348
  elif args.command == "run":
342
349
  # For the 'run' command, Hydra takes over argument parsing.
343
350