agentevals-cli 0.9.4__tar.gz → 0.9.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (293)
  1. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/PKG-INFO +11 -7
  2. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/README.md +10 -6
  3. agentevals_cli-0.9.6/docs/run-history.md +105 -0
  4. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/agent.py +1 -1
  5. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/adk/run.py +1 -0
  6. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/langchain/run.py +1 -0
  7. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/ollama/run.py +1 -0
  8. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/openai-agents/run.py +1 -0
  9. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/pydantic-ai/run.py +2 -0
  10. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/strands/run.py +1 -0
  11. agentevals_cli-0.9.6/src/agentevals/_static/assets/index-4Q_gYF8x.js +341 -0
  12. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/index.html +1 -1
  13. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/models.py +9 -0
  14. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/routes.py +178 -84
  15. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/streaming_routes.py +81 -40
  16. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/result_builder.py +44 -8
  17. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/runner.py +9 -2
  18. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/ws_server.py +12 -1
  19. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/trace_attrs.py +3 -0
  20. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/trace_metrics.py +33 -0
  21. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_api.py +210 -0
  22. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/App.tsx +2 -0
  23. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/api/client.ts +48 -0
  24. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/DashboardView.tsx +18 -3
  25. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/PerformanceCharts.tsx +10 -2
  26. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/TraceTable.tsx +57 -18
  27. agentevals_cli-0.9.6/ui/src/components/runs/PassRateTrendChart.tsx +161 -0
  28. agentevals_cli-0.9.6/ui/src/components/runs/PerMetricTrendChart.tsx +123 -0
  29. agentevals_cli-0.9.6/ui/src/components/runs/RunDetailView.tsx +536 -0
  30. agentevals_cli-0.9.6/ui/src/components/runs/RunsHistoryTable.tsx +273 -0
  31. agentevals_cli-0.9.6/ui/src/components/runs/RunsView.tsx +376 -0
  32. agentevals_cli-0.9.6/ui/src/components/runs/runHistory.ts +171 -0
  33. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/sidebar/Sidebar.tsx +11 -2
  34. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/LiveStreamingView.tsx +6 -0
  35. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/config.ts +1 -0
  36. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/context/TraceContext.tsx +4 -0
  37. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/context/TraceProvider.tsx +5 -0
  38. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/types.ts +94 -1
  39. agentevals_cli-0.9.4/src/agentevals/_static/assets/index-RIquRPno.js +0 -341
  40. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/eval/SKILL.md +0 -0
  41. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/eval/evals/evals.json +0 -0
  42. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/inspect/SKILL.md +0 -0
  43. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/inspect/evals/evals.json +0 -0
  44. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.dockerignore +0 -0
  45. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  46. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  47. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  48. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/workflows/ci.yml +0 -0
  49. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  50. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/workflows/release.yml +0 -0
  51. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.gitignore +0 -0
  52. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.mcp.json +0 -0
  53. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/CONTRIBUTING.md +0 -0
  54. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/DEVELOPMENT.md +0 -0
  55. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/Dockerfile +0 -0
  56. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/LICENSE +0 -0
  57. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/Makefile +0 -0
  58. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/Chart.yaml +0 -0
  59. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/NOTES.txt +0 -0
  60. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/_helpers.tpl +0 -0
  61. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/deployment.yaml +0 -0
  62. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  63. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/postgresql.yaml +0 -0
  64. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/rbac.yaml +0 -0
  65. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/service.yaml +0 -0
  66. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  67. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/values.yaml +0 -0
  68. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/assets/logo-color-on-transparent.svg +0 -0
  69. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/assets/logo-color.png +0 -0
  70. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/assets/logo-dark-on-transparent.svg +0 -0
  71. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/custom-evaluators.md +0 -0
  72. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/eval-set-format.md +0 -0
  73. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/otel-compatibility.md +0 -0
  74. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/streaming.md +0 -0
  75. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/README.md +0 -0
  76. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/eval_config.yaml +0 -0
  77. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/eval_config_openai_eval.yaml +0 -0
  78. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/response_quality.py +0 -0
  79. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/tool_call_checker.py +0 -0
  80. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/README.md +0 -0
  81. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  82. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  83. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/pyproject.toml +0 -0
  84. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/README.md +0 -0
  85. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/eval_set.json +0 -0
  86. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/main.py +0 -0
  87. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/test_streaming.py +0 -0
  88. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/kubernetes/README.md +0 -0
  89. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/README.md +0 -0
  90. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/agent.py +0 -0
  91. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/eval_set.json +0 -0
  92. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/main.py +0 -0
  93. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/requirements.txt +0 -0
  94. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/test_streaming.py +0 -0
  95. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/async_example.py +0 -0
  96. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/context_manager_example.py +0 -0
  97. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/decorator_example.py +0 -0
  98. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/requirements.txt +0 -0
  99. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/agent.py +0 -0
  100. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/eval_set.json +0 -0
  101. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/main.py +0 -0
  102. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/requirements.txt +0 -0
  103. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/adk/requirements.txt +0 -0
  104. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  105. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  106. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  107. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  108. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/strands/requirements.txt +0 -0
  109. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/flake.lock +0 -0
  110. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/flake.nix +0 -0
  111. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/README.md +0 -0
  112. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  113. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  114. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  115. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  116. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/pyproject.toml +0 -0
  117. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/eval_set_helm.json +0 -0
  118. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/evalset_helm_3_2026-02-23.json +0 -0
  119. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/evalset_k8s_2026-02-20.json +0 -0
  120. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/helm.json +0 -0
  121. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/helm_2.json +0 -0
  122. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/helm_3.json +0 -0
  123. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/k8s.json +0 -0
  124. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/tempo_export_with_batches.json +0 -0
  125. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/__init__.py +0 -0
  126. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_protocol.py +0 -0
  127. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  128. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/logo.svg +0 -0
  129. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/vite.svg +0 -0
  130. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/__init__.py +0 -0
  131. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/app.py +0 -0
  132. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/debug_routes.py +0 -0
  133. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/dependencies.py +0 -0
  134. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_app.py +0 -0
  135. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_grpc.py +0 -0
  136. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_processing.py +0 -0
  137. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_routes.py +0 -0
  138. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/runs_routes.py +0 -0
  139. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/builtin_metrics.py +0 -0
  140. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/cli.py +0 -0
  141. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/config.py +0 -0
  142. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/converter.py +0 -0
  143. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/custom_evaluators.py +0 -0
  144. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/eval_config_loader.py +0 -0
  145. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/__init__.py +0 -0
  146. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/resolver.py +0 -0
  147. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/sources.py +0 -0
  148. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/templates.py +0 -0
  149. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/venv.py +0 -0
  150. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/extraction.py +0 -0
  151. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/genai_converter.py +0 -0
  152. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/__init__.py +0 -0
  153. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/auto.py +0 -0
  154. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/base.py +0 -0
  155. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/jaeger.py +0 -0
  156. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/otlp.py +0 -0
  157. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/mcp_server.py +0 -0
  158. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/openai_eval_backend.py +0 -0
  159. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/output.py +0 -0
  160. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/resolvers/__init__.py +0 -0
  161. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/resolvers/kubernetes.py +0 -0
  162. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/__init__.py +0 -0
  163. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/fetcher.py +0 -0
  164. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/service.py +0 -0
  165. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/sinks.py +0 -0
  166. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/worker.py +0 -0
  167. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/sdk.py +0 -0
  168. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/__init__.py +0 -0
  169. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/config.py +0 -0
  170. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/models.py +0 -0
  171. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/__init__.py +0 -0
  172. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  173. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  174. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/migrator.py +0 -0
  175. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/pool.py +0 -0
  176. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/repos/__init__.py +0 -0
  177. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/repos/memory.py +0 -0
  178. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/repos/postgres.py +0 -0
  179. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/__init__.py +0 -0
  180. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/incremental_processor.py +0 -0
  181. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/processor.py +0 -0
  182. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/session.py +0 -0
  183. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/__init__.py +0 -0
  184. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/genai_messages.py +0 -0
  185. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/log_buffer.py +0 -0
  186. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/log_enrichment.py +0 -0
  187. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/api/__init__.py +0 -0
  188. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/api/test_evaluate_persistence.py +0 -0
  189. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/api/test_runs_routes.py +0 -0
  190. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/__init__.py +0 -0
  191. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/conftest.py +0 -0
  192. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_evaluation_pipeline.py +0 -0
  193. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_live_agents.py +0 -0
  194. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  195. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_session_grouping.py +0 -0
  196. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_timing_stress.py +0 -0
  197. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/resolvers/__init__.py +0 -0
  198. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/resolvers/test_kubernetes.py +0 -0
  199. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/resolvers/test_registry.py +0 -0
  200. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/__init__.py +0 -0
  201. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_fetcher.py +0 -0
  202. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_result_builder.py +0 -0
  203. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_service.py +0 -0
  204. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_sinks.py +0 -0
  205. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/__init__.py +0 -0
  206. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_config.py +0 -0
  207. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_memory_repos.py +0 -0
  208. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_migrator.py +0 -0
  209. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_models.py +0 -0
  210. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_cli.py +0 -0
  211. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_converter.py +0 -0
  212. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_credential_injection.py +0 -0
  213. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_eval_config_loader.py +0 -0
  214. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_extraction.py +0 -0
  215. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_genai_converter.py +0 -0
  216. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_jaeger_loader.py +0 -0
  217. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_loader_auto.py +0 -0
  218. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_log_enrichment.py +0 -0
  219. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_mcp_server.py +0 -0
  220. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_openai_eval_backend.py +0 -0
  221. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_otlp_loader.py +0 -0
  222. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_otlp_receiver.py +0 -0
  223. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_output.py +0 -0
  224. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_protocol.py +0 -0
  225. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_runner.py +0 -0
  226. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_sdk.py +0 -0
  227. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_trace_metrics.py +0 -0
  228. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/.gitignore +0 -0
  229. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/README.md +0 -0
  230. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/eslint.config.js +0 -0
  231. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/index.html +0 -0
  232. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/package-lock.json +0 -0
  233. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/package.json +0 -0
  234. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/public/logo.svg +0 -0
  235. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/public/vite.svg +0 -0
  236. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/App.css +0 -0
  237. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/assets/react.svg +0 -0
  238. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  239. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  240. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  241. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  242. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  243. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/BuilderView.tsx +0 -0
  244. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  245. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  246. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  247. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/JsonPreview.tsx +0 -0
  248. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  249. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  250. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/index.ts +0 -0
  251. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  252. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  253. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  254. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  255. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  256. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/DataSection.tsx +0 -0
  257. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  258. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  259. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InspectorView.tsx +0 -0
  260. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  261. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  262. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  263. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  264. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  265. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  266. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  267. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  268. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  269. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/SessionCard.tsx +0 -0
  270. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  271. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  272. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/FileDropZone.tsx +0 -0
  273. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/MetricSelector.tsx +0 -0
  274. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  275. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  276. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/UploadView.tsx +0 -0
  277. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  278. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/index.css +0 -0
  279. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/console-capture.ts +0 -0
  280. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/eval-config.ts +0 -0
  281. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/evalset-builder.ts +0 -0
  282. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/network-capture.ts +0 -0
  283. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-helpers.ts +0 -0
  284. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-loader.ts +0 -0
  285. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-metadata.ts +0 -0
  286. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-patcher.ts +0 -0
  287. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/utils.ts +0 -0
  288. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/main.tsx +0 -0
  289. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/tsconfig.app.json +0 -0
  290. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/tsconfig.json +0 -0
  291. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/tsconfig.node.json +0 -0
  292. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/vite.config.ts +0 -0
  293. {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.9.4
3
+ Version: 0.9.6
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -280,7 +280,7 @@ See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protoc
280
280
  agentevals serve # bundled UI on http://localhost:8001
281
281
  ```
282
282
 
283
- Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
283
+ Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. With the Postgres backend enabled, the "Run History" tab persists every evaluation and lets you group and trend runs by eval set or agent over time; see the [Run History guide](docs/run-history.md). For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
284
284
 
285
285
  Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running. The OTLP receiver on port 4318 serves its own docs at `http://localhost:4318/docs`.
286
286
 
@@ -318,11 +318,12 @@ See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end wa
318
318
 
319
319
  #### Postgres backend (`/api/runs`)
320
320
 
321
- > **Preview.** Persistent run history backed by Postgres is under active
322
- > development. The `storage.*` and `database.postgres.*` chart values, the
323
- > `/api/runs` HTTP surface, and the database schema may change incompatibly
324
- > in upcoming releases. Operators evaluating this feature should plan to
325
- > recreate the agentevals schema when upgrading between minor versions.
321
+ > **Preview.** Persisting evaluations and exploring them in the UI works end
322
+ > to end (see the [Run History guide](docs/run-history.md)), but the storage
323
+ > layer is still stabilizing. The `storage.*` and `database.postgres.*` chart
324
+ > values, the `/api/runs` HTTP surface, and the database schema may change
325
+ > incompatibly in upcoming releases. Operators evaluating this feature should
326
+ > plan to recreate the agentevals schema when upgrading between minor versions.
326
327
  > Default in-memory mode is unaffected.
327
328
 
328
329
  By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state:
@@ -341,6 +342,8 @@ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals
341
342
 
342
343
  When `storage.backend=postgres` the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. Without `storage.backend=postgres` the `/api/runs` endpoints return 503 with a hint pointing at the env var.
343
344
 
345
+ Persisted runs power the **Run History** view in the UI, where you can group and trend evaluations by eval set or agent and drill into per-run detail. See the [Run History guide](docs/run-history.md) for the full feature walkthrough and local-dev setup.
346
+
344
347
  ## MCP Server
345
348
 
346
349
  Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically.
@@ -389,6 +392,7 @@ Working examples are in the [`examples/`](examples/) directory:
389
392
  | [Eval Set Format](docs/eval-set-format.md) | Schema, field reference, and examples for golden eval set JSON files |
390
393
  | [Custom Evaluators](docs/custom-evaluators.md) | Write your own scoring logic in Python, JavaScript, or any language |
391
394
  | [Live Streaming](docs/streaming.md) | Real-time trace streaming, dev server setup, and session management |
395
+ | [Run History](docs/run-history.md) | Persisting evaluations to Postgres and exploring them over time in the UI |
392
396
  | [OpenTelemetry Compatibility](docs/otel-compatibility.md) | Supported OTel conventions, message delivery mechanisms, and OTLP receiver |
393
397
 
394
398
  ## Development
@@ -250,7 +250,7 @@ See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protoc
250
250
  agentevals serve # bundled UI on http://localhost:8001
251
251
  ```
252
252
 
253
- Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
253
+ Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. With the Postgres backend enabled, the "Run History" tab persists every evaluation and lets you group and trend runs by eval set or agent over time; see the [Run History guide](docs/run-history.md). For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
254
254
 
255
255
  Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running. The OTLP receiver on port 4318 serves its own docs at `http://localhost:4318/docs`.
256
256
 
@@ -288,11 +288,12 @@ See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end wa
288
288
 
289
289
  #### Postgres backend (`/api/runs`)
290
290
 
291
- > **Preview.** Persistent run history backed by Postgres is under active
292
- > development. The `storage.*` and `database.postgres.*` chart values, the
293
- > `/api/runs` HTTP surface, and the database schema may change incompatibly
294
- > in upcoming releases. Operators evaluating this feature should plan to
295
- > recreate the agentevals schema when upgrading between minor versions.
291
+ > **Preview.** Persisting evaluations and exploring them in the UI works end
292
+ > to end (see the [Run History guide](docs/run-history.md)), but the storage
293
+ > layer is still stabilizing. The `storage.*` and `database.postgres.*` chart
294
+ > values, the `/api/runs` HTTP surface, and the database schema may change
295
+ > incompatibly in upcoming releases. Operators evaluating this feature should
296
+ > plan to recreate the agentevals schema when upgrading between minor versions.
296
297
  > Default in-memory mode is unaffected.
297
298
 
298
299
  By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state:
@@ -311,6 +312,8 @@ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals
311
312
 
312
313
  When `storage.backend=postgres` the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. Without `storage.backend=postgres` the `/api/runs` endpoints return 503 with a hint pointing at the env var.
313
314
 
315
+ Persisted runs power the **Run History** view in the UI, where you can group and trend evaluations by eval set or agent and drill into per-run detail. See the [Run History guide](docs/run-history.md) for the full feature walkthrough and local-dev setup.
316
+
314
317
  ## MCP Server
315
318
 
316
319
  Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically.
@@ -359,6 +362,7 @@ Working examples are in the [`examples/`](examples/) directory:
359
362
  | [Eval Set Format](docs/eval-set-format.md) | Schema, field reference, and examples for golden eval set JSON files |
360
363
  | [Custom Evaluators](docs/custom-evaluators.md) | Write your own scoring logic in Python, JavaScript, or any language |
361
364
  | [Live Streaming](docs/streaming.md) | Real-time trace streaming, dev server setup, and session management |
365
+ | [Run History](docs/run-history.md) | Persisting evaluations to Postgres and exploring them over time in the UI |
362
366
  | [OpenTelemetry Compatibility](docs/otel-compatibility.md) | Supported OTel conventions, message delivery mechanisms, and OTLP receiver |
363
367
 
364
368
  ## Development
@@ -0,0 +1,105 @@
1
+ # Run History
2
+
3
+ Run history turns each evaluation into a durable record you can revisit, group, and trend over time. When agentevals runs with the Postgres storage backend, every evaluation (whether an uploaded trace file or a live streaming session) is persisted as a **run** with its per-case scores, and the UI's **Run History** view lets you explore how an agent or eval set performs across many runs.
4
+
5
+ Without the Postgres backend, agentevals is stateless: evaluations still work and results show on the dashboard, but nothing is persisted and the run-history endpoints return `503`.
6
+
7
+ ## Enabling durable storage
8
+
9
+ Run history requires the Postgres storage backend. It is opt-in.
10
+
11
+ ### Local development
12
+
13
+ The quickest path uses the Makefile target, which starts a throwaway Postgres container, applies migrations, and serves the app wired to it:
14
+
15
+ ```bash
16
+ make dev-backend-pg
17
+ ```
18
+
19
+ That is equivalent to:
20
+
21
+ ```bash
22
+ export AGENTEVALS_STORAGE_BACKEND=postgres
23
+ export AGENTEVALS_DATABASE_URL=postgresql://agentevals:agentevals@localhost:5432/agentevals
24
+ uv run agentevals migrate up # apply schema migrations
25
+ uv run agentevals serve --dev # serve with the Postgres backend
26
+ ```
27
+
28
+ Run the UI in a second terminal (`cd ui && npm run dev`) and open the **Run History** tab.
29
+
30
+ > The `make pg-up` container runs with `--rm` and no volume, so its data is ephemeral: `make pg-down` (or a reboot) resets your run history. Point `AGENTEVALS_DATABASE_URL` at a persistent Postgres if you want runs to survive across sessions.
31
+
32
+ ### Configuration reference
33
+
34
+ | Variable | Purpose |
35
+ |----------|---------|
36
+ | `AGENTEVALS_STORAGE_BACKEND` | `postgres` to enable durable storage; anything else (default) keeps the in-memory backend |
37
+ | `AGENTEVALS_DATABASE_URL` | Postgres DSN, e.g. `postgresql://user:pass@host:5432/dbname` |
38
+ | `AGENTEVALS_DATABASE_URL_FILE` | Path to a file containing the DSN (preferred over the inline variable; useful for mounted secrets) |
39
+ | `AGENTEVALS_DATABASE_SCHEMA` | Schema name to use (default `agentevals`) |
40
+
41
+ On startup with `AGENTEVALS_STORAGE_BACKEND=postgres` the app applies any pending migrations (advisory-lock protected, safe across replicas). For deployment via Helm, see the [Postgres backend section of the README](../README.md#postgres-backend-apiruns).
42
+
43
+ ## How runs get persisted
44
+
45
+ A run is created once per evaluation, best effort: if persistence fails the evaluation result is still returned to the caller. Both evaluation paths persist:
46
+
47
+ - **Uploaded traces** (`POST /api/evaluate`): the run aggregates every uploaded trace as one evaluation.
48
+ - **Live sessions** (streaming dev server): scoring sessions from the UI persists one run per "Evaluate" click, aggregating the sessions it scored.
49
+
50
+ Each run stores a pre-aggregated `summary` plus one `result` row per (eval case, evaluator):
51
+
52
+ ```jsonc
53
+ // run.summary
54
+ {
55
+ "trace_count": 8,
56
+ "result_counts": { "passed": 6, "failed": 2, "errored": 0, "skipped": 0 },
57
+ "per_metric": {
58
+ "tool_trajectory_avg_score": { "passed": 7, "failed": 1, "errored": 0, "skipped": 0, "avg_score": 0.94 }
59
+ },
60
+ "agents": ["langchain-agent", "openai-agents-agent"],
61
+ "performance_metrics": { "models": ["gpt-4o"], /* tokens, latency, counts */ },
62
+ "errors": []
63
+ }
64
+ ```
65
+
66
+ ## Exploring runs in the UI
67
+
68
+ Open **Run History** from the sidebar. It reads from `GET /api/runs`, so when durable storage is not configured (and that endpoint returns `503`) it shows a friendly notice instead of run data.
69
+
70
+ - **Trends.** A pass-rate line and a per-metric average-score line plot across runs over time, so regressions and improvements are visible at a glance.
71
+ - **Group by.** Toggle between grouping by **eval set** or by **agent**, then pick a specific group to isolate its runs and trends. The pass-rate chart draws one line per agent.
72
+ - **History table.** Every run with its status, eval set, agent, trace count, pass/fail counts, pass-rate bar, duration, and models. Click a row to open the run detail.
73
+ - **Run detail.** For a single run: the evaluator configuration (metrics, thresholds, judge model), the golden eval set it was scored against, and per-eval-case results. Tool-trajectory results expand to an expected-vs-actual diff per invocation, showing exactly where the run diverged from the reference.
74
+
75
+ ### What is and is not persisted
76
+
77
+ Run detail is an *evaluation record*, not a full trace record. It faithfully shows the expected behavior, each metric's pass or fail, and (for trajectory metrics) where the actual tool calls diverged. It does not retain the raw trace spans or timeline, and text-similarity metrics keep only their score, not the actual response text. To replay a full trace, use the live inspector at evaluation time.
78
+
79
+ ## Agent identity and grouping
80
+
81
+ Runs group by **agent** using the OpenTelemetry `service.name` resource attribute, the cross-framework identifier for a service. Set it on your agent with the standard `OTEL_SERVICE_NAME` environment variable:
82
+
83
+ ```bash
84
+ OTEL_SERVICE_NAME=my-agent python my_agent.py
85
+ ```
86
+
87
+ The zero-code examples set this for you (for example `service.name=langchain-agent`). When `service.name` is absent, agentevals falls back to the framework agent name (`gen_ai.agent.name`); it never falls back to a model or span operation name, so a group is always a real agent identity.
88
+
89
+ ## Golden reference handling
90
+
91
+ When you score other agents against a golden session, the golden defines the eval set and therefore matches itself trivially. To keep scoring meaningful, the golden is excluded from pass or fail counts, the agent list, and the results table, but its latency and token usage are still plotted in the performance charts (labeled as the reference) so you can compare the scored agents against the baseline.
92
+
93
+ ## HTTP API
94
+
95
+ All endpoints return `503` (with a hint pointing at `AGENTEVALS_STORAGE_BACKEND=postgres`) when durable storage is not configured.
96
+
97
+ | Method + path | Description |
98
+ |---------------|-------------|
99
+ | `GET /api/runs` | List runs, newest first. Filter with `status`, `limit` (1-1000), and `before` (a `created_at` cursor for pagination) |
100
+ | `GET /api/runs/{run_id}` | Fetch a single run (spec + summary) |
101
+ | `GET /api/runs/{run_id}/results` | List the per (eval case, evaluator) result rows for a run |
102
+ | `POST /api/runs` | Submit a run for asynchronous execution by the in-process worker; idempotent on `run_id` |
103
+ | `POST /api/runs/{run_id}/cancel` | Request cancellation of a queued or running run (idempotent) |
104
+
105
+ Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running.
@@ -56,7 +56,7 @@ def check_prime(nums: list[int]) -> dict:
56
56
  dice_agent = Agent(
57
57
  name="dice_agent",
58
58
  # model="gemini-2.5-flash",
59
- model="gemini-2.5-flash-lite",
59
+ model="gemini-3-flash-preview",
60
60
  instruction="""You are a helpful assistant that can roll dice and check if numbers are prime.
61
61
 
62
62
  When a user asks you to roll a die, use the roll_die tool with the appropriate number of sides.
@@ -44,6 +44,7 @@ async def main():
44
44
  endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
45
45
  print(f"OTLP endpoint: {endpoint}")
46
46
 
47
+ os.environ.setdefault("OTEL_SERVICE_NAME", "adk-agent")
47
48
  os.environ.setdefault(
48
49
  "OTEL_RESOURCE_ATTRIBUTES",
49
50
  "agentevals.eval_set_id=dice_agent_eval,agentevals.session_name=adk-zero-code",
@@ -48,6 +48,7 @@ def main():
48
48
 
49
49
  os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "true"
50
50
 
51
+ os.environ.setdefault("OTEL_SERVICE_NAME", "langchain-agent")
51
52
  os.environ.setdefault(
52
53
  "OTEL_RESOURCE_ATTRIBUTES",
53
54
  "agentevals.eval_set_id=langchain_agent_eval,agentevals.session_name=langchain-zero-code",
@@ -112,6 +112,7 @@ def main():
112
112
  print(f"Local model: {model}")
113
113
 
114
114
  os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "true"
115
+ os.environ.setdefault("OTEL_SERVICE_NAME", "ollama-agent")
115
116
  os.environ.setdefault(
116
117
  "OTEL_RESOURCE_ATTRIBUTES",
117
118
  "agentevals.eval_set_id=langchain_local_ollama_openai_eval,agentevals.session_name=langchain-ollama-openai-zero-code",
@@ -58,6 +58,7 @@ def main():
58
58
  os.environ.setdefault("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "span_and_event")
59
59
  os.environ.setdefault("OTEL_SEMCONV_STABILITY_OPT_IN", "gen_ai_latest_experimental")
60
60
 
61
+ os.environ.setdefault("OTEL_SERVICE_NAME", "openai-agents-agent")
61
62
  os.environ.setdefault(
62
63
  "OTEL_RESOURCE_ATTRIBUTES",
63
64
  "agentevals.eval_set_id=openai_agents_eval,agentevals.session_name=openai-agents-zero-code",
@@ -54,6 +54,7 @@ def main():
54
54
  endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
55
55
  print(f"OTLP endpoint: {endpoint}")
56
56
 
57
+ os.environ.setdefault("OTEL_SERVICE_NAME", "pydantic-ai-agent")
57
58
  os.environ.setdefault(
58
59
  "OTEL_RESOURCE_ATTRIBUTES",
59
60
  "agentevals.eval_set_id=pydantic_ai_eval,agentevals.session_name=pydantic-ai-zero-code",
@@ -72,6 +73,7 @@ def main():
72
73
 
73
74
  agent = Agent(
74
75
  "openai:gpt-4o-mini",
76
+ # "openai:gpt-5.4-mini-2026-03-17",
75
77
  instructions="You are a helpful assistant. You can roll dice and check if numbers are prime.",
76
78
  )
77
79
  agent.tool_plain(roll_die)
@@ -40,6 +40,7 @@ def main():
40
40
  endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
41
41
  print(f"OTLP endpoint: {endpoint}")
42
42
 
43
+ os.environ.setdefault("OTEL_SERVICE_NAME", "strands-agent")
43
44
  os.environ.setdefault(
44
45
  "OTEL_RESOURCE_ATTRIBUTES",
45
46
  "agentevals.eval_set_id=strands_agent_eval,agentevals.session_name=strands-zero-code",