agentevals-cli 0.7.2__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (272) hide show
  1. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.github/workflows/release.yml +27 -18
  2. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/DEVELOPMENT.md +29 -1
  3. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/Dockerfile +1 -1
  4. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/Makefile +32 -1
  5. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/PKG-INFO +37 -6
  6. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/README.md +34 -5
  7. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/charts/agentevals/templates/NOTES.txt +8 -0
  8. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/charts/agentevals/templates/_helpers.tpl +33 -0
  9. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/charts/agentevals/templates/deployment.yaml +25 -2
  10. agentevals_cli-0.8.0/charts/agentevals/templates/postgresql-secret.yaml +13 -0
  11. agentevals_cli-0.8.0/charts/agentevals/templates/postgresql.yaml +142 -0
  12. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/charts/agentevals/templates/service.yaml +1 -1
  13. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/charts/agentevals/values.yaml +74 -1
  14. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/docs/streaming.md +12 -6
  15. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/kubernetes/README.md +1 -2
  16. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/pyproject.toml +7 -1
  17. agentevals_cli-0.8.0/samples/tempo_export_with_batches.json +1 -0
  18. agentevals_cli-0.7.2/src/agentevals/_static/assets/index-7YPfPT4N.js → agentevals_cli-0.8.0/src/agentevals/_static/assets/index-Cl6S2lcn.js +64 -65
  19. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/_static/index.html +1 -1
  20. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/app.py +52 -0
  21. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/routes.py +97 -26
  22. agentevals_cli-0.8.0/src/agentevals/api/runs_routes.py +130 -0
  23. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/cli.py +149 -3
  24. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/config.py +7 -4
  25. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/converter.py +19 -6
  26. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/eval_config_loader.py +1 -1
  27. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/extraction.py +51 -2
  28. agentevals_cli-0.8.0/src/agentevals/loader/__init__.py +29 -0
  29. agentevals_cli-0.8.0/src/agentevals/loader/auto.py +108 -0
  30. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/loader/otlp.py +38 -12
  31. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/mcp_server.py +5 -6
  32. agentevals_cli-0.8.0/src/agentevals/run/__init__.py +9 -0
  33. agentevals_cli-0.8.0/src/agentevals/run/fetcher.py +83 -0
  34. agentevals_cli-0.8.0/src/agentevals/run/result_builder.py +125 -0
  35. agentevals_cli-0.8.0/src/agentevals/run/service.py +128 -0
  36. agentevals_cli-0.8.0/src/agentevals/run/sinks.py +230 -0
  37. agentevals_cli-0.8.0/src/agentevals/run/worker.py +184 -0
  38. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/runner.py +4 -15
  39. agentevals_cli-0.8.0/src/agentevals/storage/__init__.py +48 -0
  40. agentevals_cli-0.8.0/src/agentevals/storage/config.py +95 -0
  41. agentevals_cli-0.8.0/src/agentevals/storage/models.py +123 -0
  42. agentevals_cli-0.8.0/src/agentevals/storage/postgres/__init__.py +5 -0
  43. agentevals_cli-0.8.0/src/agentevals/storage/postgres/migrations/000001_init.down.sql +5 -0
  44. agentevals_cli-0.8.0/src/agentevals/storage/postgres/migrations/000001_init.up.sql +110 -0
  45. agentevals_cli-0.8.0/src/agentevals/storage/postgres/migrator.py +287 -0
  46. agentevals_cli-0.8.0/src/agentevals/storage/postgres/pool.py +80 -0
  47. agentevals_cli-0.8.0/src/agentevals/storage/repos/__init__.py +90 -0
  48. agentevals_cli-0.8.0/src/agentevals/storage/repos/memory.py +179 -0
  49. agentevals_cli-0.8.0/src/agentevals/storage/repos/postgres.py +406 -0
  50. agentevals_cli-0.8.0/tests/api/test_evaluate_persistence.py +173 -0
  51. agentevals_cli-0.8.0/tests/api/test_runs_routes.py +185 -0
  52. agentevals_cli-0.8.0/tests/integration/__init__.py +0 -0
  53. agentevals_cli-0.8.0/tests/run/__init__.py +0 -0
  54. agentevals_cli-0.8.0/tests/run/test_fetcher.py +79 -0
  55. agentevals_cli-0.8.0/tests/run/test_result_builder.py +184 -0
  56. agentevals_cli-0.8.0/tests/run/test_service.py +169 -0
  57. agentevals_cli-0.8.0/tests/run/test_sinks.py +248 -0
  58. agentevals_cli-0.8.0/tests/storage/__init__.py +0 -0
  59. agentevals_cli-0.8.0/tests/storage/test_config.py +90 -0
  60. agentevals_cli-0.8.0/tests/storage/test_memory_repos.py +226 -0
  61. agentevals_cli-0.8.0/tests/storage/test_migrator.py +122 -0
  62. agentevals_cli-0.8.0/tests/storage/test_models.py +96 -0
  63. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_api.py +50 -6
  64. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_converter.py +33 -0
  65. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_extraction.py +74 -0
  66. agentevals_cli-0.8.0/tests/test_loader_auto.py +241 -0
  67. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_otlp_loader.py +125 -0
  68. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/api/client.ts +1 -4
  69. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/TraceUploadZone.tsx +1 -1
  70. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/upload/TraceEditorDrawer.tsx +2 -2
  71. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/upload/UploadView.tsx +2 -1
  72. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/context/TraceProvider.tsx +20 -2
  73. agentevals_cli-0.8.0/ui/src/lib/trace-loader.ts +320 -0
  74. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/uv.lock +54 -2
  75. agentevals_cli-0.7.2/src/agentevals/loader/__init__.py +0 -7
  76. agentevals_cli-0.7.2/ui/src/lib/trace-loader.ts +0 -249
  77. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.claude/skills/eval/SKILL.md +0 -0
  78. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.claude/skills/eval/evals/evals.json +0 -0
  79. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.claude/skills/inspect/SKILL.md +0 -0
  80. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.claude/skills/inspect/evals/evals.json +0 -0
  81. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.dockerignore +0 -0
  82. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  83. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  84. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  85. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.github/workflows/ci.yml +0 -0
  86. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  87. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.gitignore +0 -0
  88. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/.mcp.json +0 -0
  89. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/CONTRIBUTING.md +0 -0
  90. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/LICENSE +0 -0
  91. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/charts/agentevals/Chart.yaml +0 -0
  92. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  93. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/docs/assets/logo-color-on-transparent.svg +0 -0
  94. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/docs/assets/logo-color.png +0 -0
  95. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/docs/assets/logo-dark-on-transparent.svg +0 -0
  96. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/docs/custom-evaluators.md +0 -0
  97. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/docs/eval-set-format.md +0 -0
  98. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/docs/otel-compatibility.md +0 -0
  99. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/README.md +0 -0
  100. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/custom_evaluators/eval_config.yaml +0 -0
  101. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/custom_evaluators/response_quality.py +0 -0
  102. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/custom_evaluators/tool_call_checker.py +0 -0
  103. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/dice_agent/README.md +0 -0
  104. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/dice_agent/agent.py +0 -0
  105. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/dice_agent/eval_set.json +0 -0
  106. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/dice_agent/main.py +0 -0
  107. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/dice_agent/test_streaming.py +0 -0
  108. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/langchain_agent/README.md +0 -0
  109. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/langchain_agent/agent.py +0 -0
  110. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/langchain_agent/eval_set.json +0 -0
  111. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/langchain_agent/main.py +0 -0
  112. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/langchain_agent/requirements.txt +0 -0
  113. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/langchain_agent/test_streaming.py +0 -0
  114. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/sdk_example/async_example.py +0 -0
  115. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/sdk_example/context_manager_example.py +0 -0
  116. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/sdk_example/decorator_example.py +0 -0
  117. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/sdk_example/requirements.txt +0 -0
  118. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/strands_agent/agent.py +0 -0
  119. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/strands_agent/eval_set.json +0 -0
  120. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/strands_agent/main.py +0 -0
  121. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/strands_agent/requirements.txt +0 -0
  122. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/adk/requirements.txt +0 -0
  123. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/adk/run.py +0 -0
  124. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  125. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/langchain/run.py +0 -0
  126. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  127. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/ollama/run.py +0 -0
  128. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  129. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/openai-agents/run.py +0 -0
  130. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  131. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  132. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/strands/requirements.txt +0 -0
  133. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/examples/zero-code-examples/strands/run.py +0 -0
  134. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/flake.lock +0 -0
  135. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/flake.nix +0 -0
  136. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/packages/evaluator-sdk-py/README.md +0 -0
  137. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  138. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  139. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  140. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  141. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/samples/eval_set_helm.json +0 -0
  142. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/samples/evalset_helm_3_2026-02-23.json +0 -0
  143. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/samples/evalset_k8s_2026-02-20.json +0 -0
  144. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/samples/helm.json +0 -0
  145. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/samples/helm_2.json +0 -0
  146. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/samples/helm_3.json +0 -0
  147. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/samples/k8s.json +0 -0
  148. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/__init__.py +0 -0
  149. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/_protocol.py +0 -0
  150. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  151. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/_static/logo.svg +0 -0
  152. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/_static/vite.svg +0 -0
  153. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/__init__.py +0 -0
  154. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/debug_routes.py +0 -0
  155. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/dependencies.py +0 -0
  156. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/models.py +0 -0
  157. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/otlp_app.py +0 -0
  158. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/otlp_grpc.py +0 -0
  159. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/otlp_processing.py +0 -0
  160. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/otlp_routes.py +0 -0
  161. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/api/streaming_routes.py +0 -0
  162. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/builtin_metrics.py +0 -0
  163. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/custom_evaluators.py +0 -0
  164. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/evaluator/__init__.py +0 -0
  165. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/evaluator/resolver.py +0 -0
  166. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/evaluator/sources.py +0 -0
  167. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/evaluator/templates.py +0 -0
  168. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/evaluator/venv.py +0 -0
  169. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/genai_converter.py +0 -0
  170. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/loader/base.py +0 -0
  171. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/loader/jaeger.py +0 -0
  172. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/openai_eval_backend.py +0 -0
  173. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/output.py +0 -0
  174. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/sdk.py +0 -0
  175. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/streaming/__init__.py +0 -0
  176. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/streaming/incremental_processor.py +0 -0
  177. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/streaming/processor.py +0 -0
  178. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/streaming/session.py +0 -0
  179. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/streaming/ws_server.py +0 -0
  180. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/trace_attrs.py +0 -0
  181. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/trace_metrics.py +0 -0
  182. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/utils/__init__.py +0 -0
  183. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/utils/genai_messages.py +0 -0
  184. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/utils/log_buffer.py +0 -0
  185. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/src/agentevals/utils/log_enrichment.py +0 -0
  186. {agentevals_cli-0.7.2/tests/integration → agentevals_cli-0.8.0/tests/api}/__init__.py +0 -0
  187. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/integration/conftest.py +0 -0
  188. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/integration/test_evaluation_pipeline.py +0 -0
  189. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/integration/test_live_agents.py +0 -0
  190. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  191. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/integration/test_session_grouping.py +0 -0
  192. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/integration/test_timing_stress.py +0 -0
  193. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_cli.py +0 -0
  194. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_genai_converter.py +0 -0
  195. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_jaeger_loader.py +0 -0
  196. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_log_enrichment.py +0 -0
  197. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_mcp_server.py +0 -0
  198. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_otlp_receiver.py +0 -0
  199. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_output.py +0 -0
  200. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_protocol.py +0 -0
  201. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_runner.py +0 -0
  202. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_sdk.py +0 -0
  203. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/tests/test_trace_metrics.py +0 -0
  204. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/.gitignore +0 -0
  205. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/README.md +0 -0
  206. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/eslint.config.js +0 -0
  207. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/index.html +0 -0
  208. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/package-lock.json +0 -0
  209. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/package.json +0 -0
  210. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/public/logo.svg +0 -0
  211. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/public/vite.svg +0 -0
  212. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/App.css +0 -0
  213. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/App.tsx +0 -0
  214. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/assets/react.svg +0 -0
  215. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  216. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  217. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  218. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  219. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  220. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/BuilderView.tsx +0 -0
  221. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  222. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  223. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  224. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/JsonPreview.tsx +0 -0
  225. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  226. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/builder/index.ts +0 -0
  227. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  228. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  229. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  230. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  231. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  232. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  233. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  234. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  235. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/DataSection.tsx +0 -0
  236. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  237. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  238. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/InspectorView.tsx +0 -0
  239. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  240. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  241. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  242. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  243. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  244. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  245. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  246. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  247. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  248. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  249. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  250. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/streaming/SessionCard.tsx +0 -0
  251. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  252. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  253. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/upload/FileDropZone.tsx +0 -0
  254. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/upload/MetricSelector.tsx +0 -0
  255. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  256. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  257. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/config.ts +0 -0
  258. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/context/TraceContext.tsx +0 -0
  259. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/index.css +0 -0
  260. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/console-capture.ts +0 -0
  261. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/evalset-builder.ts +0 -0
  262. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/network-capture.ts +0 -0
  263. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/trace-helpers.ts +0 -0
  264. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/trace-metadata.ts +0 -0
  265. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/trace-patcher.ts +0 -0
  266. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/types.ts +0 -0
  267. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/lib/utils.ts +0 -0
  268. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/src/main.tsx +0 -0
  269. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/tsconfig.app.json +0 -0
  270. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/tsconfig.json +0 -0
  271. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/tsconfig.node.json +0 -0
  272. {agentevals_cli-0.7.2 → agentevals_cli-0.8.0}/ui/vite.config.ts +0 -0
@@ -18,6 +18,8 @@ jobs:
18
18
  runs-on: ubuntu-latest
19
19
  steps:
20
20
  - uses: actions/checkout@v6
21
+ with:
22
+ ref: ${{ github.event.inputs.tag || github.ref_name }}
21
23
 
22
24
  - uses: astral-sh/setup-uv@v7
23
25
  with:
@@ -42,29 +44,13 @@ jobs:
42
44
  dist/core/*.whl
43
45
  dist/bundle/*.whl
44
46
 
45
- github-release:
46
- needs: build
47
- runs-on: ubuntu-latest
48
- permissions:
49
- contents: write
50
-
51
- steps:
52
- - uses: actions/download-artifact@v8
53
- with:
54
- name: wheels
55
- path: dist/
56
-
57
- - uses: softprops/action-gh-release@v2.5.0
58
- with:
59
- tag_name: ${{ github.event.inputs.tag || github.ref_name }}
60
- files: dist/**/*.whl
61
- generate_release_notes: true
62
-
63
47
  publish:
64
48
  needs: build
65
49
  runs-on: ubuntu-latest
66
50
  steps:
67
51
  - uses: actions/checkout@v6
52
+ with:
53
+ ref: ${{ github.event.inputs.tag || github.ref_name }}
68
54
 
69
55
  - uses: astral-sh/setup-uv@v7
70
56
  with:
@@ -93,13 +79,34 @@ jobs:
93
79
  uv publish dist/* --token ${{ secrets.PYPI_TOKEN }}
94
80
  rm -rf src/agentevals/_static
95
81
 
82
+ github-release:
83
+ needs: publish
84
+ runs-on: ubuntu-latest
85
+ permissions:
86
+ contents: write
87
+
88
+ steps:
89
+ - uses: actions/download-artifact@v8
90
+ with:
91
+ name: wheels
92
+ path: dist/
93
+
94
+ - uses: softprops/action-gh-release@v2.5.0
95
+ with:
96
+ tag_name: ${{ github.event.inputs.tag || github.ref_name }}
97
+ files: dist/**/*.whl
98
+ generate_release_notes: true
99
+
96
100
  push-docker:
101
+ needs: github-release
97
102
  runs-on: ubuntu-latest
98
103
  permissions:
99
104
  contents: read
100
105
  packages: write
101
106
  steps:
102
107
  - uses: actions/checkout@v6
108
+ with:
109
+ ref: ${{ github.event.inputs.tag || github.ref_name }}
103
110
 
104
111
  - name: Login to GitHub Container Registry
105
112
  uses: docker/login-action@v4
@@ -131,6 +138,8 @@ jobs:
131
138
  packages: write
132
139
  steps:
133
140
  - uses: actions/checkout@v6
141
+ with:
142
+ ref: ${{ github.event.inputs.tag || github.ref_name }}
134
143
 
135
144
  - name: Login to GitHub Container Registry
136
145
  uses: docker/login-action@v4
@@ -23,10 +23,38 @@ make dev-frontend # start Vite dev server (port 5173) with HMR
23
23
  make dev-bundle # build UI, serve full bundled experience at port 8001 via uv run
24
24
  ```
25
25
 
26
- Standard development uses `dev-backend` + `dev-frontend` in separate terminals. The Vite dev server proxies nothing the frontend calls the backend at `http://localhost:8001` directly via CORS.
26
+ Standard development uses `dev-backend` + `dev-frontend` in separate terminals. The Vite dev server proxies nothing; the frontend calls the backend at `http://localhost:8001` directly via CORS.
27
27
 
28
28
  `dev-bundle` is useful for testing the bundled UI experience without building a wheel. It copies `ui/dist` into the source tree temporarily and cleans up when the server exits.
29
29
 
30
+ ### Postgres backend (optional, for `/api/runs`)
31
+
32
+ > **Preview.** The schema, the CLI surface, and `/api/runs` shape are still
33
+ > stabilizing. Recreate the agentevals schema between minor version upgrades
34
+ > until further notice; do not depend on persisted data surviving a
35
+ > `git pull` of agentevals itself.
36
+
37
+ The default in-memory backend keeps `make dev-backend` zero-config. To exercise the async run pipeline locally, bring up a Postgres alongside the app:
38
+
39
+ ```bash
40
+ make pg-up # start postgres:18.3-alpine in a docker container (port 5432, ephemeral via --rm)
41
+ make migrate # apply the agentevals schema
42
+ make dev-backend-pg # pg-up + migrate + serve --dev with backend=postgres wired up
43
+ make pg-down # stop the container; data is discarded with --rm
44
+ ```
45
+
46
+ Override the defaults via `PG_PORT=5433 make pg-up` etc. The `migrate` target is idempotent (a second invocation is a no-op).
47
+
48
+ Once running, submit a run with:
49
+
50
+ ```bash
51
+ curl -X POST http://localhost:8001/api/runs \
52
+ -H 'content-type: application/json' \
53
+ -d '{"spec": {"approach": "trace_replay", "target": {"kind": "inline", "inline": {...}}, "evalConfig": {"metrics": ["tool_trajectory_avg_score"]}}}'
54
+ ```
55
+
56
+ Then poll `GET /api/runs/{runId}` and `GET /api/runs/{runId}/results`. Without `storage.backend=postgres`, the `/api/runs` endpoints return 503 with a hint pointing at the env var.
57
+
30
58
  ### Building
31
59
 
32
60
  ```bash
@@ -24,7 +24,7 @@ COPY src ./src
24
24
 
25
25
  COPY --from=ui /build/ui/dist ./src/agentevals/_static
26
26
 
27
- RUN uv sync --frozen --no-dev --extra live \
27
+ RUN uv sync --frozen --no-dev --extra live --extra postgres \
28
28
  && groupadd --gid 1000 app \
29
29
  && useradd --uid 1000 --gid app --home-dir /app --no-log-init app \
30
30
  && chown -R app:app /app
@@ -15,7 +15,14 @@ HELM_CHART_DIR ?= charts/agentevals
15
15
  HELM_CHART_OCI_URL ?= $(HELM_REPO)/helm
16
16
  HELM_CHART_VERSION ?= $(VERSION)
17
17
 
18
- .PHONY: build build-bundle build-docker build-ui release clean dev-backend dev-frontend dev-bundle test test-unit test-integration test-e2e helm-lint helm-template helm-test helm-cleanup helm-package helm-publish
18
+ .PHONY: build build-bundle build-docker build-ui release clean dev-backend dev-backend-pg dev-frontend dev-bundle pg-up pg-down migrate test test-unit test-integration test-e2e helm-lint helm-template helm-test helm-cleanup helm-package helm-publish
19
+
20
+ PG_CONTAINER ?= agentevals-pg
21
+ PG_PORT ?= 5432
22
+ PG_USER ?= agentevals
23
+ PG_PASSWORD ?= agentevals
24
+ PG_DATABASE ?= agentevals
25
+ PG_DSN ?= postgresql://$(PG_USER):$(PG_PASSWORD)@localhost:$(PG_PORT)/$(PG_DATABASE)
19
26
 
20
27
  build:
21
28
  uv build
@@ -53,6 +60,30 @@ release: clean build-ui
53
60
  dev-backend:
54
61
  uv run agentevals serve --dev
55
62
 
63
+ pg-up:
64
+ @if [ -z "$$(docker ps -q -f name=^/$(PG_CONTAINER)$$)" ]; then \
65
+ docker run -d --rm --name $(PG_CONTAINER) \
66
+ -e POSTGRES_USER=$(PG_USER) \
67
+ -e POSTGRES_PASSWORD=$(PG_PASSWORD) \
68
+ -e POSTGRES_DB=$(PG_DATABASE) \
69
+ -p $(PG_PORT):5432 postgres:18.3-alpine; \
70
+ else \
71
+ echo "container $(PG_CONTAINER) already running"; \
72
+ fi
73
+ @until docker exec $(PG_CONTAINER) pg_isready -U $(PG_USER) >/dev/null 2>&1; do sleep 1; done
74
+ @echo "Postgres ready at $(PG_DSN)"
75
+
76
+ pg-down:
77
+ -docker stop $(PG_CONTAINER)
78
+
79
+ migrate:
80
+ AGENTEVALS_DATABASE_URL=$(PG_DSN) uv run agentevals migrate up
81
+
82
+ dev-backend-pg: pg-up migrate
83
+ AGENTEVALS_STORAGE_BACKEND=postgres \
84
+ AGENTEVALS_DATABASE_URL=$(PG_DSN) \
85
+ uv run agentevals serve --dev
86
+
56
87
  dev-frontend:
57
88
  cd ui && npm run dev
58
89
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.7.2
3
+ Version: 0.8.0
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -19,6 +19,8 @@ Requires-Dist: httpx>=0.27.0; extra == 'live'
19
19
  Requires-Dist: mcp>=1.26.0; extra == 'live'
20
20
  Provides-Extra: openai
21
21
  Requires-Dist: openai>=2.0; extra == 'openai'
22
+ Provides-Extra: postgres
23
+ Requires-Dist: asyncpg>=0.30.0; extra == 'postgres'
22
24
  Provides-Extra: streaming
23
25
  Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'streaming'
24
26
  Requires-Dist: websockets>=12.0; extra == 'streaming'
@@ -26,9 +28,9 @@ Description-Content-Type: text/markdown
26
28
 
27
29
  <p align="center">
28
30
  <picture>
29
- <source media="(prefers-color-scheme: dark)" srcset="docs/assets/logo-color-on-transparent.svg">
30
- <source media="(prefers-color-scheme: light)" srcset="docs/assets/logo-dark-on-transparent.svg">
31
- <img src="docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
31
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/agentevals-dev/agentevals/main/docs/assets/logo-color-on-transparent.svg">
32
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/agentevals-dev/agentevals/main/docs/assets/logo-dark-on-transparent.svg">
33
+ <img src="https://raw.githubusercontent.com/agentevals-dev/agentevals/main/docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
32
34
  </picture>
33
35
  </p>
34
36
 
@@ -300,14 +302,43 @@ docker run -p 8001:8001 -p 4317:4317 -p 4318:4318 agentevals
300
302
 
301
303
  ### Helm
302
304
 
303
- A Helm chart is available in [`charts/agentevals/`](charts/agentevals/):
305
+ The Helm chart is published as an OCI artifact to GitHub Container Registry:
304
306
 
305
307
  ```bash
306
- helm install agentevals ./charts/agentevals
308
+ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals
307
309
  ```
308
310
 
311
+ Pass `--version <x.y.z>` to pin to a specific release. Available versions are listed under [packages](https://github.com/agentevals-dev/agentevals/pkgs/container/agentevals%2Fhelm%2Fagentevals).
312
+
313
+ The source for the chart lives in [`charts/agentevals/`](charts/agentevals/) if you want to install from a local checkout instead.
314
+
309
315
  See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end walkthrough deploying agentevals alongside kagent and an OTel Collector on Kubernetes.
310
316
 
317
+ #### Postgres backend (`/api/runs`)
318
+
319
+ > **Preview.** Persistent run history backed by Postgres is under active
320
+ > development. The `storage.*` and `database.postgres.*` chart values, the
321
+ > `/api/runs` HTTP surface, and the database schema may change incompatibly
322
+ > in upcoming releases. Operators evaluating this feature should plan to
323
+ > recreate the agentevals schema when upgrading between minor versions.
324
+ > Default in-memory mode is unaffected.
325
+
326
+ By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state:
327
+
328
+ ```bash
329
+ # Bundled Postgres (dev / evaluation only):
330
+ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \
331
+ --set storage.backend=postgres \
332
+ --set database.postgres.bundled.enabled=true
333
+
334
+ # Or supply an external Postgres DSN:
335
+ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \
336
+ --set storage.backend=postgres \
337
+ --set database.postgres.url='postgresql://user:pass@host:5432/dbname'
338
+ ```
339
+
340
+ When `storage.backend=postgres` is set, the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. Without `storage.backend=postgres`, the `/api/runs` endpoints return 503 with a hint pointing at the `AGENTEVALS_STORAGE_BACKEND` env var.
341
+
311
342
  ## MCP Server
312
343
 
313
344
  Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically.
@@ -1,8 +1,8 @@
1
1
  <p align="center">
2
2
  <picture>
3
- <source media="(prefers-color-scheme: dark)" srcset="docs/assets/logo-color-on-transparent.svg">
4
- <source media="(prefers-color-scheme: light)" srcset="docs/assets/logo-dark-on-transparent.svg">
5
- <img src="docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
3
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/agentevals-dev/agentevals/main/docs/assets/logo-color-on-transparent.svg">
4
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/agentevals-dev/agentevals/main/docs/assets/logo-dark-on-transparent.svg">
5
+ <img src="https://raw.githubusercontent.com/agentevals-dev/agentevals/main/docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
6
6
  </picture>
7
7
  </p>
8
8
 
@@ -274,14 +274,43 @@ docker run -p 8001:8001 -p 4317:4317 -p 4318:4318 agentevals
274
274
 
275
275
  ### Helm
276
276
 
277
- A Helm chart is available in [`charts/agentevals/`](charts/agentevals/):
277
+ The Helm chart is published as an OCI artifact to GitHub Container Registry:
278
278
 
279
279
  ```bash
280
- helm install agentevals ./charts/agentevals
280
+ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals
281
281
  ```
282
282
 
283
+ Pass `--version <x.y.z>` to pin to a specific release. Available versions are listed under [packages](https://github.com/agentevals-dev/agentevals/pkgs/container/agentevals%2Fhelm%2Fagentevals).
284
+
285
+ The source for the chart lives in [`charts/agentevals/`](charts/agentevals/) if you want to install from a local checkout instead.
286
+
283
287
  See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end walkthrough deploying agentevals alongside kagent and an OTel Collector on Kubernetes.
284
288
 
289
+ #### Postgres backend (`/api/runs`)
290
+
291
+ > **Preview.** Persistent run history backed by Postgres is under active
292
+ > development. The `storage.*` and `database.postgres.*` chart values, the
293
+ > `/api/runs` HTTP surface, and the database schema may change incompatibly
294
+ > in upcoming releases. Operators evaluating this feature should plan to
295
+ > recreate the agentevals schema when upgrading between minor versions.
296
+ > Default in-memory mode is unaffected.
297
+
298
+ By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state:
299
+
300
+ ```bash
301
+ # Bundled Postgres (dev / evaluation only):
302
+ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \
303
+ --set storage.backend=postgres \
304
+ --set database.postgres.bundled.enabled=true
305
+
306
+ # Or supply an external Postgres DSN:
307
+ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals \
308
+ --set storage.backend=postgres \
309
+ --set database.postgres.url='postgresql://user:pass@host:5432/dbname'
310
+ ```
311
+
312
+ When `storage.backend=postgres` is set, the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. Without `storage.backend=postgres`, the `/api/runs` endpoints return 503 with a hint pointing at the `AGENTEVALS_STORAGE_BACKEND` env var.
313
+
285
314
  ## MCP Server
286
315
 
287
316
  Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically.
@@ -11,3 +11,11 @@ Get the Service URL:
11
11
  kubectl --namespace {{ include "agentevals.namespace" . }} port-forward $POD_NAME {{ .Values.service.http.port }}:{{ .Values.service.http.port }}
12
12
 
13
13
  Health check: GET http://<pod-ip>:{{ .Values.service.http.containerPort }}/api/health
14
+
15
+ {{- if eq .Values.storage.backend "postgres" }}
16
+
17
+ NOTE: Postgres-backed storage is a preview feature. The storage.* and
18
+ database.postgres.* values, the /api/runs HTTP surface, and the database
19
+ schema may change incompatibly in upcoming releases. Recreate the
20
+ agentevals schema when upgrading between minor versions.
21
+ {{- end }}
@@ -48,6 +48,17 @@ app.kubernetes.io/name: {{ include "agentevals.name" . }}
48
48
  app.kubernetes.io/instance: {{ .Release.Name }}
49
49
  {{- end }}
50
50
 
51
+ {{- /*
52
+ Selector labels scoped to the main app Pod and its Service. Carries the
53
+ ``app.kubernetes.io/component: agentevals`` discriminator so the agentevals
54
+ Service does not also match the bundled Postgres Pod (which carries
55
+ ``app.kubernetes.io/component: database`` instead).
56
+ */ -}}
57
+ {{- define "agentevals.app.selectorLabels" -}}
58
+ {{ include "agentevals.selectorLabels" . }}
59
+ app.kubernetes.io/component: agentevals
60
+ {{- end }}
61
+
51
62
  {{- define "agentevals.serviceAccountName" -}}
52
63
  {{- if .Values.serviceAccount.create }}
53
64
  {{- default (include "agentevals.fullname" .) .Values.serviceAccount.name }}
@@ -55,3 +66,25 @@ app.kubernetes.io/instance: {{ .Release.Name }}
55
66
  {{- default "default" .Values.serviceAccount.name }}
56
67
  {{- end }}
57
68
  {{- end }}
69
+
70
+ {{/*
71
+ Service name for the bundled Postgres instance.
72
+ */}}
73
+ {{- define "agentevals.postgresqlServiceName" -}}
74
+ {{- printf "%s-postgresql" (include "agentevals.fullname" .) -}}
75
+ {{- end -}}
76
+
77
+ {{/*
78
+ Bundled Postgres image reference (registry/repository/name:tag).
79
+ */}}
80
+ {{- define "agentevals.postgresql.image" -}}
81
+ {{- $pg := .Values.database.postgres.bundled -}}
82
+ {{- printf "%s/%s/%s:%s" $pg.image.registry $pg.image.repository $pg.image.name $pg.image.tag -}}
83
+ {{- end -}}
84
+
85
+ {{/*
86
+ Secret name holding POSTGRES_PASSWORD for the bundled Postgres instance.
87
+ */}}
88
+ {{- define "agentevals.passwordSecretName" -}}
89
+ {{- printf "%s-postgresql" (include "agentevals.fullname" .) -}}
90
+ {{- end -}}
@@ -9,7 +9,7 @@ spec:
9
9
  replicas: {{ .Values.replicaCount }}
10
10
  selector:
11
11
  matchLabels:
12
- {{- include "agentevals.selectorLabels" . | nindent 6 }}
12
+ {{- include "agentevals.app.selectorLabels" . | nindent 6 }}
13
13
  template:
14
14
  metadata:
15
15
  {{- with .Values.podAnnotations }}
@@ -17,7 +17,7 @@ spec:
17
17
  {{- toYaml . | nindent 8 }}
18
18
  {{- end }}
19
19
  labels:
20
- {{- include "agentevals.selectorLabels" . | nindent 8 }}
20
+ {{- include "agentevals.app.selectorLabels" . | nindent 8 }}
21
21
  {{- with .Values.podLabels }}
22
22
  {{- toYaml . | nindent 8 }}
23
23
  {{- end }}
@@ -65,6 +65,29 @@ spec:
65
65
  - name: HOME
66
66
  value: "/tmp/agentevals-home"
67
67
  {{- end }}
68
+ {{- if eq .Values.storage.backend "postgres" }}
69
+ - name: AGENTEVALS_STORAGE_BACKEND
70
+ value: "postgres"
71
+ - name: AGENTEVALS_DATABASE_SCHEMA
72
+ value: {{ .Values.database.postgres.schema | quote }}
73
+ {{- if .Values.database.postgres.urlFile }}
74
+ - name: AGENTEVALS_DATABASE_URL_FILE
75
+ value: {{ .Values.database.postgres.urlFile | quote }}
76
+ {{- else if .Values.database.postgres.url }}
77
+ - name: AGENTEVALS_DATABASE_URL
78
+ value: {{ .Values.database.postgres.url | quote }}
79
+ {{- else if .Values.database.postgres.bundled.enabled }}
80
+ - name: POSTGRES_PASSWORD
81
+ valueFrom:
82
+ secretKeyRef:
83
+ name: {{ include "agentevals.passwordSecretName" . }}
84
+ key: POSTGRES_PASSWORD
85
+ - name: AGENTEVALS_DATABASE_URL
86
+ value: {{ printf "postgresql://agentevals:$(POSTGRES_PASSWORD)@%s.%s.svc.cluster.local:5432/agentevals?sslmode=disable" (include "agentevals.postgresqlServiceName" .) (include "agentevals.namespace" .) | quote }}
87
+ {{- else }}
88
+ {{ fail "storage.backend=postgres requires database.postgres.url, database.postgres.urlFile, or database.postgres.bundled.enabled=true" }}
89
+ {{- end }}
90
+ {{- end }}
68
91
  {{- with .Values.env }}
69
92
  {{- toYaml . | nindent 12 }}
70
93
  {{- end }}
@@ -0,0 +1,13 @@
1
+ {{- if and (eq .Values.storage.backend "postgres") .Values.database.postgres.bundled.enabled (not .Values.database.postgres.url) (not .Values.database.postgres.urlFile) }}
2
+ apiVersion: v1
3
+ kind: Secret
4
+ metadata:
5
+ name: {{ include "agentevals.passwordSecretName" . }}
6
+ namespace: {{ include "agentevals.namespace" . }}
7
+ labels:
8
+ {{- include "agentevals.labels" . | nindent 4 }}
9
+ app.kubernetes.io/component: database
10
+ type: Opaque
11
+ data:
12
+ POSTGRES_PASSWORD: {{ "agentevals" | b64enc | quote }}
13
+ {{- end }}
@@ -0,0 +1,142 @@
1
+ {{- if and (eq .Values.storage.backend "postgres") .Values.database.postgres.bundled.enabled (not .Values.database.postgres.url) (not .Values.database.postgres.urlFile) }}
2
+ {{- $pg := .Values.database.postgres.bundled }}
3
+ {{- $fullname := include "agentevals.postgresqlServiceName" . }}
4
+ ---
5
+ apiVersion: v1
6
+ kind: ServiceAccount
7
+ metadata:
8
+ name: {{ $fullname }}
9
+ namespace: {{ include "agentevals.namespace" . }}
10
+ labels:
11
+ {{- include "agentevals.labels" . | nindent 4 }}
12
+ app.kubernetes.io/component: database
13
+ ---
14
+ apiVersion: v1
15
+ kind: PersistentVolumeClaim
16
+ metadata:
17
+ name: {{ $fullname }}
18
+ namespace: {{ include "agentevals.namespace" . }}
19
+ labels:
20
+ {{- include "agentevals.labels" . | nindent 4 }}
21
+ app.kubernetes.io/component: database
22
+ spec:
23
+ accessModes:
24
+ - ReadWriteOnce
25
+ {{- if $pg.storageClassName }}
26
+ storageClassName: {{ $pg.storageClassName | quote }}
27
+ {{- end }}
28
+ resources:
29
+ requests:
30
+ storage: {{ $pg.storage | quote }}
31
+ ---
32
+ apiVersion: apps/v1
33
+ kind: Deployment
34
+ metadata:
35
+ name: {{ $fullname }}
36
+ namespace: {{ include "agentevals.namespace" . }}
37
+ labels:
38
+ {{- include "agentevals.labels" . | nindent 4 }}
39
+ app.kubernetes.io/component: database
40
+ spec:
41
+ replicas: 1
42
+ strategy:
43
+ type: Recreate
44
+ selector:
45
+ matchLabels:
46
+ {{- include "agentevals.selectorLabels" . | nindent 6 }}
47
+ app.kubernetes.io/component: database
48
+ template:
49
+ metadata:
50
+ labels:
51
+ {{- include "agentevals.selectorLabels" . | nindent 8 }}
52
+ app.kubernetes.io/component: database
53
+ spec:
54
+ {{- with .Values.imagePullSecrets }}
55
+ imagePullSecrets:
56
+ {{- toYaml . | nindent 8 }}
57
+ {{- end }}
58
+ serviceAccountName: {{ $fullname }}
59
+ securityContext:
60
+ fsGroup: 999
61
+ runAsUser: 999
62
+ runAsGroup: 999
63
+ runAsNonRoot: true
64
+ containers:
65
+ - name: postgresql
66
+ image: {{ include "agentevals.postgresql.image" . }}
67
+ imagePullPolicy: {{ $pg.image.pullPolicy }}
68
+ securityContext:
69
+ allowPrivilegeEscalation: false
70
+ ports:
71
+ - name: postgresql
72
+ containerPort: 5432
73
+ protocol: TCP
74
+ env:
75
+ - name: POSTGRES_DB
76
+ value: "agentevals"
77
+ - name: POSTGRES_USER
78
+ value: "agentevals"
79
+ - name: POSTGRES_PASSWORD
80
+ valueFrom:
81
+ secretKeyRef:
82
+ name: {{ include "agentevals.passwordSecretName" . }}
83
+ key: POSTGRES_PASSWORD
84
+ - name: PGDATA
85
+ value: /var/lib/postgresql/data/pgdata
86
+ livenessProbe:
87
+ exec:
88
+ command:
89
+ - pg_isready
90
+ - -U
91
+ - agentevals
92
+ - -d
93
+ - agentevals
94
+ initialDelaySeconds: 20
95
+ periodSeconds: 10
96
+ timeoutSeconds: 5
97
+ failureThreshold: 6
98
+ successThreshold: 1
99
+ readinessProbe:
100
+ exec:
101
+ command:
102
+ - pg_isready
103
+ - -U
104
+ - agentevals
105
+ - -d
106
+ - agentevals
107
+ initialDelaySeconds: 5
108
+ periodSeconds: 5
109
+ timeoutSeconds: 3
110
+ failureThreshold: 3
111
+ successThreshold: 1
112
+ {{- with $pg.resources }}
113
+ resources:
114
+ {{- toYaml . | nindent 12 }}
115
+ {{- end }}
116
+ volumeMounts:
117
+ - name: data
118
+ mountPath: /var/lib/postgresql/data
119
+ volumes:
120
+ - name: data
121
+ persistentVolumeClaim:
122
+ claimName: {{ $fullname }}
123
+ ---
124
+ apiVersion: v1
125
+ kind: Service
126
+ metadata:
127
+ name: {{ $fullname }}
128
+ namespace: {{ include "agentevals.namespace" . }}
129
+ labels:
130
+ {{- include "agentevals.labels" . | nindent 4 }}
131
+ app.kubernetes.io/component: database
132
+ spec:
133
+ type: ClusterIP
134
+ ports:
135
+ - name: postgresql
136
+ port: 5432
137
+ targetPort: postgresql
138
+ protocol: TCP
139
+ selector:
140
+ {{- include "agentevals.selectorLabels" . | nindent 4 }}
141
+ app.kubernetes.io/component: database
142
+ {{- end }}
@@ -25,4 +25,4 @@ spec:
25
25
  targetPort: mcp
26
26
  protocol: TCP
27
27
  selector:
28
- {{- include "agentevals.selectorLabels" . | nindent 4 }}
28
+ {{- include "agentevals.app.selectorLabels" . | nindent 4 }}
@@ -2,7 +2,10 @@
2
2
  # Global
3
3
  # ==============================================================================
4
4
 
5
- # -- Number of replicas. Only 1 is supported (no shared job state across pods).
5
+ # -- Number of replicas. The default in-memory backend has no shared state, so
6
+ # scale beyond 1 only when storage.backend is "postgres" (durable runs/results
7
+ # in Postgres are safe to share across replicas via SELECT FOR UPDATE SKIP
8
+ # LOCKED claim semantics).
6
9
  replicaCount: 1
7
10
 
8
11
  # -- Global container image registry (prepended to image.repository)
@@ -155,3 +158,73 @@ env: []
155
158
 
156
159
  # -- Extra envFrom sources (ConfigMapRef, SecretRef)
157
160
  envFrom: []
161
+
162
+ # ==============================================================================
163
+ # STORAGE (preview feature)
164
+ #
165
+ # Persistent run history backed by Postgres is under active development.
166
+ # storage.* and database.postgres.* keys, and the underlying schema, may
167
+ # change incompatibly in upcoming releases. Treat persisted runs and
168
+ # results as ephemeral; recreate the agentevals schema when upgrading
169
+ # between minor versions. The default in-memory backend is unaffected.
170
+ # ==============================================================================
171
+
172
+ storage:
173
+ # -- Storage backend. "memory" (default) keeps the developer experience
174
+ # zero-config: nothing persisted, restarts lose in-flight state. "postgres"
175
+ # enables /api/runs and persists runs + results in Postgres (preview).
176
+ backend: memory
177
+
178
+ # ==============================================================================
179
+ # DATABASE CONFIGURATION
180
+ # ==============================================================================
181
+ # Used only when storage.backend is "postgres". Priority order (first match wins):
182
+ # 1. database.postgres.urlFile -- file-based DSN (workload identity friendly)
183
+ # 2. database.postgres.url -- literal DSN
184
+ # 3. database.postgres.bundled -- chart-bundled Postgres (dev/eval only)
185
+ # If none is configured the chart fails to render.
186
+
187
+ database:
188
+ postgres:
189
+ # -- External Postgres connection string.
190
+ # When set, takes precedence over the bundled instance regardless of
191
+ # database.postgres.bundled.enabled.
192
+ url: ""
193
+ # -- Path to a file containing the connection string. Takes precedence
194
+ # over url when set. Useful for projected workload-identity tokens.
195
+ urlFile: ""
196
+ # -- Postgres schema to use for agentevals tables.
197
+ schema: agentevals
198
+ # -- Bundled Postgres instance for development and evaluation only.
199
+ # Not suitable for production. Deployed when enabled is true and url /
200
+ # urlFile are not set.
201
+ bundled:
202
+ # -- Set to true to deploy a chart-managed Postgres alongside the app.
203
+ # Off by default so the zero-config install stays in-memory.
204
+ enabled: false
205
+ image:
206
+ # -- Bundled Postgres image registry
207
+ registry: docker.io
208
+ # -- Bundled Postgres image repository (org/namespace)
209
+ repository: library
210
+ # -- Bundled Postgres image name
211
+ name: postgres
212
+ # -- Bundled Postgres image tag
213
+ tag: "18.3-alpine"
214
+ # -- Bundled Postgres image pull policy
215
+ pullPolicy: IfNotPresent
216
+ # -- PersistentVolumeClaim size for the bundled Postgres data
217
+ storage: 1Gi
218
+ # -- StorageClass for the PVC. Defaults to the cluster default when empty.
219
+ storageClassName: ""
220
+ # The database name, user, and password are hardcoded for the bundled
221
+ # instance (all: "agentevals"). This is intentional for a dev/eval
222
+ # setup. Switch to an external database for production.
223
+ # -- Resource requests/limits for the bundled Postgres container
224
+ resources:
225
+ requests:
226
+ cpu: 250m
227
+ memory: 256Mi
228
+ limits:
229
+ cpu: 500m
230
+ memory: 512Mi