agentevals-cli 0.8.1__tar.gz → 0.8.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/PKG-INFO +17 -1
  2. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/README.md +16 -0
  3. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/templates/deployment.yaml +14 -2
  4. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/values.yaml +16 -0
  5. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/README.md +6 -0
  6. agentevals_cli-0.8.3/examples/custom_sink/README.md +80 -0
  7. agentevals_cli-0.8.3/examples/custom_sink/agentevals_example_custom_sink/__init__.py +1 -0
  8. agentevals_cli-0.8.3/examples/custom_sink/agentevals_example_custom_sink/sink.py +71 -0
  9. agentevals_cli-0.8.3/examples/custom_sink/pyproject.toml +19 -0
  10. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/pyproject.toml +1 -1
  11. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/app.py +42 -3
  12. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/run/__init__.py +1 -1
  13. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/run/sinks.py +110 -26
  14. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/run/test_sinks.py +180 -1
  15. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/uv.lock +1 -1
  16. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.claude/skills/eval/SKILL.md +0 -0
  17. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.claude/skills/eval/evals/evals.json +0 -0
  18. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.claude/skills/inspect/SKILL.md +0 -0
  19. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.claude/skills/inspect/evals/evals.json +0 -0
  20. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.dockerignore +0 -0
  21. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  22. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  23. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  24. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.github/workflows/ci.yml +0 -0
  25. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  26. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.github/workflows/release.yml +0 -0
  27. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.gitignore +0 -0
  28. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/.mcp.json +0 -0
  29. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/CONTRIBUTING.md +0 -0
  30. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/DEVELOPMENT.md +0 -0
  31. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/Dockerfile +0 -0
  32. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/LICENSE +0 -0
  33. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/Makefile +0 -0
  34. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/Chart.yaml +0 -0
  35. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/templates/NOTES.txt +0 -0
  36. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/templates/_helpers.tpl +0 -0
  37. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  38. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/templates/postgresql.yaml +0 -0
  39. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/templates/service.yaml +0 -0
  40. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  41. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/docs/assets/logo-color-on-transparent.svg +0 -0
  42. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/docs/assets/logo-color.png +0 -0
  43. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/docs/assets/logo-dark-on-transparent.svg +0 -0
  44. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/docs/custom-evaluators.md +0 -0
  45. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/docs/eval-set-format.md +0 -0
  46. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/docs/otel-compatibility.md +0 -0
  47. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/docs/streaming.md +0 -0
  48. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/custom_evaluators/eval_config.yaml +0 -0
  49. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/custom_evaluators/response_quality.py +0 -0
  50. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/custom_evaluators/tool_call_checker.py +0 -0
  51. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/dice_agent/README.md +0 -0
  52. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/dice_agent/agent.py +0 -0
  53. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/dice_agent/eval_set.json +0 -0
  54. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/dice_agent/main.py +0 -0
  55. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/dice_agent/test_streaming.py +0 -0
  56. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/kubernetes/README.md +0 -0
  57. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/langchain_agent/README.md +0 -0
  58. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/langchain_agent/agent.py +0 -0
  59. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/langchain_agent/eval_set.json +0 -0
  60. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/langchain_agent/main.py +0 -0
  61. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/langchain_agent/requirements.txt +0 -0
  62. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/langchain_agent/test_streaming.py +0 -0
  63. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/sdk_example/async_example.py +0 -0
  64. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/sdk_example/context_manager_example.py +0 -0
  65. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/sdk_example/decorator_example.py +0 -0
  66. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/sdk_example/requirements.txt +0 -0
  67. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/strands_agent/agent.py +0 -0
  68. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/strands_agent/eval_set.json +0 -0
  69. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/strands_agent/main.py +0 -0
  70. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/strands_agent/requirements.txt +0 -0
  71. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/adk/requirements.txt +0 -0
  72. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/adk/run.py +0 -0
  73. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  74. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/langchain/run.py +0 -0
  75. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  76. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/ollama/run.py +0 -0
  77. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  78. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/openai-agents/run.py +0 -0
  79. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  80. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  81. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/strands/requirements.txt +0 -0
  82. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/examples/zero-code-examples/strands/run.py +0 -0
  83. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/flake.lock +0 -0
  84. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/flake.nix +0 -0
  85. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/packages/evaluator-sdk-py/README.md +0 -0
  86. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  87. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  88. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  89. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  90. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/eval_set_helm.json +0 -0
  91. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/evalset_helm_3_2026-02-23.json +0 -0
  92. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/evalset_k8s_2026-02-20.json +0 -0
  93. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/helm.json +0 -0
  94. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/helm_2.json +0 -0
  95. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/helm_3.json +0 -0
  96. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/k8s.json +0 -0
  97. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/samples/tempo_export_with_batches.json +0 -0
  98. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/__init__.py +0 -0
  99. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/_protocol.py +0 -0
  100. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  101. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/_static/assets/index-Cl6S2lcn.js +0 -0
  102. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/_static/index.html +0 -0
  103. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/_static/logo.svg +0 -0
  104. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/_static/vite.svg +0 -0
  105. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/__init__.py +0 -0
  106. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/debug_routes.py +0 -0
  107. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/dependencies.py +0 -0
  108. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/models.py +0 -0
  109. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/otlp_app.py +0 -0
  110. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/otlp_grpc.py +0 -0
  111. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/otlp_processing.py +0 -0
  112. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/otlp_routes.py +0 -0
  113. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/routes.py +0 -0
  114. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/runs_routes.py +0 -0
  115. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/api/streaming_routes.py +0 -0
  116. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/builtin_metrics.py +0 -0
  117. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/cli.py +0 -0
  118. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/config.py +0 -0
  119. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/converter.py +0 -0
  120. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/custom_evaluators.py +0 -0
  121. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/eval_config_loader.py +0 -0
  122. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/evaluator/__init__.py +0 -0
  123. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/evaluator/resolver.py +0 -0
  124. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/evaluator/sources.py +0 -0
  125. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/evaluator/templates.py +0 -0
  126. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/evaluator/venv.py +0 -0
  127. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/extraction.py +0 -0
  128. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/genai_converter.py +0 -0
  129. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/loader/__init__.py +0 -0
  130. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/loader/auto.py +0 -0
  131. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/loader/base.py +0 -0
  132. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/loader/jaeger.py +0 -0
  133. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/loader/otlp.py +0 -0
  134. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/mcp_server.py +0 -0
  135. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/openai_eval_backend.py +0 -0
  136. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/output.py +0 -0
  137. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/run/fetcher.py +0 -0
  138. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/run/result_builder.py +0 -0
  139. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/run/service.py +0 -0
  140. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/run/worker.py +0 -0
  141. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/runner.py +0 -0
  142. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/sdk.py +0 -0
  143. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/__init__.py +0 -0
  144. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/config.py +0 -0
  145. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/models.py +0 -0
  146. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/postgres/__init__.py +0 -0
  147. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  148. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  149. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/postgres/migrator.py +0 -0
  150. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/postgres/pool.py +0 -0
  151. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/repos/__init__.py +0 -0
  152. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/repos/memory.py +0 -0
  153. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/storage/repos/postgres.py +0 -0
  154. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/streaming/__init__.py +0 -0
  155. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/streaming/incremental_processor.py +0 -0
  156. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/streaming/processor.py +0 -0
  157. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/streaming/session.py +0 -0
  158. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/streaming/ws_server.py +0 -0
  159. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/trace_attrs.py +0 -0
  160. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/trace_metrics.py +0 -0
  161. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/utils/__init__.py +0 -0
  162. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/utils/genai_messages.py +0 -0
  163. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/utils/log_buffer.py +0 -0
  164. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/src/agentevals/utils/log_enrichment.py +0 -0
  165. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/api/__init__.py +0 -0
  166. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/api/test_evaluate_persistence.py +0 -0
  167. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/api/test_runs_routes.py +0 -0
  168. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/integration/__init__.py +0 -0
  169. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/integration/conftest.py +0 -0
  170. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/integration/test_evaluation_pipeline.py +0 -0
  171. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/integration/test_live_agents.py +0 -0
  172. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  173. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/integration/test_session_grouping.py +0 -0
  174. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/integration/test_timing_stress.py +0 -0
  175. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/run/__init__.py +0 -0
  176. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/run/test_fetcher.py +0 -0
  177. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/run/test_result_builder.py +0 -0
  178. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/run/test_service.py +0 -0
  179. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/storage/__init__.py +0 -0
  180. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/storage/test_config.py +0 -0
  181. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/storage/test_memory_repos.py +0 -0
  182. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/storage/test_migrator.py +0 -0
  183. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/storage/test_models.py +0 -0
  184. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_api.py +0 -0
  185. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_cli.py +0 -0
  186. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_converter.py +0 -0
  187. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_extraction.py +0 -0
  188. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_genai_converter.py +0 -0
  189. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_jaeger_loader.py +0 -0
  190. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_loader_auto.py +0 -0
  191. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_log_enrichment.py +0 -0
  192. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_mcp_server.py +0 -0
  193. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_otlp_loader.py +0 -0
  194. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_otlp_receiver.py +0 -0
  195. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_output.py +0 -0
  196. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_protocol.py +0 -0
  197. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_runner.py +0 -0
  198. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_sdk.py +0 -0
  199. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/tests/test_trace_metrics.py +0 -0
  200. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/.gitignore +0 -0
  201. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/README.md +0 -0
  202. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/eslint.config.js +0 -0
  203. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/index.html +0 -0
  204. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/package-lock.json +0 -0
  205. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/package.json +0 -0
  206. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/public/logo.svg +0 -0
  207. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/public/vite.svg +0 -0
  208. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/App.css +0 -0
  209. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/App.tsx +0 -0
  210. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/api/client.ts +0 -0
  211. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/assets/react.svg +0 -0
  212. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  213. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  214. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  215. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  216. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  217. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/BuilderView.tsx +0 -0
  218. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  219. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  220. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  221. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/JsonPreview.tsx +0 -0
  222. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  223. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  224. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/builder/index.ts +0 -0
  225. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  226. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  227. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  228. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  229. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  230. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  231. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  232. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  233. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/DataSection.tsx +0 -0
  234. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  235. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  236. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/InspectorView.tsx +0 -0
  237. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  238. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  239. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  240. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  241. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  242. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  243. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  244. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  245. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  246. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  247. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  248. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/streaming/SessionCard.tsx +0 -0
  249. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  250. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  251. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/upload/FileDropZone.tsx +0 -0
  252. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/upload/MetricSelector.tsx +0 -0
  253. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  254. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  255. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/upload/UploadView.tsx +0 -0
  256. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  257. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/config.ts +0 -0
  258. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/context/TraceContext.tsx +0 -0
  259. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/context/TraceProvider.tsx +0 -0
  260. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/index.css +0 -0
  261. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/console-capture.ts +0 -0
  262. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/evalset-builder.ts +0 -0
  263. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/network-capture.ts +0 -0
  264. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/trace-helpers.ts +0 -0
  265. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/trace-loader.ts +0 -0
  266. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/trace-metadata.ts +0 -0
  267. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/trace-patcher.ts +0 -0
  268. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/types.ts +0 -0
  269. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/lib/utils.ts +0 -0
  270. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/src/main.tsx +0 -0
  271. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/tsconfig.app.json +0 -0
  272. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/tsconfig.json +0 -0
  273. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/tsconfig.node.json +0 -0
  274. {agentevals_cli-0.8.1 → agentevals_cli-0.8.3}/ui/vite.config.ts +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.8.1
3
+ Version: 0.8.3
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -425,6 +425,18 @@ Yes. A custom evaluator is any program that reads JSON from stdin and writes a s
425
425
 
426
426
  Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this pattern.
427
427
 
428
+ **Can I use agentevals to evaluate Claude Code, Codex, or OpenCode?**
429
+
430
+ Not today. agentevals scores agent behavior from OpenTelemetry GenAI traces (spans for model calls, tool calls, agent invocations following the [OTel GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)). The major coding agents do not currently emit telemetry in that shape:
431
+
432
+ - **Claude Code** ships OTel telemetry as logs, not GenAI spans. A prior proof of concept on a feature branch made it work by stitching hook events into synthetic traces. Reviving that path is on the backlog, not a near-term commitment.
433
+ - **Codex** exposes OTel, but in a different shape we have not yet validated against the GenAI semconv.
434
+ - **OpenCode** did not have OTel support merged the last time we checked.
435
+
436
+ Retrofitting agentevals to ingest each harness's bespoke telemetry would take several thousand lines of glue code per agent, for a use case where the dominant signal is "did the final output feel right," not "did the agent call the right tool with the right arguments in the right order." That kind of vibes-based evaluation is interesting work for harness and coding-agent vendors themselves, but it is not what agentevals is optimized for.
437
+
438
+ agentevals is built for the opposite end of the spectrum: smaller, purpose-built, properly instrumented agents (kagent, agentregistry, custom Strands / ADK / LangChain / OpenAI Agents SDK flows) running in cloud-native environments, where success is measurable through tool trajectories, response matching, and deterministic pass/fail gates. If that is your use case, we are a good fit. If you are evaluating long-running coding sessions end to end, you probably want a tool built specifically for that shape.
439
+
428
440
  **How does this compare to ADK's evaluations?**
429
441
 
430
442
  Unlike ADK's eval method, which couples agent execution with evaluation, agentevals only handles scoring: it takes pre-recorded traces and compares them against expected behavior using metrics like tool trajectory matching, response quality, and LLM-based judgments.
@@ -448,3 +460,7 @@ Langfuse is a full observability platform (requires Postgres, ClickHouse, Redis,
448
460
  **How does this compare to Opik?**
449
461
 
450
462
  Opik's primary evaluation path re-runs your application code against a dataset, incurring additional LLM costs per eval run. It also supports online evaluation rules that auto-score production traces. While Opik supports OpenTelemetry ingestion alongside its own SDK, its evaluation workflow still centers on re-execution against datasets. agentevals evaluates pre-recorded OTel traces from any framework without re-execution, and runs entirely locally with no cloud dependency.
463
+
464
+ ## Acknowledgements
465
+
466
+ agentevals is built on top of [Google's Agent Development Kit](https://github.com/google/adk-python). ADK provides the evaluator protocol and the canonical eval data model (`Invocation`, `EvalSet`, `Evaluator`, prebuilt metrics) that this project extends. `google-adk` is licensed under [Apache 2.0](https://github.com/google/adk-python/blob/main/LICENSE), the same license as agentevals. Thanks to the ADK team and contributors.
@@ -397,6 +397,18 @@ Yes. A custom evaluator is any program that reads JSON from stdin and writes a s
397
397
 
398
398
  Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this pattern.
399
399
 
400
+ **Can I use agentevals to evaluate Claude Code, Codex, or OpenCode?**
401
+
402
+ Not today. agentevals scores agent behavior from OpenTelemetry GenAI traces (spans for model calls, tool calls, agent invocations following the [OTel GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)). The major coding agents do not currently emit telemetry in that shape:
403
+
404
+ - **Claude Code** ships OTel telemetry as logs, not GenAI spans. A prior proof of concept on a feature branch made it work by stitching hook events into synthetic traces. Reviving that path is on the backlog, not a near-term commitment.
405
+ - **Codex** exposes OTel, but in a different shape we have not yet validated against the GenAI semconv.
406
+ - **OpenCode** did not have OTel support merged the last time we checked.
407
+
408
+ Retrofitting agentevals to ingest each harness's bespoke telemetry would take several thousand lines of glue code per agent, for a use case where the dominant signal is "did the final output feel right," not "did the agent call the right tool with the right arguments in the right order." That kind of vibes-based evaluation is interesting work for harness and coding-agent vendors themselves, but it is not what agentevals is optimized for.
409
+
410
+ agentevals is built for the opposite end of the spectrum: smaller, purpose-built, properly instrumented agents (kagent, agentregistry, custom Strands / ADK / LangChain / OpenAI Agents SDK flows) running in cloud-native environments, where success is measurable through tool trajectories, response matching, and deterministic pass/fail gates. If that is your use case, we are a good fit. If you are evaluating long-running coding sessions end to end, you probably want a tool built specifically for that shape.
411
+
400
412
  **How does this compare to ADK's evaluations?**
401
413
 
402
414
  Unlike ADK's eval method, which couples agent execution with evaluation, agentevals only handles scoring: it takes pre-recorded traces and compares them against expected behavior using metrics like tool trajectory matching, response quality, and LLM-based judgments.
@@ -420,3 +432,7 @@ Langfuse is a full observability platform (requires Postgres, ClickHouse, Redis,
420
432
  **How does this compare to Opik?**
421
433
 
422
434
  Opik's primary evaluation path re-runs your application code against a dataset, incurring additional LLM costs per eval run. It also supports online evaluation rules that auto-score production traces. While Opik supports OpenTelemetry ingestion alongside its own SDK, its evaluation workflow still centers on re-execution against datasets. agentevals evaluates pre-recorded OTel traces from any framework without re-execution, and runs entirely locally with no cloud dependency.
435
+
436
+ ## Acknowledgements
437
+
438
+ agentevals is built on top of [Google's Agent Development Kit](https://github.com/google/adk-python). ADK provides the evaluator protocol and the canonical eval data model (`Invocation`, `EvalSet`, `Evaluator`, prebuilt metrics) that this project extends. `google-adk` is licensed under [Apache 2.0](https://github.com/google/adk-python/blob/main/LICENSE), the same license as agentevals. Thanks to the ADK team and contributors.
@@ -29,8 +29,9 @@ spec:
29
29
  securityContext:
30
30
  {{- toYaml .Values.podSecurityContext | nindent 8 }}
31
31
  serviceAccountName: {{ include "agentevals.serviceAccountName" . }}
32
- {{- if .Values.ephemeralVolume.enabled }}
32
+ {{- if or .Values.ephemeralVolume.enabled .Values.extraVolumes }}
33
33
  volumes:
34
+ {{- if .Values.ephemeralVolume.enabled }}
34
35
  - name: agentevals-tmp
35
36
  {{- if or .Values.ephemeralVolume.sizeLimit (eq .Values.ephemeralVolume.medium "Memory") }}
36
37
  emptyDir:
@@ -43,6 +44,10 @@ spec:
43
44
  {{- else }}
44
45
  emptyDir: {}
45
46
  {{- end }}
47
+ {{- end }}
48
+ {{- with .Values.extraVolumes }}
49
+ {{- toYaml . | nindent 8 }}
50
+ {{- end }}
46
51
  {{- end }}
47
52
  containers:
48
53
  - name: agentevals
@@ -70,6 +75,8 @@ spec:
70
75
  value: "postgres"
71
76
  - name: AGENTEVALS_DATABASE_SCHEMA
72
77
  value: {{ .Values.database.postgres.schema | quote }}
78
+ - name: AGENTEVALS_AUTO_MIGRATE
79
+ value: {{ .Values.database.postgres.autoMigrate | quote }}
73
80
  {{- if .Values.database.postgres.urlFile }}
74
81
  - name: AGENTEVALS_DATABASE_URL_FILE
75
82
  value: {{ .Values.database.postgres.urlFile | quote }}
@@ -135,10 +142,15 @@ spec:
135
142
  port: http
136
143
  initialDelaySeconds: 15
137
144
  periodSeconds: 20
138
- {{- if .Values.ephemeralVolume.enabled }}
145
+ {{- if or .Values.ephemeralVolume.enabled .Values.extraVolumeMounts }}
139
146
  volumeMounts:
147
+ {{- if .Values.ephemeralVolume.enabled }}
140
148
  - name: agentevals-tmp
141
149
  mountPath: /tmp
150
+ {{- end }}
151
+ {{- with .Values.extraVolumeMounts }}
152
+ {{- toYaml . | nindent 12 }}
153
+ {{- end }}
142
154
  {{- end }}
143
155
  {{- with .Values.nodeSelector }}
144
156
  nodeSelector:
@@ -159,6 +159,16 @@ env: []
159
159
  # -- Extra envFrom sources (ConfigMapRef, SecretRef)
160
160
  envFrom: []
161
161
 
162
+ # -- Extra volumes appended to the pod spec. Use this to mount additional
163
+ # config files or secrets (e.g. result-sink credentials) into the pod.
164
+ extraVolumes: []
165
+
166
+ # -- Extra volumeMounts appended to the main container. Pair with
167
+ # extraVolumes by name. securityContext.readOnlyRootFilesystem is true by
168
+ # default, but that only makes the root filesystem read-only; mounted paths
169
+ # themselves are unaffected, so a writable extraVolumes entry works fine.
170
+ extraVolumeMounts: []
171
+
162
172
  # ==============================================================================
163
173
  # STORAGE (preview feature)
164
174
  #
@@ -195,6 +205,12 @@ database:
195
205
  urlFile: ""
196
206
  # -- Postgres schema to use for agentevals tables.
197
207
  schema: agentevals
208
+ # -- Apply pending database migrations during server startup before the
209
+ # HTTP listener opens. The Postgres advisory lock serialises concurrent
210
+ # replica starts so this is safe with replicaCount > 1. When set to
211
+ # false the server refuses to start if the schema is behind or dirty;
212
+ # run "agentevals migrate up" manually in that case.
213
+ autoMigrate: true
198
214
  # -- Bundled Postgres instance for development and evaluation only.
199
215
  # Not suitable for production. Deployed when enabled is true and url /
200
216
  # urlFile are not set.
@@ -119,6 +119,12 @@ The zero-code and SDK examples implement the same toy agent (dice rolling + prim
119
119
  |---------|-------------|
120
120
  | [kubernetes/](./kubernetes/) | Deploy agentevals with kagent on Kubernetes using native OTLP gRPC ingestion (or optionally an OTel Collector). Includes a walkthrough for comparing two kagent agents (different models) and evaluating them with tool trajectory and response match scores. |
121
121
 
122
+ ## Custom result sinks
123
+
124
+ Plugins can deliver run results (partial metrics, final summary, errors) to arbitrary backends alongside the database. Install a package that declares `[project.entry-points."agentevals.sinks"]`, restart agentevals, then reference the plugin’s `kind` in `spec.sinks` on `POST /api/runs`.
125
+
126
+ See [custom_sink/README.md](./custom_sink/README.md) for a minimal entry-point plugin package and configuration examples.
127
+
122
128
  ## Advanced: GenAI Semantic Convention Patterns
123
129
 
124
130
  > [!TIP]
@@ -0,0 +1,80 @@
1
+ # Custom result sink plugin
2
+
3
+ This folder is a tiny installable Python package that registers a result **sink** with agentevals via Python packaging **entry points** (the `agentevals.sinks` group declared in `pyproject.toml`). The worker fans out partial/final/error events to every configured sink in addition to the database.
4
+
5
+ ## What gets implemented
6
+
7
+ - **`DemoNdjsonSink`** — subclasses `ResultSink` from `agentevals.run.sinks` and appends one JSON object per line to `path` from the run spec (same pattern as the built-in `file` sink, with a `"demo": true` marker on each line).
8
+ - **`create_demo_sink(spec)`** — factory callable; must accept the full sink dict from the run spec and return a `ResultSink` (see return type in code).
9
+
10
+ The entry point **name** (`demo_ndjson` in `pyproject.toml`) is the **`kind`** string clients put under `spec.sinks`.
11
+
12
+ ## Install (local dev)
13
+
14
+ From the agentevals repo root, install the framework first, then this example:
15
+
16
+ ```bash
17
+ uv pip install -e .
18
+ uv pip install -e examples/custom_sink
19
+ ```
20
+
21
+ Restart the agentevals process so `importlib.metadata` picks up the new distribution.
22
+
23
+ PyPI-style usage is the same: depend on `agentevals-example-custom-sink` next to `agentevals-cli`, install both into the server environment, restart.
24
+
25
+ ## Configure runs
26
+
27
+ Async runs are submitted with **`POST /api/runs`**. Put your sink in **`spec.sinks`** (requires Postgres storage — see main docs).
28
+
29
+ Example body (when possible, use an **absolute** `path` on the host where the agentevals process runs). **`path` should normally be a file path** (e.g. `/tmp/demo.ndjson`). If `path` is instead an **existing directory** (including `"."` for the process working directory), output goes to `<path>/agentevals-demo-sink.ndjson`, or to `<path>/<filename>` if you add an optional `"filename"` field next to `path` in the sink dict.
30
+
31
+ The `inline` object must contain real trace data (Jaeger JSON or OTLP), not an empty object.
32
+
33
+ ```json
34
+ {
35
+ "spec": {
36
+ "approach": "trace_replay",
37
+ "target": {
38
+ "kind": "inline",
39
+ "traceFormat": "jaeger-json",
40
+ "inline": {
41
+ "data": [
42
+ {
43
+ "traceID": "61646461646164646164616461646164",
44
+ "spans": [
45
+ {
46
+ "traceID": "61646461646164646164616461646164",
47
+ "spanID": "6164616461646164",
48
+ "operationName": "demo-op",
49
+ "startTime": 1000000,
50
+ "duration": 100000,
51
+ "tags": [],
52
+ "logs": [],
53
+ "references": [],
54
+ "processID": "p1"
55
+ }
56
+ ],
57
+ "processes": { "p1": { "serviceName": "demo" } }
58
+ }
59
+ ]
60
+ }
61
+ },
62
+ "sinks": [{ "kind": "demo_ndjson", "path": "/tmp/agentevals-demo.ndjson" }]
63
+ }
64
+ }
65
+ ```
66
+
67
+ You can list several sinks; they run in parallel. Built-in kinds are `stdout`, `file`, and `http_webhook`.
68
+
69
+ ## Publishing your own sink
70
+
71
+ 1. Implement `ResultSink` from `agentevals.run.sinks` (subclass the protocol, or provide the three async methods).
72
+ 2. Expose a factory `def create_*(spec: dict) -> ResultSink`.
73
+ 3. Add the following to your `pyproject.toml`:
74
+
75
+ ```toml
76
+ [project.entry-points."agentevals.sinks"]
77
+ your_kind = "your_package.module:your_factory"
78
+ ```
79
+
80
+ 4. Install the package into the **same environment** as `agentevals serve`, restart, and reference `"kind": "your_kind"` in `spec.sinks`.
@@ -0,0 +1 @@
1
+ """Example result sink plugin for agentevals (see README)."""
@@ -0,0 +1,71 @@
1
+ """Minimal NDJSON sink registered via setuptools entry points."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+ from uuid import UUID
10
+
11
+ from agentevals.run.sinks import ResultSink
12
+ from agentevals.storage.models import Result
13
+
14
+
15
+ def _result_payload(r: Result) -> dict:
16
+ return r.model_dump(mode="json", by_alias=True)
17
+
18
+
19
+ _DEFAULT_FILENAME = "agentevals-demo-sink.ndjson"
20
+
21
+
22
+ def _resolve_output_file(spec: dict[str, Any]) -> Path:
23
+ """If ``path`` is an existing directory (including ``.``), write NDJSON inside it."""
24
+ p = Path(spec["path"]).expanduser()
25
+ if p.exists() and p.is_dir():
26
+ name = spec.get("filename") or _DEFAULT_FILENAME
27
+ return p / name
28
+ return p
29
+
30
+
31
+ class DemoNdjsonSink(ResultSink):
32
+ """Concrete :class:`~agentevals.run.sinks.ResultSink`; append-only JSON lines with a ``demo`` marker."""
33
+
34
+ def __init__(self, path: Path) -> None:
35
+ self._path = path
36
+ self._lock = asyncio.Lock()
37
+
38
+ async def _write(self, payload: dict) -> None:
39
+ async with self._lock:
40
+ self._path.parent.mkdir(parents=True, exist_ok=True)
41
+ with self._path.open("a") as f: # noqa: ASYNC230
42
+ f.write(json.dumps(payload) + "\n")
43
+
44
+ async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None:
45
+ for r in results:
46
+ await self._write(
47
+ {
48
+ "phase": "partial",
49
+ "run_id": str(run_id),
50
+ "attempt": attempt,
51
+ "demo": True,
52
+ "result": _result_payload(r),
53
+ }
54
+ )
55
+
56
+ async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None:
57
+ await self._write(
58
+ {"phase": "final", "run_id": str(run_id), "attempt": attempt, "demo": True, "summary": summary}
59
+ )
60
+
61
+ async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None:
62
+ await self._write({"phase": "error", "run_id": str(run_id), "attempt": attempt, "demo": True, "error": error})
63
+
64
+
65
+ def create_demo_sink(spec: dict[str, Any]) -> ResultSink:
66
+ """Entry-point factory: returns a :class:`ResultSink`; ``kind`` must be ``demo_ndjson`` (see pyproject).
67
+
68
+ ``path`` should normally be a **file** path. If it points at an existing directory (e.g. ``.`` or ``/tmp``),
69
+ lines are appended to ``<path>/agentevals-demo-sink.ndjson``, or ``<path>/<filename>`` if ``filename`` is set.
70
+ """
71
+ return DemoNdjsonSink(_resolve_output_file(spec))
@@ -0,0 +1,19 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "agentevals-example-custom-sink"
7
+ version = "0.1.0"
8
+ description = "Example setuptools plugin that registers an agentevals result sink"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "agentevals-cli>=0.7.0",
13
+ ]
14
+
15
+ [project.entry-points."agentevals.sinks"]
16
+ demo_ndjson = "agentevals_example_custom_sink.sink:create_demo_sink"
17
+
18
+ [tool.hatch.build.targets.wheel]
19
+ packages = ["agentevals_example_custom_sink"]
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agentevals-cli"
7
- version = "0.8.1"
7
+ version = "0.8.3"
8
8
  description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -17,9 +17,10 @@ from fastapi.responses import StreamingResponse
17
17
  from agentevals import __version__
18
18
 
19
19
  from ..run.service import RunService
20
+ from ..run.sinks import log_registered_sinks
20
21
  from ..run.worker import AsyncRunWorker
21
22
  from ..storage import StorageSettings, build_repos
22
- from ..storage.postgres.migrator import Migrator
23
+ from ..storage.postgres.migrator import Migrator, discover_migrations
23
24
  from ..utils.log_buffer import log_buffer
24
25
  from .debug_routes import debug_router
25
26
  from .routes import router
@@ -30,6 +31,22 @@ if TYPE_CHECKING:
30
31
 
31
32
  logger = logging.getLogger(__name__)
32
33
 
34
+ _TRUE_VALUES = {"true", "1", "yes", "on"}
35
+ _FALSE_VALUES = {"false", "0", "no", "off"}
36
+
37
+
38
+ def _env_bool(name: str, *, default: bool) -> bool:
39
+ raw = os.getenv(name)
40
+ if raw is None or raw == "":
41
+ return default
42
+ val = raw.strip().lower()
43
+ if val in _TRUE_VALUES:
44
+ return True
45
+ if val in _FALSE_VALUES:
46
+ return False
47
+ raise ValueError(f"{name} must be one of true/false/1/0/yes/no/on/off (got: {raw!r})")
48
+
49
+
33
50
  try:
34
51
  from dotenv import load_dotenv
35
52
 
@@ -67,13 +84,34 @@ def _build_lifespan():
67
84
  logger.error("Storage configuration invalid; /api/runs will not be available: %s", exc)
68
85
 
69
86
  if storage_settings is not None and storage_settings.backend == "postgres":
70
- logger.info("Applying any pending migrations to schema '%s'", storage_settings.schema_name)
71
87
  migrator = Migrator(
72
88
  dsn=storage_settings.database_url or "",
73
89
  schema=storage_settings.schema_name,
74
90
  lock_timeout_s=storage_settings.migrate_lock_timeout_s,
75
91
  )
76
- await migrator.up()
92
+ if _env_bool("AGENTEVALS_AUTO_MIGRATE", default=True):
93
+ logger.info("Applying any pending migrations to schema '%s'", storage_settings.schema_name)
94
+ await migrator.up()
95
+ else:
96
+ logger.info(
97
+ "AGENTEVALS_AUTO_MIGRATE is disabled; verifying schema '%s' is up to date",
98
+ storage_settings.schema_name,
99
+ )
100
+ status = await migrator.status()
101
+ if status.dirty:
102
+ raise RuntimeError(
103
+ f"schema_migrations is dirty at version {status.version}. "
104
+ "Resolve manually and run 'agentevals migrate force <version>', "
105
+ "or set AGENTEVALS_AUTO_MIGRATE=true to retry on startup."
106
+ )
107
+ current = status.version
108
+ pending = [m.version for m in discover_migrations() if current is None or m.version > current]
109
+ if pending:
110
+ raise RuntimeError(
111
+ f"Database schema is behind: pending migrations {pending}. "
112
+ "Run 'agentevals migrate up' to apply them, "
113
+ "or set AGENTEVALS_AUTO_MIGRATE=true to apply on startup."
114
+ )
77
115
 
78
116
  repos = await build_repos(storage_settings)
79
117
  app.state.storage_settings = storage_settings
@@ -83,6 +121,7 @@ def _build_lifespan():
83
121
  worker = AsyncRunWorker(runs=repos.runs, results=repos.results, settings=storage_settings)
84
122
  await worker.start()
85
123
  app.state.run_worker = worker
124
+ log_registered_sinks()
86
125
 
87
126
  yield
88
127
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  Contents:
4
4
  - :mod:`fetcher` resolves a run spec's ``target`` into a list of traces.
5
- - :mod:`sinks` fan-out result delivery (stdout, file, http_webhook).
5
+ - :mod:`sinks` fan-out result delivery (built-ins plus setuptools plugins / :func:`~agentevals.run.sinks.register_sink_factory`).
6
6
  - :mod:`service` is the synchronous control surface used by HTTP handlers.
7
7
  - :mod:`worker` is the in-process loop that claims runs and drives the
8
8
  existing :func:`agentevals.runner.run_evaluation_from_traces` pipeline.
@@ -3,6 +3,14 @@
3
3
  The :class:`agentevals.storage.repos.ResultRepository` is always written;
4
4
  sinks are an additional delivery channel. Sink failures are logged with
5
5
  ``run_id`` / ``result_id`` but do not fail the run.
6
+
7
+ **Plugins:** third-party packages declare setuptools entry points in group
8
+ ``agentevals.sinks`` (entry **name** = ``kind`` string; **value** = ``module:factory``
9
+ callable ``factory(spec: dict) -> ResultSink``). Built-in kinds
10
+ (``stdout``, ``file``, ``http_webhook``) are not overridden by entry points;
11
+ hosts may replace any kind via :func:`register_sink_factory` (highest precedence).
12
+
13
+ Tests may call :func:`clear_sink_plugin_registry` to drop programmatic registrations.
6
14
  """
7
15
 
8
16
  from __future__ import annotations
@@ -12,8 +20,10 @@ import json
12
20
  import logging
13
21
  import os
14
22
  import sys
23
+ from collections.abc import Callable
24
+ from importlib.metadata import entry_points
15
25
  from pathlib import Path
16
- from typing import Any, Protocol
26
+ from typing import Any, Protocol, cast
17
27
  from uuid import UUID
18
28
 
19
29
  import httpx
@@ -22,6 +32,8 @@ from ..storage.models import Result
22
32
 
23
33
  logger = logging.getLogger(__name__)
24
34
 
35
+ SINK_ENTRY_POINT_GROUP = "agentevals.sinks"
36
+
25
37
 
26
38
  class ResultSink(Protocol):
27
39
  async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: ...
@@ -29,6 +41,11 @@ class ResultSink(Protocol):
29
41
  async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: ...
30
42
 
31
43
 
44
+ SinkFactory = Callable[[dict[str, Any]], ResultSink]
45
+
46
+ _PLUGIN_FACTORIES: dict[str, SinkFactory] = {}
47
+
48
+
32
49
  def _result_payload(r: Result) -> dict:
33
50
  return r.model_dump(mode="json", by_alias=True)
34
51
 
@@ -187,33 +204,18 @@ class SinkFanout:
187
204
  logger.exception("sink delivery failed in phase=%s", phase)
188
205
 
189
206
 
190
- def build_sinks(specs: list[dict]) -> SinkFanout:
191
- """Construct a fan-out from the run spec's ``sinks`` array.
207
+ def register_sink_factory(kind: str, factory: SinkFactory) -> None:
208
+ """Register or replace the factory for ``kind`` (overrides built-ins and entry points).
192
209
 
193
- Each spec is a dict with ``kind`` plus kind-specific args. Unknown kinds
194
- are skipped with a warning so a future kind added by a host doesn't
195
- break older agentevals replicas mid-rollout.
210
+ Call during process startup before run workers consume specs. The factory receives
211
+ the full sink spec dict (including ``kind``) and returns a :class:`ResultSink`.
196
212
  """
197
- sinks: list[ResultSink] = []
198
- for spec in specs:
199
- kind = spec.get("kind")
200
- if kind == "stdout":
201
- sinks.append(StdoutSink())
202
- elif kind == "file":
203
- sinks.append(FileSink(spec["path"]))
204
- elif kind == "http_webhook":
205
- sinks.append(
206
- HttpWebhookSink(
207
- url=spec["url"],
208
- headers=spec.get("headers"),
209
- headers_from_env=spec.get("headers_from_env") or _extract_env_headers(spec.get("auth")),
210
- timeout_s=float(spec.get("timeout_s", 10.0)),
211
- max_attempts=int(spec.get("max_attempts", 5)),
212
- )
213
- )
214
- else:
215
- logger.warning("unknown sink kind '%s'; skipping", kind)
216
- return SinkFanout(sinks)
213
+ _PLUGIN_FACTORIES[kind] = factory
214
+
215
+
216
+ def clear_sink_plugin_registry() -> None:
217
+ """Drop all registrations from :func:`register_sink_factory` (for tests)."""
218
+ _PLUGIN_FACTORIES.clear()
217
219
 
218
220
 
219
221
  def _extract_env_headers(auth: Any) -> dict[str, str]:
@@ -228,3 +230,85 @@ def _extract_env_headers(auth: Any) -> dict[str, str]:
228
230
  if isinstance(value, dict) and "from_env" in value:
229
231
  result[header_name] = value["from_env"]
230
232
  return result
233
+
234
+
235
+ def _http_webhook_from_spec(spec: dict[str, Any]) -> HttpWebhookSink:
236
+ return HttpWebhookSink(
237
+ url=spec["url"],
238
+ headers=spec.get("headers"),
239
+ headers_from_env=spec.get("headers_from_env") or _extract_env_headers(spec.get("auth")),
240
+ timeout_s=float(spec.get("timeout_s", 10.0)),
241
+ max_attempts=int(spec.get("max_attempts", 5)),
242
+ )
243
+
244
+
245
+ def _builtin_factories() -> dict[str, SinkFactory]:
246
+ return {
247
+ "stdout": lambda _spec: StdoutSink(),
248
+ "file": lambda spec: FileSink(spec["path"]),
249
+ "http_webhook": _http_webhook_from_spec,
250
+ }
251
+
252
+
253
+ def _merge_sink_factories() -> dict[str, SinkFactory]:
254
+ """Built-ins, then entry points (no built-in shadowing), then programmatic overrides."""
255
+ merged: dict[str, SinkFactory] = dict(_builtin_factories())
256
+ eps = entry_points(group=SINK_ENTRY_POINT_GROUP)
257
+ for ep in eps:
258
+ if ep.name in merged:
259
+ logger.debug("skipping sink entry point %r; built-in kind takes precedence", ep.name)
260
+ continue
261
+ try:
262
+ loaded = ep.load()
263
+ if not callable(loaded):
264
+ logger.warning("sink entry point %r is not callable; skipping", ep.name)
265
+ continue
266
+ merged[ep.name] = cast(SinkFactory, loaded)
267
+ except Exception:
268
+ logger.exception("failed to load sink entry point %r", ep.name)
269
+ merged.update(_PLUGIN_FACTORIES)
270
+ return merged
271
+
272
+
273
+ def registered_sink_kinds() -> tuple[str, ...]:
274
+ """Sorted sink ``kind`` strings that would resolve if :func:`build_sinks` ran now.
275
+
276
+ Includes built-ins, successfully loaded setuptools entry points for group
277
+ :data:`SINK_ENTRY_POINT_GROUP`, and registrations from
278
+ :func:`register_sink_factory`. The tuple reflects current process state and
279
+ can change if the programmatic registry is mutated after startup.
280
+ """
281
+ return tuple(sorted(_merge_sink_factories().keys()))
282
+
283
+
284
+ def log_registered_sinks() -> None:
285
+ """Emit one INFO line listing available sink kinds (for operator diagnostics)."""
286
+ kinds = registered_sink_kinds()
287
+ logger.info("Result sinks available (%d kinds): %s", len(kinds), ", ".join(kinds))
288
+
289
+
290
+ def build_sinks(specs: list[dict]) -> SinkFanout:
291
+ """Construct a fan-out from the run spec's ``sinks`` array.
292
+
293
+ Each spec is a dict with ``kind`` plus kind-specific args. Unknown kinds
294
+ are skipped with a warning so a future kind added by a host doesn't
295
+ break older agentevals replicas mid-rollout.
296
+
297
+ Factory lookup starts from built-ins, adds setuptools entry points (group
298
+ ``agentevals.sinks``) for ``kind`` names not already built-in, then applies
299
+ :func:`register_sink_factory` registrations, which override any prior factory
300
+ for the same ``kind``. See :func:`_merge_sink_factories`.
301
+ """
302
+ factories = _merge_sink_factories()
303
+ sinks: list[ResultSink] = []
304
+ for spec in specs:
305
+ kind = spec.get("kind")
306
+ factory = factories.get(kind) if kind is not None else None
307
+ if factory is None:
308
+ logger.warning("unknown sink kind '%s'; skipping", kind)
309
+ continue
310
+ try:
311
+ sinks.append(factory(spec))
312
+ except Exception:
313
+ logger.exception("sink factory failed for kind=%s", kind)
314
+ return SinkFanout(sinks)