agentevals-cli 0.8.2__tar.gz → 0.8.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/PKG-INFO +17 -1
  2. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/README.md +16 -0
  3. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/templates/deployment.yaml +16 -2
  4. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/values.yaml +22 -0
  5. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/pyproject.toml +1 -1
  6. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/app.py +40 -3
  7. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/postgres/migrator.py +45 -8
  8. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/postgres/pool.py +2 -2
  9. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/storage/test_migrator.py +58 -0
  10. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/uv.lock +1 -1
  11. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.claude/skills/eval/SKILL.md +0 -0
  12. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.claude/skills/eval/evals/evals.json +0 -0
  13. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.claude/skills/inspect/SKILL.md +0 -0
  14. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.claude/skills/inspect/evals/evals.json +0 -0
  15. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.dockerignore +0 -0
  16. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  17. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  18. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  19. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.github/workflows/ci.yml +0 -0
  20. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  21. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.github/workflows/release.yml +0 -0
  22. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.gitignore +0 -0
  23. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/.mcp.json +0 -0
  24. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/CONTRIBUTING.md +0 -0
  25. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/DEVELOPMENT.md +0 -0
  26. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/Dockerfile +0 -0
  27. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/LICENSE +0 -0
  28. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/Makefile +0 -0
  29. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/Chart.yaml +0 -0
  30. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/templates/NOTES.txt +0 -0
  31. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/templates/_helpers.tpl +0 -0
  32. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  33. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/templates/postgresql.yaml +0 -0
  34. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/templates/service.yaml +0 -0
  35. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  36. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/docs/assets/logo-color-on-transparent.svg +0 -0
  37. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/docs/assets/logo-color.png +0 -0
  38. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/docs/assets/logo-dark-on-transparent.svg +0 -0
  39. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/docs/custom-evaluators.md +0 -0
  40. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/docs/eval-set-format.md +0 -0
  41. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/docs/otel-compatibility.md +0 -0
  42. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/docs/streaming.md +0 -0
  43. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/README.md +0 -0
  44. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/custom_evaluators/eval_config.yaml +0 -0
  45. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/custom_evaluators/response_quality.py +0 -0
  46. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/custom_evaluators/tool_call_checker.py +0 -0
  47. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/custom_sink/README.md +0 -0
  48. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  49. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  50. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/custom_sink/pyproject.toml +0 -0
  51. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/dice_agent/README.md +0 -0
  52. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/dice_agent/agent.py +0 -0
  53. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/dice_agent/eval_set.json +0 -0
  54. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/dice_agent/main.py +0 -0
  55. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/dice_agent/test_streaming.py +0 -0
  56. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/kubernetes/README.md +0 -0
  57. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/langchain_agent/README.md +0 -0
  58. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/langchain_agent/agent.py +0 -0
  59. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/langchain_agent/eval_set.json +0 -0
  60. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/langchain_agent/main.py +0 -0
  61. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/langchain_agent/requirements.txt +0 -0
  62. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/langchain_agent/test_streaming.py +0 -0
  63. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/sdk_example/async_example.py +0 -0
  64. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/sdk_example/context_manager_example.py +0 -0
  65. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/sdk_example/decorator_example.py +0 -0
  66. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/sdk_example/requirements.txt +0 -0
  67. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/strands_agent/agent.py +0 -0
  68. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/strands_agent/eval_set.json +0 -0
  69. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/strands_agent/main.py +0 -0
  70. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/strands_agent/requirements.txt +0 -0
  71. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/adk/requirements.txt +0 -0
  72. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/adk/run.py +0 -0
  73. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  74. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/langchain/run.py +0 -0
  75. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  76. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/ollama/run.py +0 -0
  77. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  78. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/openai-agents/run.py +0 -0
  79. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  80. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  81. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/strands/requirements.txt +0 -0
  82. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/examples/zero-code-examples/strands/run.py +0 -0
  83. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/flake.lock +0 -0
  84. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/flake.nix +0 -0
  85. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/packages/evaluator-sdk-py/README.md +0 -0
  86. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  87. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  88. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  89. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  90. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/eval_set_helm.json +0 -0
  91. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/evalset_helm_3_2026-02-23.json +0 -0
  92. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/evalset_k8s_2026-02-20.json +0 -0
  93. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/helm.json +0 -0
  94. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/helm_2.json +0 -0
  95. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/helm_3.json +0 -0
  96. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/k8s.json +0 -0
  97. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/samples/tempo_export_with_batches.json +0 -0
  98. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/__init__.py +0 -0
  99. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/_protocol.py +0 -0
  100. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  101. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/_static/assets/index-Cl6S2lcn.js +0 -0
  102. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/_static/index.html +0 -0
  103. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/_static/logo.svg +0 -0
  104. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/_static/vite.svg +0 -0
  105. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/__init__.py +0 -0
  106. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/debug_routes.py +0 -0
  107. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/dependencies.py +0 -0
  108. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/models.py +0 -0
  109. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/otlp_app.py +0 -0
  110. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/otlp_grpc.py +0 -0
  111. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/otlp_processing.py +0 -0
  112. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/otlp_routes.py +0 -0
  113. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/routes.py +0 -0
  114. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/runs_routes.py +0 -0
  115. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/api/streaming_routes.py +0 -0
  116. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/builtin_metrics.py +0 -0
  117. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/cli.py +0 -0
  118. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/config.py +0 -0
  119. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/converter.py +0 -0
  120. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/custom_evaluators.py +0 -0
  121. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/eval_config_loader.py +0 -0
  122. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/evaluator/__init__.py +0 -0
  123. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/evaluator/resolver.py +0 -0
  124. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/evaluator/sources.py +0 -0
  125. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/evaluator/templates.py +0 -0
  126. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/evaluator/venv.py +0 -0
  127. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/extraction.py +0 -0
  128. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/genai_converter.py +0 -0
  129. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/loader/__init__.py +0 -0
  130. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/loader/auto.py +0 -0
  131. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/loader/base.py +0 -0
  132. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/loader/jaeger.py +0 -0
  133. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/loader/otlp.py +0 -0
  134. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/mcp_server.py +0 -0
  135. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/openai_eval_backend.py +0 -0
  136. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/output.py +0 -0
  137. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/run/__init__.py +0 -0
  138. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/run/fetcher.py +0 -0
  139. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/run/result_builder.py +0 -0
  140. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/run/service.py +0 -0
  141. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/run/sinks.py +0 -0
  142. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/run/worker.py +0 -0
  143. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/runner.py +0 -0
  144. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/sdk.py +0 -0
  145. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/__init__.py +0 -0
  146. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/config.py +0 -0
  147. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/models.py +0 -0
  148. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/postgres/__init__.py +0 -0
  149. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  150. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  151. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/repos/__init__.py +0 -0
  152. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/repos/memory.py +0 -0
  153. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/storage/repos/postgres.py +0 -0
  154. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/streaming/__init__.py +0 -0
  155. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/streaming/incremental_processor.py +0 -0
  156. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/streaming/processor.py +0 -0
  157. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/streaming/session.py +0 -0
  158. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/streaming/ws_server.py +0 -0
  159. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/trace_attrs.py +0 -0
  160. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/trace_metrics.py +0 -0
  161. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/utils/__init__.py +0 -0
  162. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/utils/genai_messages.py +0 -0
  163. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/utils/log_buffer.py +0 -0
  164. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/src/agentevals/utils/log_enrichment.py +0 -0
  165. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/api/__init__.py +0 -0
  166. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/api/test_evaluate_persistence.py +0 -0
  167. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/api/test_runs_routes.py +0 -0
  168. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/integration/__init__.py +0 -0
  169. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/integration/conftest.py +0 -0
  170. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/integration/test_evaluation_pipeline.py +0 -0
  171. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/integration/test_live_agents.py +0 -0
  172. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  173. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/integration/test_session_grouping.py +0 -0
  174. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/integration/test_timing_stress.py +0 -0
  175. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/run/__init__.py +0 -0
  176. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/run/test_fetcher.py +0 -0
  177. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/run/test_result_builder.py +0 -0
  178. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/run/test_service.py +0 -0
  179. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/run/test_sinks.py +0 -0
  180. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/storage/__init__.py +0 -0
  181. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/storage/test_config.py +0 -0
  182. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/storage/test_memory_repos.py +0 -0
  183. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/storage/test_models.py +0 -0
  184. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_api.py +0 -0
  185. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_cli.py +0 -0
  186. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_converter.py +0 -0
  187. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_extraction.py +0 -0
  188. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_genai_converter.py +0 -0
  189. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_jaeger_loader.py +0 -0
  190. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_loader_auto.py +0 -0
  191. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_log_enrichment.py +0 -0
  192. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_mcp_server.py +0 -0
  193. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_otlp_loader.py +0 -0
  194. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_otlp_receiver.py +0 -0
  195. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_output.py +0 -0
  196. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_protocol.py +0 -0
  197. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_runner.py +0 -0
  198. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_sdk.py +0 -0
  199. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/tests/test_trace_metrics.py +0 -0
  200. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/.gitignore +0 -0
  201. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/README.md +0 -0
  202. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/eslint.config.js +0 -0
  203. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/index.html +0 -0
  204. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/package-lock.json +0 -0
  205. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/package.json +0 -0
  206. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/public/logo.svg +0 -0
  207. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/public/vite.svg +0 -0
  208. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/App.css +0 -0
  209. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/App.tsx +0 -0
  210. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/api/client.ts +0 -0
  211. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/assets/react.svg +0 -0
  212. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  213. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  214. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  215. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  216. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  217. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/BuilderView.tsx +0 -0
  218. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  219. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  220. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  221. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/JsonPreview.tsx +0 -0
  222. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  223. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  224. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/builder/index.ts +0 -0
  225. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  226. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  227. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  228. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  229. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  230. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  231. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  232. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  233. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/DataSection.tsx +0 -0
  234. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  235. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  236. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/InspectorView.tsx +0 -0
  237. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  238. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  239. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  240. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  241. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  242. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  243. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  244. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  245. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  246. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  247. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  248. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/streaming/SessionCard.tsx +0 -0
  249. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  250. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  251. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/upload/FileDropZone.tsx +0 -0
  252. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/upload/MetricSelector.tsx +0 -0
  253. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  254. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  255. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/upload/UploadView.tsx +0 -0
  256. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  257. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/config.ts +0 -0
  258. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/context/TraceContext.tsx +0 -0
  259. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/context/TraceProvider.tsx +0 -0
  260. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/index.css +0 -0
  261. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/console-capture.ts +0 -0
  262. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/evalset-builder.ts +0 -0
  263. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/network-capture.ts +0 -0
  264. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/trace-helpers.ts +0 -0
  265. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/trace-loader.ts +0 -0
  266. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/trace-metadata.ts +0 -0
  267. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/trace-patcher.ts +0 -0
  268. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/types.ts +0 -0
  269. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/lib/utils.ts +0 -0
  270. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/src/main.tsx +0 -0
  271. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/tsconfig.app.json +0 -0
  272. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/tsconfig.json +0 -0
  273. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/tsconfig.node.json +0 -0
  274. {agentevals_cli-0.8.2 → agentevals_cli-0.8.4}/ui/vite.config.ts +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.8.2
3
+ Version: 0.8.4
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -425,6 +425,18 @@ Yes. A custom evaluator is any program that reads JSON from stdin and writes a s
425
425
 
426
426
  Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this pattern.
427
427
 
428
+ **Can I use agentevals to evaluate Claude Code, Codex, or OpenCode?**
429
+
430
+ Not today. agentevals scores agent behavior from OpenTelemetry GenAI traces (spans for model calls, tool calls, agent invocations following the [OTel GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)). The major coding agents do not currently emit telemetry in that shape:
431
+
432
+ - **Claude Code** ships OTel telemetry as logs, not GenAI spans. A prior proof of concept on a feature branch made it work by stitching hook events into synthetic traces. Reviving that path is on the backlog, not a near-term commitment.
433
+ - **Codex** exposes OTel, but in a different shape we have not yet validated against the GenAI semconv.
434
+ - **OpenCode** did not have OTel support merged the last time we checked.
435
+
436
+ Retrofitting agentevals to ingest each harness's bespoke telemetry would take thousands of lines of glue code per agent, for a use case where the dominant signal is "did the final output feel right," not "did the agent call the right tool with the right arguments in the right order." That kind of vibes-based evaluation is interesting work for harness and coding-agent vendors themselves, but it is not what agentevals is optimized for.
437
+
438
+ agentevals is built for the opposite end of the spectrum: smaller, purpose-built, properly instrumented agents (kagent, agentregistry, custom Strands / ADK / LangChain / OpenAI Agents SDK flows) running in cloud-native environments, where success is measurable through tool trajectories, response matching, and deterministic pass/fail gates. If that is your use case, we are a good fit. If you are evaluating long-running coding sessions end to end, you probably want a tool built specifically for that shape.
439
+
428
440
  **How does this compare to ADK's evaluations?**
429
441
 
430
442
  Unlike ADK's eval method, which couples agent execution with evaluation, agentevals only handles scoring: it takes pre-recorded traces and compares them against expected behavior using metrics like tool trajectory matching, response quality, and LLM-based judgments.
@@ -448,3 +460,7 @@ Langfuse is a full observability platform (requires Postgres, ClickHouse, Redis,
448
460
  **How does this compare to Opik?**
449
461
 
450
462
  Opik's primary evaluation path re-runs your application code against a dataset, incurring additional LLM costs per eval run. It also supports online evaluation rules that auto-score production traces. While Opik supports OpenTelemetry ingestion alongside its own SDK, its evaluation workflow still centers on re-execution against datasets. agentevals evaluates pre-recorded OTel traces from any framework without re-execution, and runs entirely locally with no cloud dependency.
463
+
464
+ ## Acknowledgements
465
+
466
+ agentevals is built on top of [Google's Agent Development Kit](https://github.com/google/adk-python). ADK provides the evaluator protocol and the canonical eval data model (`Invocation`, `EvalSet`, `Evaluator`, prebuilt metrics) that this project extends. `google-adk` is licensed under [Apache 2.0](https://github.com/google/adk-python/blob/main/LICENSE), the same license as agentevals. Thanks to the ADK team and contributors.
@@ -397,6 +397,18 @@ Yes. A custom evaluator is any program that reads JSON from stdin and writes a s
397
397
 
398
398
  Yes. The OTLP receiver on port 4318 accepts standard `http/protobuf` and `http/json` trace exports, so it slots into any OpenTelemetry pipeline as just another exporter destination. If your pipeline uses gRPC (port 4317), place an [OTel Collector](https://opentelemetry.io/docs/collector/) in front to bridge gRPC to HTTP. The [Kubernetes example](examples/kubernetes/README.md) shows this pattern.
399
399
 
400
+ **Can I use agentevals to evaluate Claude Code, Codex, or OpenCode?**
401
+
402
+ Not today. agentevals scores agent behavior from OpenTelemetry GenAI traces (spans for model calls, tool calls, agent invocations following the [OTel GenAI semantic conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)). The major coding agents do not currently emit telemetry in that shape:
403
+
404
+ - **Claude Code** ships OTel telemetry as logs, not GenAI spans. A prior proof of concept on a feature branch made it work by stitching hook events into synthetic traces. Reviving that path is on the backlog, not a near-term commitment.
405
+ - **Codex** exposes OTel, but in a different shape we have not yet validated against the GenAI semconv.
406
+ - **OpenCode** did not have OTel support merged the last time we checked.
407
+
408
+ Retrofitting agentevals to ingest each harness's bespoke telemetry would take thousands of lines of glue code per agent, for a use case where the dominant signal is "did the final output feel right," not "did the agent call the right tool with the right arguments in the right order." That kind of vibes-based evaluation is interesting work for harness and coding-agent vendors themselves, but it is not what agentevals is optimized for.
409
+
410
+ agentevals is built for the opposite end of the spectrum: smaller, purpose-built, properly instrumented agents (kagent, agentregistry, custom Strands / ADK / LangChain / OpenAI Agents SDK flows) running in cloud-native environments, where success is measurable through tool trajectories, response matching, and deterministic pass/fail gates. If that is your use case, we are a good fit. If you are evaluating long-running coding sessions end to end, you probably want a tool built specifically for that shape.
411
+
400
412
  **How does this compare to ADK's evaluations?**
401
413
 
402
414
  Unlike ADK's eval method, which couples agent execution with evaluation, agentevals only handles scoring: it takes pre-recorded traces and compares them against expected behavior using metrics like tool trajectory matching, response quality, and LLM-based judgments.
@@ -420,3 +432,7 @@ Langfuse is a full observability platform (requires Postgres, ClickHouse, Redis,
420
432
  **How does this compare to Opik?**
421
433
 
422
434
  Opik's primary evaluation path re-runs your application code against a dataset, incurring additional LLM costs per eval run. It also supports online evaluation rules that auto-score production traces. While Opik supports OpenTelemetry ingestion alongside its own SDK, its evaluation workflow still centers on re-execution against datasets. agentevals evaluates pre-recorded OTel traces from any framework without re-execution, and runs entirely locally with no cloud dependency.
435
+
436
+ ## Acknowledgements
437
+
438
+ agentevals is built on top of [Google's Agent Development Kit](https://github.com/google/adk-python). ADK provides the evaluator protocol and the canonical eval data model (`Invocation`, `EvalSet`, `Evaluator`, prebuilt metrics) that this project extends. `google-adk` is licensed under [Apache 2.0](https://github.com/google/adk-python/blob/main/LICENSE), the same license as agentevals. Thanks to the ADK team and contributors.
@@ -29,8 +29,9 @@ spec:
29
29
  securityContext:
30
30
  {{- toYaml .Values.podSecurityContext | nindent 8 }}
31
31
  serviceAccountName: {{ include "agentevals.serviceAccountName" . }}
32
- {{- if .Values.ephemeralVolume.enabled }}
32
+ {{- if or .Values.ephemeralVolume.enabled .Values.extraVolumes }}
33
33
  volumes:
34
+ {{- if .Values.ephemeralVolume.enabled }}
34
35
  - name: agentevals-tmp
35
36
  {{- if or .Values.ephemeralVolume.sizeLimit (eq .Values.ephemeralVolume.medium "Memory") }}
36
37
  emptyDir:
@@ -43,6 +44,10 @@ spec:
43
44
  {{- else }}
44
45
  emptyDir: {}
45
46
  {{- end }}
47
+ {{- end }}
48
+ {{- with .Values.extraVolumes }}
49
+ {{- toYaml . | nindent 8 }}
50
+ {{- end }}
46
51
  {{- end }}
47
52
  containers:
48
53
  - name: agentevals
@@ -70,6 +75,10 @@ spec:
70
75
  value: "postgres"
71
76
  - name: AGENTEVALS_DATABASE_SCHEMA
72
77
  value: {{ .Values.database.postgres.schema | quote }}
78
+ - name: AGENTEVALS_AUTO_MIGRATE
79
+ value: {{ .Values.database.postgres.autoMigrate | quote }}
80
+ - name: AGENTEVALS_DB_CONNECT_TIMEOUT_S
81
+ value: {{ .Values.database.postgres.connectTimeoutSeconds | quote }}
73
82
  {{- if .Values.database.postgres.urlFile }}
74
83
  - name: AGENTEVALS_DATABASE_URL_FILE
75
84
  value: {{ .Values.database.postgres.urlFile | quote }}
@@ -135,10 +144,15 @@ spec:
135
144
  port: http
136
145
  initialDelaySeconds: 15
137
146
  periodSeconds: 20
138
- {{- if .Values.ephemeralVolume.enabled }}
147
+ {{- if or .Values.ephemeralVolume.enabled .Values.extraVolumeMounts }}
139
148
  volumeMounts:
149
+ {{- if .Values.ephemeralVolume.enabled }}
140
150
  - name: agentevals-tmp
141
151
  mountPath: /tmp
152
+ {{- end }}
153
+ {{- with .Values.extraVolumeMounts }}
154
+ {{- toYaml . | nindent 12 }}
155
+ {{- end }}
142
156
  {{- end }}
143
157
  {{- with .Values.nodeSelector }}
144
158
  nodeSelector:
@@ -159,6 +159,16 @@ env: []
159
159
  # -- Extra envFrom sources (ConfigMapRef, SecretRef)
160
160
  envFrom: []
161
161
 
162
+ # -- Extra volumes appended to the pod spec. Use this to mount additional
163
+ # config files or secrets (e.g. result-sink credentials) into the pod.
164
+ extraVolumes: []
165
+
166
+ # -- Extra volumeMounts appended to the main container. Pair with
167
+ # extraVolumes by name. securityContext.readOnlyRootFilesystem is true by
168
+ # default, but that only makes the root filesystem read-only; mounted paths
169
+ # themselves are unaffected, so a writable extraVolumes entry works fine.
170
+ extraVolumeMounts: []
171
+
162
172
  # ==============================================================================
163
173
  # STORAGE (preview feature)
164
174
  #
@@ -195,6 +205,18 @@ database:
195
205
  urlFile: ""
196
206
  # -- Postgres schema to use for agentevals tables.
197
207
  schema: agentevals
208
+ # -- Apply pending database migrations during server startup before the
209
+ # HTTP listener opens. The Postgres advisory lock serialises concurrent
210
+ # replica starts so this is safe with replicaCount > 1. When set to
211
+ # false the server refuses to start if the schema is behind or dirty;
212
+ # run "agentevals migrate up" manually in that case.
213
+ autoMigrate: true
214
+ # -- Seconds the startup will spend retrying the initial Postgres
215
+ # connection before the pod aborts. Default 600s matches the chart's
216
+ # hard-coded startupProbe budget (failureThreshold 60 x periodSeconds
217
+ # 10). Going above 600s requires overriding the probe in your own
218
+ # downstream template.
219
+ connectTimeoutSeconds: 600
198
220
  # -- Bundled Postgres instance for development and evaluation only.
199
221
  # Not suitable for production. Deployed when enabled is true and url /
200
222
  # urlFile are not set.
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agentevals-cli"
7
- version = "0.8.2"
7
+ version = "0.8.4"
8
8
  description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -20,7 +20,7 @@ from ..run.service import RunService
20
20
  from ..run.sinks import log_registered_sinks
21
21
  from ..run.worker import AsyncRunWorker
22
22
  from ..storage import StorageSettings, build_repos
23
- from ..storage.postgres.migrator import Migrator
23
+ from ..storage.postgres.migrator import Migrator, discover_migrations
24
24
  from ..utils.log_buffer import log_buffer
25
25
  from .debug_routes import debug_router
26
26
  from .routes import router
@@ -31,6 +31,22 @@ if TYPE_CHECKING:
31
31
 
32
32
  logger = logging.getLogger(__name__)
33
33
 
34
# Accepted spellings (compared after stripping whitespace and lower-casing).
_TRUE_VALUES = {"true", "1", "yes", "on"}
_FALSE_VALUES = {"false", "0", "no", "off"}


def _env_bool(name: str, *, default: bool) -> bool:
    """Read environment variable ``name`` as a boolean flag.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset, empty, or
            contains only whitespace.

    Returns:
        ``True`` or ``False`` according to the (case-insensitive,
        whitespace-trimmed) value, or ``default`` when no value is given.

    Raises:
        ValueError: If the variable holds a non-blank value that is not one
            of the recognised true/false spellings.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    # Normalise before the emptiness check so a whitespace-only value is
    # treated like an empty one instead of being rejected as invalid.
    val = raw.strip().lower()
    if not val:
        return default
    if val in _TRUE_VALUES:
        return True
    if val in _FALSE_VALUES:
        return False
    raise ValueError(f"{name} must be one of true/false/1/0/yes/no/on/off (got: {raw!r})")
48
+
49
+
34
50
  try:
35
51
  from dotenv import load_dotenv
36
52
 
@@ -68,13 +84,34 @@ def _build_lifespan():
68
84
  logger.error("Storage configuration invalid; /api/runs will not be available: %s", exc)
69
85
 
70
86
  if storage_settings is not None and storage_settings.backend == "postgres":
71
- logger.info("Applying any pending migrations to schema '%s'", storage_settings.schema_name)
72
87
  migrator = Migrator(
73
88
  dsn=storage_settings.database_url or "",
74
89
  schema=storage_settings.schema_name,
75
90
  lock_timeout_s=storage_settings.migrate_lock_timeout_s,
76
91
  )
77
- await migrator.up()
92
+ if _env_bool("AGENTEVALS_AUTO_MIGRATE", default=True):
93
+ logger.info("Applying any pending migrations to schema '%s'", storage_settings.schema_name)
94
+ await migrator.up()
95
+ else:
96
+ logger.info(
97
+ "AGENTEVALS_AUTO_MIGRATE is disabled; verifying schema '%s' is up to date",
98
+ storage_settings.schema_name,
99
+ )
100
+ status = await migrator.status()
101
+ if status.dirty:
102
+ raise RuntimeError(
103
+ f"schema_migrations is dirty at version {status.version}. "
104
+ "Resolve manually and run 'agentevals migrate force <version>', "
105
+ "or set AGENTEVALS_AUTO_MIGRATE=true to retry on startup."
106
+ )
107
+ current = status.version
108
+ pending = [m.version for m in discover_migrations() if current is None or m.version > current]
109
+ if pending:
110
+ raise RuntimeError(
111
+ f"Database schema is behind: pending migrations {pending}. "
112
+ "Run 'agentevals migrate up' to apply them, "
113
+ "or set AGENTEVALS_AUTO_MIGRATE=true to apply on startup."
114
+ )
78
115
 
79
116
  repos = await build_repos(storage_settings)
80
117
  app.state.storage_settings = storage_settings
@@ -13,6 +13,8 @@ from __future__ import annotations
13
13
 
14
14
  import asyncio
15
15
  import logging
16
+ import math
17
+ import os
16
18
  import re
17
19
  from dataclasses import dataclass
18
20
  from importlib.resources import files
@@ -252,23 +254,58 @@ def discover_migrations() -> list[Migration]:
252
254
  return _discover_migrations()
253
255
 
254
256
 
255
- CONNECT_RETRY_DEADLINE_S = 60.0
256
- """Total wall-clock budget for the initial Postgres connection. Bundled PG
257
- in Kubernetes typically takes 5-15s to be ready (PVC bind, initdb, listener
258
- bind), so the agentevals lifespan can race the database on a fresh deploy.
259
- Retrying tolerates that gap rather than failing pod startup and relying on
260
- CrashLoopBackOff timing to eventually line up."""
257
CONNECT_RETRY_DEADLINE_S = 600.0
"""Default total wall-clock budget for the initial Postgres connection.
Sized to span Kubernetes bring-up of a freshly provisioned database (PVC
bind, initdb, listener bind, network policy propagation). Override at
runtime by setting ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` to a positive
number of seconds; an invalid value logs a warning and falls back to this
default."""


def connect_deadline_seconds() -> float:
    """Resolve the connect-retry budget in seconds.

    Reads ``AGENTEVALS_DB_CONNECT_TIMEOUT_S`` on every call.

    Returns:
        The parsed value when it is a finite, positive number; otherwise
        :data:`CONNECT_RETRY_DEADLINE_S`. An unset or empty variable falls
        back silently. A non-numeric, non-finite, or non-positive value
        falls back after logging exactly one WARNING naming the reason.
    """
    raw = os.getenv("AGENTEVALS_DB_CONNECT_TIMEOUT_S")
    if raw is None or raw == "":
        return CONNECT_RETRY_DEADLINE_S

    # Work out why the value is unusable (if it is) so that a single
    # warning call below covers every rejection branch.
    reason = None
    val = CONNECT_RETRY_DEADLINE_S
    try:
        val = float(raw)
    except ValueError:
        reason = "not a number"
    else:
        if not math.isfinite(val):
            # float() accepts "nan" / "inf", which the retry loop cannot use.
            reason = "must be finite"
        elif val <= 0:
            reason = "must be positive"

    if reason is None:
        return val

    logger.warning(
        "Invalid AGENTEVALS_DB_CONNECT_TIMEOUT_S=%r (%s); using default %.0fs",
        raw,
        reason,
        CONNECT_RETRY_DEADLINE_S,
    )
    return CONNECT_RETRY_DEADLINE_S
261
297
 
262
298
 
263
299
  async def connect_with_retry(dsn: str, asyncpg_module) -> "asyncpg.Connection":
264
300
  """Open a single asyncpg connection, retrying on connection-refused or
265
- server-not-ready errors for up to ``CONNECT_RETRY_DEADLINE_S`` seconds.
301
+ server-not-ready errors for up to :func:`connect_deadline_seconds`
302
+ seconds.
266
303
 
267
304
  Connection-time errors are tolerated; once a connection has been
268
305
  established and a query returned, all subsequent failures propagate
269
306
  normally.
270
307
  """
271
- deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S
308
+ deadline = asyncio.get_event_loop().time() + connect_deadline_seconds()
272
309
  delay = 0.5
273
310
  while True:
274
311
  try:
@@ -50,9 +50,9 @@ async def create_pool(settings: StorageSettings) -> "asyncpg.Pool":
50
50
  settings.schema_name,
51
51
  )
52
52
 
53
- from .migrator import CONNECT_RETRY_DEADLINE_S
53
+ from .migrator import connect_deadline_seconds
54
54
 
55
- deadline = asyncio.get_event_loop().time() + CONNECT_RETRY_DEADLINE_S
55
+ deadline = asyncio.get_event_loop().time() + connect_deadline_seconds()
56
56
  delay = 0.5
57
57
  while True:
58
58
  try:
@@ -7,6 +7,7 @@ otherwise those tests skip so the suite stays runnable in pure-Python sandboxes.
7
7
 
8
8
  from __future__ import annotations
9
9
 
10
+ import logging
10
11
  import os
11
12
  import re
12
13
 
@@ -14,10 +15,12 @@ import pytest
14
15
 
15
16
  from agentevals.storage.postgres.migrator import (
16
17
  ADVISORY_LOCK_KEY,
18
+ CONNECT_RETRY_DEADLINE_S,
17
19
  Migration,
18
20
  Migrator,
19
21
  _apply_schema,
20
22
  _discover_migrations,
23
+ connect_deadline_seconds,
21
24
  discover_migrations,
22
25
  )
23
26
 
@@ -71,6 +74,61 @@ class TestAdvisoryLockKey:
71
74
  assert ADVISORY_LOCK_KEY == 7259820376655812345
72
75
 
73
76
 
77
class TestConnectDeadlineSeconds:
    """Checks how ``connect_deadline_seconds`` interprets
    AGENTEVALS_DB_CONNECT_TIMEOUT_S: usable values come back as floats,
    anything else yields CONNECT_RETRY_DEADLINE_S, and each rejected value
    is reported at WARNING level."""

    ENV_VAR = "AGENTEVALS_DB_CONNECT_TIMEOUT_S"

    @pytest.fixture(autouse=True)
    def _clean_env(self, monkeypatch):
        # Start every test with the variable absent, regardless of the
        # environment the suite was launched from.
        monkeypatch.delenv(self.ENV_VAR, raising=False)

    def test_unset_returns_default(self):
        resolved = connect_deadline_seconds()
        assert resolved == CONNECT_RETRY_DEADLINE_S

    def test_empty_returns_default(self, monkeypatch):
        monkeypatch.setenv(self.ENV_VAR, "")
        resolved = connect_deadline_seconds()
        assert resolved == CONNECT_RETRY_DEADLINE_S

    @pytest.mark.parametrize(
        ("raw", "expected"),
        [
            ("42", 42.0),
            ("120.5", 120.5),
            ("0.1", 0.1),
            ("3600", 3600.0),
        ],
    )
    def test_parses_valid_positive_values(self, monkeypatch, raw, expected):
        monkeypatch.setenv(self.ENV_VAR, raw)
        resolved = connect_deadline_seconds()
        assert resolved == expected

    @pytest.mark.parametrize(
        ("raw", "reason_substring"),
        [
            ("foo", "not a number"),
            ("nan", "must be finite"),
            ("inf", "must be finite"),
            ("-inf", "must be finite"),
            ("0", "must be positive"),
            ("-5", "must be positive"),
        ],
    )
    def test_invalid_values_fall_back_with_warning(self, monkeypatch, caplog, raw, reason_substring):
        """A rejected value produces the default plus a single warning
        that identifies which validation rule fired. Counting the
        warnings catches a regression that logs twice for one input."""
        monkeypatch.setenv(self.ENV_VAR, raw)
        with caplog.at_level(logging.WARNING, logger="agentevals.storage.postgres.migrator"):
            result = connect_deadline_seconds()

        assert result == CONNECT_RETRY_DEADLINE_S

        messages = []
        for record in caplog.records:
            if record.levelname == "WARNING":
                messages.append(record.getMessage())
        assert len(messages) == 1, f"expected one warning, got {messages}"
        assert reason_substring in messages[0]
130
+
131
+
74
132
  class TestMigrationFilePattern:
75
133
  def test_filename_format(self):
76
134
  migrations = _discover_migrations()
@@ -30,7 +30,7 @@ wheels = [
30
30
 
31
31
  [[package]]
32
32
  name = "agentevals-cli"
33
- version = "0.8.2"
33
+ version = "0.8.4"
34
34
  source = { editable = "." }
35
35
  dependencies = [
36
36
  { name = "click" },
File without changes
File without changes
File without changes
File without changes