agentevals-cli 0.9.2__tar.gz → 0.9.4__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (285) hide show
  1. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.github/workflows/release.yml +1 -0
  2. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/Dockerfile +8 -1
  3. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/Makefile +2 -1
  4. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/PKG-INFO +4 -2
  5. agentevals_cli-0.9.4/charts/agentevals/templates/rbac.yaml +33 -0
  6. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/values.yaml +14 -0
  7. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/adk/run.py +1 -1
  8. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/pyproject.toml +8 -1
  9. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/builtin_metrics.py +77 -0
  10. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/config.py +8 -0
  11. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/custom_evaluators.py +2 -0
  12. agentevals_cli-0.9.4/src/agentevals/resolvers/__init__.py +167 -0
  13. agentevals_cli-0.9.4/src/agentevals/resolvers/kubernetes.py +62 -0
  14. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/run/worker.py +10 -0
  15. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/models.py +9 -0
  16. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/integration/test_live_agents.py +7 -0
  17. agentevals_cli-0.9.4/tests/resolvers/test_kubernetes.py +63 -0
  18. agentevals_cli-0.9.4/tests/resolvers/test_registry.py +145 -0
  19. agentevals_cli-0.9.4/tests/storage/__init__.py +0 -0
  20. agentevals_cli-0.9.4/tests/test_credential_injection.py +122 -0
  21. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_mcp_server.py +2 -0
  22. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/uv.lock +176 -802
  23. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.claude/skills/eval/SKILL.md +0 -0
  24. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.claude/skills/eval/evals/evals.json +0 -0
  25. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.claude/skills/inspect/SKILL.md +0 -0
  26. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.claude/skills/inspect/evals/evals.json +0 -0
  27. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.dockerignore +0 -0
  28. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  29. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  30. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  31. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.github/workflows/ci.yml +0 -0
  32. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  33. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.gitignore +0 -0
  34. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/.mcp.json +0 -0
  35. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/CONTRIBUTING.md +0 -0
  36. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/DEVELOPMENT.md +0 -0
  37. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/LICENSE +0 -0
  38. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/README.md +0 -0
  39. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/Chart.yaml +0 -0
  40. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/templates/NOTES.txt +0 -0
  41. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/templates/_helpers.tpl +0 -0
  42. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/templates/deployment.yaml +0 -0
  43. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  44. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/templates/postgresql.yaml +0 -0
  45. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/templates/service.yaml +0 -0
  46. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  47. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/docs/assets/logo-color-on-transparent.svg +0 -0
  48. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/docs/assets/logo-color.png +0 -0
  49. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/docs/assets/logo-dark-on-transparent.svg +0 -0
  50. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/docs/custom-evaluators.md +0 -0
  51. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/docs/eval-set-format.md +0 -0
  52. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/docs/otel-compatibility.md +0 -0
  53. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/docs/streaming.md +0 -0
  54. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/README.md +0 -0
  55. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_evaluators/eval_config.yaml +0 -0
  56. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_evaluators/eval_config_openai_eval.yaml +0 -0
  57. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_evaluators/response_quality.py +0 -0
  58. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_evaluators/tool_call_checker.py +0 -0
  59. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_sink/README.md +0 -0
  60. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  61. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  62. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/custom_sink/pyproject.toml +0 -0
  63. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/dice_agent/README.md +0 -0
  64. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/dice_agent/agent.py +0 -0
  65. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/dice_agent/eval_set.json +0 -0
  66. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/dice_agent/main.py +0 -0
  67. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/dice_agent/test_streaming.py +0 -0
  68. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/kubernetes/README.md +0 -0
  69. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/langchain_agent/README.md +0 -0
  70. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/langchain_agent/agent.py +0 -0
  71. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/langchain_agent/eval_set.json +0 -0
  72. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/langchain_agent/main.py +0 -0
  73. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/langchain_agent/requirements.txt +0 -0
  74. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/langchain_agent/test_streaming.py +0 -0
  75. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/sdk_example/async_example.py +0 -0
  76. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/sdk_example/context_manager_example.py +0 -0
  77. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/sdk_example/decorator_example.py +0 -0
  78. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/sdk_example/requirements.txt +0 -0
  79. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/strands_agent/agent.py +0 -0
  80. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/strands_agent/eval_set.json +0 -0
  81. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/strands_agent/main.py +0 -0
  82. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/strands_agent/requirements.txt +0 -0
  83. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/adk/requirements.txt +0 -0
  84. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  85. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/langchain/run.py +0 -0
  86. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  87. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/ollama/run.py +0 -0
  88. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  89. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/openai-agents/run.py +0 -0
  90. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  91. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  92. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/strands/requirements.txt +0 -0
  93. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/examples/zero-code-examples/strands/run.py +0 -0
  94. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/flake.lock +0 -0
  95. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/flake.nix +0 -0
  96. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/packages/evaluator-sdk-py/README.md +0 -0
  97. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  98. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  99. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  100. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  101. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/eval_set_helm.json +0 -0
  102. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/evalset_helm_3_2026-02-23.json +0 -0
  103. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/evalset_k8s_2026-02-20.json +0 -0
  104. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/helm.json +0 -0
  105. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/helm_2.json +0 -0
  106. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/helm_3.json +0 -0
  107. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/k8s.json +0 -0
  108. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/samples/tempo_export_with_batches.json +0 -0
  109. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/__init__.py +0 -0
  110. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/_protocol.py +0 -0
  111. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  112. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/_static/assets/index-RIquRPno.js +0 -0
  113. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/_static/index.html +0 -0
  114. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/_static/logo.svg +0 -0
  115. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/_static/vite.svg +0 -0
  116. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/__init__.py +0 -0
  117. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/app.py +0 -0
  118. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/debug_routes.py +0 -0
  119. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/dependencies.py +0 -0
  120. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/models.py +0 -0
  121. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/otlp_app.py +0 -0
  122. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/otlp_grpc.py +0 -0
  123. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/otlp_processing.py +0 -0
  124. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/otlp_routes.py +0 -0
  125. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/routes.py +0 -0
  126. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/runs_routes.py +0 -0
  127. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/api/streaming_routes.py +0 -0
  128. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/cli.py +0 -0
  129. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/converter.py +0 -0
  130. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/eval_config_loader.py +0 -0
  131. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/evaluator/__init__.py +0 -0
  132. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/evaluator/resolver.py +0 -0
  133. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/evaluator/sources.py +0 -0
  134. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/evaluator/templates.py +0 -0
  135. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/evaluator/venv.py +0 -0
  136. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/extraction.py +0 -0
  137. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/genai_converter.py +0 -0
  138. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/loader/__init__.py +0 -0
  139. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/loader/auto.py +0 -0
  140. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/loader/base.py +0 -0
  141. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/loader/jaeger.py +0 -0
  142. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/loader/otlp.py +0 -0
  143. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/mcp_server.py +0 -0
  144. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/openai_eval_backend.py +0 -0
  145. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/output.py +0 -0
  146. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/run/__init__.py +0 -0
  147. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/run/fetcher.py +0 -0
  148. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/run/result_builder.py +0 -0
  149. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/run/service.py +0 -0
  150. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/run/sinks.py +0 -0
  151. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/runner.py +0 -0
  152. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/sdk.py +0 -0
  153. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/__init__.py +0 -0
  154. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/config.py +0 -0
  155. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/postgres/__init__.py +0 -0
  156. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  157. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  158. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/postgres/migrator.py +0 -0
  159. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/postgres/pool.py +0 -0
  160. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/repos/__init__.py +0 -0
  161. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/repos/memory.py +0 -0
  162. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/storage/repos/postgres.py +0 -0
  163. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/streaming/__init__.py +0 -0
  164. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/streaming/incremental_processor.py +0 -0
  165. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/streaming/processor.py +0 -0
  166. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/streaming/session.py +0 -0
  167. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/streaming/ws_server.py +0 -0
  168. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/trace_attrs.py +0 -0
  169. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/trace_metrics.py +0 -0
  170. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/utils/__init__.py +0 -0
  171. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/utils/genai_messages.py +0 -0
  172. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/utils/log_buffer.py +0 -0
  173. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/src/agentevals/utils/log_enrichment.py +0 -0
  174. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/api/__init__.py +0 -0
  175. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/api/test_evaluate_persistence.py +0 -0
  176. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/api/test_runs_routes.py +0 -0
  177. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/integration/__init__.py +0 -0
  178. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/integration/conftest.py +0 -0
  179. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/integration/test_evaluation_pipeline.py +0 -0
  180. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  181. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/integration/test_session_grouping.py +0 -0
  182. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/integration/test_timing_stress.py +0 -0
  183. {agentevals_cli-0.9.2/tests/run → agentevals_cli-0.9.4/tests/resolvers}/__init__.py +0 -0
  184. {agentevals_cli-0.9.2/tests/storage → agentevals_cli-0.9.4/tests/run}/__init__.py +0 -0
  185. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/run/test_fetcher.py +0 -0
  186. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/run/test_result_builder.py +0 -0
  187. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/run/test_service.py +0 -0
  188. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/run/test_sinks.py +0 -0
  189. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/storage/test_config.py +0 -0
  190. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/storage/test_memory_repos.py +0 -0
  191. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/storage/test_migrator.py +0 -0
  192. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/storage/test_models.py +0 -0
  193. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_api.py +0 -0
  194. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_cli.py +0 -0
  195. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_converter.py +0 -0
  196. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_eval_config_loader.py +0 -0
  197. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_extraction.py +0 -0
  198. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_genai_converter.py +0 -0
  199. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_jaeger_loader.py +0 -0
  200. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_loader_auto.py +0 -0
  201. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_log_enrichment.py +0 -0
  202. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_openai_eval_backend.py +0 -0
  203. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_otlp_loader.py +0 -0
  204. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_otlp_receiver.py +0 -0
  205. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_output.py +0 -0
  206. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_protocol.py +0 -0
  207. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_runner.py +0 -0
  208. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_sdk.py +0 -0
  209. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/tests/test_trace_metrics.py +0 -0
  210. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/.gitignore +0 -0
  211. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/README.md +0 -0
  212. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/eslint.config.js +0 -0
  213. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/index.html +0 -0
  214. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/package-lock.json +0 -0
  215. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/package.json +0 -0
  216. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/public/logo.svg +0 -0
  217. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/public/vite.svg +0 -0
  218. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/App.css +0 -0
  219. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/App.tsx +0 -0
  220. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/api/client.ts +0 -0
  221. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/assets/react.svg +0 -0
  222. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  223. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  224. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  225. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  226. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  227. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/BuilderView.tsx +0 -0
  228. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  229. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  230. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  231. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/JsonPreview.tsx +0 -0
  232. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  233. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  234. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/builder/index.ts +0 -0
  235. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  236. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  237. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  238. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  239. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  240. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  241. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  242. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  243. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/DataSection.tsx +0 -0
  244. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  245. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  246. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/InspectorView.tsx +0 -0
  247. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  248. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  249. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  250. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  251. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  252. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  253. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  254. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  255. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  256. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  257. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  258. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/streaming/SessionCard.tsx +0 -0
  259. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  260. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  261. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/upload/FileDropZone.tsx +0 -0
  262. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/upload/MetricSelector.tsx +0 -0
  263. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  264. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  265. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/upload/UploadView.tsx +0 -0
  266. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  267. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/config.ts +0 -0
  268. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/context/TraceContext.tsx +0 -0
  269. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/context/TraceProvider.tsx +0 -0
  270. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/index.css +0 -0
  271. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/console-capture.ts +0 -0
  272. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/eval-config.ts +0 -0
  273. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/evalset-builder.ts +0 -0
  274. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/network-capture.ts +0 -0
  275. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/trace-helpers.ts +0 -0
  276. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/trace-loader.ts +0 -0
  277. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/trace-metadata.ts +0 -0
  278. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/trace-patcher.ts +0 -0
  279. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/types.ts +0 -0
  280. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/lib/utils.ts +0 -0
  281. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/src/main.tsx +0 -0
  282. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/tsconfig.app.json +0 -0
  283. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/tsconfig.json +0 -0
  284. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/tsconfig.node.json +0 -0
  285. {agentevals_cli-0.9.2 → agentevals_cli-0.9.4}/ui/vite.config.ts +0 -0
@@ -121,6 +121,7 @@ jobs:
121
121
  run: |
122
122
  VERSION="${TAG#v}"
123
123
  make build-docker \
124
+ VERSION="$VERSION" \
124
125
  DOCKER_REGISTRY="ghcr.io/${{ github.repository_owner }}" \
125
126
  DOCKER_TAG="$VERSION"
126
127
  env:
@@ -24,7 +24,14 @@ COPY src ./src
24
24
 
25
25
  COPY --from=ui /build/ui/dist ./src/agentevals/_static
26
26
 
27
- RUN uv sync --frozen --no-dev --extra live --extra postgres \
27
+ # hatch-vcs reads the version from .git, which the docker build context omits.
28
+ # Pass it in as a build arg. We use the generic SETUPTOOLS_SCM_PRETEND_VERSION
29
+ # because hatch-vcs does not forward dist_name to setuptools-scm, so the
30
+ # per-package SETUPTOOLS_SCM_PRETEND_VERSION_FOR_<DIST> form is never consulted.
31
+ ARG VERSION
32
+ ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}
33
+
34
+ RUN uv sync --frozen --no-dev --extra live --extra postgres --extra kubernetes \
28
35
  && groupadd --gid 1000 app \
29
36
  && useradd --uid 1000 --gid app --home-dir /app --no-log-init app \
30
37
  && chown -R app:app /app
@@ -28,7 +28,8 @@ build:
28
28
  uv build
29
29
 
30
30
  build-docker:
31
- docker buildx build --platform $(PLATFORMS) -t $(DOCKER_IMAGE_REF):$(DOCKER_TAG) --push .
31
+ @test -n "$(VERSION)" || { echo "ERROR: VERSION is empty. Pass VERSION=x.y.z explicitly, or install uv so hatch-vcs can resolve it."; exit 1; }
32
+ docker buildx build --platform $(PLATFORMS) --build-arg VERSION=$(VERSION) -t $(DOCKER_IMAGE_REF):$(DOCKER_TAG) --push .
32
33
 
33
34
  build-ui:
34
35
  cd ui && npm ci && npm run build
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.9.2
3
+ Version: 0.9.4
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
7
7
  Requires-Dist: click>=8.0
8
8
  Requires-Dist: fastapi>=0.115.0
9
- Requires-Dist: google-adk[eval]>=1.30.0
9
+ Requires-Dist: google-adk[eval]<2.2,>=2.1.0
10
10
  Requires-Dist: httpx>=0.27.0
11
11
  Requires-Dist: opentelemetry-proto>=1.36.0
12
12
  Requires-Dist: python-dotenv>=1.0.0
@@ -14,6 +14,8 @@ Requires-Dist: python-multipart>=0.0.12
14
14
  Requires-Dist: pyyaml>=6.0
15
15
  Requires-Dist: tabulate>=0.9.0
16
16
  Requires-Dist: uvicorn[standard]>=0.32.0
17
+ Provides-Extra: kubernetes
18
+ Requires-Dist: kubernetes>=36.0.0; extra == 'kubernetes'
17
19
  Provides-Extra: live
18
20
  Requires-Dist: httpx>=0.27.0; extra == 'live'
19
21
  Requires-Dist: mcp>=1.26.0; extra == 'live'
@@ -0,0 +1,33 @@
1
+ {{- if .Values.rbac.create -}}
2
+ apiVersion: rbac.authorization.k8s.io/v1
3
+ kind: Role
4
+ metadata:
5
+ name: {{ include "agentevals.fullname" . }}
6
+ namespace: {{ include "agentevals.namespace" . }}
7
+ labels:
8
+ {{- include "agentevals.labels" . | nindent 4 }}
9
+ rules:
10
+ - apiGroups: [""]
11
+ resources: ["secrets"]
12
+ verbs: ["get"]
13
+ {{- with .Values.rbac.secretNames }}
14
+ resourceNames:
15
+ {{- toYaml . | nindent 6 }}
16
+ {{- end }}
17
+ ---
18
+ apiVersion: rbac.authorization.k8s.io/v1
19
+ kind: RoleBinding
20
+ metadata:
21
+ name: {{ include "agentevals.fullname" . }}
22
+ namespace: {{ include "agentevals.namespace" . }}
23
+ labels:
24
+ {{- include "agentevals.labels" . | nindent 4 }}
25
+ roleRef:
26
+ apiGroup: rbac.authorization.k8s.io
27
+ kind: Role
28
+ name: {{ include "agentevals.fullname" . }}
29
+ subjects:
30
+ - kind: ServiceAccount
31
+ name: {{ include "agentevals.serviceAccountName" . }}
32
+ namespace: {{ include "agentevals.namespace" . }}
33
+ {{- end }}
@@ -57,6 +57,20 @@ serviceAccount:
57
57
  # -- ServiceAccount name override
58
58
  name: ""
59
59
 
60
+ # ==============================================================================
61
+ # RBAC
62
+ # ==============================================================================
63
+
64
+ # -- Namespaced Role + RoleBinding granting the pod's ServiceAccount read
65
+ # access to Secrets. Enable this when the kubernetes secret resolver reads
66
+ # provider credentials from Secrets via in-cluster config.
67
+ rbac:
68
+ # -- Create the Role and RoleBinding
69
+ create: false
70
+ # -- Restrict the Role to these Secret names. Empty grants get on all
71
+ # Secrets in the release namespace.
72
+ secretNames: []
73
+
60
74
  # ==============================================================================
61
75
  # Pod
62
76
  # ==============================================================================
@@ -74,7 +74,7 @@ async def main():
74
74
 
75
75
  agent_response = ""
76
76
  async for event in runner.run_async(user_id=user_id, session_id=session.id, new_message=content):
77
- if event.content.parts and event.content.parts[0].text:
77
+ if event.content and event.content.parts and event.content.parts[0].text:
78
78
  agent_response = event.content.parts[0].text
79
79
 
80
80
  print(f" Agent: {agent_response}")
@@ -9,7 +9,7 @@ description = "Standalone framework to evaluate agent correctness based on porta
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
11
11
  dependencies = [
12
- "google-adk[eval]>=1.30.0",
12
+ "google-adk[eval]>=2.1.0,<2.2",
13
13
  "click>=8.0",
14
14
  "tabulate>=0.9.0",
15
15
  "fastapi>=0.115.0",
@@ -36,10 +36,17 @@ openai = [
36
36
  postgres = [
37
37
  "asyncpg>=0.30.0",
38
38
  ]
39
+ kubernetes = [
40
+ "kubernetes>=36.0.0",
41
+ ]
39
42
 
40
43
  [project.scripts]
41
44
  agentevals = "agentevals.cli:main"
42
45
 
46
+ [project.entry-points."agentevals.secret_resolvers"]
47
+ env = "agentevals.resolvers:create_env_resolver"
48
+ kubernetes = "agentevals.resolvers.kubernetes:create_kubernetes_resolver"
49
+
43
50
  [tool.hatch.version]
44
51
  source = "vcs"
45
52
 
@@ -27,6 +27,8 @@ from google.adk.evaluation.eval_metrics import (
27
27
  from google.adk.evaluation.eval_rubrics import Rubric, RubricContent
28
28
  from google.adk.evaluation.evaluator import EvaluationResult, Evaluator
29
29
 
30
+ from .resolvers import get_resolved_credential
31
+
30
32
  logger = logging.getLogger(__name__)
31
33
 
32
34
  METRICS_NEEDING_EXPECTED = {
@@ -267,6 +269,67 @@ def get_evaluator(eval_metric: EvalMetric) -> Evaluator:
267
269
  return DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric)
268
270
 
269
271
 
272
def _build_judge_model(model_id: str, api_key: str, base_url: str | None = None):
    """Return a judge ``BaseLlm`` that authenticates with *api_key* instead of process env.

    The model class is chosen by asking ADK's ``LLMRegistry`` which class *model_id*
    resolves to:

    * ``LiteLlm`` (or a subclass): ``api_key`` and, when given, ``base_url`` are passed
      as constructor kwargs.
    * anything else: a ``Gemini`` model is created and its ``api_client`` slot is
      pre-filled with a ``google.genai`` ``Client`` built from the key (and an
      ``HttpOptions`` carrying ``base_url`` when one is given).

    NOTE(review): the evaluator is expected to have already resolved this same
    *model_id* when it was constructed, so the registry lookup should not fail
    here — confirm against the ADK version in use.
    """
    from google.adk.models.lite_llm import LiteLlm
    from google.adk.models.registry import LLMRegistry

    resolved_cls = LLMRegistry().resolve(model_id)

    if not issubclass(resolved_cls, LiteLlm):
        from google.adk.models.google_llm import Gemini
        from google.genai import Client
        from google.genai import types as genai_types

        gemini = Gemini(model=model_id)
        genai_kwargs: dict[str, Any] = {"api_key": api_key}
        if base_url:
            genai_kwargs["http_options"] = genai_types.HttpOptions(base_url=base_url)
        # Writing straight into the instance __dict__ under "api_client" pre-empts the
        # lazily built client, so the judge talks to the API with the resolved key.
        gemini.__dict__["api_client"] = Client(**genai_kwargs)
        return gemini

    litellm_kwargs: dict[str, Any] = {"api_key": api_key}
    if base_url:
        litellm_kwargs["base_url"] = base_url
    return LiteLlm(model=model_id, **litellm_kwargs)
305
+
306
+
307
def _inject_judge_credential(evaluator: Evaluator, api_key: str, base_url: str | None = None) -> None:
    """Swap *evaluator*'s judge model for one that carries *api_key*.

    Detection is attribute-based rather than class-based: the evaluator must expose
    both ``_judge_model_options`` and ``_judge_model``. When either is missing, or the
    options carry no ``judge_model`` id, a warning is logged and the evaluator is left
    untouched. Otherwise ``_judge_model`` is overwritten with the result of
    ``_build_judge_model``.

    NOTE(review): this assumes the caller hands in a fresh evaluator instance per
    evaluation, so the in-place mutation is not shared between runs — confirm.

    TODO(upstream): propose that ADK ``JudgeModelOptions`` carry a credential or a
    prebuilt model instance, so judge auth no longer depends on these private
    attributes or on process env.
    """
    evaluator_name = type(evaluator).__name__

    judge_opts = getattr(evaluator, "_judge_model_options", None)
    judge_backed = judge_opts is not None and hasattr(evaluator, "_judge_model")
    if not judge_backed:
        logger.warning("evaluator %s is not judge-backed; cannot inject credential", evaluator_name)
        return

    judge_model_id = getattr(judge_opts, "judge_model", None)
    if judge_model_id:
        evaluator._judge_model = _build_judge_model(judge_model_id, api_key, base_url)
        return

    logger.warning("evaluator %s has no resolved judge_model; skipping credential injection", evaluator_name)
331
+
332
+
270
333
  def extract_trajectory_details(eval_result: EvaluationResult) -> dict[str, Any]:
271
334
  """Extract expected vs actual tool call details from trajectory evaluation."""
272
335
  comparisons = []
@@ -305,6 +368,8 @@ async def evaluate_builtin_metric(
305
368
  judge_model: str | None,
306
369
  threshold: float | None,
307
370
  match_type: str | None = None,
371
+ credential_ref: str | None = None,
372
+ judge_base_url: str | None = None,
308
373
  ) -> dict[str, Any]:
309
374
  """Evaluate a single built-in ADK metric.
310
375
 
@@ -326,6 +391,18 @@ async def evaluate_builtin_metric(
326
391
  eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
327
392
  evaluator: Evaluator = get_evaluator(eval_metric)
328
393
 
394
+ if credential_ref:
395
+ api_key = get_resolved_credential(credential_ref)
396
+ if api_key is None:
397
+ return MetricResult(
398
+ metric_name=metric_name,
399
+ error=(
400
+ f"Metric '{metric_name}' references credential '{credential_ref}', "
401
+ f"which was not provided in the run's credentialRefs."
402
+ ),
403
+ )
404
+ _inject_judge_credential(evaluator, api_key, judge_base_url)
405
+
329
406
  if metric_name in _METRICS_NEEDING_INVOCATION_EVENTS:
330
407
  actual_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in actual_invocations])
331
408
  if expected_invocations is not None:
@@ -27,6 +27,14 @@ class BuiltinMetricDef(BaseModel):
27
27
  threshold: float | None = Field(default=None, ge=0, le=1)
28
28
  judge_model: str | None = None
29
29
  trajectory_match_type: str | None = None
30
+ credential_ref: str | None = Field(
31
+ default=None,
32
+ description="Logical name of a RunSpec.credential_refs entry whose resolved value is the judge API key.",
33
+ )
34
+ judge_base_url: str | None = Field(
35
+ default=None,
36
+ description="Optional base URL for the judge endpoint (e.g. an OpenAI-compatible proxy).",
37
+ )
30
38
 
31
39
  @field_validator("trajectory_match_type")
32
40
  @classmethod
@@ -453,6 +453,8 @@ async def evaluate_custom_evaluator(
453
453
  judge_model=evaluator_def.judge_model,
454
454
  threshold=evaluator_def.threshold,
455
455
  match_type=evaluator_def.trajectory_match_type,
456
+ credential_ref=evaluator_def.credential_ref,
457
+ judge_base_url=evaluator_def.judge_base_url,
456
458
  )
457
459
 
458
460
  if isinstance(evaluator_def, OpenAIEvalDef):
@@ -0,0 +1,167 @@
1
+ """Secret resolvers — a generic, pluggable layer for resolving secret references.
2
+
3
+ A host attaches *secret references* to a run (``RunSpec.credential_refs``); each
4
+ reference is a ``dict`` with a ``kind`` plus kind-specific locator fields. At run
5
+ time the worker resolves every reference once to its secret value and stashes the
6
+ ``logical-name -> value`` map in a :class:`contextvars.ContextVar` scoped to that
7
+ run's asyncio task. Consumers (e.g. judge construction) read the value they need
8
+ with no ``os.environ`` mutation and no shared state across concurrently running
9
+ evaluations.
10
+
11
+ This layer is deliberately consumer-agnostic: a resolver turns a reference into a
12
+ secret value and nothing more. How that value is used — which provider it
13
+ authenticates, what base URL it pairs with — is the consumer's concern, configured
14
+ where the consumer is built (for judges, on the evaluator definition).
15
+
16
+ **Plugins:** third-party packages declare setuptools entry points in group
17
+ ``agentevals.secret_resolvers`` (entry **name** = ``kind`` string; **value** =
18
+ ``module:factory`` callable ``factory(spec: dict) -> SecretResolver``). The
19
+ zero-dependency ``env`` resolver ships with agentevals through this same group so
20
+ the discovery path is exercised in OSS. Hosts may replace any kind via
21
+ :func:`register_resolver_factory` (highest precedence).
22
+
23
+ Tests may call :func:`clear_resolver_plugin_registry` to drop programmatic
24
+ registrations.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ import os
31
+ from collections.abc import Callable
32
+ from contextvars import ContextVar, Token
33
+ from importlib.metadata import entry_points
34
+ from typing import Any, Protocol, cast
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ SECRET_RESOLVER_ENTRY_POINT_GROUP = "agentevals.secret_resolvers"
39
+
40
+
41
class SecretResolver(Protocol):
    """Structural interface implemented by every secret resolver."""

    async def resolve(self, ref: dict[str, Any]) -> str:
        """Return the secret value that the reference dict *ref* points to."""
        ...
43
+
44
+
45
+ SecretResolverFactory = Callable[[dict[str, Any]], SecretResolver]
46
+
47
+ _PLUGIN_FACTORIES: dict[str, SecretResolverFactory] = {}
48
+
49
+
50
class EnvSecretResolver:
    """Secret resolver backed by the process environment.

    Handles references shaped like ``{"kind": "env", "name": "OPENAI_API_KEY"}``.
    """

    async def resolve(self, ref: dict[str, Any]) -> str:
        """Return the value of the environment variable named by ``ref["name"]``.

        Raises:
            ValueError: if ``name`` is missing/empty, or the variable is not set.
        """
        var_name = ref.get("name")
        if var_name:
            secret = os.environ.get(var_name)
            if secret is not None:
                return secret
            raise ValueError(f"environment variable {var_name!r} is not set")
        raise ValueError("env secret reference requires a 'name' field")
61
+
62
+
63
def create_env_resolver(spec: dict[str, Any]) -> EnvSecretResolver:
    """Factory for the ``env`` kind; *spec* is accepted for the factory signature but unused."""
    resolver = EnvSecretResolver()
    return resolver
65
+
66
+
67
def register_resolver_factory(kind: str, factory: SecretResolverFactory) -> None:
    """Install *factory* as the programmatic factory for ``kind``, replacing any earlier one.

    Programmatic registrations are merged last when factories are looked up, so they
    win over entry-point factories of the same ``kind``. Intended to be called during
    process startup, before run workers consume specs. The factory is later invoked
    with the full reference dict (including ``kind``) and must return a
    :class:`SecretResolver`.
    """
    _PLUGIN_FACTORIES.update({kind: factory})
74
+
75
+
76
def clear_resolver_plugin_registry() -> None:
    """Forget every factory added via :func:`register_resolver_factory` (test helper)."""
    # Snapshot the keys first so the dict is not mutated while being iterated.
    for registered_kind in tuple(_PLUGIN_FACTORIES):
        del _PLUGIN_FACTORIES[registered_kind]
79
+
80
+
81
+ def _builtin_factories() -> dict[str, SecretResolverFactory]:
82
+ """No hardcoded built-ins: ``env`` ships via the entry-point group."""
83
+ return {}
84
+
85
+
86
def _merge_resolver_factories() -> dict[str, SecretResolverFactory]:
    """Assemble the effective ``kind -> factory`` map.

    Layers, lowest to highest precedence: built-ins, entry points from
    :data:`SECRET_RESOLVER_ENTRY_POINT_GROUP` (never shadowing a built-in kind), then
    programmatic registrations. Entry points that fail to load or load to a
    non-callable are logged and left out.
    """
    factories: dict[str, SecretResolverFactory] = dict(_builtin_factories())

    for entry in entry_points(group=SECRET_RESOLVER_ENTRY_POINT_GROUP):
        kind = entry.name
        if kind in factories:
            logger.debug("skipping resolver entry point %r; built-in kind takes precedence", kind)
            continue

        try:
            candidate = entry.load()
        except Exception:
            # One broken plugin must not take down discovery of the others.
            logger.exception("failed to load resolver entry point %r", kind)
            continue

        if callable(candidate):
            factories[kind] = cast(SecretResolverFactory, candidate)
        else:
            logger.warning("resolver entry point %r is not callable; skipping", kind)

    # Programmatic registrations are applied last so they override everything above.
    factories.update(_PLUGIN_FACTORIES)
    return factories
104
+
105
+
106
def registered_resolver_kinds() -> tuple[str, ...]:
    """Return, in sorted order, every ``kind`` :func:`build_resolver` could resolve right now.

    Covers built-ins, entry points of group :data:`SECRET_RESOLVER_ENTRY_POINT_GROUP`
    that loaded successfully, and anything added through
    :func:`register_resolver_factory`.
    """
    kinds = list(_merge_resolver_factories())
    kinds.sort()
    return tuple(kinds)
114
+
115
+
116
def build_resolver(ref: dict[str, Any]) -> SecretResolver:
    """Build the :class:`SecretResolver` matching ``ref["kind"]``.

    The factory is looked up in the merged map: built-ins first, then entry points of
    group ``agentevals.secret_resolvers`` for kinds that are not built-in, then
    :func:`register_resolver_factory` registrations, which replace any earlier factory
    of the same ``kind``. The chosen factory is called with the whole reference dict.

    Raises:
        ValueError: if ``kind`` is missing/empty or no factory is known for it.
    """
    kind = ref.get("kind")
    if not kind:
        raise ValueError("secret reference is missing a 'kind' field")

    available = _merge_resolver_factories()
    make_resolver = available.get(kind)
    if make_resolver is not None:
        return make_resolver(ref)

    known_kinds = ", ".join(sorted(available)) or "(none)"
    raise ValueError(f"unknown secret resolver kind '{kind}'. Available: {known_kinds}")
134
+
135
+
136
async def resolve_credential_refs(refs: dict[str, dict[str, Any]]) -> dict[str, str]:
    """Resolve every ``logical-name -> reference`` entry to its secret value.

    Each resolver reads only its own kind-specific locator fields. Any non-locator
    fields a host puts on a reference are ignored here; consumer-specific config
    belongs with the consumer (for judges, on the evaluator definition).

    Args:
        refs: Map of logical credential name to its secret reference dict.

    Returns:
        Map of logical credential name to the resolved secret value.

    Raises:
        ValueError: if a reference is malformed, names an unknown ``kind``, or its
            resolver rejects it. The message is prefixed with the logical name so
            the failing entry can be identified from the error text alone.
    """
    resolved: dict[str, str] = {}
    for logical_name, ref in refs.items():
        try:
            resolver = build_resolver(ref)
            resolved[logical_name] = await resolver.resolve(ref)
        except ValueError as exc:
            # With several refs on one run, the bare resolver message does not say
            # which entry failed; add the logical name while keeping the ValueError type.
            raise ValueError(f"credential ref '{logical_name}': {exc}") from exc
    return resolved
148
+
149
+
150
+ _RESOLVED: ContextVar[dict[str, str] | None] = ContextVar("agentevals_resolved_credentials", default=None)
151
+
152
+
153
def set_resolved_credentials(mapping: dict[str, str]) -> Token:
    """Publish *mapping* (``logical-name -> secret value``) in the current context.

    Returns the token that :func:`reset_resolved_credentials` needs to undo the change.
    """
    reset_token = _RESOLVED.set(mapping)
    return reset_token
156
+
157
+
158
def reset_resolved_credentials(token: Token) -> None:
    """Restore the resolved-credentials variable to its value before the matching set call."""
    ContextVar.reset(_RESOLVED, token)
160
+
161
+
162
def get_resolved_credential(logical_name: str) -> str | None:
    """Return the secret resolved under *logical_name* for the current run.

    Yields ``None`` when no (or an empty) map has been published in this context, or
    when the map has no entry for *logical_name*.
    """
    resolved = _RESOLVED.get()
    return resolved.get(logical_name) if resolved else None
@@ -0,0 +1,62 @@
1
+ """Kubernetes Secret resolver — an optional :class:`SecretResolver` plugin.
2
+
3
+ Resolves a reference of the form ``{"kind": "kubernetes", "namespace": ..., "name": ...,
4
+ "key": ...}`` by reading the named Secret and base64-decoding the requested key. Ships
5
+ behind the ``kubernetes`` extra and is wired through the ``agentevals.secret_resolvers``
6
+ entry-point group; the ``kubernetes`` package is imported lazily inside the factory so
7
+ installing agentevals without the extra never breaks import or plugin discovery.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import base64
14
+ import logging
15
+ from typing import Any
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class KubernetesSecretResolver:
    """Reads a key out of a Kubernetes Secret via a ``CoreV1Api`` client."""

    def __init__(self, core_v1_api: Any) -> None:
        # Any object exposing ``read_namespaced_secret(name, namespace)`` works here.
        self._core_v1 = core_v1_api

    async def resolve(self, ref: dict[str, Any]) -> str:
        """Return the decoded value stored under ``ref["key"]`` in the referenced Secret.

        Args:
            ref: Reference dict carrying ``namespace``, ``name`` and ``key``.

        Returns:
            The Secret entry, base64-decoded and interpreted as UTF-8 text.

        Raises:
            ValueError: if a required field is missing, the key is not present in the
                Secret, or the stored value is not valid base64-encoded UTF-8 text.
        """
        namespace = ref.get("namespace")
        name = ref.get("name")
        key = ref.get("key")
        missing = [field for field, value in (("namespace", namespace), ("name", name), ("key", key)) if not value]
        if missing:
            raise ValueError(f"kubernetes secret reference is missing required field(s): {', '.join(missing)}")

        # The client call is made in a worker thread so the event loop is not blocked.
        secret = await asyncio.to_thread(self._core_v1.read_namespaced_secret, name, namespace)
        data = secret.data or {}
        if key not in data:
            # Only key names are listed; values are never echoed into the message.
            available = ", ".join(sorted(data)) or "(none)"
            raise ValueError(f"key '{key}' not found in Secret {namespace}/{name}; available keys: {available}")

        try:
            return base64.b64decode(data[key]).decode("utf-8")
        except ValueError:
            # binascii.Error and UnicodeDecodeError both subclass ValueError. Their text
            # says nothing about which Secret failed and may echo a raw byte of the
            # value, so raise a locator-only message and drop the chained cause.
            raise ValueError(
                f"value of key '{key}' in Secret {namespace}/{name} is not valid base64-encoded UTF-8 text"
            ) from None
40
+
41
+
42
def create_kubernetes_resolver(spec: dict[str, Any]) -> KubernetesSecretResolver:
    """Factory for the ``kubernetes`` kind; cluster config is loaded at call time.

    In-cluster config is attempted first; if the client reports a ``ConfigException``
    the local kubeconfig is loaded instead. The ``kubernetes`` package is imported
    inside this function rather than at module import, so this module stays importable
    when the optional extra is absent. *spec* is not inspected.

    Raises:
        RuntimeError: if the ``kubernetes`` package cannot be imported.
    """
    try:
        from kubernetes import client as k8s_client
        from kubernetes import config as k8s_config
    except ImportError as exc:
        raise RuntimeError(
            "the kubernetes secret resolver requires the 'kubernetes' extra; install agentevals-cli[kubernetes]"
        ) from exc

    try:
        k8s_config.load_incluster_config()
    except k8s_config.ConfigException:
        k8s_config.load_kube_config()

    core_v1 = k8s_client.CoreV1Api()
    return KubernetesSecretResolver(core_v1)
@@ -21,6 +21,11 @@ from uuid import UUID
21
21
  from google.adk.evaluation.eval_set import EvalSet
22
22
 
23
23
  from ..config import EvalParams
24
+ from ..resolvers import (
25
+ reset_resolved_credentials,
26
+ resolve_credential_refs,
27
+ set_resolved_credentials,
28
+ )
24
29
  from ..runner import RunResult, TraceResult, run_evaluation_from_traces
25
30
  from ..storage.config import StorageSettings
26
31
  from ..storage.models import Run, RunStatus
@@ -107,7 +112,10 @@ class AsyncRunWorker:
107
112
  cancel_event = asyncio.Event()
108
113
  hb_task = asyncio.create_task(self._heartbeat(run.run_id, worker_id, cancel_event))
109
114
  sinks = build_sinks(run.spec.sinks or [])
115
+ cred_token = None
110
116
  try:
117
+ if run.spec.credential_refs:
118
+ cred_token = set_resolved_credentials(await resolve_credential_refs(run.spec.credential_refs))
111
119
  await self._run_evaluation(run, sinks, cancel_event)
112
120
  except asyncio.CancelledError:
113
121
  await self._runs.update_status(run.run_id, RunStatus.CANCELLED, error="worker cancelled")
@@ -126,6 +134,8 @@ class AsyncRunWorker:
126
134
  await self._runs.update_status(run.run_id, RunStatus.FAILED, error=str(exc))
127
135
  await sinks.emit_error(run.run_id, str(exc), run.attempt)
128
136
  finally:
137
+ if cred_token is not None:
138
+ reset_resolved_credentials(cred_token)
129
139
  hb_task.cancel()
130
140
  try:
131
141
  await hb_task
@@ -84,6 +84,15 @@ class RunSpec(BaseModel):
84
84
  eval_config: dict[str, Any] = Field(default_factory=dict)
85
85
  sinks: list[dict[str, Any]] = Field(default_factory=list)
86
86
  context: dict[str, Any] = Field(default_factory=dict)
87
+ credential_refs: dict[str, dict[str, Any]] | None = Field(
88
+ default=None,
89
+ description=(
90
+ "Map of logical credential name to a secret reference dict. Each reference has a "
91
+ "'kind' (the resolver to use) plus that kind's locator fields. Resolved per run to its "
92
+ "secret value; never written to the process environment. How a value is used (e.g. which "
93
+ "judge provider it authenticates) is configured on the consumer, not the reference."
94
+ ),
95
+ )
87
96
 
88
97
 
89
98
  class Run(BaseModel):
@@ -15,6 +15,7 @@ Tests are synchronous because:
15
15
 
16
16
  from __future__ import annotations
17
17
 
18
+ import importlib.util
18
19
  import os
19
20
  import subprocess
20
21
  import sys
@@ -38,6 +39,11 @@ _skip_no_google = pytest.mark.skipif(
38
39
  reason="GOOGLE_API_KEY not set",
39
40
  )
40
41
 
42
+ _skip_no_pydantic_ai = pytest.mark.skipif(
43
+ importlib.util.find_spec("pydantic_ai") is None,
44
+ reason="pydantic_ai SDK not installed",
45
+ )
46
+
41
47
 
42
48
  def _run_agent(
43
49
  script: str,
@@ -305,6 +311,7 @@ class TestOpenAIAgentsZeroCode:
305
311
  assert session_name in session_ids
306
312
 
307
313
 
314
+ @_skip_no_pydantic_ai
308
315
  @_skip_no_openai
309
316
  class TestPydanticAIZeroCode:
310
317
  """Run the Pydantic AI zero-code OTLP example and verify session grouping."""
@@ -0,0 +1,63 @@
1
+ """Kubernetes secret resolver tests.
2
+
3
+ The kubernetes client is mocked, so these run whether or not the optional
4
+ ``kubernetes`` extra is installed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import base64
10
+ import sys
11
+ from unittest.mock import MagicMock
12
+
13
+ import pytest
14
+
15
+ from agentevals.resolvers.kubernetes import KubernetesSecretResolver, create_kubernetes_resolver
16
+
17
+
18
+ def _client_returning(data: dict[str, str]) -> MagicMock:
19
+ client = MagicMock()
20
+ client.read_namespaced_secret.return_value = MagicMock(data=data)
21
+ return client
22
+
23
+
24
+ def _b64(value: str) -> str:
25
+ return base64.b64encode(value.encode()).decode()
26
+
27
+
28
class TestResolve:
    """``KubernetesSecretResolver.resolve`` exercised against a mocked client."""

    async def test_reads_and_base64_decodes_value(self):
        api = _client_returning({"api-key": _b64("sk-secret-value")})
        ref = {"namespace": "ns", "name": "creds", "key": "api-key"}

        resolved = await KubernetesSecretResolver(api).resolve(ref)

        assert resolved == "sk-secret-value"
        api.read_namespaced_secret.assert_called_once_with("creds", "ns")

    async def test_missing_fields_raise(self):
        ref_without_locator = {"kind": "kubernetes"}
        resolver = KubernetesSecretResolver(MagicMock())

        with pytest.raises(ValueError, match="namespace, name, key"):
            await resolver.resolve(ref_without_locator)

    async def test_key_not_found_lists_names_not_values(self):
        encoded_secret = _b64("sk-do-not-leak")
        api = _client_returning({"api-key": encoded_secret, "tls.crt": _b64("cert")})
        resolver = KubernetesSecretResolver(api)

        with pytest.raises(ValueError) as caught:
            await resolver.resolve({"namespace": "ns", "name": "creds", "key": "wrong"})

        text = str(caught.value)
        # Key names are expected in the error so the caller can spot a typo ...
        assert "api-key" in text and "tls.crt" in text
        # ... but the base64-encoded secret values must never be echoed.
        assert encoded_secret not in text
55
+
56
+
57
class TestFactory:
    """``create_kubernetes_resolver`` behaviour when the optional extra is unavailable."""

    def test_requires_kubernetes_extra(self, monkeypatch):
        # A ``None`` entry in sys.modules makes the lazy import fail whether or not
        # the kubernetes extra is installed in the test venv.
        monkeypatch.setitem(sys.modules, "kubernetes", None)
        ref = {"kind": "kubernetes"}

        with pytest.raises(RuntimeError, match="kubernetes"):
            create_kubernetes_resolver(ref)