agentevals-cli 0.9.3__tar.gz → 0.9.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/Dockerfile +1 -1
  2. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/PKG-INFO +4 -2
  3. agentevals_cli-0.9.5/charts/agentevals/templates/rbac.yaml +33 -0
  4. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/values.yaml +14 -0
  5. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/run.py +1 -1
  6. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/pyproject.toml +8 -1
  7. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/models.py +9 -0
  8. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/routes.py +178 -84
  9. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/builtin_metrics.py +77 -0
  10. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/config.py +8 -0
  11. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/custom_evaluators.py +2 -0
  12. agentevals_cli-0.9.5/src/agentevals/resolvers/__init__.py +167 -0
  13. agentevals_cli-0.9.5/src/agentevals/resolvers/kubernetes.py +62 -0
  14. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/worker.py +10 -0
  15. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/models.py +9 -0
  16. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_live_agents.py +7 -0
  17. agentevals_cli-0.9.5/tests/resolvers/test_kubernetes.py +63 -0
  18. agentevals_cli-0.9.5/tests/resolvers/test_registry.py +145 -0
  19. agentevals_cli-0.9.5/tests/storage/__init__.py +0 -0
  20. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_api.py +210 -0
  21. agentevals_cli-0.9.5/tests/test_credential_injection.py +122 -0
  22. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_mcp_server.py +2 -0
  23. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/uv.lock +176 -802
  24. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/eval/SKILL.md +0 -0
  25. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/eval/evals/evals.json +0 -0
  26. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/inspect/SKILL.md +0 -0
  27. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/inspect/evals/evals.json +0 -0
  28. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.dockerignore +0 -0
  29. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  30. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  31. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  32. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/workflows/ci.yml +0 -0
  33. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  34. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/workflows/release.yml +0 -0
  35. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.gitignore +0 -0
  36. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.mcp.json +0 -0
  37. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/CONTRIBUTING.md +0 -0
  38. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/DEVELOPMENT.md +0 -0
  39. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/LICENSE +0 -0
  40. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/Makefile +0 -0
  41. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/README.md +0 -0
  42. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/Chart.yaml +0 -0
  43. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/NOTES.txt +0 -0
  44. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/_helpers.tpl +0 -0
  45. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/deployment.yaml +0 -0
  46. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  47. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql.yaml +0 -0
  48. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/service.yaml +0 -0
  49. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  50. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/assets/logo-color-on-transparent.svg +0 -0
  51. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/assets/logo-color.png +0 -0
  52. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/assets/logo-dark-on-transparent.svg +0 -0
  53. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/custom-evaluators.md +0 -0
  54. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/eval-set-format.md +0 -0
  55. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/otel-compatibility.md +0 -0
  56. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/streaming.md +0 -0
  57. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/README.md +0 -0
  58. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config.yaml +0 -0
  59. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config_openai_eval.yaml +0 -0
  60. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/response_quality.py +0 -0
  61. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/tool_call_checker.py +0 -0
  62. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/README.md +0 -0
  63. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  64. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  65. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/pyproject.toml +0 -0
  66. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/README.md +0 -0
  67. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/agent.py +0 -0
  68. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/eval_set.json +0 -0
  69. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/main.py +0 -0
  70. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/test_streaming.py +0 -0
  71. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/kubernetes/README.md +0 -0
  72. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/README.md +0 -0
  73. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/agent.py +0 -0
  74. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/eval_set.json +0 -0
  75. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/main.py +0 -0
  76. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/requirements.txt +0 -0
  77. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/test_streaming.py +0 -0
  78. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/async_example.py +0 -0
  79. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/context_manager_example.py +0 -0
  80. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/decorator_example.py +0 -0
  81. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/requirements.txt +0 -0
  82. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/agent.py +0 -0
  83. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/eval_set.json +0 -0
  84. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/main.py +0 -0
  85. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/requirements.txt +0 -0
  86. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/requirements.txt +0 -0
  87. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  88. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/run.py +0 -0
  89. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  90. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/run.py +0 -0
  91. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  92. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/run.py +0 -0
  93. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  94. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  95. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/requirements.txt +0 -0
  96. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/run.py +0 -0
  97. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/flake.lock +0 -0
  98. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/flake.nix +0 -0
  99. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/README.md +0 -0
  100. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  101. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  102. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  103. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  104. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/eval_set_helm.json +0 -0
  105. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/evalset_helm_3_2026-02-23.json +0 -0
  106. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/evalset_k8s_2026-02-20.json +0 -0
  107. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/helm.json +0 -0
  108. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/helm_2.json +0 -0
  109. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/helm_3.json +0 -0
  110. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/k8s.json +0 -0
  111. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/tempo_export_with_batches.json +0 -0
  112. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/__init__.py +0 -0
  113. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_protocol.py +0 -0
  114. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  115. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-RIquRPno.js +0 -0
  116. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/index.html +0 -0
  117. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/logo.svg +0 -0
  118. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/vite.svg +0 -0
  119. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/__init__.py +0 -0
  120. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/app.py +0 -0
  121. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/debug_routes.py +0 -0
  122. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/dependencies.py +0 -0
  123. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_app.py +0 -0
  124. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_grpc.py +0 -0
  125. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_processing.py +0 -0
  126. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_routes.py +0 -0
  127. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/runs_routes.py +0 -0
  128. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/streaming_routes.py +0 -0
  129. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/cli.py +0 -0
  130. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/converter.py +0 -0
  131. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/eval_config_loader.py +0 -0
  132. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/__init__.py +0 -0
  133. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/resolver.py +0 -0
  134. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/sources.py +0 -0
  135. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/templates.py +0 -0
  136. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/venv.py +0 -0
  137. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/extraction.py +0 -0
  138. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/genai_converter.py +0 -0
  139. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/__init__.py +0 -0
  140. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/auto.py +0 -0
  141. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/base.py +0 -0
  142. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/jaeger.py +0 -0
  143. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/otlp.py +0 -0
  144. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/mcp_server.py +0 -0
  145. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/openai_eval_backend.py +0 -0
  146. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/output.py +0 -0
  147. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/__init__.py +0 -0
  148. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/fetcher.py +0 -0
  149. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/result_builder.py +0 -0
  150. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/service.py +0 -0
  151. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/sinks.py +0 -0
  152. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/runner.py +0 -0
  153. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/sdk.py +0 -0
  154. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/__init__.py +0 -0
  155. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/config.py +0 -0
  156. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/__init__.py +0 -0
  157. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  158. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  159. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrator.py +0 -0
  160. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/pool.py +0 -0
  161. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/__init__.py +0 -0
  162. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/memory.py +0 -0
  163. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/postgres.py +0 -0
  164. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/__init__.py +0 -0
  165. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/incremental_processor.py +0 -0
  166. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/processor.py +0 -0
  167. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/session.py +0 -0
  168. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/ws_server.py +0 -0
  169. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/trace_attrs.py +0 -0
  170. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/trace_metrics.py +0 -0
  171. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/__init__.py +0 -0
  172. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/genai_messages.py +0 -0
  173. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/log_buffer.py +0 -0
  174. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/log_enrichment.py +0 -0
  175. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/api/__init__.py +0 -0
  176. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/api/test_evaluate_persistence.py +0 -0
  177. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/api/test_runs_routes.py +0 -0
  178. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/__init__.py +0 -0
  179. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/conftest.py +0 -0
  180. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_evaluation_pipeline.py +0 -0
  181. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  182. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_session_grouping.py +0 -0
  183. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_timing_stress.py +0 -0
  184. {agentevals_cli-0.9.3/tests/run → agentevals_cli-0.9.5/tests/resolvers}/__init__.py +0 -0
  185. {agentevals_cli-0.9.3/tests/storage → agentevals_cli-0.9.5/tests/run}/__init__.py +0 -0
  186. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_fetcher.py +0 -0
  187. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_result_builder.py +0 -0
  188. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_service.py +0 -0
  189. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_sinks.py +0 -0
  190. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_config.py +0 -0
  191. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_memory_repos.py +0 -0
  192. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_migrator.py +0 -0
  193. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_models.py +0 -0
  194. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_cli.py +0 -0
  195. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_converter.py +0 -0
  196. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_eval_config_loader.py +0 -0
  197. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_extraction.py +0 -0
  198. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_genai_converter.py +0 -0
  199. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_jaeger_loader.py +0 -0
  200. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_loader_auto.py +0 -0
  201. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_log_enrichment.py +0 -0
  202. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_openai_eval_backend.py +0 -0
  203. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_otlp_loader.py +0 -0
  204. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_otlp_receiver.py +0 -0
  205. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_output.py +0 -0
  206. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_protocol.py +0 -0
  207. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_runner.py +0 -0
  208. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_sdk.py +0 -0
  209. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_trace_metrics.py +0 -0
  210. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/.gitignore +0 -0
  211. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/README.md +0 -0
  212. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/eslint.config.js +0 -0
  213. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/index.html +0 -0
  214. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/package-lock.json +0 -0
  215. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/package.json +0 -0
  216. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/public/logo.svg +0 -0
  217. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/public/vite.svg +0 -0
  218. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/App.css +0 -0
  219. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/App.tsx +0 -0
  220. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/api/client.ts +0 -0
  221. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/assets/react.svg +0 -0
  222. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  223. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  224. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  225. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  226. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  227. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderView.tsx +0 -0
  228. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  229. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  230. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  231. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/JsonPreview.tsx +0 -0
  232. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  233. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  234. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/index.ts +0 -0
  235. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  236. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  237. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  238. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  239. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  240. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  241. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  242. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  243. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/DataSection.tsx +0 -0
  244. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  245. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  246. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorView.tsx +0 -0
  247. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  248. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  249. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  250. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  251. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  252. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  253. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  254. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  255. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  256. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  257. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  258. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionCard.tsx +0 -0
  259. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  260. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  261. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/FileDropZone.tsx +0 -0
  262. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/MetricSelector.tsx +0 -0
  263. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  264. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  265. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/UploadView.tsx +0 -0
  266. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  267. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/config.ts +0 -0
  268. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/context/TraceContext.tsx +0 -0
  269. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/context/TraceProvider.tsx +0 -0
  270. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/index.css +0 -0
  271. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/console-capture.ts +0 -0
  272. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/eval-config.ts +0 -0
  273. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/evalset-builder.ts +0 -0
  274. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/network-capture.ts +0 -0
  275. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-helpers.ts +0 -0
  276. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-loader.ts +0 -0
  277. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-metadata.ts +0 -0
  278. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-patcher.ts +0 -0
  279. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/types.ts +0 -0
  280. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/utils.ts +0 -0
  281. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/main.tsx +0 -0
  282. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/tsconfig.app.json +0 -0
  283. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/tsconfig.json +0 -0
  284. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/tsconfig.node.json +0 -0
  285. {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/vite.config.ts +0 -0
@@ -31,7 +31,7 @@ COPY --from=ui /build/ui/dist ./src/agentevals/_static
31
31
  ARG VERSION
32
32
  ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}
33
33
 
34
- RUN uv sync --frozen --no-dev --extra live --extra postgres \
34
+ RUN uv sync --frozen --no-dev --extra live --extra postgres --extra kubernetes \
35
35
  && groupadd --gid 1000 app \
36
36
  && useradd --uid 1000 --gid app --home-dir /app --no-log-init app \
37
37
  && chown -R app:app /app
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.9.3
3
+ Version: 0.9.5
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
7
7
  Requires-Dist: click>=8.0
8
8
  Requires-Dist: fastapi>=0.115.0
9
- Requires-Dist: google-adk[eval]>=1.30.0
9
+ Requires-Dist: google-adk[eval]<2.2,>=2.1.0
10
10
  Requires-Dist: httpx>=0.27.0
11
11
  Requires-Dist: opentelemetry-proto>=1.36.0
12
12
  Requires-Dist: python-dotenv>=1.0.0
@@ -14,6 +14,8 @@ Requires-Dist: python-multipart>=0.0.12
14
14
  Requires-Dist: pyyaml>=6.0
15
15
  Requires-Dist: tabulate>=0.9.0
16
16
  Requires-Dist: uvicorn[standard]>=0.32.0
17
+ Provides-Extra: kubernetes
18
+ Requires-Dist: kubernetes>=36.0.0; extra == 'kubernetes'
17
19
  Provides-Extra: live
18
20
  Requires-Dist: httpx>=0.27.0; extra == 'live'
19
21
  Requires-Dist: mcp>=1.26.0; extra == 'live'
@@ -0,0 +1,33 @@
1
+ {{- if .Values.rbac.create -}}
2
+ apiVersion: rbac.authorization.k8s.io/v1
3
+ kind: Role
4
+ metadata:
5
+ name: {{ include "agentevals.fullname" . }}
6
+ namespace: {{ include "agentevals.namespace" . }}
7
+ labels:
8
+ {{- include "agentevals.labels" . | nindent 4 }}
9
+ rules:
10
+ - apiGroups: [""]
11
+ resources: ["secrets"]
12
+ verbs: ["get"]
13
+ {{- with .Values.rbac.secretNames }}
14
+ resourceNames:
15
+ {{- toYaml . | nindent 6 }}
16
+ {{- end }}
17
+ ---
18
+ apiVersion: rbac.authorization.k8s.io/v1
19
+ kind: RoleBinding
20
+ metadata:
21
+ name: {{ include "agentevals.fullname" . }}
22
+ namespace: {{ include "agentevals.namespace" . }}
23
+ labels:
24
+ {{- include "agentevals.labels" . | nindent 4 }}
25
+ roleRef:
26
+ apiGroup: rbac.authorization.k8s.io
27
+ kind: Role
28
+ name: {{ include "agentevals.fullname" . }}
29
+ subjects:
30
+ - kind: ServiceAccount
31
+ name: {{ include "agentevals.serviceAccountName" . }}
32
+ namespace: {{ include "agentevals.namespace" . }}
33
+ {{- end }}
@@ -57,6 +57,20 @@ serviceAccount:
57
57
  # -- ServiceAccount name override
58
58
  name: ""
59
59
 
60
+ # ==============================================================================
61
+ # RBAC
62
+ # ==============================================================================
63
+
64
+ # -- Namespaced Role + RoleBinding granting the pod's ServiceAccount read
65
+ # access to Secrets. Enable this when the kubernetes secret resolver reads
66
+ # provider credentials from Secrets via in-cluster config.
67
+ rbac:
68
+ # -- Create the Role and RoleBinding
69
+ create: false
70
+ # -- Restrict the Role to these Secret names. Empty grants get on all
71
+ # Secrets in the release namespace.
72
+ secretNames: []
73
+
60
74
  # ==============================================================================
61
75
  # Pod
62
76
  # ==============================================================================
@@ -74,7 +74,7 @@ async def main():
74
74
 
75
75
  agent_response = ""
76
76
  async for event in runner.run_async(user_id=user_id, session_id=session.id, new_message=content):
77
- if event.content.parts and event.content.parts[0].text:
77
+ if event.content and event.content.parts and event.content.parts[0].text:
78
78
  agent_response = event.content.parts[0].text
79
79
 
80
80
  print(f" Agent: {agent_response}")
@@ -9,7 +9,7 @@ description = "Standalone framework to evaluate agent correctness based on porta
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
11
11
  dependencies = [
12
- "google-adk[eval]>=1.30.0",
12
+ "google-adk[eval]>=2.1.0,<2.2",
13
13
  "click>=8.0",
14
14
  "tabulate>=0.9.0",
15
15
  "fastapi>=0.115.0",
@@ -36,10 +36,17 @@ openai = [
36
36
  postgres = [
37
37
  "asyncpg>=0.30.0",
38
38
  ]
39
+ kubernetes = [
40
+ "kubernetes>=36.0.0",
41
+ ]
39
42
 
40
43
  [project.scripts]
41
44
  agentevals = "agentevals.cli:main"
42
45
 
46
+ [project.entry-points."agentevals.secret_resolvers"]
47
+ env = "agentevals.resolvers:create_env_resolver"
48
+ kubernetes = "agentevals.resolvers.kubernetes:create_kubernetes_resolver"
49
+
43
50
  [tool.hatch.version]
44
51
  source = "vcs"
45
52
 
@@ -142,6 +142,15 @@ class EvaluateJsonRequest(CamelModel):
142
142
  traces: dict = Field(description="OTLP JSON export with resourceSpans structure.")
143
143
  config: EvalParams = Field(default_factory=EvalParams, description="Evaluation parameters.")
144
144
  eval_set: dict | None = Field(default=None, description="Optional ADK EvalSet JSON.")
145
+ credential_refs: dict[str, dict[str, Any]] | None = Field(
146
+ default=None,
147
+ description=(
148
+ "Map of logical credential name to a secret reference dict. Each reference has a "
149
+ "'kind' (the resolver to use) plus that kind's locator fields. Resolved per call to its "
150
+ "secret value; never written to the process environment. How a value is used (e.g. which "
151
+ "judge provider it authenticates) is configured on the consumer, not the reference."
152
+ ),
153
+ )
145
154
 
146
155
 
147
156
  # ---------------------------------------------------------------------------
@@ -9,6 +9,7 @@ import os
9
9
  import re
10
10
  import shutil
11
11
  import tempfile
12
+ from contextlib import contextmanager
12
13
  from typing import Any
13
14
 
14
15
  from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
@@ -23,6 +24,11 @@ from ..converter import convert_traces
23
24
  from ..extraction import get_extractor
24
25
  from ..loader import load_traces
25
26
  from ..loader.otlp import OtlpJsonLoader
27
+ from ..resolvers import (
28
+ reset_resolved_credentials,
29
+ resolve_credential_refs,
30
+ set_resolved_credentials,
31
+ )
26
32
  from ..runner import (
27
33
  RunResult,
28
34
  load_eval_set,
@@ -53,6 +59,57 @@ from .models import (
53
59
  logger = logging.getLogger(__name__)
54
60
 
55
61
 
62
+ @contextmanager
63
+ def _scoped_credentials(resolved: dict[str, str] | None):
64
+ """Scope an already-resolved ``logical-name -> secret value`` map to the current task.
65
+
66
+ Mirrors the async worker's set/reset (``run/worker.py``) so the synchronous evaluate
67
+ paths populate the same credential ContextVar that judge graders read. A falsy map is a
68
+ no-op, keeping callers byte-for-byte backward compatible. For streaming endpoints, enter
69
+ this BEFORE ``asyncio.create_task`` so the eval task inherits the populated context (a
70
+ child task snapshots its parent's context at creation time). Resolution is done by the
71
+ caller so its failures surface as request errors rather than scoping concerns.
72
+ """
73
+ token = set_resolved_credentials(resolved) if resolved else None
74
+ try:
75
+ yield
76
+ finally:
77
+ if token is not None:
78
+ reset_resolved_credentials(token)
79
+
80
+
81
+ async def _resolve_credentials(refs: dict[str, dict[str, Any]] | None) -> dict[str, str] | None:
82
+ """Resolve credentialRefs to secret values, mapping bad references to a 400.
83
+
84
+ Resolver ``ValueError``s (missing/unknown ``kind``, missing locator fields, an unset
85
+ env var) are request/input errors, so surface them as 400s instead of letting them
86
+ bubble up as 500s. Infrastructure failures from custom resolvers raise other exception
87
+ types and are left to propagate as 5xx.
88
+ """
89
+ if not refs:
90
+ return None
91
+ try:
92
+ return await resolve_credential_refs(refs)
93
+ except ValueError as exc:
94
+ raise HTTPException(status_code=400, detail=f"Could not resolve credentialRefs: {exc}") from exc
95
+
96
+
97
+ def _parse_credential_refs_form(raw: str | None) -> dict[str, dict[str, Any]] | None:
98
+ """Parse and validate the multipart ``credential_refs`` form field (a JSON object string).
99
+
100
+ Empty/absent is treated as no credentials. Raises ``ValueError`` (which
101
+ ``json.JSONDecodeError`` subclasses) on malformed JSON or a non-object shape, so callers
102
+ map both to the same error they use for a bad ``config``. The JSON request endpoints get
103
+ this shape check for free from the ``EvaluateJsonRequest`` model.
104
+ """
105
+ if not raw:
106
+ return None
107
+ refs = json.loads(raw)
108
+ if not isinstance(refs, dict) or not all(isinstance(ref, dict) for ref in refs.values()):
109
+ raise ValueError("credentialRefs must be a JSON object mapping each logical name to a reference object")
110
+ return refs
111
+
112
+
56
113
  def _camel_keys(obj: Any) -> Any:
57
114
  """Recursively convert dict keys from snake_case to camelCase."""
58
115
  if isinstance(obj, dict):
@@ -462,6 +519,7 @@ async def evaluate_traces(
462
519
  trace_files: list[UploadFile] = File(...),
463
520
  config: str = Form(...),
464
521
  eval_set_file: UploadFile | None = File(None),
522
+ credential_refs: str | None = Form(None),
465
523
  ):
466
524
  """
467
525
  Evaluate agent traces using the provided evaluator configuration.
@@ -470,6 +528,8 @@ async def evaluate_traces(
470
528
  trace_files: List of Jaeger or OTLP JSON trace files
471
529
  config: JSON string with evaluation configuration
472
530
  eval_set_file: Optional golden eval set file
531
+ credential_refs: Optional JSON string mapping logical credential names to
532
+ secret references, resolved so LLM-as-Judge graders can authenticate
473
533
 
474
534
  Returns:
475
535
  RunResult with trace results and any errors
@@ -481,6 +541,11 @@ async def evaluate_traces(
481
541
  except json.JSONDecodeError as exc:
482
542
  raise HTTPException(status_code=400, detail=f"Invalid config JSON: {exc}") from exc
483
543
 
544
+ try:
545
+ cred_refs = _parse_credential_refs_form(credential_refs)
546
+ except ValueError as exc:
547
+ raise HTTPException(status_code=400, detail=f"Invalid credentialRefs: {exc}") from exc
548
+
484
549
  trace_paths = []
485
550
  for trace_file in trace_files:
486
551
  if not trace_file.filename:
@@ -548,7 +613,9 @@ async def evaluate_traces(
548
613
  len(trace_paths),
549
614
  [e.name for e in eval_config.evaluators],
550
615
  )
551
- result = await run_evaluation(eval_config)
616
+ resolved_creds = await _resolve_credentials(cred_refs)
617
+ with _scoped_credentials(resolved_creds):
618
+ result = await run_evaluation(eval_config)
552
619
 
553
620
  run_id = await _maybe_persist_evaluate_run(
554
621
  request,
@@ -580,6 +647,7 @@ async def evaluate_traces_stream(
580
647
  trace_files: list[UploadFile] = File(...),
581
648
  config: str = Form(...),
582
649
  eval_set_file: UploadFile | None = File(None),
650
+ credential_refs: str | None = Form(None),
583
651
  ):
584
652
  """Evaluate traces with real-time progress via SSE."""
585
653
  temp_dir = tempfile.mkdtemp()
@@ -593,6 +661,12 @@ async def evaluate_traces_stream(
593
661
  yield f"data: {SSEErrorEvent(error=f'Invalid config JSON: {exc}').model_dump_json(by_alias=True)}\n\n"
594
662
  return
595
663
 
664
+ try:
665
+ cred_refs = _parse_credential_refs_form(credential_refs)
666
+ except ValueError as exc:
667
+ yield f"data: {SSEErrorEvent(error=f'Invalid credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
668
+ return
669
+
596
670
  trace_paths = []
597
671
  for trace_file in trace_files:
598
672
  if not trace_file.filename:
@@ -674,47 +748,54 @@ async def evaluate_traces_stream(
674
748
  result = await run_evaluation(eval_config, progress_callback, trace_progress_callback)
675
749
  await queue.put(("done", result))
676
750
 
677
- eval_task = asyncio.create_task(run_with_progress())
678
-
679
751
  try:
680
- while True:
681
- msg = await queue.get()
682
- tag, payload = msg
683
-
684
- if tag == "done":
685
- run_id = await _maybe_persist_evaluate_run(
686
- request,
687
- params=eval_config,
688
- eval_set_dict=_load_eval_set_dict(eval_set_path),
689
- trace_format=eval_config.trace_format,
690
- upload_filenames=upload_filenames,
691
- run_result=payload,
692
- )
693
- if run_id:
694
- payload.run_id = run_id
695
- evt = SSEDoneEvent(
696
- result=_camel_keys(payload.model_dump(by_alias=True)),
697
- )
698
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
699
- break
700
- elif tag == "trace_progress":
701
- evt = SSETraceProgressEvent(
702
- trace_progress=SSETraceProgress(
703
- trace_id=payload.trace_id,
704
- partial_result=_camel_keys(payload.model_dump(by_alias=True)),
752
+ resolved_creds = await resolve_credential_refs(cred_refs) if cred_refs else None
753
+ except ValueError as exc:
754
+ yield f"data: {SSEErrorEvent(error=f'Could not resolve credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
755
+ return
756
+
757
+ with _scoped_credentials(resolved_creds):
758
+ eval_task = asyncio.create_task(run_with_progress())
759
+
760
+ try:
761
+ while True:
762
+ msg = await queue.get()
763
+ tag, payload = msg
764
+
765
+ if tag == "done":
766
+ run_id = await _maybe_persist_evaluate_run(
767
+ request,
768
+ params=eval_config,
769
+ eval_set_dict=_load_eval_set_dict(eval_set_path),
770
+ trace_format=eval_config.trace_format,
771
+ upload_filenames=upload_filenames,
772
+ run_result=payload,
705
773
  )
706
- )
707
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
708
- elif tag == "progress":
709
- evt = SSEProgressEvent(message=payload)
710
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
711
- finally:
712
- if not eval_task.done():
713
- eval_task.cancel()
714
- try:
715
- await eval_task
716
- except asyncio.CancelledError:
717
- pass
774
+ if run_id:
775
+ payload.run_id = run_id
776
+ evt = SSEDoneEvent(
777
+ result=_camel_keys(payload.model_dump(by_alias=True)),
778
+ )
779
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
780
+ break
781
+ elif tag == "trace_progress":
782
+ evt = SSETraceProgressEvent(
783
+ trace_progress=SSETraceProgress(
784
+ trace_id=payload.trace_id,
785
+ partial_result=_camel_keys(payload.model_dump(by_alias=True)),
786
+ )
787
+ )
788
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
789
+ elif tag == "progress":
790
+ evt = SSEProgressEvent(message=payload)
791
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
792
+ finally:
793
+ if not eval_task.done():
794
+ eval_task.cancel()
795
+ try:
796
+ await eval_task
797
+ except asyncio.CancelledError:
798
+ pass
718
799
 
719
800
  except Exception as exc:
720
801
  logger.exception("Evaluation stream failed")
@@ -775,13 +856,15 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
775
856
  """Evaluate OTLP JSON traces passed in the request body."""
776
857
  _check_json_body_size(raw_request)
777
858
  traces, eval_set = _parse_json_request(request)
859
+ resolved_creds = await _resolve_credentials(request.credential_refs)
778
860
 
779
861
  try:
780
- result = await run_evaluation_from_traces(
781
- traces=traces,
782
- config=request.config,
783
- eval_set=eval_set,
784
- )
862
+ with _scoped_credentials(resolved_creds):
863
+ result = await run_evaluation_from_traces(
864
+ traces=traces,
865
+ config=request.config,
866
+ eval_set=eval_set,
867
+ )
785
868
  run_id = await _maybe_persist_evaluate_run(
786
869
  raw_request,
787
870
  params=request.config,
@@ -793,6 +876,8 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
793
876
  if run_id:
794
877
  result.run_id = run_id
795
878
  return StandardResponse(data=_camel_keys(result.model_dump(by_alias=True)))
879
+ except HTTPException:
880
+ raise
796
881
  except Exception as exc:
797
882
  logger.exception("JSON evaluation failed")
798
883
  raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc
@@ -843,47 +928,56 @@ async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request:
843
928
  )
844
929
  await queue.put(("done", result))
845
930
 
846
- eval_task = asyncio.create_task(run_with_progress())
847
-
848
931
  try:
849
- while True:
850
- msg = await queue.get()
851
- tag, payload = msg
852
-
853
- if tag == "done":
854
- run_id = await _maybe_persist_evaluate_run(
855
- raw_request,
856
- params=request.config,
857
- eval_set_dict=request.eval_set,
858
- trace_format=None,
859
- upload_filenames=None,
860
- run_result=payload,
861
- )
862
- if run_id:
863
- payload.run_id = run_id
864
- evt = SSEDoneEvent(
865
- result=_camel_keys(payload.model_dump(by_alias=True)),
866
- )
867
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
868
- break
869
- elif tag == "trace_progress":
870
- evt = SSETraceProgressEvent(
871
- trace_progress=SSETraceProgress(
872
- trace_id=payload.trace_id,
873
- partial_result=_camel_keys(payload.model_dump(by_alias=True)),
932
+ resolved_creds = (
933
+ await resolve_credential_refs(request.credential_refs) if request.credential_refs else None
934
+ )
935
+ except ValueError as exc:
936
+ yield _sse_error(f"Could not resolve credentialRefs: {exc}")
937
+ return
938
+
939
+ with _scoped_credentials(resolved_creds):
940
+ eval_task = asyncio.create_task(run_with_progress())
941
+
942
+ try:
943
+ while True:
944
+ msg = await queue.get()
945
+ tag, payload = msg
946
+
947
+ if tag == "done":
948
+ run_id = await _maybe_persist_evaluate_run(
949
+ raw_request,
950
+ params=request.config,
951
+ eval_set_dict=request.eval_set,
952
+ trace_format=None,
953
+ upload_filenames=None,
954
+ run_result=payload,
874
955
  )
875
- )
876
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
877
- elif tag == "progress":
878
- evt = SSEProgressEvent(message=payload)
879
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
880
- finally:
881
- if not eval_task.done():
882
- eval_task.cancel()
883
- try:
884
- await eval_task
885
- except asyncio.CancelledError:
886
- pass
956
+ if run_id:
957
+ payload.run_id = run_id
958
+ evt = SSEDoneEvent(
959
+ result=_camel_keys(payload.model_dump(by_alias=True)),
960
+ )
961
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
962
+ break
963
+ elif tag == "trace_progress":
964
+ evt = SSETraceProgressEvent(
965
+ trace_progress=SSETraceProgress(
966
+ trace_id=payload.trace_id,
967
+ partial_result=_camel_keys(payload.model_dump(by_alias=True)),
968
+ )
969
+ )
970
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
971
+ elif tag == "progress":
972
+ evt = SSEProgressEvent(message=payload)
973
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
974
+ finally:
975
+ if not eval_task.done():
976
+ eval_task.cancel()
977
+ try:
978
+ await eval_task
979
+ except asyncio.CancelledError:
980
+ pass
887
981
 
888
982
  except Exception as exc:
889
983
  logger.exception("JSON evaluation stream failed")
@@ -27,6 +27,8 @@ from google.adk.evaluation.eval_metrics import (
27
27
  from google.adk.evaluation.eval_rubrics import Rubric, RubricContent
28
28
  from google.adk.evaluation.evaluator import EvaluationResult, Evaluator
29
29
 
30
+ from .resolvers import get_resolved_credential
31
+
30
32
  logger = logging.getLogger(__name__)
31
33
 
32
34
  METRICS_NEEDING_EXPECTED = {
@@ -267,6 +269,67 @@ def get_evaluator(eval_metric: EvalMetric) -> Evaluator:
267
269
  return DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric)
268
270
 
269
271
 
272
def _build_judge_model(model_id: str, api_key: str, base_url: str | None = None):
    """Return a judge ``BaseLlm`` for *model_id* that carries *api_key* itself, not via env.

    Two cases, chosen by the class ADK's ``LLMRegistry`` resolves *model_id* to:

    * LiteLlm-backed providers accept ``api_key`` (and an optional ``base_url``) as
      constructor kwargs, which are forwarded into every ``litellm.acompletion`` call.
    * The Gemini-native model class has no ``api_key`` parameter, so its cached
      ``google.genai`` client is swapped for one built from the resolved key.

    The registry lookup is treated as authoritative: the evaluator already resolved this
    same *model_id* to a model class when ``_setup_auto_rater`` ran at construction, so the
    lookup here cannot disagree with it or fail.
    """
    from google.adk.models.lite_llm import LiteLlm
    from google.adk.models.registry import LLMRegistry

    resolved_cls = LLMRegistry().resolve(model_id)
    if issubclass(resolved_cls, LiteLlm):
        lite_kwargs: dict[str, Any] = {"api_key": api_key, **({"base_url": base_url} if base_url else {})}
        return LiteLlm(model=model_id, **lite_kwargs)

    from google.adk.models.google_llm import Gemini
    from google.genai import Client
    from google.genai import types as genai_types

    gemini = Gemini(model=model_id)
    genai_kwargs: dict[str, Any] = {
        "api_key": api_key,
        **({"http_options": genai_types.HttpOptions(base_url=base_url)} if base_url else {}),
    }
    # ``api_client`` is a functools.cached_property, which memoizes into the instance
    # __dict__. Writing that slot first pre-empts the lazily-built client, so the judge
    # talks to the API with the resolved key.
    gemini.__dict__["api_client"] = Client(**genai_kwargs)
    return gemini
305
+
306
+
307
def _inject_judge_credential(evaluator: Evaluator, api_key: str, base_url: str | None = None) -> None:
    """Swap a judge evaluator's auto-rater model for one built from *api_key*.

    Detection is by the ADK private seam (``_judge_model_options`` / ``_judge_model``, set
    by ``LlmAsJudge._setup_auto_rater``) rather than by class. One code path therefore
    handles ``FinalResponseMatchV2Evaluator``, the ``rubric_based_*_v1`` evaluators, and
    ``HallucinationsV1Evaluator``, which exposes the same attributes without subclassing
    ``LlmAsJudge``. Evaluators without the seam, or without a resolved ``judge_model``,
    are left untouched and a warning is logged.

    ``get_evaluator`` returns a fresh instance per evaluation, so mutating the evaluator
    here shares no state and is safe across concurrent runs.

    TODO(upstream): propose that ADK ``JudgeModelOptions`` carry a credential or a prebuilt
    model instance, so judge auth no longer depends on this private seam or process env.
    """
    options = getattr(evaluator, "_judge_model_options", None)
    judge_backed = options is not None and hasattr(evaluator, "_judge_model")
    if not judge_backed:
        logger.warning("evaluator %s is not judge-backed; cannot inject credential", type(evaluator).__name__)
        return

    judge_model_id = getattr(options, "judge_model", None)
    if judge_model_id:
        evaluator._judge_model = _build_judge_model(judge_model_id, api_key, base_url)
        return

    logger.warning(
        "evaluator %s has no resolved judge_model; skipping credential injection", type(evaluator).__name__
    )
331
+
332
+
270
333
  def extract_trajectory_details(eval_result: EvaluationResult) -> dict[str, Any]:
271
334
  """Extract expected vs actual tool call details from trajectory evaluation."""
272
335
  comparisons = []
@@ -305,6 +368,8 @@ async def evaluate_builtin_metric(
305
368
  judge_model: str | None,
306
369
  threshold: float | None,
307
370
  match_type: str | None = None,
371
+ credential_ref: str | None = None,
372
+ judge_base_url: str | None = None,
308
373
  ) -> dict[str, Any]:
309
374
  """Evaluate a single built-in ADK metric.
310
375
 
@@ -326,6 +391,18 @@ async def evaluate_builtin_metric(
326
391
  eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
327
392
  evaluator: Evaluator = get_evaluator(eval_metric)
328
393
 
394
+ if credential_ref:
395
+ api_key = get_resolved_credential(credential_ref)
396
+ if api_key is None:
397
+ return MetricResult(
398
+ metric_name=metric_name,
399
+ error=(
400
+ f"Metric '{metric_name}' references credential '{credential_ref}', "
401
+ f"which was not provided in the run's credentialRefs."
402
+ ),
403
+ )
404
+ _inject_judge_credential(evaluator, api_key, judge_base_url)
405
+
329
406
  if metric_name in _METRICS_NEEDING_INVOCATION_EVENTS:
330
407
  actual_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in actual_invocations])
331
408
  if expected_invocations is not None:
@@ -27,6 +27,14 @@ class BuiltinMetricDef(BaseModel):
27
27
  threshold: float | None = Field(default=None, ge=0, le=1)
28
28
  judge_model: str | None = None
29
29
  trajectory_match_type: str | None = None
30
+ credential_ref: str | None = Field(
31
+ default=None,
32
+ description="Logical name of a RunSpec.credential_refs entry whose resolved value is the judge API key.",
33
+ )
34
+ judge_base_url: str | None = Field(
35
+ default=None,
36
+ description="Optional base URL for the judge endpoint (e.g. an OpenAI-compatible proxy).",
37
+ )
30
38
 
31
39
  @field_validator("trajectory_match_type")
32
40
  @classmethod
@@ -453,6 +453,8 @@ async def evaluate_custom_evaluator(
453
453
  judge_model=evaluator_def.judge_model,
454
454
  threshold=evaluator_def.threshold,
455
455
  match_type=evaluator_def.trajectory_match_type,
456
+ credential_ref=evaluator_def.credential_ref,
457
+ judge_base_url=evaluator_def.judge_base_url,
456
458
  )
457
459
 
458
460
  if isinstance(evaluator_def, OpenAIEvalDef):