agentevals-cli 0.9.4__tar.gz → 0.9.5__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
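Files changed (285; only entries 1–4 contain line changes, entries 5–285 are listed with +0 -0 because only the versioned top-level directory name differs)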
  1. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/PKG-INFO +1 -1
  2. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/models.py +9 -0
  3. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/routes.py +178 -84
  4. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_api.py +210 -0
  5. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/eval/SKILL.md +0 -0
  6. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/eval/evals/evals.json +0 -0
  7. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/inspect/SKILL.md +0 -0
  8. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/inspect/evals/evals.json +0 -0
  9. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.dockerignore +0 -0
  10. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  11. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  12. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  13. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/workflows/ci.yml +0 -0
  14. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/workflows/publish-evaluator-sdk.yml +0 -0
  15. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/workflows/release.yml +0 -0
  16. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.gitignore +0 -0
  17. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.mcp.json +0 -0
  18. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/CONTRIBUTING.md +0 -0
  19. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/DEVELOPMENT.md +0 -0
  20. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/Dockerfile +0 -0
  21. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/LICENSE +0 -0
  22. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/Makefile +0 -0
  23. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/README.md +0 -0
  24. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/Chart.yaml +0 -0
  25. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/NOTES.txt +0 -0
  26. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/_helpers.tpl +0 -0
  27. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/deployment.yaml +0 -0
  28. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
  29. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql.yaml +0 -0
  30. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/rbac.yaml +0 -0
  31. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/service.yaml +0 -0
  32. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/serviceaccount.yaml +0 -0
  33. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/values.yaml +0 -0
  34. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/assets/logo-color-on-transparent.svg +0 -0
  35. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/assets/logo-color.png +0 -0
  36. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/assets/logo-dark-on-transparent.svg +0 -0
  37. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/custom-evaluators.md +0 -0
  38. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/eval-set-format.md +0 -0
  39. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/otel-compatibility.md +0 -0
  40. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/streaming.md +0 -0
  41. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/README.md +0 -0
  42. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config.yaml +0 -0
  43. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config_openai_eval.yaml +0 -0
  44. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/response_quality.py +0 -0
  45. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/tool_call_checker.py +0 -0
  46. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/README.md +0 -0
  47. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
  48. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
  49. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/pyproject.toml +0 -0
  50. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/README.md +0 -0
  51. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/agent.py +0 -0
  52. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/eval_set.json +0 -0
  53. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/main.py +0 -0
  54. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/test_streaming.py +0 -0
  55. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/kubernetes/README.md +0 -0
  56. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/README.md +0 -0
  57. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/agent.py +0 -0
  58. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/eval_set.json +0 -0
  59. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/main.py +0 -0
  60. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/requirements.txt +0 -0
  61. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/test_streaming.py +0 -0
  62. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/async_example.py +0 -0
  63. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/context_manager_example.py +0 -0
  64. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/decorator_example.py +0 -0
  65. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/requirements.txt +0 -0
  66. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/agent.py +0 -0
  67. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/eval_set.json +0 -0
  68. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/main.py +0 -0
  69. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/requirements.txt +0 -0
  70. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/requirements.txt +0 -0
  71. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/run.py +0 -0
  72. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/requirements.txt +0 -0
  73. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/run.py +0 -0
  74. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/requirements.txt +0 -0
  75. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/run.py +0 -0
  76. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
  77. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/run.py +0 -0
  78. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
  79. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
  80. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/requirements.txt +0 -0
  81. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/run.py +0 -0
  82. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/flake.lock +0 -0
  83. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/flake.nix +0 -0
  84. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/README.md +0 -0
  85. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/pyproject.toml +0 -0
  86. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
  87. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
  88. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
  89. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/pyproject.toml +0 -0
  90. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/eval_set_helm.json +0 -0
  91. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/evalset_helm_3_2026-02-23.json +0 -0
  92. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/evalset_k8s_2026-02-20.json +0 -0
  93. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/helm.json +0 -0
  94. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/helm_2.json +0 -0
  95. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/helm_3.json +0 -0
  96. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/k8s.json +0 -0
  97. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/tempo_export_with_batches.json +0 -0
  98. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/__init__.py +0 -0
  99. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_protocol.py +0 -0
  100. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
  101. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-RIquRPno.js +0 -0
  102. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/index.html +0 -0
  103. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/logo.svg +0 -0
  104. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/vite.svg +0 -0
  105. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/__init__.py +0 -0
  106. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/app.py +0 -0
  107. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/debug_routes.py +0 -0
  108. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/dependencies.py +0 -0
  109. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_app.py +0 -0
  110. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_grpc.py +0 -0
  111. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_processing.py +0 -0
  112. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_routes.py +0 -0
  113. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/runs_routes.py +0 -0
  114. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/streaming_routes.py +0 -0
  115. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/builtin_metrics.py +0 -0
  116. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/cli.py +0 -0
  117. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/config.py +0 -0
  118. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/converter.py +0 -0
  119. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/custom_evaluators.py +0 -0
  120. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/eval_config_loader.py +0 -0
  121. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/__init__.py +0 -0
  122. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/resolver.py +0 -0
  123. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/sources.py +0 -0
  124. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/templates.py +0 -0
  125. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/venv.py +0 -0
  126. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/extraction.py +0 -0
  127. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/genai_converter.py +0 -0
  128. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/__init__.py +0 -0
  129. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/auto.py +0 -0
  130. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/base.py +0 -0
  131. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/jaeger.py +0 -0
  132. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/otlp.py +0 -0
  133. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/mcp_server.py +0 -0
  134. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/openai_eval_backend.py +0 -0
  135. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/output.py +0 -0
  136. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/resolvers/__init__.py +0 -0
  137. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/resolvers/kubernetes.py +0 -0
  138. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/__init__.py +0 -0
  139. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/fetcher.py +0 -0
  140. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/result_builder.py +0 -0
  141. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/service.py +0 -0
  142. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/sinks.py +0 -0
  143. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/worker.py +0 -0
  144. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/runner.py +0 -0
  145. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/sdk.py +0 -0
  146. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/__init__.py +0 -0
  147. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/config.py +0 -0
  148. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/models.py +0 -0
  149. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/__init__.py +0 -0
  150. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
  151. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
  152. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrator.py +0 -0
  153. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/pool.py +0 -0
  154. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/__init__.py +0 -0
  155. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/memory.py +0 -0
  156. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/postgres.py +0 -0
  157. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/__init__.py +0 -0
  158. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/incremental_processor.py +0 -0
  159. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/processor.py +0 -0
  160. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/session.py +0 -0
  161. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/ws_server.py +0 -0
  162. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/trace_attrs.py +0 -0
  163. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/trace_metrics.py +0 -0
  164. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/__init__.py +0 -0
  165. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/genai_messages.py +0 -0
  166. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/log_buffer.py +0 -0
  167. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/log_enrichment.py +0 -0
  168. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/api/__init__.py +0 -0
  169. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/api/test_evaluate_persistence.py +0 -0
  170. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/api/test_runs_routes.py +0 -0
  171. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/__init__.py +0 -0
  172. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/conftest.py +0 -0
  173. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_evaluation_pipeline.py +0 -0
  174. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_live_agents.py +0 -0
  175. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_otlp_grpc_receiver.py +0 -0
  176. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_session_grouping.py +0 -0
  177. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_timing_stress.py +0 -0
  178. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/resolvers/__init__.py +0 -0
  179. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/resolvers/test_kubernetes.py +0 -0
  180. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/resolvers/test_registry.py +0 -0
  181. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/__init__.py +0 -0
  182. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_fetcher.py +0 -0
  183. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_result_builder.py +0 -0
  184. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_service.py +0 -0
  185. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_sinks.py +0 -0
  186. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/__init__.py +0 -0
  187. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_config.py +0 -0
  188. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_memory_repos.py +0 -0
  189. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_migrator.py +0 -0
  190. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_models.py +0 -0
  191. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_cli.py +0 -0
  192. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_converter.py +0 -0
  193. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_credential_injection.py +0 -0
  194. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_eval_config_loader.py +0 -0
  195. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_extraction.py +0 -0
  196. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_genai_converter.py +0 -0
  197. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_jaeger_loader.py +0 -0
  198. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_loader_auto.py +0 -0
  199. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_log_enrichment.py +0 -0
  200. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_mcp_server.py +0 -0
  201. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_openai_eval_backend.py +0 -0
  202. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_otlp_loader.py +0 -0
  203. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_otlp_receiver.py +0 -0
  204. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_output.py +0 -0
  205. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_protocol.py +0 -0
  206. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_runner.py +0 -0
  207. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_sdk.py +0 -0
  208. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_trace_metrics.py +0 -0
  209. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/.gitignore +0 -0
  210. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/README.md +0 -0
  211. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/eslint.config.js +0 -0
  212. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/index.html +0 -0
  213. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/package-lock.json +0 -0
  214. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/package.json +0 -0
  215. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/public/logo.svg +0 -0
  216. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/public/vite.svg +0 -0
  217. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/App.css +0 -0
  218. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/App.tsx +0 -0
  219. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/api/client.ts +0 -0
  220. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/assets/react.svg +0 -0
  221. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
  222. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
  223. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
  224. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
  225. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderHeader.tsx +0 -0
  226. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderView.tsx +0 -0
  227. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
  228. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCasesList.tsx +0 -0
  229. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/InvocationEditor.tsx +0 -0
  230. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/JsonPreview.tsx +0 -0
  231. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/MetadataEditor.tsx +0 -0
  232. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
  233. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/index.ts +0 -0
  234. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/DashboardView.tsx +0 -0
  235. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
  236. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
  237. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
  238. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
  239. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceCard.tsx +0 -0
  240. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceTable.tsx +0 -0
  241. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
  242. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/DataSection.tsx +0 -0
  243. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
  244. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
  245. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorView.tsx +0 -0
  246. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationCard.tsx +0 -0
  247. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
  248. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
  249. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
  250. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
  251. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/ToolCallList.tsx +0 -0
  252. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
  253. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/sidebar/Sidebar.tsx +0 -0
  254. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
  255. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveMessage.tsx +0 -0
  256. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
  257. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionCard.tsx +0 -0
  258. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
  259. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
  260. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/FileDropZone.tsx +0 -0
  261. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/MetricSelector.tsx +0 -0
  262. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
  263. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
  264. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/UploadView.tsx +0 -0
  265. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/welcome/WelcomeView.tsx +0 -0
  266. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/config.ts +0 -0
  267. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/context/TraceContext.tsx +0 -0
  268. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/context/TraceProvider.tsx +0 -0
  269. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/index.css +0 -0
  270. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/console-capture.ts +0 -0
  271. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/eval-config.ts +0 -0
  272. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/evalset-builder.ts +0 -0
  273. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/network-capture.ts +0 -0
  274. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-helpers.ts +0 -0
  275. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-loader.ts +0 -0
  276. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-metadata.ts +0 -0
  277. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-patcher.ts +0 -0
  278. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/types.ts +0 -0
  279. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/utils.ts +0 -0
  280. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/main.tsx +0 -0
  281. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/tsconfig.app.json +0 -0
  282. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/tsconfig.json +0 -0
  283. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/tsconfig.node.json +0 -0
  284. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/vite.config.ts +0 -0
  285. {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentevals-cli
3
- Version: 0.9.4
3
+ Version: 0.9.5
4
4
  Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.11
@@ -142,6 +142,15 @@ class EvaluateJsonRequest(CamelModel):
142
142
  traces: dict = Field(description="OTLP JSON export with resourceSpans structure.")
143
143
  config: EvalParams = Field(default_factory=EvalParams, description="Evaluation parameters.")
144
144
  eval_set: dict | None = Field(default=None, description="Optional ADK EvalSet JSON.")
145
+ credential_refs: dict[str, dict[str, Any]] | None = Field(
146
+ default=None,
147
+ description=(
148
+ "Map of logical credential name to a secret reference dict. Each reference has a "
149
+ "'kind' (the resolver to use) plus that kind's locator fields. Resolved per call to its "
150
+ "secret value; never written to the process environment. How a value is used (e.g. which "
151
+ "judge provider it authenticates) is configured on the consumer, not the reference."
152
+ ),
153
+ )
145
154
 
146
155
 
147
156
  # ---------------------------------------------------------------------------
@@ -9,6 +9,7 @@ import os
9
9
  import re
10
10
  import shutil
11
11
  import tempfile
12
+ from contextlib import contextmanager
12
13
  from typing import Any
13
14
 
14
15
  from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
@@ -23,6 +24,11 @@ from ..converter import convert_traces
23
24
  from ..extraction import get_extractor
24
25
  from ..loader import load_traces
25
26
  from ..loader.otlp import OtlpJsonLoader
27
+ from ..resolvers import (
28
+ reset_resolved_credentials,
29
+ resolve_credential_refs,
30
+ set_resolved_credentials,
31
+ )
26
32
  from ..runner import (
27
33
  RunResult,
28
34
  load_eval_set,
@@ -53,6 +59,57 @@ from .models import (
53
59
  logger = logging.getLogger(__name__)
54
60
 
55
61
 
62
+ @contextmanager
63
+ def _scoped_credentials(resolved: dict[str, str] | None):
64
+ """Scope an already-resolved ``logical-name -> secret value`` map to the current task.
65
+
66
+ Mirrors the async worker's set/reset (``run/worker.py``) so the synchronous evaluate
67
+ paths populate the same credential ContextVar that judge graders read. A falsy map is a
68
+ no-op, keeping callers byte-for-byte backward compatible. For streaming endpoints, enter
69
+ this BEFORE ``asyncio.create_task`` so the eval task inherits the populated context (a
70
+ child task snapshots its parent's context at creation time). Resolution is done by the
71
+ caller so its failures surface as request errors rather than scoping concerns.
72
+ """
73
+ token = set_resolved_credentials(resolved) if resolved else None
74
+ try:
75
+ yield
76
+ finally:
77
+ if token is not None:
78
+ reset_resolved_credentials(token)
79
+
80
+
81
+ async def _resolve_credentials(refs: dict[str, dict[str, Any]] | None) -> dict[str, str] | None:
82
+ """Resolve credentialRefs to secret values, mapping bad references to a 400.
83
+
84
+ Resolver ``ValueError``s (missing/unknown ``kind``, missing locator fields, an unset
85
+ env var) are request/input errors, so surface them as 400s instead of letting them
86
+ bubble up as 500s. Infrastructure failures from custom resolvers raise other exception
87
+ types and are left to propagate as 5xx.
88
+ """
89
+ if not refs:
90
+ return None
91
+ try:
92
+ return await resolve_credential_refs(refs)
93
+ except ValueError as exc:
94
+ raise HTTPException(status_code=400, detail=f"Could not resolve credentialRefs: {exc}") from exc
95
+
96
+
97
+ def _parse_credential_refs_form(raw: str | None) -> dict[str, dict[str, Any]] | None:
98
+ """Parse and validate the multipart ``credential_refs`` form field (a JSON object string).
99
+
100
+ Empty/absent is treated as no credentials. Raises ``ValueError`` (which
101
+ ``json.JSONDecodeError`` subclasses) on malformed JSON or a non-object shape, so callers
102
+ map both to the same error they use for a bad ``config``. The JSON request endpoints get
103
+ this shape check for free from the ``EvaluateJsonRequest`` model.
104
+ """
105
+ if not raw:
106
+ return None
107
+ refs = json.loads(raw)
108
+ if not isinstance(refs, dict) or not all(isinstance(ref, dict) for ref in refs.values()):
109
+ raise ValueError("credentialRefs must be a JSON object mapping each logical name to a reference object")
110
+ return refs
111
+
112
+
56
113
  def _camel_keys(obj: Any) -> Any:
57
114
  """Recursively convert dict keys from snake_case to camelCase."""
58
115
  if isinstance(obj, dict):
@@ -462,6 +519,7 @@ async def evaluate_traces(
462
519
  trace_files: list[UploadFile] = File(...),
463
520
  config: str = Form(...),
464
521
  eval_set_file: UploadFile | None = File(None),
522
+ credential_refs: str | None = Form(None),
465
523
  ):
466
524
  """
467
525
  Evaluate agent traces using the provided evaluator configuration.
@@ -470,6 +528,8 @@ async def evaluate_traces(
470
528
  trace_files: List of Jaeger or OTLP JSON trace files
471
529
  config: JSON string with evaluation configuration
472
530
  eval_set_file: Optional golden eval set file
531
+ credential_refs: Optional JSON string mapping logical credential names to
532
+ secret references, resolved so LLM-as-Judge graders can authenticate
473
533
 
474
534
  Returns:
475
535
  RunResult with trace results and any errors
@@ -481,6 +541,11 @@ async def evaluate_traces(
481
541
  except json.JSONDecodeError as exc:
482
542
  raise HTTPException(status_code=400, detail=f"Invalid config JSON: {exc}") from exc
483
543
 
544
+ try:
545
+ cred_refs = _parse_credential_refs_form(credential_refs)
546
+ except ValueError as exc:
547
+ raise HTTPException(status_code=400, detail=f"Invalid credentialRefs: {exc}") from exc
548
+
484
549
  trace_paths = []
485
550
  for trace_file in trace_files:
486
551
  if not trace_file.filename:
@@ -548,7 +613,9 @@ async def evaluate_traces(
548
613
  len(trace_paths),
549
614
  [e.name for e in eval_config.evaluators],
550
615
  )
551
- result = await run_evaluation(eval_config)
616
+ resolved_creds = await _resolve_credentials(cred_refs)
617
+ with _scoped_credentials(resolved_creds):
618
+ result = await run_evaluation(eval_config)
552
619
 
553
620
  run_id = await _maybe_persist_evaluate_run(
554
621
  request,
@@ -580,6 +647,7 @@ async def evaluate_traces_stream(
580
647
  trace_files: list[UploadFile] = File(...),
581
648
  config: str = Form(...),
582
649
  eval_set_file: UploadFile | None = File(None),
650
+ credential_refs: str | None = Form(None),
583
651
  ):
584
652
  """Evaluate traces with real-time progress via SSE."""
585
653
  temp_dir = tempfile.mkdtemp()
@@ -593,6 +661,12 @@ async def evaluate_traces_stream(
593
661
  yield f"data: {SSEErrorEvent(error=f'Invalid config JSON: {exc}').model_dump_json(by_alias=True)}\n\n"
594
662
  return
595
663
 
664
+ try:
665
+ cred_refs = _parse_credential_refs_form(credential_refs)
666
+ except ValueError as exc:
667
+ yield f"data: {SSEErrorEvent(error=f'Invalid credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
668
+ return
669
+
596
670
  trace_paths = []
597
671
  for trace_file in trace_files:
598
672
  if not trace_file.filename:
@@ -674,47 +748,54 @@ async def evaluate_traces_stream(
674
748
  result = await run_evaluation(eval_config, progress_callback, trace_progress_callback)
675
749
  await queue.put(("done", result))
676
750
 
677
- eval_task = asyncio.create_task(run_with_progress())
678
-
679
751
  try:
680
- while True:
681
- msg = await queue.get()
682
- tag, payload = msg
683
-
684
- if tag == "done":
685
- run_id = await _maybe_persist_evaluate_run(
686
- request,
687
- params=eval_config,
688
- eval_set_dict=_load_eval_set_dict(eval_set_path),
689
- trace_format=eval_config.trace_format,
690
- upload_filenames=upload_filenames,
691
- run_result=payload,
692
- )
693
- if run_id:
694
- payload.run_id = run_id
695
- evt = SSEDoneEvent(
696
- result=_camel_keys(payload.model_dump(by_alias=True)),
697
- )
698
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
699
- break
700
- elif tag == "trace_progress":
701
- evt = SSETraceProgressEvent(
702
- trace_progress=SSETraceProgress(
703
- trace_id=payload.trace_id,
704
- partial_result=_camel_keys(payload.model_dump(by_alias=True)),
752
+ resolved_creds = await resolve_credential_refs(cred_refs) if cred_refs else None
753
+ except ValueError as exc:
754
+ yield f"data: {SSEErrorEvent(error=f'Could not resolve credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
755
+ return
756
+
757
+ with _scoped_credentials(resolved_creds):
758
+ eval_task = asyncio.create_task(run_with_progress())
759
+
760
+ try:
761
+ while True:
762
+ msg = await queue.get()
763
+ tag, payload = msg
764
+
765
+ if tag == "done":
766
+ run_id = await _maybe_persist_evaluate_run(
767
+ request,
768
+ params=eval_config,
769
+ eval_set_dict=_load_eval_set_dict(eval_set_path),
770
+ trace_format=eval_config.trace_format,
771
+ upload_filenames=upload_filenames,
772
+ run_result=payload,
705
773
  )
706
- )
707
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
708
- elif tag == "progress":
709
- evt = SSEProgressEvent(message=payload)
710
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
711
- finally:
712
- if not eval_task.done():
713
- eval_task.cancel()
714
- try:
715
- await eval_task
716
- except asyncio.CancelledError:
717
- pass
774
+ if run_id:
775
+ payload.run_id = run_id
776
+ evt = SSEDoneEvent(
777
+ result=_camel_keys(payload.model_dump(by_alias=True)),
778
+ )
779
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
780
+ break
781
+ elif tag == "trace_progress":
782
+ evt = SSETraceProgressEvent(
783
+ trace_progress=SSETraceProgress(
784
+ trace_id=payload.trace_id,
785
+ partial_result=_camel_keys(payload.model_dump(by_alias=True)),
786
+ )
787
+ )
788
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
789
+ elif tag == "progress":
790
+ evt = SSEProgressEvent(message=payload)
791
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
792
+ finally:
793
+ if not eval_task.done():
794
+ eval_task.cancel()
795
+ try:
796
+ await eval_task
797
+ except asyncio.CancelledError:
798
+ pass
718
799
 
719
800
  except Exception as exc:
720
801
  logger.exception("Evaluation stream failed")
@@ -775,13 +856,15 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
775
856
  """Evaluate OTLP JSON traces passed in the request body."""
776
857
  _check_json_body_size(raw_request)
777
858
  traces, eval_set = _parse_json_request(request)
859
+ resolved_creds = await _resolve_credentials(request.credential_refs)
778
860
 
779
861
  try:
780
- result = await run_evaluation_from_traces(
781
- traces=traces,
782
- config=request.config,
783
- eval_set=eval_set,
784
- )
862
+ with _scoped_credentials(resolved_creds):
863
+ result = await run_evaluation_from_traces(
864
+ traces=traces,
865
+ config=request.config,
866
+ eval_set=eval_set,
867
+ )
785
868
  run_id = await _maybe_persist_evaluate_run(
786
869
  raw_request,
787
870
  params=request.config,
@@ -793,6 +876,8 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
793
876
  if run_id:
794
877
  result.run_id = run_id
795
878
  return StandardResponse(data=_camel_keys(result.model_dump(by_alias=True)))
879
+ except HTTPException:
880
+ raise
796
881
  except Exception as exc:
797
882
  logger.exception("JSON evaluation failed")
798
883
  raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc
@@ -843,47 +928,56 @@ async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request:
843
928
  )
844
929
  await queue.put(("done", result))
845
930
 
846
- eval_task = asyncio.create_task(run_with_progress())
847
-
848
931
  try:
849
- while True:
850
- msg = await queue.get()
851
- tag, payload = msg
852
-
853
- if tag == "done":
854
- run_id = await _maybe_persist_evaluate_run(
855
- raw_request,
856
- params=request.config,
857
- eval_set_dict=request.eval_set,
858
- trace_format=None,
859
- upload_filenames=None,
860
- run_result=payload,
861
- )
862
- if run_id:
863
- payload.run_id = run_id
864
- evt = SSEDoneEvent(
865
- result=_camel_keys(payload.model_dump(by_alias=True)),
866
- )
867
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
868
- break
869
- elif tag == "trace_progress":
870
- evt = SSETraceProgressEvent(
871
- trace_progress=SSETraceProgress(
872
- trace_id=payload.trace_id,
873
- partial_result=_camel_keys(payload.model_dump(by_alias=True)),
932
+ resolved_creds = (
933
+ await resolve_credential_refs(request.credential_refs) if request.credential_refs else None
934
+ )
935
+ except ValueError as exc:
936
+ yield _sse_error(f"Could not resolve credentialRefs: {exc}")
937
+ return
938
+
939
+ with _scoped_credentials(resolved_creds):
940
+ eval_task = asyncio.create_task(run_with_progress())
941
+
942
+ try:
943
+ while True:
944
+ msg = await queue.get()
945
+ tag, payload = msg
946
+
947
+ if tag == "done":
948
+ run_id = await _maybe_persist_evaluate_run(
949
+ raw_request,
950
+ params=request.config,
951
+ eval_set_dict=request.eval_set,
952
+ trace_format=None,
953
+ upload_filenames=None,
954
+ run_result=payload,
874
955
  )
875
- )
876
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
877
- elif tag == "progress":
878
- evt = SSEProgressEvent(message=payload)
879
- yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
880
- finally:
881
- if not eval_task.done():
882
- eval_task.cancel()
883
- try:
884
- await eval_task
885
- except asyncio.CancelledError:
886
- pass
956
+ if run_id:
957
+ payload.run_id = run_id
958
+ evt = SSEDoneEvent(
959
+ result=_camel_keys(payload.model_dump(by_alias=True)),
960
+ )
961
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
962
+ break
963
+ elif tag == "trace_progress":
964
+ evt = SSETraceProgressEvent(
965
+ trace_progress=SSETraceProgress(
966
+ trace_id=payload.trace_id,
967
+ partial_result=_camel_keys(payload.model_dump(by_alias=True)),
968
+ )
969
+ )
970
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
971
+ elif tag == "progress":
972
+ evt = SSEProgressEvent(message=payload)
973
+ yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
974
+ finally:
975
+ if not eval_task.done():
976
+ eval_task.cancel()
977
+ try:
978
+ await eval_task
979
+ except asyncio.CancelledError:
980
+ pass
887
981
 
888
982
  except Exception as exc:
889
983
  logger.exception("JSON evaluation stream failed")
@@ -229,6 +229,35 @@ def _eval_config_json(**overrides) -> str:
229
229
  return json.dumps(cfg)
230
230
 
231
231
 
232
+ def _judge_config(**overrides) -> dict:
233
+ cfg = {
234
+ "evaluators": [
235
+ {"name": "hallucinations_v1", "type": "builtin", "judgeModel": "openai/gpt-4o", "credentialRef": "k"}
236
+ ]
237
+ }
238
+ cfg.update(overrides)
239
+ return cfg
240
+
241
+
242
+ def _capturing_run_eval(captured: dict):
243
+ """Build an AsyncMock side_effect that records, at evaluator-invocation time, the value the
244
+ judge would resolve for credential ``k``.
245
+
246
+ This is the correct boundary for the sync routes: their job is to populate the credential
247
+ ContextVar before the evaluator runs. The ContextVar -> judge injection step itself is
248
+ already covered by test_credential_injection.py, so recording ``get_resolved_credential``
249
+ here (rather than mocking it) is not a false positive -- it fails when the route omits the
250
+ set/reset, which is exactly the gap being closed.
251
+ """
252
+ from agentevals.resolvers import get_resolved_credential
253
+
254
+ def _side_effect(*args, **kwargs):
255
+ captured["judge_key"] = get_resolved_credential("k")
256
+ return _make_run_result()
257
+
258
+ return _side_effect
259
+
260
+
232
261
  # ---------------------------------------------------------------------------
233
262
  # Model Serialization
234
263
  # ---------------------------------------------------------------------------
@@ -528,6 +557,68 @@ class TestEvaluateTraces:
528
557
  )
529
558
  assert resp.status_code in (400, 422)
530
559
 
560
+ @patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
561
+ def test_evaluate_resolves_credential_refs(self, mock_eval, monkeypatch):
562
+ monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-multipart")
563
+ captured: dict = {}
564
+ mock_eval.side_effect = _capturing_run_eval(captured)
565
+ resp = self.client.post(
566
+ "/api/evaluate",
567
+ files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
568
+ data={
569
+ "config": json.dumps(_judge_config()),
570
+ "credential_refs": json.dumps({"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}}),
571
+ },
572
+ )
573
+ _assert_envelope(resp)
574
+ assert captured["judge_key"] == "sk-resolved-multipart"
575
+
576
+ @patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
577
+ def test_evaluate_without_credential_refs_is_noop(self, mock_eval):
578
+ captured: dict = {}
579
+ mock_eval.side_effect = _capturing_run_eval(captured)
580
+ resp = self.client.post(
581
+ "/api/evaluate",
582
+ files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
583
+ data={"config": _eval_config_json()},
584
+ )
585
+ _assert_envelope(resp)
586
+ assert captured["judge_key"] is None
587
+
588
+ def test_evaluate_bad_credential_refs_returns_400(self):
589
+ resp = self.client.post(
590
+ "/api/evaluate",
591
+ files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
592
+ data={"config": _eval_config_json(), "credential_refs": "{not json"},
593
+ )
594
+ assert resp.status_code == 400
595
+ assert "credentialRefs" in resp.json()["detail"]
596
+
597
+ def test_evaluate_credential_refs_wrong_shape_returns_400(self):
598
+ resp = self.client.post(
599
+ "/api/evaluate",
600
+ files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
601
+ data={"config": _eval_config_json(), "credential_refs": json.dumps(["not", "a", "map"])},
602
+ )
603
+ assert resp.status_code == 400
604
+ assert "credentialRefs" in resp.json()["detail"]
605
+
606
+ @patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
607
+ def test_evaluate_unresolvable_credential_returns_400(self, mock_eval, monkeypatch):
608
+ monkeypatch.delenv("AE_MISSING_KEY", raising=False)
609
+ mock_eval.return_value = _make_run_result()
610
+ resp = self.client.post(
611
+ "/api/evaluate",
612
+ files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
613
+ data={
614
+ "config": json.dumps(_judge_config()),
615
+ "credential_refs": json.dumps({"k": {"kind": "env", "name": "AE_MISSING_KEY"}}),
616
+ },
617
+ )
618
+ assert resp.status_code == 400
619
+ assert "Could not resolve credentialRefs" in resp.json()["detail"]
620
+ mock_eval.assert_not_called()
621
+
531
622
 
532
623
  # ---------------------------------------------------------------------------
533
624
  # POST /api/evaluate/stream (SSE)
@@ -591,6 +682,34 @@ class TestEvaluateStream:
591
682
  assert "result" in done
592
683
  assert "traceResults" in done["result"]
593
684
 
685
+ @patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
686
+ @patch("agentevals.api.routes.load_traces")
687
+ def test_stream_resolves_credential_refs(self, mock_load_traces, mock_eval, monkeypatch):
688
+ monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-stream")
689
+ mock_load_traces.return_value = []
690
+ captured: dict = {}
691
+ mock_eval.side_effect = _capturing_run_eval(captured)
692
+ resp = self.client.post(
693
+ "/api/evaluate/stream",
694
+ files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
695
+ data={
696
+ "config": json.dumps(_judge_config()),
697
+ "credential_refs": json.dumps({"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}}),
698
+ },
699
+ )
700
+ assert '"done"' in resp.text
701
+ assert captured["judge_key"] == "sk-resolved-stream"
702
+
703
+ def test_stream_bad_credential_refs(self):
704
+ resp = self.client.post(
705
+ "/api/evaluate/stream",
706
+ files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
707
+ data={"config": _eval_config_json(), "credential_refs": "{not json"},
708
+ )
709
+ assert resp.status_code == 200
710
+ assert '"error"' in resp.text
711
+ assert "credentialRefs" in resp.text
712
+
594
713
 
595
714
  # ---------------------------------------------------------------------------
596
715
  # POST /api/evaluate/json
@@ -767,6 +886,56 @@ class TestEvaluateJson:
767
886
  body = _assert_envelope(resp)
768
887
  assert "traceResults" in body["data"]
769
888
 
889
+ @patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
890
+ def test_evaluate_json_resolves_credential_refs(self, mock_eval, monkeypatch):
891
+ monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-json")
892
+ captured: dict = {}
893
+ mock_eval.side_effect = _capturing_run_eval(captured)
894
+ resp = self.client.post(
895
+ "/api/evaluate/json",
896
+ json={
897
+ "traces": _make_otlp_json_payload(),
898
+ "config": _judge_config(),
899
+ "credentialRefs": {"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}},
900
+ },
901
+ )
902
+ _assert_envelope(resp)
903
+ assert captured["judge_key"] == "sk-resolved-json"
904
+
905
+ @patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
906
+ def test_evaluate_json_without_credential_refs_is_noop(self, mock_eval):
907
+ captured: dict = {}
908
+ mock_eval.side_effect = _capturing_run_eval(captured)
909
+ resp = self.client.post(
910
+ "/api/evaluate/json",
911
+ json={"traces": _make_otlp_json_payload(), "config": _judge_config()},
912
+ )
913
+ _assert_envelope(resp)
914
+ assert captured["judge_key"] is None
915
+
916
+ def test_evaluate_json_credential_refs_wrong_shape_returns_422(self):
917
+ resp = self.client.post(
918
+ "/api/evaluate/json",
919
+ json={"traces": _make_otlp_json_payload(), "credentialRefs": ["not", "a", "map"]},
920
+ )
921
+ assert resp.status_code == 422
922
+
923
+ @patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
924
+ def test_evaluate_json_unresolvable_credential_returns_400(self, mock_eval, monkeypatch):
925
+ monkeypatch.delenv("AE_MISSING_KEY", raising=False)
926
+ mock_eval.return_value = _make_run_result()
927
+ resp = self.client.post(
928
+ "/api/evaluate/json",
929
+ json={
930
+ "traces": _make_otlp_json_payload(),
931
+ "config": _judge_config(),
932
+ "credentialRefs": {"k": {"kind": "env", "name": "AE_MISSING_KEY"}},
933
+ },
934
+ )
935
+ assert resp.status_code == 400
936
+ assert "Could not resolve credentialRefs" in resp.json()["detail"]
937
+ mock_eval.assert_not_called()
938
+
770
939
 
771
940
  # ---------------------------------------------------------------------------
772
941
  # POST /api/evaluate/json/stream (SSE)
@@ -827,6 +996,47 @@ class TestEvaluateJsonStream:
827
996
  assert '"error"' in body
828
997
  assert "No traces" in body
829
998
 
999
+ @patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
1000
+ @patch("agentevals.api.routes.OtlpJsonLoader")
1001
+ def test_stream_resolves_credential_refs(self, mock_loader_cls, mock_eval, monkeypatch):
1002
+ monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-json-stream")
1003
+ mock_trace = MagicMock()
1004
+ mock_trace.trace_id = "abc123"
1005
+ mock_loader_cls.return_value.load_from_dict.return_value = [mock_trace]
1006
+ captured: dict = {}
1007
+ mock_eval.side_effect = _capturing_run_eval(captured)
1008
+ resp = self.client.post(
1009
+ "/api/evaluate/json/stream",
1010
+ json={
1011
+ "traces": _make_otlp_json_payload(),
1012
+ "config": _judge_config(),
1013
+ "credentialRefs": {"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}},
1014
+ },
1015
+ )
1016
+ assert '"done"' in resp.text
1017
+ assert captured["judge_key"] == "sk-resolved-json-stream"
1018
+
1019
+ @patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
1020
+ @patch("agentevals.api.routes.OtlpJsonLoader")
1021
+ def test_stream_unresolvable_credential_yields_error(self, mock_loader_cls, mock_eval, monkeypatch):
1022
+ monkeypatch.delenv("AE_MISSING_KEY", raising=False)
1023
+ mock_trace = MagicMock()
1024
+ mock_trace.trace_id = "abc123"
1025
+ mock_loader_cls.return_value.load_from_dict.return_value = [mock_trace]
1026
+ mock_eval.return_value = _make_run_result()
1027
+ resp = self.client.post(
1028
+ "/api/evaluate/json/stream",
1029
+ json={
1030
+ "traces": _make_otlp_json_payload(),
1031
+ "config": _judge_config(),
1032
+ "credentialRefs": {"k": {"kind": "env", "name": "AE_MISSING_KEY"}},
1033
+ },
1034
+ )
1035
+ assert '"error"' in resp.text
1036
+ assert "Could not resolve credentialRefs" in resp.text
1037
+ assert '"done"' not in resp.text
1038
+ mock_eval.assert_not_called()
1039
+
830
1040
 
831
1041
  # ---------------------------------------------------------------------------
832
1042
  # GET /api/streaming/sessions
File without changes