agentevals-cli 0.9.3__tar.gz → 0.9.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/Dockerfile +1 -1
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/PKG-INFO +4 -2
- agentevals_cli-0.9.5/charts/agentevals/templates/rbac.yaml +33 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/values.yaml +14 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/run.py +1 -1
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/pyproject.toml +8 -1
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/models.py +9 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/routes.py +178 -84
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/builtin_metrics.py +77 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/config.py +8 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/custom_evaluators.py +2 -0
- agentevals_cli-0.9.5/src/agentevals/resolvers/__init__.py +167 -0
- agentevals_cli-0.9.5/src/agentevals/resolvers/kubernetes.py +62 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/worker.py +10 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/models.py +9 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_live_agents.py +7 -0
- agentevals_cli-0.9.5/tests/resolvers/test_kubernetes.py +63 -0
- agentevals_cli-0.9.5/tests/resolvers/test_registry.py +145 -0
- agentevals_cli-0.9.5/tests/storage/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_api.py +210 -0
- agentevals_cli-0.9.5/tests/test_credential_injection.py +122 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_mcp_server.py +2 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/uv.lock +176 -802
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/eval/SKILL.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/eval/evals/evals.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/inspect/SKILL.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.claude/skills/inspect/evals/evals.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.dockerignore +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/workflows/ci.yml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/workflows/publish-evaluator-sdk.yml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.github/workflows/release.yml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.gitignore +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/.mcp.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/CONTRIBUTING.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/DEVELOPMENT.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/LICENSE +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/Makefile +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/Chart.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/NOTES.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/_helpers.tpl +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/deployment.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/service.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/templates/serviceaccount.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/assets/logo-color-on-transparent.svg +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/assets/logo-color.png +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/assets/logo-dark-on-transparent.svg +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/custom-evaluators.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/eval-set-format.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/otel-compatibility.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/docs/streaming.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config_openai_eval.yaml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/response_quality.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_evaluators/tool_call_checker.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/custom_sink/pyproject.toml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/agent.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/main.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/dice_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/kubernetes/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/agent.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/main.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/langchain_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/async_example.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/context_manager_example.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/decorator_example.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/sdk_example/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/agent.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/main.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/strands_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/run.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/run.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/run.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/requirements.txt +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/run.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/flake.lock +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/flake.nix +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/pyproject.toml +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/eval_set_helm.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/evalset_helm_3_2026-02-23.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/evalset_k8s_2026-02-20.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/helm.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/helm_2.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/helm_3.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/k8s.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/samples/tempo_export_with_batches.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_protocol.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-RIquRPno.js +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/index.html +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/logo.svg +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/_static/vite.svg +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/app.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/debug_routes.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/dependencies.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_app.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_grpc.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_processing.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_routes.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/runs_routes.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/streaming_routes.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/cli.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/converter.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/eval_config_loader.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/resolver.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/sources.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/templates.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/evaluator/venv.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/extraction.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/genai_converter.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/auto.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/base.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/jaeger.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/loader/otlp.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/mcp_server.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/openai_eval_backend.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/output.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/fetcher.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/result_builder.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/service.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/run/sinks.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/runner.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/sdk.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/config.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrator.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/pool.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/memory.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/postgres.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/incremental_processor.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/processor.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/session.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/streaming/ws_server.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/trace_attrs.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/trace_metrics.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/genai_messages.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/log_buffer.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/utils/log_enrichment.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/api/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/api/test_evaluate_persistence.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/api/test_runs_routes.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/conftest.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_evaluation_pipeline.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_otlp_grpc_receiver.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_session_grouping.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/integration/test_timing_stress.py +0 -0
- {agentevals_cli-0.9.3/tests/run → agentevals_cli-0.9.5/tests/resolvers}/__init__.py +0 -0
- {agentevals_cli-0.9.3/tests/storage → agentevals_cli-0.9.5/tests/run}/__init__.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_fetcher.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_result_builder.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_service.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/run/test_sinks.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_config.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_memory_repos.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_migrator.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/storage/test_models.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_cli.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_converter.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_eval_config_loader.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_extraction.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_genai_converter.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_jaeger_loader.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_loader_auto.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_log_enrichment.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_openai_eval_backend.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_otlp_loader.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_otlp_receiver.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_output.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_protocol.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_runner.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_sdk.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/tests/test_trace_metrics.py +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/.gitignore +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/README.md +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/eslint.config.js +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/index.html +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/package-lock.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/package.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/public/logo.svg +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/public/vite.svg +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/App.css +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/App.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/api/client.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/assets/react.svg +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderHeader.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderView.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCasesList.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/InvocationEditor.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/JsonPreview.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/MetadataEditor.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/builder/index.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/DashboardView.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceCard.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceTable.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/DataSection.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorView.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationCard.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/ToolCallList.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/sidebar/Sidebar.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveMessage.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionCard.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/FileDropZone.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/MetricSelector.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/upload/UploadView.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/components/welcome/WelcomeView.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/config.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/context/TraceContext.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/context/TraceProvider.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/index.css +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/console-capture.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/eval-config.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/evalset-builder.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/network-capture.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-helpers.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-loader.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-metadata.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/trace-patcher.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/types.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/lib/utils.ts +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/src/main.tsx +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/tsconfig.app.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/tsconfig.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/tsconfig.node.json +0 -0
- {agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/ui/vite.config.ts +0 -0
|
@@ -31,7 +31,7 @@ COPY --from=ui /build/ui/dist ./src/agentevals/_static
|
|
|
31
31
|
ARG VERSION
|
|
32
32
|
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}
|
|
33
33
|
|
|
34
|
-
RUN uv sync --frozen --no-dev --extra live --extra postgres \
|
|
34
|
+
RUN uv sync --frozen --no-dev --extra live --extra postgres --extra kubernetes \
|
|
35
35
|
&& groupadd --gid 1000 app \
|
|
36
36
|
&& useradd --uid 1000 --gid app --home-dir /app --no-log-init app \
|
|
37
37
|
&& chown -R app:app /app
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agentevals-cli
|
|
3
|
-
Version: 0.9.3
|
|
3
|
+
Version: 0.9.5
|
|
4
4
|
Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
|
|
5
5
|
License-File: LICENSE
|
|
6
6
|
Requires-Python: >=3.11
|
|
7
7
|
Requires-Dist: click>=8.0
|
|
8
8
|
Requires-Dist: fastapi>=0.115.0
|
|
9
|
-
Requires-Dist: google-adk[eval]
|
|
9
|
+
Requires-Dist: google-adk[eval]<2.2,>=2.1.0
|
|
10
10
|
Requires-Dist: httpx>=0.27.0
|
|
11
11
|
Requires-Dist: opentelemetry-proto>=1.36.0
|
|
12
12
|
Requires-Dist: python-dotenv>=1.0.0
|
|
@@ -14,6 +14,8 @@ Requires-Dist: python-multipart>=0.0.12
|
|
|
14
14
|
Requires-Dist: pyyaml>=6.0
|
|
15
15
|
Requires-Dist: tabulate>=0.9.0
|
|
16
16
|
Requires-Dist: uvicorn[standard]>=0.32.0
|
|
17
|
+
Provides-Extra: kubernetes
|
|
18
|
+
Requires-Dist: kubernetes>=36.0.0; extra == 'kubernetes'
|
|
17
19
|
Provides-Extra: live
|
|
18
20
|
Requires-Dist: httpx>=0.27.0; extra == 'live'
|
|
19
21
|
Requires-Dist: mcp>=1.26.0; extra == 'live'
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{{- if .Values.rbac.create -}}
|
|
2
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
3
|
+
kind: Role
|
|
4
|
+
metadata:
|
|
5
|
+
name: {{ include "agentevals.fullname" . }}
|
|
6
|
+
namespace: {{ include "agentevals.namespace" . }}
|
|
7
|
+
labels:
|
|
8
|
+
{{- include "agentevals.labels" . | nindent 4 }}
|
|
9
|
+
rules:
|
|
10
|
+
- apiGroups: [""]
|
|
11
|
+
resources: ["secrets"]
|
|
12
|
+
verbs: ["get"]
|
|
13
|
+
{{- with .Values.rbac.secretNames }}
|
|
14
|
+
resourceNames:
|
|
15
|
+
{{- toYaml . | nindent 6 }}
|
|
16
|
+
{{- end }}
|
|
17
|
+
---
|
|
18
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
19
|
+
kind: RoleBinding
|
|
20
|
+
metadata:
|
|
21
|
+
name: {{ include "agentevals.fullname" . }}
|
|
22
|
+
namespace: {{ include "agentevals.namespace" . }}
|
|
23
|
+
labels:
|
|
24
|
+
{{- include "agentevals.labels" . | nindent 4 }}
|
|
25
|
+
roleRef:
|
|
26
|
+
apiGroup: rbac.authorization.k8s.io
|
|
27
|
+
kind: Role
|
|
28
|
+
name: {{ include "agentevals.fullname" . }}
|
|
29
|
+
subjects:
|
|
30
|
+
- kind: ServiceAccount
|
|
31
|
+
name: {{ include "agentevals.serviceAccountName" . }}
|
|
32
|
+
namespace: {{ include "agentevals.namespace" . }}
|
|
33
|
+
{{- end }}
|
|
@@ -57,6 +57,20 @@ serviceAccount:
|
|
|
57
57
|
# -- ServiceAccount name override
|
|
58
58
|
name: ""
|
|
59
59
|
|
|
60
|
+
# ==============================================================================
|
|
61
|
+
# RBAC
|
|
62
|
+
# ==============================================================================
|
|
63
|
+
|
|
64
|
+
# -- Namespaced Role + RoleBinding granting the pod's ServiceAccount read
|
|
65
|
+
# access to Secrets. Enable this when the kubernetes secret resolver reads
|
|
66
|
+
# provider credentials from Secrets via in-cluster config.
|
|
67
|
+
rbac:
|
|
68
|
+
# -- Create the Role and RoleBinding
|
|
69
|
+
create: false
|
|
70
|
+
# -- Restrict the Role to these Secret names. Empty grants get on all
|
|
71
|
+
# Secrets in the release namespace.
|
|
72
|
+
secretNames: []
|
|
73
|
+
|
|
60
74
|
# ==============================================================================
|
|
61
75
|
# Pod
|
|
62
76
|
# ==============================================================================
|
|
@@ -74,7 +74,7 @@ async def main():
|
|
|
74
74
|
|
|
75
75
|
agent_response = ""
|
|
76
76
|
async for event in runner.run_async(user_id=user_id, session_id=session.id, new_message=content):
|
|
77
|
-
if event.content.parts and event.content.parts[0].text:
|
|
77
|
+
if event.content and event.content.parts and event.content.parts[0].text:
|
|
78
78
|
agent_response = event.content.parts[0].text
|
|
79
79
|
|
|
80
80
|
print(f" Agent: {agent_response}")
|
|
@@ -9,7 +9,7 @@ description = "Standalone framework to evaluate agent correctness based on porta
|
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
11
11
|
dependencies = [
|
|
12
|
-
"google-adk[eval]>=1.
|
|
12
|
+
"google-adk[eval]>=2.1.0,<2.2",
|
|
13
13
|
"click>=8.0",
|
|
14
14
|
"tabulate>=0.9.0",
|
|
15
15
|
"fastapi>=0.115.0",
|
|
@@ -36,10 +36,17 @@ openai = [
|
|
|
36
36
|
postgres = [
|
|
37
37
|
"asyncpg>=0.30.0",
|
|
38
38
|
]
|
|
39
|
+
kubernetes = [
|
|
40
|
+
"kubernetes>=36.0.0",
|
|
41
|
+
]
|
|
39
42
|
|
|
40
43
|
[project.scripts]
|
|
41
44
|
agentevals = "agentevals.cli:main"
|
|
42
45
|
|
|
46
|
+
[project.entry-points."agentevals.secret_resolvers"]
|
|
47
|
+
env = "agentevals.resolvers:create_env_resolver"
|
|
48
|
+
kubernetes = "agentevals.resolvers.kubernetes:create_kubernetes_resolver"
|
|
49
|
+
|
|
43
50
|
[tool.hatch.version]
|
|
44
51
|
source = "vcs"
|
|
45
52
|
|
|
@@ -142,6 +142,15 @@ class EvaluateJsonRequest(CamelModel):
|
|
|
142
142
|
traces: dict = Field(description="OTLP JSON export with resourceSpans structure.")
|
|
143
143
|
config: EvalParams = Field(default_factory=EvalParams, description="Evaluation parameters.")
|
|
144
144
|
eval_set: dict | None = Field(default=None, description="Optional ADK EvalSet JSON.")
|
|
145
|
+
credential_refs: dict[str, dict[str, Any]] | None = Field(
|
|
146
|
+
default=None,
|
|
147
|
+
description=(
|
|
148
|
+
"Map of logical credential name to a secret reference dict. Each reference has a "
|
|
149
|
+
"'kind' (the resolver to use) plus that kind's locator fields. Resolved per call to its "
|
|
150
|
+
"secret value; never written to the process environment. How a value is used (e.g. which "
|
|
151
|
+
"judge provider it authenticates) is configured on the consumer, not the reference."
|
|
152
|
+
),
|
|
153
|
+
)
|
|
145
154
|
|
|
146
155
|
|
|
147
156
|
# ---------------------------------------------------------------------------
|
|
@@ -9,6 +9,7 @@ import os
|
|
|
9
9
|
import re
|
|
10
10
|
import shutil
|
|
11
11
|
import tempfile
|
|
12
|
+
from contextlib import contextmanager
|
|
12
13
|
from typing import Any
|
|
13
14
|
|
|
14
15
|
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
|
|
@@ -23,6 +24,11 @@ from ..converter import convert_traces
|
|
|
23
24
|
from ..extraction import get_extractor
|
|
24
25
|
from ..loader import load_traces
|
|
25
26
|
from ..loader.otlp import OtlpJsonLoader
|
|
27
|
+
from ..resolvers import (
|
|
28
|
+
reset_resolved_credentials,
|
|
29
|
+
resolve_credential_refs,
|
|
30
|
+
set_resolved_credentials,
|
|
31
|
+
)
|
|
26
32
|
from ..runner import (
|
|
27
33
|
RunResult,
|
|
28
34
|
load_eval_set,
|
|
@@ -53,6 +59,57 @@ from .models import (
|
|
|
53
59
|
logger = logging.getLogger(__name__)
|
|
54
60
|
|
|
55
61
|
|
|
62
|
+
@contextmanager
def _scoped_credentials(resolved: dict[str, str] | None):
    """Expose an already-resolved ``logical-name -> secret value`` map for the enclosed work.

    The map is installed with ``set_resolved_credentials`` on entry and removed again with
    ``reset_resolved_credentials`` on exit, whether the body finishes normally or raises.
    An empty or ``None`` map installs nothing, so callers that pass no credentials behave
    exactly as they did before this helper existed.

    Streaming endpoints should enter this context BEFORE calling ``asyncio.create_task``:
    a child task snapshots its parent's context when it is created, so the eval task only
    sees the credentials if they are already in place at that moment.

    Resolving references into values is the caller's job; this helper only scopes the
    result, so resolution failures are reported as request errors by the caller.
    """
    if not resolved:
        # Nothing to install: run the body untouched.
        yield
        return

    token = set_resolved_credentials(resolved)
    try:
        yield
    finally:
        # Only undo what was actually installed.
        if token is not None:
            reset_resolved_credentials(token)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def _resolve_credentials(refs: dict[str, dict[str, Any]] | None) -> dict[str, str] | None:
    """Turn ``credentialRefs`` into secret values, reporting bad references as HTTP 400.

    Args:
        refs: Map of logical credential name to a secret reference dict, or ``None``.

    Returns:
        The map produced by ``resolve_credential_refs``, or ``None`` when *refs* is
        empty or absent (in which case no resolver is invoked).

    Raises:
        HTTPException: status 400 when resolution raises ``ValueError``. Per the original
            contract this covers a missing/unknown ``kind``, missing locator fields, or
            an unset env var, all of which are caller input errors. Any other exception
            type is deliberately not caught, so it propagates as a server error.
    """
    if refs:
        try:
            return await resolve_credential_refs(refs)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=f"Could not resolve credentialRefs: {exc}") from exc
    return None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _parse_credential_refs_form(raw: str | None) -> dict[str, dict[str, Any]] | None:
|
|
98
|
+
"""Parse and validate the multipart ``credential_refs`` form field (a JSON object string).
|
|
99
|
+
|
|
100
|
+
Empty/absent is treated as no credentials. Raises ``ValueError`` (which
|
|
101
|
+
``json.JSONDecodeError`` subclasses) on malformed JSON or a non-object shape, so callers
|
|
102
|
+
map both to the same error they use for a bad ``config``. The JSON request endpoints get
|
|
103
|
+
this shape check for free from the ``EvaluateJsonRequest`` model.
|
|
104
|
+
"""
|
|
105
|
+
if not raw:
|
|
106
|
+
return None
|
|
107
|
+
refs = json.loads(raw)
|
|
108
|
+
if not isinstance(refs, dict) or not all(isinstance(ref, dict) for ref in refs.values()):
|
|
109
|
+
raise ValueError("credentialRefs must be a JSON object mapping each logical name to a reference object")
|
|
110
|
+
return refs
|
|
111
|
+
|
|
112
|
+
|
|
56
113
|
def _camel_keys(obj: Any) -> Any:
|
|
57
114
|
"""Recursively convert dict keys from snake_case to camelCase."""
|
|
58
115
|
if isinstance(obj, dict):
|
|
@@ -462,6 +519,7 @@ async def evaluate_traces(
|
|
|
462
519
|
trace_files: list[UploadFile] = File(...),
|
|
463
520
|
config: str = Form(...),
|
|
464
521
|
eval_set_file: UploadFile | None = File(None),
|
|
522
|
+
credential_refs: str | None = Form(None),
|
|
465
523
|
):
|
|
466
524
|
"""
|
|
467
525
|
Evaluate agent traces using the provided evaluator configuration.
|
|
@@ -470,6 +528,8 @@ async def evaluate_traces(
|
|
|
470
528
|
trace_files: List of Jaeger or OTLP JSON trace files
|
|
471
529
|
config: JSON string with evaluation configuration
|
|
472
530
|
eval_set_file: Optional golden eval set file
|
|
531
|
+
credential_refs: Optional JSON string mapping logical credential names to
|
|
532
|
+
secret references, resolved so LLM-as-Judge graders can authenticate
|
|
473
533
|
|
|
474
534
|
Returns:
|
|
475
535
|
RunResult with trace results and any errors
|
|
@@ -481,6 +541,11 @@ async def evaluate_traces(
|
|
|
481
541
|
except json.JSONDecodeError as exc:
|
|
482
542
|
raise HTTPException(status_code=400, detail=f"Invalid config JSON: {exc}") from exc
|
|
483
543
|
|
|
544
|
+
try:
|
|
545
|
+
cred_refs = _parse_credential_refs_form(credential_refs)
|
|
546
|
+
except ValueError as exc:
|
|
547
|
+
raise HTTPException(status_code=400, detail=f"Invalid credentialRefs: {exc}") from exc
|
|
548
|
+
|
|
484
549
|
trace_paths = []
|
|
485
550
|
for trace_file in trace_files:
|
|
486
551
|
if not trace_file.filename:
|
|
@@ -548,7 +613,9 @@ async def evaluate_traces(
|
|
|
548
613
|
len(trace_paths),
|
|
549
614
|
[e.name for e in eval_config.evaluators],
|
|
550
615
|
)
|
|
551
|
-
|
|
616
|
+
resolved_creds = await _resolve_credentials(cred_refs)
|
|
617
|
+
with _scoped_credentials(resolved_creds):
|
|
618
|
+
result = await run_evaluation(eval_config)
|
|
552
619
|
|
|
553
620
|
run_id = await _maybe_persist_evaluate_run(
|
|
554
621
|
request,
|
|
@@ -580,6 +647,7 @@ async def evaluate_traces_stream(
|
|
|
580
647
|
trace_files: list[UploadFile] = File(...),
|
|
581
648
|
config: str = Form(...),
|
|
582
649
|
eval_set_file: UploadFile | None = File(None),
|
|
650
|
+
credential_refs: str | None = Form(None),
|
|
583
651
|
):
|
|
584
652
|
"""Evaluate traces with real-time progress via SSE."""
|
|
585
653
|
temp_dir = tempfile.mkdtemp()
|
|
@@ -593,6 +661,12 @@ async def evaluate_traces_stream(
|
|
|
593
661
|
yield f"data: {SSEErrorEvent(error=f'Invalid config JSON: {exc}').model_dump_json(by_alias=True)}\n\n"
|
|
594
662
|
return
|
|
595
663
|
|
|
664
|
+
try:
|
|
665
|
+
cred_refs = _parse_credential_refs_form(credential_refs)
|
|
666
|
+
except ValueError as exc:
|
|
667
|
+
yield f"data: {SSEErrorEvent(error=f'Invalid credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
|
|
668
|
+
return
|
|
669
|
+
|
|
596
670
|
trace_paths = []
|
|
597
671
|
for trace_file in trace_files:
|
|
598
672
|
if not trace_file.filename:
|
|
@@ -674,47 +748,54 @@ async def evaluate_traces_stream(
|
|
|
674
748
|
result = await run_evaluation(eval_config, progress_callback, trace_progress_callback)
|
|
675
749
|
await queue.put(("done", result))
|
|
676
750
|
|
|
677
|
-
eval_task = asyncio.create_task(run_with_progress())
|
|
678
|
-
|
|
679
751
|
try:
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
if
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
evt = SSETraceProgressEvent(
|
|
702
|
-
trace_progress=SSETraceProgress(
|
|
703
|
-
trace_id=payload.trace_id,
|
|
704
|
-
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
752
|
+
resolved_creds = await resolve_credential_refs(cred_refs) if cred_refs else None
|
|
753
|
+
except ValueError as exc:
|
|
754
|
+
yield f"data: {SSEErrorEvent(error=f'Could not resolve credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
|
|
755
|
+
return
|
|
756
|
+
|
|
757
|
+
with _scoped_credentials(resolved_creds):
|
|
758
|
+
eval_task = asyncio.create_task(run_with_progress())
|
|
759
|
+
|
|
760
|
+
try:
|
|
761
|
+
while True:
|
|
762
|
+
msg = await queue.get()
|
|
763
|
+
tag, payload = msg
|
|
764
|
+
|
|
765
|
+
if tag == "done":
|
|
766
|
+
run_id = await _maybe_persist_evaluate_run(
|
|
767
|
+
request,
|
|
768
|
+
params=eval_config,
|
|
769
|
+
eval_set_dict=_load_eval_set_dict(eval_set_path),
|
|
770
|
+
trace_format=eval_config.trace_format,
|
|
771
|
+
upload_filenames=upload_filenames,
|
|
772
|
+
run_result=payload,
|
|
705
773
|
)
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
774
|
+
if run_id:
|
|
775
|
+
payload.run_id = run_id
|
|
776
|
+
evt = SSEDoneEvent(
|
|
777
|
+
result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
778
|
+
)
|
|
779
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
780
|
+
break
|
|
781
|
+
elif tag == "trace_progress":
|
|
782
|
+
evt = SSETraceProgressEvent(
|
|
783
|
+
trace_progress=SSETraceProgress(
|
|
784
|
+
trace_id=payload.trace_id,
|
|
785
|
+
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
786
|
+
)
|
|
787
|
+
)
|
|
788
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
789
|
+
elif tag == "progress":
|
|
790
|
+
evt = SSEProgressEvent(message=payload)
|
|
791
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
792
|
+
finally:
|
|
793
|
+
if not eval_task.done():
|
|
794
|
+
eval_task.cancel()
|
|
795
|
+
try:
|
|
796
|
+
await eval_task
|
|
797
|
+
except asyncio.CancelledError:
|
|
798
|
+
pass
|
|
718
799
|
|
|
719
800
|
except Exception as exc:
|
|
720
801
|
logger.exception("Evaluation stream failed")
|
|
@@ -775,13 +856,15 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
|
|
|
775
856
|
"""Evaluate OTLP JSON traces passed in the request body."""
|
|
776
857
|
_check_json_body_size(raw_request)
|
|
777
858
|
traces, eval_set = _parse_json_request(request)
|
|
859
|
+
resolved_creds = await _resolve_credentials(request.credential_refs)
|
|
778
860
|
|
|
779
861
|
try:
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
862
|
+
with _scoped_credentials(resolved_creds):
|
|
863
|
+
result = await run_evaluation_from_traces(
|
|
864
|
+
traces=traces,
|
|
865
|
+
config=request.config,
|
|
866
|
+
eval_set=eval_set,
|
|
867
|
+
)
|
|
785
868
|
run_id = await _maybe_persist_evaluate_run(
|
|
786
869
|
raw_request,
|
|
787
870
|
params=request.config,
|
|
@@ -793,6 +876,8 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
|
|
|
793
876
|
if run_id:
|
|
794
877
|
result.run_id = run_id
|
|
795
878
|
return StandardResponse(data=_camel_keys(result.model_dump(by_alias=True)))
|
|
879
|
+
except HTTPException:
|
|
880
|
+
raise
|
|
796
881
|
except Exception as exc:
|
|
797
882
|
logger.exception("JSON evaluation failed")
|
|
798
883
|
raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc
|
|
@@ -843,47 +928,56 @@ async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request:
|
|
|
843
928
|
)
|
|
844
929
|
await queue.put(("done", result))
|
|
845
930
|
|
|
846
|
-
eval_task = asyncio.create_task(run_with_progress())
|
|
847
|
-
|
|
848
931
|
try:
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
)
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
trace_id=payload.trace_id,
|
|
873
|
-
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
932
|
+
resolved_creds = (
|
|
933
|
+
await resolve_credential_refs(request.credential_refs) if request.credential_refs else None
|
|
934
|
+
)
|
|
935
|
+
except ValueError as exc:
|
|
936
|
+
yield _sse_error(f"Could not resolve credentialRefs: {exc}")
|
|
937
|
+
return
|
|
938
|
+
|
|
939
|
+
with _scoped_credentials(resolved_creds):
|
|
940
|
+
eval_task = asyncio.create_task(run_with_progress())
|
|
941
|
+
|
|
942
|
+
try:
|
|
943
|
+
while True:
|
|
944
|
+
msg = await queue.get()
|
|
945
|
+
tag, payload = msg
|
|
946
|
+
|
|
947
|
+
if tag == "done":
|
|
948
|
+
run_id = await _maybe_persist_evaluate_run(
|
|
949
|
+
raw_request,
|
|
950
|
+
params=request.config,
|
|
951
|
+
eval_set_dict=request.eval_set,
|
|
952
|
+
trace_format=None,
|
|
953
|
+
upload_filenames=None,
|
|
954
|
+
run_result=payload,
|
|
874
955
|
)
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
956
|
+
if run_id:
|
|
957
|
+
payload.run_id = run_id
|
|
958
|
+
evt = SSEDoneEvent(
|
|
959
|
+
result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
960
|
+
)
|
|
961
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
962
|
+
break
|
|
963
|
+
elif tag == "trace_progress":
|
|
964
|
+
evt = SSETraceProgressEvent(
|
|
965
|
+
trace_progress=SSETraceProgress(
|
|
966
|
+
trace_id=payload.trace_id,
|
|
967
|
+
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
968
|
+
)
|
|
969
|
+
)
|
|
970
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
971
|
+
elif tag == "progress":
|
|
972
|
+
evt = SSEProgressEvent(message=payload)
|
|
973
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
974
|
+
finally:
|
|
975
|
+
if not eval_task.done():
|
|
976
|
+
eval_task.cancel()
|
|
977
|
+
try:
|
|
978
|
+
await eval_task
|
|
979
|
+
except asyncio.CancelledError:
|
|
980
|
+
pass
|
|
887
981
|
|
|
888
982
|
except Exception as exc:
|
|
889
983
|
logger.exception("JSON evaluation stream failed")
|
|
@@ -27,6 +27,8 @@ from google.adk.evaluation.eval_metrics import (
|
|
|
27
27
|
from google.adk.evaluation.eval_rubrics import Rubric, RubricContent
|
|
28
28
|
from google.adk.evaluation.evaluator import EvaluationResult, Evaluator
|
|
29
29
|
|
|
30
|
+
from .resolvers import get_resolved_credential
|
|
31
|
+
|
|
30
32
|
logger = logging.getLogger(__name__)
|
|
31
33
|
|
|
32
34
|
METRICS_NEEDING_EXPECTED = {
|
|
@@ -267,6 +269,67 @@ def get_evaluator(eval_metric: EvalMetric) -> Evaluator:
|
|
|
267
269
|
return DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric)
|
|
268
270
|
|
|
269
271
|
|
|
272
|
+
def _build_judge_model(model_id: str, api_key: str, base_url: str | None = None):
    """Construct a judge model that carries *api_key* itself rather than reading it from env.

    The model class is chosen by asking ADK's ``LLMRegistry`` what *model_id* resolves to:

    * If it resolves to a ``LiteLlm`` subclass, a ``LiteLlm`` is built with ``api_key``
      (and ``base_url`` when given) passed as constructor kwargs.
    * Otherwise a ``Gemini`` model is built. Its constructor is not given the key here;
      instead a ``google.genai`` ``Client`` created from the key (and, when *base_url* is
      set, ``HttpOptions(base_url=...)``) is stored under ``api_client`` in the instance
      ``__dict__``.

    Args:
        model_id: Judge model identifier, as understood by ``LLMRegistry``.
        api_key: Resolved secret value to authenticate the judge with.
        base_url: Optional endpoint override; ignored when falsy.

    Returns:
        A ``LiteLlm`` or ``Gemini`` instance configured with the supplied key.
    """
    from google.adk.models.lite_llm import LiteLlm
    from google.adk.models.registry import LLMRegistry

    resolved_cls = LLMRegistry().resolve(model_id)

    if not issubclass(resolved_cls, LiteLlm):
        # Gemini-native path: these imports stay lazy and confined to this branch.
        from google.adk.models.google_llm import Gemini
        from google.genai import Client
        from google.genai import types as genai_types

        gemini = Gemini(model=model_id)
        client_options: dict[str, Any] = {"api_key": api_key}
        if base_url:
            client_options["http_options"] = genai_types.HttpOptions(base_url=base_url)
        # Per the original author's note, api_client is a functools.cached_property that
        # memoizes into the instance __dict__; pre-filling that slot means the lazily-built
        # default client is never created and the judge uses the resolved key.
        gemini.__dict__["api_client"] = Client(**client_options)
        return gemini

    # LiteLlm-backed provider: only forward base_url when one was actually supplied.
    if base_url:
        return LiteLlm(model=model_id, api_key=api_key, base_url=base_url)
    return LiteLlm(model=model_id, api_key=api_key)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _inject_judge_credential(evaluator: Evaluator, api_key: str, base_url: str | None = None) -> None:
    """Swap a judge evaluator's model for one built from *api_key*.

    Detection is attribute-based rather than class-based: the evaluator must expose both
    ``_judge_model_options`` and ``_judge_model``. These are private ADK attributes;
    the original author notes they are set by ``LlmAsJudge._setup_auto_rater`` and are
    also present on evaluators that do not subclass ``LlmAsJudge``.

    The evaluator is mutated in place. If it lacks those attributes, or its options carry
    no ``judge_model``, a warning is logged and the evaluator is left unchanged.

    Args:
        evaluator: Evaluator instance to modify.
        api_key: Resolved secret value for the judge provider.
        base_url: Optional judge endpoint override, forwarded to ``_build_judge_model``.

    TODO(upstream): propose that ADK ``JudgeModelOptions`` carry a credential or a prebuilt
    model instance, so judge auth no longer depends on this private seam or process env.
    """
    evaluator_name = type(evaluator).__name__

    judge_options = getattr(evaluator, "_judge_model_options", None)
    is_judge_backed = judge_options is not None and hasattr(evaluator, "_judge_model")
    if not is_judge_backed:
        logger.warning("evaluator %s is not judge-backed; cannot inject credential", evaluator_name)
        return

    judge_model_id = getattr(judge_options, "judge_model", None)
    if not judge_model_id:
        logger.warning("evaluator %s has no resolved judge_model; skipping credential injection", evaluator_name)
        return

    evaluator._judge_model = _build_judge_model(judge_model_id, api_key, base_url)
|
|
331
|
+
|
|
332
|
+
|
|
270
333
|
def extract_trajectory_details(eval_result: EvaluationResult) -> dict[str, Any]:
|
|
271
334
|
"""Extract expected vs actual tool call details from trajectory evaluation."""
|
|
272
335
|
comparisons = []
|
|
@@ -305,6 +368,8 @@ async def evaluate_builtin_metric(
|
|
|
305
368
|
judge_model: str | None,
|
|
306
369
|
threshold: float | None,
|
|
307
370
|
match_type: str | None = None,
|
|
371
|
+
credential_ref: str | None = None,
|
|
372
|
+
judge_base_url: str | None = None,
|
|
308
373
|
) -> dict[str, Any]:
|
|
309
374
|
"""Evaluate a single built-in ADK metric.
|
|
310
375
|
|
|
@@ -326,6 +391,18 @@ async def evaluate_builtin_metric(
|
|
|
326
391
|
eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
|
|
327
392
|
evaluator: Evaluator = get_evaluator(eval_metric)
|
|
328
393
|
|
|
394
|
+
if credential_ref:
|
|
395
|
+
api_key = get_resolved_credential(credential_ref)
|
|
396
|
+
if api_key is None:
|
|
397
|
+
return MetricResult(
|
|
398
|
+
metric_name=metric_name,
|
|
399
|
+
error=(
|
|
400
|
+
f"Metric '{metric_name}' references credential '{credential_ref}', "
|
|
401
|
+
f"which was not provided in the run's credentialRefs."
|
|
402
|
+
),
|
|
403
|
+
)
|
|
404
|
+
_inject_judge_credential(evaluator, api_key, judge_base_url)
|
|
405
|
+
|
|
329
406
|
if metric_name in _METRICS_NEEDING_INVOCATION_EVENTS:
|
|
330
407
|
actual_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in actual_invocations])
|
|
331
408
|
if expected_invocations is not None:
|
|
@@ -27,6 +27,14 @@ class BuiltinMetricDef(BaseModel):
|
|
|
27
27
|
threshold: float | None = Field(default=None, ge=0, le=1)
|
|
28
28
|
judge_model: str | None = None
|
|
29
29
|
trajectory_match_type: str | None = None
|
|
30
|
+
credential_ref: str | None = Field(
|
|
31
|
+
default=None,
|
|
32
|
+
description="Logical name of a RunSpec.credential_refs entry whose resolved value is the judge API key.",
|
|
33
|
+
)
|
|
34
|
+
judge_base_url: str | None = Field(
|
|
35
|
+
default=None,
|
|
36
|
+
description="Optional base URL for the judge endpoint (e.g. an OpenAI-compatible proxy).",
|
|
37
|
+
)
|
|
30
38
|
|
|
31
39
|
@field_validator("trajectory_match_type")
|
|
32
40
|
@classmethod
|
|
@@ -453,6 +453,8 @@ async def evaluate_custom_evaluator(
|
|
|
453
453
|
judge_model=evaluator_def.judge_model,
|
|
454
454
|
threshold=evaluator_def.threshold,
|
|
455
455
|
match_type=evaluator_def.trajectory_match_type,
|
|
456
|
+
credential_ref=evaluator_def.credential_ref,
|
|
457
|
+
judge_base_url=evaluator_def.judge_base_url,
|
|
456
458
|
)
|
|
457
459
|
|
|
458
460
|
if isinstance(evaluator_def, OpenAIEvalDef):
|