agentevals-cli 0.9.4__tar.gz → 0.9.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/PKG-INFO +1 -1
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/models.py +9 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/routes.py +178 -84
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_api.py +210 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/eval/SKILL.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/eval/evals/evals.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/inspect/SKILL.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.claude/skills/inspect/evals/evals.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.dockerignore +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/workflows/ci.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/workflows/publish-evaluator-sdk.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.github/workflows/release.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.gitignore +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/.mcp.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/CONTRIBUTING.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/DEVELOPMENT.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/Dockerfile +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/LICENSE +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/Makefile +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/Chart.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/NOTES.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/_helpers.tpl +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/deployment.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/postgresql.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/rbac.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/service.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/templates/serviceaccount.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/charts/agentevals/values.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/assets/logo-color-on-transparent.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/assets/logo-color.png +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/assets/logo-dark-on-transparent.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/custom-evaluators.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/eval-set-format.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/otel-compatibility.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/docs/streaming.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/eval_config_openai_eval.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/response_quality.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_evaluators/tool_call_checker.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/custom_sink/pyproject.toml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/agent.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/main.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/dice_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/kubernetes/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/agent.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/main.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/langchain_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/async_example.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/context_manager_example.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/decorator_example.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/sdk_example/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/agent.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/main.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/strands_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/run.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/langchain/run.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/ollama/run.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/openai-agents/run.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/examples/zero-code-examples/strands/run.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/flake.lock +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/flake.nix +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/pyproject.toml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/pyproject.toml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/eval_set_helm.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/evalset_helm_3_2026-02-23.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/evalset_k8s_2026-02-20.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/helm.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/helm_2.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/helm_3.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/k8s.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/samples/tempo_export_with_batches.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_protocol.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/assets/index-RIquRPno.js +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/index.html +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/logo.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/_static/vite.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/app.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/debug_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/dependencies.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_app.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_grpc.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_processing.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/otlp_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/runs_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/api/streaming_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/builtin_metrics.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/cli.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/config.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/custom_evaluators.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/eval_config_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/resolver.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/sources.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/templates.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/evaluator/venv.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/extraction.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/genai_converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/auto.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/base.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/jaeger.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/loader/otlp.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/mcp_server.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/openai_eval_backend.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/output.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/resolvers/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/resolvers/kubernetes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/fetcher.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/result_builder.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/service.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/sinks.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/run/worker.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/runner.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/sdk.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/config.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/models.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/migrator.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/postgres/pool.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/memory.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/storage/repos/postgres.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/incremental_processor.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/processor.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/session.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/streaming/ws_server.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/trace_attrs.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/trace_metrics.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/genai_messages.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/log_buffer.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/src/agentevals/utils/log_enrichment.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/api/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/api/test_evaluate_persistence.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/api/test_runs_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/conftest.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_evaluation_pipeline.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_live_agents.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_otlp_grpc_receiver.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_session_grouping.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/integration/test_timing_stress.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/resolvers/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/resolvers/test_kubernetes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/resolvers/test_registry.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_fetcher.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_result_builder.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_service.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/run/test_sinks.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_config.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_memory_repos.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_migrator.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/storage/test_models.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_cli.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_credential_injection.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_eval_config_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_extraction.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_genai_converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_jaeger_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_loader_auto.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_log_enrichment.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_mcp_server.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_openai_eval_backend.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_otlp_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_otlp_receiver.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_output.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_protocol.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_runner.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_sdk.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/tests/test_trace_metrics.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/.gitignore +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/eslint.config.js +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/index.html +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/package-lock.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/package.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/public/logo.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/public/vite.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/App.css +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/App.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/api/client.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/assets/react.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderHeader.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/BuilderView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/EvalCasesList.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/InvocationEditor.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/JsonPreview.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/MetadataEditor.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/builder/index.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/DashboardView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/dashboard/TraceTable.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/DataSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InspectorView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/ToolCallList.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/sidebar/Sidebar.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveMessage.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/FileDropZone.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/MetricSelector.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/upload/UploadView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/components/welcome/WelcomeView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/config.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/context/TraceContext.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/context/TraceProvider.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/index.css +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/console-capture.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/eval-config.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/evalset-builder.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/network-capture.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-helpers.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-loader.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-metadata.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/trace-patcher.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/types.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/lib/utils.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/src/main.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/tsconfig.app.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/tsconfig.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/tsconfig.node.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/ui/vite.config.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.5}/uv.lock +0 -0
|
@@ -142,6 +142,15 @@ class EvaluateJsonRequest(CamelModel):
|
|
|
142
142
|
traces: dict = Field(description="OTLP JSON export with resourceSpans structure.")
|
|
143
143
|
config: EvalParams = Field(default_factory=EvalParams, description="Evaluation parameters.")
|
|
144
144
|
eval_set: dict | None = Field(default=None, description="Optional ADK EvalSet JSON.")
|
|
145
|
+
credential_refs: dict[str, dict[str, Any]] | None = Field(
|
|
146
|
+
default=None,
|
|
147
|
+
description=(
|
|
148
|
+
"Map of logical credential name to a secret reference dict. Each reference has a "
|
|
149
|
+
"'kind' (the resolver to use) plus that kind's locator fields. Resolved per call to its "
|
|
150
|
+
"secret value; never written to the process environment. How a value is used (e.g. which "
|
|
151
|
+
"judge provider it authenticates) is configured on the consumer, not the reference."
|
|
152
|
+
),
|
|
153
|
+
)
|
|
145
154
|
|
|
146
155
|
|
|
147
156
|
# ---------------------------------------------------------------------------
|
|
@@ -9,6 +9,7 @@ import os
|
|
|
9
9
|
import re
|
|
10
10
|
import shutil
|
|
11
11
|
import tempfile
|
|
12
|
+
from contextlib import contextmanager
|
|
12
13
|
from typing import Any
|
|
13
14
|
|
|
14
15
|
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
|
|
@@ -23,6 +24,11 @@ from ..converter import convert_traces
|
|
|
23
24
|
from ..extraction import get_extractor
|
|
24
25
|
from ..loader import load_traces
|
|
25
26
|
from ..loader.otlp import OtlpJsonLoader
|
|
27
|
+
from ..resolvers import (
|
|
28
|
+
reset_resolved_credentials,
|
|
29
|
+
resolve_credential_refs,
|
|
30
|
+
set_resolved_credentials,
|
|
31
|
+
)
|
|
26
32
|
from ..runner import (
|
|
27
33
|
RunResult,
|
|
28
34
|
load_eval_set,
|
|
@@ -53,6 +59,57 @@ from .models import (
|
|
|
53
59
|
logger = logging.getLogger(__name__)
|
|
54
60
|
|
|
55
61
|
|
|
62
|
+
@contextmanager
|
|
63
|
+
def _scoped_credentials(resolved: dict[str, str] | None):
|
|
64
|
+
"""Scope an already-resolved ``logical-name -> secret value`` map to the current task.
|
|
65
|
+
|
|
66
|
+
Mirrors the async worker's set/reset (``run/worker.py``) so the synchronous evaluate
|
|
67
|
+
paths populate the same credential ContextVar that judge graders read. A falsy map is a
|
|
68
|
+
no-op, keeping callers byte-for-byte backward compatible. For streaming endpoints, enter
|
|
69
|
+
this BEFORE ``asyncio.create_task`` so the eval task inherits the populated context (a
|
|
70
|
+
child task snapshots its parent's context at creation time). Resolution is done by the
|
|
71
|
+
caller so its failures surface as request errors rather than scoping concerns.
|
|
72
|
+
"""
|
|
73
|
+
token = set_resolved_credentials(resolved) if resolved else None
|
|
74
|
+
try:
|
|
75
|
+
yield
|
|
76
|
+
finally:
|
|
77
|
+
if token is not None:
|
|
78
|
+
reset_resolved_credentials(token)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def _resolve_credentials(refs: dict[str, dict[str, Any]] | None) -> dict[str, str] | None:
|
|
82
|
+
"""Resolve credentialRefs to secret values, mapping bad references to a 400.
|
|
83
|
+
|
|
84
|
+
Resolver ``ValueError``s (missing/unknown ``kind``, missing locator fields, an unset
|
|
85
|
+
env var) are request/input errors, so surface them as 400s instead of letting them
|
|
86
|
+
bubble up as 500s. Infrastructure failures from custom resolvers raise other exception
|
|
87
|
+
types and are left to propagate as 5xx.
|
|
88
|
+
"""
|
|
89
|
+
if not refs:
|
|
90
|
+
return None
|
|
91
|
+
try:
|
|
92
|
+
return await resolve_credential_refs(refs)
|
|
93
|
+
except ValueError as exc:
|
|
94
|
+
raise HTTPException(status_code=400, detail=f"Could not resolve credentialRefs: {exc}") from exc
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _parse_credential_refs_form(raw: str | None) -> dict[str, dict[str, Any]] | None:
|
|
98
|
+
"""Parse and validate the multipart ``credential_refs`` form field (a JSON object string).
|
|
99
|
+
|
|
100
|
+
Empty/absent is treated as no credentials. Raises ``ValueError`` (which
|
|
101
|
+
``json.JSONDecodeError`` subclasses) on malformed JSON or a non-object shape, so callers
|
|
102
|
+
map both to the same error they use for a bad ``config``. The JSON request endpoints get
|
|
103
|
+
this shape check for free from the ``EvaluateJsonRequest`` model.
|
|
104
|
+
"""
|
|
105
|
+
if not raw:
|
|
106
|
+
return None
|
|
107
|
+
refs = json.loads(raw)
|
|
108
|
+
if not isinstance(refs, dict) or not all(isinstance(ref, dict) for ref in refs.values()):
|
|
109
|
+
raise ValueError("credentialRefs must be a JSON object mapping each logical name to a reference object")
|
|
110
|
+
return refs
|
|
111
|
+
|
|
112
|
+
|
|
56
113
|
def _camel_keys(obj: Any) -> Any:
|
|
57
114
|
"""Recursively convert dict keys from snake_case to camelCase."""
|
|
58
115
|
if isinstance(obj, dict):
|
|
@@ -462,6 +519,7 @@ async def evaluate_traces(
|
|
|
462
519
|
trace_files: list[UploadFile] = File(...),
|
|
463
520
|
config: str = Form(...),
|
|
464
521
|
eval_set_file: UploadFile | None = File(None),
|
|
522
|
+
credential_refs: str | None = Form(None),
|
|
465
523
|
):
|
|
466
524
|
"""
|
|
467
525
|
Evaluate agent traces using the provided evaluator configuration.
|
|
@@ -470,6 +528,8 @@ async def evaluate_traces(
|
|
|
470
528
|
trace_files: List of Jaeger or OTLP JSON trace files
|
|
471
529
|
config: JSON string with evaluation configuration
|
|
472
530
|
eval_set_file: Optional golden eval set file
|
|
531
|
+
credential_refs: Optional JSON string mapping logical credential names to
|
|
532
|
+
secret references, resolved so LLM-as-Judge graders can authenticate
|
|
473
533
|
|
|
474
534
|
Returns:
|
|
475
535
|
RunResult with trace results and any errors
|
|
@@ -481,6 +541,11 @@ async def evaluate_traces(
|
|
|
481
541
|
except json.JSONDecodeError as exc:
|
|
482
542
|
raise HTTPException(status_code=400, detail=f"Invalid config JSON: {exc}") from exc
|
|
483
543
|
|
|
544
|
+
try:
|
|
545
|
+
cred_refs = _parse_credential_refs_form(credential_refs)
|
|
546
|
+
except ValueError as exc:
|
|
547
|
+
raise HTTPException(status_code=400, detail=f"Invalid credentialRefs: {exc}") from exc
|
|
548
|
+
|
|
484
549
|
trace_paths = []
|
|
485
550
|
for trace_file in trace_files:
|
|
486
551
|
if not trace_file.filename:
|
|
@@ -548,7 +613,9 @@ async def evaluate_traces(
|
|
|
548
613
|
len(trace_paths),
|
|
549
614
|
[e.name for e in eval_config.evaluators],
|
|
550
615
|
)
|
|
551
|
-
|
|
616
|
+
resolved_creds = await _resolve_credentials(cred_refs)
|
|
617
|
+
with _scoped_credentials(resolved_creds):
|
|
618
|
+
result = await run_evaluation(eval_config)
|
|
552
619
|
|
|
553
620
|
run_id = await _maybe_persist_evaluate_run(
|
|
554
621
|
request,
|
|
@@ -580,6 +647,7 @@ async def evaluate_traces_stream(
|
|
|
580
647
|
trace_files: list[UploadFile] = File(...),
|
|
581
648
|
config: str = Form(...),
|
|
582
649
|
eval_set_file: UploadFile | None = File(None),
|
|
650
|
+
credential_refs: str | None = Form(None),
|
|
583
651
|
):
|
|
584
652
|
"""Evaluate traces with real-time progress via SSE."""
|
|
585
653
|
temp_dir = tempfile.mkdtemp()
|
|
@@ -593,6 +661,12 @@ async def evaluate_traces_stream(
|
|
|
593
661
|
yield f"data: {SSEErrorEvent(error=f'Invalid config JSON: {exc}').model_dump_json(by_alias=True)}\n\n"
|
|
594
662
|
return
|
|
595
663
|
|
|
664
|
+
try:
|
|
665
|
+
cred_refs = _parse_credential_refs_form(credential_refs)
|
|
666
|
+
except ValueError as exc:
|
|
667
|
+
yield f"data: {SSEErrorEvent(error=f'Invalid credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
|
|
668
|
+
return
|
|
669
|
+
|
|
596
670
|
trace_paths = []
|
|
597
671
|
for trace_file in trace_files:
|
|
598
672
|
if not trace_file.filename:
|
|
@@ -674,47 +748,54 @@ async def evaluate_traces_stream(
|
|
|
674
748
|
result = await run_evaluation(eval_config, progress_callback, trace_progress_callback)
|
|
675
749
|
await queue.put(("done", result))
|
|
676
750
|
|
|
677
|
-
eval_task = asyncio.create_task(run_with_progress())
|
|
678
|
-
|
|
679
751
|
try:
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
if
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
evt = SSETraceProgressEvent(
|
|
702
|
-
trace_progress=SSETraceProgress(
|
|
703
|
-
trace_id=payload.trace_id,
|
|
704
|
-
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
752
|
+
resolved_creds = await resolve_credential_refs(cred_refs) if cred_refs else None
|
|
753
|
+
except ValueError as exc:
|
|
754
|
+
yield f"data: {SSEErrorEvent(error=f'Could not resolve credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
|
|
755
|
+
return
|
|
756
|
+
|
|
757
|
+
with _scoped_credentials(resolved_creds):
|
|
758
|
+
eval_task = asyncio.create_task(run_with_progress())
|
|
759
|
+
|
|
760
|
+
try:
|
|
761
|
+
while True:
|
|
762
|
+
msg = await queue.get()
|
|
763
|
+
tag, payload = msg
|
|
764
|
+
|
|
765
|
+
if tag == "done":
|
|
766
|
+
run_id = await _maybe_persist_evaluate_run(
|
|
767
|
+
request,
|
|
768
|
+
params=eval_config,
|
|
769
|
+
eval_set_dict=_load_eval_set_dict(eval_set_path),
|
|
770
|
+
trace_format=eval_config.trace_format,
|
|
771
|
+
upload_filenames=upload_filenames,
|
|
772
|
+
run_result=payload,
|
|
705
773
|
)
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
774
|
+
if run_id:
|
|
775
|
+
payload.run_id = run_id
|
|
776
|
+
evt = SSEDoneEvent(
|
|
777
|
+
result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
778
|
+
)
|
|
779
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
780
|
+
break
|
|
781
|
+
elif tag == "trace_progress":
|
|
782
|
+
evt = SSETraceProgressEvent(
|
|
783
|
+
trace_progress=SSETraceProgress(
|
|
784
|
+
trace_id=payload.trace_id,
|
|
785
|
+
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
786
|
+
)
|
|
787
|
+
)
|
|
788
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
789
|
+
elif tag == "progress":
|
|
790
|
+
evt = SSEProgressEvent(message=payload)
|
|
791
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
792
|
+
finally:
|
|
793
|
+
if not eval_task.done():
|
|
794
|
+
eval_task.cancel()
|
|
795
|
+
try:
|
|
796
|
+
await eval_task
|
|
797
|
+
except asyncio.CancelledError:
|
|
798
|
+
pass
|
|
718
799
|
|
|
719
800
|
except Exception as exc:
|
|
720
801
|
logger.exception("Evaluation stream failed")
|
|
@@ -775,13 +856,15 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
|
|
|
775
856
|
"""Evaluate OTLP JSON traces passed in the request body."""
|
|
776
857
|
_check_json_body_size(raw_request)
|
|
777
858
|
traces, eval_set = _parse_json_request(request)
|
|
859
|
+
resolved_creds = await _resolve_credentials(request.credential_refs)
|
|
778
860
|
|
|
779
861
|
try:
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
862
|
+
with _scoped_credentials(resolved_creds):
|
|
863
|
+
result = await run_evaluation_from_traces(
|
|
864
|
+
traces=traces,
|
|
865
|
+
config=request.config,
|
|
866
|
+
eval_set=eval_set,
|
|
867
|
+
)
|
|
785
868
|
run_id = await _maybe_persist_evaluate_run(
|
|
786
869
|
raw_request,
|
|
787
870
|
params=request.config,
|
|
@@ -793,6 +876,8 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
|
|
|
793
876
|
if run_id:
|
|
794
877
|
result.run_id = run_id
|
|
795
878
|
return StandardResponse(data=_camel_keys(result.model_dump(by_alias=True)))
|
|
879
|
+
except HTTPException:
|
|
880
|
+
raise
|
|
796
881
|
except Exception as exc:
|
|
797
882
|
logger.exception("JSON evaluation failed")
|
|
798
883
|
raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc
|
|
@@ -843,47 +928,56 @@ async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request:
|
|
|
843
928
|
)
|
|
844
929
|
await queue.put(("done", result))
|
|
845
930
|
|
|
846
|
-
eval_task = asyncio.create_task(run_with_progress())
|
|
847
|
-
|
|
848
931
|
try:
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
)
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
trace_id=payload.trace_id,
|
|
873
|
-
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
932
|
+
resolved_creds = (
|
|
933
|
+
await resolve_credential_refs(request.credential_refs) if request.credential_refs else None
|
|
934
|
+
)
|
|
935
|
+
except ValueError as exc:
|
|
936
|
+
yield _sse_error(f"Could not resolve credentialRefs: {exc}")
|
|
937
|
+
return
|
|
938
|
+
|
|
939
|
+
with _scoped_credentials(resolved_creds):
|
|
940
|
+
eval_task = asyncio.create_task(run_with_progress())
|
|
941
|
+
|
|
942
|
+
try:
|
|
943
|
+
while True:
|
|
944
|
+
msg = await queue.get()
|
|
945
|
+
tag, payload = msg
|
|
946
|
+
|
|
947
|
+
if tag == "done":
|
|
948
|
+
run_id = await _maybe_persist_evaluate_run(
|
|
949
|
+
raw_request,
|
|
950
|
+
params=request.config,
|
|
951
|
+
eval_set_dict=request.eval_set,
|
|
952
|
+
trace_format=None,
|
|
953
|
+
upload_filenames=None,
|
|
954
|
+
run_result=payload,
|
|
874
955
|
)
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
956
|
+
if run_id:
|
|
957
|
+
payload.run_id = run_id
|
|
958
|
+
evt = SSEDoneEvent(
|
|
959
|
+
result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
960
|
+
)
|
|
961
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
962
|
+
break
|
|
963
|
+
elif tag == "trace_progress":
|
|
964
|
+
evt = SSETraceProgressEvent(
|
|
965
|
+
trace_progress=SSETraceProgress(
|
|
966
|
+
trace_id=payload.trace_id,
|
|
967
|
+
partial_result=_camel_keys(payload.model_dump(by_alias=True)),
|
|
968
|
+
)
|
|
969
|
+
)
|
|
970
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
971
|
+
elif tag == "progress":
|
|
972
|
+
evt = SSEProgressEvent(message=payload)
|
|
973
|
+
yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
|
|
974
|
+
finally:
|
|
975
|
+
if not eval_task.done():
|
|
976
|
+
eval_task.cancel()
|
|
977
|
+
try:
|
|
978
|
+
await eval_task
|
|
979
|
+
except asyncio.CancelledError:
|
|
980
|
+
pass
|
|
887
981
|
|
|
888
982
|
except Exception as exc:
|
|
889
983
|
logger.exception("JSON evaluation stream failed")
|
|
@@ -229,6 +229,35 @@ def _eval_config_json(**overrides) -> str:
|
|
|
229
229
|
return json.dumps(cfg)
|
|
230
230
|
|
|
231
231
|
|
|
232
|
+
def _judge_config(**overrides) -> dict:
|
|
233
|
+
cfg = {
|
|
234
|
+
"evaluators": [
|
|
235
|
+
{"name": "hallucinations_v1", "type": "builtin", "judgeModel": "openai/gpt-4o", "credentialRef": "k"}
|
|
236
|
+
]
|
|
237
|
+
}
|
|
238
|
+
cfg.update(overrides)
|
|
239
|
+
return cfg
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _capturing_run_eval(captured: dict):
|
|
243
|
+
"""Build an AsyncMock side_effect that records, at evaluator-invocation time, the value the
|
|
244
|
+
judge would resolve for credential ``k``.
|
|
245
|
+
|
|
246
|
+
This is the correct boundary for the sync routes: their job is to populate the credential
|
|
247
|
+
ContextVar before the evaluator runs. The ContextVar -> judge injection step itself is
|
|
248
|
+
already covered by test_credential_injection.py, so recording ``get_resolved_credential``
|
|
249
|
+
here (rather than mocking it) is not a false positive -- it fails when the route omits the
|
|
250
|
+
set/reset, which is exactly the gap being closed.
|
|
251
|
+
"""
|
|
252
|
+
from agentevals.resolvers import get_resolved_credential
|
|
253
|
+
|
|
254
|
+
def _side_effect(*args, **kwargs):
|
|
255
|
+
captured["judge_key"] = get_resolved_credential("k")
|
|
256
|
+
return _make_run_result()
|
|
257
|
+
|
|
258
|
+
return _side_effect
|
|
259
|
+
|
|
260
|
+
|
|
232
261
|
# ---------------------------------------------------------------------------
|
|
233
262
|
# Model Serialization
|
|
234
263
|
# ---------------------------------------------------------------------------
|
|
@@ -528,6 +557,68 @@ class TestEvaluateTraces:
|
|
|
528
557
|
)
|
|
529
558
|
assert resp.status_code in (400, 422)
|
|
530
559
|
|
|
560
|
+
@patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
|
|
561
|
+
def test_evaluate_resolves_credential_refs(self, mock_eval, monkeypatch):
|
|
562
|
+
monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-multipart")
|
|
563
|
+
captured: dict = {}
|
|
564
|
+
mock_eval.side_effect = _capturing_run_eval(captured)
|
|
565
|
+
resp = self.client.post(
|
|
566
|
+
"/api/evaluate",
|
|
567
|
+
files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
|
|
568
|
+
data={
|
|
569
|
+
"config": json.dumps(_judge_config()),
|
|
570
|
+
"credential_refs": json.dumps({"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}}),
|
|
571
|
+
},
|
|
572
|
+
)
|
|
573
|
+
_assert_envelope(resp)
|
|
574
|
+
assert captured["judge_key"] == "sk-resolved-multipart"
|
|
575
|
+
|
|
576
|
+
@patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
|
|
577
|
+
def test_evaluate_without_credential_refs_is_noop(self, mock_eval):
|
|
578
|
+
captured: dict = {}
|
|
579
|
+
mock_eval.side_effect = _capturing_run_eval(captured)
|
|
580
|
+
resp = self.client.post(
|
|
581
|
+
"/api/evaluate",
|
|
582
|
+
files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
|
|
583
|
+
data={"config": _eval_config_json()},
|
|
584
|
+
)
|
|
585
|
+
_assert_envelope(resp)
|
|
586
|
+
assert captured["judge_key"] is None
|
|
587
|
+
|
|
588
|
+
def test_evaluate_bad_credential_refs_returns_400(self):
|
|
589
|
+
resp = self.client.post(
|
|
590
|
+
"/api/evaluate",
|
|
591
|
+
files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
|
|
592
|
+
data={"config": _eval_config_json(), "credential_refs": "{not json"},
|
|
593
|
+
)
|
|
594
|
+
assert resp.status_code == 400
|
|
595
|
+
assert "credentialRefs" in resp.json()["detail"]
|
|
596
|
+
|
|
597
|
+
def test_evaluate_credential_refs_wrong_shape_returns_400(self):
|
|
598
|
+
resp = self.client.post(
|
|
599
|
+
"/api/evaluate",
|
|
600
|
+
files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
|
|
601
|
+
data={"config": _eval_config_json(), "credential_refs": json.dumps(["not", "a", "map"])},
|
|
602
|
+
)
|
|
603
|
+
assert resp.status_code == 400
|
|
604
|
+
assert "credentialRefs" in resp.json()["detail"]
|
|
605
|
+
|
|
606
|
+
@patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
|
|
607
|
+
def test_evaluate_unresolvable_credential_returns_400(self, mock_eval, monkeypatch):
|
|
608
|
+
monkeypatch.delenv("AE_MISSING_KEY", raising=False)
|
|
609
|
+
mock_eval.return_value = _make_run_result()
|
|
610
|
+
resp = self.client.post(
|
|
611
|
+
"/api/evaluate",
|
|
612
|
+
files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
|
|
613
|
+
data={
|
|
614
|
+
"config": json.dumps(_judge_config()),
|
|
615
|
+
"credential_refs": json.dumps({"k": {"kind": "env", "name": "AE_MISSING_KEY"}}),
|
|
616
|
+
},
|
|
617
|
+
)
|
|
618
|
+
assert resp.status_code == 400
|
|
619
|
+
assert "Could not resolve credentialRefs" in resp.json()["detail"]
|
|
620
|
+
mock_eval.assert_not_called()
|
|
621
|
+
|
|
531
622
|
|
|
532
623
|
# ---------------------------------------------------------------------------
|
|
533
624
|
# POST /api/evaluate/stream (SSE)
|
|
@@ -591,6 +682,34 @@ class TestEvaluateStream:
|
|
|
591
682
|
assert "result" in done
|
|
592
683
|
assert "traceResults" in done["result"]
|
|
593
684
|
|
|
685
|
+
@patch("agentevals.api.routes.run_evaluation", new_callable=AsyncMock)
|
|
686
|
+
@patch("agentevals.api.routes.load_traces")
|
|
687
|
+
def test_stream_resolves_credential_refs(self, mock_load_traces, mock_eval, monkeypatch):
|
|
688
|
+
monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-stream")
|
|
689
|
+
mock_load_traces.return_value = []
|
|
690
|
+
captured: dict = {}
|
|
691
|
+
mock_eval.side_effect = _capturing_run_eval(captured)
|
|
692
|
+
resp = self.client.post(
|
|
693
|
+
"/api/evaluate/stream",
|
|
694
|
+
files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
|
|
695
|
+
data={
|
|
696
|
+
"config": json.dumps(_judge_config()),
|
|
697
|
+
"credential_refs": json.dumps({"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}}),
|
|
698
|
+
},
|
|
699
|
+
)
|
|
700
|
+
assert '"done"' in resp.text
|
|
701
|
+
assert captured["judge_key"] == "sk-resolved-stream"
|
|
702
|
+
|
|
703
|
+
def test_stream_bad_credential_refs(self):
|
|
704
|
+
resp = self.client.post(
|
|
705
|
+
"/api/evaluate/stream",
|
|
706
|
+
files={"trace_files": ("trace.json", io.BytesIO(_make_trace_json()))},
|
|
707
|
+
data={"config": _eval_config_json(), "credential_refs": "{not json"},
|
|
708
|
+
)
|
|
709
|
+
assert resp.status_code == 200
|
|
710
|
+
assert '"error"' in resp.text
|
|
711
|
+
assert "credentialRefs" in resp.text
|
|
712
|
+
|
|
594
713
|
|
|
595
714
|
# ---------------------------------------------------------------------------
|
|
596
715
|
# POST /api/evaluate/json
|
|
@@ -767,6 +886,56 @@ class TestEvaluateJson:
|
|
|
767
886
|
body = _assert_envelope(resp)
|
|
768
887
|
assert "traceResults" in body["data"]
|
|
769
888
|
|
|
889
|
+
@patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
|
|
890
|
+
def test_evaluate_json_resolves_credential_refs(self, mock_eval, monkeypatch):
|
|
891
|
+
monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-json")
|
|
892
|
+
captured: dict = {}
|
|
893
|
+
mock_eval.side_effect = _capturing_run_eval(captured)
|
|
894
|
+
resp = self.client.post(
|
|
895
|
+
"/api/evaluate/json",
|
|
896
|
+
json={
|
|
897
|
+
"traces": _make_otlp_json_payload(),
|
|
898
|
+
"config": _judge_config(),
|
|
899
|
+
"credentialRefs": {"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}},
|
|
900
|
+
},
|
|
901
|
+
)
|
|
902
|
+
_assert_envelope(resp)
|
|
903
|
+
assert captured["judge_key"] == "sk-resolved-json"
|
|
904
|
+
|
|
905
|
+
@patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
|
|
906
|
+
def test_evaluate_json_without_credential_refs_is_noop(self, mock_eval):
|
|
907
|
+
captured: dict = {}
|
|
908
|
+
mock_eval.side_effect = _capturing_run_eval(captured)
|
|
909
|
+
resp = self.client.post(
|
|
910
|
+
"/api/evaluate/json",
|
|
911
|
+
json={"traces": _make_otlp_json_payload(), "config": _judge_config()},
|
|
912
|
+
)
|
|
913
|
+
_assert_envelope(resp)
|
|
914
|
+
assert captured["judge_key"] is None
|
|
915
|
+
|
|
916
|
+
def test_evaluate_json_credential_refs_wrong_shape_returns_422(self):
|
|
917
|
+
resp = self.client.post(
|
|
918
|
+
"/api/evaluate/json",
|
|
919
|
+
json={"traces": _make_otlp_json_payload(), "credentialRefs": ["not", "a", "map"]},
|
|
920
|
+
)
|
|
921
|
+
assert resp.status_code == 422
|
|
922
|
+
|
|
923
|
+
@patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
|
|
924
|
+
def test_evaluate_json_unresolvable_credential_returns_400(self, mock_eval, monkeypatch):
|
|
925
|
+
monkeypatch.delenv("AE_MISSING_KEY", raising=False)
|
|
926
|
+
mock_eval.return_value = _make_run_result()
|
|
927
|
+
resp = self.client.post(
|
|
928
|
+
"/api/evaluate/json",
|
|
929
|
+
json={
|
|
930
|
+
"traces": _make_otlp_json_payload(),
|
|
931
|
+
"config": _judge_config(),
|
|
932
|
+
"credentialRefs": {"k": {"kind": "env", "name": "AE_MISSING_KEY"}},
|
|
933
|
+
},
|
|
934
|
+
)
|
|
935
|
+
assert resp.status_code == 400
|
|
936
|
+
assert "Could not resolve credentialRefs" in resp.json()["detail"]
|
|
937
|
+
mock_eval.assert_not_called()
|
|
938
|
+
|
|
770
939
|
|
|
771
940
|
# ---------------------------------------------------------------------------
|
|
772
941
|
# POST /api/evaluate/json/stream (SSE)
|
|
@@ -827,6 +996,47 @@ class TestEvaluateJsonStream:
|
|
|
827
996
|
assert '"error"' in body
|
|
828
997
|
assert "No traces" in body
|
|
829
998
|
|
|
999
|
+
@patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
|
|
1000
|
+
@patch("agentevals.api.routes.OtlpJsonLoader")
|
|
1001
|
+
def test_stream_resolves_credential_refs(self, mock_loader_cls, mock_eval, monkeypatch):
|
|
1002
|
+
monkeypatch.setenv("AE_TEST_JUDGE_KEY", "sk-resolved-json-stream")
|
|
1003
|
+
mock_trace = MagicMock()
|
|
1004
|
+
mock_trace.trace_id = "abc123"
|
|
1005
|
+
mock_loader_cls.return_value.load_from_dict.return_value = [mock_trace]
|
|
1006
|
+
captured: dict = {}
|
|
1007
|
+
mock_eval.side_effect = _capturing_run_eval(captured)
|
|
1008
|
+
resp = self.client.post(
|
|
1009
|
+
"/api/evaluate/json/stream",
|
|
1010
|
+
json={
|
|
1011
|
+
"traces": _make_otlp_json_payload(),
|
|
1012
|
+
"config": _judge_config(),
|
|
1013
|
+
"credentialRefs": {"k": {"kind": "env", "name": "AE_TEST_JUDGE_KEY"}},
|
|
1014
|
+
},
|
|
1015
|
+
)
|
|
1016
|
+
assert '"done"' in resp.text
|
|
1017
|
+
assert captured["judge_key"] == "sk-resolved-json-stream"
|
|
1018
|
+
|
|
1019
|
+
@patch("agentevals.api.routes.run_evaluation_from_traces", new_callable=AsyncMock)
|
|
1020
|
+
@patch("agentevals.api.routes.OtlpJsonLoader")
|
|
1021
|
+
def test_stream_unresolvable_credential_yields_error(self, mock_loader_cls, mock_eval, monkeypatch):
|
|
1022
|
+
monkeypatch.delenv("AE_MISSING_KEY", raising=False)
|
|
1023
|
+
mock_trace = MagicMock()
|
|
1024
|
+
mock_trace.trace_id = "abc123"
|
|
1025
|
+
mock_loader_cls.return_value.load_from_dict.return_value = [mock_trace]
|
|
1026
|
+
mock_eval.return_value = _make_run_result()
|
|
1027
|
+
resp = self.client.post(
|
|
1028
|
+
"/api/evaluate/json/stream",
|
|
1029
|
+
json={
|
|
1030
|
+
"traces": _make_otlp_json_payload(),
|
|
1031
|
+
"config": _judge_config(),
|
|
1032
|
+
"credentialRefs": {"k": {"kind": "env", "name": "AE_MISSING_KEY"}},
|
|
1033
|
+
},
|
|
1034
|
+
)
|
|
1035
|
+
assert '"error"' in resp.text
|
|
1036
|
+
assert "Could not resolve credentialRefs" in resp.text
|
|
1037
|
+
assert '"done"' not in resp.text
|
|
1038
|
+
mock_eval.assert_not_called()
|
|
1039
|
+
|
|
830
1040
|
|
|
831
1041
|
# ---------------------------------------------------------------------------
|
|
832
1042
|
# GET /api/streaming/sessions
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|