agentevals-cli 0.9.4__tar.gz → 0.9.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/PKG-INFO +11 -7
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/README.md +10 -6
- agentevals_cli-0.9.6/docs/run-history.md +105 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/agent.py +1 -1
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/adk/run.py +1 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/langchain/run.py +1 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/ollama/run.py +1 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/openai-agents/run.py +1 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/pydantic-ai/run.py +2 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/strands/run.py +1 -0
- agentevals_cli-0.9.6/src/agentevals/_static/assets/index-4Q_gYF8x.js +341 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/index.html +1 -1
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/models.py +9 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/routes.py +178 -84
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/streaming_routes.py +81 -40
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/result_builder.py +44 -8
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/runner.py +9 -2
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/ws_server.py +12 -1
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/trace_attrs.py +3 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/trace_metrics.py +33 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_api.py +210 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/App.tsx +2 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/api/client.ts +48 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/DashboardView.tsx +18 -3
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/PerformanceCharts.tsx +10 -2
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/TraceTable.tsx +57 -18
- agentevals_cli-0.9.6/ui/src/components/runs/PassRateTrendChart.tsx +161 -0
- agentevals_cli-0.9.6/ui/src/components/runs/PerMetricTrendChart.tsx +123 -0
- agentevals_cli-0.9.6/ui/src/components/runs/RunDetailView.tsx +536 -0
- agentevals_cli-0.9.6/ui/src/components/runs/RunsHistoryTable.tsx +273 -0
- agentevals_cli-0.9.6/ui/src/components/runs/RunsView.tsx +376 -0
- agentevals_cli-0.9.6/ui/src/components/runs/runHistory.ts +171 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/sidebar/Sidebar.tsx +11 -2
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/LiveStreamingView.tsx +6 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/config.ts +1 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/context/TraceContext.tsx +4 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/context/TraceProvider.tsx +5 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/types.ts +94 -1
- agentevals_cli-0.9.4/src/agentevals/_static/assets/index-RIquRPno.js +0 -341
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/eval/SKILL.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/eval/evals/evals.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/inspect/SKILL.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.claude/skills/inspect/evals/evals.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.dockerignore +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/workflows/ci.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/workflows/publish-evaluator-sdk.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.github/workflows/release.yml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.gitignore +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/.mcp.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/CONTRIBUTING.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/DEVELOPMENT.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/Dockerfile +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/LICENSE +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/Makefile +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/Chart.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/NOTES.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/_helpers.tpl +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/deployment.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/postgresql.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/rbac.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/service.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/templates/serviceaccount.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/charts/agentevals/values.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/assets/logo-color-on-transparent.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/assets/logo-color.png +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/assets/logo-dark-on-transparent.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/custom-evaluators.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/eval-set-format.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/otel-compatibility.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/docs/streaming.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/eval_config.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/eval_config_openai_eval.yaml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/response_quality.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_evaluators/tool_call_checker.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/custom_sink/pyproject.toml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/main.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/dice_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/kubernetes/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/agent.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/main.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/langchain_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/async_example.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/context_manager_example.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/decorator_example.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/sdk_example/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/agent.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/main.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/strands_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/adk/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/langchain/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/ollama/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/strands/requirements.txt +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/flake.lock +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/flake.nix +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/pyproject.toml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/pyproject.toml +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/eval_set_helm.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/evalset_helm_3_2026-02-23.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/evalset_k8s_2026-02-20.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/helm.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/helm_2.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/helm_3.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/k8s.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/samples/tempo_export_with_batches.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_protocol.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/logo.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/_static/vite.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/app.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/debug_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/dependencies.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_app.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_grpc.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_processing.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/otlp_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/api/runs_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/builtin_metrics.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/cli.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/config.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/custom_evaluators.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/eval_config_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/resolver.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/sources.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/templates.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/evaluator/venv.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/extraction.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/genai_converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/auto.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/base.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/jaeger.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/loader/otlp.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/mcp_server.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/openai_eval_backend.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/output.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/resolvers/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/resolvers/kubernetes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/fetcher.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/service.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/sinks.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/run/worker.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/sdk.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/config.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/models.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/migrator.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/postgres/pool.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/repos/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/repos/memory.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/storage/repos/postgres.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/incremental_processor.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/processor.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/streaming/session.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/genai_messages.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/log_buffer.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/src/agentevals/utils/log_enrichment.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/api/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/api/test_evaluate_persistence.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/api/test_runs_routes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/conftest.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_evaluation_pipeline.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_live_agents.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_otlp_grpc_receiver.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_session_grouping.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/integration/test_timing_stress.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/resolvers/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/resolvers/test_kubernetes.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/resolvers/test_registry.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_fetcher.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_result_builder.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_service.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/run/test_sinks.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/__init__.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_config.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_memory_repos.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_migrator.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/storage/test_models.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_cli.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_credential_injection.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_eval_config_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_extraction.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_genai_converter.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_jaeger_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_loader_auto.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_log_enrichment.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_mcp_server.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_openai_eval_backend.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_otlp_loader.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_otlp_receiver.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_output.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_protocol.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_runner.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_sdk.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/tests/test_trace_metrics.py +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/.gitignore +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/README.md +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/eslint.config.js +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/index.html +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/package-lock.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/package.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/public/logo.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/public/vite.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/App.css +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/assets/react.svg +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/BuilderHeader.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/BuilderView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/EvalCasesList.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/InvocationEditor.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/JsonPreview.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/MetadataEditor.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/builder/index.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/dashboard/TraceCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/DataSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InspectorView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InvocationCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/ToolCallList.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/LiveMessage.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/SessionCard.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/FileDropZone.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/MetricSelector.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/upload/UploadView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/components/welcome/WelcomeView.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/index.css +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/console-capture.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/eval-config.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/evalset-builder.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/network-capture.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-helpers.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-loader.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-metadata.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/trace-patcher.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/lib/utils.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/src/main.tsx +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/tsconfig.app.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/tsconfig.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/tsconfig.node.json +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/ui/vite.config.ts +0 -0
- {agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agentevals-cli
|
|
3
|
-
Version: 0.9.4
|
|
3
|
+
Version: 0.9.6
|
|
4
4
|
Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
|
|
5
5
|
License-File: LICENSE
|
|
6
6
|
Requires-Python: >=3.11
|
|
@@ -280,7 +280,7 @@ See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protoc
|
|
|
280
280
|
agentevals serve # bundled UI on http://localhost:8001
|
|
281
281
|
```
|
|
282
282
|
|
|
283
|
-
Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
|
|
283
|
+
Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. With the Postgres backend enabled, every evaluation is persisted, and the "Run History" tab lets you group and trend runs by eval set or agent over time; see the [Run History guide](docs/run-history.md). For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
|
|
284
284
|
|
|
285
285
|
Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running. The OTLP receiver on port 4318 serves its own docs at `http://localhost:4318/docs`.
|
|
286
286
|
|
|
@@ -318,11 +318,12 @@ See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end wa
|
|
|
318
318
|
|
|
319
319
|
#### Postgres backend (`/api/runs`)
|
|
320
320
|
|
|
321
|
-
> **Preview.**
|
|
322
|
-
>
|
|
323
|
-
>
|
|
324
|
-
>
|
|
325
|
-
>
|
|
321
|
+
> **Preview.** Persisting evaluations and exploring them in the UI works end
|
|
322
|
+
> to end (see the [Run History guide](docs/run-history.md)), but the storage
|
|
323
|
+
> layer is still stabilizing. The `storage.*` and `database.postgres.*` chart
|
|
324
|
+
> values, the `/api/runs` HTTP surface, and the database schema may change
|
|
325
|
+
> incompatibly in upcoming releases. Operators evaluating this feature should
|
|
326
|
+
> plan to recreate the agentevals schema when upgrading between minor versions.
|
|
326
327
|
> Default in-memory mode is unaffected.
|
|
327
328
|
|
|
328
329
|
By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state:
|
|
@@ -341,6 +342,8 @@ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals
|
|
|
341
342
|
|
|
342
343
|
When `storage.backend=postgres` the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. Without `storage.backend=postgres` the `/api/runs` endpoints return 503 with a hint pointing at the env var.
|
|
343
344
|
|
|
345
|
+
Persisted runs power the **Run History** view in the UI, where you can group and trend evaluations by eval set or agent and drill into per-run detail. See the [Run History guide](docs/run-history.md) for the full feature walkthrough and local-dev setup.
|
|
346
|
+
|
|
344
347
|
## MCP Server
|
|
345
348
|
|
|
346
349
|
Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically.
|
|
@@ -389,6 +392,7 @@ Working examples are in the [`examples/`](examples/) directory:
|
|
|
389
392
|
| [Eval Set Format](docs/eval-set-format.md) | Schema, field reference, and examples for golden eval set JSON files |
|
|
390
393
|
| [Custom Evaluators](docs/custom-evaluators.md) | Write your own scoring logic in Python, JavaScript, or any language |
|
|
391
394
|
| [Live Streaming](docs/streaming.md) | Real-time trace streaming, dev server setup, and session management |
|
|
395
|
+
| [Run History](docs/run-history.md) | Persisting evaluations to Postgres and exploring them over time in the UI |
|
|
392
396
|
| [OpenTelemetry Compatibility](docs/otel-compatibility.md) | Supported OTel conventions, message delivery mechanisms, and OTLP receiver |
|
|
393
397
|
|
|
394
398
|
## Development
|
|
@@ -250,7 +250,7 @@ See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protoc
|
|
|
250
250
|
agentevals serve # bundled UI on http://localhost:8001
|
|
251
251
|
```
|
|
252
252
|
|
|
253
|
-
Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
|
|
253
|
+
Upload traces and eval sets, select evaluators, and view results with interactive span trees. Live-streamed traces appear in the "Local Dev" tab, grouped by session ID. With the Postgres backend enabled, every evaluation is persisted, and the "Run History" tab lets you group and trend runs by eval set or agent over time; see the [Run History guide](docs/run-history.md). For running from source, see [DEVELOPMENT.md](DEVELOPMENT.md).
|
|
254
254
|
|
|
255
255
|
Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running. The OTLP receiver on port 4318 serves its own docs at `http://localhost:4318/docs`.
|
|
256
256
|
|
|
@@ -288,11 +288,12 @@ See the [Kubernetes example](examples/kubernetes/README.md) for an end-to-end wa
|
|
|
288
288
|
|
|
289
289
|
#### Postgres backend (`/api/runs`)
|
|
290
290
|
|
|
291
|
-
> **Preview.**
|
|
292
|
-
>
|
|
293
|
-
>
|
|
294
|
-
>
|
|
295
|
-
>
|
|
291
|
+
> **Preview.** Persisting evaluations and exploring them in the UI works end
|
|
292
|
+
> to end (see the [Run History guide](docs/run-history.md)), but the storage
|
|
293
|
+
> layer is still stabilizing. The `storage.*` and `database.postgres.*` chart
|
|
294
|
+
> values, the `/api/runs` HTTP surface, and the database schema may change
|
|
295
|
+
> incompatibly in upcoming releases. Operators evaluating this feature should
|
|
296
|
+
> plan to recreate the agentevals schema when upgrading between minor versions.
|
|
296
297
|
> Default in-memory mode is unaffected.
|
|
297
298
|
|
|
298
299
|
By default the chart deploys agentevals with an in-memory backend; runs and results are not persisted. To enable the async `POST /api/runs` pipeline with durable Postgres-backed state:
|
|
@@ -311,6 +312,8 @@ helm install agentevals oci://ghcr.io/agentevals-dev/agentevals/helm/agentevals
|
|
|
311
312
|
|
|
312
313
|
When `storage.backend=postgres` the app applies any pending schema migrations on startup (advisory-lock protected, safe across replicas) and starts an in-process worker that processes the run queue. Without `storage.backend=postgres` the `/api/runs` endpoints return 503 with a hint pointing at the env var.
|
|
313
314
|
|
|
315
|
+
Persisted runs power the **Run History** view in the UI, where you can group and trend evaluations by eval set or agent and drill into per-run detail. See the [Run History guide](docs/run-history.md) for the full feature walkthrough and local-dev setup.
|
|
316
|
+
|
|
314
317
|
## MCP Server
|
|
315
318
|
|
|
316
319
|
Exposes evaluation tools to MCP clients. A `.mcp.json` at the project root lets Claude Code pick it up automatically.
|
|
@@ -359,6 +362,7 @@ Working examples are in the [`examples/`](examples/) directory:
|
|
|
359
362
|
| [Eval Set Format](docs/eval-set-format.md) | Schema, field reference, and examples for golden eval set JSON files |
|
|
360
363
|
| [Custom Evaluators](docs/custom-evaluators.md) | Write your own scoring logic in Python, JavaScript, or any language |
|
|
361
364
|
| [Live Streaming](docs/streaming.md) | Real-time trace streaming, dev server setup, and session management |
|
|
365
|
+
| [Run History](docs/run-history.md) | Persisting evaluations to Postgres and exploring them over time in the UI |
|
|
362
366
|
| [OpenTelemetry Compatibility](docs/otel-compatibility.md) | Supported OTel conventions, message delivery mechanisms, and OTLP receiver |
|
|
363
367
|
|
|
364
368
|
## Development
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Run History
|
|
2
|
+
|
|
3
|
+
Run history turns each evaluation into a durable record you can revisit, group, and trend over time. When agentevals runs with the Postgres storage backend, every evaluation (whether an uploaded trace file or a live streaming session) is persisted as a **run** with its per-case scores, and the UI's **Run History** view lets you explore how an agent or eval set performs across many runs.
|
|
4
|
+
|
|
5
|
+
Without the Postgres backend, agentevals is stateless: evaluations still work and results show on the dashboard, but nothing is persisted and the run-history endpoints return `503`.
|
|
6
|
+
|
|
7
|
+
## Enabling durable storage
|
|
8
|
+
|
|
9
|
+
Run history requires the Postgres storage backend. It is opt-in.
|
|
10
|
+
|
|
11
|
+
### Local development
|
|
12
|
+
|
|
13
|
+
The quickest path uses the Makefile target, which starts a throwaway Postgres container, applies migrations, and serves the app wired to it:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
make dev-backend-pg
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
That is equivalent to:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
export AGENTEVALS_STORAGE_BACKEND=postgres
|
|
23
|
+
export AGENTEVALS_DATABASE_URL=postgresql://agentevals:agentevals@localhost:5432/agentevals
|
|
24
|
+
uv run agentevals migrate up # apply schema migrations
|
|
25
|
+
uv run agentevals serve --dev # serve with the Postgres backend
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Run the UI in a second terminal (`cd ui && npm run dev`) and open the **Run History** tab.
|
|
29
|
+
|
|
30
|
+
> The `make pg-up` container runs with `--rm` and no volume, so its data is ephemeral: `make pg-down` (or a reboot) resets your run history. Point `AGENTEVALS_DATABASE_URL` at a persistent Postgres if you want runs to survive across sessions.
|
|
31
|
+
|
|
32
|
+
### Configuration reference
|
|
33
|
+
|
|
34
|
+
| Variable | Purpose |
|
|
35
|
+
|----------|---------|
|
|
36
|
+
| `AGENTEVALS_STORAGE_BACKEND` | `postgres` to enable durable storage; anything else (default) keeps the in-memory backend |
|
|
37
|
+
| `AGENTEVALS_DATABASE_URL` | Postgres DSN, e.g. `postgresql://user:pass@host:5432/dbname` |
|
|
38
|
+
| `AGENTEVALS_DATABASE_URL_FILE` | Path to a file containing the DSN (preferred over the inline variable; useful for mounted secrets) |
|
|
39
|
+
| `AGENTEVALS_DATABASE_SCHEMA` | Schema name to use (default `agentevals`) |
|
|
40
|
+
|
|
41
|
+
On startup with `AGENTEVALS_STORAGE_BACKEND=postgres` (the Helm chart value `storage.backend=postgres`) the app applies any pending migrations (advisory-lock protected, safe across replicas). For deployment via Helm, see the [Postgres backend section of the README](../README.md#postgres-backend-apiruns).
|
|
42
|
+
|
|
43
|
+
## How runs get persisted
|
|
44
|
+
|
|
45
|
+
A run is created once per evaluation, best effort: if persistence fails the evaluation result is still returned to the caller. Both evaluation paths persist:
|
|
46
|
+
|
|
47
|
+
- **Uploaded traces** (`POST /api/evaluate`): the run aggregates every uploaded trace as one evaluation.
|
|
48
|
+
- **Live sessions** (streaming dev server): scoring sessions from the UI persists one run per "Evaluate" click, aggregating the sessions it scored.
|
|
49
|
+
|
|
50
|
+
Each run stores a pre-aggregated `summary` plus one `result` row per (eval case, evaluator):
|
|
51
|
+
|
|
52
|
+
```jsonc
|
|
53
|
+
// run.summary
|
|
54
|
+
{
|
|
55
|
+
"trace_count": 8,
|
|
56
|
+
"result_counts": { "passed": 6, "failed": 2, "errored": 0, "skipped": 0 },
|
|
57
|
+
"per_metric": {
|
|
58
|
+
"tool_trajectory_avg_score": { "passed": 7, "failed": 1, "errored": 0, "skipped": 0, "avg_score": 0.94 }
|
|
59
|
+
},
|
|
60
|
+
"agents": ["langchain-agent", "openai-agents-agent"],
|
|
61
|
+
"performance_metrics": { "models": ["gpt-4o"], /* tokens, latency, counts */ },
|
|
62
|
+
"errors": []
|
|
63
|
+
}
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Exploring runs in the UI
|
|
67
|
+
|
|
68
|
+
Open **Run History** from the sidebar. It reads from `GET /api/runs`, so when durable storage is not configured (and that endpoint returns `503`) it shows a friendly notice instead of run data.
|
|
69
|
+
|
|
70
|
+
- **Trends.** A pass-rate line and a per-metric average-score line plot across runs over time, so regressions and improvements are visible at a glance.
|
|
71
|
+
- **Group by.** Toggle between grouping by **eval set** or by **agent**, then pick a specific group to isolate its runs and trends. The pass-rate chart draws one line per agent.
|
|
72
|
+
- **History table.** Every run with its status, eval set, agent, trace count, pass/fail counts, pass-rate bar, duration, and models. Click a row to open the run detail.
|
|
73
|
+
- **Run detail.** For a single run: the evaluator configuration (metrics, thresholds, judge model), the golden eval set it was scored against, and per-eval-case results. Tool-trajectory results expand to an expected vs actual diff per invocation, showing exactly where the run diverged from the reference.
|
|
74
|
+
|
|
75
|
+
### What is and is not persisted
|
|
76
|
+
|
|
77
|
+
Run detail is an *evaluation record*, not a full trace record. It faithfully shows the expected behavior, each metric's pass or fail, and (for trajectory metrics) where the actual tool calls diverged. It does not retain the raw trace spans or timeline, and text-similarity metrics keep only their score, not the actual response text. To replay a full trace, use the live inspector at evaluation time.
|
|
78
|
+
|
|
79
|
+
## Agent identity and grouping
|
|
80
|
+
|
|
81
|
+
Runs group by **agent** using the OpenTelemetry `service.name` resource attribute, the cross-framework identifier for a service. Set it on your agent with the standard `OTEL_SERVICE_NAME` environment variable:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
OTEL_SERVICE_NAME=my-agent python my_agent.py
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
The zero-code examples set this for you (for example `service.name=langchain-agent`). When `service.name` is absent, agentevals falls back to the framework agent name (`gen_ai.agent.name`); it never falls back to a model or span operation name, so a group is always a real agent identity.
|
|
88
|
+
|
|
89
|
+
## Golden reference handling
|
|
90
|
+
|
|
91
|
+
When you score other agents against a golden session, the golden defines the eval set and therefore matches itself trivially. To keep scoring meaningful, the golden is excluded from pass or fail counts, the agent list, and the results table, but its latency and token usage are still plotted in the performance charts (labeled as the reference) so you can compare the scored agents against the baseline.
|
|
92
|
+
|
|
93
|
+
## HTTP API
|
|
94
|
+
|
|
95
|
+
All endpoints return `503` (with a hint pointing at `AGENTEVALS_STORAGE_BACKEND=postgres`) when durable storage is not configured.
|
|
96
|
+
|
|
97
|
+
| Method + path | Description |
|
|
98
|
+
|---------------|-------------|
|
|
99
|
+
| `GET /api/runs` | List runs, newest first. Filter with `status`, `limit` (1-1000), and `before` (a `created_at` cursor for pagination) |
|
|
100
|
+
| `GET /api/runs/{run_id}` | Fetch a single run (spec + summary) |
|
|
101
|
+
| `GET /api/runs/{run_id}/results` | List the per (eval case, evaluator) result rows for a run |
|
|
102
|
+
| `POST /api/runs` | Submit a run for asynchronous execution by the in-process worker; idempotent on `run_id` |
|
|
103
|
+
| `POST /api/runs/{run_id}/cancel` | Request cancellation of a queued or running run (idempotent) |
|
|
104
|
+
|
|
105
|
+
Interactive API docs are available at `/docs` (Swagger) and `/redoc` while the server is running.
|
|
@@ -56,7 +56,7 @@ def check_prime(nums: list[int]) -> dict:
|
|
|
56
56
|
dice_agent = Agent(
|
|
57
57
|
name="dice_agent",
|
|
58
58
|
# model="gemini-2.5-flash",
|
|
59
|
-
model="gemini-
|
|
59
|
+
model="gemini-3-flash-preview",
|
|
60
60
|
instruction="""You are a helpful assistant that can roll dice and check if numbers are prime.
|
|
61
61
|
|
|
62
62
|
When a user asks you to roll a die, use the roll_die tool with the appropriate number of sides.
|
|
@@ -44,6 +44,7 @@ async def main():
|
|
|
44
44
|
endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
|
|
45
45
|
print(f"OTLP endpoint: {endpoint}")
|
|
46
46
|
|
|
47
|
+
os.environ.setdefault("OTEL_SERVICE_NAME", "adk-agent")
|
|
47
48
|
os.environ.setdefault(
|
|
48
49
|
"OTEL_RESOURCE_ATTRIBUTES",
|
|
49
50
|
"agentevals.eval_set_id=dice_agent_eval,agentevals.session_name=adk-zero-code",
|
|
@@ -48,6 +48,7 @@ def main():
|
|
|
48
48
|
|
|
49
49
|
os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "true"
|
|
50
50
|
|
|
51
|
+
os.environ.setdefault("OTEL_SERVICE_NAME", "langchain-agent")
|
|
51
52
|
os.environ.setdefault(
|
|
52
53
|
"OTEL_RESOURCE_ATTRIBUTES",
|
|
53
54
|
"agentevals.eval_set_id=langchain_agent_eval,agentevals.session_name=langchain-zero-code",
|
|
@@ -112,6 +112,7 @@ def main():
|
|
|
112
112
|
print(f"Local model: {model}")
|
|
113
113
|
|
|
114
114
|
os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "true"
|
|
115
|
+
os.environ.setdefault("OTEL_SERVICE_NAME", "ollama-agent")
|
|
115
116
|
os.environ.setdefault(
|
|
116
117
|
"OTEL_RESOURCE_ATTRIBUTES",
|
|
117
118
|
"agentevals.eval_set_id=langchain_local_ollama_openai_eval,agentevals.session_name=langchain-ollama-openai-zero-code",
|
{agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/openai-agents/run.py
RENAMED
|
@@ -58,6 +58,7 @@ def main():
|
|
|
58
58
|
os.environ.setdefault("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "span_and_event")
|
|
59
59
|
os.environ.setdefault("OTEL_SEMCONV_STABILITY_OPT_IN", "gen_ai_latest_experimental")
|
|
60
60
|
|
|
61
|
+
os.environ.setdefault("OTEL_SERVICE_NAME", "openai-agents-agent")
|
|
61
62
|
os.environ.setdefault(
|
|
62
63
|
"OTEL_RESOURCE_ATTRIBUTES",
|
|
63
64
|
"agentevals.eval_set_id=openai_agents_eval,agentevals.session_name=openai-agents-zero-code",
|
{agentevals_cli-0.9.4 → agentevals_cli-0.9.6}/examples/zero-code-examples/pydantic-ai/run.py
RENAMED
|
@@ -54,6 +54,7 @@ def main():
|
|
|
54
54
|
endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
|
|
55
55
|
print(f"OTLP endpoint: {endpoint}")
|
|
56
56
|
|
|
57
|
+
os.environ.setdefault("OTEL_SERVICE_NAME", "pydantic-ai-agent")
|
|
57
58
|
os.environ.setdefault(
|
|
58
59
|
"OTEL_RESOURCE_ATTRIBUTES",
|
|
59
60
|
"agentevals.eval_set_id=pydantic_ai_eval,agentevals.session_name=pydantic-ai-zero-code",
|
|
@@ -72,6 +73,7 @@ def main():
|
|
|
72
73
|
|
|
73
74
|
agent = Agent(
|
|
74
75
|
"openai:gpt-4o-mini",
|
|
76
|
+
# "openai:gpt-5.4-mini-2026-03-17",
|
|
75
77
|
instructions="You are a helpful assistant. You can roll dice and check if numbers are prime.",
|
|
76
78
|
)
|
|
77
79
|
agent.tool_plain(roll_die)
|
|
@@ -40,6 +40,7 @@ def main():
|
|
|
40
40
|
endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
|
|
41
41
|
print(f"OTLP endpoint: {endpoint}")
|
|
42
42
|
|
|
43
|
+
os.environ.setdefault("OTEL_SERVICE_NAME", "strands-agent")
|
|
43
44
|
os.environ.setdefault(
|
|
44
45
|
"OTEL_RESOURCE_ATTRIBUTES",
|
|
45
46
|
"agentevals.eval_set_id=strands_agent_eval,agentevals.session_name=strands-zero-code",
|