agentevals-cli 0.7.0__tar.gz → 0.7.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/PKG-INFO +1 -1
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/README.md +4 -1
- agentevals_cli-0.7.1/examples/zero-code-examples/pydantic-ai/requirements.txt +5 -0
- agentevals_cli-0.7.1/examples/zero-code-examples/pydantic-ai/run.py +105 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/pyproject.toml +1 -1
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/app.py +1 -2
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/models.py +10 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/routes.py +158 -2
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/config.py +35 -18
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/otlp.py +55 -13
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/runner.py +59 -28
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_live_agents.py +60 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_api.py +221 -0
- agentevals_cli-0.7.1/tests/test_otlp_loader.py +454 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/uv.lock +1 -1
- agentevals_cli-0.7.0/tests/test_otlp_loader.py +0 -210
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/eval/SKILL.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/eval/evals/evals.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/inspect/SKILL.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.claude/skills/inspect/evals/evals.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.dockerignore +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/workflows/ci.yml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/workflows/publish-evaluator-sdk.yml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.github/workflows/release.yml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.gitignore +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/.mcp.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/CONTRIBUTING.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/DEVELOPMENT.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/Dockerfile +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/LICENSE +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/Makefile +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/README.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/Chart.yaml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/NOTES.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/_helpers.tpl +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/deployment.yaml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/service.yaml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/templates/serviceaccount.yaml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/charts/agentevals/values.yaml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/assets/logo-color-on-transparent.svg +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/assets/logo-color.png +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/assets/logo-dark-on-transparent.svg +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/custom-evaluators.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/eval-set-format.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/otel-compatibility.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/docs/streaming.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/custom_evaluators/eval_config.yaml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/custom_evaluators/response_quality.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/custom_evaluators/tool_call_checker.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/README.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/agent.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/eval_set.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/main.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/dice_agent/test_streaming.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/kubernetes/README.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/README.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/agent.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/eval_set.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/main.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/langchain_agent/test_streaming.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/async_example.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/context_manager_example.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/decorator_example.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/sdk_example/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/agent.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/eval_set.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/main.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/strands_agent/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/adk/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/adk/run.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/langchain/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/langchain/run.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/ollama/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/ollama/run.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/openai-agents/run.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/strands/requirements.txt +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/examples/zero-code-examples/strands/run.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/flake.lock +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/flake.nix +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/README.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/pyproject.toml +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/eval_set_helm.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/evalset_helm_3_2026-02-23.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/evalset_k8s_2026-02-20.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/helm.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/helm_2.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/helm_3.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/samples/k8s.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_protocol.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/assets/index-7YPfPT4N.js +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/index.html +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/logo.svg +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/_static/vite.svg +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/debug_routes.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/dependencies.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_app.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_grpc.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_processing.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/otlp_routes.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/api/streaming_routes.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/builtin_metrics.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/cli.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/converter.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/custom_evaluators.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/eval_config_loader.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/resolver.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/sources.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/templates.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/evaluator/venv.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/extraction.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/genai_converter.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/base.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/loader/jaeger.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/mcp_server.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/openai_eval_backend.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/output.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/sdk.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/incremental_processor.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/processor.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/session.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/streaming/ws_server.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/trace_attrs.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/trace_metrics.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/genai_messages.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/log_buffer.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/src/agentevals/utils/log_enrichment.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/__init__.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/conftest.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_evaluation_pipeline.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_otlp_grpc_receiver.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_session_grouping.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/integration/test_timing_stress.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_cli.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_converter.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_extraction.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_genai_converter.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_jaeger_loader.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_log_enrichment.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_mcp_server.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_otlp_receiver.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_output.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_protocol.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_runner.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_sdk.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/tests/test_trace_metrics.py +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/.gitignore +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/README.md +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/eslint.config.js +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/index.html +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/package-lock.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/package.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/public/logo.svg +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/public/vite.svg +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/App.css +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/App.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/api/client.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/assets/react.svg +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/BuilderHeader.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/BuilderView.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/EvalCasesList.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/InvocationEditor.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/JsonPreview.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/MetadataEditor.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/builder/index.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/DashboardView.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/TraceCard.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/dashboard/TraceTable.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/DataSection.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InspectorView.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InvocationCard.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/ToolCallList.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/sidebar/Sidebar.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveMessage.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/SessionCard.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/FileDropZone.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/MetricSelector.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/upload/UploadView.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/components/welcome/WelcomeView.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/config.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/context/TraceContext.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/context/TraceProvider.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/index.css +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/console-capture.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/evalset-builder.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/network-capture.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-helpers.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-loader.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-metadata.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/trace-patcher.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/types.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/lib/utils.ts +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/src/main.tsx +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/tsconfig.app.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/tsconfig.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/tsconfig.node.json +0 -0
- {agentevals_cli-0.7.0 → agentevals_cli-0.7.1}/ui/vite.config.ts +0 -0
|
@@ -29,6 +29,7 @@ agentevals accepts OTLP/HTTP on port 4318 (`http/protobuf` and `http/json`) and
|
|
|
29
29
|
| [zero-code-examples/ollama/](./zero-code-examples/ollama/) | LangChain | Ollama |
|
|
30
30
|
| [zero-code-examples/strands/](./zero-code-examples/strands/) | Strands | OpenAI |
|
|
31
31
|
| [zero-code-examples/adk/](./zero-code-examples/adk/) | Google ADK | Gemini |
|
|
32
|
+
| [zero-code-examples/pydantic-ai/](./zero-code-examples/pydantic-ai/) | Pydantic AI | OpenAI |
|
|
32
33
|
|
|
33
34
|
This approach works with any framework that has OTel instrumentation: LangChain, Strands, Google ADK, etc. If your framework already emits OTel spans, you only need to add `OTLPSpanExporter` (and `OTLPLogExporter` if it uses GenAI log-based content delivery).
|
|
34
35
|
|
|
@@ -103,6 +104,7 @@ Detection checks for `gen_ai.request.model` / `gen_ai.input.messages` (GenAI sem
|
|
|
103
104
|
| [zero-code-examples/ollama/](./zero-code-examples/ollama/) | LangChain | Ollama | GenAI semconv (logs) | Standard OTLP export |
|
|
104
105
|
| [zero-code-examples/strands/](./zero-code-examples/strands/) | Strands | OpenAI | GenAI semconv (events*) | Standard OTLP export |
|
|
105
106
|
| [zero-code-examples/adk/](./zero-code-examples/adk/) | Google ADK | Gemini | ADK built-in | Standard OTLP export |
|
|
107
|
+
| [zero-code-examples/pydantic-ai/](./zero-code-examples/pydantic-ai/) | Pydantic AI | OpenAI | GenAI semconv (span attrs) | Standard OTLP export |
|
|
106
108
|
| [langchain_agent](./langchain_agent/) | LangChain | OpenAI | GenAI semconv (logs) | SDK WebSocket |
|
|
107
109
|
| [strands_agent](./strands_agent/) | Strands | OpenAI | GenAI semconv (events*) | SDK WebSocket |
|
|
108
110
|
| [dice_agent](./dice_agent/) | Google ADK | Gemini | ADK built-in | SDK WebSocket |
|
|
@@ -217,6 +219,7 @@ python examples/zero-code-examples/langchain/run.py
|
|
|
217
219
|
python examples/zero-code-examples/ollama/run.py
|
|
218
220
|
python examples/zero-code-examples/strands/run.py
|
|
219
221
|
python examples/zero-code-examples/adk/run.py
|
|
222
|
+
python examples/zero-code-examples/pydantic-ai/run.py
|
|
220
223
|
|
|
221
224
|
# SDK examples:
|
|
222
225
|
python examples/sdk_example/context_manager_example.py
|
|
@@ -232,7 +235,7 @@ python examples/strands_agent/main.py
|
|
|
232
235
|
Traces stream to the dev server in real-time. Evaluation runs automatically when the session completes.
|
|
233
236
|
|
|
234
237
|
See each example's README for prerequisites and detailed instructions:
|
|
235
|
-
- [zero-code-examples/](./zero-code-examples/) (LangChain
|
|
238
|
+
- [zero-code-examples/](./zero-code-examples/) (LangChain, Strands, ADK, OpenAI Agents, Pydantic AI — standard OTLP)
|
|
236
239
|
- [dice_agent/README.md](./dice_agent/README.md) (Google ADK + Gemini)
|
|
237
240
|
- [langchain_agent/README.md](./langchain_agent/README.md) (LangChain + OpenAI, SDK)
|
|
238
241
|
- [strands_agent/](./strands_agent/) (Strands + OpenAI, SDK)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Run a dice-rolling Pydantic AI agent with OTLP export — no agentevals SDK.
|
|
2
|
+
|
|
3
|
+
Demonstrates zero-code integration: any OTel-instrumented agent streams
|
|
4
|
+
traces to agentevals by pointing the OTLP exporter at the receiver.
|
|
5
|
+
|
|
6
|
+
Pydantic AI has built-in OTel support via Agent.instrument_all(). By default
|
|
7
|
+
it uses version 2 of the GenAI semconv format, storing message content in span
|
|
8
|
+
attributes — only a TracerProvider is needed.
|
|
9
|
+
No separate instrumentation library is needed.
|
|
10
|
+
|
|
11
|
+
Prerequisites:
|
|
12
|
+
1. pip install -r requirements.txt
|
|
13
|
+
2. agentevals serve --dev
|
|
14
|
+
3. export OPENAI_API_KEY="your-key-here"
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python examples/zero-code-examples/pydantic-ai/run.py
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import os
|
|
21
|
+
import random
|
|
22
|
+
|
|
23
|
+
from dotenv import load_dotenv
|
|
24
|
+
from opentelemetry import trace
|
|
25
|
+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
|
26
|
+
from opentelemetry.sdk.resources import Resource
|
|
27
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
28
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
29
|
+
from pydantic_ai import Agent
|
|
30
|
+
|
|
31
|
+
load_dotenv(override=True)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def roll_die(sides: int) -> int:
    """Roll a die with the given number of sides and return the result."""
    # randrange excludes its upper bound, so sides + 1 keeps `sides` itself reachable.
    return random.randrange(1, sides + 1)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def check_prime(number: int) -> bool:
    """Return True if the number is prime, False otherwise."""
    # Values below 2 (0, 1 and negatives) are rejected up front.
    if number < 2:
        return False
    # Trial division: only candidates up to the integer square root are tested.
    upper_bound = int(number**0.5) + 1
    return all(number % candidate != 0 for candidate in range(2, upper_bound))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def main():
    """Run a three-turn dice conversation and export its spans over OTLP/HTTP.

    Prints a notice and returns without running the agent when
    OPENAI_API_KEY is unset. Pending spans are flushed in a ``finally``
    block, so they are flushed even if an agent call raises.
    """
    if not os.environ.get("OPENAI_API_KEY"):
        print("OPENAI_API_KEY not set.")
        return

    # Shown for information only; the exporter below is created without
    # arguments and resolves its endpoint by itself.
    endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
    print(f"OTLP endpoint: {endpoint}")

    # Set before the Resource is created, without overriding a value the
    # user has already exported.
    os.environ.setdefault(
        "OTEL_RESOURCE_ATTRIBUTES",
        "agentevals.eval_set_id=pydantic_ai_eval,agentevals.session_name=pydantic-ai-zero-code",
    )

    tracer_provider = TracerProvider(resource=Resource.create())
    span_processor = BatchSpanProcessor(OTLPSpanExporter(), schedule_delay_millis=1000)
    tracer_provider.add_span_processor(span_processor)
    trace.set_tracer_provider(tracer_provider)

    # Pydantic AI ships its own OTel instrumentation; this single call
    # enables it for every agent, so no separate instrumentor package
    # (such as opentelemetry-instrumentation-openai-v2) is required.
    Agent.instrument_all()

    agent = Agent(
        "openai:gpt-4o-mini",
        instructions="You are a helpful assistant. You can roll dice and check if numbers are prime.",
    )
    for tool in (roll_die, check_prime):
        agent.tool_plain(tool)

    test_queries = [
        "Hi! Can you help me?",
        "Roll a 20-sided die for me",
        "Is the number you rolled prime?",
    ]
    total = len(test_queries)

    message_history = []

    try:
        for turn, prompt in enumerate(test_queries, 1):
            print(f"\n[{turn}/{total}] User: {prompt}")

            result = agent.run_sync(prompt, message_history=message_history)
            print(f" Agent: {result.output}")

            # Carry the whole conversation into the next turn.
            message_history = result.all_messages()
    finally:
        print()
        tracer_provider.force_flush()
        print("All traces flushed to OTLP receiver.")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
|
|
105
|
+
main()
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "agentevals-cli"
|
|
7
|
-
version = "0.7.0"
|
|
7
|
+
version = "0.7.1"
|
|
8
8
|
description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
@@ -10,8 +10,7 @@ from contextlib import asynccontextmanager
|
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import TYPE_CHECKING
|
|
12
12
|
|
|
13
|
-
from fastapi import FastAPI, Request
|
|
14
|
-
from fastapi import WebSocket
|
|
13
|
+
from fastapi import FastAPI, Request, WebSocket
|
|
15
14
|
from fastapi.middleware.cors import CORSMiddleware
|
|
16
15
|
from fastapi.responses import StreamingResponse
|
|
17
16
|
|
|
@@ -11,6 +11,8 @@ from typing import Any, Generic, TypeVar
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, Field
|
|
12
12
|
from pydantic.alias_generators import to_camel
|
|
13
13
|
|
|
14
|
+
from ..config import EvalParams
|
|
15
|
+
|
|
14
16
|
T = TypeVar("T")
|
|
15
17
|
|
|
16
18
|
|
|
@@ -134,6 +136,14 @@ class ConvertTracesData(CamelModel):
|
|
|
134
136
|
traces: list[TraceConversionEntry]
|
|
135
137
|
|
|
136
138
|
|
|
139
|
+
class EvaluateJsonRequest(CamelModel):
    """Request body for JSON-based trace evaluation (``POST /evaluate/json``)."""

    # Required. Typed as a plain dict, so the inner OTLP structure is not
    # validated by this model.
    traces: dict = Field(description="OTLP JSON export with resourceSpans structure.")
    # When omitted, default_factory builds a fresh EvalParams() for the request.
    config: EvalParams = Field(default_factory=EvalParams, description="Evaluation parameters.")
    # Optional; defaults to None when no eval set is supplied.
    eval_set: dict | None = Field(default=None, description="Optional ADK EvalSet JSON.")
|
|
145
|
+
|
|
146
|
+
|
|
137
147
|
# ---------------------------------------------------------------------------
|
|
138
148
|
# SSE evaluation event models
|
|
139
149
|
# ---------------------------------------------------------------------------
|
|
@@ -11,7 +11,7 @@ import shutil
|
|
|
11
11
|
import tempfile
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
-
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
|
14
|
+
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
|
|
15
15
|
from fastapi.responses import StreamingResponse
|
|
16
16
|
from pydantic.alias_generators import to_camel
|
|
17
17
|
|
|
@@ -27,13 +27,22 @@ from ..config import (
|
|
|
27
27
|
)
|
|
28
28
|
from ..converter import convert_traces
|
|
29
29
|
from ..extraction import get_extractor
|
|
30
|
-
from ..
|
|
30
|
+
from ..loader.otlp import OtlpJsonLoader
|
|
31
|
+
from ..runner import (
|
|
32
|
+
RunResult,
|
|
33
|
+
get_loader,
|
|
34
|
+
load_eval_set,
|
|
35
|
+
load_eval_set_from_dict,
|
|
36
|
+
run_evaluation,
|
|
37
|
+
run_evaluation_from_traces,
|
|
38
|
+
)
|
|
31
39
|
from ..trace_metrics import extract_performance_metrics, extract_trace_metadata
|
|
32
40
|
from .models import (
|
|
33
41
|
ApiKeyStatus,
|
|
34
42
|
ConfigData,
|
|
35
43
|
ConvertTracesData,
|
|
36
44
|
EvalSetValidation,
|
|
45
|
+
EvaluateJsonRequest,
|
|
37
46
|
HealthData,
|
|
38
47
|
MetricInfo,
|
|
39
48
|
SSEDoneEvent,
|
|
@@ -61,6 +70,8 @@ def _camel_keys(obj: Any) -> Any:
|
|
|
61
70
|
|
|
62
71
|
router = APIRouter()
|
|
63
72
|
|
|
73
|
+
_MAX_JSON_BODY_BYTES = 50 * 1024 * 1024 # 50 MB (multipart endpoints allow 10 MB per file)
|
|
74
|
+
|
|
64
75
|
_TYPE_TO_MODEL = {
|
|
65
76
|
"builtin": BuiltinMetricDef,
|
|
66
77
|
"code": CodeEvaluatorDef,
|
|
@@ -729,3 +740,148 @@ async def evaluate_traces_stream(
|
|
|
729
740
|
"Connection": "keep-alive",
|
|
730
741
|
},
|
|
731
742
|
)
|
|
743
|
+
|
|
744
|
+
|
|
745
|
+
def _parse_json_request(request: EvaluateJsonRequest):
    """Parse traces and an optional eval set from an EvaluateJsonRequest.

    Args:
        request: Request body carrying the OTLP JSON payload in ``traces``
            and, optionally, an eval set dict in ``eval_set``.

    Returns:
        A ``(traces, eval_set)`` tuple. ``eval_set`` is ``None`` when the
        request carries no (or an empty) eval set.

    Raises:
        HTTPException: 400 when the OTLP payload is rejected or malformed,
            when it yields no traces, or when the eval set cannot be loaded.
    """
    try:
        traces = OtlpJsonLoader().load_from_dict(request.traces)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except (AttributeError, KeyError, TypeError) as exc:
        # The payload is client-supplied JSON. A wrongly shaped document
        # (e.g. a string where an attribute object is expected) makes the
        # loader fail with one of these; that is a client error, not a 500.
        raise HTTPException(status_code=400, detail=f"Malformed OTLP JSON: {exc}") from exc

    if not traces:
        raise HTTPException(status_code=400, detail="No traces found in OTLP JSON")

    eval_set = None
    if request.eval_set:
        try:
            eval_set = load_eval_set_from_dict(request.eval_set)
        except Exception as exc:
            # Any failure while loading the client-supplied eval set is
            # reported back as a bad request.
            raise HTTPException(status_code=400, detail=f"Invalid eval set: {exc}") from exc

    return traces, eval_set
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
def _check_json_body_size(raw_request: Request):
    """Reject requests whose declared body size exceeds the JSON body limit.

    Only the ``Content-Length`` header is inspected; a request without that
    header is treated as having size 0 and passes the check.

    Args:
        raw_request: The incoming request whose headers are examined.

    Raises:
        HTTPException: 400 when ``Content-Length`` is not a non-negative
            integer, 413 when it exceeds ``_MAX_JSON_BODY_BYTES``.
    """
    # NOTE(review): the route handlers also declare the parsed body model as
    # a parameter, so the body has presumably been read before this check
    # runs -- confirm whether a server/middleware level limit is needed too.
    declared = raw_request.headers.get("content-length", 0)
    try:
        content_length = int(declared)
    except (TypeError, ValueError) as exc:
        # A garbage header is the client's fault; do not let it become a 500.
        raise HTTPException(status_code=400, detail="Invalid Content-Length header") from exc

    if content_length < 0:
        raise HTTPException(status_code=400, detail="Invalid Content-Length header")

    if content_length > _MAX_JSON_BODY_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"Request body exceeds {_MAX_JSON_BODY_BYTES // (1024 * 1024)}MB limit",
        )
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def _sse_error(message: str) -> str:
    """Render *message* as one SSE ``data:`` frame wrapping an ``SSEErrorEvent``."""
    payload = SSEErrorEvent(error=message).model_dump_json(by_alias=True)
    # An SSE frame is terminated by a blank line, hence the double newline.
    return "data: " + payload + "\n\n"
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
@router.post("/evaluate/json", response_model=StandardResponse[RunResult])
async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Request):
    """Evaluate OTLP JSON traces supplied directly in the request body.

    The body size is checked and the payload parsed first; both steps raise
    ``HTTPException`` themselves. Any failure during the evaluation or while
    building the response is logged and reported as a 500.
    """
    _check_json_body_size(raw_request)
    traces, eval_set = _parse_json_request(request)

    try:
        run_result = await run_evaluation_from_traces(
            traces=traces,
            config=request.config,
            eval_set=eval_set,
        )
        body = _camel_keys(run_result.model_dump(by_alias=True))
        return StandardResponse(data=body)
    except Exception as exc:
        logger.exception("JSON evaluation failed")
        detail = f"Internal error: {exc!s}"
        raise HTTPException(status_code=500, detail=detail) from exc
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
@router.post("/evaluate/json/stream")
async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request: Request):
    """Evaluate OTLP JSON traces with real-time progress via SSE.

    The response is a ``text/event-stream`` that carries, in order:

    1. one ``performance_metrics`` event per trace (best effort; a trace
       whose metrics cannot be extracted is logged and skipped),
    2. progress / trace-progress events while the evaluation runs,
    3. a final done event with the full result.

    Invalid input and evaluation failures are reported in-band as an SSE
    error event rather than as an HTTP error status. The exception is the
    body size check, which runs before streaming starts and raises
    ``HTTPException`` directly.
    """
    _check_json_body_size(raw_request)

    async def event_generator():
        try:
            try:
                traces, eval_set = _parse_json_request(request)
            except HTTPException as exc:
                yield _sse_error(exc.detail)
                return

            for trace in traces:
                try:
                    extractor = get_extractor(trace)
                    perf_metrics = _camel_keys(extract_performance_metrics(trace, extractor))
                    trace_metadata = _camel_keys(extract_trace_metadata(trace, extractor))
                    evt = SSEPerformanceMetricsEvent(
                        trace_id=trace.trace_id,
                        performance_metrics=perf_metrics,
                        trace_metadata=trace_metadata,
                    )
                    yield f"event: performance_metrics\ndata: {evt.model_dump_json(by_alias=True)}\n\n"
                except Exception as e:
                    # Early metrics are a convenience; never fail the stream
                    # because one trace could not be summarised.
                    logger.error("Failed to extract early performance metrics: %s", e)

            # Single channel between the background evaluation task and this
            # generator; every item is a ``(tag, payload)`` pair.
            queue: asyncio.Queue = asyncio.Queue()

            async def progress_callback(message: str):
                await queue.put(("progress", message))

            async def trace_progress_callback(trace_result):
                await queue.put(("trace_progress", trace_result))

            async def run_with_progress():
                try:
                    result = await run_evaluation_from_traces(
                        traces=traces,
                        config=request.config,
                        eval_set=eval_set,
                        progress_callback=progress_callback,
                        trace_progress_callback=trace_progress_callback,
                    )
                except Exception as exc:
                    # Without this the consumer below would wait on
                    # ``queue.get()`` forever after a failed evaluation.
                    await queue.put(("error", exc))
                else:
                    await queue.put(("done", result))

            eval_task = asyncio.create_task(run_with_progress())

            try:
                while True:
                    tag, payload = await queue.get()

                    if tag == "done":
                        evt = SSEDoneEvent(
                            result=_camel_keys(payload.model_dump(by_alias=True)),
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                        break
                    elif tag == "error":
                        # Re-raise in the generator so the outer handler logs
                        # the traceback and emits the SSE error event.
                        raise payload
                    elif tag == "trace_progress":
                        evt = SSETraceProgressEvent(
                            trace_progress=SSETraceProgress(
                                trace_id=payload.trace_id,
                                partial_result=_camel_keys(payload.model_dump(by_alias=True)),
                            )
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                    elif tag == "progress":
                        evt = SSEProgressEvent(message=payload)
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
            finally:
                # Reached on normal completion, on error, and when the
                # generator is closed early; do not leave the task running.
                if not eval_task.done():
                    eval_task.cancel()
                    try:
                        await eval_task
                    except asyncio.CancelledError:
                        pass

        except Exception as exc:
            logger.exception("JSON evaluation stream failed")
            yield _sse_error(str(exc))

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
|
|
@@ -5,7 +5,8 @@ from __future__ import annotations
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Annotated, Any, Literal
|
|
7
7
|
|
|
8
|
-
from pydantic import BaseModel, Field, field_validator
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
9
|
+
from pydantic.alias_generators import to_camel
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class BuiltinMetricDef(BaseModel):
|
|
@@ -99,13 +100,14 @@ CustomEvaluatorDef = Annotated[
|
|
|
99
100
|
]
|
|
100
101
|
|
|
101
102
|
|
|
102
|
-
class
|
|
103
|
-
|
|
103
|
+
class EvalParams(BaseModel):
|
|
104
|
+
"""Evaluation parameters independent of how traces are provided.
|
|
104
105
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
106
|
+
Used by ``run_evaluation_from_traces`` for programmatic / API-driven
|
|
107
|
+
evaluation. ``EvalRunConfig`` inherits from this and adds file I/O fields.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
|
|
109
111
|
|
|
110
112
|
metrics: list[str] = Field(
|
|
111
113
|
default_factory=lambda: ["tool_trajectory_avg_score"],
|
|
@@ -117,11 +119,6 @@ class EvalRunConfig(BaseModel):
|
|
|
117
119
|
description="Custom evaluator definitions.",
|
|
118
120
|
)
|
|
119
121
|
|
|
120
|
-
trace_format: str = Field(
|
|
121
|
-
default="jaeger-json",
|
|
122
|
-
description="Format of the trace files (jaeger-json or otlp-json).",
|
|
123
|
-
)
|
|
124
|
-
|
|
125
122
|
judge_model: str | None = Field(
|
|
126
123
|
default=None,
|
|
127
124
|
description="LLM model for judge-based metrics.",
|
|
@@ -129,7 +126,9 @@ class EvalRunConfig(BaseModel):
|
|
|
129
126
|
|
|
130
127
|
threshold: float | None = Field(
|
|
131
128
|
default=None,
|
|
132
|
-
|
|
129
|
+
ge=0,
|
|
130
|
+
le=1,
|
|
131
|
+
description="Score threshold for pass/fail (0.0 to 1.0).",
|
|
133
132
|
)
|
|
134
133
|
|
|
135
134
|
trajectory_match_type: str | None = Field(
|
|
@@ -145,17 +144,35 @@ class EvalRunConfig(BaseModel):
|
|
|
145
144
|
raise ValueError(f"Invalid trajectory_match_type '{v}'. Valid values: {sorted(valid)}")
|
|
146
145
|
return v.upper() if v is not None else v
|
|
147
146
|
|
|
148
|
-
output_format: str = Field(
|
|
149
|
-
default="table",
|
|
150
|
-
description="Output format: 'table', 'json', or 'summary'.",
|
|
151
|
-
)
|
|
152
|
-
|
|
153
147
|
max_concurrent_traces: int = Field(
|
|
154
148
|
default=10,
|
|
149
|
+
ge=1,
|
|
155
150
|
description="Maximum number of traces to evaluate concurrently.",
|
|
156
151
|
)
|
|
157
152
|
|
|
158
153
|
max_concurrent_evals: int = Field(
|
|
159
154
|
default=5,
|
|
155
|
+
ge=1,
|
|
160
156
|
description="Maximum number of concurrent metric evaluations (LLM API calls).",
|
|
161
157
|
)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class EvalRunConfig(EvalParams):
    """Evaluation settings for runs that read their inputs from files.

    Extends ``EvalParams`` with the trace file paths, an optional eval set
    file path, the trace file format and the output format.
    """

    # No default is given, so this field must always be supplied.
    trace_files: list[str] = Field(
        description="Paths to trace files (Jaeger JSON or OTLP JSON).",
    )

    eval_set_file: str | None = Field(default=None, description="Path to a golden eval set JSON file (ADK EvalSet format).")

    trace_format: str = Field(default="jaeger-json", description="Format of the trace files (jaeger-json or otlp-json).")

    output_format: str = Field(default="table", description="Output format: 'table', 'json', or 'summary'.")
|
|
@@ -56,6 +56,12 @@ class OtlpJsonLoader(TraceLoader):
|
|
|
56
56
|
logger.info("Loaded %d trace(s) from %s", len(traces), source)
|
|
57
57
|
return traces
|
|
58
58
|
|
|
59
|
+
def load_from_dict(self, data: dict) -> list[Trace]:
    """Build traces from an already-parsed OTLP JSON document.

    Args:
        data: OTLP export dict; it must contain a ``resourceSpans`` key.

    Returns:
        Whatever ``_parse_otlp_export`` produces for *data*.

    Raises:
        ValueError: If *data* has no ``resourceSpans`` key.
    """
    if "resourceSpans" in data:
        return self._parse_otlp_export(data)
    raise ValueError("Expected OTLP JSON with 'resourceSpans' key")
|
|
64
|
+
|
|
59
65
|
def _parse_otlp_export(self, data: dict) -> list[Trace]:
|
|
60
66
|
"""Parse full OTLP export structure with resourceSpans."""
|
|
61
67
|
all_spans = []
|
|
@@ -122,23 +128,40 @@ class OtlpJsonLoader(TraceLoader):
|
|
|
122
128
|
Some SDKs (e.g. Strands) store message content in span events rather
|
|
123
129
|
than span attributes. This promotes those values so the converter can
|
|
124
130
|
find them via normal attribute lookups.
|
|
131
|
+
|
|
132
|
+
Accepts events in OTLP array format or flat/nested dict format.
|
|
125
133
|
"""
|
|
126
134
|
for event in span_data.get("events", []):
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
if
|
|
132
|
-
attributes[key] =
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
135
|
+
event_attrs = event.get("attributes", [])
|
|
136
|
+
if isinstance(event_attrs, dict):
|
|
137
|
+
flat = self._flatten_nested_dict(event_attrs)
|
|
138
|
+
for key in self._GENAI_EVENT_KEYS:
|
|
139
|
+
if key in flat and key not in attributes:
|
|
140
|
+
attributes[key] = flat[key]
|
|
141
|
+
else:
|
|
142
|
+
for attr in event_attrs:
|
|
143
|
+
key = attr.get("key", "")
|
|
144
|
+
if key in self._GENAI_EVENT_KEYS and key not in attributes:
|
|
145
|
+
value_obj = attr.get("value", {})
|
|
146
|
+
if "stringValue" in value_obj:
|
|
147
|
+
attributes[key] = value_obj["stringValue"]
|
|
148
|
+
|
|
149
|
+
def _extract_attributes(self, attrs) -> dict:
|
|
150
|
+
"""Convert attributes to a flat ``{key: value}`` dict.
|
|
151
|
+
|
|
152
|
+
Accepts three formats:
|
|
153
|
+
1. OTLP array: ``[{key, value: {stringValue|intValue|...}}]``
|
|
154
|
+
2. Flat dict: ``{"gen_ai.operation.name": "chat"}``
|
|
155
|
+
3. Nested dict (ClickHouse JSON column): ``{"gen_ai": {"operation": {"name": "chat"}}}``
|
|
156
|
+
|
|
157
|
+
Formats 2 and 3 are auto-detected by checking whether *attrs* is a dict.
|
|
158
|
+
Nested dicts are recursively flattened to dot-notation keys.
|
|
139
159
|
"""
|
|
160
|
+
if isinstance(attrs, dict):
|
|
161
|
+
return self._flatten_nested_dict(attrs)
|
|
162
|
+
|
|
140
163
|
result = {}
|
|
141
|
-
for attr in
|
|
164
|
+
for attr in attrs:
|
|
142
165
|
key = attr.get("key", "")
|
|
143
166
|
value_obj = attr.get("value", {})
|
|
144
167
|
|
|
@@ -157,6 +180,25 @@ class OtlpJsonLoader(TraceLoader):
|
|
|
157
180
|
|
|
158
181
|
return result
|
|
159
182
|
|
|
183
|
+
@staticmethod
|
|
184
|
+
def _flatten_nested_dict(d: dict, prefix: str = "") -> dict:
|
|
185
|
+
"""Recursively flatten a nested dict to dot-notation keys.
|
|
186
|
+
|
|
187
|
+
``{"gen_ai": {"operation": {"name": "chat"}}}``
|
|
188
|
+
becomes ``{"gen_ai.operation.name": "chat"}``.
|
|
189
|
+
|
|
190
|
+
Already-flat keys (e.g. ``{"service.name": "agent"}``) pass through
|
|
191
|
+
unchanged.
|
|
192
|
+
"""
|
|
193
|
+
result = {}
|
|
194
|
+
for key, value in d.items():
|
|
195
|
+
full_key = f"{prefix}{key}" if not prefix else f"{prefix}.{key}"
|
|
196
|
+
if isinstance(value, dict):
|
|
197
|
+
result.update(OtlpJsonLoader._flatten_nested_dict(value, full_key))
|
|
198
|
+
else:
|
|
199
|
+
result[full_key] = value
|
|
200
|
+
return result
|
|
201
|
+
|
|
160
202
|
def _build_traces(self, all_spans: list[Span]) -> list[Trace]:
|
|
161
203
|
"""Group spans by trace_id and build parent-child relationships."""
|
|
162
204
|
traces_by_id: dict[str, list[Span]] = {}
|