agentevals-cli 0.9.0__tar.gz → 0.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/PKG-INFO +1 -1
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/custom-evaluators.md +20 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_evaluators/eval_config.yaml +1 -0
- agentevals_cli-0.9.1/examples/custom_evaluators/eval_config_openai_eval.yaml +18 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/pyproject.toml +1 -1
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/config.py +15 -7
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/converter.py +19 -15
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/extraction.py +38 -8
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/openai_eval_backend.py +40 -19
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_converter.py +131 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_extraction.py +50 -0
- agentevals_cli-0.9.1/tests/test_openai_eval_backend.py +116 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/uv.lock +1 -1
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/eval/SKILL.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/eval/evals/evals.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/inspect/SKILL.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.claude/skills/inspect/evals/evals.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.dockerignore +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/workflows/ci.yml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/workflows/publish-evaluator-sdk.yml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.github/workflows/release.yml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.gitignore +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/.mcp.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/CONTRIBUTING.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/DEVELOPMENT.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/Dockerfile +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/LICENSE +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/Makefile +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/Chart.yaml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/NOTES.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/_helpers.tpl +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/deployment.yaml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/postgresql.yaml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/service.yaml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/templates/serviceaccount.yaml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/charts/agentevals/values.yaml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/assets/logo-color-on-transparent.svg +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/assets/logo-color.png +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/assets/logo-dark-on-transparent.svg +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/eval-set-format.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/otel-compatibility.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/docs/streaming.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_evaluators/response_quality.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_evaluators/tool_call_checker.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/agentevals_example_custom_sink/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/agentevals_example_custom_sink/sink.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/custom_sink/pyproject.toml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/agent.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/main.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/dice_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/kubernetes/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/agent.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/main.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/langchain_agent/test_streaming.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/async_example.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/context_manager_example.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/decorator_example.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/sdk_example/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/agent.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/eval_set.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/main.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/strands_agent/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/adk/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/adk/run.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/langchain/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/langchain/run.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/ollama/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/ollama/run.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/openai-agents/run.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/strands/requirements.txt +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/examples/zero-code-examples/strands/run.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/flake.lock +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/flake.nix +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/pyproject.toml +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/eval_set_helm.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/evalset_helm_3_2026-02-23.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/evalset_k8s_2026-02-20.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/helm.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/helm_2.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/helm_3.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/k8s.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/samples/tempo_export_with_batches.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_protocol.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/assets/index-f8LUVQc3.js +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/index.html +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/logo.svg +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/_static/vite.svg +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/app.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/debug_routes.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/dependencies.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/models.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_app.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_grpc.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_processing.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/otlp_routes.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/routes.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/runs_routes.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/api/streaming_routes.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/builtin_metrics.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/cli.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/custom_evaluators.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/eval_config_loader.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/resolver.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/sources.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/templates.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/evaluator/venv.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/genai_converter.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/auto.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/base.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/jaeger.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/loader/otlp.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/mcp_server.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/output.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/fetcher.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/result_builder.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/service.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/sinks.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/run/worker.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/runner.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/sdk.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/config.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/models.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/migrator.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/postgres/pool.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/memory.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/storage/repos/postgres.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/incremental_processor.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/processor.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/session.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/streaming/ws_server.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/trace_attrs.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/trace_metrics.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/genai_messages.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/log_buffer.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/src/agentevals/utils/log_enrichment.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/api/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/api/test_evaluate_persistence.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/api/test_runs_routes.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/conftest.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_evaluation_pipeline.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_live_agents.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_otlp_grpc_receiver.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_session_grouping.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/integration/test_timing_stress.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_fetcher.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_result_builder.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_service.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/run/test_sinks.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/__init__.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_config.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_memory_repos.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_migrator.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/storage/test_models.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_api.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_cli.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_eval_config_loader.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_genai_converter.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_jaeger_loader.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_loader_auto.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_log_enrichment.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_mcp_server.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_otlp_loader.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_otlp_receiver.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_output.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_protocol.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_runner.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_sdk.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/tests/test_trace_metrics.py +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/.gitignore +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/README.md +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/eslint.config.js +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/index.html +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/package-lock.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/package.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/public/logo.svg +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/public/vite.svg +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/App.css +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/App.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/api/client.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/assets/react.svg +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/BuilderHeader.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/BuilderView.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/EvalCasesList.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/InvocationEditor.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/JsonPreview.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/MetadataEditor.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/builder/index.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/DashboardView.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/TraceCard.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/dashboard/TraceTable.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/DataSection.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InspectorView.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InvocationCard.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/ToolCallList.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/sidebar/Sidebar.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveMessage.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/SessionCard.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/FileDropZone.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/MetricSelector.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/upload/UploadView.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/components/welcome/WelcomeView.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/config.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/context/TraceContext.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/context/TraceProvider.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/index.css +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/console-capture.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/eval-config.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/evalset-builder.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/network-capture.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-helpers.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-loader.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-metadata.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/trace-patcher.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/types.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/lib/utils.ts +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/src/main.tsx +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/tsconfig.app.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/tsconfig.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/tsconfig.node.json +0 -0
- {agentevals_cli-0.9.0 → agentevals_cli-0.9.1}/ui/vite.config.ts +0 -0
|
@@ -317,6 +317,26 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
|
|
|
317
317
|
| `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
|
|
318
318
|
| `rouge_l` | Longest common subsequence overlap (F-measure) |
|
|
319
319
|
|
|
320
|
+
### Label Model Grader
|
|
321
|
+
|
|
322
|
+
Scores responses without a golden set. The model reads each response and assigns a label from a fixed list. Passing labels are defined in the config.
|
|
323
|
+
|
|
324
|
+
```yaml
|
|
325
|
+
evaluators:
|
|
326
|
+
- name: quality_check
|
|
327
|
+
type: openai_eval
|
|
328
|
+
grader:
|
|
329
|
+
type: label_model
|
|
330
|
+
model: gpt-4o-mini
|
|
331
|
+
input:
|
|
332
|
+
- role: user
|
|
333
|
+
content: "Rate this response: {{ item.actual_response }}"
|
|
334
|
+
labels: [good, bad]
|
|
335
|
+
passing_labels: [good]
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
The `threshold` field is not used for `label_model`. A response passes if its assigned label is in `passing_labels`.
|
|
339
|
+
|
|
320
340
|
### How it works
|
|
321
341
|
|
|
322
342
|
Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Eval config using OpenAI Evals API graders.
|
|
2
|
+
# Requires OPENAI_API_KEY to be set.
|
|
3
|
+
#
|
|
4
|
+
# Run with:
|
|
5
|
+
# agentevals run samples/helm.json \
|
|
6
|
+
# --config examples/custom_evaluators/eval_config_openai_eval.yaml
|
|
7
|
+
|
|
8
|
+
evaluators:
|
|
9
|
+
- name: quality_check
|
|
10
|
+
type: openai_eval
|
|
11
|
+
grader:
|
|
12
|
+
type: label_model
|
|
13
|
+
model: gpt-4o-mini
|
|
14
|
+
input:
|
|
15
|
+
- role: user
|
|
16
|
+
content: "Rate this response: {{ item.actual_response }}"
|
|
17
|
+
labels: [good, bad]
|
|
18
|
+
passing_labels: [good]
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "agentevals-cli"
|
|
7
|
-
version = "0.9.0"
|
|
7
|
+
version = "0.9.1"
|
|
8
8
|
description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
@@ -100,13 +100,21 @@ class OpenAIEvalDef(BaseModel):
|
|
|
100
100
|
@classmethod
|
|
101
101
|
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
|
|
102
102
|
grader_type = v.get("type")
|
|
103
|
-
if grader_type
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
103
|
+
if grader_type == "text_similarity":
|
|
104
|
+
metric = v.get("evaluation_metric")
|
|
105
|
+
if not metric:
|
|
106
|
+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
|
|
107
|
+
if metric not in _VALID_SIMILARITY_METRICS:
|
|
108
|
+
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
|
|
109
|
+
elif grader_type == "label_model":
|
|
110
|
+
for field in ("model", "input", "labels", "passing_labels"):
|
|
111
|
+
if not v.get(field):
|
|
112
|
+
raise ValueError(f"'{field}' is required for label_model grader")
|
|
113
|
+
invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]]
|
|
114
|
+
if invalid:
|
|
115
|
+
raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}")
|
|
116
|
+
else:
|
|
117
|
+
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
|
|
110
118
|
return v
|
|
111
119
|
|
|
112
120
|
|
|
@@ -23,6 +23,7 @@ from .extraction import (
|
|
|
23
23
|
extract_tool_call_from_span,
|
|
24
24
|
extract_tool_result_from_span,
|
|
25
25
|
extract_user_text_from_attrs,
|
|
26
|
+
find_adk_llm_spans_in,
|
|
26
27
|
get_extractor,
|
|
27
28
|
has_adk_descendant,
|
|
28
29
|
is_adk_scope,
|
|
@@ -127,15 +128,18 @@ def _find_adk_spans(trace: Trace, operation: str) -> list[Span]:
|
|
|
127
128
|
|
|
128
129
|
|
|
129
130
|
def _convert_invoke_span(invoke_span: Span) -> Invocation:
|
|
130
|
-
|
|
131
|
-
if not
|
|
132
|
-
raise ValueError(
|
|
131
|
+
llm_spans = find_adk_llm_spans_in(invoke_span)
|
|
132
|
+
if not llm_spans:
|
|
133
|
+
raise ValueError(
|
|
134
|
+
f"invoke_agent span {invoke_span.span_id} has no converter-compatible ADK LLM descendants; "
|
|
135
|
+
"expected call_llm or ADK generate_content spans"
|
|
136
|
+
)
|
|
133
137
|
|
|
134
138
|
tool_spans = _find_children_by_op(invoke_span, "execute_tool")
|
|
135
139
|
|
|
136
|
-
user_content = _extract_user_content(
|
|
137
|
-
final_response = _extract_final_response(
|
|
138
|
-
tool_uses, tool_responses = _extract_tool_trajectory(
|
|
140
|
+
user_content = _extract_user_content(llm_spans[0])
|
|
141
|
+
final_response = _extract_final_response(llm_spans[-1])
|
|
142
|
+
tool_uses, tool_responses = _extract_tool_trajectory(llm_spans, tool_spans)
|
|
139
143
|
|
|
140
144
|
intermediate_data = IntermediateData(
|
|
141
145
|
tool_uses=tool_uses,
|
|
@@ -177,7 +181,7 @@ def _extract_user_content(first_call_llm: Span) -> genai_types.Content:
|
|
|
177
181
|
)
|
|
178
182
|
llm_request_raw = first_call_llm.get_tag(ADK_LLM_REQUEST, "{}")
|
|
179
183
|
llm_request = parse_json(llm_request_raw)
|
|
180
|
-
for content_dict in llm_request.get("contents", []):
|
|
184
|
+
for content_dict in llm_request.get("contents", llm_request.get("Contents", [])):
|
|
181
185
|
if content_dict.get("role") == "user":
|
|
182
186
|
return _content_from_dict(content_dict)
|
|
183
187
|
raise ValueError(f"call_llm span {first_call_llm.span_id}: no user content found in llm_request")
|
|
@@ -193,7 +197,7 @@ def _extract_final_response(last_call_llm: Span) -> genai_types.Content:
|
|
|
193
197
|
)
|
|
194
198
|
llm_response_raw = last_call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
|
|
195
199
|
llm_response = parse_json(llm_response_raw)
|
|
196
|
-
content_dict = llm_response.get("content", {})
|
|
200
|
+
content_dict = llm_response.get("content", llm_response.get("Content", {}))
|
|
197
201
|
if not content_dict:
|
|
198
202
|
raise ValueError(f"call_llm span {last_call_llm.span_id}: no content in llm_response")
|
|
199
203
|
logger.warning(
|
|
@@ -263,12 +267,12 @@ def _extract_function_calls_from_llm_response(
|
|
|
263
267
|
llm_response_raw = call_llm.get_tag(ADK_LLM_RESPONSE, "{}")
|
|
264
268
|
llm_response = parse_json(llm_response_raw)
|
|
265
269
|
|
|
266
|
-
content_dict = llm_response.get("content", {})
|
|
270
|
+
content_dict = llm_response.get("content", llm_response.get("Content", {}))
|
|
267
271
|
parts = content_dict.get("parts", [])
|
|
268
272
|
|
|
269
273
|
calls = []
|
|
270
274
|
for part in parts:
|
|
271
|
-
fc_dict = part.get("function_call")
|
|
275
|
+
fc_dict = part.get("function_call", part.get("functionCall"))
|
|
272
276
|
if fc_dict:
|
|
273
277
|
calls.append(
|
|
274
278
|
genai_types.FunctionCall(
|
|
@@ -288,9 +292,9 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
|
|
|
288
292
|
parts: list[genai_types.Part] = []
|
|
289
293
|
for p in parts_dicts:
|
|
290
294
|
if "text" in p:
|
|
291
|
-
parts.append(genai_types.Part(text=p
|
|
292
|
-
elif "function_call" in p:
|
|
293
|
-
fc = p
|
|
295
|
+
parts.append(genai_types.Part(text=p.get("text")))
|
|
296
|
+
elif "function_call" in p or "functionCall" in p:
|
|
297
|
+
fc = p.get("function_call", p.get("functionCall"))
|
|
294
298
|
parts.append(
|
|
295
299
|
genai_types.Part(
|
|
296
300
|
function_call=genai_types.FunctionCall(
|
|
@@ -300,8 +304,8 @@ def _content_from_dict(content_dict: dict[str, Any]) -> genai_types.Content:
|
|
|
300
304
|
)
|
|
301
305
|
)
|
|
302
306
|
)
|
|
303
|
-
elif "function_response" in p:
|
|
304
|
-
fr = p
|
|
307
|
+
elif "function_response" in p or "functionResponse" in p:
|
|
308
|
+
fr = p.get("function_response", p.get("functionResponse"))
|
|
305
309
|
parts.append(
|
|
306
310
|
genai_types.Part(
|
|
307
311
|
function_response=genai_types.FunctionResponse(
|
|
@@ -69,14 +69,15 @@ def extract_user_text_from_attrs(attrs: dict[str, Any]) -> str | None:
|
|
|
69
69
|
if llm_request_raw:
|
|
70
70
|
llm_request = parse_json(llm_request_raw)
|
|
71
71
|
if isinstance(llm_request, dict):
|
|
72
|
-
|
|
72
|
+
contents = llm_request.get("contents", llm_request.get("Contents", []))
|
|
73
|
+
for content_dict in reversed(contents):
|
|
73
74
|
if content_dict.get("role") != "user":
|
|
74
75
|
continue
|
|
75
76
|
parts = content_dict.get("parts", [])
|
|
76
77
|
text_parts = [p for p in parts if "text" in p]
|
|
77
78
|
if text_parts:
|
|
78
79
|
return " ".join(p["text"] for p in text_parts)
|
|
79
|
-
for content_dict in
|
|
80
|
+
for content_dict in contents:
|
|
80
81
|
if content_dict.get("role") == "user":
|
|
81
82
|
parts = content_dict.get("parts", [])
|
|
82
83
|
if parts:
|
|
@@ -101,7 +102,7 @@ def extract_agent_response_from_attrs(attrs: dict[str, Any]) -> str | None:
|
|
|
101
102
|
if llm_response_raw:
|
|
102
103
|
llm_response = parse_json(llm_response_raw)
|
|
103
104
|
if isinstance(llm_response, dict):
|
|
104
|
-
content_dict = llm_response.get("content", {})
|
|
105
|
+
content_dict = llm_response.get("content", llm_response.get("Content", {}))
|
|
105
106
|
if content_dict:
|
|
106
107
|
parts_dicts = content_dict.get("parts", [])
|
|
107
108
|
text_parts = [p for p in parts_dicts if "text" in p]
|
|
@@ -392,6 +393,38 @@ def is_adk_scope(span: Span) -> bool:
|
|
|
392
393
|
return False
|
|
393
394
|
|
|
394
395
|
|
|
396
|
+
def is_adk_generate_content_llm_span(span: Span) -> bool:
|
|
397
|
+
if not (span.operation_name.startswith("generate_content") or span.get_tag(OTEL_GENAI_OP) == "generate_content"):
|
|
398
|
+
return False
|
|
399
|
+
return bool(span.get_tag(ADK_LLM_REQUEST) or span.get_tag(ADK_LLM_RESPONSE))
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def is_adk_llm_span(span: Span) -> bool:
|
|
403
|
+
return span.operation_name.startswith("call_llm") or is_adk_generate_content_llm_span(span)
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def find_adk_llm_spans_in(root: Span) -> list[Span]:
|
|
407
|
+
call_llm_spans: list[Span] = []
|
|
408
|
+
generate_content_spans: list[Span] = []
|
|
409
|
+
|
|
410
|
+
def collect(span: Span) -> None:
|
|
411
|
+
if span.operation_name.startswith("call_llm"):
|
|
412
|
+
call_llm_spans.append(span)
|
|
413
|
+
elif is_adk_generate_content_llm_span(span):
|
|
414
|
+
generate_content_spans.append(span)
|
|
415
|
+
|
|
416
|
+
_walk_descendants(root, collect)
|
|
417
|
+
call_llm_spans.sort(key=lambda s: s.start_time)
|
|
418
|
+
generate_content_spans.sort(key=lambda s: s.start_time)
|
|
419
|
+
return call_llm_spans or generate_content_spans
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _walk_descendants(span: Span, visit) -> None:
|
|
423
|
+
for child in span.children:
|
|
424
|
+
visit(child)
|
|
425
|
+
_walk_descendants(child, visit)
|
|
426
|
+
|
|
427
|
+
|
|
395
428
|
def is_llm_span(span: Span) -> bool:
|
|
396
429
|
return span.get_tag(OTEL_GENAI_REQUEST_MODEL) is not None
|
|
397
430
|
|
|
@@ -477,10 +510,7 @@ class AdkExtractor:
|
|
|
477
510
|
return matches
|
|
478
511
|
|
|
479
512
|
def find_llm_spans_in(self, root: Span) -> list[Span]:
|
|
480
|
-
|
|
481
|
-
self._walk(root, lambda s: s.operation_name.startswith("call_llm"), results)
|
|
482
|
-
results.sort(key=lambda s: s.start_time)
|
|
483
|
-
return results
|
|
513
|
+
return find_adk_llm_spans_in(root)
|
|
484
514
|
|
|
485
515
|
def find_tool_spans_in(self, root: Span) -> list[Span]:
|
|
486
516
|
results: list[Span] = []
|
|
@@ -493,7 +523,7 @@ class AdkExtractor:
|
|
|
493
523
|
return None
|
|
494
524
|
if span.operation_name.startswith("invoke_agent"):
|
|
495
525
|
return "invocation"
|
|
496
|
-
if span
|
|
526
|
+
if is_adk_llm_span(span):
|
|
497
527
|
return "llm"
|
|
498
528
|
if span.operation_name.startswith("execute_tool"):
|
|
499
529
|
return "tool"
|
|
@@ -31,6 +31,12 @@ _TEXT_PAIR_SCHEMA = {
|
|
|
31
31
|
"required": ["actual_response", "expected_response"],
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
+
_ACTUAL_ONLY_SCHEMA = {
|
|
35
|
+
"type": "object",
|
|
36
|
+
"properties": {"actual_response": {"type": "string"}},
|
|
37
|
+
"required": ["actual_response"],
|
|
38
|
+
}
|
|
39
|
+
|
|
34
40
|
|
|
35
41
|
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
|
|
36
42
|
"""Build the OpenAI testing_criteria dict from the evaluator config.
|
|
@@ -51,28 +57,33 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
|
|
|
51
57
|
"pass_threshold": evaluator_def.threshold,
|
|
52
58
|
}
|
|
53
59
|
|
|
60
|
+
if grader_type == "label_model":
|
|
61
|
+
return {
|
|
62
|
+
"type": "label_model",
|
|
63
|
+
"name": evaluator_def.name,
|
|
64
|
+
"model": grader["model"],
|
|
65
|
+
"input": grader["input"],
|
|
66
|
+
"labels": grader["labels"],
|
|
67
|
+
"passing_labels": grader["passing_labels"],
|
|
68
|
+
}
|
|
69
|
+
|
|
54
70
|
raise ValueError(f"Unsupported grader type: {grader_type}")
|
|
55
71
|
|
|
56
72
|
|
|
57
73
|
def _build_jsonl_items(
|
|
58
74
|
actual_invocations: list[Invocation],
|
|
59
75
|
expected_invocations: list[Invocation],
|
|
76
|
+
include_expected: bool = True,
|
|
60
77
|
) -> list[dict[str, Any]]:
|
|
61
78
|
items = []
|
|
62
79
|
for i, actual_inv in enumerate(actual_invocations):
|
|
63
|
-
|
|
64
|
-
if
|
|
65
|
-
expected_text =
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
"item": {
|
|
71
|
-
"actual_response": actual_text,
|
|
72
|
-
"expected_response": expected_text,
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
)
|
|
80
|
+
entry: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
|
|
81
|
+
if include_expected:
|
|
82
|
+
expected_text = (
|
|
83
|
+
_content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
|
|
84
|
+
)
|
|
85
|
+
entry["expected_response"] = expected_text
|
|
86
|
+
items.append({"item": entry})
|
|
76
87
|
return items
|
|
77
88
|
|
|
78
89
|
|
|
@@ -111,13 +122,17 @@ async def evaluate_openai_eval(
|
|
|
111
122
|
error="OPENAI_API_KEY environment variable is not set.",
|
|
112
123
|
)
|
|
113
124
|
|
|
114
|
-
|
|
125
|
+
grader_type = evaluator_def.grader["type"]
|
|
126
|
+
|
|
127
|
+
if grader_type == "text_similarity" and expected_invocations is None:
|
|
115
128
|
return MetricResult(
|
|
116
129
|
metric_name=evaluator_def.name,
|
|
117
130
|
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
|
|
118
131
|
)
|
|
119
132
|
|
|
120
|
-
items = _build_jsonl_items(
|
|
133
|
+
items = _build_jsonl_items(
|
|
134
|
+
actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model")
|
|
135
|
+
)
|
|
121
136
|
if not items:
|
|
122
137
|
return MetricResult(
|
|
123
138
|
metric_name=evaluator_def.name,
|
|
@@ -130,12 +145,13 @@ async def evaluate_openai_eval(
|
|
|
130
145
|
try:
|
|
131
146
|
client = await asyncio.to_thread(_get_openai_client)
|
|
132
147
|
|
|
148
|
+
item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
|
|
133
149
|
eval_obj = await asyncio.to_thread(
|
|
134
150
|
client.evals.create,
|
|
135
|
-
name=f"agentevals-{evaluator_def.name}",
|
|
151
|
+
name=f"agentevals-openai-{evaluator_def.name}",
|
|
136
152
|
data_source_config={
|
|
137
153
|
"type": "custom",
|
|
138
|
-
"item_schema":
|
|
154
|
+
"item_schema": item_schema,
|
|
139
155
|
"include_sample_schema": False,
|
|
140
156
|
},
|
|
141
157
|
testing_criteria=[testing_criteria],
|
|
@@ -146,7 +162,7 @@ async def evaluate_openai_eval(
|
|
|
146
162
|
run = await asyncio.to_thread(
|
|
147
163
|
client.evals.runs.create,
|
|
148
164
|
eval_id=eval_id,
|
|
149
|
-
name=f"agentevals-run-{evaluator_def.name}",
|
|
165
|
+
name=f"agentevals-openai-run-{evaluator_def.name}",
|
|
150
166
|
data_source={
|
|
151
167
|
"type": "jsonl",
|
|
152
168
|
"source": {
|
|
@@ -225,12 +241,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
|
|
|
225
241
|
total = result_counts.total if result_counts else 0
|
|
226
242
|
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
|
|
227
243
|
|
|
244
|
+
grader = evaluator_def.grader
|
|
228
245
|
details: dict[str, Any] = {
|
|
229
246
|
"openai_eval_id": eval_id,
|
|
230
247
|
"openai_run_id": run_id,
|
|
231
|
-
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
|
|
232
248
|
"result_counts": {"passed": passed, "failed": failed, "total": total},
|
|
233
249
|
}
|
|
250
|
+
if grader["type"] == "text_similarity":
|
|
251
|
+
details["evaluation_metric"] = grader.get("evaluation_metric")
|
|
252
|
+
elif grader["type"] == "label_model":
|
|
253
|
+
details["model"] = grader.get("model")
|
|
254
|
+
details["passing_labels"] = grader.get("passing_labels")
|
|
234
255
|
per_criteria = getattr(run, "per_testing_criteria_results", None)
|
|
235
256
|
if per_criteria:
|
|
236
257
|
details["per_testing_criteria"] = [
|
|
@@ -186,6 +186,108 @@ class TestConverter:
|
|
|
186
186
|
assert len(results) == 2
|
|
187
187
|
assert all(r.trace_id == "t1" for r in results)
|
|
188
188
|
|
|
189
|
+
def test_convert_adk_generate_content_llm_spans(self):
|
|
190
|
+
invoke = Span(
|
|
191
|
+
trace_id="t-gc",
|
|
192
|
+
span_id="invoke1",
|
|
193
|
+
parent_span_id=None,
|
|
194
|
+
operation_name="invoke_agent query_agent",
|
|
195
|
+
start_time=1000,
|
|
196
|
+
duration=10000,
|
|
197
|
+
tags={"gen_ai.operation.name": "invoke_agent"},
|
|
198
|
+
)
|
|
199
|
+
llm_1 = Span(
|
|
200
|
+
trace_id="t-gc",
|
|
201
|
+
span_id="llm1",
|
|
202
|
+
parent_span_id="invoke1",
|
|
203
|
+
operation_name="generate_content mockllm-deterministic",
|
|
204
|
+
start_time=2000,
|
|
205
|
+
duration=1000,
|
|
206
|
+
tags={
|
|
207
|
+
"gen_ai.operation.name": "generate_content",
|
|
208
|
+
"gcp.vertex.agent.llm_request": json.dumps(
|
|
209
|
+
{"Contents": [{"role": "user", "parts": [{"text": "inspect pods"}]}]}
|
|
210
|
+
),
|
|
211
|
+
"gcp.vertex.agent.llm_response": json.dumps(
|
|
212
|
+
{"Content": {"role": "model", "parts": [{"text": "Calling tools."}]}}
|
|
213
|
+
),
|
|
214
|
+
},
|
|
215
|
+
)
|
|
216
|
+
tool_1 = Span(
|
|
217
|
+
trace_id="t-gc",
|
|
218
|
+
span_id="tool1",
|
|
219
|
+
parent_span_id="invoke1",
|
|
220
|
+
operation_name="execute_tool list_pods",
|
|
221
|
+
start_time=3000,
|
|
222
|
+
duration=500,
|
|
223
|
+
tags={
|
|
224
|
+
"gen_ai.tool.name": "list_pods",
|
|
225
|
+
"gen_ai.tool.call.id": "call_1",
|
|
226
|
+
"gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
|
|
227
|
+
"gcp.vertex.agent.tool_response": json.dumps({"pods": []}),
|
|
228
|
+
},
|
|
229
|
+
)
|
|
230
|
+
llm_2 = Span(
|
|
231
|
+
trace_id="t-gc",
|
|
232
|
+
span_id="llm2",
|
|
233
|
+
parent_span_id="invoke1",
|
|
234
|
+
operation_name="generate_content mockllm-deterministic",
|
|
235
|
+
start_time=4000,
|
|
236
|
+
duration=1000,
|
|
237
|
+
tags={
|
|
238
|
+
"gen_ai.operation.name": "generate_content",
|
|
239
|
+
"gcp.vertex.agent.llm_request": json.dumps({"contents": []}),
|
|
240
|
+
"gcp.vertex.agent.llm_response": json.dumps(
|
|
241
|
+
{
|
|
242
|
+
"Content": {
|
|
243
|
+
"role": "model",
|
|
244
|
+
"parts": [
|
|
245
|
+
{
|
|
246
|
+
"functionCall": {
|
|
247
|
+
"name": "summarize_pods",
|
|
248
|
+
"args": {"namespace": "default"},
|
|
249
|
+
"id": "call_final",
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
],
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
),
|
|
256
|
+
},
|
|
257
|
+
)
|
|
258
|
+
tool_2 = Span(
|
|
259
|
+
trace_id="t-gc",
|
|
260
|
+
span_id="tool2",
|
|
261
|
+
parent_span_id="invoke1",
|
|
262
|
+
operation_name="execute_tool get_events",
|
|
263
|
+
start_time=5000,
|
|
264
|
+
duration=500,
|
|
265
|
+
tags={
|
|
266
|
+
"gen_ai.tool.name": "get_events",
|
|
267
|
+
"gen_ai.tool.call.id": "call_2",
|
|
268
|
+
"gcp.vertex.agent.tool_call_args": json.dumps({"namespace": "default"}),
|
|
269
|
+
"gcp.vertex.agent.tool_response": json.dumps({"events": []}),
|
|
270
|
+
},
|
|
271
|
+
)
|
|
272
|
+
invoke.children.extend([llm_1, tool_1, llm_2, tool_2])
|
|
273
|
+
trace = Trace(
|
|
274
|
+
trace_id="t-gc",
|
|
275
|
+
root_spans=[invoke],
|
|
276
|
+
all_spans=[invoke, llm_1, tool_1, llm_2, tool_2],
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
result = convert_trace(trace)
|
|
280
|
+
|
|
281
|
+
assert result.warnings == []
|
|
282
|
+
assert len(result.invocations) == 1
|
|
283
|
+
inv = result.invocations[0]
|
|
284
|
+
assert inv.user_content.parts[0].text == "inspect pods"
|
|
285
|
+
final_call = inv.final_response.parts[0].function_call
|
|
286
|
+
assert final_call.name == "summarize_pods"
|
|
287
|
+
assert final_call.args == {"namespace": "default"}
|
|
288
|
+
assert final_call.id == "call_final"
|
|
289
|
+
assert [t.name for t in inv.intermediate_data.tool_uses] == ["list_pods", "get_events"]
|
|
290
|
+
|
|
189
291
|
def test_no_invoke_agent_warns(self):
|
|
190
292
|
trace = Trace(
|
|
191
293
|
trace_id="empty",
|
|
@@ -207,6 +309,35 @@ class TestConverter:
|
|
|
207
309
|
assert len(result.warnings) == 1
|
|
208
310
|
assert "no invoke_agent" in result.warnings[0]
|
|
209
311
|
|
|
312
|
+
def test_no_llm_descendants_warns_with_compatible_shapes(self):
|
|
313
|
+
invoke = Span(
|
|
314
|
+
trace_id="no-llm",
|
|
315
|
+
span_id="invoke-no-llm",
|
|
316
|
+
parent_span_id=None,
|
|
317
|
+
operation_name="invoke_agent test_agent",
|
|
318
|
+
start_time=1000,
|
|
319
|
+
duration=1000,
|
|
320
|
+
tags={
|
|
321
|
+
"otel.scope.name": "gcp.vertex.agent",
|
|
322
|
+
"gen_ai.operation.name": "invoke_agent",
|
|
323
|
+
},
|
|
324
|
+
)
|
|
325
|
+
trace = Trace(
|
|
326
|
+
trace_id="no-llm",
|
|
327
|
+
root_spans=[invoke],
|
|
328
|
+
all_spans=[invoke],
|
|
329
|
+
)
|
|
330
|
+
|
|
331
|
+
result = convert_trace(trace)
|
|
332
|
+
|
|
333
|
+
assert result.invocations == []
|
|
334
|
+
assert len(result.warnings) == 1
|
|
335
|
+
warning = result.warnings[0]
|
|
336
|
+
assert "invoke-no-llm" in warning
|
|
337
|
+
assert "no converter-compatible ADK LLM descendants" in warning
|
|
338
|
+
assert "call_llm" in warning
|
|
339
|
+
assert "ADK generate_content" in warning
|
|
340
|
+
|
|
210
341
|
def test_no_tool_spans_fallback_to_llm_response(self):
|
|
211
342
|
"""When no execute_tool spans exist, function_calls should be
|
|
212
343
|
extracted from call_llm responses instead."""
|
|
@@ -107,6 +107,18 @@ class TestExtractUserText:
|
|
|
107
107
|
}
|
|
108
108
|
assert extract_user_text_from_attrs(attrs) == "Second"
|
|
109
109
|
|
|
110
|
+
def test_adk_llm_request_outer_contents_pascalcase(self):
|
|
111
|
+
attrs = {
|
|
112
|
+
ADK_LLM_REQUEST: json.dumps(
|
|
113
|
+
{
|
|
114
|
+
"Contents": [
|
|
115
|
+
{"role": "user", "parts": [{"text": "Outer PascalCase only"}]},
|
|
116
|
+
]
|
|
117
|
+
}
|
|
118
|
+
)
|
|
119
|
+
}
|
|
120
|
+
assert extract_user_text_from_attrs(attrs) == "Outer PascalCase only"
|
|
121
|
+
|
|
110
122
|
def test_genai_content_based(self):
|
|
111
123
|
attrs = {
|
|
112
124
|
OTEL_GENAI_INPUT_MESSAGES: json.dumps(
|
|
@@ -170,6 +182,10 @@ class TestExtractAgentResponse:
|
|
|
170
182
|
attrs = {ADK_LLM_RESPONSE: json.dumps({"content": {"parts": [{"text": "ADK response"}]}})}
|
|
171
183
|
assert extract_agent_response_from_attrs(attrs) == "ADK response"
|
|
172
184
|
|
|
185
|
+
def test_adk_llm_response_outer_content_pascalcase(self):
|
|
186
|
+
attrs = {ADK_LLM_RESPONSE: json.dumps({"Content": {"parts": [{"text": "Outer Content only"}]}})}
|
|
187
|
+
assert extract_agent_response_from_attrs(attrs) == "Outer Content only"
|
|
188
|
+
|
|
173
189
|
def test_genai_content_based(self):
|
|
174
190
|
attrs = {
|
|
175
191
|
OTEL_GENAI_OUTPUT_MESSAGES: json.dumps(
|
|
@@ -519,6 +535,39 @@ class TestAdkExtractorSpanFinding:
|
|
|
519
535
|
ext = AdkExtractor()
|
|
520
536
|
assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
|
|
521
537
|
|
|
538
|
+
def test_find_llm_spans_in_falls_back_to_adk_generate_content(self):
|
|
539
|
+
child_llm = _span(
|
|
540
|
+
op="generate_content mockllm-deterministic",
|
|
541
|
+
tags={ADK_LLM_REQUEST: "{}"},
|
|
542
|
+
span_id="llm1",
|
|
543
|
+
)
|
|
544
|
+
child_tool = _span(op="execute_tool search", span_id="tool1")
|
|
545
|
+
root = _span(op="invoke_agent a", children=[child_llm, child_tool])
|
|
546
|
+
ext = AdkExtractor()
|
|
547
|
+
assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
|
|
548
|
+
|
|
549
|
+
def test_find_llm_spans_in_ignores_provider_generate_content_without_adk_payload(self):
|
|
550
|
+
child_llm = _span(
|
|
551
|
+
op="generate_content gpt-4",
|
|
552
|
+
tags={OTEL_GENAI_REQUEST_MODEL: "gpt-4"},
|
|
553
|
+
span_id="llm1",
|
|
554
|
+
)
|
|
555
|
+
root = _span(op="invoke_agent a", children=[child_llm])
|
|
556
|
+
ext = AdkExtractor()
|
|
557
|
+
assert ext.find_llm_spans_in(root) == []
|
|
558
|
+
|
|
559
|
+
def test_find_llm_spans_in_prefers_call_llm_over_generate_content(self):
|
|
560
|
+
call_llm = _span(op="call_llm gemini", span_id="llm1", start_time=20)
|
|
561
|
+
generate_content = _span(
|
|
562
|
+
op="generate_content gemini",
|
|
563
|
+
tags={ADK_LLM_REQUEST: "{}"},
|
|
564
|
+
span_id="llm2",
|
|
565
|
+
start_time=10,
|
|
566
|
+
)
|
|
567
|
+
root = _span(op="invoke_agent a", children=[generate_content, call_llm])
|
|
568
|
+
ext = AdkExtractor()
|
|
569
|
+
assert [s.span_id for s in ext.find_llm_spans_in(root)] == ["llm1"]
|
|
570
|
+
|
|
522
571
|
def test_find_tool_spans_in(self):
|
|
523
572
|
child_llm = _span(op="call_llm gemini", span_id="llm1")
|
|
524
573
|
child_tool = _span(op="execute_tool search", span_id="tool1")
|
|
@@ -530,6 +579,7 @@ class TestAdkExtractorSpanFinding:
|
|
|
530
579
|
ext = AdkExtractor()
|
|
531
580
|
assert ext.classify_span(_span(op="invoke_agent a", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "invocation"
|
|
532
581
|
assert ext.classify_span(_span(op="call_llm", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "llm"
|
|
582
|
+
assert ext.classify_span(_span(op="generate_content", tags={ADK_LLM_REQUEST: "{}"})) == "llm"
|
|
533
583
|
assert ext.classify_span(_span(op="execute_tool x", tags={OTEL_SCOPE: ADK_SCOPE_VALUE})) == "tool"
|
|
534
584
|
assert ext.classify_span(_span(op="random")) is None
|
|
535
585
|
|