agentevals-cli 0.8.1__tar.gz → 0.8.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/PKG-INFO +1 -1
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/README.md +6 -0
- agentevals_cli-0.8.2/examples/custom_sink/README.md +80 -0
- agentevals_cli-0.8.2/examples/custom_sink/agentevals_example_custom_sink/__init__.py +1 -0
- agentevals_cli-0.8.2/examples/custom_sink/agentevals_example_custom_sink/sink.py +71 -0
- agentevals_cli-0.8.2/examples/custom_sink/pyproject.toml +19 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/pyproject.toml +1 -1
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/app.py +2 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/run/__init__.py +1 -1
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/run/sinks.py +110 -26
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/run/test_sinks.py +180 -1
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/uv.lock +1 -1
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.claude/skills/eval/SKILL.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.claude/skills/eval/evals/evals.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.claude/skills/inspect/SKILL.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.claude/skills/inspect/evals/evals.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.dockerignore +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.github/workflows/ci.yml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.github/workflows/publish-evaluator-sdk.yml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.github/workflows/release.yml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.gitignore +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/.mcp.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/CONTRIBUTING.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/DEVELOPMENT.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/Dockerfile +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/LICENSE +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/Makefile +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/README.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/Chart.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/templates/NOTES.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/templates/_helpers.tpl +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/templates/deployment.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/templates/postgresql-secret.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/templates/postgresql.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/templates/service.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/templates/serviceaccount.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/charts/agentevals/values.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/docs/assets/logo-color-on-transparent.svg +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/docs/assets/logo-color.png +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/docs/assets/logo-dark-on-transparent.svg +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/docs/custom-evaluators.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/docs/eval-set-format.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/docs/otel-compatibility.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/docs/streaming.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/custom_evaluators/eval_config.yaml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/custom_evaluators/response_quality.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/custom_evaluators/tool_call_checker.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/dice_agent/README.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/dice_agent/agent.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/dice_agent/eval_set.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/dice_agent/main.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/dice_agent/test_streaming.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/kubernetes/README.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/langchain_agent/README.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/langchain_agent/agent.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/langchain_agent/eval_set.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/langchain_agent/main.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/langchain_agent/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/langchain_agent/test_streaming.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/sdk_example/async_example.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/sdk_example/context_manager_example.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/sdk_example/decorator_example.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/sdk_example/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/strands_agent/agent.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/strands_agent/eval_set.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/strands_agent/main.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/strands_agent/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/adk/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/adk/run.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/langchain/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/langchain/run.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/ollama/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/ollama/run.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/openai-agents/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/openai-agents/run.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/pydantic-ai/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/pydantic-ai/run.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/strands/requirements.txt +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/examples/zero-code-examples/strands/run.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/flake.lock +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/flake.nix +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/packages/evaluator-sdk-py/README.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/packages/evaluator-sdk-py/pyproject.toml +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/eval_set_helm.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/evalset_helm_3_2026-02-23.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/evalset_k8s_2026-02-20.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/helm.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/helm_2.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/helm_3.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/k8s.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/samples/tempo_export_with_batches.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/_protocol.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/_static/assets/index-Cl6S2lcn.js +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/_static/index.html +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/_static/logo.svg +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/_static/vite.svg +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/debug_routes.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/dependencies.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/models.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/otlp_app.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/otlp_grpc.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/otlp_processing.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/otlp_routes.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/routes.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/runs_routes.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/api/streaming_routes.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/builtin_metrics.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/cli.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/config.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/converter.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/custom_evaluators.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/eval_config_loader.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/evaluator/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/evaluator/resolver.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/evaluator/sources.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/evaluator/templates.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/evaluator/venv.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/extraction.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/genai_converter.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/loader/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/loader/auto.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/loader/base.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/loader/jaeger.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/loader/otlp.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/mcp_server.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/openai_eval_backend.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/output.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/run/fetcher.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/run/result_builder.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/run/service.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/run/worker.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/runner.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/sdk.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/config.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/models.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/postgres/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/postgres/migrations/000001_init.down.sql +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/postgres/migrations/000001_init.up.sql +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/postgres/migrator.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/postgres/pool.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/repos/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/repos/memory.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/storage/repos/postgres.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/streaming/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/streaming/incremental_processor.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/streaming/processor.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/streaming/session.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/streaming/ws_server.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/trace_attrs.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/trace_metrics.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/utils/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/utils/genai_messages.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/utils/log_buffer.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/src/agentevals/utils/log_enrichment.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/api/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/api/test_evaluate_persistence.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/api/test_runs_routes.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/integration/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/integration/conftest.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/integration/test_evaluation_pipeline.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/integration/test_live_agents.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/integration/test_otlp_grpc_receiver.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/integration/test_session_grouping.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/integration/test_timing_stress.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/run/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/run/test_fetcher.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/run/test_result_builder.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/run/test_service.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/storage/__init__.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/storage/test_config.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/storage/test_memory_repos.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/storage/test_migrator.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/storage/test_models.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_api.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_cli.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_converter.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_extraction.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_genai_converter.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_jaeger_loader.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_loader_auto.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_log_enrichment.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_mcp_server.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_otlp_loader.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_otlp_receiver.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_output.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_protocol.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_runner.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_sdk.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/tests/test_trace_metrics.py +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/.gitignore +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/README.md +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/eslint.config.js +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/index.html +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/package-lock.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/package.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/public/logo.svg +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/public/vite.svg +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/App.css +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/App.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/api/client.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/assets/react.svg +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/BuilderHeader.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/BuilderView.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/EvalCasesList.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/InvocationEditor.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/JsonPreview.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/MetadataEditor.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/TraceUploadZone.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/builder/index.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/dashboard/DashboardView.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/dashboard/TraceCard.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/dashboard/TraceTable.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/DataSection.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/InspectorHeader.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/InspectorView.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/InvocationCard.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/ToolCallList.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/sidebar/Sidebar.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/streaming/LiveMessage.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/streaming/SessionCard.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/upload/FileDropZone.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/upload/MetricSelector.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/upload/TraceEditorDrawer.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/upload/UploadView.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/components/welcome/WelcomeView.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/config.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/context/TraceContext.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/context/TraceProvider.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/index.css +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/console-capture.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/evalset-builder.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/network-capture.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/trace-helpers.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/trace-loader.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/trace-metadata.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/trace-patcher.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/types.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/lib/utils.ts +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/src/main.tsx +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/tsconfig.app.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/tsconfig.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/tsconfig.node.json +0 -0
- {agentevals_cli-0.8.1 → agentevals_cli-0.8.2}/ui/vite.config.ts +0 -0
|
@@ -119,6 +119,12 @@ The zero-code and SDK examples implement the same toy agent (dice rolling + prim
|
|
|
119
119
|
|---------|-------------|
|
|
120
120
|
| [kubernetes/](./kubernetes/) | Deploy agentevals with kagent on Kubernetes using native OTLP gRPC ingestion (or optionally an OTel Collector). Includes a walkthrough for comparing two kagent agents (different models) and evaluating them with tool trajectory and response match scores. |
|
|
121
121
|
|
|
122
|
+
## Custom result sinks
|
|
123
|
+
|
|
124
|
+
Plugins can deliver run results (partial metrics, final summary, errors) to arbitrary backends alongside the database. Install a package that declares `[project.entry-points."agentevals.sinks"]`, restart agentevals, then reference the plugin’s `kind` in `spec.sinks` on `POST /api/runs`.
|
|
125
|
+
|
|
126
|
+
See [custom_sink/README.md](./custom_sink/README.md) for a minimal entry-point plugin package and configuration examples.
|
|
127
|
+
|
|
122
128
|
## Advanced: GenAI Semantic Convention Patterns
|
|
123
129
|
|
|
124
130
|
> [!TIP]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Custom result sink plugin
|
|
2
|
+
|
|
3
|
+
This folder is a tiny installable Python package that registers a result **sink** with agentevals via Python packaging **entry points** (the `agentevals.sinks` group). The worker fans out partial/final/error events to every configured sink in addition to the database.
|
|
4
|
+
|
|
5
|
+
## What gets implemented
|
|
6
|
+
|
|
7
|
+
- **`DemoNdjsonSink`** — subclasses `ResultSink` from `agentevals.run.sinks` and appends one JSON object per line to `path` from the run spec (same pattern as the built-in `file` sink, with a `"demo": true` marker on each line).
|
|
8
|
+
- **`create_demo_sink(spec)`** — factory callable; must accept the full sink dict from the run spec and return a `ResultSink` (see return type in code).
|
|
9
|
+
|
|
10
|
+
The entry point **name** (`demo_ndjson` in `pyproject.toml`) is the **`kind`** string clients put under `spec.sinks`.
|
|
11
|
+
|
|
12
|
+
## Install (local dev)
|
|
13
|
+
|
|
14
|
+
From the agentevals repo root, install the framework first, then this example:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
uv pip install -e .
|
|
18
|
+
uv pip install -e examples/custom_sink
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Restart the agentevals process so `importlib.metadata` picks up the new distribution.
|
|
22
|
+
|
|
23
|
+
PyPI-style usage is the same: depend on `agentevals-example-custom-sink` next to `agentevals-cli`, install both into the server environment, restart.
|
|
24
|
+
|
|
25
|
+
## Configure runs
|
|
26
|
+
|
|
27
|
+
Async runs are submitted with **`POST /api/runs`**. Put your sink in **`spec.sinks`** (requires Postgres storage — see main docs).
|
|
28
|
+
|
|
29
|
+
Example body (use an **absolute** `path` on the host where the agentevals process runs when possible). **`path` should normally be a file path** (e.g. `/tmp/demo.ndjson`). If `path` is instead an **existing directory** (including `"."` for the process working directory), output goes to `<path>/agentevals-demo-sink.ndjson`, or to `<path>/<filename>` if you add an optional `"filename"` field next to `path` in the sink dict.
|
|
30
|
+
|
|
31
|
+
The `inline` object must contain real trace data (Jaeger JSON or OTLP), not an empty object.
|
|
32
|
+
|
|
33
|
+
```json
|
|
34
|
+
{
|
|
35
|
+
"spec": {
|
|
36
|
+
"approach": "trace_replay",
|
|
37
|
+
"target": {
|
|
38
|
+
"kind": "inline",
|
|
39
|
+
"traceFormat": "jaeger-json",
|
|
40
|
+
"inline": {
|
|
41
|
+
"data": [
|
|
42
|
+
{
|
|
43
|
+
"traceID": "61646461646164646164616461646164",
|
|
44
|
+
"spans": [
|
|
45
|
+
{
|
|
46
|
+
"traceID": "61646461646164646164616461646164",
|
|
47
|
+
"spanID": "6164616461646164",
|
|
48
|
+
"operationName": "demo-op",
|
|
49
|
+
"startTime": 1000000,
|
|
50
|
+
"duration": 100000,
|
|
51
|
+
"tags": [],
|
|
52
|
+
"logs": [],
|
|
53
|
+
"references": [],
|
|
54
|
+
"processID": "p1"
|
|
55
|
+
}
|
|
56
|
+
],
|
|
57
|
+
"processes": { "p1": { "serviceName": "demo" } }
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"sinks": [{ "kind": "demo_ndjson", "path": "/tmp/agentevals-demo.ndjson" }]
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
You can list several sinks; they run in parallel. Built-in kinds are `stdout`, `file`, and `http_webhook`.
|
|
68
|
+
|
|
69
|
+
## Publishing your own sink
|
|
70
|
+
|
|
71
|
+
1. Implement `ResultSink` from `agentevals.run.sinks` (subclass the protocol, or provide the three async methods `emit_partial`, `emit_final`, and `emit_error`).
|
|
72
|
+
2. Expose a factory `def create_*(spec: dict) -> ResultSink`.
|
|
73
|
+
3. Add the following to your `pyproject.toml`:
|
|
74
|
+
|
|
75
|
+
```toml
|
|
76
|
+
[project.entry-points."agentevals.sinks"]
|
|
77
|
+
your_kind = "your_package.module:your_factory"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
4. Install the package into the **same environment** as `agentevals serve`, restart, and reference `"kind": "your_kind"` in `spec.sinks`.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Example result sink plugin for agentevals (see README)."""
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Minimal NDJSON sink registered via setuptools entry points."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
|
|
11
|
+
from agentevals.run.sinks import ResultSink
|
|
12
|
+
from agentevals.storage.models import Result
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _result_payload(r: Result) -> dict:
|
|
16
|
+
return r.model_dump(mode="json", by_alias=True)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_DEFAULT_FILENAME = "agentevals-demo-sink.ndjson"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _resolve_output_file(spec: dict[str, Any]) -> Path:
|
|
23
|
+
"""If ``path`` is an existing directory (including ``.``), write NDJSON inside it."""
|
|
24
|
+
p = Path(spec["path"]).expanduser()
|
|
25
|
+
if p.exists() and p.is_dir():
|
|
26
|
+
name = spec.get("filename") or _DEFAULT_FILENAME
|
|
27
|
+
return p / name
|
|
28
|
+
return p
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DemoNdjsonSink(ResultSink):
    """Concrete :class:`~agentevals.run.sinks.ResultSink`; append-only JSON lines with a ``demo`` marker.

    Every record is one JSON object on its own line carrying ``phase``,
    ``run_id``, ``attempt`` and ``demo`` plus one phase-specific field
    (``result``, ``summary`` or ``error``).
    """

    def __init__(self, path: Path) -> None:
        """Remember the target file; nothing is created until the first write."""
        self._path = path
        # Guards the append section against concurrent emits on this instance.
        self._lock = asyncio.Lock()

    async def _write_all(self, payloads: list[dict]) -> None:
        """Append every payload as one JSON line using a single file open."""
        # Serialize before taking the lock so an encoding error writes nothing.
        text = "".join(json.dumps(payload) + "\n" for payload in payloads)
        async with self._lock:
            self._path.parent.mkdir(parents=True, exist_ok=True)
            # Explicit UTF-8 keeps the file independent of the host locale.
            with self._path.open("a", encoding="utf-8") as f:  # noqa: ASYNC230
                f.write(text)

    async def _write(self, payload: dict) -> None:
        """Append a single payload as one JSON line."""
        await self._write_all([payload])

    async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None:
        """Append one ``partial`` record per result, written together as one batch."""
        if not results:
            # Nothing to record; leave the filesystem untouched.
            return
        await self._write_all(
            [
                {
                    "phase": "partial",
                    "run_id": str(run_id),
                    "attempt": attempt,
                    "demo": True,
                    "result": _result_payload(r),
                }
                for r in results
            ]
        )

    async def emit_final(self, run_id: UUID, summary: dict, attempt: int) -> None:
        """Append the ``final`` record carrying ``summary``."""
        await self._write(
            {"phase": "final", "run_id": str(run_id), "attempt": attempt, "demo": True, "summary": summary}
        )

    async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None:
        """Append the ``error`` record carrying the error text."""
        await self._write({"phase": "error", "run_id": str(run_id), "attempt": attempt, "demo": True, "error": error})
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def create_demo_sink(spec: dict[str, Any]) -> ResultSink:
    """Entry-point factory for the ``demo_ndjson`` sink kind (the name registered in pyproject).

    ``path`` is normally the NDJSON **file** to append to. When it is an
    existing directory instead (e.g. ``.`` or ``/tmp``), records go to
    ``<path>/agentevals-demo-sink.ndjson``, or to ``<path>/<filename>`` when
    ``filename`` is set.
    """
    output_file = _resolve_output_file(spec)
    return DemoNdjsonSink(output_file)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "agentevals-example-custom-sink"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Example setuptools plugin that registers an agentevals result sink"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"agentevals-cli>=0.8.2",  # first release that discovers "agentevals.sinks" entry points
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.entry-points."agentevals.sinks"]
|
|
16
|
+
demo_ndjson = "agentevals_example_custom_sink.sink:create_demo_sink"
|
|
17
|
+
|
|
18
|
+
[tool.hatch.build.targets.wheel]
|
|
19
|
+
packages = ["agentevals_example_custom_sink"]
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "agentevals-cli"
|
|
7
|
-
version = "0.8.
|
|
7
|
+
version = "0.8.2"
|
|
8
8
|
description = "Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
@@ -17,6 +17,7 @@ from fastapi.responses import StreamingResponse
|
|
|
17
17
|
from agentevals import __version__
|
|
18
18
|
|
|
19
19
|
from ..run.service import RunService
|
|
20
|
+
from ..run.sinks import log_registered_sinks
|
|
20
21
|
from ..run.worker import AsyncRunWorker
|
|
21
22
|
from ..storage import StorageSettings, build_repos
|
|
22
23
|
from ..storage.postgres.migrator import Migrator
|
|
@@ -83,6 +84,7 @@ def _build_lifespan():
|
|
|
83
84
|
worker = AsyncRunWorker(runs=repos.runs, results=repos.results, settings=storage_settings)
|
|
84
85
|
await worker.start()
|
|
85
86
|
app.state.run_worker = worker
|
|
87
|
+
log_registered_sinks()
|
|
86
88
|
|
|
87
89
|
yield
|
|
88
90
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Contents:
|
|
4
4
|
- :mod:`fetcher` resolves a run spec's ``target`` into a list of traces.
|
|
5
|
-
- :mod:`sinks` fan-out result delivery (
|
|
5
|
+
- :mod:`sinks` fan-out result delivery (built-ins plus setuptools plugins / :func:`~agentevals.run.sinks.register_sink_factory`).
|
|
6
6
|
- :mod:`service` is the synchronous control surface used by HTTP handlers.
|
|
7
7
|
- :mod:`worker` is the in-process loop that claims runs and drives the
|
|
8
8
|
existing :func:`agentevals.runner.run_evaluation_from_traces` pipeline.
|
|
@@ -3,6 +3,14 @@
|
|
|
3
3
|
The :class:`agentevals.storage.repos.ResultRepository` is always written;
|
|
4
4
|
sinks are an additional delivery channel. Sink failures are logged with
|
|
5
5
|
``run_id`` / ``result_id`` but do not fail the run.
|
|
6
|
+
|
|
7
|
+
**Plugins:** third-party packages declare setuptools entry points in group
|
|
8
|
+
``agentevals.sinks`` (entry **name** = ``kind`` string; **value** = ``module:factory``
|
|
9
|
+
callable ``factory(spec: dict) -> ResultSink``). Built-in kinds
|
|
10
|
+
(``stdout``, ``file``, ``http_webhook``) are not overridden by entry points;
|
|
11
|
+
hosts may replace any kind via :func:`register_sink_factory` (highest precedence).
|
|
12
|
+
|
|
13
|
+
Tests may call :func:`clear_sink_plugin_registry` to drop programmatic registrations.
|
|
6
14
|
"""
|
|
7
15
|
|
|
8
16
|
from __future__ import annotations
|
|
@@ -12,8 +20,10 @@ import json
|
|
|
12
20
|
import logging
|
|
13
21
|
import os
|
|
14
22
|
import sys
|
|
23
|
+
from collections.abc import Callable
|
|
24
|
+
from importlib.metadata import entry_points
|
|
15
25
|
from pathlib import Path
|
|
16
|
-
from typing import Any, Protocol
|
|
26
|
+
from typing import Any, Protocol, cast
|
|
17
27
|
from uuid import UUID
|
|
18
28
|
|
|
19
29
|
import httpx
|
|
@@ -22,6 +32,8 @@ from ..storage.models import Result
|
|
|
22
32
|
|
|
23
33
|
logger = logging.getLogger(__name__)
|
|
24
34
|
|
|
35
|
+
SINK_ENTRY_POINT_GROUP = "agentevals.sinks"
|
|
36
|
+
|
|
25
37
|
|
|
26
38
|
class ResultSink(Protocol):
|
|
27
39
|
async def emit_partial(self, run_id: UUID, results: list[Result], attempt: int) -> None: ...
|
|
@@ -29,6 +41,11 @@ class ResultSink(Protocol):
|
|
|
29
41
|
async def emit_error(self, run_id: UUID, error: str, attempt: int) -> None: ...
|
|
30
42
|
|
|
31
43
|
|
|
44
|
+
SinkFactory = Callable[[dict[str, Any]], ResultSink]
|
|
45
|
+
|
|
46
|
+
_PLUGIN_FACTORIES: dict[str, SinkFactory] = {}
|
|
47
|
+
|
|
48
|
+
|
|
32
49
|
def _result_payload(r: Result) -> dict:
|
|
33
50
|
return r.model_dump(mode="json", by_alias=True)
|
|
34
51
|
|
|
@@ -187,33 +204,18 @@ class SinkFanout:
|
|
|
187
204
|
logger.exception("sink delivery failed in phase=%s", phase)
|
|
188
205
|
|
|
189
206
|
|
|
190
|
-
def
|
|
191
|
-
"""
|
|
207
|
+
def register_sink_factory(kind: str, factory: SinkFactory) -> None:
    """Register or replace the factory for ``kind`` (overrides built-ins and entry points).

    Call during process startup before run workers consume specs. The factory receives
    the full sink spec dict (including ``kind``) and returns a :class:`ResultSink`.

    Raises:
        TypeError: if ``factory`` is not callable. Entry-point factories are held
            to the same rule, and rejecting the value here surfaces the mistake
            at registration instead of as a logged failure on every run.
    """
    if not callable(factory):
        raise TypeError(f"sink factory for kind {kind!r} must be callable, got {type(factory).__name__}")
    _PLUGIN_FACTORIES[kind] = factory
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def clear_sink_plugin_registry() -> None:
    """Drop all registrations from :func:`register_sink_factory` (for tests).

    Empties the module-level programmatic registry in place; it does not touch
    anything else.
    """
    _PLUGIN_FACTORIES.clear()
|
|
217
219
|
|
|
218
220
|
|
|
219
221
|
def _extract_env_headers(auth: Any) -> dict[str, str]:
|
|
@@ -228,3 +230,85 @@ def _extract_env_headers(auth: Any) -> dict[str, str]:
|
|
|
228
230
|
if isinstance(value, dict) and "from_env" in value:
|
|
229
231
|
result[header_name] = value["from_env"]
|
|
230
232
|
return result
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _http_webhook_from_spec(spec: dict[str, Any]) -> HttpWebhookSink:
    """Build the built-in ``http_webhook`` sink from its spec dict.

    ``url`` is required and ``headers`` optional. Env-sourced headers come from
    ``headers_from_env`` when that is set and non-empty, otherwise from
    ``auth`` via :func:`_extract_env_headers`. ``timeout_s`` defaults to
    ``10.0`` and ``max_attempts`` to ``5``; both are coerced to numbers.
    """
    url = spec["url"]
    static_headers = spec.get("headers")
    env_headers = spec.get("headers_from_env") or _extract_env_headers(spec.get("auth"))
    timeout_s = float(spec.get("timeout_s", 10.0))
    max_attempts = int(spec.get("max_attempts", 5))
    return HttpWebhookSink(
        url=url,
        headers=static_headers,
        headers_from_env=env_headers,
        timeout_s=timeout_s,
        max_attempts=max_attempts,
    )
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _builtin_factories() -> dict[str, SinkFactory]:
    """Return a fresh ``kind`` -> factory mapping for the sinks shipped in this module."""

    def _stdout(_spec: dict[str, Any]) -> ResultSink:
        return StdoutSink()

    def _file(spec: dict[str, Any]) -> ResultSink:
        return FileSink(spec["path"])

    builtins: dict[str, SinkFactory] = {}
    builtins["stdout"] = _stdout
    builtins["file"] = _file
    builtins["http_webhook"] = _http_webhook_from_spec
    return builtins
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _merge_sink_factories() -> dict[str, SinkFactory]:
    """Built-ins, then entry points (no built-in shadowing), then programmatic overrides.

    Returns a new ``kind`` -> factory dict on every call:

    1. every built-in factory;
    2. each entry point in group :data:`SINK_ENTRY_POINT_GROUP` whose name is
       not a built-in kind, provided it loads and is callable. When several
       entry points share a name, the first usable one is kept and later ones
       are reported with a warning;
    3. registrations made through :func:`register_sink_factory`, which replace
       any earlier factory for the same kind.

    A plugin that fails to import is logged and skipped; it never raises here.
    """
    builtins = _builtin_factories()
    merged: dict[str, SinkFactory] = dict(builtins)
    for ep in entry_points(group=SINK_ENTRY_POINT_GROUP):
        if ep.name in builtins:
            logger.debug("skipping sink entry point %r; built-in kind takes precedence", ep.name)
            continue
        if ep.name in merged:
            # Another distribution already supplied this kind; say so instead of
            # blaming a built-in, and keep the first one as before.
            logger.warning(
                "skipping duplicate sink entry point %r; an earlier entry point already provides this kind",
                ep.name,
            )
            continue
        try:
            # Only the import itself is guarded: it runs third-party code.
            loaded = ep.load()
        except Exception:
            logger.exception("failed to load sink entry point %r", ep.name)
            continue
        if not callable(loaded):
            logger.warning("sink entry point %r is not callable; skipping", ep.name)
            continue
        merged[ep.name] = cast(SinkFactory, loaded)
    merged.update(_PLUGIN_FACTORIES)
    return merged
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def registered_sink_kinds() -> tuple[str, ...]:
    """Return the sorted ``kind`` strings that :func:`build_sinks` would resolve right now.

    Covers built-ins, setuptools entry points for group
    :data:`SINK_ENTRY_POINT_GROUP` that loaded successfully, and registrations
    from :func:`register_sink_factory`. The result is a snapshot of current
    process state; mutating the programmatic registry after startup changes
    what later calls return.
    """
    kinds = list(_merge_sink_factories())
    kinds.sort()
    return tuple(kinds)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def log_registered_sinks() -> None:
    """Log a single INFO line naming every available sink kind (operator diagnostics)."""
    available = registered_sink_kinds()
    listing = ", ".join(available)
    logger.info("Result sinks available (%d kinds): %s", len(available), listing)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def build_sinks(specs: list[dict]) -> SinkFanout:
    """Construct a fan-out from the run spec's ``sinks`` array.

    Each spec is a dict with ``kind`` plus kind-specific args. Unknown kinds
    are skipped with a warning so a future kind added by a host doesn't
    break older agentevals replicas mid-rollout. Malformed entries (a spec
    that is not a mapping, or a ``kind`` that cannot be used as a lookup key)
    are skipped with a warning as well, and a missing ``specs`` value yields
    an empty fan-out.

    Factory lookup starts from built-ins, adds setuptools entry points (group
    ``agentevals.sinks``) for ``kind`` names not already built-in, then applies
    :func:`register_sink_factory` registrations, which override any prior factory
    for the same ``kind``. See :func:`_merge_sink_factories`.

    A factory that raises is logged and its sink omitted; the other sinks are
    still built.
    """
    factories = _merge_sink_factories()
    sinks: list[ResultSink] = []
    for spec in specs or ():
        try:
            kind = spec.get("kind")
        except AttributeError:
            # Log only the type: a spec may carry credentials (e.g. webhook headers).
            logger.warning("sink spec is not a mapping (got %s); skipping", type(spec).__name__)
            continue
        try:
            factory = factories.get(kind) if kind is not None else None
        except TypeError:
            # Unhashable ``kind`` (e.g. a list) cannot name a registered factory.
            factory = None
        if factory is None:
            logger.warning("unknown sink kind '%s'; skipping", kind)
            continue
        try:
            sinks.append(factory(spec))
        except Exception:
            logger.exception("sink factory failed for kind=%s", kind)
    return SinkFanout(sinks)
|
|
@@ -8,8 +8,9 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
import contextlib
|
|
10
10
|
import json
|
|
11
|
+
import logging
|
|
11
12
|
from pathlib import Path
|
|
12
|
-
from unittest.mock import patch
|
|
13
|
+
from unittest.mock import MagicMock, patch
|
|
13
14
|
from uuid import UUID, uuid4
|
|
14
15
|
|
|
15
16
|
import httpx
|
|
@@ -21,6 +22,10 @@ from agentevals.run.sinks import (
|
|
|
21
22
|
SinkFanout,
|
|
22
23
|
StdoutSink,
|
|
23
24
|
build_sinks,
|
|
25
|
+
clear_sink_plugin_registry,
|
|
26
|
+
log_registered_sinks,
|
|
27
|
+
register_sink_factory,
|
|
28
|
+
registered_sink_kinds,
|
|
24
29
|
)
|
|
25
30
|
from agentevals.storage.models import Result, ResultStatus
|
|
26
31
|
|
|
@@ -43,6 +48,14 @@ def _mock_async_client(transport: httpx.MockTransport):
|
|
|
43
48
|
yield
|
|
44
49
|
|
|
45
50
|
|
|
51
|
+
@pytest.fixture
def isolated_sink_plugins():
    """``register_sink_factory`` is process-global; reset around plugin tests.

    Clears the programmatic sink registry before handing control to the test
    and clears it again once the test has finished.
    """
    # Start from an empty programmatic registry.
    clear_sink_plugin_registry()
    yield
    # Drop whatever the test registered so it cannot leak into later tests.
    clear_sink_plugin_registry()
|
|
57
|
+
|
|
58
|
+
|
|
46
59
|
def _result(run_id: UUID) -> Result:
|
|
47
60
|
return Result(
|
|
48
61
|
result_id="rid-1",
|
|
@@ -182,6 +195,17 @@ class TestHttpWebhookSink:
|
|
|
182
195
|
assert "authorization" not in captured[0]
|
|
183
196
|
|
|
184
197
|
|
|
198
|
+
class TestRegisteredSinkKinds:
    """Checks for the sink-kind introspection helpers."""

    def test_includes_builtins(self):
        """The three built-in kinds are always among the reported kinds."""
        expected = {"stdout", "file", "http_webhook"}
        reported = set(registered_sink_kinds())
        assert expected.issubset(reported)

    def test_log_registered_sinks_info(self, caplog):
        """``log_registered_sinks`` emits its summary line at INFO level."""
        with caplog.at_level(logging.INFO, logger="agentevals.run.sinks"):
            log_registered_sinks()
        messages = [record.getMessage() for record in caplog.records]
        assert any("Result sinks available" in message for message in messages)
|
|
207
|
+
|
|
208
|
+
|
|
185
209
|
class TestBuildSinks:
|
|
186
210
|
def test_stdout(self):
|
|
187
211
|
fanout = build_sinks([{"kind": "stdout"}])
|
|
@@ -214,6 +238,161 @@ class TestBuildSinks:
|
|
|
214
238
|
assert isinstance(fanout, SinkFanout)
|
|
215
239
|
|
|
216
240
|
|
|
241
|
+
class TestPluginSinkRegistry:
    """Programmatic registration through ``register_sink_factory``."""

    async def test_register_sink_factory_kind(self, isolated_sink_plugins, tmp_path):
        """A newly registered kind is resolved by ``build_sinks`` and receives emits."""
        path = tmp_path / "plugin.jsonl"

        def _factory(spec: dict) -> FileSink:
            return FileSink(spec["path"])

        register_sink_factory("plugin_file", _factory)
        fanout = build_sinks([{"kind": "plugin_file", "path": str(path)}])
        run_id = uuid4()
        await fanout.emit_final(run_id, {"ok": True}, attempt=1)
        # The file written by the plugin-built sink holds the final summary.
        assert json.loads(path.read_text().strip())["summary"] == {"ok": True}

    async def test_programmatic_registration_overrides_builtin(self, isolated_sink_plugins):
        """Registering under a built-in kind name replaces the built-in factory."""
        finals: list[dict] = []

        class CaptureSink:
            # Minimal sink that records only the final summaries it is given.
            async def emit_partial(self, run_id, results, attempt):
                pass

            async def emit_final(self, run_id, summary, attempt):
                finals.append(summary)

            async def emit_error(self, run_id, error, attempt):
                pass

        register_sink_factory("stdout", lambda _spec: CaptureSink())
        fanout = build_sinks([{"kind": "stdout"}])
        await fanout.emit_final(uuid4(), {"captured": True}, attempt=1)
        # The summary reached the capture sink, so the override was used.
        assert finals == [{"captured": True}]

    async def test_factory_exception_skips_sink(self, isolated_sink_plugins, caplog):
        """A factory that raises is logged and does not stop ``build_sinks``."""

        def broken_factory(_spec):
            raise RuntimeError("no")

        register_sink_factory("broken", broken_factory)
        with caplog.at_level(logging.ERROR):
            fanout = build_sinks([{"kind": "broken"}, {"kind": "stdout"}])
        assert isinstance(fanout, SinkFanout)
        assert any("sink factory failed for kind=broken" in r.getMessage() for r in caplog.records)
        # The resulting fan-out must still be usable.
        await fanout.emit_final(uuid4(), {}, attempt=1)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class TestSinkEntryPoints:
    """Entry-point discovery without relying on packages installed in the test venv."""

    async def test_entry_point_factory_resolves_kind(self, isolated_sink_plugins, tmp_path):
        """A kind supplied only by an entry point is loaded and used."""
        path = tmp_path / "ep.jsonl"

        def file_factory(spec: dict) -> FileSink:
            return FileSink(spec["path"])

        # Stand-in for an ``importlib.metadata`` entry point named ``from_ep``.
        ep = MagicMock()
        ep.name = "from_ep"
        ep.load.return_value = file_factory

        with patch("agentevals.run.sinks.entry_points", return_value=[ep]):
            fanout = build_sinks([{"kind": "from_ep", "path": str(path)}])
        ep.load.assert_called_once_with()
        run_id = uuid4()
        await fanout.emit_final(run_id, {"via": "ep"}, attempt=1)
        assert json.loads(path.read_text().strip())["summary"] == {"via": "ep"}

    async def test_builtin_kind_does_not_load_colliding_entry_point(self, isolated_sink_plugins, capsys):
        """An entry point named like a built-in kind is never loaded."""
        ep = MagicMock()
        ep.name = "stdout"
        # Loading would raise, so the test fails loudly if the entry point is touched.
        ep.load.side_effect = AssertionError("built-in kinds must not load shadow entry points")

        with patch("agentevals.run.sinks.entry_points", return_value=[ep]):
            fanout = build_sinks([{"kind": "stdout"}])
        ep.load.assert_not_called()
        await fanout.emit_final(uuid4(), {"k": "v"}, attempt=1)
        # The last line captured on stdout is the final record.
        assert json.loads(capsys.readouterr().out.strip().splitlines()[-1])["phase"] == "final"

    def test_entry_point_load_failure_skipped(self, isolated_sink_plugins, caplog):
        """An entry point whose ``load()`` raises is logged rather than propagated."""
        ep = MagicMock()
        ep.name = "broken_pkg_sink"
        ep.load.side_effect = ImportError("dist not installed")

        with patch("agentevals.run.sinks.entry_points", return_value=[ep]), caplog.at_level(logging.ERROR):
            build_sinks([{"kind": "broken_pkg_sink"}])
        assert any("failed to load sink entry point" in r.getMessage() for r in caplog.records)

    def test_non_callable_entry_point_skipped(self, isolated_sink_plugins, caplog):
        """An entry point that resolves to a non-callable object is reported."""
        ep = MagicMock()
        ep.name = "bad_export"
        ep.load.return_value = "not_callable"

        with patch("agentevals.run.sinks.entry_points", return_value=[ep]), caplog.at_level(logging.WARNING):
            build_sinks([{"kind": "bad_export"}])
        assert any("not callable" in r.getMessage() for r in caplog.records)

    def test_kind_only_from_missing_plugin_is_unknown(self, isolated_sink_plugins, caplog):
        """Same behavior as an unpublished PyPI sink: no entry point, no registration."""
        with patch("agentevals.run.sinks.entry_points", return_value=[]), caplog.at_level(logging.WARNING):
            fanout = build_sinks([{"kind": "demo_ndjson", "path": "/tmp/would_not_be_used.jsonl"}])
        assert any("unknown sink kind 'demo_ndjson'" in r.getMessage() for r in caplog.records)
        assert isinstance(fanout, SinkFanout)

    async def test_missing_plugin_sink_does_not_break_fanout(self, isolated_sink_plugins, tmp_path, caplog):
        """Unknown plugin kind skipped; remaining sinks still run."""
        path = tmp_path / "only_builtin.jsonl"
        with patch("agentevals.run.sinks.entry_points", return_value=[]), caplog.at_level(logging.WARNING):
            fanout = build_sinks(
                [
                    {"kind": "demo_ndjson", "path": str(tmp_path / "absent.jsonl")},
                    {"kind": "file", "path": str(path)},
                ]
            )
        assert any("unknown sink kind 'demo_ndjson'" in r.getMessage() for r in caplog.records)
        run_id = uuid4()
        await fanout.emit_final(run_id, {"ok": True}, attempt=1)
        # The built-in file sink listed after the unknown kind still received the emit.
        assert json.loads(path.read_text().strip())["summary"] == {"ok": True}
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _demo_example_sink_installed() -> bool:
|
|
357
|
+
try:
|
|
358
|
+
import agentevals_example_custom_sink.sink # noqa: F401
|
|
359
|
+
except ImportError:
|
|
360
|
+
return False
|
|
361
|
+
return True
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
@pytest.mark.skipif(
    not _demo_example_sink_installed(), reason="install editable: uv pip install -e examples/custom_sink"
)
class TestDemoNdjsonExampleSink:
    """Path resolution of the example ``demo_ndjson`` sink (needs the example package installed)."""

    async def test_path_dot_writes_default_ndjson(self, tmp_path, monkeypatch):
        """``path="."`` writes the default file name into the current directory."""
        monkeypatch.chdir(tmp_path)
        from agentevals_example_custom_sink.sink import create_demo_sink

        demo = create_demo_sink({"path": "."})
        await demo.emit_final(uuid4(), {"x": 1}, attempt=1)
        written = (tmp_path / "agentevals-demo-sink.ndjson").read_text()
        record = json.loads(written.strip())
        assert record["summary"] == {"x": 1}

    async def test_existing_directory_appends_default_filename(self, tmp_path):
        """An existing directory path gets the default file name inside it."""
        outdir = tmp_path / "outdir"
        outdir.mkdir()
        from agentevals_example_custom_sink.sink import create_demo_sink

        demo = create_demo_sink({"path": str(outdir)})
        await demo.emit_final(uuid4(), {}, attempt=1)
        expected_file = outdir / "agentevals-demo-sink.ndjson"
        assert expected_file.exists()

    async def test_directory_with_custom_filename(self, tmp_path):
        """``filename`` selects the file name used inside an existing directory."""
        logs = tmp_path / "logs"
        logs.mkdir()
        from agentevals_example_custom_sink.sink import create_demo_sink

        demo = create_demo_sink({"path": str(logs), "filename": "runs.jsonl"})
        await demo.emit_final(uuid4(), {"n": 2}, attempt=1)
        record = json.loads((logs / "runs.jsonl").read_text().strip())
        assert record["summary"] == {"n": 2}
|
|
394
|
+
|
|
395
|
+
|
|
217
396
|
class TestSinkFanoutErrorIsolation:
|
|
218
397
|
"""A sink that raises must not abort other sinks or the run itself."""
|
|
219
398
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|