agentevals-cli 0.5.3__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals_cli-0.6.1/.dockerignore +16 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.github/workflows/release.yml +40 -0
- agentevals_cli-0.6.1/Dockerfile +38 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/Makefile +12 -1
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/PKG-INFO +59 -5
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/README.md +56 -4
- agentevals_cli-0.6.1/charts/agentevals/Chart.yaml +6 -0
- agentevals_cli-0.6.1/charts/agentevals/templates/NOTES.txt +12 -0
- agentevals_cli-0.6.1/charts/agentevals/templates/_helpers.tpl +57 -0
- agentevals_cli-0.6.1/charts/agentevals/templates/deployment.yaml +128 -0
- agentevals_cli-0.6.1/charts/agentevals/templates/service.yaml +24 -0
- agentevals_cli-0.6.1/charts/agentevals/templates/serviceaccount.yaml +14 -0
- agentevals_cli-0.6.1/charts/agentevals/values.yaml +153 -0
- agentevals_cli-0.6.1/docs/assets/logo-color-on-transparent.svg +13 -0
- agentevals_cli-0.6.1/docs/assets/logo-dark-on-transparent.svg +13 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/docs/custom-evaluators.md +82 -35
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/custom_evaluators/eval_config.yaml +0 -1
- agentevals_cli-0.6.1/examples/zero-code-examples/openai-agents/requirements.txt +6 -0
- agentevals_cli-0.6.1/examples/zero-code-examples/openai-agents/run.py +105 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/pyproject.toml +4 -1
- agentevals_cli-0.6.1/src/agentevals/_static/assets/index-lHPO8TkI.js +342 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/_static/index.html +1 -1
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/app.py +14 -18
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/debug_routes.py +19 -25
- agentevals_cli-0.6.1/src/agentevals/api/dependencies.py +23 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/models.py +20 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/otlp_app.py +4 -4
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/otlp_routes.py +34 -40
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/routes.py +142 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/streaming_routes.py +67 -51
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/cli.py +62 -7
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/config.py +41 -1
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/converter.py +35 -61
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/custom_evaluators.py +45 -11
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/eval_config_loader.py +3 -1
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/evaluator/sources.py +23 -3
- agentevals_cli-0.6.1/src/agentevals/evaluator/venv.py +119 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/extraction.py +25 -2
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/genai_converter.py +37 -98
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/mcp_server.py +3 -2
- agentevals_cli-0.6.1/src/agentevals/openai_eval_backend.py +246 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/output.py +21 -4
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/runner.py +6 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/integration/conftest.py +8 -10
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/integration/test_live_agents.py +57 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_api.py +7 -15
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_extraction.py +11 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_otlp_receiver.py +25 -49
- agentevals_cli-0.6.1/tests/test_output.py +112 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_runner.py +4 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/api/client.ts +29 -1
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/TraceUploadZone.tsx +12 -12
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/dashboard/TraceCard.tsx +11 -20
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/InspectorHeader.tsx +11 -20
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/InspectorView.tsx +10 -39
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/upload/TraceEditorDrawer.tsx +11 -14
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/context/TraceProvider.tsx +23 -13
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/lib/evalset-builder.ts +10 -36
- agentevals_cli-0.6.1/ui/src/lib/trace-helpers.ts +73 -0
- agentevals_cli-0.6.1/ui/src/lib/trace-metadata.ts +12 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/lib/trace-patcher.ts +1 -1
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/lib/types.ts +21 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/uv.lock +6 -2
- agentevals_cli-0.5.3/src/agentevals/_static/assets/index-Dz2NgC8m.js +0 -343
- agentevals_cli-0.5.3/ui/src/lib/trace-converter.ts +0 -734
- agentevals_cli-0.5.3/ui/src/lib/trace-metadata.ts +0 -391
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.claude/skills/eval/SKILL.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.claude/skills/eval/evals/evals.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.claude/skills/inspect/SKILL.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.claude/skills/inspect/evals/evals.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.github/workflows/ci.yml +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.github/workflows/publish-evaluator-sdk.yml +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.gitignore +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/.mcp.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/CONTRIBUTING.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/DEVELOPMENT.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/LICENSE +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/docs/assets/logo-color.png +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/docs/eval-set-format.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/docs/otel-compatibility.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/docs/streaming.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/README.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/custom_evaluators/response_quality.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/custom_evaluators/tool_call_checker.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/dice_agent/README.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/dice_agent/agent.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/dice_agent/eval_set.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/dice_agent/main.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/dice_agent/test_streaming.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/langchain_agent/README.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/langchain_agent/agent.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/langchain_agent/eval_set.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/langchain_agent/main.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/langchain_agent/requirements.txt +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/langchain_agent/test_streaming.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/sdk_example/async_example.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/sdk_example/context_manager_example.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/sdk_example/decorator_example.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/sdk_example/requirements.txt +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/strands_agent/agent.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/strands_agent/eval_set.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/strands_agent/main.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/strands_agent/requirements.txt +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/zero-code-examples/adk/requirements.txt +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/zero-code-examples/adk/run.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/zero-code-examples/langchain/requirements.txt +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/zero-code-examples/langchain/run.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/zero-code-examples/strands/requirements.txt +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/examples/zero-code-examples/strands/run.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/flake.lock +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/flake.nix +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/packages/evaluator-sdk-py/README.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/packages/evaluator-sdk-py/pyproject.toml +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/samples/eval_set_helm.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/samples/evalset_helm_3_2026-02-23.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/samples/evalset_k8s_2026-02-20.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/samples/helm.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/samples/helm_2.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/samples/helm_3.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/samples/k8s.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/_protocol.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/_static/assets/index-BqibLiHO.css +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/_static/logo.svg +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/_static/vite.svg +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/api/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/builtin_metrics.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/evaluator/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/evaluator/resolver.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/evaluator/templates.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/loader/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/loader/base.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/loader/jaeger.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/loader/otlp.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/sdk.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/streaming/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/streaming/incremental_processor.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/streaming/processor.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/streaming/session.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/streaming/ws_server.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/trace_attrs.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/trace_metrics.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/utils/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/utils/genai_messages.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/utils/log_buffer.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/src/agentevals/utils/log_enrichment.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/integration/__init__.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/integration/test_evaluation_pipeline.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/integration/test_session_grouping.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/integration/test_timing_stress.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_converter.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_genai_converter.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_jaeger_loader.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_log_enrichment.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_otlp_loader.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_protocol.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/tests/test_sdk.py +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/.gitignore +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/README.md +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/eslint.config.js +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/index.html +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/package-lock.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/package.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/public/logo.svg +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/public/vite.svg +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/App.css +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/App.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/assets/react.svg +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/annotation-queue/AnnotationQueueView.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/annotation-queue/AnnotationTable.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/bug-report/BugReportModal.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/BuilderHeader.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/BuilderView.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/EvalCaseCard.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/EvalCasesList.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/InvocationEditor.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/JsonPreview.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/MetadataEditor.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/builder/index.ts +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/dashboard/DashboardView.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/dashboard/MetricScoreCard.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/dashboard/PerformanceCard.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/dashboard/PerformanceCharts.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/dashboard/SummaryStats.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/dashboard/TraceTable.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/ComparisonPanel.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/DataSection.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/InspectorLayout.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/InvocationCard.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/InvocationSummaryPanel.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/MetricResultsSection.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/MetricsComparisonSection.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/PerformanceSection.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/ToolCallList.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/sidebar/Sidebar.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/streaming/LiveConversationPanel.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/streaming/LiveMessage.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/streaming/LiveStreamingView.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/streaming/SessionCard.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/streaming/SessionMetadata.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/upload/EvalSetEditorDrawer.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/upload/FileDropZone.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/upload/MetricSelector.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/upload/RawJsonPreview.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/upload/UploadView.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/components/welcome/WelcomeView.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/config.ts +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/context/TraceContext.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/index.css +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/lib/console-capture.ts +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/lib/network-capture.ts +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/lib/trace-loader.ts +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/lib/utils.ts +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/src/main.tsx +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/tsconfig.app.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/tsconfig.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/tsconfig.node.json +0 -0
- {agentevals_cli-0.5.3 → agentevals_cli-0.6.1}/ui/vite.config.ts +0 -0
|
@@ -29,6 +29,9 @@ jobs:
|
|
|
29
29
|
cache: npm
|
|
30
30
|
cache-dependency-path: ui/package-lock.json
|
|
31
31
|
|
|
32
|
+
- name: Set version from tag
|
|
33
|
+
run: uv version "${{ github.event.inputs.tag || github.ref_name }}" --package agentevals-cli
|
|
34
|
+
|
|
32
35
|
- name: Build core and bundled wheels
|
|
33
36
|
run: make release
|
|
34
37
|
|
|
@@ -89,3 +92,40 @@ jobs:
|
|
|
89
92
|
uv build --package agentevals-cli
|
|
90
93
|
uv publish dist/* --token ${{ secrets.PYPI_TOKEN }}
|
|
91
94
|
rm -rf src/agentevals/_static
|
|
95
|
+
|
|
96
|
+
push-docker:
|
|
97
|
+
runs-on: ubuntu-latest
|
|
98
|
+
permissions:
|
|
99
|
+
contents: read
|
|
100
|
+
packages: write
|
|
101
|
+
steps:
|
|
102
|
+
- uses: actions/checkout@v6
|
|
103
|
+
|
|
104
|
+
- name: Login to GitHub Container Registry
|
|
105
|
+
uses: docker/login-action@v4
|
|
106
|
+
with:
|
|
107
|
+
registry: ghcr.io
|
|
108
|
+
username: ${{ github.actor }}
|
|
109
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
110
|
+
|
|
111
|
+
- name: Set up QEMU
|
|
112
|
+
uses: docker/setup-qemu-action@v4
|
|
113
|
+
|
|
114
|
+
- name: Set up Docker Buildx
|
|
115
|
+
uses: docker/setup-buildx-action@v4
|
|
116
|
+
|
|
117
|
+
- name: Set appVersion in Chart.yaml
|
|
118
|
+
run: |
|
|
119
|
+
VERSION="${TAG#v}"
|
|
120
|
+
sed -i "s/^appVersion:.*/appVersion: \"$VERSION\"/" charts/agentevals/Chart.yaml
|
|
121
|
+
env:
|
|
122
|
+
TAG: ${{ github.event.inputs.tag || github.ref_name }}
|
|
123
|
+
|
|
124
|
+
- name: Build and push
|
|
125
|
+
run: |
|
|
126
|
+
VERSION="${TAG#v}"
|
|
127
|
+
make build-docker \
|
|
128
|
+
DOCKER_REGISTRY="ghcr.io/${{ github.repository_owner }}" \
|
|
129
|
+
DOCKER_TAG="$VERSION"
|
|
130
|
+
env:
|
|
131
|
+
TAG: ${{ github.event.inputs.tag || github.ref_name }}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1
|
|
2
|
+
|
|
3
|
+
FROM node:25-bookworm-slim AS ui
|
|
4
|
+
WORKDIR /build/ui
|
|
5
|
+
COPY ui/package.json ui/package-lock.json ./
|
|
6
|
+
# Skip lifecycle scripts during ci, then rebuild esbuild in its own layer — avoids ETXTBSY when
|
|
7
|
+
# install.js execs the binary while overlayfs still has the file busy (common with BuildKit).
|
|
8
|
+
RUN npm ci --ignore-scripts
|
|
9
|
+
RUN npm rebuild esbuild
|
|
10
|
+
COPY ui/ ./
|
|
11
|
+
RUN npm run build
|
|
12
|
+
|
|
13
|
+
FROM python:3.14-slim-bookworm
|
|
14
|
+
|
|
15
|
+
WORKDIR /app
|
|
16
|
+
|
|
17
|
+
# Install uv binary only (no pip); same approach as astral-sh/uv's Dockerfile.
|
|
18
|
+
# https://github.com/astral-sh/uv/blob/6d889fd53d5c108d304c5a4085eb3140ec6a9cdb/Dockerfile#L21
|
|
19
|
+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
|
20
|
+
|
|
21
|
+
COPY pyproject.toml uv.lock README.md ./
|
|
22
|
+
COPY packages ./packages
|
|
23
|
+
COPY src ./src
|
|
24
|
+
|
|
25
|
+
COPY --from=ui /build/ui/dist ./src/agentevals/_static
|
|
26
|
+
|
|
27
|
+
RUN uv sync --frozen --no-dev --extra live \
|
|
28
|
+
&& groupadd --gid 1000 app \
|
|
29
|
+
&& useradd --uid 1000 --gid app --home-dir /app --no-log-init app \
|
|
30
|
+
&& chown -R app:app /app
|
|
31
|
+
|
|
32
|
+
USER app
|
|
33
|
+
ENV PATH="/app/.venv/bin:$PATH"
|
|
34
|
+
ENV AGENTEVALS_SERVER_URL=http://127.0.0.1:8001
|
|
35
|
+
|
|
36
|
+
EXPOSE 8001 4318 8080
|
|
37
|
+
|
|
38
|
+
CMD ["agentevals", "serve", "--host", "0.0.0.0", "--port", "8001", "--otlp-port", "4318", "--mcp-port", "8080"]
|
|
@@ -1,11 +1,22 @@
|
|
|
1
1
|
VERSION := $(shell grep '^version' pyproject.toml | cut -d'"' -f2)
|
|
2
2
|
WHEEL := dist/agentevals_cli-$(VERSION)-py3-none-any.whl
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
DOCKER_REGISTRY ?= soloio
|
|
5
|
+
DOCKER_IMAGE ?= agentevals
|
|
6
|
+
DOCKER_TAG ?= $(VERSION)
|
|
7
|
+
DOCKER_IMAGE_REF := $(if $(DOCKER_REGISTRY),$(DOCKER_REGISTRY:%/=%)/$(DOCKER_IMAGE),$(DOCKER_IMAGE))
|
|
8
|
+
|
|
9
|
+
# Multi-arch build (requires docker buildx). Manifest lists must be pushed — use build-docker-local for a single-arch --load.
|
|
10
|
+
PLATFORMS ?= linux/amd64,linux/arm64
|
|
11
|
+
|
|
12
|
+
.PHONY: build build-bundle build-docker build-ui release clean dev-backend dev-frontend dev-bundle test test-unit test-integration test-e2e
|
|
5
13
|
|
|
6
14
|
build:
|
|
7
15
|
uv build
|
|
8
16
|
|
|
17
|
+
build-docker:
|
|
18
|
+
docker buildx build --platform $(PLATFORMS) -t $(DOCKER_IMAGE_REF):$(DOCKER_TAG) --push .
|
|
19
|
+
|
|
9
20
|
build-ui:
|
|
10
21
|
cd ui && npm ci && npm run build
|
|
11
22
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agentevals-cli
|
|
3
|
-
Version: 0.5.3
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
|
|
5
5
|
License-File: LICENSE
|
|
6
6
|
Requires-Python: >=3.11
|
|
@@ -17,23 +17,76 @@ Requires-Dist: uvicorn[standard]>=0.32.0
|
|
|
17
17
|
Provides-Extra: live
|
|
18
18
|
Requires-Dist: httpx>=0.27.0; extra == 'live'
|
|
19
19
|
Requires-Dist: mcp>=1.26.0; extra == 'live'
|
|
20
|
+
Provides-Extra: openai
|
|
21
|
+
Requires-Dist: openai>=2.0; extra == 'openai'
|
|
20
22
|
Provides-Extra: streaming
|
|
21
23
|
Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'streaming'
|
|
22
24
|
Requires-Dist: websockets>=12.0; extra == 'streaming'
|
|
23
25
|
Description-Content-Type: text/markdown
|
|
24
26
|
|
|
25
27
|
<p align="center">
|
|
26
|
-
<
|
|
28
|
+
<picture>
|
|
29
|
+
<source media="(prefers-color-scheme: dark)" srcset="docs/assets/logo-color-on-transparent.svg">
|
|
30
|
+
<source media="(prefers-color-scheme: light)" srcset="docs/assets/logo-dark-on-transparent.svg">
|
|
31
|
+
<img src="docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
|
|
32
|
+
</picture>
|
|
27
33
|
</p>
|
|
28
34
|
|
|
29
|
-
|
|
35
|
+
<h1 align="center">Ship Agents Reliably</h1>
|
|
30
36
|
|
|
31
|
-
|
|
37
|
+
<p align="center">
|
|
38
|
+
Benchmark your agents before they hit production.<br>
|
|
39
|
+
agentevals scores performance and inference quality from OpenTelemetry traces — no re-runs, no guesswork.
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
<p align="center">
|
|
43
|
+
<a href="https://github.com/agentevals-dev/agentevals/stargazers"><img src="https://img.shields.io/github/stars/agentevals-dev/agentevals?style=social" alt="GitHub Stars"></a>
|
|
44
|
+
|
|
45
|
+
<a href="https://discord.gg/cpveEn8Ah2"><img src="https://img.shields.io/discord/1435836734666707190?label=Discord&logo=discord&logoColor=white&color=5865F2" alt="Discord"></a>
|
|
46
|
+
|
|
47
|
+
<a href="https://github.com/agentevals-dev/agentevals/releases"><img src="https://img.shields.io/github/v/release/agentevals-dev/agentevals?label=Release" alt="Release"></a>
|
|
48
|
+
|
|
49
|
+
<a href="https://github.com/agentevals-dev/agentevals/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache%202.0-green.svg" alt="License"></a>
|
|
50
|
+
|
|
51
|
+
<a href="https://pypi.org/project/agentevals-cli/"><img src="https://img.shields.io/pypi/v/agentevals-cli?label=PyPI&color=blue" alt="PyPI"></a>
|
|
52
|
+
</p>
|
|
53
|
+
|
|
54
|
+
<p align="center">
|
|
55
|
+
<a href="#installation">Install</a> · <a href="#quick-start">Quick Start</a> · <a href="https://github.com/agentevals-dev/agentevals/releases">Releases</a> · <a href="CONTRIBUTING.md">Contributing</a> · <a href="https://discord.gg/cpveEn8Ah2">Discord</a>
|
|
56
|
+
</p>
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## What is agentevals?
|
|
61
|
+
|
|
62
|
+
agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want — no re-runs, no guesswork.
|
|
63
|
+
|
|
64
|
+
It works with any OTel-instrumented framework (LangChain, Strands, Google ADK, and others), supports Jaeger JSON and OTLP trace formats, and ships with built-in evaluators, custom evaluator support, and LLM-based judges.
|
|
32
65
|
|
|
33
66
|
- **CLI** for scripting and CI pipelines
|
|
34
67
|
- **Web UI** for visual inspection and local developer experience
|
|
35
68
|
- **MCP server** so MCP clients can run evaluations from a conversation
|
|
36
69
|
|
|
70
|
+
## Why agentevals?
|
|
71
|
+
|
|
72
|
+
Most evaluation tools require you to **re-execute your agent** for every test — burning tokens, time, and money on duplicate LLM calls. agentevals takes a different approach:
|
|
73
|
+
|
|
74
|
+
- **No re-execution** — score agents from existing traces without replaying expensive LLM calls
|
|
75
|
+
- **Framework-agnostic** — works with any agent framework that emits OpenTelemetry spans
|
|
76
|
+
- **Golden eval sets** — compare actual behavior against defined expected behaviors for deterministic pass/fail gating
|
|
77
|
+
- **Custom evaluators** — write scoring logic in Python, JavaScript, or any language
|
|
78
|
+
- **CI/CD ready** — gate deployments on quality thresholds directly in your pipeline
|
|
79
|
+
- **Local-first** — no cloud dependency required; everything runs on your machine
|
|
80
|
+
|
|
81
|
+
## How It Works
|
|
82
|
+
|
|
83
|
+
agentevals follows three simple steps:
|
|
84
|
+
|
|
85
|
+
1. **Collect traces** — Instrument your agent with OpenTelemetry (or export traces from your tracing backend). Point the OTLP exporter at the agentevals receiver, or load trace files directly.
|
|
86
|
+
2. **Define eval sets** — Create golden evaluation sets that describe expected agent behavior: which tools should be called, in what order, and what the output should look like.
|
|
87
|
+
3. **Run evaluations** — Use the CLI, Web UI, or MCP server to score traces against your eval sets. Get per-metric scores, pass/fail results, and detailed span-level breakdowns.
|
|
88
|
+
|
|
89
|
+
|
|
37
90
|
> [!IMPORTANT]
|
|
38
91
|
> This project is under active development. Expect breaking changes.
|
|
39
92
|
|
|
@@ -64,6 +117,7 @@ Optional extras:
|
|
|
64
117
|
|
|
65
118
|
```bash
|
|
66
119
|
pip install "agentevals-cli[live]" # MCP server support
|
|
120
|
+
pip install "agentevals-cli[openai]" # OpenAI Evals API graders
|
|
67
121
|
```
|
|
68
122
|
|
|
69
123
|
**GitHub [releases](../../releases)** also ship **core** wheels (CLI and API only) and **bundle** wheels (with the embedded UI) if you need a specific version or offline `pip install ./path/to.whl`.
|
|
@@ -188,7 +242,7 @@ evaluators:
|
|
|
188
242
|
agentevals run trace.json --config eval_config.yaml --eval-set eval_set.json
|
|
189
243
|
```
|
|
190
244
|
|
|
191
|
-
Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
|
|
245
|
+
Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. You can also delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) using `type: openai_eval` (requires `pip install "agentevals-cli[openai]"` and `OPENAI_API_KEY`). See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
|
|
192
246
|
|
|
193
247
|
## Web UI
|
|
194
248
|
|
|
@@ -1,15 +1,66 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<
|
|
2
|
+
<picture>
|
|
3
|
+
<source media="(prefers-color-scheme: dark)" srcset="docs/assets/logo-color-on-transparent.svg">
|
|
4
|
+
<source media="(prefers-color-scheme: light)" srcset="docs/assets/logo-dark-on-transparent.svg">
|
|
5
|
+
<img src="docs/assets/logo-color-on-transparent.svg" alt="agentevals" width="420" />
|
|
6
|
+
</picture>
|
|
3
7
|
</p>
|
|
4
8
|
|
|
5
|
-
|
|
9
|
+
<h1 align="center">Ship Agents Reliably</h1>
|
|
6
10
|
|
|
7
|
-
|
|
11
|
+
<p align="center">
|
|
12
|
+
Benchmark your agents before they hit production.<br>
|
|
13
|
+
agentevals scores performance and inference quality from OpenTelemetry traces — no re-runs, no guesswork.
|
|
14
|
+
</p>
|
|
15
|
+
|
|
16
|
+
<p align="center">
|
|
17
|
+
<a href="https://github.com/agentevals-dev/agentevals/stargazers"><img src="https://img.shields.io/github/stars/agentevals-dev/agentevals?style=social" alt="GitHub Stars"></a>
|
|
18
|
+
|
|
19
|
+
<a href="https://discord.gg/cpveEn8Ah2"><img src="https://img.shields.io/discord/1435836734666707190?label=Discord&logo=discord&logoColor=white&color=5865F2" alt="Discord"></a>
|
|
20
|
+
|
|
21
|
+
<a href="https://github.com/agentevals-dev/agentevals/releases"><img src="https://img.shields.io/github/v/release/agentevals-dev/agentevals?label=Release" alt="Release"></a>
|
|
22
|
+
|
|
23
|
+
<a href="https://github.com/agentevals-dev/agentevals/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-Apache%202.0-green.svg" alt="License"></a>
|
|
24
|
+
|
|
25
|
+
<a href="https://pypi.org/project/agentevals-cli/"><img src="https://img.shields.io/pypi/v/agentevals-cli?label=PyPI&color=blue" alt="PyPI"></a>
|
|
26
|
+
</p>
|
|
27
|
+
|
|
28
|
+
<p align="center">
|
|
29
|
+
<a href="#installation">Install</a> · <a href="#quick-start">Quick Start</a> · <a href="https://github.com/agentevals-dev/agentevals/releases">Releases</a> · <a href="CONTRIBUTING.md">Contributing</a> · <a href="https://discord.gg/cpveEn8Ah2">Discord</a>
|
|
30
|
+
</p>
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## What is agentevals?
|
|
35
|
+
|
|
36
|
+
agentevals is a framework-agnostic evaluation solution that scores AI agent behavior directly from [OpenTelemetry](https://opentelemetry.io/) traces. Record your agent's actions once, then evaluate as many times as you want — no re-runs, no guesswork.
|
|
37
|
+
|
|
38
|
+
It works with any OTel-instrumented framework (LangChain, Strands, Google ADK, and others), supports Jaeger JSON and OTLP trace formats, and ships with built-in evaluators, custom evaluator support, and LLM-based judges.
|
|
8
39
|
|
|
9
40
|
- **CLI** for scripting and CI pipelines
|
|
10
41
|
- **Web UI** for visual inspection and local developer experience
|
|
11
42
|
- **MCP server** so MCP clients can run evaluations from a conversation
|
|
12
43
|
|
|
44
|
+
## Why agentevals?
|
|
45
|
+
|
|
46
|
+
Most evaluation tools require you to **re-execute your agent** for every test — burning tokens, time, and money on duplicate LLM calls. agentevals takes a different approach:
|
|
47
|
+
|
|
48
|
+
- **No re-execution** — score agents from existing traces without replaying expensive LLM calls
|
|
49
|
+
- **Framework-agnostic** — works with any agent framework that emits OpenTelemetry spans
|
|
50
|
+
- **Golden eval sets** — compare actual behavior against defined expected behaviors for deterministic pass/fail gating
|
|
51
|
+
- **Custom evaluators** — write scoring logic in Python, JavaScript, or any language
|
|
52
|
+
- **CI/CD ready** — gate deployments on quality thresholds directly in your pipeline
|
|
53
|
+
- **Local-first** — no cloud dependency required; everything runs on your machine
|
|
54
|
+
|
|
55
|
+
## How It Works
|
|
56
|
+
|
|
57
|
+
agentevals follows three simple steps:
|
|
58
|
+
|
|
59
|
+
1. **Collect traces** — Instrument your agent with OpenTelemetry (or export traces from your tracing backend). Point the OTLP exporter at the agentevals receiver, or load trace files directly.
|
|
60
|
+
2. **Define eval sets** — Create golden evaluation sets that describe expected agent behavior: which tools should be called, in what order, and what the output should look like.
|
|
61
|
+
3. **Run evaluations** — Use the CLI, Web UI, or MCP server to score traces against your eval sets. Get per-metric scores, pass/fail results, and detailed span-level breakdowns.
|
|
62
|
+
|
|
63
|
+
|
|
13
64
|
> [!IMPORTANT]
|
|
14
65
|
> This project is under active development. Expect breaking changes.
|
|
15
66
|
|
|
@@ -40,6 +91,7 @@ Optional extras:
|
|
|
40
91
|
|
|
41
92
|
```bash
|
|
42
93
|
pip install "agentevals-cli[live]" # MCP server support
|
|
94
|
+
pip install "agentevals-cli[openai]" # OpenAI Evals API graders
|
|
43
95
|
```
|
|
44
96
|
|
|
45
97
|
**GitHub [releases](https://github.com/agentevals-dev/agentevals/releases)** also ship **core** wheels (CLI and API only) and **bundle** wheels (with the embedded UI) if you need a specific version or offline `pip install ./path/to.whl`.
|
|
@@ -164,7 +216,7 @@ evaluators:
|
|
|
164
216
|
agentevals run trace.json --config eval_config.yaml --eval-set eval_set.json
|
|
165
217
|
```
|
|
166
218
|
|
|
167
|
-
Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
|
|
219
|
+
Community evaluators can be referenced directly from a shared GitHub repository using `type: remote`. You can also delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) using `type: openai_eval` (requires `pip install "agentevals-cli[openai]"` and `OPENAI_API_KEY`). See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK usage, and how to contribute evaluators.
|
|
168
220
|
|
|
169
221
|
## Web UI
|
|
170
222
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
1. UI and API are available at port {{ .Values.service.http.port }} (Service port name: http).
|
|
2
|
+
2. OTLP HTTP receiver: port {{ .Values.service.otlpHttp.port }} (OTEL_EXPORTER_OTLP_ENDPOINT=http://<service>:{{ .Values.service.otlpHttp.port }}).
|
|
3
|
+
3. MCP (Streamable HTTP): port {{ .Values.service.mcp.port }}, path /mcp (e.g. http://<service>:{{ .Values.service.mcp.port }}/mcp).
|
|
4
|
+
{{- if .Values.ephemeralVolume.enabled }}
|
|
5
|
+
4. An emptyDir is mounted at /tmp with HOME=/tmp/agentevals-home (ephemeral; lost on pod restart). Set ephemeralVolume.enabled=false if you need a writable root filesystem without this mount; the chart then forces securityContext.readOnlyRootFilesystem=false on the container.
|
|
6
|
+
{{- end }}
|
|
7
|
+
|
|
8
|
+
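Reach the UI and API locally by port-forwarding to the pod (then open http://127.0.0.1:{{ .Values.service.http.port }}):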
|
|
9
|
+
export POD_NAME=$(kubectl get pods --namespace {{ include "agentevals.namespace" . }} -l "app.kubernetes.io/name={{ include "agentevals.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
|
|
10
|
+
kubectl --namespace {{ include "agentevals.namespace" . }} port-forward $POD_NAME {{ .Values.service.http.port }}:{{ .Values.service.http.containerPort }}
|
|
11
|
+
|
|
12
|
+
Health check: GET http://<pod-ip>:{{ .Values.service.http.containerPort }}/api/health
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{{- define "agentevals.name" -}}
|
|
2
|
+
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
|
|
3
|
+
{{- end }}
|
|
4
|
+
|
|
5
|
+
{{- define "agentevals.fullname" -}}
|
|
6
|
+
{{- if .Values.fullnameOverride }}
|
|
7
|
+
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
|
|
8
|
+
{{- else }}
|
|
9
|
+
{{- $name := default .Chart.Name .Values.nameOverride }}
|
|
10
|
+
{{- if contains $name .Release.Name }}
|
|
11
|
+
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
|
|
12
|
+
{{- else }}
|
|
13
|
+
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
|
|
14
|
+
{{- end }}
|
|
15
|
+
{{- end }}
|
|
16
|
+
{{- end }}
|
|
17
|
+
|
|
18
|
+
{{- define "agentevals.namespace" -}}
|
|
19
|
+
{{- default .Release.Namespace .Values.namespaceOverride }}
|
|
20
|
+
{{- end }}
|
|
21
|
+
|
|
22
|
+
{{- define "agentevals.chart" -}}
|
|
23
|
+
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
|
|
24
|
+
{{- end }}
|
|
25
|
+
|
|
26
|
+
{{- define "agentevals.image" -}}
|
|
27
|
+
{{- $registry := .Values.image.registry | default .Values.registry -}}
|
|
28
|
+
{{- $tag := .Values.image.tag | default .Values.tag | default .Chart.AppVersion -}}
|
|
29
|
+
{{- if $registry -}}
|
|
30
|
+
{{- printf "%s/%s:%s" $registry .Values.image.repository $tag -}}
|
|
31
|
+
{{- else -}}
|
|
32
|
+
{{- printf "%s:%s" .Values.image.repository $tag -}}
|
|
33
|
+
{{- end -}}
|
|
34
|
+
{{- end }}
|
|
35
|
+
|
|
36
|
+
{{- define "agentevals.labels" -}}
|
|
37
|
+
helm.sh/chart: {{ include "agentevals.chart" . }}
|
|
38
|
+
{{ include "agentevals.selectorLabels" . }}
|
|
39
|
+
{{- if .Chart.AppVersion }}
|
|
40
|
+
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
|
|
41
|
+
{{- end }}
|
|
42
|
+
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
|
43
|
+
app.kubernetes.io/part-of: agentevals
|
|
44
|
+
{{- end }}
|
|
45
|
+
|
|
46
|
+
{{- define "agentevals.selectorLabels" -}}
|
|
47
|
+
app.kubernetes.io/name: {{ include "agentevals.name" . }}
|
|
48
|
+
app.kubernetes.io/instance: {{ .Release.Name }}
|
|
49
|
+
{{- end }}
|
|
50
|
+
|
|
51
|
+
{{- define "agentevals.serviceAccountName" -}}
|
|
52
|
+
{{- if .Values.serviceAccount.create }}
|
|
53
|
+
{{- default (include "agentevals.fullname" .) .Values.serviceAccount.name }}
|
|
54
|
+
{{- else }}
|
|
55
|
+
{{- default "default" .Values.serviceAccount.name }}
|
|
56
|
+
{{- end }}
|
|
57
|
+
{{- end }}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
apiVersion: apps/v1
|
|
2
|
+
kind: Deployment
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ include "agentevals.fullname" . }}
|
|
5
|
+
namespace: {{ include "agentevals.namespace" . }}
|
|
6
|
+
labels:
|
|
7
|
+
{{- include "agentevals.labels" . | nindent 4 }}
|
|
8
|
+
spec:
|
|
9
|
+
replicas: {{ .Values.replicaCount }}
|
|
10
|
+
selector:
|
|
11
|
+
matchLabels:
|
|
12
|
+
{{- include "agentevals.selectorLabels" . | nindent 6 }}
|
|
13
|
+
template:
|
|
14
|
+
metadata:
|
|
15
|
+
{{- with .Values.podAnnotations }}
|
|
16
|
+
annotations:
|
|
17
|
+
{{- toYaml . | nindent 8 }}
|
|
18
|
+
{{- end }}
|
|
19
|
+
labels:
|
|
20
|
+
{{- include "agentevals.selectorLabels" . | nindent 8 }}
|
|
21
|
+
{{- with .Values.podLabels }}
|
|
22
|
+
{{- toYaml . | nindent 8 }}
|
|
23
|
+
{{- end }}
|
|
24
|
+
spec:
|
|
25
|
+
{{- with .Values.imagePullSecrets }}
|
|
26
|
+
imagePullSecrets:
|
|
27
|
+
{{- toYaml . | nindent 8 }}
|
|
28
|
+
{{- end }}
|
|
29
|
+
securityContext:
|
|
30
|
+
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
|
31
|
+
serviceAccountName: {{ include "agentevals.serviceAccountName" . }}
|
|
32
|
+
{{- if .Values.ephemeralVolume.enabled }}
|
|
33
|
+
volumes:
|
|
34
|
+
- name: agentevals-tmp
|
|
35
|
+
{{- if or .Values.ephemeralVolume.sizeLimit (eq .Values.ephemeralVolume.medium "Memory") }}
|
|
36
|
+
emptyDir:
|
|
37
|
+
{{- if eq .Values.ephemeralVolume.medium "Memory" }}
|
|
38
|
+
medium: Memory
|
|
39
|
+
{{- end }}
|
|
40
|
+
{{- with .Values.ephemeralVolume.sizeLimit }}
|
|
41
|
+
sizeLimit: {{ . }}
|
|
42
|
+
{{- end }}
|
|
43
|
+
{{- else }}
|
|
44
|
+
emptyDir: {}
|
|
45
|
+
{{- end }}
|
|
46
|
+
{{- end }}
|
|
47
|
+
containers:
|
|
48
|
+
- name: agentevals
|
|
49
|
+
image: {{ include "agentevals.image" . | quote }}
|
|
50
|
+
imagePullPolicy: {{ .Values.image.pullPolicy | default .Values.imagePullPolicy }}
|
|
51
|
+
{{- if .Values.command }}
|
|
52
|
+
command:
|
|
53
|
+
{{- toYaml .Values.command | nindent 12 }}
|
|
54
|
+
{{- end }}
|
|
55
|
+
{{- if .Values.args }}
|
|
56
|
+
args:
|
|
57
|
+
{{- toYaml .Values.args | nindent 12 }}
|
|
58
|
+
{{- end }}
|
|
59
|
+
env:
|
|
60
|
+
- name: AGENTEVALS_SERVER_URL
|
|
61
|
+
value: "http://127.0.0.1:{{ .Values.service.http.containerPort }}"
|
|
62
|
+
{{- if .Values.ephemeralVolume.enabled }}
|
|
63
|
+
- name: TMPDIR
|
|
64
|
+
value: "/tmp"
|
|
65
|
+
- name: HOME
|
|
66
|
+
value: "/tmp/agentevals-home"
|
|
67
|
+
{{- end }}
|
|
68
|
+
{{- with .Values.env }}
|
|
69
|
+
{{- toYaml . | nindent 12 }}
|
|
70
|
+
{{- end }}
|
|
71
|
+
{{- with .Values.envFrom }}
|
|
72
|
+
envFrom:
|
|
73
|
+
{{- toYaml . | nindent 12 }}
|
|
74
|
+
{{- end }}
|
|
75
|
+
ports:
|
|
76
|
+
- name: http
|
|
77
|
+
containerPort: {{ .Values.service.http.containerPort }}
|
|
78
|
+
protocol: TCP
|
|
79
|
+
- name: otlp-http
|
|
80
|
+
containerPort: {{ .Values.service.otlpHttp.containerPort }}
|
|
81
|
+
protocol: TCP
|
|
82
|
+
- name: mcp
|
|
83
|
+
containerPort: {{ .Values.service.mcp.containerPort }}
|
|
84
|
+
protocol: TCP
|
|
85
|
+
resources:
|
|
86
|
+
{{- toYaml .Values.resources | nindent 12 }}
|
|
87
|
+
securityContext:
|
|
88
|
+
{{- $sc := deepCopy .Values.securityContext }}
|
|
89
|
+
{{- if not .Values.ephemeralVolume.enabled }}
|
|
90
|
+
{{- $_ := set $sc "readOnlyRootFilesystem" false }}
|
|
91
|
+
{{- end }}
|
|
92
|
+
{{- toYaml $sc | nindent 12 }}
|
|
93
|
+
startupProbe:
|
|
94
|
+
httpGet:
|
|
95
|
+
path: /api/health
|
|
96
|
+
port: http
|
|
97
|
+
failureThreshold: 60
|
|
98
|
+
periodSeconds: 10
|
|
99
|
+
timeoutSeconds: 5
|
|
100
|
+
readinessProbe:
|
|
101
|
+
httpGet:
|
|
102
|
+
path: /api/health
|
|
103
|
+
port: http
|
|
104
|
+
initialDelaySeconds: 5
|
|
105
|
+
periodSeconds: 10
|
|
106
|
+
livenessProbe:
|
|
107
|
+
httpGet:
|
|
108
|
+
path: /api/health
|
|
109
|
+
port: http
|
|
110
|
+
initialDelaySeconds: 15
|
|
111
|
+
periodSeconds: 20
|
|
112
|
+
{{- if .Values.ephemeralVolume.enabled }}
|
|
113
|
+
volumeMounts:
|
|
114
|
+
- name: agentevals-tmp
|
|
115
|
+
mountPath: /tmp
|
|
116
|
+
{{- end }}
|
|
117
|
+
{{- with .Values.nodeSelector }}
|
|
118
|
+
nodeSelector:
|
|
119
|
+
{{- toYaml . | nindent 8 }}
|
|
120
|
+
{{- end }}
|
|
121
|
+
{{- with .Values.affinity }}
|
|
122
|
+
affinity:
|
|
123
|
+
{{- toYaml . | nindent 8 }}
|
|
124
|
+
{{- end }}
|
|
125
|
+
{{- with .Values.tolerations }}
|
|
126
|
+
tolerations:
|
|
127
|
+
{{- toYaml . | nindent 8 }}
|
|
128
|
+
{{- end }}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: Service
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ include "agentevals.fullname" . }}
|
|
5
|
+
namespace: {{ include "agentevals.namespace" . }}
|
|
6
|
+
labels:
|
|
7
|
+
{{- include "agentevals.labels" . | nindent 4 }}
|
|
8
|
+
spec:
|
|
9
|
+
type: {{ .Values.service.type }}
|
|
10
|
+
ports:
|
|
11
|
+
- name: http
|
|
12
|
+
port: {{ .Values.service.http.port }}
|
|
13
|
+
targetPort: http
|
|
14
|
+
protocol: TCP
|
|
15
|
+
- name: otlp-http
|
|
16
|
+
port: {{ .Values.service.otlpHttp.port }}
|
|
17
|
+
targetPort: otlp-http
|
|
18
|
+
protocol: TCP
|
|
19
|
+
- name: mcp
|
|
20
|
+
port: {{ .Values.service.mcp.port }}
|
|
21
|
+
targetPort: mcp
|
|
22
|
+
protocol: TCP
|
|
23
|
+
selector:
|
|
24
|
+
{{- include "agentevals.selectorLabels" . | nindent 4 }}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{{- if .Values.serviceAccount.create -}}
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
kind: ServiceAccount
|
|
4
|
+
metadata:
|
|
5
|
+
name: {{ include "agentevals.serviceAccountName" . }}
|
|
6
|
+
namespace: {{ include "agentevals.namespace" . }}
|
|
7
|
+
labels:
|
|
8
|
+
{{- include "agentevals.labels" . | nindent 4 }}
|
|
9
|
+
{{- with .Values.serviceAccount.annotations }}
|
|
10
|
+
annotations:
|
|
11
|
+
{{- toYaml . | nindent 4 }}
|
|
12
|
+
{{- end }}
|
|
13
|
+
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
|
|
14
|
+
{{- end }}
|