PyPI - agentevals-cli - Versions diffs - 0.5.2__tar.gz - Mend

agentevals-cli 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (201) hide show

agentevals_cli-0.5.2/.claude/skills/eval/SKILL.md +83 -0
agentevals_cli-0.5.2/.claude/skills/eval/evals/evals.json +46 -0
agentevals_cli-0.5.2/.claude/skills/inspect/SKILL.md +49 -0
agentevals_cli-0.5.2/.claude/skills/inspect/evals/evals.json +25 -0
agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/bug_report.yml +114 -0
agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/config.yml +1 -0
agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/feature_request.yml +39 -0
agentevals_cli-0.5.2/.github/workflows/ci.yml +56 -0
agentevals_cli-0.5.2/.github/workflows/publish-evaluator-sdk.yml +44 -0
agentevals_cli-0.5.2/.github/workflows/release.yml +79 -0
agentevals_cli-0.5.2/.gitignore +57 -0
agentevals_cli-0.5.2/.mcp.json +8 -0
agentevals_cli-0.5.2/CONTRIBUTING.md +197 -0
agentevals_cli-0.5.2/DEVELOPMENT.md +143 -0
agentevals_cli-0.5.2/LICENSE +201 -0
agentevals_cli-0.5.2/Makefile +61 -0
agentevals_cli-0.5.2/PKG-INFO +22 -0
agentevals_cli-0.5.2/README.md +248 -0
agentevals_cli-0.5.2/docs/assets/logo-color.png +0 -0
agentevals_cli-0.5.2/docs/custom-evaluators.md +439 -0
agentevals_cli-0.5.2/docs/eval-set-format.md +244 -0
agentevals_cli-0.5.2/docs/otel-compatibility.md +90 -0
agentevals_cli-0.5.2/docs/streaming.md +316 -0
agentevals_cli-0.5.2/examples/README.md +224 -0
agentevals_cli-0.5.2/examples/custom_evaluators/eval_config.yaml +35 -0
agentevals_cli-0.5.2/examples/custom_evaluators/response_quality.py +57 -0
agentevals_cli-0.5.2/examples/custom_evaluators/tool_call_checker.py +36 -0
agentevals_cli-0.5.2/examples/dice_agent/README.md +186 -0
agentevals_cli-0.5.2/examples/dice_agent/agent.py +67 -0
agentevals_cli-0.5.2/examples/dice_agent/eval_set.json +127 -0
agentevals_cli-0.5.2/examples/dice_agent/main.py +139 -0
agentevals_cli-0.5.2/examples/dice_agent/test_streaming.py +63 -0
agentevals_cli-0.5.2/examples/langchain_agent/README.md +230 -0
agentevals_cli-0.5.2/examples/langchain_agent/agent.py +62 -0
agentevals_cli-0.5.2/examples/langchain_agent/eval_set.json +29 -0
agentevals_cli-0.5.2/examples/langchain_agent/main.py +180 -0
agentevals_cli-0.5.2/examples/langchain_agent/requirements.txt +5 -0
agentevals_cli-0.5.2/examples/langchain_agent/test_streaming.py +79 -0
agentevals_cli-0.5.2/examples/sdk_example/async_example.py +61 -0
agentevals_cli-0.5.2/examples/sdk_example/context_manager_example.py +35 -0
agentevals_cli-0.5.2/examples/sdk_example/decorator_example.py +37 -0
agentevals_cli-0.5.2/examples/sdk_example/requirements.txt +4 -0
agentevals_cli-0.5.2/examples/strands_agent/agent.py +68 -0
agentevals_cli-0.5.2/examples/strands_agent/eval_set.json +127 -0
agentevals_cli-0.5.2/examples/strands_agent/main.py +138 -0
agentevals_cli-0.5.2/examples/strands_agent/requirements.txt +2 -0
agentevals_cli-0.5.2/examples/zero-code-examples/adk/requirements.txt +5 -0
agentevals_cli-0.5.2/examples/zero-code-examples/adk/run.py +88 -0
agentevals_cli-0.5.2/examples/zero-code-examples/langchain/requirements.txt +7 -0
agentevals_cli-0.5.2/examples/zero-code-examples/langchain/run.py +110 -0
agentevals_cli-0.5.2/examples/zero-code-examples/strands/requirements.txt +5 -0
agentevals_cli-0.5.2/examples/zero-code-examples/strands/run.py +72 -0
agentevals_cli-0.5.2/flake.lock +154 -0
agentevals_cli-0.5.2/flake.nix +96 -0
agentevals_cli-0.5.2/packages/evaluator-sdk-py/README.md +45 -0
agentevals_cli-0.5.2/packages/evaluator-sdk-py/pyproject.toml +32 -0
agentevals_cli-0.5.2/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +43 -0
agentevals_cli-0.5.2/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +77 -0
agentevals_cli-0.5.2/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +84 -0
agentevals_cli-0.5.2/pyproject.toml +86 -0
agentevals_cli-0.5.2/samples/eval_set_helm.json +51 -0
agentevals_cli-0.5.2/samples/evalset_helm_3_2026-02-23.json +59 -0
agentevals_cli-0.5.2/samples/evalset_k8s_2026-02-20.json +36 -0
agentevals_cli-0.5.2/samples/helm.json +5333 -0
agentevals_cli-0.5.2/samples/helm_2.json +3708 -0
agentevals_cli-0.5.2/samples/helm_3.json +7182 -0
agentevals_cli-0.5.2/samples/k8s.json +3908 -0
agentevals_cli-0.5.2/src/agentevals/__init__.py +16 -0
agentevals_cli-0.5.2/src/agentevals/_protocol.py +83 -0
agentevals_cli-0.5.2/src/agentevals/api/__init__.py +0 -0
agentevals_cli-0.5.2/src/agentevals/api/app.py +137 -0
agentevals_cli-0.5.2/src/agentevals/api/debug_routes.py +268 -0
agentevals_cli-0.5.2/src/agentevals/api/models.py +204 -0
agentevals_cli-0.5.2/src/agentevals/api/otlp_app.py +25 -0
agentevals_cli-0.5.2/src/agentevals/api/otlp_routes.py +383 -0
agentevals_cli-0.5.2/src/agentevals/api/routes.py +554 -0
agentevals_cli-0.5.2/src/agentevals/api/streaming_routes.py +373 -0
agentevals_cli-0.5.2/src/agentevals/builtin_metrics.py +234 -0
agentevals_cli-0.5.2/src/agentevals/cli.py +643 -0
agentevals_cli-0.5.2/src/agentevals/config.py +108 -0
agentevals_cli-0.5.2/src/agentevals/converter.py +328 -0
agentevals_cli-0.5.2/src/agentevals/custom_evaluators.py +468 -0
agentevals_cli-0.5.2/src/agentevals/eval_config_loader.py +147 -0
agentevals_cli-0.5.2/src/agentevals/evaluator/__init__.py +24 -0
agentevals_cli-0.5.2/src/agentevals/evaluator/resolver.py +70 -0
agentevals_cli-0.5.2/src/agentevals/evaluator/sources.py +293 -0
agentevals_cli-0.5.2/src/agentevals/evaluator/templates.py +224 -0
agentevals_cli-0.5.2/src/agentevals/extraction.py +444 -0
agentevals_cli-0.5.2/src/agentevals/genai_converter.py +538 -0
agentevals_cli-0.5.2/src/agentevals/loader/__init__.py +7 -0
agentevals_cli-0.5.2/src/agentevals/loader/base.py +53 -0
agentevals_cli-0.5.2/src/agentevals/loader/jaeger.py +112 -0
agentevals_cli-0.5.2/src/agentevals/loader/otlp.py +193 -0
agentevals_cli-0.5.2/src/agentevals/mcp_server.py +236 -0
agentevals_cli-0.5.2/src/agentevals/output.py +204 -0
agentevals_cli-0.5.2/src/agentevals/runner.py +310 -0
agentevals_cli-0.5.2/src/agentevals/sdk.py +433 -0
agentevals_cli-0.5.2/src/agentevals/streaming/__init__.py +120 -0
agentevals_cli-0.5.2/src/agentevals/streaming/incremental_processor.py +337 -0
agentevals_cli-0.5.2/src/agentevals/streaming/processor.py +285 -0
agentevals_cli-0.5.2/src/agentevals/streaming/session.py +36 -0
agentevals_cli-0.5.2/src/agentevals/streaming/ws_server.py +806 -0
agentevals_cli-0.5.2/src/agentevals/trace_attrs.py +32 -0
agentevals_cli-0.5.2/src/agentevals/trace_metrics.py +126 -0
agentevals_cli-0.5.2/src/agentevals/utils/__init__.py +0 -0
agentevals_cli-0.5.2/src/agentevals/utils/genai_messages.py +142 -0
agentevals_cli-0.5.2/src/agentevals/utils/log_buffer.py +43 -0
agentevals_cli-0.5.2/src/agentevals/utils/log_enrichment.py +187 -0
agentevals_cli-0.5.2/tests/integration/__init__.py +0 -0
agentevals_cli-0.5.2/tests/integration/conftest.py +341 -0
agentevals_cli-0.5.2/tests/integration/test_evaluation_pipeline.py +131 -0
agentevals_cli-0.5.2/tests/integration/test_live_agents.py +348 -0
agentevals_cli-0.5.2/tests/integration/test_session_grouping.py +509 -0
agentevals_cli-0.5.2/tests/integration/test_timing_stress.py +346 -0
agentevals_cli-0.5.2/tests/test_api.py +1033 -0
agentevals_cli-0.5.2/tests/test_converter.py +403 -0
agentevals_cli-0.5.2/tests/test_extraction.py +513 -0
agentevals_cli-0.5.2/tests/test_genai_converter.py +992 -0
agentevals_cli-0.5.2/tests/test_jaeger_loader.py +146 -0
agentevals_cli-0.5.2/tests/test_log_enrichment.py +197 -0
agentevals_cli-0.5.2/tests/test_otlp_loader.py +210 -0
agentevals_cli-0.5.2/tests/test_otlp_receiver.py +1659 -0
agentevals_cli-0.5.2/tests/test_protocol.py +25 -0
agentevals_cli-0.5.2/tests/test_runner.py +166 -0
agentevals_cli-0.5.2/tests/test_sdk.py +472 -0
agentevals_cli-0.5.2/ui/.gitignore +24 -0
agentevals_cli-0.5.2/ui/README.md +34 -0
agentevals_cli-0.5.2/ui/eslint.config.js +23 -0
agentevals_cli-0.5.2/ui/index.html +13 -0
agentevals_cli-0.5.2/ui/package-lock.json +4691 -0
agentevals_cli-0.5.2/ui/package.json +38 -0
agentevals_cli-0.5.2/ui/public/logo.svg +13 -0
agentevals_cli-0.5.2/ui/public/vite.svg +1 -0
agentevals_cli-0.5.2/ui/src/App.css +42 -0
agentevals_cli-0.5.2/ui/src/App.tsx +45 -0
agentevals_cli-0.5.2/ui/src/api/client.ts +267 -0
agentevals_cli-0.5.2/ui/src/assets/react.svg +1 -0
agentevals_cli-0.5.2/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +331 -0
agentevals_cli-0.5.2/ui/src/components/annotation-queue/AnnotationQueueView.tsx +332 -0
agentevals_cli-0.5.2/ui/src/components/annotation-queue/AnnotationTable.tsx +360 -0
agentevals_cli-0.5.2/ui/src/components/bug-report/BugReportModal.tsx +367 -0
agentevals_cli-0.5.2/ui/src/components/builder/BuilderHeader.tsx +94 -0
agentevals_cli-0.5.2/ui/src/components/builder/BuilderView.tsx +98 -0
agentevals_cli-0.5.2/ui/src/components/builder/EvalCaseCard.tsx +153 -0
agentevals_cli-0.5.2/ui/src/components/builder/EvalCasesList.tsx +91 -0
agentevals_cli-0.5.2/ui/src/components/builder/InvocationEditor.tsx +123 -0
agentevals_cli-0.5.2/ui/src/components/builder/JsonPreview.tsx +77 -0
agentevals_cli-0.5.2/ui/src/components/builder/MetadataEditor.tsx +79 -0
agentevals_cli-0.5.2/ui/src/components/builder/TraceUploadZone.tsx +194 -0
agentevals_cli-0.5.2/ui/src/components/builder/index.ts +7 -0
agentevals_cli-0.5.2/ui/src/components/dashboard/DashboardView.tsx +213 -0
agentevals_cli-0.5.2/ui/src/components/dashboard/MetricScoreCard.tsx +149 -0
agentevals_cli-0.5.2/ui/src/components/dashboard/PerformanceCard.tsx +88 -0
agentevals_cli-0.5.2/ui/src/components/dashboard/PerformanceCharts.tsx +254 -0
agentevals_cli-0.5.2/ui/src/components/dashboard/SummaryStats.tsx +149 -0
agentevals_cli-0.5.2/ui/src/components/dashboard/TraceCard.tsx +278 -0
agentevals_cli-0.5.2/ui/src/components/dashboard/TraceTable.tsx +552 -0
agentevals_cli-0.5.2/ui/src/components/inspector/ComparisonPanel.tsx +180 -0
agentevals_cli-0.5.2/ui/src/components/inspector/DataSection.tsx +125 -0
agentevals_cli-0.5.2/ui/src/components/inspector/InspectorHeader.tsx +309 -0
agentevals_cli-0.5.2/ui/src/components/inspector/InspectorLayout.tsx +139 -0
agentevals_cli-0.5.2/ui/src/components/inspector/InspectorView.tsx +337 -0
agentevals_cli-0.5.2/ui/src/components/inspector/InvocationCard.tsx +180 -0
agentevals_cli-0.5.2/ui/src/components/inspector/InvocationSummaryPanel.tsx +282 -0
agentevals_cli-0.5.2/ui/src/components/inspector/MetricResultsSection.tsx +286 -0
agentevals_cli-0.5.2/ui/src/components/inspector/MetricsComparisonSection.tsx +645 -0
agentevals_cli-0.5.2/ui/src/components/inspector/PerformanceSection.tsx +94 -0
agentevals_cli-0.5.2/ui/src/components/inspector/ToolCallList.tsx +229 -0
agentevals_cli-0.5.2/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +222 -0
agentevals_cli-0.5.2/ui/src/components/sidebar/Sidebar.tsx +226 -0
agentevals_cli-0.5.2/ui/src/components/streaming/LiveConversationPanel.tsx +123 -0
agentevals_cli-0.5.2/ui/src/components/streaming/LiveMessage.tsx +146 -0
agentevals_cli-0.5.2/ui/src/components/streaming/LiveStreamingView.tsx +823 -0
agentevals_cli-0.5.2/ui/src/components/streaming/SessionCard.tsx +573 -0
agentevals_cli-0.5.2/ui/src/components/streaming/SessionMetadata.tsx +78 -0
agentevals_cli-0.5.2/ui/src/components/upload/EvalSetEditorDrawer.tsx +308 -0
agentevals_cli-0.5.2/ui/src/components/upload/FileDropZone.tsx +88 -0
agentevals_cli-0.5.2/ui/src/components/upload/MetricSelector.tsx +228 -0
agentevals_cli-0.5.2/ui/src/components/upload/RawJsonPreview.tsx +91 -0
agentevals_cli-0.5.2/ui/src/components/upload/TraceEditorDrawer.tsx +357 -0
agentevals_cli-0.5.2/ui/src/components/upload/UploadView.tsx +567 -0
agentevals_cli-0.5.2/ui/src/components/welcome/WelcomeView.tsx +152 -0
agentevals_cli-0.5.2/ui/src/config.ts +28 -0
agentevals_cli-0.5.2/ui/src/context/TraceContext.tsx +90 -0
agentevals_cli-0.5.2/ui/src/context/TraceProvider.tsx +379 -0
agentevals_cli-0.5.2/ui/src/index.css +256 -0
agentevals_cli-0.5.2/ui/src/lib/console-capture.ts +51 -0
agentevals_cli-0.5.2/ui/src/lib/evalset-builder.ts +131 -0
agentevals_cli-0.5.2/ui/src/lib/network-capture.ts +62 -0
agentevals_cli-0.5.2/ui/src/lib/trace-converter.ts +734 -0
agentevals_cli-0.5.2/ui/src/lib/trace-loader.ts +249 -0
agentevals_cli-0.5.2/ui/src/lib/trace-metadata.ts +391 -0
agentevals_cli-0.5.2/ui/src/lib/trace-patcher.ts +328 -0
agentevals_cli-0.5.2/ui/src/lib/types.ts +421 -0
agentevals_cli-0.5.2/ui/src/lib/utils.ts +223 -0
agentevals_cli-0.5.2/ui/src/main.tsx +15 -0
agentevals_cli-0.5.2/ui/tsconfig.app.json +29 -0
agentevals_cli-0.5.2/ui/tsconfig.json +7 -0
agentevals_cli-0.5.2/ui/tsconfig.node.json +26 -0
agentevals_cli-0.5.2/ui/vite.config.ts +14 -0
agentevals_cli-0.5.2/uv.lock +4098 -0

agentevals_cli-0.5.2/.claude/skills/eval/SKILL.md ADDED Viewed

@@ -0,0 +1,83 @@
+---
+name: eval
+description: >
+  Evaluate and score agent behavior against a golden reference. Use this skill
+  whenever the user wants to run evaluation, check pass/fail status, understand
+  metric scores, compare sessions for regressions, validate agent behavior, or
+  score a trace from a file or a live session. Trigger on phrases like "eval this
+  trace", "check my agent output", "did my agent do the right thing", "compare runs",
+  "did my agent regress", "score session X", "evaluate against golden", "run evals".
+  Works with both local trace files and live streaming sessions.
+---
+Evaluate agent behavior and explain what the scores mean.
+## Determine the input type
+First, figure out what to evaluate:
+- **Trace file(s)** — user mentions a `.json` or `.jsonl` file path → use `evaluate_traces`
+- **Sessions vs golden** — user has multiple live sessions and wants regression testing → use `evaluate_sessions`
+- **Single live session** — user wants to score one session against a golden eval set → guide them to use `evaluate_sessions` with one session as golden
+## Evaluating trace files
+1. Get the file path(s). Check the extension:
+   `.jsonl` → `trace_format: "otlp-json"` | `.json` → `"jaeger-json"` (default)
+2. Ask if they have a golden eval set JSON. For `tool_trajectory_avg_score` (the
+   default metric), an eval set is required — it provides the expected tool call
+   sequence to compare against. If they don't have one yet, explain this and suggest
+   starting with `hallucinations_v1`, or ask if they want to create a golden set from
+   a reference run first.
+3. Call `evaluate_traces` with the file(s), format, and eval set.
+4. Present results as a score table (see Score interpretation below) and explain failures.
+## Evaluating sessions (regression testing)
+This workflow requires the server to be running with the `--dev` flag (which enables
+WebSocket and session streaming). Plain `agentevals serve` will not have sessions.
+If you get a connection error from any tool below, tell the user:
+```bash
+uv run agentevals serve --dev
+```
+1. Call `list_sessions` to show available sessions.
+2. Help the user identify the "golden" session — the reference run that represents
+   correct behavior. The server derives the eval set from it automatically.
+3. Call `evaluate_sessions(golden_session_id=...)`. This scores all other completed
+   sessions against the golden.
+4. Present a comparison table:
+   ```
+   Session             | Score | Status  | Delta
+   session-abc (golden)| 1.00  | —       | baseline
+   session-def         | 0.85  | PASSED  | -0.15
+   session-ghi         | 0.40  | FAILED  | -0.60 ⚠️
+   ```
+5. Explain regressions specifically: which tools the golden called that a failing
+   session skipped, or unexpected extra calls. Concrete tool names are more useful
+   than just quoting the score.
+## Score interpretation
+| Score | Meaning |
+|-------|---------|
+| 1.0 | Exact match — right tools, right order |
+| 0.7 – <1.0 | Minor deviations (extra call or slightly different args) |
+| 0.5 – <0.7 | Partial match — some turns correct, others missing or wrong tool calls |
+| 0.0 – <0.5 | Major divergence — most tool calls don't match golden |
+**Important:** `evalStatus: PASSED` does **not** mean the agent did well — it only means
+the score met the configured threshold. Without a configured threshold, every session
+shows PASSED regardless of score. Focus on the numeric score, not the status label.
+## After results
+If the user wants to understand *what the agent did* step by step (not just the score),
+suggest `/inspect` to get a readable narrative of a session.

agentevals_cli-0.5.2/.claude/skills/eval/evals/evals.json ADDED Viewed

@@ -0,0 +1,46 @@
+{
+  "skill_name": "eval",
+  "evals": [
+    {
+      "id": 1,
+      "prompt": "I ran the dice agent twice and both sessions are on the agentevals server. Session IDs: dice-agent-gemini-2.5-flash-lite-20260306-160809-03676 and dice-agent-gemini-2.5-flash-lite-20260306-160828-64013. Did the second run regress compared to the first?",
+      "expected_output": "Lists sessions, uses first as golden, calls evaluate_sessions, presents comparison table with scores, explains which tool call changed",
+      "assertions": [
+        {"text": "Response attempts to call list_sessions or evaluate_sessions MCP tool (or explains it would do so if the server were accessible)"},
+        {"text": "Response identifies a 'golden' session concept — the reference run to compare against"},
+        {"text": "Response presents or describes a comparison table with scores and delta for both sessions"},
+        {"text": "Response mentions /inspect or suggests inspecting the session to understand what changed"}
+      ]
+    },
+    {
+      "id": 2,
+      "prompt": "Evaluate my agent trace at /tmp/run_42.jsonl against the golden set /tmp/golden.json",
+      "expected_output": "Detects otlp-json format from .jsonl extension, attempts to run eval, explains scores or handles missing file",
+      "assertions": [
+        {"text": "Response identifies the .jsonl extension as OTLP format (otlp-json), not jaeger-json"},
+        {"text": "Response handles the missing file gracefully (does not silently fail or crash without explanation)"},
+        {"text": "Response explains the format difference between .jsonl (OTLP) and .json (Jaeger) or at minimum names the correct format for each"}
+      ]
+    },
+    {
+      "id": 3,
+      "prompt": "I want to check if my dice agent behaved correctly in the session dice-agent-gemini-2.5-flash-lite-20260306-160828-64013. I think something went wrong in the last turn. Score it for me.",
+      "expected_output": "Identifies this as a session, calls evaluate_sessions using the other session as golden, shows score, explains the regression in the final turn",
+      "assertions": [
+        {"text": "Response attempts to call list_sessions or evaluate_sessions (or explains the workflow if tools unavailable)"},
+        {"text": "Response explains that a golden/reference session is needed to compare against"},
+        {"text": "Response describes what a regression in the last turn would look like (missing tool call, wrong tool, or score below 1.0)"}
+      ]
+    },
+    {
+      "id": 4,
+      "prompt": "Run eval on the dice agent session dice-agent-gemini-2.5-flash-lite-20260306-160809-03676 but I don't have an eval set yet",
+      "expected_output": "Explains that for session-based eval the server auto-derives the eval set from a golden session, suggests the other dice session as golden, or offers metric alternatives",
+      "assertions": [
+        {"text": "Response explains that tool_trajectory_avg_score requires a reference (eval set or golden session)"},
+        {"text": "Response suggests at least one path forward: use another session as golden, use hallucinations_v1, or create an eval set"},
+        {"text": "Response gives the user a concrete next step rather than just describing the problem"}
+      ]
+    }
+  ]
+}

agentevals_cli-0.5.2/.claude/skills/inspect/SKILL.md ADDED Viewed

@@ -0,0 +1,49 @@
+---
+name: inspect
+description: >
+  Inspect and debug live streaming agent sessions to understand what the agent did.
+  Use this skill when the user wants to see what happened in a recent agent run,
+  trace through the tool calls step by step, debug unexpected agent behavior, or
+  get a readable narrative of a session. Trigger on phrases like "show me what my
+  agent did", "inspect session", "what happened in that run", "debug my agent",
+  "trace through session", "walk me through the last run", "what tools did it call".
+  Requires `agentevals serve --dev` to be running. For scoring/pass-fail evaluation,
+  use /eval instead.
+---
+Help the user understand what happened in a live streaming agent session.
+If `list_sessions` or `summarize_session` fail with a connection error, the server
+is not running. Tell the user:
+```bash
+uv run agentevals serve --dev
+```
+The `--dev` flag is required — plain `serve` does not enable the WebSocket endpoint
+that sessions stream to. Also note: sessions are in-memory only with a **2-hour TTL**.
+If the server was restarted or the session is older than 2 hours, it is gone.
+1. **List sessions** with `list_sessions`. Show session ID, completion status,
+   span count, and start time. Default to the most recent completed session
+   unless the user specifies one.
+2. **Summarize** with `summarize_session`. This converts raw OTLP spans into
+   structured invocations with user messages, tool calls, and agent responses.
+3. **Present as a readable narrative.** For each turn:
+   ```
+   Turn 1:
+     User: [what the user asked]
+     Tools: tool_name(arg=val, ...) → [one-line description of what this achieves]
+     Response: [response text, truncated if long]
+   ```
+4. **Flag anything worth investigating:**
+   - Turns with no tool calls when tools would be expected
+   - Empty or very short response after a long tool chain
+   - Same tool called repeatedly in one turn (possible loop)
+   - Abrupt stop mid-conversation
+5. **Offer next steps.** If the user wants to *score* the session against a golden
+   reference (pass/fail), suggest `/eval` with the session ID or a comparison run.

agentevals_cli-0.5.2/.claude/skills/inspect/evals/evals.json ADDED Viewed

@@ -0,0 +1,25 @@
+{
+  "skill_name": "inspect",
+  "evals": [
+    {
+      "id": 1,
+      "prompt": "I just ran the dice agent and it streamed to the agentevals server. Show me what it did.",
+      "expected_output": "Lists sessions, picks most recent dice session, summarizes as readable turn-by-turn narrative showing roll_die and check_prime tool calls",
+      "assertions": [
+        {"text": "Response attempts to call list_sessions MCP tool (does not skip straight to explaining the error)"},
+        {"text": "Response provides the exact command to start the server (agentevals serve --dev) if connection fails"},
+        {"text": "Response describes what the session summary would include (tool calls, user messages, narrative format)"}
+      ]
+    },
+    {
+      "id": 2,
+      "prompt": "Walk me through session dice-agent-gemini-2.5-flash-lite-20260306-160809-03676 — what tools did it call and did anything look off?",
+      "expected_output": "Calls summarize_session with that exact ID, presents turn-by-turn narrative with roll_die(sides=20) and check_prime(nums=[8]) tool calls, notes the empty response in turn 2",
+      "assertions": [
+        {"text": "Response attempts to call summarize_session with session ID dice-agent-gemini-2.5-flash-lite-20260306-160809-03676"},
+        {"text": "Response presents or describes a turn-by-turn narrative format (not just a list of tool names)"},
+        {"text": "Response mentions or would flag the empty response after roll_die as potentially worth noting"}
+      ]
+    }
+  ]
+}

agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

@@ -0,0 +1,114 @@
+name: Bug Report
+description: Report a bug or unexpected behavior in agentevals
+labels: ["bug"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thank you for taking the time to report a bug. Please fill out the sections below so we can reproduce and fix the issue.
+  - type: textarea
+    id: description
+    attributes:
+      label: Description
+      description: What happened? Describe the bug clearly.
+      placeholder: When I run X, Y happens instead of Z...
+    validations:
+      required: true
+  - type: textarea
+    id: expected
+    attributes:
+      label: Expected behavior
+      description: What did you expect to happen?
+    validations:
+      required: true
+  - type: textarea
+    id: steps
+    attributes:
+      label: Steps to reproduce
+      description: Minimal steps to reproduce the issue.
+      placeholder: |
+        1. Run `agentevals serve --dev`
+        2. Connect an agent via WebSocket
+        3. ...
+    validations:
+      required: true
+  - type: dropdown
+    id: usage-mode
+    attributes:
+      label: How are you using agentevals?
+      options:
+        - Live streaming (WebSocket / OTLP)
+        - CLI file evaluation (`agentevals run`)
+        - MCP server
+        - Other
+    validations:
+      required: true
+  - type: upload
+    id: config-dump
+    attributes:
+      label: Config dump (ZIP from the web UI)
+      description: |
+        If you are using the live web UI, click the **Bug Report** button in the sidebar (the bug icon at the bottom left).
+        This downloads a ZIP file containing your environment info, session data, and logs.
+        Drag and drop the ZIP file here.
+  - type: textarea
+    id: version-info
+    attributes:
+      label: Version information
+      description: |
+        If you are not using the web UI (or cannot generate the ZIP above), provide at minimum
+        the output of `agentevals --version` and `python --version`, your OS name and version, and the files mentioned below.
+      placeholder: |
+        agentevals version: ...
+        Python version: ...
+        OS: ...
+      render: text
+    validations:
+      required: true
+  - type: textarea
+    id: eval-config
+    attributes:
+      label: Eval config (if applicable)
+      description: |
+        If you use an `eval_config.yaml`, paste its contents here.
+        Please redact any secrets or API keys.
+      render: yaml
+  - type: upload
+    id: trace-files
+    attributes:
+      label: Trace and eval set files
+      description: |
+        If you are using the CLI (`agentevals run`) or file upload method rather than live streaming,
+        please attach your trace file(s) and eval set file(s) so we can reproduce the issue.
+        Supported formats: Jaeger JSON, OTLP JSON/JSONL, ADK EvalSet JSON.
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant logs or error output
+      description: |
+        Paste any error messages, stack traces, or log output.
+        Run with `-vv` for verbose output (e.g. `agentevals -vv run ...` or `agentevals serve --dev -vv`).
+      render: text
+  - type: textarea
+    id: additional
+    attributes:
+      label: Additional context
+      description: Anything else that might help (screenshots, related issues, workarounds you tried, etc.).
+  - type: checkboxes
+    id: human-check
+    attributes:
+      label: Human confirmation
+      options:
+        - label: I am a human (not a bot, agent, or AI) filing this issue.
+          required: true

agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/config.yml ADDED Viewed

@@ -0,0 +1 @@
+blank_issues_enabled: false

agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/feature_request.yml ADDED Viewed

@@ -0,0 +1,39 @@
+name: Feature Request
+description: Suggest a new feature or improvement
+labels: ["enhancement"]
+body:
+  - type: textarea
+    id: problem
+    attributes:
+      label: Problem or motivation
+      description: What problem does this feature solve? Why is it needed?
+    validations:
+      required: true
+  - type: textarea
+    id: solution
+    attributes:
+      label: Proposed solution
+      description: Describe how you would like this to work.
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternatives considered
+      description: Any alternative solutions or workarounds you have considered.
+  - type: textarea
+    id: additional
+    attributes:
+      label: Additional context
+      description: Anything else (screenshots, links, examples, etc.).
+  - type: checkboxes
+    id: human-check
+    attributes:
+      label: Human confirmation
+      options:
+        - label: I am a human (not a bot, agent, or AI) filing this issue.
+          required: true

agentevals_cli-0.5.2/.github/workflows/ci.yml ADDED Viewed

@@ -0,0 +1,56 @@
+# Continuous integration: Ruff lint/format checks plus the pytest suite,
+# triggered by pushes to main and by pull requests targeting main.
+name: CI
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+permissions:
+  contents: read
+jobs:
+  # Static checks: Ruff lint rules and formatting verification.
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+      - name: Install dependencies
+        run: |
+          uv venv
+          uv pip install setuptools
+          uv sync --dev
+      - name: Ruff check
+        run: uv run ruff check .
+      - name: Ruff format check
+        run: uv run ruff format --check .
+  # Test suite, run once per Python version listed in the matrix.
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Versions stay quoted so YAML keeps them as strings.
+        python-version:
+          - "3.11"
+          - "3.12"
+          - "3.13"
+    steps:
+      - uses: actions/checkout@v6
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+      - name: Install Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          uv venv --python ${{ matrix.python-version }}
+          uv pip install setuptools
+          uv sync --dev --python ${{ matrix.python-version }}
+      # The marker expression deselects tests marked `integration` or `e2e`.
+      - name: Run tests
+        run: uv run pytest -m "not integration and not e2e" --tb=short -q

agentevals_cli-0.5.2/.github/workflows/publish-evaluator-sdk.yml ADDED Viewed

@@ -0,0 +1,44 @@
+# Builds the agentevals-evaluator-sdk package and uploads it with `uv publish`
+# when an `evaluator-sdk-v*` tag is pushed, or on manual dispatch with a tag.
+name: Publish evaluator SDK
+on:
+  push:
+    tags:
+      - 'evaluator-sdk-v*'
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Release tag (e.g. evaluator-sdk-v0.1.0)'
+        required: true
+permissions:
+  contents: read
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    steps:
+      # Pinned to a release tag (same as ci.yml) instead of the moving `main`
+      # branch, so the job that handles the publish token does not pick up
+      # unreleased action changes.
+      - name: 'Checkout GitHub Action'
+        uses: actions/checkout@v6
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+      # Repo root cwd: uv build puts artifacts in ./dist; uv publish looks for dist/* relative to cwd.
+      - name: 'Release Python Packages'
+        env:
+          TAG_OR_VERSION: ${{ github.event.inputs.tag || github.ref_name }}
+          # uv publish reads its token from UV_PUBLISH_TOKEN; passing the secret
+          # through the environment keeps it out of the generated script text.
+          UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          REF="${TAG_OR_VERSION}"
+          # Accept either the full tag (evaluator-sdk-v0.1.0) or a bare version (0.1.0).
+          case "$REF" in
+            evaluator-sdk-v*) VERSION="${REF#evaluator-sdk-v}" ;;
+            *) VERSION="$REF" ;;
+          esac
+          # Only the X.Y.Z prefix is checked, so suffixes such as rc1 still pass.
+          if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+'; then
+            echo "Expected PEP 440 version (e.g. 0.1.0) or tag evaluator-sdk-v0.1.0; got: $REF"
+            exit 1
+          fi
+          uv sync --package agentevals-evaluator-sdk --all-extras
+          uv version "$VERSION" --package agentevals-evaluator-sdk
+          uv build --package agentevals-evaluator-sdk
+          uv publish dist/*

agentevals_cli-0.5.2/.github/workflows/release.yml ADDED Viewed

@@ -0,0 +1,79 @@
+# Release pipeline for `v*.*.*` tags (or manual dispatch with a tag):
+#   build          - runs `make release` and uploads the wheels as an artifact
+#   github-release - attaches those wheels to a GitHub release
+#   publish        - builds agentevals-cli with uv and uploads it via `uv publish`
+name: Release
+on:
+  push:
+    tags:
+      - 'v*.*.*'
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Release tag (e.g. v0.1.0)'
+        required: true
+permissions:
+  contents: read
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+      - uses: actions/setup-node@v6
+        with:
+          node-version: '22'
+          cache: npm
+          cache-dependency-path: ui/package-lock.json
+      - name: Build core and bundled wheels
+        run: make release
+      - uses: actions/upload-artifact@v7
+        with:
+          name: wheels
+          path: |
+            dist/core/*.whl
+            dist/bundle/*.whl
+  github-release:
+    needs: build
+    runs-on: ubuntu-latest
+    # Only this job gets write access; it is needed to create the release.
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/download-artifact@v8
+        with:
+          name: wheels
+          path: dist/
+      - uses: softprops/action-gh-release@v2.5.0
+        with:
+          tag_name: ${{ github.event.inputs.tag || github.ref_name }}
+          files: dist/**/*.whl
+          generate_release_notes: true
+  publish:
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      # Pinned to a release tag (same as the build job) instead of the moving
+      # `main` branch, so the job that handles the publish token does not pick
+      # up unreleased action changes.
+      - name: 'Checkout GitHub Action'
+        uses: actions/checkout@v6
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+      # Repo root cwd: uv build puts artifacts in ./dist; uv publish looks for dist/* relative to cwd.
+      - name: 'Release Python Packages'
+        env:
+          VERSION: ${{ github.event.inputs.tag || github.ref_name }}
+          # uv publish reads its token from UV_PUBLISH_TOKEN; passing the secret
+          # through the environment keeps it out of the generated script text.
+          UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          uv sync --package agentevals-cli --all-extras
+          # Tags look like v0.1.0; strip the leading "v" so uv gets the bare version.
+          uv version "${VERSION#v}" --package agentevals-cli
+          uv build --package agentevals-cli
+          uv publish dist/*

agentevals_cli-0.5.2/.gitignore ADDED Viewed

@@ -0,0 +1,57 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+.pytest_cache/
+.venv/
+venv/
+ENV/
+env/
+# UI (Node.js/React)
+ui/node_modules/
+ui/dist/
+ui/dist-ssr/
+ui/*.local
+# Bundled UI assets (generated by make build-bundle)
+src/agentevals/_static/
+# Logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+# Editor directories and files
+.vscode/
+!.vscode/extensions.json
+.idea/
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
+# Misc
+.env
+.env.local

agentevals_cli-0.5.2/.mcp.json ADDED Viewed

@@ -0,0 +1,8 @@
+{
+  "mcpServers": {
+    "agentevals": {
+      "command": "/usr/bin/env",
+      "args": ["uv", "run", "agentevals", "mcp"]
+    }
+  }
+}