agentevals-cli 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. agentevals_cli-0.5.2/.claude/skills/eval/SKILL.md +83 -0
  2. agentevals_cli-0.5.2/.claude/skills/eval/evals/evals.json +46 -0
  3. agentevals_cli-0.5.2/.claude/skills/inspect/SKILL.md +49 -0
  4. agentevals_cli-0.5.2/.claude/skills/inspect/evals/evals.json +25 -0
  5. agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/bug_report.yml +114 -0
  6. agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/config.yml +1 -0
  7. agentevals_cli-0.5.2/.github/ISSUE_TEMPLATE/feature_request.yml +39 -0
  8. agentevals_cli-0.5.2/.github/workflows/ci.yml +56 -0
  9. agentevals_cli-0.5.2/.github/workflows/publish-evaluator-sdk.yml +44 -0
  10. agentevals_cli-0.5.2/.github/workflows/release.yml +79 -0
  11. agentevals_cli-0.5.2/.gitignore +57 -0
  12. agentevals_cli-0.5.2/.mcp.json +8 -0
  13. agentevals_cli-0.5.2/CONTRIBUTING.md +197 -0
  14. agentevals_cli-0.5.2/DEVELOPMENT.md +143 -0
  15. agentevals_cli-0.5.2/LICENSE +201 -0
  16. agentevals_cli-0.5.2/Makefile +61 -0
  17. agentevals_cli-0.5.2/PKG-INFO +22 -0
  18. agentevals_cli-0.5.2/README.md +248 -0
  19. agentevals_cli-0.5.2/docs/assets/logo-color.png +0 -0
  20. agentevals_cli-0.5.2/docs/custom-evaluators.md +439 -0
  21. agentevals_cli-0.5.2/docs/eval-set-format.md +244 -0
  22. agentevals_cli-0.5.2/docs/otel-compatibility.md +90 -0
  23. agentevals_cli-0.5.2/docs/streaming.md +316 -0
  24. agentevals_cli-0.5.2/examples/README.md +224 -0
  25. agentevals_cli-0.5.2/examples/custom_evaluators/eval_config.yaml +35 -0
  26. agentevals_cli-0.5.2/examples/custom_evaluators/response_quality.py +57 -0
  27. agentevals_cli-0.5.2/examples/custom_evaluators/tool_call_checker.py +36 -0
  28. agentevals_cli-0.5.2/examples/dice_agent/README.md +186 -0
  29. agentevals_cli-0.5.2/examples/dice_agent/agent.py +67 -0
  30. agentevals_cli-0.5.2/examples/dice_agent/eval_set.json +127 -0
  31. agentevals_cli-0.5.2/examples/dice_agent/main.py +139 -0
  32. agentevals_cli-0.5.2/examples/dice_agent/test_streaming.py +63 -0
  33. agentevals_cli-0.5.2/examples/langchain_agent/README.md +230 -0
  34. agentevals_cli-0.5.2/examples/langchain_agent/agent.py +62 -0
  35. agentevals_cli-0.5.2/examples/langchain_agent/eval_set.json +29 -0
  36. agentevals_cli-0.5.2/examples/langchain_agent/main.py +180 -0
  37. agentevals_cli-0.5.2/examples/langchain_agent/requirements.txt +5 -0
  38. agentevals_cli-0.5.2/examples/langchain_agent/test_streaming.py +79 -0
  39. agentevals_cli-0.5.2/examples/sdk_example/async_example.py +61 -0
  40. agentevals_cli-0.5.2/examples/sdk_example/context_manager_example.py +35 -0
  41. agentevals_cli-0.5.2/examples/sdk_example/decorator_example.py +37 -0
  42. agentevals_cli-0.5.2/examples/sdk_example/requirements.txt +4 -0
  43. agentevals_cli-0.5.2/examples/strands_agent/agent.py +68 -0
  44. agentevals_cli-0.5.2/examples/strands_agent/eval_set.json +127 -0
  45. agentevals_cli-0.5.2/examples/strands_agent/main.py +138 -0
  46. agentevals_cli-0.5.2/examples/strands_agent/requirements.txt +2 -0
  47. agentevals_cli-0.5.2/examples/zero-code-examples/adk/requirements.txt +5 -0
  48. agentevals_cli-0.5.2/examples/zero-code-examples/adk/run.py +88 -0
  49. agentevals_cli-0.5.2/examples/zero-code-examples/langchain/requirements.txt +7 -0
  50. agentevals_cli-0.5.2/examples/zero-code-examples/langchain/run.py +110 -0
  51. agentevals_cli-0.5.2/examples/zero-code-examples/strands/requirements.txt +5 -0
  52. agentevals_cli-0.5.2/examples/zero-code-examples/strands/run.py +72 -0
  53. agentevals_cli-0.5.2/flake.lock +154 -0
  54. agentevals_cli-0.5.2/flake.nix +96 -0
  55. agentevals_cli-0.5.2/packages/evaluator-sdk-py/README.md +45 -0
  56. agentevals_cli-0.5.2/packages/evaluator-sdk-py/pyproject.toml +32 -0
  57. agentevals_cli-0.5.2/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py +43 -0
  58. agentevals_cli-0.5.2/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py +77 -0
  59. agentevals_cli-0.5.2/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py +84 -0
  60. agentevals_cli-0.5.2/pyproject.toml +86 -0
  61. agentevals_cli-0.5.2/samples/eval_set_helm.json +51 -0
  62. agentevals_cli-0.5.2/samples/evalset_helm_3_2026-02-23.json +59 -0
  63. agentevals_cli-0.5.2/samples/evalset_k8s_2026-02-20.json +36 -0
  64. agentevals_cli-0.5.2/samples/helm.json +5333 -0
  65. agentevals_cli-0.5.2/samples/helm_2.json +3708 -0
  66. agentevals_cli-0.5.2/samples/helm_3.json +7182 -0
  67. agentevals_cli-0.5.2/samples/k8s.json +3908 -0
  68. agentevals_cli-0.5.2/src/agentevals/__init__.py +16 -0
  69. agentevals_cli-0.5.2/src/agentevals/_protocol.py +83 -0
  70. agentevals_cli-0.5.2/src/agentevals/api/__init__.py +0 -0
  71. agentevals_cli-0.5.2/src/agentevals/api/app.py +137 -0
  72. agentevals_cli-0.5.2/src/agentevals/api/debug_routes.py +268 -0
  73. agentevals_cli-0.5.2/src/agentevals/api/models.py +204 -0
  74. agentevals_cli-0.5.2/src/agentevals/api/otlp_app.py +25 -0
  75. agentevals_cli-0.5.2/src/agentevals/api/otlp_routes.py +383 -0
  76. agentevals_cli-0.5.2/src/agentevals/api/routes.py +554 -0
  77. agentevals_cli-0.5.2/src/agentevals/api/streaming_routes.py +373 -0
  78. agentevals_cli-0.5.2/src/agentevals/builtin_metrics.py +234 -0
  79. agentevals_cli-0.5.2/src/agentevals/cli.py +643 -0
  80. agentevals_cli-0.5.2/src/agentevals/config.py +108 -0
  81. agentevals_cli-0.5.2/src/agentevals/converter.py +328 -0
  82. agentevals_cli-0.5.2/src/agentevals/custom_evaluators.py +468 -0
  83. agentevals_cli-0.5.2/src/agentevals/eval_config_loader.py +147 -0
  84. agentevals_cli-0.5.2/src/agentevals/evaluator/__init__.py +24 -0
  85. agentevals_cli-0.5.2/src/agentevals/evaluator/resolver.py +70 -0
  86. agentevals_cli-0.5.2/src/agentevals/evaluator/sources.py +293 -0
  87. agentevals_cli-0.5.2/src/agentevals/evaluator/templates.py +224 -0
  88. agentevals_cli-0.5.2/src/agentevals/extraction.py +444 -0
  89. agentevals_cli-0.5.2/src/agentevals/genai_converter.py +538 -0
  90. agentevals_cli-0.5.2/src/agentevals/loader/__init__.py +7 -0
  91. agentevals_cli-0.5.2/src/agentevals/loader/base.py +53 -0
  92. agentevals_cli-0.5.2/src/agentevals/loader/jaeger.py +112 -0
  93. agentevals_cli-0.5.2/src/agentevals/loader/otlp.py +193 -0
  94. agentevals_cli-0.5.2/src/agentevals/mcp_server.py +236 -0
  95. agentevals_cli-0.5.2/src/agentevals/output.py +204 -0
  96. agentevals_cli-0.5.2/src/agentevals/runner.py +310 -0
  97. agentevals_cli-0.5.2/src/agentevals/sdk.py +433 -0
  98. agentevals_cli-0.5.2/src/agentevals/streaming/__init__.py +120 -0
  99. agentevals_cli-0.5.2/src/agentevals/streaming/incremental_processor.py +337 -0
  100. agentevals_cli-0.5.2/src/agentevals/streaming/processor.py +285 -0
  101. agentevals_cli-0.5.2/src/agentevals/streaming/session.py +36 -0
  102. agentevals_cli-0.5.2/src/agentevals/streaming/ws_server.py +806 -0
  103. agentevals_cli-0.5.2/src/agentevals/trace_attrs.py +32 -0
  104. agentevals_cli-0.5.2/src/agentevals/trace_metrics.py +126 -0
  105. agentevals_cli-0.5.2/src/agentevals/utils/__init__.py +0 -0
  106. agentevals_cli-0.5.2/src/agentevals/utils/genai_messages.py +142 -0
  107. agentevals_cli-0.5.2/src/agentevals/utils/log_buffer.py +43 -0
  108. agentevals_cli-0.5.2/src/agentevals/utils/log_enrichment.py +187 -0
  109. agentevals_cli-0.5.2/tests/integration/__init__.py +0 -0
  110. agentevals_cli-0.5.2/tests/integration/conftest.py +341 -0
  111. agentevals_cli-0.5.2/tests/integration/test_evaluation_pipeline.py +131 -0
  112. agentevals_cli-0.5.2/tests/integration/test_live_agents.py +348 -0
  113. agentevals_cli-0.5.2/tests/integration/test_session_grouping.py +509 -0
  114. agentevals_cli-0.5.2/tests/integration/test_timing_stress.py +346 -0
  115. agentevals_cli-0.5.2/tests/test_api.py +1033 -0
  116. agentevals_cli-0.5.2/tests/test_converter.py +403 -0
  117. agentevals_cli-0.5.2/tests/test_extraction.py +513 -0
  118. agentevals_cli-0.5.2/tests/test_genai_converter.py +992 -0
  119. agentevals_cli-0.5.2/tests/test_jaeger_loader.py +146 -0
  120. agentevals_cli-0.5.2/tests/test_log_enrichment.py +197 -0
  121. agentevals_cli-0.5.2/tests/test_otlp_loader.py +210 -0
  122. agentevals_cli-0.5.2/tests/test_otlp_receiver.py +1659 -0
  123. agentevals_cli-0.5.2/tests/test_protocol.py +25 -0
  124. agentevals_cli-0.5.2/tests/test_runner.py +166 -0
  125. agentevals_cli-0.5.2/tests/test_sdk.py +472 -0
  126. agentevals_cli-0.5.2/ui/.gitignore +24 -0
  127. agentevals_cli-0.5.2/ui/README.md +34 -0
  128. agentevals_cli-0.5.2/ui/eslint.config.js +23 -0
  129. agentevals_cli-0.5.2/ui/index.html +13 -0
  130. agentevals_cli-0.5.2/ui/package-lock.json +4691 -0
  131. agentevals_cli-0.5.2/ui/package.json +38 -0
  132. agentevals_cli-0.5.2/ui/public/logo.svg +13 -0
  133. agentevals_cli-0.5.2/ui/public/vite.svg +1 -0
  134. agentevals_cli-0.5.2/ui/src/App.css +42 -0
  135. agentevals_cli-0.5.2/ui/src/App.tsx +45 -0
  136. agentevals_cli-0.5.2/ui/src/api/client.ts +267 -0
  137. agentevals_cli-0.5.2/ui/src/assets/react.svg +1 -0
  138. agentevals_cli-0.5.2/ui/src/components/annotation-queue/AnnotationDetailPanel.tsx +331 -0
  139. agentevals_cli-0.5.2/ui/src/components/annotation-queue/AnnotationQueueView.tsx +332 -0
  140. agentevals_cli-0.5.2/ui/src/components/annotation-queue/AnnotationTable.tsx +360 -0
  141. agentevals_cli-0.5.2/ui/src/components/bug-report/BugReportModal.tsx +367 -0
  142. agentevals_cli-0.5.2/ui/src/components/builder/BuilderHeader.tsx +94 -0
  143. agentevals_cli-0.5.2/ui/src/components/builder/BuilderView.tsx +98 -0
  144. agentevals_cli-0.5.2/ui/src/components/builder/EvalCaseCard.tsx +153 -0
  145. agentevals_cli-0.5.2/ui/src/components/builder/EvalCasesList.tsx +91 -0
  146. agentevals_cli-0.5.2/ui/src/components/builder/InvocationEditor.tsx +123 -0
  147. agentevals_cli-0.5.2/ui/src/components/builder/JsonPreview.tsx +77 -0
  148. agentevals_cli-0.5.2/ui/src/components/builder/MetadataEditor.tsx +79 -0
  149. agentevals_cli-0.5.2/ui/src/components/builder/TraceUploadZone.tsx +194 -0
  150. agentevals_cli-0.5.2/ui/src/components/builder/index.ts +7 -0
  151. agentevals_cli-0.5.2/ui/src/components/dashboard/DashboardView.tsx +213 -0
  152. agentevals_cli-0.5.2/ui/src/components/dashboard/MetricScoreCard.tsx +149 -0
  153. agentevals_cli-0.5.2/ui/src/components/dashboard/PerformanceCard.tsx +88 -0
  154. agentevals_cli-0.5.2/ui/src/components/dashboard/PerformanceCharts.tsx +254 -0
  155. agentevals_cli-0.5.2/ui/src/components/dashboard/SummaryStats.tsx +149 -0
  156. agentevals_cli-0.5.2/ui/src/components/dashboard/TraceCard.tsx +278 -0
  157. agentevals_cli-0.5.2/ui/src/components/dashboard/TraceTable.tsx +552 -0
  158. agentevals_cli-0.5.2/ui/src/components/inspector/ComparisonPanel.tsx +180 -0
  159. agentevals_cli-0.5.2/ui/src/components/inspector/DataSection.tsx +125 -0
  160. agentevals_cli-0.5.2/ui/src/components/inspector/InspectorHeader.tsx +309 -0
  161. agentevals_cli-0.5.2/ui/src/components/inspector/InspectorLayout.tsx +139 -0
  162. agentevals_cli-0.5.2/ui/src/components/inspector/InspectorView.tsx +337 -0
  163. agentevals_cli-0.5.2/ui/src/components/inspector/InvocationCard.tsx +180 -0
  164. agentevals_cli-0.5.2/ui/src/components/inspector/InvocationSummaryPanel.tsx +282 -0
  165. agentevals_cli-0.5.2/ui/src/components/inspector/MetricResultsSection.tsx +286 -0
  166. agentevals_cli-0.5.2/ui/src/components/inspector/MetricsComparisonSection.tsx +645 -0
  167. agentevals_cli-0.5.2/ui/src/components/inspector/PerformanceSection.tsx +94 -0
  168. agentevals_cli-0.5.2/ui/src/components/inspector/ToolCallList.tsx +229 -0
  169. agentevals_cli-0.5.2/ui/src/components/inspector/TrajectoryComparisonDetails.tsx +222 -0
  170. agentevals_cli-0.5.2/ui/src/components/sidebar/Sidebar.tsx +226 -0
  171. agentevals_cli-0.5.2/ui/src/components/streaming/LiveConversationPanel.tsx +123 -0
  172. agentevals_cli-0.5.2/ui/src/components/streaming/LiveMessage.tsx +146 -0
  173. agentevals_cli-0.5.2/ui/src/components/streaming/LiveStreamingView.tsx +823 -0
  174. agentevals_cli-0.5.2/ui/src/components/streaming/SessionCard.tsx +573 -0
  175. agentevals_cli-0.5.2/ui/src/components/streaming/SessionMetadata.tsx +78 -0
  176. agentevals_cli-0.5.2/ui/src/components/upload/EvalSetEditorDrawer.tsx +308 -0
  177. agentevals_cli-0.5.2/ui/src/components/upload/FileDropZone.tsx +88 -0
  178. agentevals_cli-0.5.2/ui/src/components/upload/MetricSelector.tsx +228 -0
  179. agentevals_cli-0.5.2/ui/src/components/upload/RawJsonPreview.tsx +91 -0
  180. agentevals_cli-0.5.2/ui/src/components/upload/TraceEditorDrawer.tsx +357 -0
  181. agentevals_cli-0.5.2/ui/src/components/upload/UploadView.tsx +567 -0
  182. agentevals_cli-0.5.2/ui/src/components/welcome/WelcomeView.tsx +152 -0
  183. agentevals_cli-0.5.2/ui/src/config.ts +28 -0
  184. agentevals_cli-0.5.2/ui/src/context/TraceContext.tsx +90 -0
  185. agentevals_cli-0.5.2/ui/src/context/TraceProvider.tsx +379 -0
  186. agentevals_cli-0.5.2/ui/src/index.css +256 -0
  187. agentevals_cli-0.5.2/ui/src/lib/console-capture.ts +51 -0
  188. agentevals_cli-0.5.2/ui/src/lib/evalset-builder.ts +131 -0
  189. agentevals_cli-0.5.2/ui/src/lib/network-capture.ts +62 -0
  190. agentevals_cli-0.5.2/ui/src/lib/trace-converter.ts +734 -0
  191. agentevals_cli-0.5.2/ui/src/lib/trace-loader.ts +249 -0
  192. agentevals_cli-0.5.2/ui/src/lib/trace-metadata.ts +391 -0
  193. agentevals_cli-0.5.2/ui/src/lib/trace-patcher.ts +328 -0
  194. agentevals_cli-0.5.2/ui/src/lib/types.ts +421 -0
  195. agentevals_cli-0.5.2/ui/src/lib/utils.ts +223 -0
  196. agentevals_cli-0.5.2/ui/src/main.tsx +15 -0
  197. agentevals_cli-0.5.2/ui/tsconfig.app.json +29 -0
  198. agentevals_cli-0.5.2/ui/tsconfig.json +7 -0
  199. agentevals_cli-0.5.2/ui/tsconfig.node.json +26 -0
  200. agentevals_cli-0.5.2/ui/vite.config.ts +14 -0
  201. agentevals_cli-0.5.2/uv.lock +4098 -0
@@ -0,0 +1,83 @@
1
+ ---
2
+ name: eval
3
+ description: >
4
+ Evaluate and score agent behavior against a golden reference. Use this skill
5
+ whenever the user wants to run evaluation, check pass/fail status, understand
6
+ metric scores, compare sessions for regressions, validate agent behavior, or
7
+ score a trace from a file or a live session. Trigger on phrases like "eval this
8
+ trace", "check my agent output", "did my agent do the right thing", "compare runs",
9
+ "did my agent regress", "score session X", "evaluate against golden", "run evals".
10
+ Works with both local trace files and live streaming sessions.
11
+ ---
12
+
13
+ Evaluate agent behavior and explain what the scores mean.
14
+
15
+ ## Determine the input type
16
+
17
+ First, figure out what to evaluate:
18
+ - **Trace file(s)** — user mentions a `.json` or `.jsonl` file path → use `evaluate_traces`
19
+ - **Sessions vs golden** — user has multiple live sessions and wants regression testing → use `evaluate_sessions`
20
+ - **Single live session** — user wants to score one session against a golden eval set → guide them to use `evaluate_sessions` with a separate reference session designated as the golden
21
+
22
+ ## Evaluating trace files
23
+
24
+ 1. Get the file path(s). Check the extension:
25
+ `.jsonl` → `trace_format: "otlp-json"` | `.json` → `"jaeger-json"` (default)
26
+
27
+ 2. Ask if they have a golden eval set JSON. For `tool_trajectory_avg_score` (the
28
+ default metric), an eval set is required — it provides the expected tool call
29
+ sequence to compare against. If they don't have one yet, explain this and suggest
30
+ starting with `hallucinations_v1`, or ask if they want to create a golden set from
31
+ a reference run first.
32
+
33
+ 3. Call `evaluate_traces` with the file(s), format, and eval set.
34
+
35
+ 4. Present results as a score table (see Score interpretation below) and explain failures.
36
+
37
+ ## Evaluating sessions (regression testing)
38
+
39
+ This workflow requires the server to be running with the `--dev` flag (which enables
40
+ WebSocket and session streaming). Plain `agentevals serve` will not have sessions.
41
+ If you get a connection error from any tool below, tell the user:
42
+
43
+ ```bash
44
+ uv run agentevals serve --dev
45
+ ```
46
+
47
+ 1. Call `list_sessions` to show available sessions.
48
+
49
+ 2. Help the user identify the "golden" session — the reference run that represents
50
+ correct behavior. The server derives the eval set from it automatically.
51
+
52
+ 3. Call `evaluate_sessions(golden_session_id=...)`. This scores all other completed
53
+ sessions against the golden.
54
+
55
+ 4. Present a comparison table:
56
+ ```
57
+ Session | Score | Status | Delta
58
+ session-abc (golden)| 1.00 | — | baseline
59
+ session-def | 0.85 | PASSED | -0.15
60
+ session-ghi | 0.40 | FAILED | -0.60 ⚠️
61
+ ```
62
+
63
+ 5. Explain regressions specifically: which tools the golden called that a failing
64
+ session skipped, or unexpected extra calls. Concrete tool names are more useful
65
+ than just quoting the score.
66
+
67
+ ## Score interpretation
68
+
69
+ | Score | Meaning |
70
+ |-------|---------|
71
+ | 1.0 | Exact match — right tools, right order |
72
+ | 0.7–0.9 | Minor deviations (extra call or slightly different args) |
73
+ | 0.5–0.7 | Partial match — some turns correct, others missing or wrong tool calls |
74
+ | 0.0–0.5 | Major divergence — most tool calls don't match golden |
75
+
76
+ **Important:** `evalStatus: PASSED` does **not** mean the agent did well — it only means
77
+ the score met the configured threshold. Without a configured threshold, every session
78
+ shows PASSED regardless of score. Focus on the numeric score, not the status label.
79
+
80
+ ## After results
81
+
82
+ If the user wants to understand *what the agent did* step by step (not just the score),
83
+ suggest `/inspect` to get a readable narrative of a session.
@@ -0,0 +1,46 @@
1
+ {
2
+ "skill_name": "eval",
3
+ "evals": [
4
+ {
5
+ "id": 1,
6
+ "prompt": "I ran the dice agent twice and both sessions are on the agentevals server. Session IDs: dice-agent-gemini-2.5-flash-lite-20260306-160809-03676 and dice-agent-gemini-2.5-flash-lite-20260306-160828-64013. Did the second run regress compared to the first?",
7
+ "expected_output": "Lists sessions, uses first as golden, calls evaluate_sessions, presents comparison table with scores, explains which tool call changed",
8
+ "assertions": [
9
+ {"text": "Response attempts to call list_sessions or evaluate_sessions MCP tool (or explains it would do so if the server were accessible)"},
10
+ {"text": "Response identifies a 'golden' session concept — the reference run to compare against"},
11
+ {"text": "Response presents or describes a comparison table with scores and delta for both sessions"},
12
+ {"text": "Response mentions /inspect or suggests inspecting the session to understand what changed"}
13
+ ]
14
+ },
15
+ {
16
+ "id": 2,
17
+ "prompt": "Evaluate my agent trace at /tmp/run_42.jsonl against the golden set /tmp/golden.json",
18
+ "expected_output": "Detects otlp-json format from .jsonl extension, attempts to run eval, explains scores or handles missing file",
19
+ "assertions": [
20
+ {"text": "Response identifies the .jsonl extension as OTLP format (otlp-json), not jaeger-json"},
21
+ {"text": "Response handles the missing file gracefully (does not silently fail or crash without explanation)"},
22
+ {"text": "Response explains the format difference between .jsonl (OTLP) and .json (Jaeger) or at minimum names the correct format for each"}
23
+ ]
24
+ },
25
+ {
26
+ "id": 3,
27
+ "prompt": "I want to check if my dice agent behaved correctly in the session dice-agent-gemini-2.5-flash-lite-20260306-160828-64013. I think something went wrong in the last turn. Score it for me.",
28
+ "expected_output": "Identifies this as a session, calls evaluate_sessions using the other session as golden, shows score, explains the regression in the final turn",
29
+ "assertions": [
30
+ {"text": "Response attempts to call list_sessions or evaluate_sessions (or explains the workflow if tools unavailable)"},
31
+ {"text": "Response explains that a golden/reference session is needed to compare against"},
32
+ {"text": "Response describes what a regression in the last turn would look like (missing tool call, wrong tool, or score below 1.0)"}
33
+ ]
34
+ },
35
+ {
36
+ "id": 4,
37
+ "prompt": "Run eval on the dice agent session dice-agent-gemini-2.5-flash-lite-20260306-160809-03676 but I don't have an eval set yet",
38
+ "expected_output": "Explains that for session-based eval the server auto-derives the eval set from a golden session, suggests the other dice session as golden, or offers metric alternatives",
39
+ "assertions": [
40
+ {"text": "Response explains that tool_trajectory_avg_score requires a reference (eval set or golden session)"},
41
+ {"text": "Response suggests at least one path forward: use another session as golden, use hallucinations_v1, or create an eval set"},
42
+ {"text": "Response gives the user a concrete next step rather than just describing the problem"}
43
+ ]
44
+ }
45
+ ]
46
+ }
@@ -0,0 +1,49 @@
1
+ ---
2
+ name: inspect
3
+ description: >
4
+ Inspect and debug live streaming agent sessions to understand what the agent did.
5
+ Use this skill when the user wants to see what happened in a recent agent run,
6
+ trace through the tool calls step by step, debug unexpected agent behavior, or
7
+ get a readable narrative of a session. Trigger on phrases like "show me what my
8
+ agent did", "inspect session", "what happened in that run", "debug my agent",
9
+ "trace through session", "walk me through the last run", "what tools did it call".
10
+ Requires `agentevals serve --dev` to be running. For scoring/pass-fail evaluation,
11
+ use /eval instead.
12
+ ---
13
+
14
+ Help the user understand what happened in a live streaming agent session.
15
+
16
+ If `list_sessions` or `summarize_session` fail with a connection error, the server
17
+ is not running. Tell the user:
18
+
19
+ ```bash
20
+ uv run agentevals serve --dev
21
+ ```
22
+
23
+ The `--dev` flag is required — plain `serve` does not enable the WebSocket endpoint
24
+ that sessions stream to. Also note: sessions are in-memory only with a **2-hour TTL**.
25
+ If the server was restarted or the session is older than 2 hours, it is gone.
26
+
27
+ 1. **List sessions** with `list_sessions`. Show session ID, completion status,
28
+ span count, and start time. Default to the most recent completed session
29
+ unless the user specifies one.
30
+
31
+ 2. **Summarize** with `summarize_session`. This converts raw OTLP spans into
32
+ structured invocations with user messages, tool calls, and agent responses.
33
+
34
+ 3. **Present as a readable narrative.** For each turn:
35
+ ```
36
+ Turn 1:
37
+ User: [what the user asked]
38
+ Tools: tool_name(arg=val, ...) → [one-line description of what this achieves]
39
+ Response: [response text, truncated if long]
40
+ ```
41
+
42
+ 4. **Flag anything worth investigating:**
43
+ - Turns with no tool calls when tools would be expected
44
+ - Empty or very short response after a long tool chain
45
+ - Same tool called repeatedly in one turn (possible loop)
46
+ - Abrupt stop mid-conversation
47
+
48
+ 5. **Offer next steps.** If the user wants to *score* the session against a golden
49
+ reference (pass/fail), suggest `/eval` with the session ID or a comparison run.
@@ -0,0 +1,25 @@
1
+ {
2
+ "skill_name": "inspect",
3
+ "evals": [
4
+ {
5
+ "id": 1,
6
+ "prompt": "I just ran the dice agent and it streamed to the agentevals server. Show me what it did.",
7
+ "expected_output": "Lists sessions, picks most recent dice session, summarizes as readable turn-by-turn narrative showing roll_die and check_prime tool calls",
8
+ "assertions": [
9
+ {"text": "Response attempts to call list_sessions MCP tool (does not skip straight to explaining the error)"},
10
+ {"text": "Response provides the exact command to start the server (agentevals serve --dev) if connection fails"},
11
+ {"text": "Response describes what the session summary would include (tool calls, user messages, narrative format)"}
12
+ ]
13
+ },
14
+ {
15
+ "id": 2,
16
+ "prompt": "Walk me through session dice-agent-gemini-2.5-flash-lite-20260306-160809-03676 — what tools did it call and did anything look off?",
17
+ "expected_output": "Calls summarize_session with that exact ID, presents turn-by-turn narrative with roll_die(sides=20) and check_prime(nums=[8]) tool calls, notes the empty response in turn 2",
18
+ "assertions": [
19
+ {"text": "Response attempts to call summarize_session with session ID dice-agent-gemini-2.5-flash-lite-20260306-160809-03676"},
20
+ {"text": "Response presents or describes a turn-by-turn narrative format (not just a list of tool names)"},
21
+ {"text": "Response mentions or would flag the empty response after roll_die as potentially worth noting"}
22
+ ]
23
+ }
24
+ ]
25
+ }
@@ -0,0 +1,114 @@
1
+ name: Bug Report
2
+ description: Report a bug or unexpected behavior in agentevals
3
+ labels: ["bug"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thank you for taking the time to report a bug. Please fill out the sections below so we can reproduce and fix the issue.
9
+
10
+ - type: textarea
11
+ id: description
12
+ attributes:
13
+ label: Description
14
+ description: What happened? Describe the bug clearly.
15
+ placeholder: When I run X, Y happens instead of Z...
16
+ validations:
17
+ required: true
18
+
19
+ - type: textarea
20
+ id: expected
21
+ attributes:
22
+ label: Expected behavior
23
+ description: What did you expect to happen?
24
+ validations:
25
+ required: true
26
+
27
+ - type: textarea
28
+ id: steps
29
+ attributes:
30
+ label: Steps to reproduce
31
+ description: Minimal steps to reproduce the issue.
32
+ placeholder: |
33
+ 1. Run `agentevals serve --dev`
34
+ 2. Connect an agent via WebSocket
35
+ 3. ...
36
+ validations:
37
+ required: true
38
+
39
+ - type: dropdown
40
+ id: usage-mode
41
+ attributes:
42
+ label: How are you using agentevals?
43
+ options:
44
+ - Live streaming (WebSocket / OTLP)
45
+ - CLI file evaluation (`agentevals run`)
46
+ - MCP server
47
+ - Other
48
+ validations:
49
+ required: true
50
+
51
+ - type: upload
52
+ id: config-dump
53
+ attributes:
54
+ label: Config dump (ZIP from the web UI)
55
+ description: |
56
+ If you are using the live web UI, click the **Bug Report** button in the sidebar (the bug icon at the bottom left).
57
+ This downloads a ZIP file containing your environment info, session data, and logs.
58
+ Drag and drop the ZIP file here.
59
+
60
+ - type: textarea
61
+ id: version-info
62
+ attributes:
63
+ label: Version information
64
+ description: |
65
+ If you are not using the web UI (or cannot generate the ZIP above), provide at minimum
66
+ the output of `agentevals --version` and `python --version`, your OS name and version, and the files mentioned below.
67
+ placeholder: |
68
+ agentevals version: ...
69
+ Python version: ...
70
+ OS: ...
71
+ render: text
72
+ validations:
73
+ required: true
74
+
75
+ - type: textarea
76
+ id: eval-config
77
+ attributes:
78
+ label: Eval config (if applicable)
79
+ description: |
80
+ If you use an `eval_config.yaml`, paste its contents here.
81
+ Please redact any secrets or API keys.
82
+ render: yaml
83
+
84
+ - type: upload
85
+ id: trace-files
86
+ attributes:
87
+ label: Trace and eval set files
88
+ description: |
89
+ If you are using the CLI (`agentevals run`) or file upload method rather than live streaming,
90
+ please attach your trace file(s) and eval set file(s) so we can reproduce the issue.
91
+ Supported formats: Jaeger JSON, OTLP JSON/JSONL, ADK EvalSet JSON.
92
+
93
+ - type: textarea
94
+ id: logs
95
+ attributes:
96
+ label: Relevant logs or error output
97
+ description: |
98
+ Paste any error messages, stack traces, or log output.
99
+ Run with `-vv` for verbose output (e.g. `agentevals -vv run ...` or `agentevals serve --dev -vv`).
100
+ render: text
101
+
102
+ - type: textarea
103
+ id: additional
104
+ attributes:
105
+ label: Additional context
106
+ description: Anything else that might help (screenshots, related issues, workarounds you tried, etc.).
107
+
108
+ - type: checkboxes
109
+ id: human-check
110
+ attributes:
111
+ label: Human confirmation
112
+ options:
113
+ - label: I am a human (not a bot, agent, or AI) filing this issue.
114
+ required: true
@@ -0,0 +1 @@
1
+ blank_issues_enabled: false
@@ -0,0 +1,39 @@
1
+ name: Feature Request
2
+ description: Suggest a new feature or improvement
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: textarea
6
+ id: problem
7
+ attributes:
8
+ label: Problem or motivation
9
+ description: What problem does this feature solve? Why is it needed?
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: solution
15
+ attributes:
16
+ label: Proposed solution
17
+ description: Describe how you would like this to work.
18
+ validations:
19
+ required: true
20
+
21
+ - type: textarea
22
+ id: alternatives
23
+ attributes:
24
+ label: Alternatives considered
25
+ description: Any alternative solutions or workarounds you have considered.
26
+
27
+ - type: textarea
28
+ id: additional
29
+ attributes:
30
+ label: Additional context
31
+ description: Anything else (screenshots, links, examples, etc.).
32
+
33
+ - type: checkboxes
34
+ id: human-check
35
+ attributes:
36
+ label: Human confirmation
37
+ options:
38
+ - label: I am a human (not a bot, agent, or AI) filing this issue.
39
+ required: true
@@ -0,0 +1,56 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+ push:
7
+ branches: [main]
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ lint:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v6
17
+
18
+ - uses: astral-sh/setup-uv@v7
19
+ with:
20
+ enable-cache: true
21
+
22
+ - name: Install dependencies
23
+ run: |
24
+ uv venv
25
+ uv pip install setuptools
26
+ uv sync --dev
27
+
28
+ - name: Ruff check
29
+ run: uv run ruff check .
30
+
31
+ - name: Ruff format check
32
+ run: uv run ruff format --check .
33
+
34
+ test:
35
+ runs-on: ubuntu-latest
36
+ strategy:
37
+ matrix:
38
+ python-version: ["3.11", "3.12", "3.13"]
39
+ steps:
40
+ - uses: actions/checkout@v6
41
+
42
+ - uses: astral-sh/setup-uv@v7
43
+ with:
44
+ enable-cache: true
45
+
46
+ - name: Install Python ${{ matrix.python-version }}
47
+ run: uv python install ${{ matrix.python-version }}
48
+
49
+ - name: Install dependencies
50
+ run: |
51
+ uv venv --python ${{ matrix.python-version }}
52
+ uv pip install setuptools
53
+ uv sync --dev --python ${{ matrix.python-version }}
54
+
55
+ - name: Run tests
56
+ run: uv run pytest -m "not integration and not e2e" --tb=short -q
@@ -0,0 +1,44 @@
1
+ name: Publish evaluator SDK
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'evaluator-sdk-v*'
7
+ workflow_dispatch:
8
+ inputs:
9
+ tag:
10
+ description: 'Release tag (e.g. evaluator-sdk-v0.1.0)'
11
+ required: true
12
+
13
+ permissions:
14
+ contents: read
15
+
16
+ jobs:
17
+ publish:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - name: 'Checkout GitHub Action'
21
+ uses: actions/checkout@main
22
+
23
+ - name: Install uv
24
+ uses: astral-sh/setup-uv@v6
25
+
26
+ # Run from the repo root: `uv build` writes artifacts to ./dist, and `uv publish dist/*` resolves that path relative to the current working directory.
27
+ - name: 'Release Python Packages'
28
+ env:
29
+ TAG_OR_VERSION: ${{ github.event.inputs.tag || github.ref_name }}
30
+ run: |
31
+ REF="${TAG_OR_VERSION}"
32
+ case "$REF" in
33
+ evaluator-sdk-v*) VERSION="${REF#evaluator-sdk-v}" ;;
34
+ *) VERSION="$REF" ;;
35
+ esac
36
+ if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+'; then
37
+ echo "Expected PEP 440 version (e.g. 0.1.0) or tag evaluator-sdk-v0.1.0; got: $REF"
38
+ exit 1
39
+ fi
40
+ uv sync --package agentevals-evaluator-sdk --all-extras
41
+ uv version "$VERSION" --package agentevals-evaluator-sdk
42
+
43
+ uv build --package agentevals-evaluator-sdk
44
+ uv publish dist/* --token ${{ secrets.PYPI_TOKEN }}
@@ -0,0 +1,79 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*.*.*'
7
+ workflow_dispatch:
8
+ inputs:
9
+ tag:
10
+ description: 'Release tag (e.g. v0.1.0)'
11
+ required: true
12
+
13
+ permissions:
14
+ contents: read
15
+
16
+ jobs:
17
+ build:
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v6
21
+
22
+ - uses: astral-sh/setup-uv@v7
23
+ with:
24
+ enable-cache: true
25
+
26
+ - uses: actions/setup-node@v6
27
+ with:
28
+ node-version: '22'
29
+ cache: npm
30
+ cache-dependency-path: ui/package-lock.json
31
+
32
+ - name: Build core and bundled wheels
33
+ run: make release
34
+
35
+ - uses: actions/upload-artifact@v7
36
+ with:
37
+ name: wheels
38
+ path: |
39
+ dist/core/*.whl
40
+ dist/bundle/*.whl
41
+
42
+ github-release:
43
+ needs: build
44
+ runs-on: ubuntu-latest
45
+ permissions:
46
+ contents: write
47
+
48
+ steps:
49
+ - uses: actions/download-artifact@v8
50
+ with:
51
+ name: wheels
52
+ path: dist/
53
+
54
+ - uses: softprops/action-gh-release@v2.5.0
55
+ with:
56
+ tag_name: ${{ github.event.inputs.tag || github.ref_name }}
57
+ files: dist/**/*.whl
58
+ generate_release_notes: true
59
+
60
+ publish:
61
+ needs: build
62
+ runs-on: ubuntu-latest
63
+ steps:
64
+ - name: 'Checkout GitHub Action'
65
+ uses: actions/checkout@main
66
+
67
+ - name: Install uv
68
+ uses: astral-sh/setup-uv@v6
69
+
70
+ # Run from the repo root: `uv build` writes artifacts to ./dist, and `uv publish dist/*` resolves that path relative to the current working directory.
71
+ - name: 'Release Python Packages'
72
+ env:
73
+ VERSION: ${{ github.event.inputs.tag || github.ref_name }}
74
+ run: |
75
+ uv sync --package agentevals-cli --all-extras
76
+ uv version "$VERSION" --package agentevals-cli
77
+
78
+ uv build --package agentevals-cli
79
+ uv publish dist/* --token ${{ secrets.PYPI_TOKEN }}
@@ -0,0 +1,57 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib64/
14
+ parts/
15
+ sdist/
16
+ var/
17
+ wheels/
18
+ *.egg-info/
19
+ .installed.cfg
20
+ *.egg
21
+ .pytest_cache/
22
+ .venv/
23
+ venv/
24
+ ENV/
25
+ env/
26
+
27
+ # UI (Node.js/React)
28
+ ui/node_modules/
29
+ ui/dist/
30
+ ui/dist-ssr/
31
+ ui/*.local
32
+
33
+ # Bundled UI assets (generated by make build-bundle)
34
+ src/agentevals/_static/
35
+
36
+ # Logs
37
+ *.log
38
+ npm-debug.log*
39
+ yarn-debug.log*
40
+ yarn-error.log*
41
+ pnpm-debug.log*
42
+ lerna-debug.log*
43
+
44
+ # Editor directories and files
45
+ .vscode/
46
+ !.vscode/extensions.json
47
+ .idea/
48
+ .DS_Store
49
+ *.suo
50
+ *.ntvs*
51
+ *.njsproj
52
+ *.sln
53
+ *.sw?
54
+
55
+ # Misc
56
+ .env
57
+ .env.local
@@ -0,0 +1,8 @@
1
+ {
2
+ "mcpServers": {
3
+ "agentevals": {
4
+ "command": "/usr/bin/env",
5
+ "args": ["uv", "run", "agentevals", "mcp"]
6
+ }
7
+ }
8
+ }