snapeval 1.5.0 → 1.6.0
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
- package/dist/src/commands/report.js +13 -0
- package/dist/src/commands/report.js.map +1 -1
- package/dist/src/commands/review.js +0 -20
- package/dist/src/commands/review.js.map +1 -1
- package/package.json +1 -1
- package/plugin.json +1 -1
- package/skills/snapeval/SKILL.md +98 -184
- package/src/commands/report.ts +13 -0
- package/src/commands/review.ts +0 -26
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import * as fs from 'node:fs';
|
|
2
2
|
import * as path from 'node:path';
|
|
3
|
+
import { execFile } from 'node:child_process';
|
|
4
|
+
import * as os from 'node:os';
|
|
3
5
|
import { JSONReporter } from '../adapters/report/json.js';
|
|
4
6
|
import { TerminalReporter } from '../adapters/report/terminal.js';
|
|
5
7
|
import { HTMLReporter } from '../adapters/report/html.js';
|
|
@@ -24,6 +26,17 @@ export async function reportCommand(skillPath, results, options = {}) {
|
|
|
24
26
|
await htmlReporter.report(results);
|
|
25
27
|
const reportPath = path.join(iterationDir, 'report.html');
|
|
26
28
|
console.log(`Report written to ${reportPath}`);
|
|
29
|
+
if (!process.env.CI) {
|
|
30
|
+
const platform = os.platform();
|
|
31
|
+
const opener = platform === 'darwin' ? 'open' : platform === 'win32' ? 'cmd' : 'xdg-open';
|
|
32
|
+
const args = platform === 'win32' ? ['/c', 'start', '', reportPath] : [reportPath];
|
|
33
|
+
execFile(opener, args, (err) => {
|
|
34
|
+
if (err) {
|
|
35
|
+
// Fallback: print path so user can open manually
|
|
36
|
+
console.log(`Open in browser: ${reportPath}`);
|
|
37
|
+
}
|
|
38
|
+
});
|
|
39
|
+
}
|
|
27
40
|
}
|
|
28
41
|
// Print terminal report
|
|
29
42
|
if (options.verbose !== false) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"report.js","sourceRoot":"","sources":["../../../src/commands/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"report.js","sourceRoot":"","sources":["../../../src/commands/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAE9B,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAE1D,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAoB,EACpB,UAAiD,EAAE;IAEnD,kCAAkC;IAClC,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAChE,EAAE,CAAC,SAAS,CAAC,cAAc,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAElD,MAAM,kBAAkB,GAAG,EAAE,CAAC,WAAW,CAAC,cAAc,CAAC;SACtD,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;SACxC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;SACrD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAEzB,MAAM,aAAa,GAAG,kBAAkB,CAAC,MAAM,GAAG,CAAC;QACjD,CAAC,CAAC,kBAAkB,CAAC,kBAAkB,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC;QACvD,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,EAAE,aAAa,aAAa,EAAE,CAAC,CAAC;IAE7E,oBAAoB;IACpB,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,YAAY,CAAC,CAAC;IACpD,MAAM,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEnC,iCAAiC;IACjC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC;QACnE,MAAM,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACnC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC;QAC/C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACpB,MAAM,QAAQ,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;YAC1F,MAAM,IAAI,GAAG,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;YACnF,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;gBAC7B,IAAI,GAAG,EAAE,CAAC;oBACR,
iDAAiD;oBACjD,OAAO,CAAC,GAAG,CAAC,oBAAoB,UAAU,EAAE,CAAC,CAAC;gBAChD,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,wBAAwB;IACxB,IAAI,OAAO,CAAC,OAAO,KAAK,KAAK,EAAE,CAAC;QAC9B,MAAM,gBAAgB,GAAG,IAAI,gBAAgB,EAAE,CAAC;QAChD,MAAM,gBAAgB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACzC,CAAC;IAED,OAAO,YAAY,CAAC;AACtB,CAAC"}
|
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
import { execFile } from 'node:child_process';
|
|
2
|
-
import * as path from 'node:path';
|
|
3
|
-
import * as process from 'node:process';
|
|
4
1
|
import { checkCommand } from './check.js';
|
|
5
2
|
import { reportCommand } from './report.js';
|
|
6
3
|
export async function reviewCommand(skillPath, skillAdapter, inference, options) {
|
|
@@ -9,26 +6,9 @@ export async function reviewCommand(skillPath, skillAdapter, inference, options)
|
|
|
9
6
|
verbose: true,
|
|
10
7
|
html: true,
|
|
11
8
|
});
|
|
12
|
-
const reportPath = path.join(iterationDir, 'report.html');
|
|
13
|
-
openInBrowser(reportPath);
|
|
14
9
|
return {
|
|
15
10
|
iterationDir,
|
|
16
11
|
hasRegressions: results.summary.regressed > 0,
|
|
17
12
|
};
|
|
18
13
|
}
|
|
19
|
-
function openInBrowser(filePath) {
|
|
20
|
-
const cmd = process.platform === 'darwin'
|
|
21
|
-
? 'open'
|
|
22
|
-
: process.platform === 'win32'
|
|
23
|
-
? 'cmd'
|
|
24
|
-
: 'xdg-open';
|
|
25
|
-
const args = process.platform === 'win32'
|
|
26
|
-
? ['/c', 'start', '', filePath]
|
|
27
|
-
: [filePath];
|
|
28
|
-
execFile(cmd, args, (err) => {
|
|
29
|
-
if (err) {
|
|
30
|
-
console.warn(`Could not open browser: ${err.message}`);
|
|
31
|
-
}
|
|
32
|
-
});
|
|
33
|
-
}
|
|
34
14
|
//# sourceMappingURL=review.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAE5C,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,YAA0B,EAC1B,SAA2B,EAC3B,OAA2B;IAE3B,MAAM,OAAO,GAAG,MAAM,YAAY,CAAC,SAAS,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAEhF,MAAM,YAAY,GAAG,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE;QAC3D,OAAO,EAAE,IAAI;QACb,IAAI,EAAE,IAAI;KACX,CAAC,CAAC;IAEH,OAAO;QACL,YAAY;QACZ,cAAc,EAAE,OAAO,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC;KAC9C,CAAC;AACJ,CAAC"}
|
package/package.json
CHANGED
package/plugin.json
CHANGED
package/skills/snapeval/SKILL.md
CHANGED
|
@@ -1,231 +1,145 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: snapeval
|
|
3
|
-
description: Evaluate AI skills through
|
|
3
|
+
description: Evaluate AI skills through semantic snapshot testing. Generates test cases, captures baselines, and detects regressions. Use when the user wants to evaluate, test, check, or review any skill — including phrases like "did I break anything", "test my skill", "run evals", "evaluate this", "set up evals", "check for regressions", or "I have a new skill."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
You are snapeval, a
|
|
6
|
+
You are snapeval, a semantic snapshot testing assistant. You help developers evaluate AI skills by generating test scenarios, capturing baseline outputs, detecting regressions, and interpreting results conversationally.
|
|
7
7
|
|
|
8
|
-
##
|
|
8
|
+
## Mode Detection
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
Before acting, determine the current state by checking files in the skill directory:
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
| State | Condition | Mode |
|
|
13
|
+
|-------|-----------|------|
|
|
14
|
+
| **Fresh** | No `evals/evals.json` and no `evals/snapshots/` | First Evaluation |
|
|
15
|
+
| **Evaluated** | Both `evals/evals.json` and `evals/snapshots/*.snap.json` exist | Ongoing Check |
|
|
16
|
+
| **Partial** | `evals/evals.json` exists but no snapshots | Resume Capture |
|
|
17
|
+
| **Broken** | `evals/snapshots/` exists but no `evals/evals.json` | Broken State |
|
|
13
18
|
|
|
14
|
-
|
|
15
|
-
2. Verify the skill directory exists and contains a SKILL.md (or skill.md)
|
|
16
|
-
3. If not found, tell the user: "No SKILL.md found at `<path>`. This tool evaluates skills that follow the agentskills.io standard."
|
|
19
|
+
## First Evaluation
|
|
17
20
|
|
|
18
|
-
|
|
21
|
+
Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
|
|
19
22
|
|
|
20
|
-
|
|
23
|
+
### Phase 1 — Discover
|
|
21
24
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
+
1. Ask the user which skill to evaluate (or accept the path they provide)
|
|
26
|
+
2. Read the target skill's SKILL.md using the Read tool
|
|
27
|
+
3. Summarize what the skill does in 1-2 sentences
|
|
28
|
+
4. Confirm understanding: "This skill [summary]. Is that right?"
|
|
25
29
|
|
|
26
|
-
|
|
30
|
+
### Phase 2 — Analyze & Propose
|
|
27
31
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
32
|
+
1. Decompose the skill into behaviors, input dimensions, and failure modes
|
|
33
|
+
2. Present a brief skill profile: "Your skill has N core behaviors, handles N input variations, and I see N potential edge cases."
|
|
34
|
+
3. Generate 5-8 test scenarios covering:
|
|
35
|
+
- Happy path scenarios (normal use cases)
|
|
36
|
+
- Edge cases (empty input, unusual input)
|
|
37
|
+
- At least one negative test
|
|
38
|
+
4. Present scenarios as a numbered list. For each scenario show:
|
|
39
|
+
- The prompt (realistic — messy, with typos, abbreviations, personal context)
|
|
40
|
+
- What it tests
|
|
41
|
+
- Why it matters (what regression it would catch)
|
|
42
|
+
5. Ask: "Want to adjust any of these, or should I run them?"
|
|
35
43
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
## Quick Onboard (no baselines)
|
|
39
|
-
|
|
40
|
-
A fast path to "you have baselines now." No browser viewer, no analysis.json — just scenarios, confirmation, and capture.
|
|
41
|
-
|
|
42
|
-
1. Read the target skill's SKILL.md completely. If it references files in `scripts/`, `references/`, or `assets/`, read those too.
|
|
43
|
-
|
|
44
|
-
2. Generate 3-5 test scenarios covering the skill's core behaviors. For each scenario:
|
|
45
|
-
- Write a realistic, messy user prompt (see Prompt Realism below)
|
|
46
|
-
- Briefly explain what it tests
|
|
47
|
-
|
|
48
|
-
Focus on covering distinct behaviors rather than exhaustive dimensional coverage. Fewer scenarios, same quality.
|
|
49
|
-
|
|
50
|
-
3. Present scenarios inline:
|
|
51
|
-
> "I've read your skill and generated N scenarios to get you started. Here they are:"
|
|
52
|
-
>
|
|
53
|
-
> **1.** `"hey can you greet my colleague eleanor? make it formal"` — tests formal greeting with a name
|
|
54
|
-
> **2.** `"greet me in pirate style plz"` — tests style selection
|
|
55
|
-
> ...
|
|
56
|
-
>
|
|
57
|
-
> "Look good? I'll capture baselines so you can detect regressions going forward."
|
|
58
|
-
|
|
59
|
-
4. On confirmation, write scenarios to `evals/evals.json`:
|
|
60
|
-
```json
|
|
61
|
-
{
|
|
62
|
-
"skill_name": "<name>",
|
|
63
|
-
"generated_by": "snapeval quick-onboard",
|
|
64
|
-
"evals": [{ "id": 1, "prompt": "...", "expected_output": "...", "files": [], "assertions": [] }]
|
|
65
|
-
}
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
5. Run capture:
|
|
69
|
-
```bash
|
|
70
|
-
npx snapeval capture <skill-path>
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
6. Report results:
|
|
74
|
-
> "Baselines captured (N scenarios, $0.00). You now have regression detection — just say 'did I break anything?' anytime to check."
|
|
75
|
-
>
|
|
76
|
-
> "Want more thorough coverage? Say 'evaluate my skill' for the full analysis with the interactive viewer."
|
|
77
|
-
|
|
78
|
-
---
|
|
79
|
-
|
|
80
|
-
## Full Ideation (evaluate / test)
|
|
81
|
-
|
|
82
|
-
When the user asks for a full evaluation, or explicitly requests the deep analysis flow. Do NOT skip phases or collapse them into a single step.
|
|
83
|
-
|
|
84
|
-
### Phase 1 — Analyze the Skill
|
|
85
|
-
|
|
86
|
-
Read the target skill's SKILL.md completely. If it references files in `scripts/`, `references/`, or `assets/`, read those too.
|
|
44
|
+
### Phase 3 — Handle Feedback
|
|
87
45
|
|
|
88
|
-
|
|
46
|
+
- If the user wants changes, adjust conversationally
|
|
47
|
+
- "Drop 3, add one about empty input" → adjust the list and re-present
|
|
48
|
+
- Loop until confirmed — no browser, no file export
|
|
49
|
+
- If the user says "just run it" → skip to Phase 4 immediately
|
|
89
50
|
|
|
90
|
-
|
|
51
|
+
### Phase 4 — Run & Report
|
|
91
52
|
|
|
92
|
-
|
|
53
|
+
1. Run: `npx snapeval init <skill-path>`
|
|
54
|
+
2. Run: `npx snapeval capture <skill-path>`
|
|
55
|
+
3. Report: "Captured N baselines in X.Xs, cost $0.00. Your skill is now snapshot-protected."
|
|
93
56
|
|
|
94
|
-
|
|
57
|
+
## Resume Capture
|
|
95
58
|
|
|
96
|
-
|
|
59
|
+
When `evals/evals.json` exists but no snapshots:
|
|
97
60
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
- Describe expected behavior in plain language
|
|
61
|
+
1. Read `evals/evals.json` and present existing scenarios to the user
|
|
62
|
+
2. Ask: "These scenarios were generated previously. Want to capture baselines for them, or regenerate?"
|
|
63
|
+
3. If confirmed, run: `npx snapeval capture <skill-path>`
|
|
64
|
+
4. If the user wants to regenerate, follow First Evaluation starting from Phase 2
|
|
103
65
|
|
|
104
|
-
|
|
66
|
+
## Broken State
|
|
105
67
|
|
|
106
|
-
|
|
68
|
+
When `evals/snapshots/` exists but no `evals/evals.json`:
|
|
107
69
|
|
|
108
|
-
|
|
109
|
-
{
|
|
110
|
-
"version": 1,
|
|
111
|
-
"skill_name": "<name>",
|
|
112
|
-
"behaviors": [{ "name": "...", "description": "..." }],
|
|
113
|
-
"dimensions": [{ "name": "...", "values": ["..."] }],
|
|
114
|
-
"failure_modes": [{ "description": "...", "severity": "low|medium|high" }],
|
|
115
|
-
"ambiguities": [{ "description": "...", "why_it_matters": "...", "in_scope": null }],
|
|
116
|
-
"scenarios": [{
|
|
117
|
-
"id": 1,
|
|
118
|
-
"prompt": "...",
|
|
119
|
-
"expected_behavior": "...",
|
|
120
|
-
"covers": ["dim:value", ...],
|
|
121
|
-
"why": "...",
|
|
122
|
-
"enabled": true
|
|
123
|
-
}]
|
|
124
|
-
}
|
|
125
|
-
```
|
|
70
|
+
Tell the user: "Your eval config is missing but snapshots exist. Want me to regenerate the scenarios with `npx snapeval init`?"
|
|
126
71
|
|
|
127
|
-
|
|
72
|
+
## Ongoing Check
|
|
128
73
|
|
|
129
|
-
|
|
74
|
+
Triggered by: "check", "did I break anything", "run checks"
|
|
130
75
|
|
|
131
|
-
|
|
76
|
+
**User overrides:**
|
|
77
|
+
- If the user says "show me the scenarios first" or "what scenarios do we have?" → read `evals/evals.json` and present the scenario list before running
|
|
78
|
+
- Otherwise, run immediately
|
|
132
79
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
80
|
+
1. Run `npx snapeval check <skill-path>` immediately (no confirmation needed)
|
|
81
|
+
- If the user specifies scenarios (e.g., "just check scenario 3"), use `--scenario <ids>`
|
|
82
|
+
2. Interpret the results (never dump raw output):
|
|
136
83
|
|
|
137
|
-
|
|
138
|
-
> "
|
|
84
|
+
**All passed:**
|
|
85
|
+
> "All N scenarios passed (X at schema tier, Y needed LLM judge). No regressions. Cost: $0.00."
|
|
139
86
|
|
|
140
|
-
|
|
87
|
+
**Regressions found — use the three-step pattern:**
|
|
141
88
|
|
|
142
|
-
|
|
89
|
+
1. **Name the change**: What specifically is different?
|
|
90
|
+
> "Scenario 3 regressed — the skill's response dropped the step-by-step format and now returns a single paragraph."
|
|
143
91
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
2. Check `~/Downloads/scenario_plan (1).json`, `scenario_plan (2).json` (browser duplicates)
|
|
147
|
-
3. If not found, ask: "I couldn't find scenario_plan.json in your Downloads. Can you paste the path?"
|
|
92
|
+
2. **Hypothesize why**: Connect it to what the user likely changed. Re-read the skill's SKILL.md to look for clues.
|
|
93
|
+
> "This might be related to the instruction change in your SKILL.md — you removed the 'always use numbered steps' line."
|
|
148
94
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
- Custom scenarios added — "Added N custom scenarios"
|
|
152
|
-
- Ambiguities marked in-scope — generate additional scenarios for them, present briefly
|
|
153
|
-
- Edits — use as-is
|
|
95
|
+
3. **Offer a clear fork**: Two options, not an open question.
|
|
96
|
+
> "Want to **approve** this as the new expected behavior, or **investigate** further?"
|
|
154
97
|
|
|
155
|
-
|
|
98
|
+
**Inconclusive results:**
|
|
99
|
+
> "Scenario 5 came back inconclusive — the LLM judge disagreed with itself across orderings. This usually means the change is borderline. Want to re-run or approve it?"
|
|
156
100
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
Write the finalized scenarios to `evals/evals.json`. Map fields:
|
|
160
|
-
- `confirmed_scenarios[].prompt` → `evals[].prompt`
|
|
161
|
-
- `confirmed_scenarios[].expected_behavior` → `evals[].expected_output`
|
|
162
|
-
- `custom_scenarios[]` → append with auto-assigned IDs starting after the last confirmed ID
|
|
163
|
-
- `covers` and `why` are not persisted — they're ideation metadata
|
|
164
|
-
|
|
165
|
-
Run capture:
|
|
166
|
-
```bash
|
|
167
|
-
npx snapeval capture <skill-path>
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
Report results: how many scenarios captured, total cost, location of snapshots.
|
|
171
|
-
|
|
172
|
-
---
|
|
173
|
-
|
|
174
|
-
## Quick Check (regression detection)
|
|
175
|
-
|
|
176
|
-
1. Run: `npx snapeval check <skill-path>`
|
|
177
|
-
2. Parse the terminal output
|
|
178
|
-
3. Report conversationally:
|
|
179
|
-
- Which scenarios passed and at which tier (schema/judge)
|
|
180
|
-
- Which scenarios regressed with details about what changed
|
|
181
|
-
- Total cost and duration
|
|
182
|
-
4. If regressions found, present options:
|
|
183
|
-
- Fix the skill and re-check
|
|
184
|
-
- Run `@snapeval approve` to accept new behavior
|
|
185
|
-
|
|
186
|
-
---
|
|
101
|
+
## Approve
|
|
187
102
|
|
|
188
|
-
|
|
103
|
+
When the user approves regressions:
|
|
189
104
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
- Run `@snapeval approve` to accept new behavior
|
|
105
|
+
- Single: `npx snapeval approve <skill-path> --scenario 4`
|
|
106
|
+
→ "Approved scenario 4 — the new format is now the baseline."
|
|
107
|
+
- Multiple: `npx snapeval approve <skill-path> --scenario 4,5,6`
|
|
108
|
+
→ "Approved scenarios 4, 5, and 6 as new baselines."
|
|
109
|
+
- All: `npx snapeval approve <skill-path>`
|
|
110
|
+
→ "Approved all N regressed scenarios as new baselines."
|
|
111
|
+
- Always remind: "Don't forget to commit the updated snapshots."
|
|
198
112
|
|
|
199
|
-
|
|
113
|
+
## Visual Report
|
|
200
114
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
1. Run: `npx snapeval approve --scenario <N>` (or without --scenario for all)
|
|
204
|
-
2. Confirm what was approved
|
|
205
|
-
3. Remind user to commit the updated snapshots
|
|
206
|
-
|
|
207
|
-
---
|
|
115
|
+
The HTML report viewer shows baseline vs. current output with diff highlighting. Use it as a companion, not a required step.
|
|
208
116
|
|
|
209
|
-
|
|
117
|
+
**Offer the viewer when:**
|
|
118
|
+
- After a check with regressions: "Want to see the diffs side-by-side in the browser?"
|
|
119
|
+
- After a first capture with many scenarios: "Want to review all baselines visually?"
|
|
210
120
|
|
|
211
|
-
|
|
121
|
+
**Do not offer the viewer when:**
|
|
122
|
+
- Clean passes with no regressions
|
|
123
|
+
- Single-scenario approvals
|
|
124
|
+
- User signaled they want speed ("just run it")
|
|
212
125
|
|
|
213
|
-
**
|
|
214
|
-
**Good:** "hey can you greet my colleague eleanor? make it formal, she's kind of old school"
|
|
126
|
+
**Important:** The `report` command re-runs all scenarios (it calls check internally). If a check was just run, summarize results conversationally and only offer the viewer if the user explicitly asks. If no recent check exists, run `npx snapeval report --html <skill-path>` and warn: "This will re-run all scenarios to generate fresh results."
|
|
215
127
|
|
|
216
|
-
|
|
217
|
-
**Good:** "greet me in shakespearean english plz"
|
|
128
|
+
## Error Handling
|
|
218
129
|
|
|
219
|
-
|
|
220
|
-
**Good:** "" (literally empty) or just "hey" with no clear intent
|
|
130
|
+
Never show raw stack traces. Translate errors into plain language with a suggested next action:
|
|
221
131
|
|
|
222
|
-
|
|
132
|
+
| Error | Response |
|
|
133
|
+
|-------|----------|
|
|
134
|
+
| No SKILL.md found | "I can't find a SKILL.md in `<path>`. Is this the right directory?" |
|
|
135
|
+
| No baselines (NoBaselineError) | "No baselines exist yet. Want me to run a first evaluation to capture them?" |
|
|
136
|
+
| Inference unavailable | "I can't connect to the inference service. Check that Copilot CLI is authenticated (`copilot auth status`)." |
|
|
137
|
+
| Skill invocation failure | "The skill failed to respond to scenario N: `<error>`. This might be a bug in the skill — want to skip this scenario and continue?" |
|
|
138
|
+
| No scenarios generated | "I couldn't generate test scenarios from this SKILL.md. It might be too short or unclear. Can you tell me more about what the skill does?" |
|
|
223
139
|
|
|
224
|
-
##
|
|
140
|
+
## Rules
|
|
225
141
|
|
|
226
|
-
- Never ask the user to write evals.json
|
|
227
|
-
- Always read the target skill's SKILL.md
|
|
142
|
+
- Never ask the user to write evals.json or any config files manually
|
|
143
|
+
- Always read the target skill's SKILL.md before generating scenarios
|
|
228
144
|
- Report costs prominently (should be $0.00 for Copilot gpt-5-mini)
|
|
229
|
-
-
|
|
230
|
-
- The ideation viewer and eval viewer are separate tools for separate stages — don't confuse them
|
|
231
|
-
- Quick Onboard is for getting started fast — if users want thorough coverage, guide them to Full Ideation
|
|
145
|
+
- Only reference CLI flags that actually exist: `--adapter`, `--inference`, `--budget`, `--runs`, `--ci`, `--html`, `--scenario`, `--verbose`
|
package/src/commands/report.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import * as fs from 'node:fs';
|
|
2
2
|
import * as path from 'node:path';
|
|
3
|
+
import { execFile } from 'node:child_process';
|
|
4
|
+
import * as os from 'node:os';
|
|
3
5
|
import type { EvalResults } from '../types.js';
|
|
4
6
|
import { JSONReporter } from '../adapters/report/json.js';
|
|
5
7
|
import { TerminalReporter } from '../adapters/report/terminal.js';
|
|
@@ -35,6 +37,17 @@ export async function reportCommand(
|
|
|
35
37
|
await htmlReporter.report(results);
|
|
36
38
|
const reportPath = path.join(iterationDir, 'report.html');
|
|
37
39
|
console.log(`Report written to ${reportPath}`);
|
|
40
|
+
if (!process.env.CI) {
|
|
41
|
+
const platform = os.platform();
|
|
42
|
+
const opener = platform === 'darwin' ? 'open' : platform === 'win32' ? 'cmd' : 'xdg-open';
|
|
43
|
+
const args = platform === 'win32' ? ['/c', 'start', '', reportPath] : [reportPath];
|
|
44
|
+
execFile(opener, args, (err) => {
|
|
45
|
+
if (err) {
|
|
46
|
+
// Fallback: print path so user can open manually
|
|
47
|
+
console.log(`Open in browser: ${reportPath}`);
|
|
48
|
+
}
|
|
49
|
+
});
|
|
50
|
+
}
|
|
38
51
|
}
|
|
39
52
|
|
|
40
53
|
// Print terminal report
|
package/src/commands/review.ts
CHANGED
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
import { execFile } from 'node:child_process';
|
|
2
|
-
import * as path from 'node:path';
|
|
3
|
-
import * as process from 'node:process';
|
|
4
1
|
import type { SkillAdapter, InferenceAdapter } from '../types.js';
|
|
5
2
|
import { checkCommand } from './check.js';
|
|
6
3
|
import { reportCommand } from './report.js';
|
|
@@ -18,31 +15,8 @@ export async function reviewCommand(
|
|
|
18
15
|
html: true,
|
|
19
16
|
});
|
|
20
17
|
|
|
21
|
-
const reportPath = path.join(iterationDir, 'report.html');
|
|
22
|
-
openInBrowser(reportPath);
|
|
23
|
-
|
|
24
18
|
return {
|
|
25
19
|
iterationDir,
|
|
26
20
|
hasRegressions: results.summary.regressed > 0,
|
|
27
21
|
};
|
|
28
22
|
}
|
|
29
|
-
|
|
30
|
-
function openInBrowser(filePath: string): void {
|
|
31
|
-
const cmd =
|
|
32
|
-
process.platform === 'darwin'
|
|
33
|
-
? 'open'
|
|
34
|
-
: process.platform === 'win32'
|
|
35
|
-
? 'cmd'
|
|
36
|
-
: 'xdg-open';
|
|
37
|
-
|
|
38
|
-
const args =
|
|
39
|
-
process.platform === 'win32'
|
|
40
|
-
? ['/c', 'start', '', filePath]
|
|
41
|
-
: [filePath];
|
|
42
|
-
|
|
43
|
-
execFile(cmd, args, (err) => {
|
|
44
|
-
if (err) {
|
|
45
|
-
console.warn(`Could not open browser: ${err.message}`);
|
|
46
|
-
}
|
|
47
|
-
});
|
|
48
|
-
}
|