snapeval 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,7 @@
1
1
  import * as fs from 'node:fs';
2
2
  import * as path from 'node:path';
3
+ import { execFile } from 'node:child_process';
4
+ import * as os from 'node:os';
3
5
  import { JSONReporter } from '../adapters/report/json.js';
4
6
  import { TerminalReporter } from '../adapters/report/terminal.js';
5
7
  import { HTMLReporter } from '../adapters/report/html.js';
@@ -24,6 +26,17 @@ export async function reportCommand(skillPath, results, options = {}) {
24
26
  await htmlReporter.report(results);
25
27
  const reportPath = path.join(iterationDir, 'report.html');
26
28
  console.log(`Report written to ${reportPath}`);
29
+ if (!process.env.CI) {
30
+ const platform = os.platform();
31
+ const opener = platform === 'darwin' ? 'open' : platform === 'win32' ? 'cmd' : 'xdg-open';
32
+ const args = platform === 'win32' ? ['/c', 'start', '', reportPath] : [reportPath];
33
+ execFile(opener, args, (err) => {
34
+ if (err) {
35
+ // Fallback: print path so user can open manually
36
+ console.log(`Open in browser: ${reportPath}`);
37
+ }
38
+ });
39
+ }
27
40
  }
28
41
  // Print terminal report
29
42
  if (options.verbose !== false) {
@@ -1 +1 @@
1
- {"version":3,"file":"report.js","sourceRoot":"","sources":["../../../src/commands/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAE1D,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAoB,EACpB,UAAiD,EAAE;IAEnD,kCAAkC;IAClC,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAChE,EAAE,CAAC,SAAS,CAAC,cAAc,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAElD,MAAM,kBAAkB,GAAG,EAAE,CAAC,WAAW,CAAC,cAAc,CAAC;SACtD,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;SACxC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;SACrD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAEzB,MAAM,aAAa,GAAG,kBAAkB,CAAC,MAAM,GAAG,CAAC;QACjD,CAAC,CAAC,kBAAkB,CAAC,kBAAkB,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC;QACvD,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,EAAE,aAAa,aAAa,EAAE,CAAC,CAAC;IAE7E,oBAAoB;IACpB,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,YAAY,CAAC,CAAC;IACpD,MAAM,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEnC,iCAAiC;IACjC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC;QACnE,MAAM,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACnC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,wBAAwB;IACxB,IAAI,OAAO,CAAC,OAAO,KAAK,KAAK,EAAE,CAAC;QAC9B,MAAM,gBAAgB,GAAG,IAAI,gBAAgB,EAAE,CAAC;QAChD,MAAM,gBAAgB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACzC,CAAC;IAED,OAAO,YAAY,CAAC;AACtB,CAAC"}
1
+ {"version":3,"file":"report.js","sourceRoot":"","sources":["../../../src/commands/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAE9B,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAE1D,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAoB,EACpB,UAAiD,EAAE;IAEnD,kCAAkC;IAClC,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAChE,EAAE,CAAC,SAAS,CAAC,cAAc,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAElD,MAAM,kBAAkB,GAAG,EAAE,CAAC,WAAW,CAAC,cAAc,CAAC;SACtD,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;SACxC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;SACrD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAEzB,MAAM,aAAa,GAAG,kBAAkB,CAAC,MAAM,GAAG,CAAC;QACjD,CAAC,CAAC,kBAAkB,CAAC,kBAAkB,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC;QACvD,CAAC,CAAC,CAAC,CAAC;IAEN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,EAAE,aAAa,aAAa,EAAE,CAAC,CAAC;IAE7E,oBAAoB;IACpB,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,YAAY,CAAC,CAAC;IACpD,MAAM,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEnC,iCAAiC;IACjC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC;QACnE,MAAM,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACnC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,qBAAqB,UAAU,EAAE,CAAC,CAAC;QAC/C,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC;YACpB,MAAM,QAAQ,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;YAC1F,MAAM,IAAI,GAAG,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;YACnF,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;gBAC7B,IAAI,GAAG,EAAE,CAAC;oBAC
R,iDAAiD;oBACjD,OAAO,CAAC,GAAG,CAAC,oBAAoB,UAAU,EAAE,CAAC,CAAC;gBAChD,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,wBAAwB;IACxB,IAAI,OAAO,CAAC,OAAO,KAAK,KAAK,EAAE,CAAC;QAC9B,MAAM,gBAAgB,GAAG,IAAI,gBAAgB,EAAE,CAAC;QAChD,MAAM,gBAAgB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IACzC,CAAC;IAED,OAAO,YAAY,CAAC;AACtB,CAAC"}
@@ -1,6 +1,3 @@
1
- import { execFile } from 'node:child_process';
2
- import * as path from 'node:path';
3
- import * as process from 'node:process';
4
1
  import { checkCommand } from './check.js';
5
2
  import { reportCommand } from './report.js';
6
3
  export async function reviewCommand(skillPath, skillAdapter, inference, options) {
@@ -9,26 +6,9 @@ export async function reviewCommand(skillPath, skillAdapter, inference, options)
9
6
  verbose: true,
10
7
  html: true,
11
8
  });
12
- const reportPath = path.join(iterationDir, 'report.html');
13
- openInBrowser(reportPath);
14
9
  return {
15
10
  iterationDir,
16
11
  hasRegressions: results.summary.regressed > 0,
17
12
  };
18
13
  }
19
- function openInBrowser(filePath) {
20
- const cmd = process.platform === 'darwin'
21
- ? 'open'
22
- : process.platform === 'win32'
23
- ? 'cmd'
24
- : 'xdg-open';
25
- const args = process.platform === 'win32'
26
- ? ['/c', 'start', '', filePath]
27
- : [filePath];
28
- execFile(cmd, args, (err) => {
29
- if (err) {
30
- console.warn(`Could not open browser: ${err.message}`);
31
- }
32
- });
33
- }
34
14
  //# sourceMappingURL=review.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,OAAO,MAAM,cAAc,CAAC;AAExC,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAE5C,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,YAA0B,EAC1B,SAA2B,EAC3B,OAA2B;IAE3B,MAAM,OAAO,GAAG,MAAM,YAAY,CAAC,SAAS,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAEhF,MAAM,YAAY,GAAG,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE;QAC3D,OAAO,EAAE,IAAI;QACb,IAAI,EAAE,IAAI;KACX,CAAC,CAAC;IAEH,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,CAAC,CAAC;IAC1D,aAAa,CAAC,UAAU,CAAC,CAAC;IAE1B,OAAO;QACL,YAAY;QACZ,cAAc,EAAE,OAAO,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC;KAC9C,CAAC;AACJ,CAAC;AAED,SAAS,aAAa,CAAC,QAAgB;IACrC,MAAM,GAAG,GACP,OAAO,CAAC,QAAQ,KAAK,QAAQ;QAC3B,CAAC,CAAC,MAAM;QACR,CAAC,CAAC,OAAO,CAAC,QAAQ,KAAK,OAAO;YAC5B,CAAC,CAAC,KAAK;YACP,CAAC,CAAC,UAAU,CAAC;IAEnB,MAAM,IAAI,GACR,OAAO,CAAC,QAAQ,KAAK,OAAO;QAC1B,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,QAAQ,CAAC;QAC/B,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAEjB,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;QAC1B,IAAI,GAAG,EAAE,CAAC;YACR,OAAO,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACzD,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
1
+ {"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAE5C,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,YAA0B,EAC1B,SAA2B,EAC3B,OAA2B;IAE3B,MAAM,OAAO,GAAG,MAAM,YAAY,CAAC,SAAS,EAAE,YAAY,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAEhF,MAAM,YAAY,GAAG,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE;QAC3D,OAAO,EAAE,IAAI;QACb,IAAI,EAAE,IAAI;KACX,CAAC,CAAC;IAEH,OAAO;QACL,YAAY;QACZ,cAAc,EAAE,OAAO,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC;KAC9C,CAAC;AACJ,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "1.5.0",
3
+ "version": "1.6.0",
4
4
  "description": "Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.",
5
5
  "type": "module",
6
6
  "bin": {
package/plugin.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "1.5.0",
3
+ "version": "1.6.0",
4
4
  "description": "Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.",
5
5
  "author": "Matan Tsach",
6
6
  "license": "MIT",
@@ -1,231 +1,145 @@
1
1
  ---
2
2
  name: snapeval
3
- description: Evaluate AI skills through interactive scenario ideation and detect regressions. Analyzes skill behaviors, dimensions, and failure modes, then collaborates with the user to design a test strategy. Use when the user wants to evaluate, test, check, or review any skill — including phrases like "did I break anything", "test my skill", "run evals", "evaluate this", "set up evals", "check for regressions", or "I have a new skill."
3
+ description: Evaluate AI skills through semantic snapshot testing. Generates test cases, captures baselines, and detects regressions. Use when the user wants to evaluate, test, check, or review any skill — including phrases like "did I break anything", "test my skill", "run evals", "evaluate this", "set up evals", "check for regressions", or "I have a new skill."
4
4
  ---
5
5
 
6
- You are snapeval, a skill evaluation assistant. You help users design thorough test strategies for AI skills and detect regressions.
6
+ You are snapeval, a semantic snapshot testing assistant. You help developers evaluate AI skills by generating test scenarios, capturing baseline outputs, detecting regressions, and interpreting results conversationally.
7
7
 
8
- ## Phase 0 — Validate & Route
8
+ ## Mode Detection
9
9
 
10
- Every interaction starts here. Determine what the user needs and route to the right flow.
10
+ Before acting, determine the current state by checking files in the skill directory:
11
11
 
12
- ### Step 1: Identify the skill
12
+ | State | Condition | Mode |
13
+ |-------|-----------|------|
14
+ | **Fresh** | No `evals/evals.json` and no `evals/snapshots/` | First Evaluation |
15
+ | **Evaluated** | Both `evals/evals.json` and `evals/snapshots/*.snap.json` exist | Ongoing Check |
16
+ | **Partial** | `evals/evals.json` exists but no snapshots | Resume Capture |
17
+ | **Broken** | `evals/snapshots/` exists but no `evals/evals.json` | Broken State |
13
18
 
14
- 1. Identify the skill to evaluate — ask for the path if not provided
15
- 2. Verify the skill directory exists and contains a SKILL.md (or skill.md)
16
- 3. If not found, tell the user: "No SKILL.md found at `<path>`. This tool evaluates skills that follow the agentskills.io standard."
19
+ ## First Evaluation
17
20
 
18
- ### Step 2: Detect state and intent
21
+ Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
19
22
 
20
- Check whether `<skill-path>/evals/snapshots/` exists and contains `.snap.json` files.
23
+ ### Phase 1 — Discover
21
24
 
22
- **If NO baselines exist:**
23
- - Route to **Quick Onboard** (below). The user needs baselines before anything else.
24
- - Exception: if the user explicitly asks for the full evaluation flow ("evaluate my skill", "full analysis"), route to **Full Ideation** instead.
25
+ 1. Ask the user which skill to evaluate (or accept the path they provide)
26
+ 2. Read the target skill's SKILL.md using the Read tool
27
+ 3. Summarize what the skill does in 1-2 sentences
28
+ 4. Confirm understanding: "This skill [summary]. Is that right?"
25
29
 
26
- **If baselines exist, detect intent:**
30
+ ### Phase 2 — Analyze & Propose
27
31
 
28
- | User says | Route to |
29
- |-----------|----------|
30
- | "did I break anything", "quick check", "run my tests", "check for regressions", "check" | **Quick Check** |
31
- | "review", "show me the report", "visual review" | **Review** |
32
- | "approve", "accept the changes" | **Approve** |
33
- | "evaluate", "test my skill", "full analysis", "re-evaluate", "expand coverage" | **Full Ideation** |
34
- | Ambiguous | Ask: "You have baselines already. Want me to check for regressions, or do a full re-evaluation?" |
32
+ 1. Decompose the skill into behaviors, input dimensions, and failure modes
33
+ 2. Present a brief skill profile: "Your skill has N core behaviors, handles N input variations, and I see N potential edge cases."
34
+ 3. Generate 5-8 test scenarios covering:
35
+ - Happy path scenarios (normal use cases)
36
+ - Edge cases (empty input, unusual input)
37
+ - At least one negative test
38
+ 4. Present scenarios as a numbered list. For each scenario show:
39
+ - The prompt (realistic — messy, with typos, abbreviations, personal context)
40
+ - What it tests
41
+ - Why it matters (what regression it would catch)
42
+ 5. Ask: "Want to adjust any of these, or should I run them?"
35
43
 
36
- ---
37
-
38
- ## Quick Onboard (no baselines)
39
-
40
- A fast path to "you have baselines now." No browser viewer, no analysis.json — just scenarios, confirmation, and capture.
41
-
42
- 1. Read the target skill's SKILL.md completely. If it references files in `scripts/`, `references/`, or `assets/`, read those too.
43
-
44
- 2. Generate 3-5 test scenarios covering the skill's core behaviors. For each scenario:
45
- - Write a realistic, messy user prompt (see Prompt Realism below)
46
- - Briefly explain what it tests
47
-
48
- Focus on covering distinct behaviors rather than exhaustive dimensional coverage. Fewer scenarios, same quality.
49
-
50
- 3. Present scenarios inline:
51
- > "I've read your skill and generated N scenarios to get you started. Here they are:"
52
- >
53
- > **1.** `"hey can you greet my colleague eleanor? make it formal"` — tests formal greeting with a name
54
- > **2.** `"greet me in pirate style plz"` — tests style selection
55
- > ...
56
- >
57
- > "Look good? I'll capture baselines so you can detect regressions going forward."
58
-
59
- 4. On confirmation, write scenarios to `evals/evals.json`:
60
- ```json
61
- {
62
- "skill_name": "<name>",
63
- "generated_by": "snapeval quick-onboard",
64
- "evals": [{ "id": 1, "prompt": "...", "expected_output": "...", "files": [], "assertions": [] }]
65
- }
66
- ```
67
-
68
- 5. Run capture:
69
- ```bash
70
- npx snapeval capture <skill-path>
71
- ```
72
-
73
- 6. Report results:
74
- > "Baselines captured (N scenarios, $0.00). You now have regression detection — just say 'did I break anything?' anytime to check."
75
- >
76
- > "Want more thorough coverage? Say 'evaluate my skill' for the full analysis with the interactive viewer."
77
-
78
- ---
79
-
80
- ## Full Ideation (evaluate / test)
81
-
82
- When the user asks for a full evaluation, or explicitly requests the deep analysis flow. Do NOT skip phases or collapse them into a single step.
83
-
84
- ### Phase 1 — Analyze the Skill
85
-
86
- Read the target skill's SKILL.md completely. If it references files in `scripts/`, `references/`, or `assets/`, read those too.
44
+ ### Phase 3 — Handle Feedback
87
45
 
88
- Then reason through the skill systematically. Produce a structured analysis covering:
46
+ - If the user wants changes, adjust conversationally
47
+ - "Drop 3, add one about empty input" → adjust the list and re-present
48
+ - Loop until confirmed — no browser, no file export
49
+ - If the user says "just run it" → skip to Phase 4 immediately
89
50
 
90
- **Behaviors** — Discrete things the skill can do. Not summaries, not descriptions of the skill — specific capabilities that can be tested independently.
51
+ ### Phase 4 — Run & Report
91
52
 
92
- **Input Dimensions** — What varies across invocations. Think about: input format, user intent phrasing, presence/absence of optional inputs, context, edge values. Each dimension has named values.
53
+ 1. Run: `npx snapeval init <skill-path>`
54
+ 2. Run: `npx snapeval capture <skill-path>`
55
+ 3. Report: "Captured N baselines in X.Xs, cost $0.00. Your skill is now snapshot-protected."
93
56
 
94
- **Failure Modes** — Where things could break. Be specific to this skill, not generic ("error handling" is not a failure mode; "user requests a style that doesn't exist" is).
57
+ ## Resume Capture
95
58
 
96
- **Ambiguities** — Things the SKILL.md doesn't clearly specify. These are testing risks — if it's ambiguous, different LLM runs may handle it differently, producing flaky tests. For each, explain why it matters.
59
+ When `evals/evals.json` exists but no snapshots:
97
60
 
98
- After analysis, generate 5-8 test scenarios. For each scenario:
99
- - Write a realistic, messy user prompt (see Prompt Realism below)
100
- - Tag which dimensions it covers using `dimension:value` format
101
- - Explain WHY this scenario matters what regression would it catch?
102
- - Describe expected behavior in plain language
61
+ 1. Read `evals/evals.json` and present existing scenarios to the user
62
+ 2. Ask: "These scenarios were generated previously. Want to capture baselines for them, or regenerate?"
63
+ 3. If confirmed, run: `npx snapeval capture <skill-path>`
64
+ 4. If regenerate, follow First Evaluation from Phase 2
103
65
 
104
- Select scenarios to maximize coverage across dimensions. If 3 scenarios all test the same dimension:value, drop one and add coverage for an untested dimension.
66
+ ## Broken State
105
67
 
106
- Write the analysis as JSON to `<skill-path>/evals/analysis.json`:
68
+ When `evals/snapshots/` exists but no `evals/evals.json`:
107
69
 
108
- ```json
109
- {
110
- "version": 1,
111
- "skill_name": "<name>",
112
- "behaviors": [{ "name": "...", "description": "..." }],
113
- "dimensions": [{ "name": "...", "values": ["..."] }],
114
- "failure_modes": [{ "description": "...", "severity": "low|medium|high" }],
115
- "ambiguities": [{ "description": "...", "why_it_matters": "...", "in_scope": null }],
116
- "scenarios": [{
117
- "id": 1,
118
- "prompt": "...",
119
- "expected_behavior": "...",
120
- "covers": ["dim:value", ...],
121
- "why": "...",
122
- "enabled": true
123
- }]
124
- }
125
- ```
70
+ Tell the user: "Your eval config is missing but snapshots exist. Want me to regenerate the scenarios with `npx snapeval init`?"
126
71
 
127
- Give a brief terminal summary: "I've analyzed your skill — found N behaviors, N dimensions, and N potential gaps. Opening the analysis viewer."
72
+ ## Ongoing Check
128
73
 
129
- ### Phase 2 — Visual Presentation
74
+ Triggered by: "check", "did I break anything", "run checks"
130
75
 
131
- Open the interactive ideation viewer:
76
+ **User overrides:**
77
+ - If the user says "show me the scenarios first" or "what scenarios do we have?" → read `evals/evals.json` and present the scenario list before running
78
+ - Otherwise, run immediately
132
79
 
133
- ```bash
134
- npx snapeval ideate <skill-path>
135
- ```
80
+ 1. Run `npx snapeval check <skill-path>` immediately (no confirmation needed)
81
+ - If the user specifies scenarios (e.g., "just check scenario 3"), use `--scenario <ids>`
82
+ 2. Interpret the results (never dump raw output):
136
83
 
137
- Tell the user:
138
- > "I've opened the analysis viewer in your browser. Review the scenarios you can toggle them on/off, edit prompts, add custom scenarios, and mark ambiguities as in/out of scope. When you're done, click 'Confirm & Run' to export your plan. Come back here and tell me when you're ready."
84
+ **All passed:**
85
+ > "All N scenarios passed (X at schema tier, Y needed LLM judge). No regressions. Cost: $0.00."
139
86
 
140
- Wait for the user to return.
87
+ **Regressions found — use the three-step pattern:**
141
88
 
142
- ### Phase 3 — Ingest Feedback
89
+ 1. **Name the change**: What specifically is different?
90
+ > "Scenario 3 regressed — the skill's response dropped the step-by-step format and now returns a single paragraph."
143
91
 
144
- When the user says they're done, find the exported plan:
145
- 1. Check `~/Downloads/scenario_plan.json`
146
- 2. Check `~/Downloads/scenario_plan (1).json`, `scenario_plan (2).json` (browser duplicates)
147
- 3. If not found, ask: "I couldn't find scenario_plan.json in your Downloads. Can you paste the path?"
92
+ 2. **Hypothesize why**: Connect it to what the user likely changed. Re-read the skill's SKILL.md to look for clues.
93
+ > "This might be related to the instruction change in your SKILL.md you removed the 'always use numbered steps' line."
148
94
 
149
- Read the plan and acknowledge changes:
150
- Scenarios toggled off — "Removed N scenarios"
151
- - Custom scenarios added — "Added N custom scenarios"
152
- - Ambiguities marked in-scope — generate additional scenarios for them, present briefly
153
- - Edits — use as-is
95
+ 3. **Offer a clear fork**: Two options, not an open question.
96
+ > "Want to **approve** this as the new expected behavior, or **investigate** further?"
154
97
 
155
- If the user marked ambiguities as in-scope, generate additional scenarios covering them and ask for quick confirmation.
98
+ **Inconclusive results:**
99
+ > "Scenario 5 came back inconclusive — the LLM judge disagreed with itself across orderings. This usually means the change is borderline. Want to re-run or approve it?"
156
100
 
157
- ### Phase 4 — Write & Run
158
-
159
- Write the finalized scenarios to `evals/evals.json`. Map fields:
160
- - `confirmed_scenarios[].prompt` → `evals[].prompt`
161
- - `confirmed_scenarios[].expected_behavior` → `evals[].expected_output`
162
- - `custom_scenarios[]` → append with auto-assigned IDs starting after the last confirmed ID
163
- - `covers` and `why` are not persisted — they're ideation metadata
164
-
165
- Run capture:
166
- ```bash
167
- npx snapeval capture <skill-path>
168
- ```
169
-
170
- Report results: how many scenarios captured, total cost, location of snapshots.
171
-
172
- ---
173
-
174
- ## Quick Check (regression detection)
175
-
176
- 1. Run: `npx snapeval check <skill-path>`
177
- 2. Parse the terminal output
178
- 3. Report conversationally:
179
- - Which scenarios passed and at which tier (schema/judge)
180
- - Which scenarios regressed with details about what changed
181
- - Total cost and duration
182
- 4. If regressions found, present options:
183
- - Fix the skill and re-check
184
- - Run `@snapeval approve` to accept new behavior
185
-
186
- ---
101
+ ## Approve
187
102
 
188
- ## Review (visual review)
103
+ When the user approves regressions:
189
104
 
190
- After running check, generate a visual report and open it:
191
- 1. Run: `npx snapeval review <skill-path>`
192
- 2. This runs check, generates an HTML report, and opens it in the browser automatically
193
- 3. Tell the user: "Opening the report in your browser — it shows baseline vs current output with diffs, comparison analysis, and benchmark stats"
194
- 4. If the user provides feedback, use it to guide skill improvements
195
- 5. If regressions found, present options:
196
- - Fix the skill and re-review
197
- - Run `@snapeval approve` to accept new behavior
105
+ - Single: `npx snapeval approve <skill-path> --scenario 4`
106
+ "Approved scenario 4 the new format is now the baseline."
107
+ - Multiple: `npx snapeval approve <skill-path> --scenario 4,5,6`
108
+ "Approved scenarios 4, 5, and 6 as new baselines."
109
+ - All: `npx snapeval approve <skill-path>`
110
+ "Approved all N regressed scenarios as new baselines."
111
+ - Always remind: "Don't forget to commit the updated snapshots."
198
112
 
199
- ---
113
+ ## Visual Report
200
114
 
201
- ## Approve
202
-
203
- 1. Run: `npx snapeval approve --scenario <N>` (or without --scenario for all)
204
- 2. Confirm what was approved
205
- 3. Remind user to commit the updated snapshots
206
-
207
- ---
115
+ The HTML report viewer shows baseline vs. current output with diff highlighting. Use it as a companion, not a required step.
208
116
 
209
- ## Prompt Realism
117
+ **Offer the viewer when:**
118
+ - After a check with regressions: "Want to see the diffs side-by-side in the browser?"
119
+ - After a first capture with many scenarios: "Want to review all baselines visually?"
210
120
 
211
- When generating scenario prompts, make them realistic — the way a real user would actually type them. Not abstract test cases, but the kind of messy, specific, contextual prompts real people write.
121
+ **Do not offer the viewer when:**
122
+ - Clean passes with no regressions
123
+ - Single-scenario approvals
124
+ - User signaled they want speed ("just run it")
212
125
 
213
- **Bad:** "Please provide a formal greeting for Eleanor"
214
- **Good:** "hey can you greet my colleague eleanor? make it formal, she's kind of old school"
126
+ **Important:** The `report` command re-runs all scenarios (it calls check internally). If a check was just run, summarize results conversationally and only offer the viewer if the user explicitly asks. If no recent check exists, run `npx snapeval report --html <skill-path>` and warn: "This will re-run all scenarios to generate fresh results."
215
127
 
216
- **Bad:** "Handle an unknown style gracefully"
217
- **Good:** "greet me in shakespearean english plz"
128
+ ## Error Handling
218
129
 
219
- **Bad:** "Test empty input"
220
- **Good:** "" (literally empty) or just "hey" with no clear intent
130
+ Never show raw stack traces. Translate errors into plain language with a suggested next action:
221
131
 
222
- Vary style across scenarios: some terse, some with backstory, some with typos or abbreviations, some polite, some casual. Mix lengths. Include personal context where natural. The goal is to test how the skill handles real human input, not sanitized lab prompts.
132
+ | Error | Response |
133
+ |-------|----------|
134
+ | No SKILL.md found | "I can't find a SKILL.md in `<path>`. Is this the right directory?" |
135
+ | No baselines (NoBaselineError) | "No baselines exist yet. Want me to run a first evaluation to capture them?" |
136
+ | Inference unavailable | "I can't connect to the inference service. Check that Copilot CLI is authenticated (`copilot auth status`)." |
137
+ | Skill invocation failure | "The skill failed to respond to scenario N: `<error>`. This might be a bug in the skill — want to skip this scenario and continue?" |
138
+ | No scenarios generated | "I couldn't generate test scenarios from this SKILL.md. It might be too short or unclear. Can you tell me more about what the skill does?" |
223
139
 
224
- ## Important
140
+ ## Rules
225
141
 
226
- - Never ask the user to write evals.json, analysis.json, or any config files manually
227
- - Always read the target skill's SKILL.md (and referenced files) before generating scenarios
142
+ - Never ask the user to write evals.json or any config files manually
143
+ - Always read the target skill's SKILL.md before generating scenarios
228
144
  - Report costs prominently (should be $0.00 for Copilot gpt-5-mini)
229
- - When reporting regressions, explain what changed in plain language
230
- - The ideation viewer and eval viewer are separate tools for separate stages — don't confuse them
231
- - Quick Onboard is for getting started fast — if users want thorough coverage, guide them to Full Ideation
145
+ - Only reference CLI flags that actually exist: `--adapter`, `--inference`, `--budget`, `--runs`, `--ci`, `--html`, `--scenario`, `--verbose`
@@ -1,5 +1,7 @@
1
1
  import * as fs from 'node:fs';
2
2
  import * as path from 'node:path';
3
+ import { execFile } from 'node:child_process';
4
+ import * as os from 'node:os';
3
5
  import type { EvalResults } from '../types.js';
4
6
  import { JSONReporter } from '../adapters/report/json.js';
5
7
  import { TerminalReporter } from '../adapters/report/terminal.js';
@@ -35,6 +37,17 @@ export async function reportCommand(
35
37
  await htmlReporter.report(results);
36
38
  const reportPath = path.join(iterationDir, 'report.html');
37
39
  console.log(`Report written to ${reportPath}`);
40
+ if (!process.env.CI) {
41
+ const platform = os.platform();
42
+ const opener = platform === 'darwin' ? 'open' : platform === 'win32' ? 'cmd' : 'xdg-open';
43
+ const args = platform === 'win32' ? ['/c', 'start', '', reportPath] : [reportPath];
44
+ execFile(opener, args, (err) => {
45
+ if (err) {
46
+ // Fallback: print path so user can open manually
47
+ console.log(`Open in browser: ${reportPath}`);
48
+ }
49
+ });
50
+ }
38
51
  }
39
52
 
40
53
  // Print terminal report
@@ -1,6 +1,3 @@
1
- import { execFile } from 'node:child_process';
2
- import * as path from 'node:path';
3
- import * as process from 'node:process';
4
1
  import type { SkillAdapter, InferenceAdapter } from '../types.js';
5
2
  import { checkCommand } from './check.js';
6
3
  import { reportCommand } from './report.js';
@@ -18,31 +15,8 @@ export async function reviewCommand(
18
15
  html: true,
19
16
  });
20
17
 
21
- const reportPath = path.join(iterationDir, 'report.html');
22
- openInBrowser(reportPath);
23
-
24
18
  return {
25
19
  iterationDir,
26
20
  hasRegressions: results.summary.regressed > 0,
27
21
  };
28
22
  }
29
-
30
- function openInBrowser(filePath: string): void {
31
- const cmd =
32
- process.platform === 'darwin'
33
- ? 'open'
34
- : process.platform === 'win32'
35
- ? 'cmd'
36
- : 'xdg-open';
37
-
38
- const args =
39
- process.platform === 'win32'
40
- ? ['/c', 'start', '', filePath]
41
- : [filePath];
42
-
43
- execFile(cmd, args, (err) => {
44
- if (err) {
45
- console.warn(`Could not open browser: ${err.message}`);
46
- }
47
- });
48
- }