snapeval 2.0.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +144 -104
  2. package/bin/snapeval.ts +39 -1
  3. package/dist/bin/snapeval.js +33 -0
  4. package/dist/bin/snapeval.js.map +1 -1
  5. package/dist/src/adapters/copilot-sdk-client.js +3 -1
  6. package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
  7. package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
  8. package/dist/src/adapters/harness/copilot-sdk.js +101 -0
  9. package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
  10. package/dist/src/adapters/harness/resolve.js +10 -2
  11. package/dist/src/adapters/harness/resolve.js.map +1 -1
  12. package/dist/src/adapters/inference/copilot-sdk.js +4 -1
  13. package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
  14. package/dist/src/adapters/report/terminal.js +89 -9
  15. package/dist/src/adapters/report/terminal.js.map +1 -1
  16. package/dist/src/commands/eval.d.ts +3 -0
  17. package/dist/src/commands/eval.js +146 -17
  18. package/dist/src/commands/eval.js.map +1 -1
  19. package/dist/src/commands/review.d.ts +1 -0
  20. package/dist/src/commands/review.js.map +1 -1
  21. package/dist/src/config.js +2 -1
  22. package/dist/src/config.js.map +1 -1
  23. package/dist/src/engine/grader.js +67 -9
  24. package/dist/src/engine/grader.js.map +1 -1
  25. package/dist/src/engine/runner.d.ts +1 -0
  26. package/dist/src/engine/runner.js +15 -12
  27. package/dist/src/engine/runner.js.map +1 -1
  28. package/dist/src/errors.d.ts +6 -0
  29. package/dist/src/errors.js +21 -3
  30. package/dist/src/errors.js.map +1 -1
  31. package/dist/src/types.d.ts +3 -0
  32. package/package.json +4 -1
  33. package/plugin.json +1 -1
  34. package/skills/snapeval/SKILL.md +132 -39
  35. package/src/adapters/copilot-sdk-client.ts +3 -1
  36. package/src/adapters/harness/copilot-sdk.ts +126 -0
  37. package/src/adapters/harness/resolve.ts +13 -2
  38. package/src/adapters/inference/copilot-sdk.ts +5 -1
  39. package/src/adapters/report/terminal.ts +99 -10
  40. package/src/commands/eval.ts +183 -31
  41. package/src/commands/review.ts +1 -1
  42. package/src/config.ts +2 -1
  43. package/src/engine/grader.ts +59 -8
  44. package/src/engine/runner.ts +16 -13
  45. package/src/errors.ts +24 -3
  46. package/src/types.ts +3 -0
package/src/engine/grader.ts CHANGED
@@ -8,9 +8,34 @@ import type {
8
8
  AssertionResult,
9
9
  } from '../types.js';
10
10
 
11
+ const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/i;
12
+
13
+ function gradeExactMatch(assertion: string, output: string): AssertionResult | null {
14
+ const match = assertion.match(EXACT_MATCH_PATTERN);
15
+ if (!match) return null;
16
+ const expected = match[1];
17
+ const actual = output.trim();
18
+ const passed = actual === expected;
19
+ return {
20
+ text: assertion,
21
+ passed,
22
+ evidence: passed
23
+ ? `Exact match: "${expected}"`
24
+ : `Expected: "${expected}"\nGot: "${actual}"`,
25
+ };
26
+ }
27
+
11
28
  function buildGradingPrompt(assertions: string[], output: string, files: string[]): string {
12
29
  const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
13
- return `You are a strict eval grader. For each assertion, determine PASS or FAIL based on the output below. Require concrete evidence for a PASS — do not give the benefit of the doubt.
30
+ return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
31
+
32
+ GRADING RULES:
33
+ - PASS if the output satisfies the assertion's intent, even if wording differs slightly.
34
+ - FAIL only if the output clearly does not satisfy the assertion.
35
+ - Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
36
+ - For "contains" assertions: look for semantic presence, not exact substring.
37
+ - For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
38
+ - Always cite specific text from the output as evidence.
14
39
 
15
40
  OUTPUT:
16
41
  ---
@@ -23,7 +48,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
23
48
  Respond with JSON only:
24
49
  {
25
50
  "results": [
26
- {"text": "<assertion text>", "passed": true/false, "evidence": "<quote or reference from output>"}
51
+ {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
27
52
  ]
28
53
  }`;
29
54
  }
@@ -38,18 +63,38 @@ function runScript(
38
63
  return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
39
64
  }
40
65
  try {
41
- const evidence = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
66
+ const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] }).trim();
67
+ const evidence = stdout || `Script passed: ${scriptName}`;
42
68
  return { text: `script:${scriptName}`, passed: true, evidence };
43
69
  } catch (err: any) {
44
- const evidence = err.stdout?.trim() || err.message || 'Script exited with non-zero code';
70
+ // Extract the most useful error info without raw stack traces
71
+ const stderr = err.stderr?.trim();
72
+ const stdout = err.stdout?.trim();
73
+ let evidence: string;
74
+ if (err.code === 'EACCES') {
75
+ evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
76
+ } else if (stderr) {
77
+ // Take only the first line of stderr to avoid stack trace noise
78
+ evidence = stderr.split('\n')[0];
79
+ } else if (stdout) {
80
+ evidence = stdout.split('\n')[0];
81
+ } else {
82
+ evidence = `Script exited with code ${err.status ?? 'unknown'}`;
83
+ }
45
84
  return { text: `script:${scriptName}`, passed: false, evidence };
46
85
  }
47
86
  }
48
87
 
49
88
  function extractJSON(text: string): string {
50
- const match = text.match(/```(?:json)?\s*([\s\S]*?)```/);
51
- if (match) return match[1].trim();
52
- return text.trim();
89
+ // Try JSON-tagged fence first, then bare fence, then raw text
90
+ const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
91
+ if (jsonFence) return jsonFence[1].trim();
92
+ // Try parsing raw text as JSON before falling back to any fence
93
+ const trimmed = text.trim();
94
+ try { JSON.parse(trimmed); return trimmed; } catch { /* not raw JSON */ }
95
+ const anyFence = text.match(/```\s*([\s\S]*?)```/);
96
+ if (anyFence) return anyFence[1].trim();
97
+ return trimmed;
53
98
  }
54
99
 
55
100
  export async function gradeAssertions(
@@ -62,7 +107,8 @@ export async function gradeAssertions(
62
107
  if (assertions.length === 0) return null;
63
108
 
64
109
  const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
65
- const llmAssertions = assertions.filter(a => !a.startsWith('script:'));
110
+ const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
111
+ const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
66
112
  const results: AssertionResult[] = [];
67
113
 
68
114
  for (const assertion of scriptAssertions) {
@@ -72,6 +118,11 @@ export async function gradeAssertions(
72
118
  results.push(runScript(scriptName, outputDir, dir));
73
119
  }
74
120
 
121
+ for (const assertion of exactAssertions) {
122
+ const result = gradeExactMatch(assertion, output.raw);
123
+ if (result) results.push(result);
124
+ }
125
+
75
126
  if (llmAssertions.length > 0) {
76
127
  const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
77
128
  const response = await inference.chat(
package/src/engine/runner.ts CHANGED
@@ -5,6 +5,7 @@ import type { Harness, HarnessRunResult, EvalCase, TimingData } from '../types.j
5
5
  interface RunEvalResult {
6
6
  evalId: number;
7
7
  slug: string;
8
+ label?: string;
8
9
  prompt: string;
9
10
  withSkill: { output: HarnessRunResult };
10
11
  withoutSkill: { output: HarnessRunResult };
@@ -33,27 +34,29 @@ export async function runEval(
33
34
  const baselineVariant = oldSkillPath ? 'old_skill' : 'without_skill';
34
35
  const baselineDir = path.join(evalDir, baselineVariant);
35
36
 
36
- const withSkillResult = await harness.run({
37
- skillPath,
38
- prompt: evalCase.prompt,
39
- files: evalCase.files,
40
- outputDir: path.join(withSkillDir, 'outputs'),
41
- });
37
+ const [withSkillResult, baselineResult] = await Promise.all([
38
+ harness.run({
39
+ skillPath,
40
+ prompt: evalCase.prompt,
41
+ files: evalCase.files,
42
+ outputDir: path.join(withSkillDir, 'outputs'),
43
+ }),
44
+ harness.run({
45
+ skillPath: oldSkillPath,
46
+ prompt: evalCase.prompt,
47
+ files: evalCase.files,
48
+ outputDir: path.join(baselineDir, 'outputs'),
49
+ }),
50
+ ]);
42
51
  writeTiming(withSkillDir, withSkillResult);
43
52
  writeOutput(withSkillDir, withSkillResult);
44
-
45
- const baselineResult = await harness.run({
46
- skillPath: oldSkillPath,
47
- prompt: evalCase.prompt,
48
- files: evalCase.files,
49
- outputDir: path.join(baselineDir, 'outputs'),
50
- });
51
53
  writeTiming(baselineDir, baselineResult);
52
54
  writeOutput(baselineDir, baselineResult);
53
55
 
54
56
  return {
55
57
  evalId: evalCase.id,
56
58
  slug: evalCase.slug ?? `${evalCase.id}`,
59
+ label: evalCase.label,
57
60
  prompt: evalCase.prompt,
58
61
  withSkill: { output: withSkillResult },
59
62
  withoutSkill: { output: baselineResult },
package/src/errors.ts CHANGED
@@ -1,3 +1,10 @@
1
+ // Exit codes:
2
+ // 0 = success
3
+ // 1 = threshold not met (eval ran successfully but pass rate below threshold)
4
+ // 2 = config/input error (bad JSON, missing fields, invalid flags)
5
+ // 3 = file not found (missing skill dir, missing evals.json, missing script)
6
+ // 4 = runtime error (harness failure, grading failure, timeout)
7
+
1
8
  export class SnapevalError extends Error {
2
9
  constructor(message: string, public exitCode: number = 2) {
3
10
  super(message);
@@ -5,9 +12,23 @@ export class SnapevalError extends Error {
5
12
  }
6
13
  }
7
14
 
15
+ export class FileNotFoundError extends SnapevalError {
16
+ constructor(filePath: string, hint?: string) {
17
+ super(`File not found: ${filePath}${hint ? `. ${hint}` : ''}`, 3);
18
+ this.name = 'FileNotFoundError';
19
+ }
20
+ }
21
+
22
+ export class ThresholdError extends SnapevalError {
23
+ constructor(actual: number, threshold: number) {
24
+ super(`Skill pass rate ${(actual * 100).toFixed(1)}% is below threshold ${(threshold * 100).toFixed(1)}%`, 1);
25
+ this.name = 'ThresholdError';
26
+ }
27
+ }
28
+
8
29
  export class AdapterNotAvailableError extends SnapevalError {
9
30
  constructor(adapterName: string, installHint: string) {
10
- super(`${adapterName} is not available. ${installHint}`);
31
+ super(`${adapterName} is not available. ${installHint}`, 4);
11
32
  this.name = 'AdapterNotAvailableError';
12
33
  }
13
34
  }
@@ -21,14 +42,14 @@ export class RateLimitError extends SnapevalError {
21
42
 
22
43
  export class TimeoutError extends SnapevalError {
23
44
  constructor(evalId: number, timeoutMs: number) {
24
- super(`Eval ${evalId} timed out after ${timeoutMs}ms.`);
45
+ super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
25
46
  this.name = 'TimeoutError';
26
47
  }
27
48
  }
28
49
 
29
50
  export class GradingError extends SnapevalError {
30
51
  constructor(evalId: number, detail: string) {
31
- super(`Grading failed for eval ${evalId}: ${detail}`);
52
+ super(`Grading failed for eval ${evalId}: ${detail}`, 4);
32
53
  this.name = 'GradingError';
33
54
  }
34
55
  }
package/src/types.ts CHANGED
@@ -43,6 +43,7 @@ export interface EvalCase {
43
43
  id: number;
44
44
  prompt: string;
45
45
  expected_output: string;
46
+ label?: string;
46
47
  slug?: string;
47
48
  files?: string[];
48
49
  assertions?: string[];
@@ -110,6 +111,7 @@ export interface FeedbackData {
110
111
  export interface EvalRunResult {
111
112
  evalId: number;
112
113
  slug: string;
114
+ label?: string;
113
115
  prompt: string;
114
116
  withSkill: {
115
117
  output: HarnessRunResult;
@@ -142,4 +144,5 @@ export interface SnapevalConfig {
142
144
  inference: string;
143
145
  workspace: string;
144
146
  runs: number;
147
+ concurrency: number;
145
148
  }