snapeval 2.0.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -104
- package/bin/snapeval.ts +39 -1
- package/dist/bin/snapeval.js +33 -0
- package/dist/bin/snapeval.js.map +1 -1
- package/dist/src/adapters/copilot-sdk-client.js +3 -1
- package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
- package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
- package/dist/src/adapters/harness/copilot-sdk.js +101 -0
- package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
- package/dist/src/adapters/harness/resolve.js +10 -2
- package/dist/src/adapters/harness/resolve.js.map +1 -1
- package/dist/src/adapters/inference/copilot-sdk.js +4 -1
- package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/report/terminal.js +89 -9
- package/dist/src/adapters/report/terminal.js.map +1 -1
- package/dist/src/commands/eval.d.ts +3 -0
- package/dist/src/commands/eval.js +146 -17
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/commands/review.d.ts +1 -0
- package/dist/src/commands/review.js.map +1 -1
- package/dist/src/config.js +2 -1
- package/dist/src/config.js.map +1 -1
- package/dist/src/engine/grader.js +67 -9
- package/dist/src/engine/grader.js.map +1 -1
- package/dist/src/engine/runner.d.ts +1 -0
- package/dist/src/engine/runner.js +15 -12
- package/dist/src/engine/runner.js.map +1 -1
- package/dist/src/errors.d.ts +6 -0
- package/dist/src/errors.js +21 -3
- package/dist/src/errors.js.map +1 -1
- package/dist/src/types.d.ts +3 -0
- package/package.json +4 -1
- package/plugin.json +1 -1
- package/skills/snapeval/SKILL.md +132 -39
- package/src/adapters/copilot-sdk-client.ts +3 -1
- package/src/adapters/harness/copilot-sdk.ts +126 -0
- package/src/adapters/harness/resolve.ts +13 -2
- package/src/adapters/inference/copilot-sdk.ts +5 -1
- package/src/adapters/report/terminal.ts +99 -10
- package/src/commands/eval.ts +183 -31
- package/src/commands/review.ts +1 -1
- package/src/config.ts +2 -1
- package/src/engine/grader.ts +59 -8
- package/src/engine/runner.ts +16 -13
- package/src/errors.ts +24 -3
- package/src/types.ts +3 -0
package/src/engine/grader.ts
CHANGED
|
@@ -8,9 +8,34 @@ import type {
|
|
|
8
8
|
AssertionResult,
|
|
9
9
|
} from '../types.js';
|
|
10
10
|
|
|
11
|
+
/**
 * Recognises deterministic assertions of the form
 * `Output is exactly: "<text>"` or `Output equals exactly: "<text>"`
 * (case-insensitive). The quoted text must be non-empty and on a single line,
 * because `.+` does not cross newlines; anything else is left to other graders.
 */
const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/i;

/**
 * Grades an exact-match assertion without calling the LLM.
 *
 * Surrounding whitespace is ignored on BOTH sides of the comparison. The
 * output was already trimmed, so leaving the expected text untrimmed meant an
 * assertion such as `Output is exactly: " hello "` could never pass.
 *
 * @param assertion - The assertion text to interpret.
 * @param output - The raw output produced by the run.
 * @returns The graded result, or `null` when the assertion is not an
 *          exact-match assertion.
 */
function gradeExactMatch(assertion: string, output: string): AssertionResult | null {
  const match = EXACT_MATCH_PATTERN.exec(assertion);
  if (match === null) return null;

  // Normalise both operands the same way so the comparison is symmetric.
  const expected = (match[1] ?? '').trim();
  const actual = output.trim();
  const passed = actual === expected;

  return {
    text: assertion,
    passed,
    evidence: passed
      ? `Exact match: "${expected}"`
      : `Expected: "${expected}"\nGot: "${actual}"`,
  };
}
|
|
27
|
+
|
|
11
28
|
function buildGradingPrompt(assertions: string[], output: string, files: string[]): string {
|
|
12
29
|
const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
|
|
13
|
-
return `You are
|
|
30
|
+
return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
|
|
31
|
+
|
|
32
|
+
GRADING RULES:
|
|
33
|
+
- PASS if the output satisfies the assertion's intent, even if wording differs slightly.
|
|
34
|
+
- FAIL only if the output clearly does not satisfy the assertion.
|
|
35
|
+
- Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
|
|
36
|
+
- For "contains" assertions: look for semantic presence, not exact substring.
|
|
37
|
+
- For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
|
|
38
|
+
- Always cite specific text from the output as evidence.
|
|
14
39
|
|
|
15
40
|
OUTPUT:
|
|
16
41
|
---
|
|
@@ -23,7 +48,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
|
|
|
23
48
|
Respond with JSON only:
|
|
24
49
|
{
|
|
25
50
|
"results": [
|
|
26
|
-
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote
|
|
51
|
+
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
|
|
27
52
|
]
|
|
28
53
|
}`;
|
|
29
54
|
}
|
|
@@ -38,18 +63,38 @@ function runScript(
|
|
|
38
63
|
return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
|
|
39
64
|
}
|
|
40
65
|
try {
|
|
41
|
-
const
|
|
66
|
+
const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] }).trim();
|
|
67
|
+
const evidence = stdout || `Script passed: ${scriptName}`;
|
|
42
68
|
return { text: `script:${scriptName}`, passed: true, evidence };
|
|
43
69
|
} catch (err: any) {
|
|
44
|
-
|
|
70
|
+
// Extract the most useful error info without raw stack traces
|
|
71
|
+
const stderr = err.stderr?.trim();
|
|
72
|
+
const stdout = err.stdout?.trim();
|
|
73
|
+
let evidence: string;
|
|
74
|
+
if (err.code === 'EACCES') {
|
|
75
|
+
evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
|
|
76
|
+
} else if (stderr) {
|
|
77
|
+
// Take only the first line of stderr to avoid stack trace noise
|
|
78
|
+
evidence = stderr.split('\n')[0];
|
|
79
|
+
} else if (stdout) {
|
|
80
|
+
evidence = stdout.split('\n')[0];
|
|
81
|
+
} else {
|
|
82
|
+
evidence = `Script exited with code ${err.status ?? 'unknown'}`;
|
|
83
|
+
}
|
|
45
84
|
return { text: `script:${scriptName}`, passed: false, evidence };
|
|
46
85
|
}
|
|
47
86
|
}
|
|
48
87
|
|
|
49
88
|
/**
 * Pulls the JSON payload out of a model response.
 *
 * Candidates are tried in this order:
 *   1. the body of a ```json fence (tag matched case-insensitively),
 *   2. the whole trimmed text, if it already parses as JSON,
 *   3. the body of the first fence of any kind,
 *   4. the outermost `{...}` / `[...]` span, if that span parses as JSON,
 *   5. the trimmed text unchanged (the caller's parse will then report the error).
 *
 * @param text - Raw response text.
 * @returns The best JSON candidate found; not guaranteed to be valid JSON.
 */
function extractJSON(text: string): string {
  const parses = (candidate: string): boolean => {
    try {
      JSON.parse(candidate);
      return true;
    } catch {
      return false;
    }
  };

  // `i` flag: a ```JSON fence would otherwise fall through to the bare-fence
  // branch, which keeps the language tag inside the captured body.
  const jsonFence = text.match(/```json\s*([\s\S]*?)```/i);
  if (jsonFence) return jsonFence[1].trim();

  // Raw JSON is checked before bare fences so JSON whose string values contain
  // backticks is not mistaken for a fenced block.
  const trimmed = text.trim();
  if (parses(trimmed)) return trimmed;

  const anyFence = text.match(/```\s*([\s\S]*?)```/);
  if (anyFence) return anyFence[1].trim();

  // JSON embedded in unfenced prose, e.g. `Here are the results: {...}`.
  const start = trimmed.search(/[{\[]/);
  if (start !== -1) {
    const closer = trimmed[start] === '{' ? '}' : ']';
    const end = trimmed.lastIndexOf(closer);
    if (end > start) {
      const embedded = trimmed.slice(start, end + 1);
      if (parses(embedded)) return embedded;
    }
  }

  return trimmed;
}
|
|
54
99
|
|
|
55
100
|
export async function gradeAssertions(
|
|
@@ -62,7 +107,8 @@ export async function gradeAssertions(
|
|
|
62
107
|
if (assertions.length === 0) return null;
|
|
63
108
|
|
|
64
109
|
const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
|
|
65
|
-
const
|
|
110
|
+
const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
|
|
111
|
+
const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
|
|
66
112
|
const results: AssertionResult[] = [];
|
|
67
113
|
|
|
68
114
|
for (const assertion of scriptAssertions) {
|
|
@@ -72,6 +118,11 @@ export async function gradeAssertions(
|
|
|
72
118
|
results.push(runScript(scriptName, outputDir, dir));
|
|
73
119
|
}
|
|
74
120
|
|
|
121
|
+
for (const assertion of exactAssertions) {
|
|
122
|
+
const result = gradeExactMatch(assertion, output.raw);
|
|
123
|
+
if (result) results.push(result);
|
|
124
|
+
}
|
|
125
|
+
|
|
75
126
|
if (llmAssertions.length > 0) {
|
|
76
127
|
const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
|
|
77
128
|
const response = await inference.chat(
|
package/src/engine/runner.ts
CHANGED
|
@@ -5,6 +5,7 @@ import type { Harness, HarnessRunResult, EvalCase, TimingData } from '../types.j
|
|
|
5
5
|
interface RunEvalResult {
|
|
6
6
|
evalId: number;
|
|
7
7
|
slug: string;
|
|
8
|
+
label?: string;
|
|
8
9
|
prompt: string;
|
|
9
10
|
withSkill: { output: HarnessRunResult };
|
|
10
11
|
withoutSkill: { output: HarnessRunResult };
|
|
@@ -33,27 +34,29 @@ export async function runEval(
|
|
|
33
34
|
const baselineVariant = oldSkillPath ? 'old_skill' : 'without_skill';
|
|
34
35
|
const baselineDir = path.join(evalDir, baselineVariant);
|
|
35
36
|
|
|
36
|
-
const withSkillResult = await
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
37
|
+
const [withSkillResult, baselineResult] = await Promise.all([
|
|
38
|
+
harness.run({
|
|
39
|
+
skillPath,
|
|
40
|
+
prompt: evalCase.prompt,
|
|
41
|
+
files: evalCase.files,
|
|
42
|
+
outputDir: path.join(withSkillDir, 'outputs'),
|
|
43
|
+
}),
|
|
44
|
+
harness.run({
|
|
45
|
+
skillPath: oldSkillPath,
|
|
46
|
+
prompt: evalCase.prompt,
|
|
47
|
+
files: evalCase.files,
|
|
48
|
+
outputDir: path.join(baselineDir, 'outputs'),
|
|
49
|
+
}),
|
|
50
|
+
]);
|
|
42
51
|
writeTiming(withSkillDir, withSkillResult);
|
|
43
52
|
writeOutput(withSkillDir, withSkillResult);
|
|
44
|
-
|
|
45
|
-
const baselineResult = await harness.run({
|
|
46
|
-
skillPath: oldSkillPath,
|
|
47
|
-
prompt: evalCase.prompt,
|
|
48
|
-
files: evalCase.files,
|
|
49
|
-
outputDir: path.join(baselineDir, 'outputs'),
|
|
50
|
-
});
|
|
51
53
|
writeTiming(baselineDir, baselineResult);
|
|
52
54
|
writeOutput(baselineDir, baselineResult);
|
|
53
55
|
|
|
54
56
|
return {
|
|
55
57
|
evalId: evalCase.id,
|
|
56
58
|
slug: evalCase.slug ?? `${evalCase.id}`,
|
|
59
|
+
label: evalCase.label,
|
|
57
60
|
prompt: evalCase.prompt,
|
|
58
61
|
withSkill: { output: withSkillResult },
|
|
59
62
|
withoutSkill: { output: baselineResult },
|
package/src/errors.ts
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
// Exit codes:
|
|
2
|
+
// 0 = success
|
|
3
|
+
// 1 = threshold not met (eval ran successfully but pass rate below threshold)
|
|
4
|
+
// 2 = config/input error (bad JSON, missing fields, invalid flags)
|
|
5
|
+
// 3 = file not found (missing skill dir, missing evals.json, missing script)
|
|
6
|
+
// 4 = runtime error (harness failure, grading failure, timeout)
|
|
7
|
+
|
|
1
8
|
export class SnapevalError extends Error {
|
|
2
9
|
constructor(message: string, public exitCode: number = 2) {
|
|
3
10
|
super(message);
|
|
@@ -5,9 +12,23 @@ export class SnapevalError extends Error {
|
|
|
5
12
|
}
|
|
6
13
|
}
|
|
7
14
|
|
|
15
|
+
/**
 * Raised when a required path does not exist. Carries exit code 3.
 */
export class FileNotFoundError extends SnapevalError {
  /**
   * @param filePath - Path that could not be found.
   * @param hint - Optional follow-up sentence appended after the path.
   */
  constructor(filePath: string, hint?: string) {
    // The hint, when present, is joined to the path with ". ".
    const suffix = hint ? `. ${hint}` : '';
    super(`File not found: ${filePath}${suffix}`, 3);
    this.name = 'FileNotFoundError';
  }
}
|
|
21
|
+
|
|
22
|
+
/**
 * Raised when the skill pass rate is below the required threshold.
 * Carries exit code 1.
 */
export class ThresholdError extends SnapevalError {
  /**
   * @param actual - Observed pass rate as a fraction (multiplied by 100 for display).
   * @param threshold - Required pass rate as a fraction.
   */
  constructor(actual: number, threshold: number) {
    // Both rates are shown as percentages with one decimal place.
    const asPercent = (fraction: number): string => (fraction * 100).toFixed(1);
    super(`Skill pass rate ${asPercent(actual)}% is below threshold ${asPercent(threshold)}%`, 1);
    this.name = 'ThresholdError';
  }
}
|
|
28
|
+
|
|
8
29
|
/**
 * Raised when a requested adapter cannot be used. Carries exit code 4.
 */
export class AdapterNotAvailableError extends SnapevalError {
  /**
   * @param adapterName - Name of the unavailable adapter.
   * @param installHint - Guidance appended after the availability message.
   */
  constructor(adapterName: string, installHint: string) {
    const message = `${adapterName} is not available. ${installHint}`;
    super(message, 4);
    this.name = 'AdapterNotAvailableError';
  }
}
|
|
@@ -21,14 +42,14 @@ export class RateLimitError extends SnapevalError {
|
|
|
21
42
|
|
|
22
43
|
/**
 * Raised when an eval exceeds its time budget. Carries exit code 4.
 */
export class TimeoutError extends SnapevalError {
  /**
   * @param evalId - Identifier of the eval that timed out.
   * @param timeoutMs - The limit that was exceeded, in milliseconds.
   */
  constructor(evalId: number, timeoutMs: number) {
    const message = `Eval ${evalId} timed out after ${timeoutMs}ms.`;
    super(message, 4);
    this.name = 'TimeoutError';
  }
}
|
|
28
49
|
|
|
29
50
|
/**
 * Raised when grading an eval fails. Carries exit code 4.
 */
export class GradingError extends SnapevalError {
  /**
   * @param evalId - Identifier of the eval whose grading failed.
   * @param detail - Description of what went wrong.
   */
  constructor(evalId: number, detail: string) {
    const message = `Grading failed for eval ${evalId}: ${detail}`;
    super(message, 4);
    this.name = 'GradingError';
  }
}
|
package/src/types.ts
CHANGED
|
@@ -43,6 +43,7 @@ export interface EvalCase {
|
|
|
43
43
|
id: number;
|
|
44
44
|
prompt: string;
|
|
45
45
|
expected_output: string;
|
|
46
|
+
label?: string;
|
|
46
47
|
slug?: string;
|
|
47
48
|
files?: string[];
|
|
48
49
|
assertions?: string[];
|
|
@@ -110,6 +111,7 @@ export interface FeedbackData {
|
|
|
110
111
|
export interface EvalRunResult {
|
|
111
112
|
evalId: number;
|
|
112
113
|
slug: string;
|
|
114
|
+
label?: string;
|
|
113
115
|
prompt: string;
|
|
114
116
|
withSkill: {
|
|
115
117
|
output: HarnessRunResult;
|
|
@@ -142,4 +144,5 @@ export interface SnapevalConfig {
|
|
|
142
144
|
inference: string;
|
|
143
145
|
workspace: string;
|
|
144
146
|
runs: number;
|
|
147
|
+
concurrency: number;
|
|
145
148
|
}
|