snapeval 1.8.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/snapeval.ts +30 -24
- package/dist/bin/snapeval.js +25 -22
- package/dist/bin/snapeval.js.map +1 -1
- package/dist/src/adapters/copilot-sdk-client.js +1 -1
- package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
- package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
- package/dist/src/adapters/harness/copilot-sdk.js +101 -0
- package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
- package/dist/src/adapters/harness/resolve.js +10 -2
- package/dist/src/adapters/harness/resolve.js.map +1 -1
- package/dist/src/adapters/inference/copilot-sdk.js +4 -1
- package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/report/terminal.js +89 -9
- package/dist/src/adapters/report/terminal.js.map +1 -1
- package/dist/src/commands/eval.d.ts +3 -0
- package/dist/src/commands/eval.js +106 -17
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/commands/review.d.ts +1 -0
- package/dist/src/commands/review.js.map +1 -1
- package/dist/src/config.js +2 -1
- package/dist/src/config.js.map +1 -1
- package/dist/src/engine/grader.js +67 -9
- package/dist/src/engine/grader.js.map +1 -1
- package/dist/src/engine/runner.js +14 -12
- package/dist/src/engine/runner.js.map +1 -1
- package/dist/src/errors.d.ts +6 -0
- package/dist/src/errors.js +21 -3
- package/dist/src/errors.js.map +1 -1
- package/dist/src/types.d.ts +1 -0
- package/package.json +4 -1
- package/plugin.json +1 -1
- package/skills/snapeval/SKILL.md +33 -18
- package/src/adapters/copilot-sdk-client.ts +1 -1
- package/src/adapters/harness/copilot-sdk.ts +126 -0
- package/src/adapters/harness/resolve.ts +13 -2
- package/src/adapters/inference/copilot-sdk.ts +5 -1
- package/src/adapters/report/terminal.ts +100 -10
- package/src/commands/eval.ts +133 -31
- package/src/commands/review.ts +1 -1
- package/src/config.ts +2 -1
- package/src/engine/grader.ts +59 -8
- package/src/engine/runner.ts +14 -13
- package/src/errors.ts +24 -3
- package/src/types.ts +1 -0
- package/dist/src/commands/init.d.ts +0 -2
- package/dist/src/commands/init.js +0 -27
- package/dist/src/commands/init.js.map +0 -1
- package/dist/src/engine/generator.d.ts +0 -3
- package/dist/src/engine/generator.js +0 -51
- package/dist/src/engine/generator.js.map +0 -1
- package/src/commands/init.ts +0 -38
- package/src/engine/generator.ts +0 -60
|
@@ -4,47 +4,136 @@ import { WorkspaceManager } from '../engine/workspace.js';
|
|
|
4
4
|
import { runEval } from '../engine/runner.js';
|
|
5
5
|
import { gradeAssertions } from '../engine/grader.js';
|
|
6
6
|
import { computeBenchmark } from '../engine/aggregator.js';
|
|
7
|
-
import { SnapevalError } from '../errors.js';
|
|
7
|
+
import { SnapevalError, FileNotFoundError, ThresholdError } from '../errors.js';
|
|
8
|
+
/**
 * Runs async task factories with a bounded number in flight at once.
 *
 * Each worker repeatedly claims the next unclaimed index and awaits that task,
 * so results are stored at the same position as the task that produced them
 * regardless of completion order.
 *
 * @param {Array<() => Promise<any>>} tasks - Zero-argument functions returning a promise.
 * @param {number} limit - Maximum number of tasks to run concurrently. Values that are
 *   not a number, or are below 1, are treated as 1 so that every task still runs.
 * @returns {Promise<any[]>} Results in the same order as `tasks`.
 *   Rejects with the first task error (Promise.all semantics).
 */
async function runWithConcurrency(tasks, limit) {
    // Array.from({ length }) silently yields zero workers for 0, negative, or NaN
    // lengths, which would skip every task and return an array of holes.
    const numericLimit = Number(limit);
    const effectiveLimit = Number.isNaN(numericLimit)
        ? 1
        : Math.max(Math.floor(numericLimit), 1);
    const workerCount = Math.min(effectiveLimit, tasks.length);

    const results = new Array(tasks.length);
    let nextIndex = 0;

    const worker = async () => {
        while (nextIndex < tasks.length) {
            // Claim the index synchronously before awaiting so that no two
            // workers ever pick up the same task.
            const current = nextIndex++;
            results[current] = await tasks[current]();
        }
    };

    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
}
|
|
20
|
+
const MAX_CONCURRENCY = 10;
|
|
21
|
+
/**
 * Validates the parsed contents of an evals.json file before any eval is run.
 *
 * Checks the top-level shape (`skill_name` string, `evals` array) and, for each
 * eval case, that `id` is a number, `prompt` and `expected_output` are strings,
 * and `assertions` (when present) is an array of strings.
 *
 * @param {any} evalsFile - Value produced by JSON.parse on the evals file.
 * @param {string} evalsPath - Path of the file, used only in error messages.
 * @throws {SnapevalError} On the first structural problem found.
 */
function validateEvalsFile(evalsFile, evalsPath) {
    // JSON.parse can legally return null, a scalar, or an array; reject those
    // here so the property reads below cannot raise a raw TypeError.
    if (evalsFile === null || typeof evalsFile !== 'object' || Array.isArray(evalsFile)) {
        throw new SnapevalError(`Invalid evals.json at ${evalsPath}: top-level value must be an object.`);
    }
    if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
        throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
    }
    if (!Array.isArray(evalsFile.evals)) {
        throw new SnapevalError(`Invalid evals.json at ${evalsPath}: "evals" must be an array.`);
    }
    for (const [i, evalCase] of evalsFile.evals.entries()) {
        const prefix = `Invalid evals.json at ${evalsPath}: evals[${i}]`;
        if (evalCase === null || typeof evalCase !== 'object') {
            throw new SnapevalError(`${prefix} must be an object.`);
        }
        if (typeof evalCase.id !== 'number') {
            throw new SnapevalError(`${prefix} missing or invalid "id" (must be a number).`);
        }
        if (typeof evalCase.prompt !== 'string') {
            throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "prompt" field.`);
        }
        if (typeof evalCase.expected_output !== 'string') {
            throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "expected_output" field.`);
        }
        if (evalCase.assertions !== undefined) {
            // Element types are checked too: the grader calls string methods on
            // every assertion, so a non-string entry would fail later with an
            // unhelpful TypeError.
            const isStringArray = Array.isArray(evalCase.assertions)
                && evalCase.assertions.every((assertion) => typeof assertion === 'string');
            if (!isStringArray) {
                throw new SnapevalError(`${prefix} (id:${evalCase.id}) "assertions" must be an array of strings.`);
            }
        }
    }
}
|
|
8
44
|
export async function evalCommand(skillPath, harness, inference, options) {
|
|
9
45
|
const evalsPath = path.join(skillPath, 'evals', 'evals.json');
|
|
10
46
|
if (!fs.existsSync(evalsPath)) {
|
|
11
|
-
throw new
|
|
47
|
+
throw new FileNotFoundError(evalsPath, 'Create evals/evals.json with test scenarios first');
|
|
48
|
+
}
|
|
49
|
+
let evalsFile;
|
|
50
|
+
try {
|
|
51
|
+
evalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
|
|
52
|
+
}
|
|
53
|
+
catch {
|
|
54
|
+
throw new SnapevalError(`Invalid JSON in ${evalsPath}. Check for syntax errors (missing commas, trailing commas, etc).`);
|
|
55
|
+
}
|
|
56
|
+
validateEvalsFile(evalsFile, evalsPath);
|
|
57
|
+
// Filter to specific eval IDs if --only is provided
|
|
58
|
+
if (options.only && options.only.length > 0) {
|
|
59
|
+
const ids = new Set(options.only);
|
|
60
|
+
const filtered = evalsFile.evals.filter((e) => ids.has(e.id));
|
|
61
|
+
if (filtered.length === 0) {
|
|
62
|
+
throw new SnapevalError(`No eval cases match --only ${options.only.join(',')}. Available IDs: ${evalsFile.evals.map((e) => e.id).join(', ')}`);
|
|
63
|
+
}
|
|
64
|
+
evalsFile = { ...evalsFile, evals: filtered };
|
|
12
65
|
}
|
|
13
|
-
const evalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
|
|
14
66
|
const ws = new WorkspaceManager(skillPath, options.workspace);
|
|
15
67
|
const iterationDir = ws.createIteration();
|
|
68
|
+
// Track which SKILL.md was used for this iteration
|
|
69
|
+
const skillMdPath = path.join(skillPath, 'SKILL.md');
|
|
70
|
+
if (fs.existsSync(skillMdPath)) {
|
|
71
|
+
fs.copyFileSync(skillMdPath, path.join(iterationDir, 'SKILL.md.snapshot'));
|
|
72
|
+
}
|
|
16
73
|
const runs = options.runs ?? 1;
|
|
74
|
+
const concurrency = Math.min(Math.max(options.concurrency ?? 1, 1), MAX_CONCURRENCY);
|
|
17
75
|
const baselineVariant = options.oldSkill ? 'old_skill' : 'without_skill';
|
|
18
76
|
const scriptsDir = path.join(skillPath, 'evals', 'scripts');
|
|
19
|
-
|
|
20
|
-
|
|
77
|
+
// Pre-create eval directories sequentially (filesystem setup)
|
|
78
|
+
const evalDirs = evalsFile.evals.map((evalCase) => {
|
|
21
79
|
const slug = WorkspaceManager.getEvalSlug(evalCase).replace('eval-', '');
|
|
22
|
-
|
|
80
|
+
return { evalCase, slug, evalDir: ws.createEvalDir(iterationDir, slug, baselineVariant) };
|
|
81
|
+
});
|
|
82
|
+
const tasks = evalDirs.map(({ evalCase, slug, evalDir }) => async () => {
|
|
83
|
+
const assertions = evalCase.assertions ?? [];
|
|
84
|
+
const allGradings = [];
|
|
23
85
|
let lastRun = null;
|
|
24
86
|
for (let i = 0; i < runs; i++) {
|
|
25
87
|
lastRun = await runEval(evalCase, skillPath, evalDir, harness, options.oldSkill);
|
|
88
|
+
// Grade every run, not just the last
|
|
89
|
+
const [wsGrading, wosGrading] = await Promise.all([
|
|
90
|
+
gradeAssertions(assertions, lastRun.withSkill.output, path.join(evalDir, 'with_skill'), inference, fs.existsSync(scriptsDir) ? scriptsDir : undefined),
|
|
91
|
+
gradeAssertions(assertions, lastRun.withoutSkill.output, path.join(evalDir, baselineVariant), inference, fs.existsSync(scriptsDir) ? scriptsDir : undefined),
|
|
92
|
+
]);
|
|
93
|
+
allGradings.push({ withSkill: wsGrading, withoutSkill: wosGrading });
|
|
26
94
|
}
|
|
27
|
-
if (!lastRun)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
95
|
+
if (!lastRun) {
|
|
96
|
+
throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
|
|
97
|
+
}
|
|
98
|
+
// Use the last run's grading as the primary result (written to grading.json)
|
|
99
|
+
// but all gradings contribute to benchmark stats via pass rates
|
|
100
|
+
const lastGrading = allGradings[allGradings.length - 1];
|
|
101
|
+
return {
|
|
33
102
|
evalId: evalCase.id,
|
|
34
103
|
slug,
|
|
35
104
|
prompt: evalCase.prompt,
|
|
36
105
|
withSkill: {
|
|
37
106
|
output: lastRun.withSkill.output,
|
|
38
|
-
grading:
|
|
107
|
+
grading: lastGrading.withSkill ?? undefined,
|
|
39
108
|
},
|
|
40
109
|
withoutSkill: {
|
|
41
110
|
output: lastRun.withoutSkill.output,
|
|
42
|
-
grading:
|
|
111
|
+
grading: lastGrading.withoutSkill ?? undefined,
|
|
43
112
|
},
|
|
44
|
-
}
|
|
45
|
-
}
|
|
113
|
+
};
|
|
114
|
+
});
|
|
115
|
+
const evalRuns = await runWithConcurrency(tasks, concurrency);
|
|
46
116
|
const benchmark = computeBenchmark(evalRuns);
|
|
47
|
-
|
|
117
|
+
// Add iteration metadata for cross-iteration comparison
|
|
118
|
+
const benchmarkWithMeta = {
|
|
119
|
+
...benchmark,
|
|
120
|
+
metadata: {
|
|
121
|
+
eval_count: evalRuns.length,
|
|
122
|
+
eval_ids: evalRuns.map((r) => r.evalId),
|
|
123
|
+
skill_name: evalsFile.skill_name,
|
|
124
|
+
timestamp: new Date().toISOString(),
|
|
125
|
+
},
|
|
126
|
+
};
|
|
127
|
+
fs.writeFileSync(path.join(iterationDir, 'benchmark.json'), JSON.stringify(benchmarkWithMeta, null, 2));
|
|
128
|
+
// Check threshold if set (for CI gating)
|
|
129
|
+
if (options.threshold !== undefined) {
|
|
130
|
+
const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
|
|
131
|
+
if (passRate < options.threshold) {
|
|
132
|
+
// Still return results so the reporter can display them before the error
|
|
133
|
+
const results = { skillName: evalsFile.skill_name, evalRuns, benchmark, iterationDir };
|
|
134
|
+
throw Object.assign(new ThresholdError(passRate, options.threshold), { results });
|
|
135
|
+
}
|
|
136
|
+
}
|
|
48
137
|
return {
|
|
49
138
|
skillName: evalsFile.skill_name,
|
|
50
139
|
evalRuns,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AASlC,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAC3D,OAAO,EAAE,aAAa,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEhF,KAAK,UAAU,kBAAkB,CAC/B,KAA2B,EAC3B,KAAa;IAEb,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,CAAC,GAAG,KAAK,EAAE,CAAC;YAClB,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IACjF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,eAAe,GAAG,EAAE,CAAC;AAE3B,SAAS,iBAAiB,CAAC,SAAoB,EAAE,SAAiB;IAChE,IAAI,CAAC,SAAS,CAAC,UAAU,IAAI,OAAO,SAAS,CAAC,UAAU,KAAK,QAAQ,EAAE,CAAC;QACtE,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,0CAA0C,CAAC,CAAC;IACxG,CAAC;IACD,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,yBAAyB,SAAS,6BAA6B,CAAC,CAAC;IAC3F,CAAC;IACD,KAAK,MAAM,CAAC,CAAC,EAAE,QAAQ,CAAC,IAAI,SAAS,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,yBAAyB,SAAS,WAAW,CAAC,GAAG,CAAC;QACjE,IAAI,OAAO,QAAQ,CAAC,EAAE,KAAK,QAAQ,EAAE,CAAC;YACpC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,8CAA8C,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxC,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,2BAA2B,CAAC,CAAC;QACnF,CAAC;QACD,IAAI,OAAO,QAAQ,CAAC,eAAe,KAAK,QAAQ,EAAE,CAAC;YACjD,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,oCAAoC,CAAC,CAAC;QAC5F,CAAC;QACD,IAAI,QAAQ,CAAC,UAAU,KAAK,SAAS,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7E,MAAM,IAAI,aAAa,CAAC,GAAG,MAAM,QAAQ,QAAQ,CAAC,EAAE,6CAA6C,CAAC,CAAC;QACrG,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,C
AAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAA4H;IAE5H,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,iBAAiB,CAAC,SAAS,EAAE,mDAAmD,CAAC,CAAC;IAC9F,CAAC;IAED,IAAI,SAAoB,CAAC;IACzB,IAAI,CAAC;QACH,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;IAC9D,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,aAAa,CAAC,mBAAmB,SAAS,mEAAmE,CAAC,CAAC;IAC3H,CAAC;IACD,iBAAiB,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IAExC,oDAAoD;IACpD,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,GAAG,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAClC,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9D,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,aAAa,CAAC,8BAA8B,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,oBAAoB,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjJ,CAAC;QACD,SAAS,GAAG,EAAE,GAAG,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IAChD,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,gBAAgB,CAAC,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC;IAC9D,MAAM,YAAY,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IAE1C,mDAAmD;IACnD,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,EAAE,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC/B,EAAE,CAAC,YAAY,CAAC,WAAW,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,mBAAmB,CAAC,CAAC,CAAC;IAC7E,CAAC;IACD,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,EAAE,CAAC,CAAC,EAAE,eAAe,CAAC,CAAC;IACrF,MAAM,eAAe,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACzE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAE5D,8DAA8D;IAC9D,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE;QAChD,MAAM,IAAI,GAAG,gBAAgB,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACzE,OAAO,EAAE,QAAQ,EAAE,IAA
I,EAAE,OAAO,EAAE,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,EAAE,eAAe,CAAC,EAAE,CAAC;IAC5F,CAAC,CAAC,CAAC;IAEH,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,KAAK,IAA4B,EAAE;QAC7F,MAAM,UAAU,GAAG,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC;QAC7C,MAAM,WAAW,GAA8E,EAAE,CAAC;QAClG,IAAI,OAAO,GAA+C,IAAI,CAAC;QAE/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9B,OAAO,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;YAEjF,qCAAqC;YACrC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;gBAChD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,SAAS,CAAC,MAAM,EACxB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,EAChC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;gBACD,eAAe,CACb,UAAU,EACV,OAAO,CAAC,YAAY,CAAC,MAAM,EAC3B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,EACnC,SAAS,EACT,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CACnD;aACF,CAAC,CAAC;YACH,WAAW,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,aAAa,CAAC,8BAA8B,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;QACvE,CAAC;QAED,6EAA6E;QAC7E,gEAAgE;QAChE,MAAM,WAAW,GAAG,WAAW,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAExD,OAAO;YACL,MAAM,EAAE,QAAQ,CAAC,EAAE;YACnB,IAAI;YACJ,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,SAAS,EAAE;gBACT,MAAM,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM;gBAChC,OAAO,EAAE,WAAW,CAAC,SAAS,IAAI,SAAS;aAC5C;YACD,YAAY,EAAE;gBACZ,MAAM,EAAE,OAAO,CAAC,YAAY,CAAC,MAAM;gBACnC,OAAO,EAAE,WAAW,CAAC,YAAY,IAAI,SAAS;aAC/C;SACF,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAC9D,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAE7C,wDAAwD;IACxD,MAAM,iBAAiB,GAAG;QACxB,GAAG,SAAS;QACZ,QAAQ,EAAE;YACR,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,QAAQ,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YACvC,UAAU,EAAE,SAAS,CAAC,UAAU;YAChC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC;KACF,CAAC;IAEF,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC
,YAAY,EAAE,gBAAgB,CAAC,EACzC,IAAI,CAAC,SAAS,CAAC,iBAAiB,EAAE,IAAI,EAAE,CAAC,CAAC,CAC3C,CAAC;IAEF,yCAAyC;IACzC,IAAI,OAAO,CAAC,SAAS,KAAK,SAAS,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;QACjE,IAAI,QAAQ,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC;YACjC,yEAAyE;YACzE,MAAM,OAAO,GAAG,EAAE,SAAS,EAAE,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC;YACvF,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,SAAS,CAAC,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;QACpF,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,QAAQ;QACR,SAAS;QACT,YAAY;KACb,CAAC;AACJ,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,OAAO,MAAM,cAAc,CAAC;AAExC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAElE,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,
|
|
1
|
+
{"version":3,"file":"review.js","sourceRoot":"","sources":["../../../src/commands/review.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,OAAO,MAAM,cAAc,CAAC;AAExC,OAAO,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AACxC,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAElE,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,SAAiB,EACjB,OAAgB,EAChB,SAA2B,EAC3B,OAAyG;IAEzG,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;IAE1E,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;IACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE/B,yBAAyB;IACzB,MAAM,QAAQ,GAAiB,EAAE,CAAC;IAClC,KAAK,MAAM,GAAG,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACnC,QAAQ,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,GAAG,EAAE,CAAC;IACpC,CAAC;IACD,EAAE,CAAC,aAAa,CACd,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,eAAe,CAAC,EAChD,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAClC,CAAC;IAEF,oEAAoE;IACpE,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QACpB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,gBAAgB,CAAC,CAAC;QACrE,aAAa,CAAC,UAAU,CAAC,CAAC;IAC5B,CAAC;AACH,CAAC;AAED,SAAS,aAAa,CAAC,QAAgB;IACrC,MAAM,GAAG,GACP,OAAO,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACxC,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;IACpD,MAAM,IAAI,GACR,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,OAAO,EAAE,EAAE,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAC5E,QAAQ,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,GAAG,EAAE,EAAE;QAC1B,IAAI,GAAG;YAAE,OAAO,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/dist/src/config.js
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import * as fs from 'node:fs';
|
|
2
2
|
import * as path from 'node:path';
|
|
3
3
|
export const DEFAULT_CONFIG = {
|
|
4
|
-
harness: 'copilot-
|
|
4
|
+
harness: 'copilot-sdk',
|
|
5
5
|
inference: 'auto',
|
|
6
6
|
workspace: '../{skill_name}-workspace',
|
|
7
7
|
runs: 1,
|
|
8
|
+
concurrency: 1,
|
|
8
9
|
};
|
|
9
10
|
function loadConfigFile(dirPath) {
|
|
10
11
|
const configPath = path.join(dirPath, 'snapeval.config.json');
|
package/dist/src/config.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/config.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAGlC,MAAM,CAAC,MAAM,cAAc,GAAmB;IAC5C,OAAO,EAAE,aAAa;IACtB,SAAS,EAAE,MAAM;IACjB,SAAS,EAAE,2BAA2B;IACtC,IAAI,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/config.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAGlC,MAAM,CAAC,MAAM,cAAc,GAAmB;IAC5C,OAAO,EAAE,aAAa;IACtB,SAAS,EAAE,MAAM;IACjB,SAAS,EAAE,2BAA2B;IACtC,IAAI,EAAE,CAAC;IACP,WAAW,EAAE,CAAC;CACf,CAAC;AAEF,SAAS,cAAc,CAAC,OAAe;IACrC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,sBAAsB,CAAC,CAAC;IAC9D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,IAAI,CAAC;IAC5C,MAAM,GAAG,GAAG,EAAE,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;IACjD,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAED,MAAM,UAAU,aAAa,CAC3B,QAAiC,EACjC,WAAmB,EACnB,QAAiB;IAEjB,MAAM,cAAc,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAClE,MAAM,aAAa,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;IAClD,OAAO;QACL,GAAG,cAAc;QACjB,GAAG,CAAC,aAAa,IAAI,EAAE,CAAC;QACxB,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC;QACzB,GAAG,cAAc,CAAC,QAAQ,CAAC;KAC5B,CAAC;AACJ,CAAC;AAED,SAAS,cAAc,CAAC,GAA4B;IAClD,OAAO,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC,CAAC;AACpF,CAAC"}
|
|
@@ -1,9 +1,33 @@
|
|
|
1
1
|
import * as fs from 'node:fs';
|
|
2
2
|
import * as path from 'node:path';
|
|
3
3
|
import { execFileSync } from 'node:child_process';
|
|
4
|
+
// Matches assertions of the form: Output is exactly: "..." / Output equals exactly: "..."
// (case-insensitive); capture group 1 is the quoted expected text.
const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/i;
/**
 * Grades an exact-match assertion by direct string comparison.
 *
 * Surrounding whitespace is ignored on both sides: the output is trimmed before
 * comparison, so the expected text is trimmed as well (otherwise an expected
 * value with leading or trailing spaces could never pass).
 *
 * @param {string} assertion - Assertion text to grade.
 * @param {string | null | undefined} output - Raw output text; a missing value is
 *   treated as an empty string rather than throwing.
 * @returns {{ text: string, passed: boolean, evidence: string } | null}
 *   The grading result, or null when `assertion` is not an exact-match assertion.
 */
function gradeExactMatch(assertion, output) {
    const match = assertion.match(EXACT_MATCH_PATTERN);
    if (!match) {
        return null;
    }
    const expected = match[1].trim();
    const actual = String(output ?? '').trim();
    const passed = actual === expected;
    const evidence = passed
        ? `Exact match: "${expected}"`
        : `Expected: "${expected}"\nGot: "${actual}"`;
    return { text: assertion, passed, evidence };
}
|
|
4
20
|
function buildGradingPrompt(assertions, output, files) {
|
|
5
21
|
const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
|
|
6
|
-
return `You are
|
|
22
|
+
return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
|
|
23
|
+
|
|
24
|
+
GRADING RULES:
|
|
25
|
+
- PASS if the output satisfies the assertion's intent, even if wording differs slightly.
|
|
26
|
+
- FAIL only if the output clearly does not satisfy the assertion.
|
|
27
|
+
- Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
|
|
28
|
+
- For "contains" assertions: look for semantic presence, not exact substring.
|
|
29
|
+
- For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
|
|
30
|
+
- Always cite specific text from the output as evidence.
|
|
7
31
|
|
|
8
32
|
OUTPUT:
|
|
9
33
|
---
|
|
@@ -16,7 +40,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
|
|
|
16
40
|
Respond with JSON only:
|
|
17
41
|
{
|
|
18
42
|
"results": [
|
|
19
|
-
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote
|
|
43
|
+
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
|
|
20
44
|
]
|
|
21
45
|
}`;
|
|
22
46
|
}
|
|
@@ -26,25 +50,54 @@ function runScript(scriptName, outputDir, scriptsDir) {
|
|
|
26
50
|
return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
|
|
27
51
|
}
|
|
28
52
|
try {
|
|
29
|
-
const
|
|
53
|
+
const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
|
|
54
|
+
const evidence = stdout || `Script passed: ${scriptName}`;
|
|
30
55
|
return { text: `script:${scriptName}`, passed: true, evidence };
|
|
31
56
|
}
|
|
32
57
|
catch (err) {
|
|
33
|
-
|
|
58
|
+
// Extract the most useful error info without raw stack traces
|
|
59
|
+
const stderr = err.stderr?.trim();
|
|
60
|
+
const stdout = err.stdout?.trim();
|
|
61
|
+
let evidence;
|
|
62
|
+
if (err.code === 'EACCES') {
|
|
63
|
+
evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
|
|
64
|
+
}
|
|
65
|
+
else if (stderr) {
|
|
66
|
+
// Take only the first line of stderr to avoid stack trace noise
|
|
67
|
+
evidence = stderr.split('\n')[0];
|
|
68
|
+
}
|
|
69
|
+
else if (stdout) {
|
|
70
|
+
evidence = stdout.split('\n')[0];
|
|
71
|
+
}
|
|
72
|
+
else {
|
|
73
|
+
evidence = `Script exited with code ${err.status ?? 'unknown'}`;
|
|
74
|
+
}
|
|
34
75
|
return { text: `script:${scriptName}`, passed: false, evidence };
|
|
35
76
|
}
|
|
36
77
|
}
|
|
37
78
|
/**
 * Extracts the JSON payload from a model response and returns it as a string
 * (the caller is responsible for JSON.parse).
 *
 * Candidates are tried in this order:
 *   1. The whole trimmed text, if it already parses as JSON. This runs first so
 *      that a valid reply whose string values happen to quote a fenced code
 *      block is not cut down to that quoted fragment.
 *   2. The contents of a fence tagged `json` (tag matched case-insensitively).
 *   3. The contents of the first fence of any kind.
 *   4. The span from the first `{` to the last `}`, if that span parses
 *      (covers an unfenced object surrounded by prose).
 *   5. The trimmed text unchanged.
 *
 * @param {string} text - Raw response text.
 * @returns {string} Best-effort JSON text; not guaranteed to be parsable.
 */
function extractJSON(text) {
    const parses = (candidate) => {
        try {
            JSON.parse(candidate);
            return true;
        }
        catch {
            // Not valid JSON; the caller moves on to the next candidate.
            return false;
        }
    };

    const trimmed = text.trim();
    if (parses(trimmed)) {
        return trimmed;
    }

    const jsonFence = text.match(/```json\s*([\s\S]*?)```/i);
    if (jsonFence) {
        return jsonFence[1].trim();
    }

    const anyFence = text.match(/```\s*([\s\S]*?)```/);
    if (anyFence) {
        return anyFence[1].trim();
    }

    const start = trimmed.indexOf('{');
    const end = trimmed.lastIndexOf('}');
    if (start !== -1 && end > start) {
        const objectSpan = trimmed.slice(start, end + 1);
        if (parses(objectSpan)) {
            return objectSpan;
        }
    }

    return trimmed;
}
|
|
43
95
|
export async function gradeAssertions(assertions, output, runDir, inference, scriptsDir) {
|
|
44
96
|
if (assertions.length === 0)
|
|
45
97
|
return null;
|
|
46
98
|
const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
|
|
47
|
-
const
|
|
99
|
+
const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
|
|
100
|
+
const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
|
|
48
101
|
const results = [];
|
|
49
102
|
for (const assertion of scriptAssertions) {
|
|
50
103
|
const scriptName = assertion.slice('script:'.length);
|
|
@@ -52,6 +105,11 @@ export async function gradeAssertions(assertions, output, runDir, inference, scr
|
|
|
52
105
|
const dir = scriptsDir ?? path.join(runDir, '..', '..', '..', 'evals', 'scripts');
|
|
53
106
|
results.push(runScript(scriptName, outputDir, dir));
|
|
54
107
|
}
|
|
108
|
+
for (const assertion of exactAssertions) {
|
|
109
|
+
const result = gradeExactMatch(assertion, output.raw);
|
|
110
|
+
if (result)
|
|
111
|
+
results.push(result);
|
|
112
|
+
}
|
|
55
113
|
if (llmAssertions.length > 0) {
|
|
56
114
|
const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
|
|
57
115
|
const response = await inference.chat([{ role: 'user', content: prompt }], { temperature: 0, responseFormat: 'json' });
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO
|
|
1
|
+
{"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACnG,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEAAgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,
CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACxD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACnD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,k
BAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -14,20 +14,22 @@ export async function runEval(evalCase, skillPath, evalDir, harness, oldSkillPat
|
|
|
14
14
|
const withSkillDir = path.join(evalDir, 'with_skill');
|
|
15
15
|
const baselineVariant = oldSkillPath ? 'old_skill' : 'without_skill';
|
|
16
16
|
const baselineDir = path.join(evalDir, baselineVariant);
|
|
17
|
-
const withSkillResult = await harness.run({
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
17
|
+
const [withSkillResult, baselineResult] = await Promise.all([
|
|
18
|
+
harness.run({
|
|
19
|
+
skillPath,
|
|
20
|
+
prompt: evalCase.prompt,
|
|
21
|
+
files: evalCase.files,
|
|
22
|
+
outputDir: path.join(withSkillDir, 'outputs'),
|
|
23
|
+
}),
|
|
24
|
+
harness.run({
|
|
25
|
+
skillPath: oldSkillPath,
|
|
26
|
+
prompt: evalCase.prompt,
|
|
27
|
+
files: evalCase.files,
|
|
28
|
+
outputDir: path.join(baselineDir, 'outputs'),
|
|
29
|
+
}),
|
|
30
|
+
]);
|
|
23
31
|
writeTiming(withSkillDir, withSkillResult);
|
|
24
32
|
writeOutput(withSkillDir, withSkillResult);
|
|
25
|
-
const baselineResult = await harness.run({
|
|
26
|
-
skillPath: oldSkillPath,
|
|
27
|
-
prompt: evalCase.prompt,
|
|
28
|
-
files: evalCase.files,
|
|
29
|
-
outputDir: path.join(baselineDir, 'outputs'),
|
|
30
|
-
});
|
|
31
33
|
writeTiming(baselineDir, baselineResult);
|
|
32
34
|
writeOutput(baselineDir, baselineResult);
|
|
33
35
|
return {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAWlC,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,MAAM,MAAM,GAAe,EAAE,YAAY,EAAE,MAAM,CAAC,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC;IAClG,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,YAAY,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IACtE,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IACxE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,QAAkB,EAClB,SAAiB,EACjB,OAAe,EACf,OAAgB,EAChB,YAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IACtD,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACrE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IAExD,MAAM,eAAe,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/engine/runner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAWlC,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,MAAM,MAAM,GAAe,EAAE,YAAY,EAAE,MAAM,CAAC,YAAY,EAAE,WAAW,EAAE,MAAM,CAAC,WAAW,EAAE,CAAC;IAClG,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,aAAa,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,WAAW,CAAC,GAAW,EAAE,MAAwB;IACxD,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,YAAY,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IACtE,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;QACtB,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC,CAAC;IACxE,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,QAAkB,EAClB,SAAiB,EACjB,OAAe,EACf,OAAgB,EAChB,YAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IACtD,MAAM,eAAe,GAAG,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,eAAe,CAAC;IACrE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,eAAe,CAAC,CAAC;IAExD,MAAM,CAAC,eAAe,EAAE,cAAc,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC;YACV,SAAS;YACT,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,SAAS,CAAC;SAC9C,CAAC;QACF,OAAO,CAAC,GAAG,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,SAAS,EAAE,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC;SAC7C,CAAC;KACH,CAAC,CAAC;IACH,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,YAAY,EAAE,eAAe,CAAC,CAAC;IAC3C,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IACzC,WAAW,CAAC,WAAW,EAAE,cAAc,CAAC,CAAC;IAEzC,OAAO;QACL,MAAM,EAAE,QAAQ,CAAC,EAAE;QACnB,IAAI,EAAE,QAAQ,CAAC,IAAI,IAAI,GAAG,QAAQ,CAAC,EAAE,EAAE;QACvC,MAAM,EAAE,QAAQ,CAAC,MAAM;QACvB,SAAS,EAAE,EAAE,MAAM,EAAE,eAAe,EAAE;QACtC,YAAY,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE;KACzC,CAAC;AACJ,CAAC"}
|
package/dist/src/errors.d.ts
CHANGED
|
@@ -2,6 +2,12 @@ export declare class SnapevalError extends Error {
|
|
|
2
2
|
exitCode: number;
|
|
3
3
|
constructor(message: string, exitCode?: number);
|
|
4
4
|
}
|
|
5
|
+
export declare class FileNotFoundError extends SnapevalError {
|
|
6
|
+
constructor(filePath: string, hint?: string);
|
|
7
|
+
}
|
|
8
|
+
export declare class ThresholdError extends SnapevalError {
|
|
9
|
+
constructor(actual: number, threshold: number);
|
|
10
|
+
}
|
|
5
11
|
export declare class AdapterNotAvailableError extends SnapevalError {
|
|
6
12
|
constructor(adapterName: string, installHint: string);
|
|
7
13
|
}
|
package/dist/src/errors.js
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
// Exit codes:
|
|
2
|
+
// 0 = success
|
|
3
|
+
// 1 = threshold not met (eval ran successfully but pass rate below threshold)
|
|
4
|
+
// 2 = config/input error (bad JSON, missing fields, invalid flags)
|
|
5
|
+
// 3 = file not found (missing skill dir, missing evals.json, missing script)
|
|
6
|
+
// 4 = runtime error (harness failure, grading failure, timeout)
|
|
1
7
|
export class SnapevalError extends Error {
|
|
2
8
|
exitCode;
|
|
3
9
|
constructor(message, exitCode = 2) {
|
|
@@ -6,9 +12,21 @@ export class SnapevalError extends Error {
|
|
|
6
12
|
this.name = 'SnapevalError';
|
|
7
13
|
}
|
|
8
14
|
}
|
|
15
|
+
export class FileNotFoundError extends SnapevalError {
|
|
16
|
+
constructor(filePath, hint) {
|
|
17
|
+
super(`File not found: ${filePath}${hint ? `. ${hint}` : ''}`, 3);
|
|
18
|
+
this.name = 'FileNotFoundError';
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
export class ThresholdError extends SnapevalError {
|
|
22
|
+
constructor(actual, threshold) {
|
|
23
|
+
super(`Skill pass rate ${(actual * 100).toFixed(1)}% is below threshold ${(threshold * 100).toFixed(1)}%`, 1);
|
|
24
|
+
this.name = 'ThresholdError';
|
|
25
|
+
}
|
|
26
|
+
}
|
|
9
27
|
export class AdapterNotAvailableError extends SnapevalError {
|
|
10
28
|
constructor(adapterName, installHint) {
|
|
11
|
-
super(`${adapterName} is not available. ${installHint}`);
|
|
29
|
+
super(`${adapterName} is not available. ${installHint}`, 4);
|
|
12
30
|
this.name = 'AdapterNotAvailableError';
|
|
13
31
|
}
|
|
14
32
|
}
|
|
@@ -20,13 +38,13 @@ export class RateLimitError extends SnapevalError {
|
|
|
20
38
|
}
|
|
21
39
|
export class TimeoutError extends SnapevalError {
|
|
22
40
|
constructor(evalId, timeoutMs) {
|
|
23
|
-
super(`Eval ${evalId} timed out after ${timeoutMs}ms
|
|
41
|
+
super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
|
|
24
42
|
this.name = 'TimeoutError';
|
|
25
43
|
}
|
|
26
44
|
}
|
|
27
45
|
export class GradingError extends SnapevalError {
|
|
28
46
|
constructor(evalId, detail) {
|
|
29
|
-
super(`Grading failed for eval ${evalId}: ${detail}`);
|
|
47
|
+
super(`Grading failed for eval ${evalId}: ${detail}`, 4);
|
|
30
48
|
this.name = 'GradingError';
|
|
31
49
|
}
|
|
32
50
|
}
|
package/dist/src/errors.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"errors.js","sourceRoot":"","sources":["../../src/errors.ts"],"names":[],"mappings":"AAAA,MAAM,OAAO,aAAc,SAAQ,KAAK;IACF;IAApC,YAAY,OAAe,EAAS,WAAmB,CAAC;QACtD,KAAK,CAAC,OAAO,CAAC,CAAC;QADmB,aAAQ,GAAR,QAAQ,CAAY;QAEtD,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF;AAED,MAAM,OAAO,wBAAyB,SAAQ,aAAa;IACzD,YAAY,WAAmB,EAAE,WAAmB;QAClD,KAAK,CAAC,GAAG,WAAW,sBAAsB,WAAW,EAAE,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"errors.js","sourceRoot":"","sources":["../../src/errors.ts"],"names":[],"mappings":"AAAA,cAAc;AACd,cAAc;AACd,8EAA8E;AAC9E,mEAAmE;AACnE,6EAA6E;AAC7E,gEAAgE;AAEhE,MAAM,OAAO,aAAc,SAAQ,KAAK;IACF;IAApC,YAAY,OAAe,EAAS,WAAmB,CAAC;QACtD,KAAK,CAAC,OAAO,CAAC,CAAC;QADmB,aAAQ,GAAR,QAAQ,CAAY;QAEtD,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF;AAED,MAAM,OAAO,iBAAkB,SAAQ,aAAa;IAClD,YAAY,QAAgB,EAAE,IAAa;QACzC,KAAK,CAAC,mBAAmB,QAAQ,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC;QAClE,IAAI,CAAC,IAAI,GAAG,mBAAmB,CAAC;IAClC,CAAC;CACF;AAED,MAAM,OAAO,cAAe,SAAQ,aAAa;IAC/C,YAAY,MAAc,EAAE,SAAiB;QAC3C,KAAK,CAAC,mBAAmB,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,wBAAwB,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QAC9G,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,wBAAyB,SAAQ,aAAa;IACzD,YAAY,WAAmB,EAAE,WAAmB;QAClD,KAAK,CAAC,GAAG,WAAW,sBAAsB,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC;QAC5D,IAAI,CAAC,IAAI,GAAG,0BAA0B,CAAC;IACzC,CAAC;CACF;AAED,MAAM,OAAO,cAAe,SAAQ,aAAa;IAC/C,YAAY,WAAmB;QAC7B,KAAK,CAAC,GAAG,WAAW,mEAAmE,CAAC,CAAC;QACzF,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,YAAa,SAAQ,aAAa;IAC7C,YAAY,MAAc,EAAE,SAAiB;QAC3C,KAAK,CAAC,QAAQ,MAAM,oBAAoB,SAAS,KAAK,EAAE,CAAC,CAAC,CAAC;QAC3D,IAAI,CAAC,IAAI,GAAG,cAAc,CAAC;IAC7B,CAAC;CACF;AAED,MAAM,OAAO,YAAa,SAAQ,aAAa;IAC7C,YAAY,MAAc,EAAE,MAAc;QACxC,KAAK,CAAC,2BAA2B,MAAM,KAAK,MAAM,EAAE,EAAE,CAAC,CAAC,CAAC;QACzD,IAAI,CAAC,IAAI,GAAG,cAAc,CAAC;IAC7B,CAAC;CACF"}
|
package/dist/src/types.d.ts
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "snapeval",
|
|
3
|
-
"version": "1.8.0",
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"description": "Harness-agnostic eval runner for agentskills.io skills",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -50,5 +50,8 @@
|
|
|
50
50
|
"tsx": "^4.19.3",
|
|
51
51
|
"typescript": "^5.8.2",
|
|
52
52
|
"vitest": "^4.1.0"
|
|
53
|
+
},
|
|
54
|
+
"optionalDependencies": {
|
|
55
|
+
"@github/copilot-sdk": "^0.2.0"
|
|
53
56
|
}
|
|
54
57
|
}
|
package/plugin.json
CHANGED
package/skills/snapeval/SKILL.md
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: snapeval
|
|
3
|
-
description: Evaluate AI skills using the agentskills.io eval spec.
|
|
3
|
+
description: Evaluate AI skills using the agentskills.io eval spec. Runs with/without skill comparisons, grades assertions, and computes benchmarks. Use when the user wants to evaluate, test, or review any skill — including phrases like "test my skill", "run evals", "evaluate this", "set up evals", or "how good is my skill."
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by
|
|
6
|
+
You are snapeval, a harness-agnostic eval runner for agentskills.io skills. You help developers evaluate AI skills by designing test scenarios, running with/without skill comparisons, grading assertions, and iterating on skill quality.
|
|
7
7
|
|
|
8
8
|
## Mode Detection
|
|
9
9
|
|
|
@@ -47,18 +47,34 @@ Triggered by: "evaluate", "test", "set up evals", "evaluate my skill"
|
|
|
47
47
|
- Loop until confirmed
|
|
48
48
|
- If the user says "just run it" → skip to Phase 4 immediately
|
|
49
49
|
|
|
50
|
-
### Phase 4 —
|
|
51
|
-
|
|
52
|
-
1.
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
50
|
+
### Phase 4 — Write evals.json & First Eval
|
|
51
|
+
|
|
52
|
+
1. Write the approved scenarios to `<skill-path>/evals/evals.json` with assertions derived from the "What it tests" analysis. Format:
|
|
53
|
+
```json
|
|
54
|
+
{
|
|
55
|
+
"skill_name": "<skill-name>",
|
|
56
|
+
"evals": [
|
|
57
|
+
{
|
|
58
|
+
"id": 1,
|
|
59
|
+
"slug": "kebab-case-slug",
|
|
60
|
+
"prompt": "The realistic user prompt",
|
|
61
|
+
"assertions": ["Assertion 1", "Assertion 2"],
|
|
62
|
+
"files": []
|
|
63
|
+
}
|
|
64
|
+
]
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Writing good assertions:** Assertions are graded by an LLM that requires concrete evidence from the output to pass. Write specific, verifiable assertions — not vague ones.
|
|
69
|
+
- Good: `"Output contains a YAML block with an 'id' field for each issue"`
|
|
70
|
+
- Bad: `"Output is correct"`
|
|
71
|
+
- Good: `"Response declines to scout because the pipeline already has unclaimed issues"`
|
|
72
|
+
- Bad: `"Handles edge case properly"`
|
|
73
|
+
|
|
74
|
+
Script assertions are also supported: prefix with `script:` (e.g. `"script:check-yaml.sh"`). Scripts live in `<skill-path>/evals/scripts/`, receive the output directory as their first argument, and pass on exit code 0.
|
|
75
|
+
|
|
76
|
+
2. Run: `npx snapeval eval <skill-path>` — runs each eval with and without the skill, grades assertions, produces grading.json + benchmark.json
|
|
77
|
+
3. Interpret the benchmark:
|
|
62
78
|
> "With skill: X% pass rate. Without skill: Y% pass rate. Delta: +Z%. The skill adds value on [specific assertions]."
|
|
63
79
|
|
|
64
80
|
## Review & Iterate
|
|
@@ -91,8 +107,7 @@ Never show raw stack traces. Translate errors into plain language with a suggest
|
|
|
91
107
|
|
|
92
108
|
| Error | Response |
|
|
93
109
|
|-------|----------|
|
|
94
|
-
| No
|
|
95
|
-
| No evals.json | "No test cases exist yet. Want me to generate them with `snapeval init`?" |
|
|
110
|
+
| No evals.json | "No test cases exist yet. Want me to design scenarios and create evals.json?" |
|
|
96
111
|
| Inference unavailable | "I can't connect to the inference service. Check that Copilot CLI is authenticated (`copilot auth status`)." |
|
|
97
112
|
| Skill invocation failure | "The skill failed to respond to eval N: `<error>`. This might be a bug in the skill — want to skip this eval and continue?" |
|
|
98
113
|
|
|
@@ -100,5 +115,5 @@ Never show raw stack traces. Translate errors into plain language with a suggest
|
|
|
100
115
|
|
|
101
116
|
- Never ask the user to write evals.json or any config files manually
|
|
102
117
|
- Always read the target skill's SKILL.md before generating scenarios
|
|
103
|
-
- Only reference CLI commands that exist: `
|
|
104
|
-
- Only reference CLI flags that exist: `--harness`, `--inference`, `--workspace`, `--runs`, `--old-skill`, `--no-open`, `--verbose`
|
|
118
|
+
- Only reference CLI commands that exist: `eval`, `review`
|
|
119
|
+
- Only reference CLI flags that exist: `--harness`, `--inference`, `--workspace`, `--runs`, `--concurrency`, `--only`, `--threshold`, `--old-skill`, `--no-open`, `--verbose`
|