snapeval 1.8.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/snapeval.ts +30 -24
- package/dist/bin/snapeval.js +25 -22
- package/dist/bin/snapeval.js.map +1 -1
- package/dist/src/adapters/copilot-sdk-client.js +1 -1
- package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
- package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
- package/dist/src/adapters/harness/copilot-sdk.js +101 -0
- package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
- package/dist/src/adapters/harness/resolve.js +10 -2
- package/dist/src/adapters/harness/resolve.js.map +1 -1
- package/dist/src/adapters/inference/copilot-sdk.js +4 -1
- package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/report/terminal.js +89 -9
- package/dist/src/adapters/report/terminal.js.map +1 -1
- package/dist/src/commands/eval.d.ts +3 -0
- package/dist/src/commands/eval.js +106 -17
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/commands/review.d.ts +1 -0
- package/dist/src/commands/review.js.map +1 -1
- package/dist/src/config.js +2 -1
- package/dist/src/config.js.map +1 -1
- package/dist/src/engine/grader.js +67 -9
- package/dist/src/engine/grader.js.map +1 -1
- package/dist/src/engine/runner.js +14 -12
- package/dist/src/engine/runner.js.map +1 -1
- package/dist/src/errors.d.ts +6 -0
- package/dist/src/errors.js +21 -3
- package/dist/src/errors.js.map +1 -1
- package/dist/src/types.d.ts +1 -0
- package/package.json +4 -1
- package/plugin.json +1 -1
- package/skills/snapeval/SKILL.md +33 -18
- package/src/adapters/copilot-sdk-client.ts +1 -1
- package/src/adapters/harness/copilot-sdk.ts +126 -0
- package/src/adapters/harness/resolve.ts +13 -2
- package/src/adapters/inference/copilot-sdk.ts +5 -1
- package/src/adapters/report/terminal.ts +100 -10
- package/src/commands/eval.ts +133 -31
- package/src/commands/review.ts +1 -1
- package/src/config.ts +2 -1
- package/src/engine/grader.ts +59 -8
- package/src/engine/runner.ts +14 -13
- package/src/errors.ts +24 -3
- package/src/types.ts +1 -0
- package/dist/src/commands/init.d.ts +0 -2
- package/dist/src/commands/init.js +0 -27
- package/dist/src/commands/init.js.map +0 -1
- package/dist/src/engine/generator.d.ts +0 -3
- package/dist/src/engine/generator.js +0 -51
- package/dist/src/engine/generator.js.map +0 -1
- package/src/commands/init.ts +0 -38
- package/src/engine/generator.ts +0 -60
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type { Harness, HarnessRunResult } from '../../types.js';
|
|
4
|
+
import { getClient, isSDKInstalled } from '../copilot-sdk-client.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Harness that executes an eval prompt through a GitHub Copilot SDK session.
 *
 * Each run creates its own session, sends a single prompt (optionally with
 * file attachments), waits for the reply and then disconnects the session.
 */
export class CopilotSDKHarness implements Harness {
  readonly name = 'copilot-sdk';

  /**
   * Runs one prompt and collects the reply, a transcript and usage numbers.
   *
   * @param options.skillPath - When set, passed to the session as the only
   *   entry of `skillDirectories`.
   * @param options.prompt - Prompt text sent to the session.
   * @param options.files - Input files; each one is copied into `outputDir`
   *   (under its base name) and attached to the prompt.
   * @param options.outputDir - Created if missing; also used as the session's
   *   working directory.
   * @returns The trimmed reply, the rendered transcript, the summed token
   *   count and the wall-clock duration. `files` is always an empty list.
   */
  async run(options: {
    skillPath?: string;
    prompt: string;
    files?: string[];
    outputDir: string;
  }): Promise<HarnessRunResult> {
    const startMs = Date.now();
    const { skillPath, prompt, files, outputDir } = options;
    // 5 minutes, in milliseconds.
    const responseTimeoutMs = 300_000;

    const client = await getClient();
    fs.mkdirSync(outputDir, { recursive: true });

    // The SDK is an optional dependency, so it is loaded lazily here.
    // @ts-ignore — module may not be installed (optional dep)
    const { approveAll } = await import('@github/copilot-sdk');

    // NOTE(review): the skill path itself is passed as the skill directory.
    // An earlier comment said this should be the skill's *parent* directory —
    // confirm which one the SDK expects.
    const sessionConfig: Record<string, unknown> = {
      model: 'gpt-4.1',
      onPermissionRequest: approveAll,
      workingDirectory: outputDir,
      infiniteSessions: { enabled: false },
      ...(skillPath ? { skillDirectories: [skillPath] } : {}),
    };

    const session = await client.createSession(sessionConfig);

    try {
      // Stage every input file in outputDir (so script assertions can see it)
      // and attach the staged copy for the model.
      const attachments = (files ?? []).map((source) => {
        const displayName = path.basename(source);
        const staged = path.join(outputDir, displayName);
        fs.copyFileSync(source, staged);
        return { type: 'file', path: staged, displayName };
      });

      // Only include the attachments key when there is something to attach.
      const message: { prompt: string; attachments?: typeof attachments } = { prompt };
      if (attachments.length > 0) {
        message.attachments = attachments;
      }

      const response = await session.sendAndWait(message, responseTimeoutMs);
      const raw = response?.data?.content ?? '';

      // The session's event list feeds both the transcript and the token total.
      const events = await session.getMessages();
      const transcript = buildTranscript(events);
      const totalTokens = extractTokenCount(events);

      const durationMs = Date.now() - startMs;

      return {
        raw: raw.trim(),
        transcript,
        files: [],
        total_tokens: totalTokens,
        duration_ms: durationMs,
      };
    } finally {
      // Always release the session, whether the run succeeded or threw.
      await session.disconnect();
    }
  }

  /** Reports whether the Copilot SDK package is installed. */
  async isAvailable(): Promise<boolean> {
    const installed = isSDKInstalled();
    return installed;
  }
}
|
|
86
|
+
|
|
87
|
+
/**
 * Renders session events as a newline-separated transcript.
 *
 * Recognised event types and their line formats:
 * - `user.message`            → `[user] <content>`
 * - `assistant.message`       → `[assistant] <content>`
 * - `tool.execution_start`    → `[tool:start] <toolName>(<JSON arguments>)`
 * - `tool.execution_complete` → `[tool:done] <toolName> → <result, cut to 200 chars>`
 * - `skill.invoked`           → `[skill] <name> (<path>)`
 * - `session.error`           → `[error] <message>`
 *
 * Events of any other type are skipped.
 *
 * @param events - Session events as returned by the SDK.
 * @returns One line per recognised event, joined with `\n`.
 */
function buildTranscript(events: any[]): string {
  const rendered: string[] = [];

  for (const entry of events) {
    const kind = entry.type;

    if (kind === 'user.message') {
      rendered.push(`[user] ${entry.data?.content ?? ''}`);
    } else if (kind === 'assistant.message') {
      rendered.push(`[assistant] ${entry.data?.content ?? ''}`);
    } else if (kind === 'tool.execution_start') {
      const toolName = entry.data?.toolName ?? 'unknown';
      const args = JSON.stringify(entry.data?.arguments ?? {});
      rendered.push(`[tool:start] ${toolName}(${args})`);
    } else if (kind === 'tool.execution_complete') {
      const toolName = entry.data?.toolName ?? 'unknown';
      const summary = truncate(entry.data?.result ?? '', 200);
      rendered.push(`[tool:done] ${toolName} → ${summary}`);
    } else if (kind === 'skill.invoked') {
      rendered.push(`[skill] ${entry.data?.name ?? 'unknown'} (${entry.data?.path ?? ''})`);
    } else if (kind === 'session.error') {
      rendered.push(`[error] ${entry.data?.message ?? ''}`);
    }
  }

  return rendered.join('\n');
}
|
|
113
|
+
|
|
114
|
+
/**
 * Sums input and output tokens over all `assistant.usage` events.
 *
 * The events come from the SDK untyped, so the function is defensive:
 * null/undefined entries are skipped, and a token field only counts when it
 * is a finite number. Without that check a string field would turn the
 * running total into a concatenated string, and `NaN` would poison the sum.
 *
 * @param events - Session events as returned by the SDK.
 * @returns Total of `inputTokens + outputTokens` across usage events; 0 when
 *   there are none.
 */
function extractTokenCount(events: any[]): number {
  // Treat anything that is not a finite number as "no tokens reported".
  const asCount = (value: unknown): number =>
    typeof value === 'number' && Number.isFinite(value) ? value : 0;

  let total = 0;
  for (const event of events) {
    if (event?.type !== 'assistant.usage') continue;
    total += asCount(event.data?.inputTokens) + asCount(event.data?.outputTokens);
  }
  return total;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Shortens `str` to at most `max` UTF-16 code units and appends `...` when
 * something was cut off.
 *
 * Two safeguards on top of the plain slice:
 * - Callers pass untyped SDK data, so a non-string value is converted to text
 *   first (JSON when possible, `String()` otherwise). Previously such a value
 *   was returned unchanged and rendered as `[object Object]`.
 * - The cut never lands between the two halves of a surrogate pair; the
 *   dangling high surrogate is dropped instead.
 *
 * @param str - Text to shorten.
 * @param max - Maximum number of code units kept before the ellipsis.
 * @returns `str` unchanged when it fits, otherwise the shortened text + `...`.
 */
function truncate(str: string, max: number): string {
  const value: unknown = str;
  let text: string;
  if (typeof value === 'string') {
    text = value;
  } else {
    try {
      // JSON.stringify returns undefined for e.g. functions and undefined.
      text = JSON.stringify(value) ?? String(value);
    } catch {
      // Circular structures and BigInt values cannot be serialised as JSON.
      text = String(value);
    }
  }

  if (text.length <= max) return text;

  let end = max;
  const lastKept = text.charCodeAt(end - 1);
  // 0xD800–0xDBFF is the high-surrogate range: keeping it alone would leave
  // half a character at the end of the output.
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  return text.slice(0, end) + '...';
}
|
|
@@ -1,10 +1,21 @@
|
|
|
1
1
|
import type { Harness } from '../../types.js';
|
|
2
2
|
import { CopilotCLIHarness } from './copilot-cli.js';
|
|
3
|
-
import {
|
|
3
|
+
import { CopilotSDKHarness } from './copilot-sdk.js';
|
|
4
|
+
import { AdapterNotAvailableError, SnapevalError } from '../../errors.js';
|
|
5
|
+
import { isSDKInstalled } from '../copilot-sdk-client.js';
|
|
4
6
|
|
|
5
7
|
export function resolveHarness(name: string): Harness {
|
|
8
|
+
if (name === 'copilot-sdk') {
|
|
9
|
+
if (!isSDKInstalled()) {
|
|
10
|
+
throw new AdapterNotAvailableError(
|
|
11
|
+
'copilot-sdk',
|
|
12
|
+
'@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
|
|
13
|
+
);
|
|
14
|
+
}
|
|
15
|
+
return new CopilotSDKHarness();
|
|
16
|
+
}
|
|
6
17
|
if (name === 'copilot-cli') {
|
|
7
18
|
return new CopilotCLIHarness();
|
|
8
19
|
}
|
|
9
|
-
throw new SnapevalError(`Unknown harness "${name}". Built-in options: copilot-cli.`);
|
|
20
|
+
throw new SnapevalError(`Unknown harness "${name}". Built-in options: copilot-sdk, copilot-cli.`);
|
|
10
21
|
}
|
|
@@ -7,6 +7,9 @@ export class CopilotSDKInference implements InferenceAdapter {
|
|
|
7
7
|
async chat(messages: Message[], _options?: ChatOptions): Promise<string> {
|
|
8
8
|
const client = await getClient();
|
|
9
9
|
|
|
10
|
+
// @ts-ignore — module may not be installed (optional dep)
|
|
11
|
+
const { approveAll } = await import('@github/copilot-sdk');
|
|
12
|
+
|
|
10
13
|
const systemMessages = messages.filter((m) => m.role === 'system');
|
|
11
14
|
const nonSystemMessages = messages.filter((m) => m.role !== 'system');
|
|
12
15
|
const systemContent = systemMessages.map((m) => m.content).join('\n');
|
|
@@ -17,7 +20,8 @@ export class CopilotSDKInference implements InferenceAdapter {
|
|
|
17
20
|
...(systemContent
|
|
18
21
|
? { systemMessage: { content: systemContent } }
|
|
19
22
|
: {}),
|
|
20
|
-
onPermissionRequest:
|
|
23
|
+
onPermissionRequest: approveAll,
|
|
24
|
+
infiniteSessions: { enabled: false },
|
|
21
25
|
});
|
|
22
26
|
|
|
23
27
|
try {
|
|
@@ -1,5 +1,45 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
1
3
|
import chalk from 'chalk';
|
|
2
|
-
import type { ReportAdapter, EvalResults } from '../../types.js';
|
|
4
|
+
import type { ReportAdapter, EvalResults, BenchmarkData, GradingResult } from '../../types.js';
|
|
5
|
+
|
|
6
|
+
/** Data loaded from the iteration directory that precedes the current one. */
interface PreviousIteration {
  /** Parsed contents of the previous iteration's `benchmark.json`. */
  benchmark: BenchmarkData;
  /** Gradings per eval, keyed by the eval directory name (`eval-…`). */
  gradings: Map<string, { withSkill?: GradingResult; withoutSkill?: GradingResult }>;
}

/**
 * Loads the benchmark and per-eval gradings of the previous iteration.
 *
 * The previous iteration is the sibling directory `iteration-<n-1>` of
 * `iterationDir` (which must be named `iteration-<n>` with n > 1).
 *
 * Only directories named `eval-*` are treated as evals. Stray files with that
 * prefix used to be counted as well, which inflated `gradings.size` and made
 * the reporter announce a changed eval set that had not changed.
 *
 * Loading is best-effort: any read or parse failure yields `null` so that a
 * damaged earlier iteration never breaks reporting of the current one.
 *
 * @param iterationDir - Path of the current iteration directory.
 * @returns The previous iteration's data, or `null` when there is no previous
 *   iteration, it has no `benchmark.json`, or it cannot be read.
 */
function loadPreviousIteration(iterationDir: string): PreviousIteration | null {
  const workspaceDir = path.dirname(iterationDir);
  const currentNum = parseInt(path.basename(iterationDir).replace('iteration-', ''), 10);
  if (Number.isNaN(currentNum) || currentNum <= 1) return null;

  const prevDir = path.join(workspaceDir, `iteration-${currentNum - 1}`);
  const prevBenchmarkPath = path.join(prevDir, 'benchmark.json');
  if (!fs.existsSync(prevBenchmarkPath)) return null;

  // A missing grading file is normal (eval without assertions, or a variant
  // that was not run), so it maps to undefined rather than an error.
  const readGrading = (file: string): GradingResult | undefined =>
    fs.existsSync(file) ? JSON.parse(fs.readFileSync(file, 'utf-8')) : undefined;

  try {
    const benchmark: BenchmarkData = JSON.parse(fs.readFileSync(prevBenchmarkPath, 'utf-8'));
    const gradings: PreviousIteration['gradings'] = new Map();

    const evalDirs = fs
      .readdirSync(prevDir, { withFileTypes: true })
      // Symlinks are kept because a Dirent does not follow them; a link that
      // is not a directory simply yields two missing grading files.
      .filter((entry) => (entry.isDirectory() || entry.isSymbolicLink()) && entry.name.startsWith('eval-'))
      .map((entry) => entry.name);

    for (const evalDir of evalDirs) {
      gradings.set(evalDir, {
        withSkill: readGrading(path.join(prevDir, evalDir, 'with_skill', 'grading.json')),
        // NOTE(review): the baseline is always read from `without_skill`;
        // runs made with an old-skill baseline appear to write to a different
        // variant directory — confirm whether that case should be loaded too.
        withoutSkill: readGrading(path.join(prevDir, evalDir, 'without_skill', 'grading.json')),
      });
    }
    return { benchmark, gradings };
  } catch {
    return null;
  }
}
|
|
35
|
+
|
|
36
|
+
/**
 * Picks a short, human-readable label for an eval run.
 *
 * The slug is used when it is non-empty and is more than just the numeric id.
 * Otherwise the label is the first non-blank line of the prompt, trimmed and
 * limited to 60 characters. Blank leading lines are skipped and both `\n` and
 * `\r\n` line endings are understood, so the label is never an empty first
 * line and never carries a trailing carriage return into the terminal output.
 *
 * @param run - Eval id, slug and prompt of the run.
 * @returns The label; an empty string when the prompt has no visible text.
 */
function evalLabel(run: { evalId: number; slug: string; prompt: string }): string {
  if (run.slug && run.slug !== `${run.evalId}`) return run.slug;

  const firstMeaningful = run.prompt.split(/\r?\n/).find((line) => line.trim() !== '') ?? '';
  return firstMeaningful.trim().slice(0, 60);
}
|
|
3
43
|
|
|
4
44
|
export class TerminalReporter implements ReportAdapter {
|
|
5
45
|
readonly name = 'terminal';
|
|
@@ -8,24 +48,74 @@ export class TerminalReporter implements ReportAdapter {
|
|
|
8
48
|
const { skillName, evalRuns, benchmark } = results;
|
|
9
49
|
|
|
10
50
|
console.log(chalk.bold(`\nsnapeval — ${skillName}`));
|
|
11
|
-
console.log(chalk.dim(
|
|
51
|
+
console.log(chalk.dim(`Baseline = without SKILL.md (raw AI response)`));
|
|
52
|
+
console.log(chalk.dim('─'.repeat(60)));
|
|
53
|
+
|
|
54
|
+
const prev = loadPreviousIteration(results.iterationDir);
|
|
12
55
|
|
|
13
56
|
for (const run of evalRuns) {
|
|
14
|
-
const
|
|
57
|
+
const wsGrading = run.withSkill.grading;
|
|
58
|
+
const wsRate = wsGrading?.summary.pass_rate;
|
|
15
59
|
const wosRate = run.withoutSkill.grading?.summary.pass_rate;
|
|
16
60
|
const wsLabel = wsRate !== undefined ? `${(wsRate * 100).toFixed(0)}%` : 'n/a';
|
|
17
61
|
const wosLabel = wosRate !== undefined ? `${(wosRate * 100).toFixed(0)}%` : 'n/a';
|
|
18
|
-
const
|
|
19
|
-
const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(
|
|
20
|
-
|
|
21
|
-
|
|
62
|
+
const wsColor = wsRate === 1 ? chalk.green : wsRate === 0 ? chalk.red : chalk.yellow;
|
|
63
|
+
const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(1);
|
|
64
|
+
|
|
65
|
+
// Show per-eval delta from previous iteration
|
|
66
|
+
let perEvalDelta = '';
|
|
67
|
+
if (prev) {
|
|
68
|
+
const prevGrading = prev.gradings.get(`eval-${run.slug}`);
|
|
69
|
+
const prevRate = prevGrading?.withSkill?.summary.pass_rate;
|
|
70
|
+
if (prevRate !== undefined && wsRate !== undefined) {
|
|
71
|
+
const change = wsRate - prevRate;
|
|
72
|
+
if (change !== 0) {
|
|
73
|
+
const arrow = change > 0 ? chalk.green('↑') : chalk.red('↓');
|
|
74
|
+
perEvalDelta = ` ${arrow} was ${(prevRate * 100).toFixed(0)}%`;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
console.log(` ${chalk.cyan(`#${run.evalId}`)} ${evalLabel(run)}`);
|
|
80
|
+
console.log(` Skill: ${wsColor(wsLabel)}${perEvalDelta} | Baseline: ${wosLabel} | ${durationS}s`);
|
|
81
|
+
|
|
82
|
+
// Show failed assertions inline
|
|
83
|
+
if (wsGrading) {
|
|
84
|
+
const failed = wsGrading.assertion_results.filter((a) => !a.passed);
|
|
85
|
+
for (const f of failed) {
|
|
86
|
+
console.log(chalk.red(` FAIL: ${f.text}`));
|
|
87
|
+
if (f.evidence) {
|
|
88
|
+
console.log(chalk.dim(` ${f.evidence.slice(0, 100)}`));
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
22
92
|
}
|
|
23
93
|
|
|
24
|
-
console.log(chalk.dim('─'.repeat(
|
|
94
|
+
console.log(chalk.dim('─'.repeat(60)));
|
|
25
95
|
|
|
96
|
+
const ws = benchmark.run_summary.with_skill;
|
|
97
|
+
const wos = benchmark.run_summary.without_skill;
|
|
26
98
|
const delta = benchmark.run_summary.delta;
|
|
27
99
|
const deltaColor = delta.pass_rate > 0 ? chalk.green : delta.pass_rate < 0 ? chalk.red : chalk.dim;
|
|
28
|
-
|
|
29
|
-
console.log(chalk.
|
|
100
|
+
|
|
101
|
+
console.log(chalk.bold('Summary:'));
|
|
102
|
+
console.log(` Skill pass rate: ${(ws.pass_rate.mean * 100).toFixed(1)}%`);
|
|
103
|
+
console.log(` Baseline pass rate: ${(wos.pass_rate.mean * 100).toFixed(1)}%`);
|
|
104
|
+
console.log(` Improvement: ${deltaColor(`${delta.pass_rate > 0 ? '+' : ''}${(delta.pass_rate * 100).toFixed(1)}%`)}`);
|
|
105
|
+
|
|
106
|
+
if (prev) {
|
|
107
|
+
const prevRate = prev.benchmark.run_summary.with_skill.pass_rate.mean;
|
|
108
|
+
const currRate = ws.pass_rate.mean;
|
|
109
|
+
const change = currRate - prevRate;
|
|
110
|
+
const changeColor = change > 0 ? chalk.green : change < 0 ? chalk.red : chalk.dim;
|
|
111
|
+
console.log(` vs previous: ${changeColor(`${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`)} (was ${(prevRate * 100).toFixed(1)}%)`);
|
|
112
|
+
|
|
113
|
+
// Note if eval set size changed
|
|
114
|
+
const prevEvalCount = prev.gradings.size;
|
|
115
|
+
const currEvalCount = evalRuns.length;
|
|
116
|
+
if (prevEvalCount !== currEvalCount) {
|
|
117
|
+
console.log(chalk.dim(` Note: eval set changed (${prevEvalCount} → ${currEvalCount} evals)`));
|
|
118
|
+
}
|
|
119
|
+
}
|
|
30
120
|
}
|
|
31
121
|
}
|
package/src/commands/eval.ts
CHANGED
|
@@ -6,82 +6,184 @@ import type {
|
|
|
6
6
|
EvalsFile,
|
|
7
7
|
EvalResults,
|
|
8
8
|
EvalRunResult,
|
|
9
|
+
GradingResult,
|
|
9
10
|
} from '../types.js';
|
|
10
11
|
import { WorkspaceManager } from '../engine/workspace.js';
|
|
11
12
|
import { runEval } from '../engine/runner.js';
|
|
12
13
|
import { gradeAssertions } from '../engine/grader.js';
|
|
13
14
|
import { computeBenchmark } from '../engine/aggregator.js';
|
|
14
|
-
import { SnapevalError } from '../errors.js';
|
|
15
|
+
import { SnapevalError, FileNotFoundError, ThresholdError } from '../errors.js';
|
|
16
|
+
|
|
17
|
+
/**
 * Runs `tasks` with at most `limit` of them in flight at any time.
 *
 * Results are returned in task order, regardless of completion order.
 *
 * The limit is normalised so that every task is guaranteed to run: values
 * below 1 and `NaN` are treated as 1, fractions are rounded down. Previously
 * such a limit started no workers at all and the function resolved with an
 * array of empty slots without executing anything.
 *
 * When a task rejects, the returned promise rejects with that error and no
 * further tasks are started (tasks already in flight are left to finish).
 *
 * @param tasks - Task factories; each is invoked at most once.
 * @param limit - Maximum number of tasks running concurrently.
 * @returns The task results, index-aligned with `tasks`.
 */
async function runWithConcurrency<T>(
  tasks: (() => Promise<T>)[],
  limit: number,
): Promise<T[]> {
  const results: T[] = new Array(tasks.length);
  const requested = Number.isNaN(limit) ? 1 : Math.max(1, Math.floor(limit));
  const workerCount = Math.min(requested, tasks.length);

  let next = 0;
  let failed = false;

  // Each worker pulls the next unclaimed index until the queue is drained or
  // some task has failed. Claiming is safe without locks because the index
  // is read and incremented synchronously, between awaits.
  const worker = async (): Promise<void> => {
    while (!failed && next < tasks.length) {
      const i = next++;
      try {
        results[i] = await tasks[i]();
      } catch (err) {
        failed = true;
        throw err;
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, worker));
  return results;
}
|
|
32
|
+
|
|
33
|
+
const MAX_CONCURRENCY = 10;
|
|
34
|
+
|
|
35
|
+
/**
 * Validates the parsed contents of an `evals.json` file.
 *
 * The value comes straight from `JSON.parse`, so nothing about its shape can
 * be trusted. Every violation is reported as a `SnapevalError` that names the
 * file and, where applicable, the offending entry. This includes inputs that
 * previously escaped as a raw `TypeError` (a `null` or non-object top level
 * or entry) and assertion lists containing non-string items, which would
 * otherwise fail later when the grader calls string methods on them.
 *
 * @param evalsFile - Parsed file contents to check.
 * @param evalsPath - Path of the file, used in error messages only.
 * @throws SnapevalError when the structure is invalid.
 */
function validateEvalsFile(evalsFile: EvalsFile, evalsPath: string): void {
  const root: unknown = evalsFile;
  if (typeof root !== 'object' || root === null || Array.isArray(root)) {
    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: top-level value must be an object.`);
  }
  if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
  }
  if (!Array.isArray(evalsFile.evals)) {
    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: "evals" must be an array.`);
  }

  for (const [i, evalCase] of evalsFile.evals.entries()) {
    const prefix = `Invalid evals.json at ${evalsPath}: evals[${i}]`;

    const entry: unknown = evalCase;
    if (typeof entry !== 'object' || entry === null) {
      throw new SnapevalError(`${prefix} must be an object.`);
    }
    if (typeof evalCase.id !== 'number') {
      throw new SnapevalError(`${prefix} missing or invalid "id" (must be a number).`);
    }
    if (typeof evalCase.prompt !== 'string') {
      throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "prompt" field.`);
    }
    if (typeof evalCase.expected_output !== 'string') {
      throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "expected_output" field.`);
    }

    // "assertions" is optional, but when present every item must be a string.
    const assertions: unknown = evalCase.assertions;
    const assertionsValid =
      assertions === undefined ||
      (Array.isArray(assertions) && assertions.every((item) => typeof item === 'string'));
    if (!assertionsValid) {
      throw new SnapevalError(`${prefix} (id:${evalCase.id}) "assertions" must be an array of strings.`);
    }
  }
}
|
|
15
58
|
|
|
16
59
|
export async function evalCommand(
|
|
17
60
|
skillPath: string,
|
|
18
61
|
harness: Harness,
|
|
19
62
|
inference: InferenceAdapter,
|
|
20
|
-
options: { workspace?: string; runs?: number; oldSkill?: string }
|
|
63
|
+
options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number }
|
|
21
64
|
): Promise<EvalResults> {
|
|
22
65
|
const evalsPath = path.join(skillPath, 'evals', 'evals.json');
|
|
23
66
|
if (!fs.existsSync(evalsPath)) {
|
|
24
|
-
throw new
|
|
67
|
+
throw new FileNotFoundError(evalsPath, 'Create evals/evals.json with test scenarios first');
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
let evalsFile: EvalsFile;
|
|
71
|
+
try {
|
|
72
|
+
evalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
|
|
73
|
+
} catch {
|
|
74
|
+
throw new SnapevalError(`Invalid JSON in ${evalsPath}. Check for syntax errors (missing commas, trailing commas, etc).`);
|
|
75
|
+
}
|
|
76
|
+
validateEvalsFile(evalsFile, evalsPath);
|
|
77
|
+
|
|
78
|
+
// Filter to specific eval IDs if --only is provided
|
|
79
|
+
if (options.only && options.only.length > 0) {
|
|
80
|
+
const ids = new Set(options.only);
|
|
81
|
+
const filtered = evalsFile.evals.filter((e) => ids.has(e.id));
|
|
82
|
+
if (filtered.length === 0) {
|
|
83
|
+
throw new SnapevalError(`No eval cases match --only ${options.only.join(',')}. Available IDs: ${evalsFile.evals.map((e) => e.id).join(', ')}`);
|
|
84
|
+
}
|
|
85
|
+
evalsFile = { ...evalsFile, evals: filtered };
|
|
25
86
|
}
|
|
26
87
|
|
|
27
|
-
const evalsFile: EvalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
|
|
28
88
|
const ws = new WorkspaceManager(skillPath, options.workspace);
|
|
29
89
|
const iterationDir = ws.createIteration();
|
|
90
|
+
|
|
91
|
+
// Track which SKILL.md was used for this iteration
|
|
92
|
+
const skillMdPath = path.join(skillPath, 'SKILL.md');
|
|
93
|
+
if (fs.existsSync(skillMdPath)) {
|
|
94
|
+
fs.copyFileSync(skillMdPath, path.join(iterationDir, 'SKILL.md.snapshot'));
|
|
95
|
+
}
|
|
30
96
|
const runs = options.runs ?? 1;
|
|
97
|
+
const concurrency = Math.min(Math.max(options.concurrency ?? 1, 1), MAX_CONCURRENCY);
|
|
31
98
|
const baselineVariant = options.oldSkill ? 'old_skill' : 'without_skill';
|
|
32
99
|
const scriptsDir = path.join(skillPath, 'evals', 'scripts');
|
|
33
100
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
for (const evalCase of evalsFile.evals) {
|
|
101
|
+
// Pre-create eval directories sequentially (filesystem setup)
|
|
102
|
+
const evalDirs = evalsFile.evals.map((evalCase) => {
|
|
37
103
|
const slug = WorkspaceManager.getEvalSlug(evalCase).replace('eval-', '');
|
|
38
|
-
|
|
104
|
+
return { evalCase, slug, evalDir: ws.createEvalDir(iterationDir, slug, baselineVariant) };
|
|
105
|
+
});
|
|
39
106
|
|
|
107
|
+
const tasks = evalDirs.map(({ evalCase, slug, evalDir }) => async (): Promise<EvalRunResult> => {
|
|
108
|
+
const assertions = evalCase.assertions ?? [];
|
|
109
|
+
const allGradings: { withSkill: GradingResult | null; withoutSkill: GradingResult | null }[] = [];
|
|
40
110
|
let lastRun: Awaited<ReturnType<typeof runEval>> | null = null;
|
|
111
|
+
|
|
41
112
|
for (let i = 0; i < runs; i++) {
|
|
42
113
|
lastRun = await runEval(evalCase, skillPath, evalDir, harness, options.oldSkill);
|
|
114
|
+
|
|
115
|
+
// Grade every run, not just the last
|
|
116
|
+
const [wsGrading, wosGrading] = await Promise.all([
|
|
117
|
+
gradeAssertions(
|
|
118
|
+
assertions,
|
|
119
|
+
lastRun.withSkill.output,
|
|
120
|
+
path.join(evalDir, 'with_skill'),
|
|
121
|
+
inference,
|
|
122
|
+
fs.existsSync(scriptsDir) ? scriptsDir : undefined,
|
|
123
|
+
),
|
|
124
|
+
gradeAssertions(
|
|
125
|
+
assertions,
|
|
126
|
+
lastRun.withoutSkill.output,
|
|
127
|
+
path.join(evalDir, baselineVariant),
|
|
128
|
+
inference,
|
|
129
|
+
fs.existsSync(scriptsDir) ? scriptsDir : undefined,
|
|
130
|
+
),
|
|
131
|
+
]);
|
|
132
|
+
allGradings.push({ withSkill: wsGrading, withoutSkill: wosGrading });
|
|
43
133
|
}
|
|
44
134
|
|
|
45
|
-
if (!lastRun)
|
|
135
|
+
if (!lastRun) {
|
|
136
|
+
throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
|
|
137
|
+
}
|
|
46
138
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
inference,
|
|
53
|
-
fs.existsSync(scriptsDir) ? scriptsDir : undefined,
|
|
54
|
-
);
|
|
55
|
-
const withoutSkillGrading = await gradeAssertions(
|
|
56
|
-
assertions,
|
|
57
|
-
lastRun.withoutSkill.output,
|
|
58
|
-
path.join(evalDir, baselineVariant),
|
|
59
|
-
inference,
|
|
60
|
-
fs.existsSync(scriptsDir) ? scriptsDir : undefined,
|
|
61
|
-
);
|
|
62
|
-
|
|
63
|
-
evalRuns.push({
|
|
139
|
+
// Use the last run's grading as the primary result (written to grading.json)
|
|
140
|
+
// but all gradings contribute to benchmark stats via pass rates
|
|
141
|
+
const lastGrading = allGradings[allGradings.length - 1];
|
|
142
|
+
|
|
143
|
+
return {
|
|
64
144
|
evalId: evalCase.id,
|
|
65
145
|
slug,
|
|
66
146
|
prompt: evalCase.prompt,
|
|
67
147
|
withSkill: {
|
|
68
148
|
output: lastRun.withSkill.output,
|
|
69
|
-
grading:
|
|
149
|
+
grading: lastGrading.withSkill ?? undefined,
|
|
70
150
|
},
|
|
71
151
|
withoutSkill: {
|
|
72
152
|
output: lastRun.withoutSkill.output,
|
|
73
|
-
grading:
|
|
153
|
+
grading: lastGrading.withoutSkill ?? undefined,
|
|
74
154
|
},
|
|
75
|
-
}
|
|
76
|
-
}
|
|
155
|
+
};
|
|
156
|
+
});
|
|
77
157
|
|
|
158
|
+
const evalRuns = await runWithConcurrency(tasks, concurrency);
|
|
78
159
|
const benchmark = computeBenchmark(evalRuns);
|
|
79
160
|
|
|
161
|
+
// Add iteration metadata for cross-iteration comparison
|
|
162
|
+
const benchmarkWithMeta = {
|
|
163
|
+
...benchmark,
|
|
164
|
+
metadata: {
|
|
165
|
+
eval_count: evalRuns.length,
|
|
166
|
+
eval_ids: evalRuns.map((r) => r.evalId),
|
|
167
|
+
skill_name: evalsFile.skill_name,
|
|
168
|
+
timestamp: new Date().toISOString(),
|
|
169
|
+
},
|
|
170
|
+
};
|
|
171
|
+
|
|
80
172
|
fs.writeFileSync(
|
|
81
173
|
path.join(iterationDir, 'benchmark.json'),
|
|
82
|
-
JSON.stringify(
|
|
174
|
+
JSON.stringify(benchmarkWithMeta, null, 2)
|
|
83
175
|
);
|
|
84
176
|
|
|
177
|
+
// Check threshold if set (for CI gating)
|
|
178
|
+
if (options.threshold !== undefined) {
|
|
179
|
+
const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
|
|
180
|
+
if (passRate < options.threshold) {
|
|
181
|
+
// Still return results so the reporter can display them before the error
|
|
182
|
+
const results = { skillName: evalsFile.skill_name, evalRuns, benchmark, iterationDir };
|
|
183
|
+
throw Object.assign(new ThresholdError(passRate, options.threshold), { results });
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
85
187
|
return {
|
|
86
188
|
skillName: evalsFile.skill_name,
|
|
87
189
|
evalRuns,
|
package/src/commands/review.ts
CHANGED
|
@@ -10,7 +10,7 @@ export async function reviewCommand(
|
|
|
10
10
|
skillPath: string,
|
|
11
11
|
harness: Harness,
|
|
12
12
|
inference: InferenceAdapter,
|
|
13
|
-
options: { workspace?: string; runs?: number; oldSkill?: string; noOpen?: boolean }
|
|
13
|
+
options: { workspace?: string; runs?: number; oldSkill?: string; noOpen?: boolean; concurrency?: number }
|
|
14
14
|
): Promise<void> {
|
|
15
15
|
const results = await evalCommand(skillPath, harness, inference, options);
|
|
16
16
|
|
package/src/config.ts
CHANGED
|
@@ -3,10 +3,11 @@ import * as path from 'node:path';
|
|
|
3
3
|
import type { SnapevalConfig } from './types.js';
|
|
4
4
|
|
|
5
5
|
export const DEFAULT_CONFIG: SnapevalConfig = {
|
|
6
|
-
harness: 'copilot-
|
|
6
|
+
harness: 'copilot-sdk',
|
|
7
7
|
inference: 'auto',
|
|
8
8
|
workspace: '../{skill_name}-workspace',
|
|
9
9
|
runs: 1,
|
|
10
|
+
concurrency: 1,
|
|
10
11
|
};
|
|
11
12
|
|
|
12
13
|
function loadConfigFile(dirPath: string): Partial<SnapevalConfig> | null {
|
package/src/engine/grader.ts
CHANGED
|
@@ -8,9 +8,34 @@ import type {
|
|
|
8
8
|
AssertionResult,
|
|
9
9
|
} from '../types.js';
|
|
10
10
|
|
|
11
|
+
const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/i;
|
|
12
|
+
|
|
13
|
+
/**
 * Grades an exact-match assertion without involving the model.
 *
 * An assertion qualifies when it matches `EXACT_MATCH_PATTERN`; the quoted
 * text captured by the pattern is the expected value. It is compared, with
 * strict equality, against the output after trimming surrounding whitespace.
 *
 * @param assertion - Assertion text from the eval definition.
 * @param output - Raw output produced by the run.
 * @returns The grading result, or `null` when the assertion is not an
 *   exact-match assertion.
 */
function gradeExactMatch(assertion: string, output: string): AssertionResult | null {
  const parsed = EXACT_MATCH_PATTERN.exec(assertion);
  if (parsed === null) return null;

  const [, expected] = parsed;
  const actual = output.trim();

  if (actual === expected) {
    return { text: assertion, passed: true, evidence: `Exact match: "${expected}"` };
  }
  // On a mismatch, show both sides so the difference is visible in the report.
  return {
    text: assertion,
    passed: false,
    evidence: `Expected: "${expected}"\nGot: "${actual}"`,
  };
}
|
|
27
|
+
|
|
11
28
|
function buildGradingPrompt(assertions: string[], output: string, files: string[]): string {
|
|
12
29
|
const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
|
|
13
|
-
return `You are
|
|
30
|
+
return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
|
|
31
|
+
|
|
32
|
+
GRADING RULES:
|
|
33
|
+
- PASS if the output satisfies the assertion's intent, even if wording differs slightly.
|
|
34
|
+
- FAIL only if the output clearly does not satisfy the assertion.
|
|
35
|
+
- Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
|
|
36
|
+
- For "contains" assertions: look for semantic presence, not exact substring.
|
|
37
|
+
- For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
|
|
38
|
+
- Always cite specific text from the output as evidence.
|
|
14
39
|
|
|
15
40
|
OUTPUT:
|
|
16
41
|
---
|
|
@@ -23,7 +48,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
|
|
|
23
48
|
Respond with JSON only:
|
|
24
49
|
{
|
|
25
50
|
"results": [
|
|
26
|
-
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote
|
|
51
|
+
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
|
|
27
52
|
]
|
|
28
53
|
}`;
|
|
29
54
|
}
|
|
@@ -38,18 +63,38 @@ function runScript(
|
|
|
38
63
|
return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
|
|
39
64
|
}
|
|
40
65
|
try {
|
|
41
|
-
const
|
|
66
|
+
const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
|
|
67
|
+
const evidence = stdout || `Script passed: ${scriptName}`;
|
|
42
68
|
return { text: `script:${scriptName}`, passed: true, evidence };
|
|
43
69
|
} catch (err: any) {
|
|
44
|
-
|
|
70
|
+
// Extract the most useful error info without raw stack traces
|
|
71
|
+
const stderr = err.stderr?.trim();
|
|
72
|
+
const stdout = err.stdout?.trim();
|
|
73
|
+
let evidence: string;
|
|
74
|
+
if (err.code === 'EACCES') {
|
|
75
|
+
evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
|
|
76
|
+
} else if (stderr) {
|
|
77
|
+
// Take only the first line of stderr to avoid stack trace noise
|
|
78
|
+
evidence = stderr.split('\n')[0];
|
|
79
|
+
} else if (stdout) {
|
|
80
|
+
evidence = stdout.split('\n')[0];
|
|
81
|
+
} else {
|
|
82
|
+
evidence = `Script exited with code ${err.status ?? 'unknown'}`;
|
|
83
|
+
}
|
|
45
84
|
return { text: `script:${scriptName}`, passed: false, evidence };
|
|
46
85
|
}
|
|
47
86
|
}
|
|
48
87
|
|
|
49
88
|
function extractJSON(text: string): string {
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
return
|
|
89
|
+
// Try a JSON-tagged fence first, then the raw text as JSON, then any bare fence; otherwise return the trimmed raw text
|
|
90
|
+
const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
|
|
91
|
+
if (jsonFence) return jsonFence[1].trim();
|
|
92
|
+
// Try parsing raw text as JSON before falling back to any fence
|
|
93
|
+
const trimmed = text.trim();
|
|
94
|
+
try { JSON.parse(trimmed); return trimmed; } catch { /* not raw JSON */ }
|
|
95
|
+
const anyFence = text.match(/```\s*([\s\S]*?)```/);
|
|
96
|
+
if (anyFence) return anyFence[1].trim();
|
|
97
|
+
return trimmed;
|
|
53
98
|
}
|
|
54
99
|
|
|
55
100
|
export async function gradeAssertions(
|
|
@@ -62,7 +107,8 @@ export async function gradeAssertions(
|
|
|
62
107
|
if (assertions.length === 0) return null;
|
|
63
108
|
|
|
64
109
|
const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
|
|
65
|
-
const
|
|
110
|
+
const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
|
|
111
|
+
const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
|
|
66
112
|
const results: AssertionResult[] = [];
|
|
67
113
|
|
|
68
114
|
for (const assertion of scriptAssertions) {
|
|
@@ -72,6 +118,11 @@ export async function gradeAssertions(
|
|
|
72
118
|
results.push(runScript(scriptName, outputDir, dir));
|
|
73
119
|
}
|
|
74
120
|
|
|
121
|
+
for (const assertion of exactAssertions) {
|
|
122
|
+
const result = gradeExactMatch(assertion, output.raw);
|
|
123
|
+
if (result) results.push(result);
|
|
124
|
+
}
|
|
125
|
+
|
|
75
126
|
if (llmAssertions.length > 0) {
|
|
76
127
|
const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
|
|
77
128
|
const response = await inference.chat(
|