snapeval 1.8.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/bin/snapeval.ts +30 -24
  2. package/dist/bin/snapeval.js +25 -22
  3. package/dist/bin/snapeval.js.map +1 -1
  4. package/dist/src/adapters/copilot-sdk-client.js +1 -1
  5. package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
  6. package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
  7. package/dist/src/adapters/harness/copilot-sdk.js +101 -0
  8. package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
  9. package/dist/src/adapters/harness/resolve.js +10 -2
  10. package/dist/src/adapters/harness/resolve.js.map +1 -1
  11. package/dist/src/adapters/inference/copilot-sdk.js +4 -1
  12. package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
  13. package/dist/src/adapters/report/terminal.js +89 -9
  14. package/dist/src/adapters/report/terminal.js.map +1 -1
  15. package/dist/src/commands/eval.d.ts +3 -0
  16. package/dist/src/commands/eval.js +106 -17
  17. package/dist/src/commands/eval.js.map +1 -1
  18. package/dist/src/commands/review.d.ts +1 -0
  19. package/dist/src/commands/review.js.map +1 -1
  20. package/dist/src/config.js +2 -1
  21. package/dist/src/config.js.map +1 -1
  22. package/dist/src/engine/grader.js +67 -9
  23. package/dist/src/engine/grader.js.map +1 -1
  24. package/dist/src/engine/runner.js +14 -12
  25. package/dist/src/engine/runner.js.map +1 -1
  26. package/dist/src/errors.d.ts +6 -0
  27. package/dist/src/errors.js +21 -3
  28. package/dist/src/errors.js.map +1 -1
  29. package/dist/src/types.d.ts +1 -0
  30. package/package.json +4 -1
  31. package/plugin.json +1 -1
  32. package/skills/snapeval/SKILL.md +33 -18
  33. package/src/adapters/copilot-sdk-client.ts +1 -1
  34. package/src/adapters/harness/copilot-sdk.ts +126 -0
  35. package/src/adapters/harness/resolve.ts +13 -2
  36. package/src/adapters/inference/copilot-sdk.ts +5 -1
  37. package/src/adapters/report/terminal.ts +100 -10
  38. package/src/commands/eval.ts +133 -31
  39. package/src/commands/review.ts +1 -1
  40. package/src/config.ts +2 -1
  41. package/src/engine/grader.ts +59 -8
  42. package/src/engine/runner.ts +14 -13
  43. package/src/errors.ts +24 -3
  44. package/src/types.ts +1 -0
  45. package/dist/src/commands/init.d.ts +0 -2
  46. package/dist/src/commands/init.js +0 -27
  47. package/dist/src/commands/init.js.map +0 -1
  48. package/dist/src/engine/generator.d.ts +0 -3
  49. package/dist/src/engine/generator.js +0 -51
  50. package/dist/src/engine/generator.js.map +0 -1
  51. package/src/commands/init.ts +0 -38
  52. package/src/engine/generator.ts +0 -60
@@ -0,0 +1,126 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import type { Harness, HarnessRunResult } from '../../types.js';
4
+ import { getClient, isSDKInstalled } from '../copilot-sdk-client.js';
5
+
6
/**
 * Harness that executes an eval prompt through the GitHub Copilot SDK.
 *
 * Every call to `run` creates a fresh session whose working directory is
 * `outputDir`, optionally registers the skill path, sends the prompt (with any
 * input files attached) and returns the final response together with a
 * transcript rebuilt from the session events.
 */
export class CopilotSDKHarness implements Harness {
  readonly name = 'copilot-sdk';

  /**
   * Runs a single prompt in a new SDK session.
   *
   * @param options.skillPath - When set, passed as the only entry of `skillDirectories`.
   * @param options.prompt - Prompt text sent to the session.
   * @param options.files - Input files; each is copied into `outputDir` and attached.
   * @param options.outputDir - Created if missing; used as the session working directory.
   * @returns Trimmed response text, transcript, summed token usage and wall-clock
   *          duration. `files` is always an empty array in this harness.
   * @throws Whatever the SDK or the file copy throws. A failure while
   *         disconnecting never replaces an earlier, more informative error.
   */
  async run(options: {
    skillPath?: string;
    prompt: string;
    files?: string[];
    outputDir: string;
  }): Promise<HarnessRunResult> {
    const startMs = Date.now();
    const client = await getClient();

    fs.mkdirSync(options.outputDir, { recursive: true });

    // Dynamically import SDK for approveAll
    // @ts-ignore — module may not be installed (optional dep)
    const { approveAll } = await import('@github/copilot-sdk');

    // Build session config
    const sessionConfig: Record<string, unknown> = {
      model: 'gpt-4.1',
      onPermissionRequest: approveAll,
      workingDirectory: options.outputDir,
      infiniteSessions: { enabled: false },
    };

    // Native skill loading: the skill path itself is registered as a skill directory.
    // NOTE(review): confirm whether the SDK expects the skill folder or its parent here.
    if (options.skillPath) {
      sessionConfig.skillDirectories = [options.skillPath];
    }

    const session = await client.createSession(sessionConfig);

    // Tracks whether the body threw, so cleanup errors do not hide the real cause.
    let runFailed = false;
    try {
      // Attach input files if provided
      const attachments: Array<{ type: string; path: string; displayName?: string }> = [];
      for (const file of options.files ?? []) {
        // Copy to outputDir for script assertions, and attach for the model
        const displayName = path.basename(file);
        const dest = path.join(options.outputDir, displayName);
        fs.copyFileSync(file, dest);
        attachments.push({ type: 'file', path: dest, displayName });
      }

      const response = await session.sendAndWait(
        {
          prompt: options.prompt,
          ...(attachments.length > 0 ? { attachments } : {}),
        },
        300_000, // 5 min timeout — calibrated for complex eval prompts
      );

      // Only string content can be trimmed; anything else is treated as "no output".
      const content: unknown = response?.data?.content;
      const raw = typeof content === 'string' ? content.trim() : '';

      // Collect full transcript and token usage from session events
      const events = await session.getMessages();
      const transcript = buildTranscript(events);
      const totalTokens = extractTokenCount(events);

      return {
        raw,
        transcript,
        files: [],
        total_tokens: totalTokens,
        duration_ms: Date.now() - startMs,
      };
    } catch (err) {
      runFailed = true;
      throw err;
    } finally {
      try {
        await session.disconnect();
      } catch (disconnectErr) {
        // Surface the disconnect failure only when it is the sole problem.
        if (!runFailed) throw disconnectErr;
      }
    }
  }

  /** Reports whether the optional SDK dependency is installed. */
  async isAvailable(): Promise<boolean> {
    return isSDKInstalled();
  }
}
86
+
87
/**
 * Renders session events as a line-per-event, human-readable transcript.
 *
 * Only user/assistant messages, tool start/complete, skill invocation and
 * session error events are rendered; every other event type is ignored.
 * Tool results are truncated to 200 characters.
 *
 * @param events - Session events; each is expected to carry `type` and `data`.
 * @returns The rendered lines joined with `\n` (empty string for no events).
 */
function buildTranscript(events: any[]): string {
  // Events are untyped: a message or tool result may be structured rather than
  // a plain string. Serialize such values so they neither print as
  // "[object Object]" nor slip past truncation.
  const asText = (value: unknown): string => {
    if (typeof value === 'string') return value;
    if (value === null || value === undefined) return '';
    try {
      const json: string | undefined = JSON.stringify(value);
      return json ?? String(value);
    } catch {
      return String(value);
    }
  };

  const lines: string[] = [];
  for (const event of events) {
    // `event?.` tolerates null/undefined entries instead of throwing.
    switch (event?.type) {
      case 'user.message':
        lines.push(`[user] ${asText(event.data?.content)}`);
        break;
      case 'assistant.message':
        lines.push(`[assistant] ${asText(event.data?.content)}`);
        break;
      case 'tool.execution_start':
        lines.push(`[tool:start] ${event.data?.toolName ?? 'unknown'}(${JSON.stringify(event.data?.arguments ?? {})})`);
        break;
      case 'tool.execution_complete':
        lines.push(`[tool:done] ${event.data?.toolName ?? 'unknown'} → ${truncate(asText(event.data?.result), 200)}`);
        break;
      case 'skill.invoked':
        lines.push(`[skill] ${event.data?.name ?? 'unknown'} (${event.data?.path ?? ''})`);
        break;
      case 'session.error':
        lines.push(`[error] ${event.data?.message ?? ''}`);
        break;
    }
  }
  return lines.join('\n');
}
113
+
114
/**
 * Sums input and output token counts over all `assistant.usage` events.
 *
 * Missing or non-numeric token fields count as 0, so the result is always a
 * finite number rather than a concatenated string or NaN.
 *
 * @param events - Session events; only those with type `assistant.usage` are read.
 * @returns Total of `inputTokens + outputTokens` across usage events.
 */
function extractTokenCount(events: any[]): number {
  const asCount = (value: unknown): number =>
    typeof value === 'number' && Number.isFinite(value) ? value : 0;

  let total = 0;
  for (const event of events) {
    if (event?.type !== 'assistant.usage') continue;
    total += asCount(event.data?.inputTokens) + asCount(event.data?.outputTokens);
  }
  return total;
}
123
+
124
/**
 * Shortens text to at most `max` UTF-16 code units, appending `...` when cut.
 *
 * Non-string input is serialized first (JSON where possible) so structured
 * values are truncated too; null/undefined become the empty string. The cut
 * point is moved back by one when it would otherwise split a surrogate pair.
 *
 * @param str - Value to shorten; normally a string.
 * @param max - Maximum number of code units kept before the ellipsis.
 * @returns The original text when it fits, otherwise the shortened text plus `...`.
 */
function truncate(str: unknown, max: number): string {
  let text: string;
  if (typeof str === 'string') {
    text = str;
  } else if (str === null || str === undefined) {
    text = '';
  } else {
    try {
      const json: string | undefined = JSON.stringify(str);
      text = json ?? String(str);
    } catch {
      text = String(str);
    }
  }

  if (text.length <= max) return text;

  let end = max;
  if (end > 0) {
    const lastKept = text.charCodeAt(end - 1);
    // A high surrogate at the cut would leave half of an astral character behind.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  }
  return text.slice(0, end) + '...';
}
@@ -1,10 +1,21 @@
1
1
  import type { Harness } from '../../types.js';
2
2
  import { CopilotCLIHarness } from './copilot-cli.js';
3
- import { SnapevalError } from '../../errors.js';
3
+ import { CopilotSDKHarness } from './copilot-sdk.js';
4
+ import { AdapterNotAvailableError, SnapevalError } from '../../errors.js';
5
+ import { isSDKInstalled } from '../copilot-sdk-client.js';
4
6
 
5
7
/**
 * Maps a harness name to a harness instance.
 *
 * @param name - `copilot-sdk` or `copilot-cli`.
 * @returns A new harness for the requested name.
 * @throws AdapterNotAvailableError when `copilot-sdk` is requested but the SDK is not installed.
 * @throws SnapevalError for any other, unknown name.
 */
export function resolveHarness(name: string): Harness {
  switch (name) {
    case 'copilot-sdk': {
      const sdkPresent = isSDKInstalled();
      if (sdkPresent) {
        return new CopilotSDKHarness();
      }
      throw new AdapterNotAvailableError(
        'copilot-sdk',
        '@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
      );
    }
    case 'copilot-cli':
      return new CopilotCLIHarness();
    default:
      throw new SnapevalError(`Unknown harness "${name}". Built-in options: copilot-sdk, copilot-cli.`);
  }
}
@@ -7,6 +7,9 @@ export class CopilotSDKInference implements InferenceAdapter {
7
7
  async chat(messages: Message[], _options?: ChatOptions): Promise<string> {
8
8
  const client = await getClient();
9
9
 
10
+ // @ts-ignore — module may not be installed (optional dep)
11
+ const { approveAll } = await import('@github/copilot-sdk');
12
+
10
13
  const systemMessages = messages.filter((m) => m.role === 'system');
11
14
  const nonSystemMessages = messages.filter((m) => m.role !== 'system');
12
15
  const systemContent = systemMessages.map((m) => m.content).join('\n');
@@ -17,7 +20,8 @@ export class CopilotSDKInference implements InferenceAdapter {
17
20
  ...(systemContent
18
21
  ? { systemMessage: { content: systemContent } }
19
22
  : {}),
20
- onPermissionRequest: async () => ({ kind: 'approved' }),
23
+ onPermissionRequest: approveAll,
24
+ infiniteSessions: { enabled: false },
21
25
  });
22
26
 
23
27
  try {
@@ -1,5 +1,45 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
1
3
  import chalk from 'chalk';
2
- import type { ReportAdapter, EvalResults } from '../../types.js';
4
+ import type { ReportAdapter, EvalResults, BenchmarkData, GradingResult } from '../../types.js';
5
+
6
/** Data recovered from the iteration directory that precedes the current one. */
interface PreviousIteration {
  /** Parsed contents of the previous iteration's `benchmark.json`. */
  benchmark: BenchmarkData;
  /** Parsed `grading.json` files keyed by eval directory name (e.g. `eval-foo`). */
  gradings: Map<string, { withSkill?: GradingResult; withoutSkill?: GradingResult }>;
}

/**
 * Loads benchmark and per-eval gradings from `iteration-(N-1)` next to `iterationDir`.
 *
 * Only sub-directories named `eval-*` are counted as evals, so stray files do
 * not distort the eval count. A missing or unparsable `grading.json` leaves
 * that single entry undefined instead of discarding the whole comparison.
 *
 * @param iterationDir - Path of the current iteration directory (`.../iteration-N`).
 * @returns The previous iteration's data, or `null` when this is the first
 *          iteration, the name carries no number, or the previous
 *          `benchmark.json` is missing, unreadable or not a JSON object.
 */
function loadPreviousIteration(iterationDir: string): PreviousIteration | null {
  // Best-effort JSON reader: undefined means "absent or unparsable".
  const readJson = (filePath: string): unknown => {
    try {
      return JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    } catch {
      return undefined;
    }
  };
  const isDirectory = (dirPath: string): boolean => {
    try {
      return fs.statSync(dirPath).isDirectory();
    } catch {
      return false;
    }
  };

  const workspaceDir = path.dirname(iterationDir);
  const currentNum = Number.parseInt(path.basename(iterationDir).replace('iteration-', ''), 10);
  if (Number.isNaN(currentNum) || currentNum <= 1) return null;

  const prevDir = path.join(workspaceDir, `iteration-${currentNum - 1}`);
  const benchmark = readJson(path.join(prevDir, 'benchmark.json'));
  // The reporter dereferences benchmark fields, so anything but an object is unusable.
  if (benchmark === null || typeof benchmark !== 'object') return null;

  let entryNames: string[];
  try {
    entryNames = fs.readdirSync(prevDir);
  } catch {
    return null;
  }

  const gradings = new Map<string, { withSkill?: GradingResult; withoutSkill?: GradingResult }>();
  for (const entryName of entryNames) {
    if (!entryName.startsWith('eval-')) continue;
    const evalDir = path.join(prevDir, entryName);
    if (!isDirectory(evalDir)) continue;
    gradings.set(entryName, {
      withSkill: readJson(path.join(evalDir, 'with_skill', 'grading.json')) as GradingResult | undefined,
      withoutSkill: readJson(path.join(evalDir, 'without_skill', 'grading.json')) as GradingResult | undefined,
    });
  }

  return { benchmark: benchmark as BenchmarkData, gradings };
}
35
+
36
/**
 * Picks a short, readable label for an eval run.
 *
 * The slug is preferred whenever it is more than the stringified eval id.
 * Otherwise the first non-blank line of the prompt is used, trimmed and
 * limited to 60 characters, so prompts that begin with blank lines still
 * produce a visible label.
 *
 * @param run - Eval run identity: numeric id, slug and prompt text.
 * @returns The label to print; empty only when the prompt has no non-blank line.
 */
function evalLabel(run: { evalId: number; slug: string; prompt: string }): string {
  if (run.slug && run.slug !== `${run.evalId}`) return run.slug;

  const firstMeaningful =
    run.prompt
      .split('\n')
      .map((line) => line.trim())
      .find((line) => line.length > 0) ?? '';
  return firstMeaningful.slice(0, 60);
}
3
43
 
4
44
  export class TerminalReporter implements ReportAdapter {
5
45
  readonly name = 'terminal';
@@ -8,24 +48,74 @@ export class TerminalReporter implements ReportAdapter {
8
48
  const { skillName, evalRuns, benchmark } = results;
9
49
 
10
50
  console.log(chalk.bold(`\nsnapeval — ${skillName}`));
11
- console.log(chalk.dim('─'.repeat(50)));
51
+ console.log(chalk.dim(`Baseline = without SKILL.md (raw AI response)`));
52
+ console.log(chalk.dim('─'.repeat(60)));
53
+
54
+ const prev = loadPreviousIteration(results.iterationDir);
12
55
 
13
56
  for (const run of evalRuns) {
14
- const wsRate = run.withSkill.grading?.summary.pass_rate;
57
+ const wsGrading = run.withSkill.grading;
58
+ const wsRate = wsGrading?.summary.pass_rate;
15
59
  const wosRate = run.withoutSkill.grading?.summary.pass_rate;
16
60
  const wsLabel = wsRate !== undefined ? `${(wsRate * 100).toFixed(0)}%` : 'n/a';
17
61
  const wosLabel = wosRate !== undefined ? `${(wosRate * 100).toFixed(0)}%` : 'n/a';
18
- const tokens = run.withSkill.output.total_tokens;
19
- const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(2);
20
- console.log(` ${chalk.cyan(`#${run.evalId}`)} ${run.prompt.slice(0, 60)}`);
21
- console.log(` with_skill: ${wsLabel} | without_skill: ${wosLabel} | ${tokens} tokens, ${durationS}s`);
62
+ const wsColor = wsRate === 1 ? chalk.green : wsRate === 0 ? chalk.red : chalk.yellow;
63
+ const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(1);
64
+
65
+ // Show per-eval delta from previous iteration
66
+ let perEvalDelta = '';
67
+ if (prev) {
68
+ const prevGrading = prev.gradings.get(`eval-${run.slug}`);
69
+ const prevRate = prevGrading?.withSkill?.summary.pass_rate;
70
+ if (prevRate !== undefined && wsRate !== undefined) {
71
+ const change = wsRate - prevRate;
72
+ if (change !== 0) {
73
+ const arrow = change > 0 ? chalk.green('↑') : chalk.red('↓');
74
+ perEvalDelta = ` ${arrow} was ${(prevRate * 100).toFixed(0)}%`;
75
+ }
76
+ }
77
+ }
78
+
79
+ console.log(` ${chalk.cyan(`#${run.evalId}`)} ${evalLabel(run)}`);
80
+ console.log(` Skill: ${wsColor(wsLabel)}${perEvalDelta} | Baseline: ${wosLabel} | ${durationS}s`);
81
+
82
+ // Show failed assertions inline
83
+ if (wsGrading) {
84
+ const failed = wsGrading.assertion_results.filter((a) => !a.passed);
85
+ for (const f of failed) {
86
+ console.log(chalk.red(` FAIL: ${f.text}`));
87
+ if (f.evidence) {
88
+ console.log(chalk.dim(` ${f.evidence.slice(0, 100)}`));
89
+ }
90
+ }
91
+ }
22
92
  }
23
93
 
24
- console.log(chalk.dim('─'.repeat(50)));
94
+ console.log(chalk.dim('─'.repeat(60)));
25
95
 
96
+ const ws = benchmark.run_summary.with_skill;
97
+ const wos = benchmark.run_summary.without_skill;
26
98
  const delta = benchmark.run_summary.delta;
27
99
  const deltaColor = delta.pass_rate > 0 ? chalk.green : delta.pass_rate < 0 ? chalk.red : chalk.dim;
28
- console.log(`Delta: ${deltaColor(`${(delta.pass_rate * 100).toFixed(1)}% pass rate`)} | ${delta.time_seconds.toFixed(1)}s time | ${delta.tokens.toFixed(0)} tokens`);
29
- console.log(chalk.dim(`with_skill avg: ${(benchmark.run_summary.with_skill.pass_rate.mean * 100).toFixed(1)}% | without_skill avg: ${(benchmark.run_summary.without_skill.pass_rate.mean * 100).toFixed(1)}%`));
100
+
101
+ console.log(chalk.bold('Summary:'));
102
+ console.log(` Skill pass rate: ${(ws.pass_rate.mean * 100).toFixed(1)}%`);
103
+ console.log(` Baseline pass rate: ${(wos.pass_rate.mean * 100).toFixed(1)}%`);
104
+ console.log(` Improvement: ${deltaColor(`${delta.pass_rate > 0 ? '+' : ''}${(delta.pass_rate * 100).toFixed(1)}%`)}`);
105
+
106
+ if (prev) {
107
+ const prevRate = prev.benchmark.run_summary.with_skill.pass_rate.mean;
108
+ const currRate = ws.pass_rate.mean;
109
+ const change = currRate - prevRate;
110
+ const changeColor = change > 0 ? chalk.green : change < 0 ? chalk.red : chalk.dim;
111
+ console.log(` vs previous: ${changeColor(`${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`)} (was ${(prevRate * 100).toFixed(1)}%)`);
112
+
113
+ // Note if eval set size changed
114
+ const prevEvalCount = prev.gradings.size;
115
+ const currEvalCount = evalRuns.length;
116
+ if (prevEvalCount !== currEvalCount) {
117
+ console.log(chalk.dim(` Note: eval set changed (${prevEvalCount} → ${currEvalCount} evals)`));
118
+ }
119
+ }
30
120
  }
31
121
  }
@@ -6,82 +6,184 @@ import type {
6
6
  EvalsFile,
7
7
  EvalResults,
8
8
  EvalRunResult,
9
+ GradingResult,
9
10
  } from '../types.js';
10
11
  import { WorkspaceManager } from '../engine/workspace.js';
11
12
  import { runEval } from '../engine/runner.js';
12
13
  import { gradeAssertions } from '../engine/grader.js';
13
14
  import { computeBenchmark } from '../engine/aggregator.js';
14
- import { SnapevalError } from '../errors.js';
15
+ import { SnapevalError, FileNotFoundError, ThresholdError } from '../errors.js';
16
+
17
/**
 * Runs async task factories with at most `limit` of them in flight at once.
 *
 * Results are stored at the index of their task, so the returned array keeps
 * the input order regardless of completion order. The worker count is
 * clamped to at least 1: a limit of 0, a negative value or NaN would
 * otherwise start no workers and silently return an array of holes.
 *
 * @param tasks - Factories that each start one unit of work when called.
 * @param limit - Desired maximum concurrency; fractional values are floored.
 * @returns Results in task order.
 * @throws The first rejection raised by any task.
 */
async function runWithConcurrency<T>(
  tasks: (() => Promise<T>)[],
  limit: number,
): Promise<T[]> {
  const results: T[] = new Array(tasks.length);
  if (tasks.length === 0) return results;

  const requested = Number.isNaN(limit) ? 1 : Math.floor(limit);
  const workerCount = Math.min(Math.max(requested, 1), tasks.length);

  // Shared cursor: each worker claims the next unclaimed index synchronously,
  // so no two workers ever run the same task.
  let next = 0;
  const worker = async (): Promise<void> => {
    while (next < tasks.length) {
      const i = next++;
      results[i] = await tasks[i]();
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
32
+
33
+ const MAX_CONCURRENCY = 10;
34
+
35
/**
 * Validates the parsed contents of `evals.json` before any eval is run.
 *
 * Checks that the root and every eval entry are objects, that `skill_name`
 * is a non-empty string, that `evals` is an array, and that each entry has a
 * numeric `id`, string `prompt`, string `expected_output` and, when present,
 * an `assertions` array containing only strings.
 *
 * @param evalsFile - Value produced by `JSON.parse`; not trusted to match its type.
 * @param evalsPath - File path, used only in error messages.
 * @throws SnapevalError describing the first problem found.
 */
function validateEvalsFile(evalsFile: EvalsFile, evalsPath: string): void {
  // JSON.parse may legally return null or a primitive; reject those up front
  // so the user gets a SnapevalError instead of a raw TypeError.
  if (evalsFile === null || typeof evalsFile !== 'object') {
    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: expected a JSON object with "skill_name" and "evals".`);
  }
  if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
  }
  if (!Array.isArray(evalsFile.evals)) {
    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: "evals" must be an array.`);
  }

  for (const [i, evalCase] of evalsFile.evals.entries()) {
    const prefix = `Invalid evals.json at ${evalsPath}: evals[${i}]`;
    if (evalCase === null || typeof evalCase !== 'object') {
      throw new SnapevalError(`${prefix} must be an object.`);
    }
    if (typeof evalCase.id !== 'number') {
      throw new SnapevalError(`${prefix} missing or invalid "id" (must be a number).`);
    }
    if (typeof evalCase.prompt !== 'string') {
      throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "prompt" field.`);
    }
    if (typeof evalCase.expected_output !== 'string') {
      throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "expected_output" field.`);
    }
    if (evalCase.assertions !== undefined) {
      // Element types matter: graders call string methods on every assertion.
      const allStrings =
        Array.isArray(evalCase.assertions) &&
        evalCase.assertions.every((assertion: unknown) => typeof assertion === 'string');
      if (!allStrings) {
        throw new SnapevalError(`${prefix} (id:${evalCase.id}) "assertions" must be an array of strings.`);
      }
    }
  }
}
15
58
 
16
59
  export async function evalCommand(
17
60
  skillPath: string,
18
61
  harness: Harness,
19
62
  inference: InferenceAdapter,
20
- options: { workspace?: string; runs?: number; oldSkill?: string }
63
+ options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number }
21
64
  ): Promise<EvalResults> {
22
65
  const evalsPath = path.join(skillPath, 'evals', 'evals.json');
23
66
  if (!fs.existsSync(evalsPath)) {
24
- throw new SnapevalError(`No evals.json found at ${evalsPath}. Run \`snapeval init\` first.`);
67
+ throw new FileNotFoundError(evalsPath, 'Create evals/evals.json with test scenarios first');
68
+ }
69
+
70
+ let evalsFile: EvalsFile;
71
+ try {
72
+ evalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
73
+ } catch {
74
+ throw new SnapevalError(`Invalid JSON in ${evalsPath}. Check for syntax errors (missing commas, trailing commas, etc).`);
75
+ }
76
+ validateEvalsFile(evalsFile, evalsPath);
77
+
78
+ // Filter to specific eval IDs if --only is provided
79
+ if (options.only && options.only.length > 0) {
80
+ const ids = new Set(options.only);
81
+ const filtered = evalsFile.evals.filter((e) => ids.has(e.id));
82
+ if (filtered.length === 0) {
83
+ throw new SnapevalError(`No eval cases match --only ${options.only.join(',')}. Available IDs: ${evalsFile.evals.map((e) => e.id).join(', ')}`);
84
+ }
85
+ evalsFile = { ...evalsFile, evals: filtered };
25
86
  }
26
87
 
27
- const evalsFile: EvalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
28
88
  const ws = new WorkspaceManager(skillPath, options.workspace);
29
89
  const iterationDir = ws.createIteration();
90
+
91
+ // Track which SKILL.md was used for this iteration
92
+ const skillMdPath = path.join(skillPath, 'SKILL.md');
93
+ if (fs.existsSync(skillMdPath)) {
94
+ fs.copyFileSync(skillMdPath, path.join(iterationDir, 'SKILL.md.snapshot'));
95
+ }
30
96
  const runs = options.runs ?? 1;
97
+ const concurrency = Math.min(Math.max(options.concurrency ?? 1, 1), MAX_CONCURRENCY);
31
98
  const baselineVariant = options.oldSkill ? 'old_skill' : 'without_skill';
32
99
  const scriptsDir = path.join(skillPath, 'evals', 'scripts');
33
100
 
34
- const evalRuns: EvalRunResult[] = [];
35
-
36
- for (const evalCase of evalsFile.evals) {
101
+ // Pre-create eval directories sequentially (filesystem setup)
102
+ const evalDirs = evalsFile.evals.map((evalCase) => {
37
103
  const slug = WorkspaceManager.getEvalSlug(evalCase).replace('eval-', '');
38
- const evalDir = ws.createEvalDir(iterationDir, slug, baselineVariant);
104
+ return { evalCase, slug, evalDir: ws.createEvalDir(iterationDir, slug, baselineVariant) };
105
+ });
39
106
 
107
+ const tasks = evalDirs.map(({ evalCase, slug, evalDir }) => async (): Promise<EvalRunResult> => {
108
+ const assertions = evalCase.assertions ?? [];
109
+ const allGradings: { withSkill: GradingResult | null; withoutSkill: GradingResult | null }[] = [];
40
110
  let lastRun: Awaited<ReturnType<typeof runEval>> | null = null;
111
+
41
112
  for (let i = 0; i < runs; i++) {
42
113
  lastRun = await runEval(evalCase, skillPath, evalDir, harness, options.oldSkill);
114
+
115
+ // Grade every run, not just the last
116
+ const [wsGrading, wosGrading] = await Promise.all([
117
+ gradeAssertions(
118
+ assertions,
119
+ lastRun.withSkill.output,
120
+ path.join(evalDir, 'with_skill'),
121
+ inference,
122
+ fs.existsSync(scriptsDir) ? scriptsDir : undefined,
123
+ ),
124
+ gradeAssertions(
125
+ assertions,
126
+ lastRun.withoutSkill.output,
127
+ path.join(evalDir, baselineVariant),
128
+ inference,
129
+ fs.existsSync(scriptsDir) ? scriptsDir : undefined,
130
+ ),
131
+ ]);
132
+ allGradings.push({ withSkill: wsGrading, withoutSkill: wosGrading });
43
133
  }
44
134
 
45
- if (!lastRun) continue;
135
+ if (!lastRun) {
136
+ throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
137
+ }
46
138
 
47
- const assertions = evalCase.assertions ?? [];
48
- const withSkillGrading = await gradeAssertions(
49
- assertions,
50
- lastRun.withSkill.output,
51
- path.join(evalDir, 'with_skill'),
52
- inference,
53
- fs.existsSync(scriptsDir) ? scriptsDir : undefined,
54
- );
55
- const withoutSkillGrading = await gradeAssertions(
56
- assertions,
57
- lastRun.withoutSkill.output,
58
- path.join(evalDir, baselineVariant),
59
- inference,
60
- fs.existsSync(scriptsDir) ? scriptsDir : undefined,
61
- );
62
-
63
- evalRuns.push({
139
+ // Use the last run's grading as the primary result (written to grading.json)
140
+ // but all gradings contribute to benchmark stats via pass rates
141
+ const lastGrading = allGradings[allGradings.length - 1];
142
+
143
+ return {
64
144
  evalId: evalCase.id,
65
145
  slug,
66
146
  prompt: evalCase.prompt,
67
147
  withSkill: {
68
148
  output: lastRun.withSkill.output,
69
- grading: withSkillGrading ?? undefined,
149
+ grading: lastGrading.withSkill ?? undefined,
70
150
  },
71
151
  withoutSkill: {
72
152
  output: lastRun.withoutSkill.output,
73
- grading: withoutSkillGrading ?? undefined,
153
+ grading: lastGrading.withoutSkill ?? undefined,
74
154
  },
75
- });
76
- }
155
+ };
156
+ });
77
157
 
158
+ const evalRuns = await runWithConcurrency(tasks, concurrency);
78
159
  const benchmark = computeBenchmark(evalRuns);
79
160
 
161
+ // Add iteration metadata for cross-iteration comparison
162
+ const benchmarkWithMeta = {
163
+ ...benchmark,
164
+ metadata: {
165
+ eval_count: evalRuns.length,
166
+ eval_ids: evalRuns.map((r) => r.evalId),
167
+ skill_name: evalsFile.skill_name,
168
+ timestamp: new Date().toISOString(),
169
+ },
170
+ };
171
+
80
172
  fs.writeFileSync(
81
173
  path.join(iterationDir, 'benchmark.json'),
82
- JSON.stringify(benchmark, null, 2)
174
+ JSON.stringify(benchmarkWithMeta, null, 2)
83
175
  );
84
176
 
177
+ // Check threshold if set (for CI gating)
178
+ if (options.threshold !== undefined) {
179
+ const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
180
+ if (passRate < options.threshold) {
181
+ // Still return results so the reporter can display them before the error
182
+ const results = { skillName: evalsFile.skill_name, evalRuns, benchmark, iterationDir };
183
+ throw Object.assign(new ThresholdError(passRate, options.threshold), { results });
184
+ }
185
+ }
186
+
85
187
  return {
86
188
  skillName: evalsFile.skill_name,
87
189
  evalRuns,
@@ -10,7 +10,7 @@ export async function reviewCommand(
10
10
  skillPath: string,
11
11
  harness: Harness,
12
12
  inference: InferenceAdapter,
13
- options: { workspace?: string; runs?: number; oldSkill?: string; noOpen?: boolean }
13
+ options: { workspace?: string; runs?: number; oldSkill?: string; noOpen?: boolean; concurrency?: number }
14
14
  ): Promise<void> {
15
15
  const results = await evalCommand(skillPath, harness, inference, options);
16
16
 
package/src/config.ts CHANGED
@@ -3,10 +3,11 @@ import * as path from 'node:path';
3
3
  import type { SnapevalConfig } from './types.js';
4
4
 
5
5
  export const DEFAULT_CONFIG: SnapevalConfig = {
6
- harness: 'copilot-cli',
6
+ harness: 'copilot-sdk',
7
7
  inference: 'auto',
8
8
  workspace: '../{skill_name}-workspace',
9
9
  runs: 1,
10
+ concurrency: 1,
10
11
  };
11
12
 
12
13
  function loadConfigFile(dirPath: string): Partial<SnapevalConfig> | null {
@@ -8,9 +8,34 @@ import type {
8
8
  AssertionResult,
9
9
  } from '../types.js';
10
10
 
11
/**
 * Recognizes assertions of the form `Output is exactly: "..."` or
 * `Output equals exactly: "..."` (case-insensitive) and captures the quoted
 * text. The `s` flag lets the quoted text span several lines, so multi-line
 * expectations are also graded deterministically.
 */
const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/is;

/**
 * Grades an exact-match assertion without consulting a model.
 *
 * @param assertion - Assertion text as written in the evals file.
 * @param output - Raw output; surrounding whitespace is trimmed before comparing.
 * @returns `null` when the assertion is not an exact-match assertion;
 *          otherwise a result whose evidence shows the expected text and, on
 *          failure, the actual text.
 */
function gradeExactMatch(assertion: string, output: string): AssertionResult | null {
  const expected = EXACT_MATCH_PATTERN.exec(assertion)?.[1];
  if (expected === undefined) return null;

  const actual = output.trim();
  const passed = actual === expected;
  return {
    text: assertion,
    passed,
    evidence: passed
      ? `Exact match: "${expected}"`
      : `Expected: "${expected}"\nGot: "${actual}"`,
  };
}
27
+
11
28
  function buildGradingPrompt(assertions: string[], output: string, files: string[]): string {
12
29
  const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
13
- return `You are a strict eval grader. For each assertion, determine PASS or FAIL based on the output below. Require concrete evidence for a PASS — do not give the benefit of the doubt.
30
+ return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
31
+
32
+ GRADING RULES:
33
+ - PASS if the output satisfies the assertion's intent, even if wording differs slightly.
34
+ - FAIL only if the output clearly does not satisfy the assertion.
35
+ - Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
36
+ - For "contains" assertions: look for semantic presence, not exact substring.
37
+ - For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
38
+ - Always cite specific text from the output as evidence.
14
39
 
15
40
  OUTPUT:
16
41
  ---
@@ -23,7 +48,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
23
48
  Respond with JSON only:
24
49
  {
25
50
  "results": [
26
- {"text": "<assertion text>", "passed": true/false, "evidence": "<quote or reference from output>"}
51
+ {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
27
52
  ]
28
53
  }`;
29
54
  }
@@ -38,18 +63,38 @@ function runScript(
38
63
  return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
39
64
  }
40
65
  try {
41
- const evidence = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
66
+ const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
67
+ const evidence = stdout || `Script passed: ${scriptName}`;
42
68
  return { text: `script:${scriptName}`, passed: true, evidence };
43
69
  } catch (err: any) {
44
- const evidence = err.stdout?.trim() || err.message || 'Script exited with non-zero code';
70
+ // Extract the most useful error info without raw stack traces
71
+ const stderr = err.stderr?.trim();
72
+ const stdout = err.stdout?.trim();
73
+ let evidence: string;
74
+ if (err.code === 'EACCES') {
75
+ evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
76
+ } else if (stderr) {
77
+ // Take only the first line of stderr to avoid stack trace noise
78
+ evidence = stderr.split('\n')[0];
79
+ } else if (stdout) {
80
+ evidence = stdout.split('\n')[0];
81
+ } else {
82
+ evidence = `Script exited with code ${err.status ?? 'unknown'}`;
83
+ }
45
84
  return { text: `script:${scriptName}`, passed: false, evidence };
46
85
  }
47
86
  }
48
87
 
49
88
/**
 * Pulls the JSON payload out of a model response.
 *
 * Tried in order:
 *  1. a fence tagged `json` (any letter case);
 *  2. the whole trimmed text, when it already parses as JSON (this runs
 *     before the generic fence so JSON whose strings contain backticks
 *     is returned intact);
 *  3. the first fenced block of any kind, with a language tag line dropped;
 *  4. the span from the first `{` to the last `}`, when that span parses.
 * When nothing applies, the trimmed text is returned unchanged.
 *
 * @param text - Raw model response.
 * @returns The best candidate JSON string; not guaranteed to be valid JSON.
 */
function extractJSON(text: string): string {
  const parses = (candidate: string): boolean => {
    try {
      JSON.parse(candidate);
      return true;
    } catch {
      return false;
    }
  };

  const jsonFence = /```json\s*([\s\S]*?)```/i.exec(text)?.[1];
  if (jsonFence !== undefined) return jsonFence.trim();

  const trimmed = text.trim();
  if (parses(trimmed)) return trimmed;

  // An optional "<lang>\n" right after the opening fence is a tag, not payload.
  const anyFence = /```(?:[\w+-]+[^\S\n]*\n)?\s*([\s\S]*?)```/.exec(text)?.[1];
  if (anyFence !== undefined) return anyFence.trim();

  // Models often wrap the object in prose; salvage it when it is valid on its own.
  const start = trimmed.indexOf('{');
  const end = trimmed.lastIndexOf('}');
  if (start !== -1 && end > start) {
    const embedded = trimmed.slice(start, end + 1);
    if (parses(embedded)) return embedded;
  }

  return trimmed;
}
54
99
 
55
100
  export async function gradeAssertions(
@@ -62,7 +107,8 @@ export async function gradeAssertions(
62
107
  if (assertions.length === 0) return null;
63
108
 
64
109
  const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
65
- const llmAssertions = assertions.filter(a => !a.startsWith('script:'));
110
+ const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
111
+ const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
66
112
  const results: AssertionResult[] = [];
67
113
 
68
114
  for (const assertion of scriptAssertions) {
@@ -72,6 +118,11 @@ export async function gradeAssertions(
72
118
  results.push(runScript(scriptName, outputDir, dir));
73
119
  }
74
120
 
121
+ for (const assertion of exactAssertions) {
122
+ const result = gradeExactMatch(assertion, output.raw);
123
+ if (result) results.push(result);
124
+ }
125
+
75
126
  if (llmAssertions.length > 0) {
76
127
  const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
77
128
  const response = await inference.chat(