npm - snapeval - Versions diffs - 1.8.0 → 2.1.0 - Mend

snapeval 1.8.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

package/bin/snapeval.ts +30 -24
package/dist/bin/snapeval.js +25 -22
package/dist/bin/snapeval.js.map +1 -1
package/dist/src/adapters/copilot-sdk-client.js +1 -1
package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
package/dist/src/adapters/harness/copilot-sdk.js +101 -0
package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
package/dist/src/adapters/harness/resolve.js +10 -2
package/dist/src/adapters/harness/resolve.js.map +1 -1
package/dist/src/adapters/inference/copilot-sdk.js +4 -1
package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
package/dist/src/adapters/report/terminal.js +89 -9
package/dist/src/adapters/report/terminal.js.map +1 -1
package/dist/src/commands/eval.d.ts +3 -0
package/dist/src/commands/eval.js +106 -17
package/dist/src/commands/eval.js.map +1 -1
package/dist/src/commands/review.d.ts +1 -0
package/dist/src/commands/review.js.map +1 -1
package/dist/src/config.js +2 -1
package/dist/src/config.js.map +1 -1
package/dist/src/engine/grader.js +67 -9
package/dist/src/engine/grader.js.map +1 -1
package/dist/src/engine/runner.js +14 -12
package/dist/src/engine/runner.js.map +1 -1
package/dist/src/errors.d.ts +6 -0
package/dist/src/errors.js +21 -3
package/dist/src/errors.js.map +1 -1
package/dist/src/types.d.ts +1 -0
package/package.json +4 -1
package/plugin.json +1 -1
package/skills/snapeval/SKILL.md +33 -18
package/src/adapters/copilot-sdk-client.ts +1 -1
package/src/adapters/harness/copilot-sdk.ts +126 -0
package/src/adapters/harness/resolve.ts +13 -2
package/src/adapters/inference/copilot-sdk.ts +5 -1
package/src/adapters/report/terminal.ts +100 -10
package/src/commands/eval.ts +133 -31
package/src/commands/review.ts +1 -1
package/src/config.ts +2 -1
package/src/engine/grader.ts +59 -8
package/src/engine/runner.ts +14 -13
package/src/errors.ts +24 -3
package/src/types.ts +1 -0
package/dist/src/commands/init.d.ts +0 -2
package/dist/src/commands/init.js +0 -27
package/dist/src/commands/init.js.map +0 -1
package/dist/src/engine/generator.d.ts +0 -3
package/dist/src/engine/generator.js +0 -51
package/dist/src/engine/generator.js.map +0 -1
package/src/commands/init.ts +0 -38
package/src/engine/generator.ts +0 -60

package/src/adapters/harness/copilot-sdk.ts ADDED Viewed

@@ -0,0 +1,126 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import type { Harness, HarnessRunResult } from '../../types.js';
+import { getClient, isSDKInstalled } from '../copilot-sdk-client.js';
+/**
+ * Harness that runs an eval prompt through the client returned by
+ * `getClient()` (the optional `@github/copilot-sdk` dependency).
+ *
+ * Every call to `run` creates its own session with `options.outputDir` as the
+ * working directory, sends the prompt once, and disconnects the session
+ * afterwards whether or not the prompt succeeded.
+ */
+export class CopilotSDKHarness implements Harness {
+  readonly name = 'copilot-sdk';
+  /**
+   * Execute one prompt in a fresh session and collect the result.
+   *
+   * @param options.skillPath - Optional skill location; when set it becomes the
+   *   single entry of the session's `skillDirectories`.
+   * @param options.prompt - Prompt text sent to the session.
+   * @param options.files - Optional input files; each one is copied into
+   *   `outputDir` and attached to the prompt.
+   * @param options.outputDir - Created recursively if missing and used as the
+   *   session working directory.
+   * @returns The trimmed response content, a text transcript built from the
+   *   session events, the summed token usage and the elapsed wall-clock time.
+   *   `files` is always returned as an empty array by this harness.
+   */
+  async run(options: {
+    skillPath?: string;
+    prompt: string;
+    files?: string[];
+    outputDir: string;
+  }): Promise<HarnessRunResult> {
+    // The clock starts before client/session setup, so duration_ms includes it.
+    const startMs = Date.now();
+    const client = await getClient();
+    fs.mkdirSync(options.outputDir, { recursive: true });
+    // Dynamically import SDK for approveAll
+    // @ts-ignore — module may not be installed (optional dep)
+    const { approveAll } = await import('@github/copilot-sdk');
+    // Build session config
+    const sessionConfig: Record<string, unknown> = {
+      model: 'gpt-4.1',
+      onPermissionRequest: approveAll,
+      workingDirectory: options.outputDir,
+      infiniteSessions: { enabled: false },
+    };
+    // Native skill loading via skillDirectories.
+    // NOTE(review): the original comment said "point skillDirectories at the
+    // skill's parent", but the value passed is options.skillPath itself —
+    // confirm which directory level the SDK expects.
+    if (options.skillPath) {
+      sessionConfig.skillDirectories = [options.skillPath];
+    }
+    const session = await client.createSession(sessionConfig);
+    try {
+      // Attach input files if provided
+      const attachments: Array<{ type: string; path: string; displayName?: string }> = [];
+      if (options.files) {
+        for (const file of options.files) {
+          // Copy to outputDir for script assertions, and attach for the model.
+          // The destination uses only the basename, so two inputs sharing a
+          // basename overwrite each other in outputDir.
+          const dest = path.join(options.outputDir, path.basename(file));
+          fs.copyFileSync(file, dest);
+          attachments.push({ type: 'file', path: dest, displayName: path.basename(file) });
+        }
+      }
+      const response = await session.sendAndWait(
+        {
+          prompt: options.prompt,
+          // Only include the attachments key when there is something to attach.
+          ...(attachments.length > 0 ? { attachments } : {}),
+        },
+        300_000, // 5 min timeout — calibrated for complex eval prompts
+      );
+      // A missing response or missing content is treated as empty output.
+      const raw = response?.data?.content ?? '';
+      // Collect full transcript from session events
+      const events = await session.getMessages();
+      const transcript = buildTranscript(events);
+      // Extract token count from events if available
+      const totalTokens = extractTokenCount(events);
+      const durationMs = Date.now() - startMs;
+      return {
+        raw: raw.trim(),
+        transcript,
+        files: [],
+        total_tokens: totalTokens,
+        duration_ms: durationMs,
+      };
+    } finally {
+      // Runs on success and on failure so the session is never left connected.
+      await session.disconnect();
+    }
+  }
+  /** Report availability by delegating to `isSDKInstalled()`. */
+  async isAvailable(): Promise<boolean> {
+    return isSDKInstalled();
+  }
+}
+/**
+ * Render session events as a plain-text transcript, one line per recognised
+ * event, joined with newlines.
+ *
+ * Recognised event types are `user.message`, `assistant.message`,
+ * `tool.execution_start`, `tool.execution_complete`, `skill.invoked` and
+ * `session.error`; every other event type is skipped. Missing fields fall back
+ * to an empty string or `'unknown'`.
+ *
+ * @param events - Session events in the order they should appear.
+ * @returns The transcript text (empty string when nothing was recognised).
+ */
+function buildTranscript(events: any[]): string {
+  const lines: string[] = [];
+  for (const event of events) {
+    switch (event.type) {
+      case 'user.message':
+        lines.push(`[user] ${event.data?.content ?? ''}`);
+        break;
+      case 'assistant.message':
+        lines.push(`[assistant] ${event.data?.content ?? ''}`);
+        break;
+      case 'tool.execution_start':
+        lines.push(`[tool:start] ${event.data?.toolName ?? 'unknown'}(${JSON.stringify(event.data?.arguments ?? {})})`);
+        break;
+      case 'tool.execution_complete': {
+        // The result payload is untyped and may be a structured value rather
+        // than a string. Serialise it first so it is actually truncated and
+        // never rendered as "[object Object]".
+        const result = event.data?.result ?? '';
+        const resultText =
+          typeof result === 'string' ? result : (JSON.stringify(result) ?? String(result));
+        lines.push(`[tool:done] ${event.data?.toolName ?? 'unknown'} → ${truncate(resultText, 200)}`);
+        break;
+      }
+      case 'skill.invoked':
+        lines.push(`[skill] ${event.data?.name ?? 'unknown'} (${event.data?.path ?? ''})`);
+        break;
+      case 'session.error':
+        lines.push(`[error] ${event.data?.message ?? ''}`);
+        break;
+    }
+  }
+  return lines.join('\n');
+}
+/**
+ * Add up the input and output token counts of every `assistant.usage` event.
+ * A missing count contributes 0; events of any other type are ignored.
+ *
+ * @param events - Session events to scan.
+ * @returns The combined token total (0 when there are no usage events).
+ */
+function extractTokenCount(events: any[]): number {
+  return events.reduce((sum: number, evt) => {
+    if (evt.type !== 'assistant.usage') {
+      return sum;
+    }
+    const used = (evt.data?.inputTokens ?? 0) + (evt.data?.outputTokens ?? 0);
+    return sum + used;
+  }, 0);
+}
+/**
+ * Shorten `str` to its first `max` characters followed by `'...'` when it is
+ * longer than `max`; otherwise return it unchanged.
+ */
+function truncate(str: string, max: number): string {
+  if (str.length > max) {
+    const head = str.slice(0, max);
+    return head + '...';
+  }
+  return str;
+}

package/src/adapters/harness/resolve.ts CHANGED Viewed

@@ -1,10 +1,21 @@
 import type { Harness } from '../../types.js';
 import { CopilotCLIHarness } from './copilot-cli.js';
-import { SnapevalError } from '../../errors.js';
+import { CopilotSDKHarness } from './copilot-sdk.js';
+import { AdapterNotAvailableError, SnapevalError } from '../../errors.js';
+import { isSDKInstalled } from '../copilot-sdk-client.js';
 export function resolveHarness(name: string): Harness {
+  if (name === 'copilot-sdk') {
+    if (!isSDKInstalled()) {
+      throw new AdapterNotAvailableError(
+        'copilot-sdk',
+        '@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
+      );
+    }
+    return new CopilotSDKHarness();
+  }
   if (name === 'copilot-cli') {
     return new CopilotCLIHarness();
   }
-  throw new SnapevalError(`Unknown harness "${name}". Built-in options: copilot-cli.`);
+  throw new SnapevalError(`Unknown harness "${name}". Built-in options: copilot-sdk, copilot-cli.`);
 }

package/src/adapters/inference/copilot-sdk.ts CHANGED Viewed

@@ -7,6 +7,9 @@ export class CopilotSDKInference implements InferenceAdapter {
   async chat(messages: Message[], _options?: ChatOptions): Promise<string> {
     const client = await getClient();
+    // @ts-ignore — module may not be installed (optional dep)
+    const { approveAll } = await import('@github/copilot-sdk');
     const systemMessages = messages.filter((m) => m.role === 'system');
     const nonSystemMessages = messages.filter((m) => m.role !== 'system');
     const systemContent = systemMessages.map((m) => m.content).join('\n');
@@ -17,7 +20,8 @@ export class CopilotSDKInference implements InferenceAdapter {
       ...(systemContent
         ? { systemMessage: { content: systemContent } }
         : {}),
-      onPermissionRequest: async () => ({ kind: 'approved' }),
+      onPermissionRequest: approveAll,
+      infiniteSessions: { enabled: false },
     });
     try {

package/src/adapters/report/terminal.ts CHANGED Viewed

@@ -1,5 +1,45 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
 import chalk from 'chalk';
-import type { ReportAdapter, EvalResults } from '../../types.js';
+import type { ReportAdapter, EvalResults, BenchmarkData, GradingResult } from '../../types.js';
+/** Results read back from the iteration directory numbered one below the current one. */
+interface PreviousIteration {
+  /** Parsed contents of that iteration's `benchmark.json`. */
+  benchmark: BenchmarkData;
+  /**
+   * Parsed `grading.json` files keyed by `eval-*` directory name. A side is
+   * undefined when its `grading.json` does not exist.
+   */
+  gradings: Map<string, { withSkill?: GradingResult; withoutSkill?: GradingResult }>;
+}
+/**
+ * Load the benchmark and per-eval gradings of the iteration that precedes
+ * `iterationDir`.
+ *
+ * The current iteration number is parsed from the directory's base name after
+ * removing `iteration-`; the previous iteration is the sibling directory
+ * `iteration-<n - 1>`.
+ *
+ * @param iterationDir - Path of the current iteration directory.
+ * @returns The previous iteration's data, or `null` when the number cannot be
+ *   parsed, is 1 or lower, the previous `benchmark.json` does not exist, or
+ *   reading/parsing any of the files throws.
+ */
+function loadPreviousIteration(iterationDir: string): PreviousIteration | null {
+  const iterationNumber = parseInt(path.basename(iterationDir).replace('iteration-', ''), 10);
+  // `NaN > 1` is false, so this single test rejects both unparsable names and
+  // iterations that have no predecessor.
+  if (!(iterationNumber > 1)) return null;
+  const previousDir = path.join(path.dirname(iterationDir), `iteration-${iterationNumber - 1}`);
+  const benchmarkFile = path.join(previousDir, 'benchmark.json');
+  if (!fs.existsSync(benchmarkFile)) return null;
+  // Parse a grading file when present; a missing file maps to undefined.
+  const readGrading = (file: string): GradingResult | undefined =>
+    fs.existsSync(file) ? JSON.parse(fs.readFileSync(file, 'utf-8')) : undefined;
+  try {
+    const benchmark = JSON.parse(fs.readFileSync(benchmarkFile, 'utf-8'));
+    const gradings = new Map<string, { withSkill?: GradingResult; withoutSkill?: GradingResult }>();
+    for (const entry of fs.readdirSync(previousDir)) {
+      if (!entry.startsWith('eval-')) continue;
+      const withSkill = readGrading(path.join(previousDir, entry, 'with_skill', 'grading.json'));
+      const withoutSkill = readGrading(path.join(previousDir, entry, 'without_skill', 'grading.json'));
+      gradings.set(entry, { withSkill, withoutSkill });
+    }
+    return { benchmark, gradings };
+  } catch {
+    // Any unreadable or malformed file means there is no usable previous iteration.
+    return null;
+  }
+}
+/**
+ * Choose a short, readable label for an eval run.
+ *
+ * The slug is used when it is non-empty and differs from the stringified
+ * `evalId`; otherwise the label is the first 60 characters of the prompt's
+ * first line.
+ */
+function evalLabel(run: { evalId: number; slug: string; prompt: string }): string {
+  const slugIsDescriptive = Boolean(run.slug) && run.slug !== `${run.evalId}`;
+  if (slugIsDescriptive) {
+    return run.slug;
+  }
+  const [firstLine] = run.prompt.split('\n');
+  return firstLine.slice(0, 60);
+}
 export class TerminalReporter implements ReportAdapter {
   readonly name = 'terminal';
@@ -8,24 +48,74 @@ export class TerminalReporter implements ReportAdapter {
     const { skillName, evalRuns, benchmark } = results;
     console.log(chalk.bold(`\nsnapeval — ${skillName}`));
-    console.log(chalk.dim('─'.repeat(50)));
+    console.log(chalk.dim(`Baseline = without SKILL.md (raw AI response)`));
+    console.log(chalk.dim('─'.repeat(60)));
+    const prev = loadPreviousIteration(results.iterationDir);
     for (const run of evalRuns) {
-      const wsRate = run.withSkill.grading?.summary.pass_rate;
+      const wsGrading = run.withSkill.grading;
+      const wsRate = wsGrading?.summary.pass_rate;
       const wosRate = run.withoutSkill.grading?.summary.pass_rate;
       const wsLabel = wsRate !== undefined ? `${(wsRate * 100).toFixed(0)}%` : 'n/a';
       const wosLabel = wosRate !== undefined ? `${(wosRate * 100).toFixed(0)}%` : 'n/a';
-      const tokens = run.withSkill.output.total_tokens;
-      const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(2);
-      console.log(`  ${chalk.cyan(`#${run.evalId}`)} ${run.prompt.slice(0, 60)}`);
-      console.log(`    with_skill: ${wsLabel} | without_skill: ${wosLabel} | ${tokens} tokens, ${durationS}s`);
+      const wsColor = wsRate === 1 ? chalk.green : wsRate === 0 ? chalk.red : chalk.yellow;
+      const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(1);
+      // Show per-eval delta from previous iteration
+      let perEvalDelta = '';
+      if (prev) {
+        const prevGrading = prev.gradings.get(`eval-${run.slug}`);
+        const prevRate = prevGrading?.withSkill?.summary.pass_rate;
+        if (prevRate !== undefined && wsRate !== undefined) {
+          const change = wsRate - prevRate;
+          if (change !== 0) {
+            const arrow = change > 0 ? chalk.green('↑') : chalk.red('↓');
+            perEvalDelta = ` ${arrow} was ${(prevRate * 100).toFixed(0)}%`;
+          }
+        }
+      }
+      console.log(`  ${chalk.cyan(`#${run.evalId}`)} ${evalLabel(run)}`);
+      console.log(`    Skill: ${wsColor(wsLabel)}${perEvalDelta} | Baseline: ${wosLabel} | ${durationS}s`);
+      // Show failed assertions inline
+      if (wsGrading) {
+        const failed = wsGrading.assertion_results.filter((a) => !a.passed);
+        for (const f of failed) {
+          console.log(chalk.red(`    FAIL: ${f.text}`));
+          if (f.evidence) {
+            console.log(chalk.dim(`          ${f.evidence.slice(0, 100)}`));
+          }
+        }
+      }
     }
-    console.log(chalk.dim('─'.repeat(50)));
+    console.log(chalk.dim('─'.repeat(60)));
+    const ws = benchmark.run_summary.with_skill;
+    const wos = benchmark.run_summary.without_skill;
     const delta = benchmark.run_summary.delta;
     const deltaColor = delta.pass_rate > 0 ? chalk.green : delta.pass_rate < 0 ? chalk.red : chalk.dim;
-    console.log(`Delta: ${deltaColor(`${(delta.pass_rate * 100).toFixed(1)}% pass rate`)} | ${delta.time_seconds.toFixed(1)}s time | ${delta.tokens.toFixed(0)} tokens`);
-    console.log(chalk.dim(`with_skill avg: ${(benchmark.run_summary.with_skill.pass_rate.mean * 100).toFixed(1)}% | without_skill avg: ${(benchmark.run_summary.without_skill.pass_rate.mean * 100).toFixed(1)}%`));
+    console.log(chalk.bold('Summary:'));
+    console.log(`  Skill pass rate:    ${(ws.pass_rate.mean * 100).toFixed(1)}%`);
+    console.log(`  Baseline pass rate: ${(wos.pass_rate.mean * 100).toFixed(1)}%`);
+    console.log(`  Improvement:        ${deltaColor(`${delta.pass_rate > 0 ? '+' : ''}${(delta.pass_rate * 100).toFixed(1)}%`)}`);
+    if (prev) {
+      const prevRate = prev.benchmark.run_summary.with_skill.pass_rate.mean;
+      const currRate = ws.pass_rate.mean;
+      const change = currRate - prevRate;
+      const changeColor = change > 0 ? chalk.green : change < 0 ? chalk.red : chalk.dim;
+      console.log(`  vs previous:        ${changeColor(`${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`)} (was ${(prevRate * 100).toFixed(1)}%)`);
+      // Note if eval set size changed
+      const prevEvalCount = prev.gradings.size;
+      const currEvalCount = evalRuns.length;
+      if (prevEvalCount !== currEvalCount) {
+        console.log(chalk.dim(`  Note: eval set changed (${prevEvalCount} → ${currEvalCount} evals)`));
+      }
+    }
   }
 }

package/src/commands/eval.ts CHANGED Viewed

@@ -6,82 +6,184 @@ import type {
   EvalsFile,
   EvalResults,
   EvalRunResult,
+  GradingResult,
 } from '../types.js';
 import { WorkspaceManager } from '../engine/workspace.js';
 import { runEval } from '../engine/runner.js';
 import { gradeAssertions } from '../engine/grader.js';
 import { computeBenchmark } from '../engine/aggregator.js';
-import { SnapevalError } from '../errors.js';
+import { SnapevalError, FileNotFoundError, ThresholdError } from '../errors.js';
+/**
+ * Run async task factories with at most `limit` of them in flight at once and
+ * return their results in the same order as `tasks`.
+ *
+ * Workers pull the next unstarted task from a shared cursor, so a slow task
+ * only occupies one worker while the others keep draining the list.
+ *
+ * @param tasks - Factories that start the work when invoked.
+ * @param limit - Maximum number of tasks running concurrently. Values below 1
+ *   (including 0, negatives and NaN) are treated as 1 so that every task still
+ *   runs; fractional values are rounded down.
+ * @returns Results positioned by task index.
+ * @throws Rejects with the first task rejection; workers that are mid-task at
+ *   that point are not cancelled.
+ */
+async function runWithConcurrency<T>(
+  tasks: (() => Promise<T>)[],
+  limit: number,
+): Promise<T[]> {
+  const results: T[] = new Array(tasks.length);
+  // Without this clamp a limit below 1 (or NaN) spawns zero workers and the
+  // function resolves to an array of holes without running anything.
+  const requested = Number.isNaN(limit) ? 1 : Math.max(Math.floor(limit), 1);
+  const workerCount = Math.min(requested, tasks.length);
+  let nextIndex = 0;
+  async function worker(): Promise<void> {
+    while (nextIndex < tasks.length) {
+      // Claim the index synchronously before awaiting so no two workers
+      // ever run the same task.
+      const i = nextIndex++;
+      results[i] = await tasks[i]();
+    }
+  }
+  await Promise.all(Array.from({ length: workerCount }, () => worker()));
+  return results;
+}
+/** Upper bound for the number of eval cases run in parallel. */
+const MAX_CONCURRENCY = 10;
+/**
+ * Check that a parsed `evals.json` has the shape the eval command relies on.
+ *
+ * Requires a non-empty string `skill_name` and an `evals` array whose entries
+ * each have a numeric `id`, a string `prompt`, a string `expected_output` and,
+ * when present, an `assertions` array containing only strings.
+ *
+ * @param evalsFile - Value produced by `JSON.parse`; its declared type is not
+ *   trusted here.
+ * @param evalsPath - File path, used only in error messages.
+ * @throws SnapevalError describing the first problem found.
+ */
+function validateEvalsFile(evalsFile: EvalsFile, evalsPath: string): void {
+  // JSON.parse can legally return null or a primitive; reject those up front
+  // so the user gets a SnapevalError instead of a raw TypeError.
+  if (evalsFile === null || typeof evalsFile !== 'object') {
+    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: top-level value must be an object.`);
+  }
+  if (!evalsFile.skill_name || typeof evalsFile.skill_name !== 'string') {
+    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: missing or invalid "skill_name" field.`);
+  }
+  if (!Array.isArray(evalsFile.evals)) {
+    throw new SnapevalError(`Invalid evals.json at ${evalsPath}: "evals" must be an array.`);
+  }
+  for (const [i, evalCase] of evalsFile.evals.entries()) {
+    const prefix = `Invalid evals.json at ${evalsPath}: evals[${i}]`;
+    // Entries such as null or a bare string would otherwise crash on property access.
+    if (evalCase === null || typeof evalCase !== 'object') {
+      throw new SnapevalError(`${prefix} must be an object.`);
+    }
+    if (typeof evalCase.id !== 'number') {
+      throw new SnapevalError(`${prefix} missing or invalid "id" (must be a number).`);
+    }
+    if (typeof evalCase.prompt !== 'string') {
+      throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "prompt" field.`);
+    }
+    if (typeof evalCase.expected_output !== 'string') {
+      throw new SnapevalError(`${prefix} (id:${evalCase.id}) missing "expected_output" field.`);
+    }
+    if (evalCase.assertions !== undefined) {
+      // Enforce the "array of strings" promise made by the error message.
+      const allStrings =
+        Array.isArray(evalCase.assertions) &&
+        evalCase.assertions.every((assertion: unknown) => typeof assertion === 'string');
+      if (!allStrings) {
+        throw new SnapevalError(`${prefix} (id:${evalCase.id}) "assertions" must be an array of strings.`);
+      }
+    }
+  }
+}
 export async function evalCommand(
   skillPath: string,
   harness: Harness,
   inference: InferenceAdapter,
-  options: { workspace?: string; runs?: number; oldSkill?: string }
+  options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number }
 ): Promise<EvalResults> {
   const evalsPath = path.join(skillPath, 'evals', 'evals.json');
   if (!fs.existsSync(evalsPath)) {
-    throw new SnapevalError(`No evals.json found at ${evalsPath}. Run \`snapeval init\` first.`);
+    throw new FileNotFoundError(evalsPath, 'Create evals/evals.json with test scenarios first');
+  }
+  let evalsFile: EvalsFile;
+  try {
+    evalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
+  } catch {
+    throw new SnapevalError(`Invalid JSON in ${evalsPath}. Check for syntax errors (missing commas, trailing commas, etc).`);
+  }
+  validateEvalsFile(evalsFile, evalsPath);
+  // Filter to specific eval IDs if --only is provided
+  if (options.only && options.only.length > 0) {
+    const ids = new Set(options.only);
+    const filtered = evalsFile.evals.filter((e) => ids.has(e.id));
+    if (filtered.length === 0) {
+      throw new SnapevalError(`No eval cases match --only ${options.only.join(',')}. Available IDs: ${evalsFile.evals.map((e) => e.id).join(', ')}`);
+    }
+    evalsFile = { ...evalsFile, evals: filtered };
   }
-  const evalsFile: EvalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
   const ws = new WorkspaceManager(skillPath, options.workspace);
   const iterationDir = ws.createIteration();
+  // Track which SKILL.md was used for this iteration
+  const skillMdPath = path.join(skillPath, 'SKILL.md');
+  if (fs.existsSync(skillMdPath)) {
+    fs.copyFileSync(skillMdPath, path.join(iterationDir, 'SKILL.md.snapshot'));
+  }
   const runs = options.runs ?? 1;
+  const concurrency = Math.min(Math.max(options.concurrency ?? 1, 1), MAX_CONCURRENCY);
   const baselineVariant = options.oldSkill ? 'old_skill' : 'without_skill';
   const scriptsDir = path.join(skillPath, 'evals', 'scripts');
-  const evalRuns: EvalRunResult[] = [];
-  for (const evalCase of evalsFile.evals) {
+  // Pre-create eval directories sequentially (filesystem setup)
+  const evalDirs = evalsFile.evals.map((evalCase) => {
     const slug = WorkspaceManager.getEvalSlug(evalCase).replace('eval-', '');
-    const evalDir = ws.createEvalDir(iterationDir, slug, baselineVariant);
+    return { evalCase, slug, evalDir: ws.createEvalDir(iterationDir, slug, baselineVariant) };
+  });
+  const tasks = evalDirs.map(({ evalCase, slug, evalDir }) => async (): Promise<EvalRunResult> => {
+    const assertions = evalCase.assertions ?? [];
+    const allGradings: { withSkill: GradingResult | null; withoutSkill: GradingResult | null }[] = [];
     let lastRun: Awaited<ReturnType<typeof runEval>> | null = null;
     for (let i = 0; i < runs; i++) {
       lastRun = await runEval(evalCase, skillPath, evalDir, harness, options.oldSkill);
+      // Grade every run, not just the last
+      const [wsGrading, wosGrading] = await Promise.all([
+        gradeAssertions(
+          assertions,
+          lastRun.withSkill.output,
+          path.join(evalDir, 'with_skill'),
+          inference,
+          fs.existsSync(scriptsDir) ? scriptsDir : undefined,
+        ),
+        gradeAssertions(
+          assertions,
+          lastRun.withoutSkill.output,
+          path.join(evalDir, baselineVariant),
+          inference,
+          fs.existsSync(scriptsDir) ? scriptsDir : undefined,
+        ),
+      ]);
+      allGradings.push({ withSkill: wsGrading, withoutSkill: wosGrading });
     }
-    if (!lastRun) continue;
+    if (!lastRun) {
+      throw new SnapevalError(`No runs completed for eval ${evalCase.id}`);
+    }
-    const assertions = evalCase.assertions ?? [];
-    const withSkillGrading = await gradeAssertions(
-      assertions,
-      lastRun.withSkill.output,
-      path.join(evalDir, 'with_skill'),
-      inference,
-      fs.existsSync(scriptsDir) ? scriptsDir : undefined,
-    );
-    const withoutSkillGrading = await gradeAssertions(
-      assertions,
-      lastRun.withoutSkill.output,
-      path.join(evalDir, baselineVariant),
-      inference,
-      fs.existsSync(scriptsDir) ? scriptsDir : undefined,
-    );
-    evalRuns.push({
+    // Use the last run's grading as the primary result (written to grading.json)
+    // but all gradings contribute to benchmark stats via pass rates
+    const lastGrading = allGradings[allGradings.length - 1];
+    return {
       evalId: evalCase.id,
       slug,
       prompt: evalCase.prompt,
       withSkill: {
         output: lastRun.withSkill.output,
-        grading: withSkillGrading ?? undefined,
+        grading: lastGrading.withSkill ?? undefined,
       },
       withoutSkill: {
         output: lastRun.withoutSkill.output,
-        grading: withoutSkillGrading ?? undefined,
+        grading: lastGrading.withoutSkill ?? undefined,
       },
-    });
-  }
+    };
+  });
+  const evalRuns = await runWithConcurrency(tasks, concurrency);
   const benchmark = computeBenchmark(evalRuns);
+  // Add iteration metadata for cross-iteration comparison
+  const benchmarkWithMeta = {
+    ...benchmark,
+    metadata: {
+      eval_count: evalRuns.length,
+      eval_ids: evalRuns.map((r) => r.evalId),
+      skill_name: evalsFile.skill_name,
+      timestamp: new Date().toISOString(),
+    },
+  };
   fs.writeFileSync(
     path.join(iterationDir, 'benchmark.json'),
-    JSON.stringify(benchmark, null, 2)
+    JSON.stringify(benchmarkWithMeta, null, 2)
   );
+  // Check threshold if set (for CI gating)
+  if (options.threshold !== undefined) {
+    const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
+    if (passRate < options.threshold) {
+      // Still return results so the reporter can display them before the error
+      const results = { skillName: evalsFile.skill_name, evalRuns, benchmark, iterationDir };
+      throw Object.assign(new ThresholdError(passRate, options.threshold), { results });
+    }
+  }
   return {
     skillName: evalsFile.skill_name,
     evalRuns,

package/src/commands/review.ts CHANGED Viewed

@@ -10,7 +10,7 @@ export async function reviewCommand(
   skillPath: string,
   harness: Harness,
   inference: InferenceAdapter,
-  options: { workspace?: string; runs?: number; oldSkill?: string; noOpen?: boolean }
+  options: { workspace?: string; runs?: number; oldSkill?: string; noOpen?: boolean; concurrency?: number }
 ): Promise<void> {
   const results = await evalCommand(skillPath, harness, inference, options);

package/src/config.ts CHANGED Viewed

@@ -3,10 +3,11 @@ import * as path from 'node:path';
 import type { SnapevalConfig } from './types.js';
 export const DEFAULT_CONFIG: SnapevalConfig = {
-  harness: 'copilot-cli',
+  harness: 'copilot-sdk',
   inference: 'auto',
   workspace: '../{skill_name}-workspace',
   runs: 1,
+  concurrency: 1,
 };
 function loadConfigFile(dirPath: string): Partial<SnapevalConfig> | null {

package/src/engine/grader.ts CHANGED Viewed

@@ -8,9 +8,34 @@ import type {
   AssertionResult,
 } from '../types.js';
+/**
+ * Matches assertions of the form `Output is exactly: "<text>"` or
+ * `Output equals exactly: "<text>"` (case-insensitive) and captures `<text>`.
+ */
+const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/i;
+/**
+ * Grade an exact-match assertion by plain string comparison.
+ *
+ * @param assertion - Assertion text to test against the exact-match pattern.
+ * @param output - Raw output; surrounding whitespace is trimmed before comparing.
+ * @returns `null` when the assertion is not an exact-match assertion, otherwise
+ *   a result whose evidence shows the expected text and, on failure, the
+ *   actual text as well.
+ */
+function gradeExactMatch(assertion: string, output: string): AssertionResult | null {
+  const parsed = EXACT_MATCH_PATTERN.exec(assertion);
+  if (parsed === null) {
+    return null;
+  }
+  const [, expected] = parsed;
+  const actual = output.trim();
+  if (actual === expected) {
+    return { text: assertion, passed: true, evidence: `Exact match: "${expected}"` };
+  }
+  return {
+    text: assertion,
+    passed: false,
+    evidence: `Expected: "${expected}"\nGot: "${actual}"`,
+  };
+}
 function buildGradingPrompt(assertions: string[], output: string, files: string[]): string {
   const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
-  return `You are a strict eval grader. For each assertion, determine PASS or FAIL based on the output below. Require concrete evidence for a PASS — do not give the benefit of the doubt.
+  return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
+GRADING RULES:
+- PASS if the output satisfies the assertion's intent, even if wording differs slightly.
+- FAIL only if the output clearly does not satisfy the assertion.
+- Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
+- For "contains" assertions: look for semantic presence, not exact substring.
+- For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
+- Always cite specific text from the output as evidence.
 OUTPUT:
 ---
@@ -23,7 +48,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
 Respond with JSON only:
 {
   "results": [
-    {"text": "<assertion text>", "passed": true/false, "evidence": "<quote or reference from output>"}
+    {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
   ]
 }`;
 }
@@ -38,18 +63,38 @@ function runScript(
     return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
   }
   try {
-    const evidence = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
+    const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
+    const evidence = stdout || `Script passed: ${scriptName}`;
     return { text: `script:${scriptName}`, passed: true, evidence };
   } catch (err: any) {
-    const evidence = err.stdout?.trim() || err.message || 'Script exited with non-zero code';
+    // Extract the most useful error info without raw stack traces
+    const stderr = err.stderr?.trim();
+    const stdout = err.stdout?.trim();
+    let evidence: string;
+    if (err.code === 'EACCES') {
+      evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
+    } else if (stderr) {
+      // Take only the first line of stderr to avoid stack trace noise
+      evidence = stderr.split('\n')[0];
+    } else if (stdout) {
+      evidence = stdout.split('\n')[0];
+    } else {
+      evidence = `Script exited with code ${err.status ?? 'unknown'}`;
+    }
     return { text: `script:${scriptName}`, passed: false, evidence };
   }
 }
 function extractJSON(text: string): string {
-  const match = text.match(/```(?:json)?\s*([\s\S]*?)```/);
-  if (match) return match[1].trim();
-  return text.trim();
+  // Try JSON-tagged fence first, then bare fence, then raw text
+  const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
+  if (jsonFence) return jsonFence[1].trim();
+  // Try parsing raw text as JSON before falling back to any fence
+  const trimmed = text.trim();
+  try { JSON.parse(trimmed); return trimmed; } catch { /* not raw JSON */ }
+  const anyFence = text.match(/```\s*([\s\S]*?)```/);
+  if (anyFence) return anyFence[1].trim();
+  return trimmed;
 }
 export async function gradeAssertions(
@@ -62,7 +107,8 @@ export async function gradeAssertions(
   if (assertions.length === 0) return null;
   const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
-  const llmAssertions = assertions.filter(a => !a.startsWith('script:'));
+  const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
+  const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
   const results: AssertionResult[] = [];
   for (const assertion of scriptAssertions) {
@@ -72,6 +118,11 @@ export async function gradeAssertions(
     results.push(runScript(scriptName, outputDir, dir));
   }
+  for (const assertion of exactAssertions) {
+    const result = gradeExactMatch(assertion, output.raw);
+    if (result) results.push(result);
+  }
   if (llmAssertions.length > 0) {
     const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
     const response = await inference.chat(