npm - snapeval - Versions diffs - 2.0.0 → 2.1.1 - Mend

snapeval 2.0.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/README.md +144 -104
package/bin/snapeval.ts +39 -1
package/dist/bin/snapeval.js +33 -0
package/dist/bin/snapeval.js.map +1 -1
package/dist/src/adapters/copilot-sdk-client.js +3 -1
package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
package/dist/src/adapters/harness/copilot-sdk.js +101 -0
package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
package/dist/src/adapters/harness/resolve.js +10 -2
package/dist/src/adapters/harness/resolve.js.map +1 -1
package/dist/src/adapters/inference/copilot-sdk.js +4 -1
package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
package/dist/src/adapters/report/terminal.js +89 -9
package/dist/src/adapters/report/terminal.js.map +1 -1
package/dist/src/commands/eval.d.ts +3 -0
package/dist/src/commands/eval.js +146 -17
package/dist/src/commands/eval.js.map +1 -1
package/dist/src/commands/review.d.ts +1 -0
package/dist/src/commands/review.js.map +1 -1
package/dist/src/config.js +2 -1
package/dist/src/config.js.map +1 -1
package/dist/src/engine/grader.js +67 -9
package/dist/src/engine/grader.js.map +1 -1
package/dist/src/engine/runner.d.ts +1 -0
package/dist/src/engine/runner.js +15 -12
package/dist/src/engine/runner.js.map +1 -1
package/dist/src/errors.d.ts +6 -0
package/dist/src/errors.js +21 -3
package/dist/src/errors.js.map +1 -1
package/dist/src/types.d.ts +3 -0
package/package.json +4 -1
package/plugin.json +1 -1
package/skills/snapeval/SKILL.md +132 -39
package/src/adapters/copilot-sdk-client.ts +3 -1
package/src/adapters/harness/copilot-sdk.ts +126 -0
package/src/adapters/harness/resolve.ts +13 -2
package/src/adapters/inference/copilot-sdk.ts +5 -1
package/src/adapters/report/terminal.ts +99 -10
package/src/commands/eval.ts +183 -31
package/src/commands/review.ts +1 -1
package/src/config.ts +2 -1
package/src/engine/grader.ts +59 -8
package/src/engine/runner.ts +16 -13
package/src/errors.ts +24 -3
package/src/types.ts +3 -0

package/src/engine/grader.ts CHANGED Viewed

@@ -8,9 +8,34 @@ import type {
   AssertionResult,
 } from '../types.js';
+const EXACT_MATCH_PATTERN = /^Output (?:is |equals )exactly:\s*"(.+)"$/i;
+function gradeExactMatch(assertion: string, output: string): AssertionResult | null {
+  const match = assertion.match(EXACT_MATCH_PATTERN);
+  if (!match) return null;
+  const expected = match[1];
+  const actual = output.trim();
+  const passed = actual === expected;
+  return {
+    text: assertion,
+    passed,
+    evidence: passed
+      ? `Exact match: "${expected}"`
+      : `Expected: "${expected}"\nGot: "${actual}"`,
+  };
+}
 function buildGradingPrompt(assertions: string[], output: string, files: string[]): string {
   const fileList = files.length > 0 ? `\nFiles produced: ${files.join(', ')}` : '';
-  return `You are a strict eval grader. For each assertion, determine PASS or FAIL based on the output below. Require concrete evidence for a PASS — do not give the benefit of the doubt.
+  return `You are an eval grader. For each assertion, determine PASS or FAIL based solely on the output below.
+GRADING RULES:
+- PASS if the output satisfies the assertion's intent, even if wording differs slightly.
+- FAIL only if the output clearly does not satisfy the assertion.
+- Be consistent: if an assertion checks for X and the output contains X in different phrasing, that is a PASS.
+- For "contains" assertions: look for semantic presence, not exact substring.
+- For "identifies" assertions: the output must demonstrate awareness of the concept, not use identical words.
+- Always cite specific text from the output as evidence.
 OUTPUT:
 ---
@@ -23,7 +48,7 @@ ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
 Respond with JSON only:
 {
   "results": [
-    {"text": "<assertion text>", "passed": true/false, "evidence": "<quote or reference from output>"}
+    {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
   ]
 }`;
 }
@@ -38,18 +63,38 @@ function runScript(
     return { text: `script:${scriptName}`, passed: false, evidence: `Script not found: ${scriptPath}` };
   }
   try {
-    const evidence = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000 }).trim();
+    const stdout = execFileSync(scriptPath, [outputDir], { encoding: 'utf-8', timeout: 30000, stdio: ['pipe', 'pipe', 'pipe'] }).trim();
+    const evidence = stdout || `Script passed: ${scriptName}`;
     return { text: `script:${scriptName}`, passed: true, evidence };
   } catch (err: any) {
-    const evidence = err.stdout?.trim() || err.message || 'Script exited with non-zero code';
+    // Extract the most useful error info without raw stack traces
+    const stderr = err.stderr?.trim();
+    const stdout = err.stdout?.trim();
+    let evidence: string;
+    if (err.code === 'EACCES') {
+      evidence = `Permission denied: ${scriptPath} is not executable. Run: chmod +x ${scriptPath}`;
+    } else if (stderr) {
+      // Take only the first line of stderr to avoid stack trace noise
+      evidence = stderr.split('\n')[0];
+    } else if (stdout) {
+      evidence = stdout.split('\n')[0];
+    } else {
+      evidence = `Script exited with code ${err.status ?? 'unknown'}`;
+    }
     return { text: `script:${scriptName}`, passed: false, evidence };
   }
 }
 function extractJSON(text: string): string {
-  const match = text.match(/```(?:json)?\s*([\s\S]*?)```/);
-  if (match) return match[1].trim();
-  return text.trim();
+  // Try JSON-tagged fence first, then bare fence, then raw text
+  const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
+  if (jsonFence) return jsonFence[1].trim();
+  // Try parsing raw text as JSON before falling back to any fence
+  const trimmed = text.trim();
+  try { JSON.parse(trimmed); return trimmed; } catch { /* not raw JSON */ }
+  const anyFence = text.match(/```\s*([\s\S]*?)```/);
+  if (anyFence) return anyFence[1].trim();
+  return trimmed;
 }
 export async function gradeAssertions(
@@ -62,7 +107,8 @@ export async function gradeAssertions(
   if (assertions.length === 0) return null;
   const scriptAssertions = assertions.filter(a => a.startsWith('script:'));
-  const llmAssertions = assertions.filter(a => !a.startsWith('script:'));
+  const exactAssertions = assertions.filter(a => !a.startsWith('script:') && EXACT_MATCH_PATTERN.test(a));
+  const llmAssertions = assertions.filter(a => !a.startsWith('script:') && !EXACT_MATCH_PATTERN.test(a));
   const results: AssertionResult[] = [];
   for (const assertion of scriptAssertions) {
@@ -72,6 +118,11 @@ export async function gradeAssertions(
     results.push(runScript(scriptName, outputDir, dir));
   }
+  for (const assertion of exactAssertions) {
+    const result = gradeExactMatch(assertion, output.raw);
+    if (result) results.push(result);
+  }
   if (llmAssertions.length > 0) {
     const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
     const response = await inference.chat(

package/src/engine/runner.ts CHANGED Viewed

@@ -5,6 +5,7 @@ import type { Harness, HarnessRunResult, EvalCase, TimingData } from '../types.j
 interface RunEvalResult {
   evalId: number;
   slug: string;
+  label?: string;
   prompt: string;
   withSkill: { output: HarnessRunResult };
   withoutSkill: { output: HarnessRunResult };
@@ -33,27 +34,29 @@ export async function runEval(
   const baselineVariant = oldSkillPath ? 'old_skill' : 'without_skill';
   const baselineDir = path.join(evalDir, baselineVariant);
-  const withSkillResult = await harness.run({
-    skillPath,
-    prompt: evalCase.prompt,
-    files: evalCase.files,
-    outputDir: path.join(withSkillDir, 'outputs'),
-  });
+  const [withSkillResult, baselineResult] = await Promise.all([
+    harness.run({
+      skillPath,
+      prompt: evalCase.prompt,
+      files: evalCase.files,
+      outputDir: path.join(withSkillDir, 'outputs'),
+    }),
+    harness.run({
+      skillPath: oldSkillPath,
+      prompt: evalCase.prompt,
+      files: evalCase.files,
+      outputDir: path.join(baselineDir, 'outputs'),
+    }),
+  ]);
   writeTiming(withSkillDir, withSkillResult);
   writeOutput(withSkillDir, withSkillResult);
-  const baselineResult = await harness.run({
-    skillPath: oldSkillPath,
-    prompt: evalCase.prompt,
-    files: evalCase.files,
-    outputDir: path.join(baselineDir, 'outputs'),
-  });
   writeTiming(baselineDir, baselineResult);
   writeOutput(baselineDir, baselineResult);
   return {
     evalId: evalCase.id,
     slug: evalCase.slug ?? `${evalCase.id}`,
+    label: evalCase.label,
     prompt: evalCase.prompt,
     withSkill: { output: withSkillResult },
     withoutSkill: { output: baselineResult },

package/src/errors.ts CHANGED Viewed

@@ -1,3 +1,10 @@
+// Exit codes:
+// 0 = success
+// 1 = threshold not met (eval ran successfully but pass rate below threshold)
+// 2 = config/input error (bad JSON, missing fields, invalid flags)
+// 3 = file not found (missing skill dir, missing evals.json, missing script)
+// 4 = runtime error (harness failure, grading failure, timeout)
 export class SnapevalError extends Error {
   constructor(message: string, public exitCode: number = 2) {
     super(message);
@@ -5,9 +12,23 @@ export class SnapevalError extends Error {
   }
 }
+export class FileNotFoundError extends SnapevalError {
+  constructor(filePath: string, hint?: string) {
+    super(`File not found: ${filePath}${hint ? `. ${hint}` : ''}`, 3);
+    this.name = 'FileNotFoundError';
+  }
+}
+export class ThresholdError extends SnapevalError {
+  constructor(actual: number, threshold: number) {
+    super(`Skill pass rate ${(actual * 100).toFixed(1)}% is below threshold ${(threshold * 100).toFixed(1)}%`, 1);
+    this.name = 'ThresholdError';
+  }
+}
 export class AdapterNotAvailableError extends SnapevalError {
   constructor(adapterName: string, installHint: string) {
-    super(`${adapterName} is not available. ${installHint}`);
+    super(`${adapterName} is not available. ${installHint}`, 4);
     this.name = 'AdapterNotAvailableError';
   }
 }
@@ -21,14 +42,14 @@ export class RateLimitError extends SnapevalError {
 export class TimeoutError extends SnapevalError {
   constructor(evalId: number, timeoutMs: number) {
-    super(`Eval ${evalId} timed out after ${timeoutMs}ms.`);
+    super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
     this.name = 'TimeoutError';
   }
 }
 export class GradingError extends SnapevalError {
   constructor(evalId: number, detail: string) {
-    super(`Grading failed for eval ${evalId}: ${detail}`);
+    super(`Grading failed for eval ${evalId}: ${detail}`, 4);
     this.name = 'GradingError';
   }
 }

package/src/types.ts CHANGED Viewed

@@ -43,6 +43,7 @@ export interface EvalCase {
   id: number;
   prompt: string;
   expected_output: string;
+  label?: string;
   slug?: string;
   files?: string[];
   assertions?: string[];
@@ -110,6 +111,7 @@ export interface FeedbackData {
 export interface EvalRunResult {
   evalId: number;
   slug: string;
+  label?: string;
   prompt: string;
   withSkill: {
     output: HarnessRunResult;
@@ -142,4 +144,5 @@ export interface SnapevalConfig {
   inference: string;
   workspace: string;
   runs: number;
+  concurrency: number;
 }