snapeval 1.8.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/snapeval.ts +30 -24
- package/dist/bin/snapeval.js +25 -22
- package/dist/bin/snapeval.js.map +1 -1
- package/dist/src/adapters/copilot-sdk-client.js +1 -1
- package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
- package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
- package/dist/src/adapters/harness/copilot-sdk.js +101 -0
- package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
- package/dist/src/adapters/harness/resolve.js +10 -2
- package/dist/src/adapters/harness/resolve.js.map +1 -1
- package/dist/src/adapters/inference/copilot-sdk.js +4 -1
- package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/report/terminal.js +89 -9
- package/dist/src/adapters/report/terminal.js.map +1 -1
- package/dist/src/commands/eval.d.ts +3 -0
- package/dist/src/commands/eval.js +106 -17
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/commands/review.d.ts +1 -0
- package/dist/src/commands/review.js.map +1 -1
- package/dist/src/config.js +2 -1
- package/dist/src/config.js.map +1 -1
- package/dist/src/engine/grader.js +67 -9
- package/dist/src/engine/grader.js.map +1 -1
- package/dist/src/engine/runner.js +14 -12
- package/dist/src/engine/runner.js.map +1 -1
- package/dist/src/errors.d.ts +6 -0
- package/dist/src/errors.js +21 -3
- package/dist/src/errors.js.map +1 -1
- package/dist/src/types.d.ts +1 -0
- package/package.json +4 -1
- package/plugin.json +1 -1
- package/skills/snapeval/SKILL.md +33 -18
- package/src/adapters/copilot-sdk-client.ts +1 -1
- package/src/adapters/harness/copilot-sdk.ts +126 -0
- package/src/adapters/harness/resolve.ts +13 -2
- package/src/adapters/inference/copilot-sdk.ts +5 -1
- package/src/adapters/report/terminal.ts +100 -10
- package/src/commands/eval.ts +133 -31
- package/src/commands/review.ts +1 -1
- package/src/config.ts +2 -1
- package/src/engine/grader.ts +59 -8
- package/src/engine/runner.ts +14 -13
- package/src/errors.ts +24 -3
- package/src/types.ts +1 -0
- package/dist/src/commands/init.d.ts +0 -2
- package/dist/src/commands/init.js +0 -27
- package/dist/src/commands/init.js.map +0 -1
- package/dist/src/engine/generator.d.ts +0 -3
- package/dist/src/engine/generator.js +0 -51
- package/dist/src/engine/generator.js.map +0 -1
- package/src/commands/init.ts +0 -38
- package/src/engine/generator.ts +0 -60
package/src/engine/runner.ts
CHANGED
|
@@ -33,21 +33,22 @@ export async function runEval(
|
|
|
33
33
|
const baselineVariant = oldSkillPath ? 'old_skill' : 'without_skill';
|
|
34
34
|
const baselineDir = path.join(evalDir, baselineVariant);
|
|
35
35
|
|
|
36
|
-
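const withSkillResult = await harness.run({
|
|
37
|
-
skillPath,
|
|
38
|
-
prompt: evalCase.prompt,
|
|
39
|
-
files: evalCase.files,
|
|
40
|
-
outputDir: path.join(withSkillDir, 'outputs'),
|
|
41
|
-
});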
|
|
36
|
+
const [withSkillResult, baselineResult] = await Promise.all([
|
|
37
|
+
harness.run({
|
|
38
|
+
skillPath,
|
|
39
|
+
prompt: evalCase.prompt,
|
|
40
|
+
files: evalCase.files,
|
|
41
|
+
outputDir: path.join(withSkillDir, 'outputs'),
|
|
42
|
+
}),
|
|
43
|
+
harness.run({
|
|
44
|
+
skillPath: oldSkillPath,
|
|
45
|
+
prompt: evalCase.prompt,
|
|
46
|
+
files: evalCase.files,
|
|
47
|
+
outputDir: path.join(baselineDir, 'outputs'),
|
|
48
|
+
}),
|
|
49
|
+
]);
|
|
42
50
|
writeTiming(withSkillDir, withSkillResult);
|
|
43
51
|
writeOutput(withSkillDir, withSkillResult);
|
|
44
|
-
|
|
45
|
-
const baselineResult = await harness.run({
|
|
46
|
-
skillPath: oldSkillPath,
|
|
47
|
-
prompt: evalCase.prompt,
|
|
48
|
-
files: evalCase.files,
|
|
49
|
-
outputDir: path.join(baselineDir, 'outputs'),
|
|
50
|
-
});
|
|
51
52
|
writeTiming(baselineDir, baselineResult);
|
|
52
53
|
writeOutput(baselineDir, baselineResult);
|
|
53
54
|
|
package/src/errors.ts
CHANGED
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
// Exit codes:
|
|
2
|
+
// 0 = success
|
|
3
|
+
// 1 = threshold not met (eval ran successfully but pass rate below threshold)
|
|
4
|
+
// 2 = config/input error (bad JSON, missing fields, invalid flags)
|
|
5
|
+
// 3 = file not found (missing skill dir, missing evals.json, missing script)
|
|
6
|
+
// 4 = runtime error (harness failure, grading failure, timeout)
|
|
7
|
+
|
|
1
8
|
export class SnapevalError extends Error {
|
|
2
9
|
constructor(message: string, public exitCode: number = 2) {
|
|
3
10
|
super(message);
|
|
@@ -5,9 +12,23 @@ export class SnapevalError extends Error {
|
|
|
5
12
|
}
|
|
6
13
|
}
|
|
7
14
|
|
|
15
|
+
export class FileNotFoundError extends SnapevalError {
|
|
16
|
+
constructor(filePath: string, hint?: string) {
|
|
17
|
+
super(`File not found: ${filePath}${hint ? `. ${hint}` : ''}`, 3);
|
|
18
|
+
this.name = 'FileNotFoundError';
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export class ThresholdError extends SnapevalError {
|
|
23
|
+
constructor(actual: number, threshold: number) {
|
|
24
|
+
super(`Skill pass rate ${(actual * 100).toFixed(1)}% is below threshold ${(threshold * 100).toFixed(1)}%`, 1);
|
|
25
|
+
this.name = 'ThresholdError';
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
8
29
|
export class AdapterNotAvailableError extends SnapevalError {
|
|
9
30
|
constructor(adapterName: string, installHint: string) {
|
|
10
|
-
super(`${adapterName} is not available. ${installHint}`);
|
|
31
|
+
super(`${adapterName} is not available. ${installHint}`, 4);
|
|
11
32
|
this.name = 'AdapterNotAvailableError';
|
|
12
33
|
}
|
|
13
34
|
}
|
|
@@ -21,14 +42,14 @@ export class RateLimitError extends SnapevalError {
|
|
|
21
42
|
|
|
22
43
|
export class TimeoutError extends SnapevalError {
|
|
23
44
|
constructor(evalId: number, timeoutMs: number) {
|
|
24
|
-
super(`Eval ${evalId} timed out after ${timeoutMs}ms.`);
|
|
45
|
+
super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
|
|
25
46
|
this.name = 'TimeoutError';
|
|
26
47
|
}
|
|
27
48
|
}
|
|
28
49
|
|
|
29
50
|
export class GradingError extends SnapevalError {
|
|
30
51
|
constructor(evalId: number, detail: string) {
|
|
31
|
-
super(`Grading failed for eval ${evalId}: ${detail}`);
|
|
52
|
+
super(`Grading failed for eval ${evalId}: ${detail}`, 4);
|
|
32
53
|
this.name = 'GradingError';
|
|
33
54
|
}
|
|
34
55
|
}
|
package/src/types.ts
CHANGED
|
(hunk for this file is missing from the capture)
|
package/dist/src/commands/init.js
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import * as fs from 'node:fs';
|
|
2
|
-
import * as path from 'node:path';
|
|
3
|
-
import { generateEvals } from '../engine/generator.js';
|
|
4
|
-
import { SnapevalError } from '../errors.js';
|
|
5
|
-
export async function initCommand(skillPath, inference) {
|
|
6
|
-
// Locate the skill definition file (SKILL.md or skill.md)
|
|
7
|
-
const candidates = ['SKILL.md', 'skill.md'];
|
|
8
|
-
let skillFilePath = null;
|
|
9
|
-
for (const name of candidates) {
|
|
10
|
-
const candidate = path.join(skillPath, name);
|
|
11
|
-
if (fs.existsSync(candidate)) {
|
|
12
|
-
skillFilePath = candidate;
|
|
13
|
-
break;
|
|
14
|
-
}
|
|
15
|
-
}
|
|
16
|
-
if (!skillFilePath) {
|
|
17
|
-
throw new SnapevalError(`No SKILL.md found at ${skillPath}. Create a SKILL.md file to describe your skill.`);
|
|
18
|
-
}
|
|
19
|
-
const skillContent = fs.readFileSync(skillFilePath, 'utf-8');
|
|
20
|
-
const skillName = path.basename(skillPath);
|
|
21
|
-
const evalsFile = await generateEvals(skillContent, skillName, inference);
|
|
22
|
-
const evalsDir = path.join(skillPath, 'evals');
|
|
23
|
-
fs.mkdirSync(evalsDir, { recursive: true });
|
|
24
|
-
const evalsPath = path.join(evalsDir, 'evals.json');
|
|
25
|
-
fs.writeFileSync(evalsPath, JSON.stringify(evalsFile, null, 2), 'utf-8');
|
|
26
|
-
}
|
|
27
|
-
//# sourceMappingURL=init.js.map
|
|
package/dist/src/commands/init.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"init.js","sourceRoot":"","sources":["../../../src/commands/init.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,OAAO,EAAE,aAAa,EAAE,MAAM,wBAAwB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAE7C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,SAAiB,EACjB,SAA2B;IAE3B,0DAA0D;IAC1D,MAAM,UAAU,GAAG,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IAC5C,IAAI,aAAa,GAAkB,IAAI,CAAC;IACxC,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;QAC7C,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;YAC7B,aAAa,GAAG,SAAS,CAAC;YAC1B,MAAM;QACR,CAAC;IACH,CAAC;IAED,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,aAAa,CACrB,wBAAwB,SAAS,kDAAkD,CACpF,CAAC;IACJ,CAAC;IAED,MAAM,YAAY,GAAG,EAAE,CAAC,YAAY,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC;IAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;IAE3C,MAAM,SAAS,GAAG,MAAM,aAAa,CAAC,YAAY,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;IAE1E,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;IAC/C,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE5C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IACpD,EAAE,CAAC,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AAC3E,CAAC"}
|
|
package/dist/src/engine/generator.js
DELETED
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
export function buildGeneratorPrompt(skillContent) {
|
|
2
|
-
return `You are a test case generator for AI skills. Read the following skill definition and generate 5-8 realistic test scenarios.
|
|
3
|
-
|
|
4
|
-
SKILL DEFINITION:
|
|
5
|
-
---
|
|
6
|
-
${skillContent}
|
|
7
|
-
---
|
|
8
|
-
|
|
9
|
-
Generate test scenarios as JSON with this exact format:
|
|
10
|
-
{
|
|
11
|
-
"skill_name": "<name from skill>",
|
|
12
|
-
"evals": [
|
|
13
|
-
{
|
|
14
|
-
"id": 1,
|
|
15
|
-
"slug": "<2-4-word-kebab-case-label>",
|
|
16
|
-
"prompt": "<realistic user prompt that would trigger this skill>",
|
|
17
|
-
"expected_output": "<human-readable description of expected behavior>"
|
|
18
|
-
}
|
|
19
|
-
]
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
Requirements:
|
|
23
|
-
- Include happy path scenarios (normal use cases)
|
|
24
|
-
- Include edge cases (empty input, malformed input, boundary conditions)
|
|
25
|
-
- Include at least one negative test (input the skill should handle gracefully)
|
|
26
|
-
- Prompts should be realistic — the way a real user would type them
|
|
27
|
-
- slug must be 2-4 words in kebab-case (e.g. "happy-path", "empty-input-edge-case")
|
|
28
|
-
- Return ONLY the JSON, no markdown wrapping`;
|
|
29
|
-
}
|
|
30
|
-
function extractJSON(text) {
|
|
31
|
-
const match = text.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
32
|
-
if (match)
|
|
33
|
-
return match[1].trim();
|
|
34
|
-
return text.trim();
|
|
35
|
-
}
|
|
36
|
-
export async function generateEvals(skillContent, skillName, inference) {
|
|
37
|
-
const prompt = buildGeneratorPrompt(skillContent);
|
|
38
|
-
const response = await inference.chat([{ role: 'user', content: prompt }], { temperature: 0.7, responseFormat: 'json' });
|
|
39
|
-
const parsed = JSON.parse(extractJSON(response));
|
|
40
|
-
return {
|
|
41
|
-
skill_name: parsed.skill_name || skillName,
|
|
42
|
-
evals: parsed.evals.map((e, i) => ({
|
|
43
|
-
id: e.id || i + 1,
|
|
44
|
-
slug: e.slug,
|
|
45
|
-
prompt: e.prompt,
|
|
46
|
-
expected_output: e.expected_output || '',
|
|
47
|
-
files: e.files || [],
|
|
48
|
-
})),
|
|
49
|
-
};
|
|
50
|
-
}
|
|
51
|
-
//# sourceMappingURL=generator.js.map
|
|
package/dist/src/engine/generator.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"generator.js","sourceRoot":"","sources":["../../../src/engine/generator.ts"],"names":[],"mappings":"AAEA,MAAM,UAAU,oBAAoB,CAAC,YAAoB;IACvD,OAAO;;;;EAIP,YAAY;;;;;;;;;;;;;;;;;;;;;;6CAsB+B,CAAC;AAC9C,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IACzD,IAAI,KAAK;QAAE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAClC,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;AACrB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,YAAoB,EACpB,SAAiB,EACjB,SAA2B;IAE3B,MAAM,MAAM,GAAG,oBAAoB,CAAC,YAAY,CAAC,CAAC;IAClD,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,GAAG,EAAE,cAAc,EAAE,MAAM,EAAE,CAC7C,CAAC;IACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;IACjD,OAAO;QACL,UAAU,EAAE,MAAM,CAAC,UAAU,IAAI,SAAS;QAC1C,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,CAAS,EAAE,EAAE,CAAC,CAAC;YAC9C,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC;YACjB,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,MAAM,EAAE,CAAC,CAAC,MAAM;YAChB,eAAe,EAAE,CAAC,CAAC,eAAe,IAAI,EAAE;YACxC,KAAK,EAAE,CAAC,CAAC,KAAK,IAAI,EAAE;SACrB,CAAC,CAAC;KACJ,CAAC;AACJ,CAAC"}
|
package/src/commands/init.ts
DELETED
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import * as fs from 'node:fs';
|
|
2
|
-
import * as path from 'node:path';
|
|
3
|
-
import type { InferenceAdapter } from '../types.js';
|
|
4
|
-
import { generateEvals } from '../engine/generator.js';
|
|
5
|
-
import { SnapevalError } from '../errors.js';
|
|
6
|
-
|
|
7
|
-
export async function initCommand(
|
|
8
|
-
skillPath: string,
|
|
9
|
-
inference: InferenceAdapter
|
|
10
|
-
): Promise<void> {
|
|
11
|
-
// Locate the skill definition file (SKILL.md or skill.md)
|
|
12
|
-
const candidates = ['SKILL.md', 'skill.md'];
|
|
13
|
-
let skillFilePath: string | null = null;
|
|
14
|
-
for (const name of candidates) {
|
|
15
|
-
const candidate = path.join(skillPath, name);
|
|
16
|
-
if (fs.existsSync(candidate)) {
|
|
17
|
-
skillFilePath = candidate;
|
|
18
|
-
break;
|
|
19
|
-
}
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
if (!skillFilePath) {
|
|
23
|
-
throw new SnapevalError(
|
|
24
|
-
`No SKILL.md found at ${skillPath}. Create a SKILL.md file to describe your skill.`
|
|
25
|
-
);
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
const skillContent = fs.readFileSync(skillFilePath, 'utf-8');
|
|
29
|
-
const skillName = path.basename(skillPath);
|
|
30
|
-
|
|
31
|
-
const evalsFile = await generateEvals(skillContent, skillName, inference);
|
|
32
|
-
|
|
33
|
-
const evalsDir = path.join(skillPath, 'evals');
|
|
34
|
-
fs.mkdirSync(evalsDir, { recursive: true });
|
|
35
|
-
|
|
36
|
-
const evalsPath = path.join(evalsDir, 'evals.json');
|
|
37
|
-
fs.writeFileSync(evalsPath, JSON.stringify(evalsFile, null, 2), 'utf-8');
|
|
38
|
-
}
|
package/src/engine/generator.ts
DELETED
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
import type { InferenceAdapter, EvalsFile } from '../types.js';
|
|
2
|
-
|
|
3
|
-
export function buildGeneratorPrompt(skillContent: string): string {
|
|
4
|
-
return `You are a test case generator for AI skills. Read the following skill definition and generate 5-8 realistic test scenarios.
|
|
5
|
-
|
|
6
|
-
SKILL DEFINITION:
|
|
7
|
-
---
|
|
8
|
-
${skillContent}
|
|
9
|
-
---
|
|
10
|
-
|
|
11
|
-
Generate test scenarios as JSON with this exact format:
|
|
12
|
-
{
|
|
13
|
-
"skill_name": "<name from skill>",
|
|
14
|
-
"evals": [
|
|
15
|
-
{
|
|
16
|
-
"id": 1,
|
|
17
|
-
"slug": "<2-4-word-kebab-case-label>",
|
|
18
|
-
"prompt": "<realistic user prompt that would trigger this skill>",
|
|
19
|
-
"expected_output": "<human-readable description of expected behavior>"
|
|
20
|
-
}
|
|
21
|
-
]
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
Requirements:
|
|
25
|
-
- Include happy path scenarios (normal use cases)
|
|
26
|
-
- Include edge cases (empty input, malformed input, boundary conditions)
|
|
27
|
-
- Include at least one negative test (input the skill should handle gracefully)
|
|
28
|
-
- Prompts should be realistic — the way a real user would type them
|
|
29
|
-
- slug must be 2-4 words in kebab-case (e.g. "happy-path", "empty-input-edge-case")
|
|
30
|
-
- Return ONLY the JSON, no markdown wrapping`;
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
function extractJSON(text: string): string {
|
|
34
|
-
const match = text.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
35
|
-
if (match) return match[1].trim();
|
|
36
|
-
return text.trim();
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
export async function generateEvals(
|
|
40
|
-
skillContent: string,
|
|
41
|
-
skillName: string,
|
|
42
|
-
inference: InferenceAdapter
|
|
43
|
-
): Promise<EvalsFile> {
|
|
44
|
-
const prompt = buildGeneratorPrompt(skillContent);
|
|
45
|
-
const response = await inference.chat(
|
|
46
|
-
[{ role: 'user', content: prompt }],
|
|
47
|
-
{ temperature: 0.7, responseFormat: 'json' }
|
|
48
|
-
);
|
|
49
|
-
const parsed = JSON.parse(extractJSON(response));
|
|
50
|
-
return {
|
|
51
|
-
skill_name: parsed.skill_name || skillName,
|
|
52
|
-
evals: parsed.evals.map((e: any, i: number) => ({
|
|
53
|
-
id: e.id || i + 1,
|
|
54
|
-
slug: e.slug,
|
|
55
|
-
prompt: e.prompt,
|
|
56
|
-
expected_output: e.expected_output || '',
|
|
57
|
-
files: e.files || [],
|
|
58
|
-
})),
|
|
59
|
-
};
|
|
60
|
-
}
|