snapeval 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +194 -0
- package/bin/snapeval.ts +226 -0
- package/dist/bin/snapeval.d.ts +2 -0
- package/dist/bin/snapeval.js +191 -0
- package/dist/bin/snapeval.js.map +1 -0
- package/dist/src/adapters/inference/copilot.d.ts +9 -0
- package/dist/src/adapters/inference/copilot.js +25 -0
- package/dist/src/adapters/inference/copilot.js.map +1 -0
- package/dist/src/adapters/inference/github-models.d.ts +9 -0
- package/dist/src/adapters/inference/github-models.js +62 -0
- package/dist/src/adapters/inference/github-models.js.map +1 -0
- package/dist/src/adapters/inference/resolve.d.ts +2 -0
- package/dist/src/adapters/inference/resolve.js +49 -0
- package/dist/src/adapters/inference/resolve.js.map +1 -0
- package/dist/src/adapters/report/json.d.ts +7 -0
- package/dist/src/adapters/report/json.js +39 -0
- package/dist/src/adapters/report/json.js.map +1 -0
- package/dist/src/adapters/report/terminal.d.ts +5 -0
- package/dist/src/adapters/report/terminal.js +42 -0
- package/dist/src/adapters/report/terminal.js.map +1 -0
- package/dist/src/adapters/skill/copilot-cli.d.ts +6 -0
- package/dist/src/adapters/skill/copilot-cli.js +51 -0
- package/dist/src/adapters/skill/copilot-cli.js.map +1 -0
- package/dist/src/commands/approve.d.ts +5 -0
- package/dist/src/commands/approve.js +40 -0
- package/dist/src/commands/approve.js.map +1 -0
- package/dist/src/commands/capture.d.ts +4 -0
- package/dist/src/commands/capture.js +18 -0
- package/dist/src/commands/capture.js.map +1 -0
- package/dist/src/commands/check.d.ts +6 -0
- package/dist/src/commands/check.js +68 -0
- package/dist/src/commands/check.js.map +1 -0
- package/dist/src/commands/init.d.ts +2 -0
- package/dist/src/commands/init.js +27 -0
- package/dist/src/commands/init.js.map +1 -0
- package/dist/src/commands/report.d.ts +4 -0
- package/dist/src/commands/report.js +26 -0
- package/dist/src/commands/report.js.map +1 -0
- package/dist/src/config.d.ts +3 -0
- package/dist/src/config.js +30 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/engine/budget.d.ts +10 -0
- package/dist/src/engine/budget.js +25 -0
- package/dist/src/engine/budget.js.map +1 -0
- package/dist/src/engine/comparison/embedding.d.ts +6 -0
- package/dist/src/engine/comparison/embedding.js +19 -0
- package/dist/src/engine/comparison/embedding.js.map +1 -0
- package/dist/src/engine/comparison/judge.d.ts +8 -0
- package/dist/src/engine/comparison/judge.js +64 -0
- package/dist/src/engine/comparison/judge.js.map +1 -0
- package/dist/src/engine/comparison/pipeline.d.ts +6 -0
- package/dist/src/engine/comparison/pipeline.js +31 -0
- package/dist/src/engine/comparison/pipeline.js.map +1 -0
- package/dist/src/engine/comparison/schema.d.ts +2 -0
- package/dist/src/engine/comparison/schema.js +28 -0
- package/dist/src/engine/comparison/schema.js.map +1 -0
- package/dist/src/engine/comparison/variance.d.ts +3 -0
- package/dist/src/engine/comparison/variance.js +26 -0
- package/dist/src/engine/comparison/variance.js.map +1 -0
- package/dist/src/engine/generator.d.ts +3 -0
- package/dist/src/engine/generator.js +52 -0
- package/dist/src/engine/generator.js.map +1 -0
- package/dist/src/engine/snapshot.d.ts +11 -0
- package/dist/src/engine/snapshot.js +46 -0
- package/dist/src/engine/snapshot.js.map +1 -0
- package/dist/src/errors.d.ts +16 -0
- package/dist/src/errors.js +33 -0
- package/dist/src/errors.js.map +1 -0
- package/dist/src/types.d.ts +125 -0
- package/dist/src/types.js +2 -0
- package/dist/src/types.js.map +1 -0
- package/package.json +53 -0
- package/plugin.json +9 -0
- package/scripts/snapeval-cli.sh +7 -0
- package/skills/snapeval/SKILL.md +51 -0
- package/src/adapters/inference/copilot.ts +30 -0
- package/src/adapters/inference/github-models.ts +74 -0
- package/src/adapters/inference/resolve.ts +70 -0
- package/src/adapters/report/json.ts +64 -0
- package/src/adapters/report/terminal.ts +59 -0
- package/src/adapters/skill/copilot-cli.ts +60 -0
- package/src/commands/approve.ts +58 -0
- package/src/commands/capture.ts +25 -0
- package/src/commands/check.ts +86 -0
- package/src/commands/init.ts +38 -0
- package/src/commands/report.ts +36 -0
- package/src/config.ts +37 -0
- package/src/engine/budget.ts +27 -0
- package/src/engine/comparison/embedding.ts +26 -0
- package/src/engine/comparison/judge.ts +78 -0
- package/src/engine/comparison/pipeline.ts +43 -0
- package/src/engine/comparison/schema.ts +22 -0
- package/src/engine/comparison/variance.ts +31 -0
- package/src/engine/generator.ts +61 -0
- package/src/engine/snapshot.ts +48 -0
- package/src/errors.ts +34 -0
- package/src/types.ts +153 -0
package/src/config.ts
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type { SnapevalConfig } from './types.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Baseline configuration values.
 *
 * `resolveConfig` spreads this object first, so every value here can be
 * replaced by a config file or by a CLI flag.
 */
export const DEFAULT_CONFIG: SnapevalConfig = {
  adapter: 'copilot-cli',
  inference: 'auto',
  threshold: 0.85,
  runs: 3,
  // The literal 'unlimited' is the value BudgetEngine treats as "no cap".
  budget: 'unlimited',
};
|
|
12
|
+
|
|
13
|
+
/**
 * Reads `snapeval.config.json` from `dirPath`, if it exists.
 *
 * @param dirPath - Directory expected to contain the config file.
 * @returns The parsed overrides, or `null` when the file does not exist.
 * @throws SyntaxError when the file is not valid JSON (the message names the file).
 * @throws TypeError when the file parses to something other than a JSON object.
 */
function loadConfigFile(dirPath: string): Partial<SnapevalConfig> | null {
  const configPath = path.join(dirPath, 'snapeval.config.json');
  if (!fs.existsSync(configPath)) return null;

  const raw = fs.readFileSync(configPath, 'utf-8');

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    // Re-throw with the path so the user knows which config file is broken.
    const reason = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(`Invalid JSON in ${configPath}: ${reason}`);
  }

  // The result is spread into the final config, so only a plain object makes
  // sense; spreading a string or array would inject index keys.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new TypeError(`Expected a JSON object in ${configPath}`);
  }

  return parsed as Partial<SnapevalConfig>;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Builds the effective configuration by layering four sources.
 *
 * Later layers win: defaults, then the project-root config file, then the
 * skill-directory config file, then CLI flags whose value is not `undefined`.
 *
 * @param cliFlags - Values parsed from the command line; `undefined` entries are ignored.
 * @param projectRoot - Directory searched for the project-level config file.
 * @param skillDir - Optional directory searched for a skill-level config file.
 * @returns The merged configuration.
 */
export function resolveConfig(
  cliFlags: Partial<SnapevalConfig>,
  projectRoot: string,
  skillDir?: string
): SnapevalConfig {
  // The skill-directory file is read before the project file.
  const skillOverrides = skillDir ? (loadConfigFile(skillDir) ?? {}) : {};
  const projectOverrides = loadConfigFile(projectRoot) ?? {};
  const flagOverrides = stripUndefined(cliFlags);

  return {
    ...DEFAULT_CONFIG,
    ...projectOverrides,
    ...skillOverrides,
    ...flagOverrides,
  };
}
|
|
34
|
+
|
|
35
|
+
/**
 * Returns a shallow copy of `obj` without the entries whose value is exactly
 * `undefined`. Other falsy values (`null`, `0`, `''`, `false`) are kept.
 */
function stripUndefined(obj: Record<string, unknown>): Record<string, unknown> {
  const kept: [string, unknown][] = [];
  for (const [key, value] of Object.entries(obj)) {
    if (value === undefined) continue;
    kept.push([key, value]);
  }
  return Object.fromEntries(kept);
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
 * Tracks accumulated spend against an optional cap.
 *
 * The cap is given as a string: the literal `'unlimited'` disables it, any
 * other value is read with `parseFloat`.
 */
export class BudgetEngine {
  private spent = 0;
  private cap: number | null;

  /**
   * @param budget - `'unlimited'` or a numeric string such as `'2.50'`.
   * @throws RangeError when `budget` is neither `'unlimited'` nor a non-negative number.
   */
  constructor(budget: string) {
    if (budget === 'unlimited') {
      this.cap = null;
      return;
    }
    const parsed = parseFloat(budget);
    // parseFloat returns NaN for text such as "abc". Every `spent > NaN`
    // comparison is false, so a typo would otherwise act like "no cap".
    if (Number.isNaN(parsed) || parsed < 0) {
      throw new RangeError(
        `Invalid budget "${budget}": expected "unlimited" or a non-negative number`
      );
    }
    this.cap = parsed;
  }

  /** Sum of every amount passed to `addCost` so far. */
  get totalCost(): number {
    return this.spent;
  }

  /** Adds `amount` to the running total. */
  addCost(amount: number): void {
    this.spent += amount;
  }

  /** `true` once the running total is strictly greater than the cap; always `false` when uncapped. */
  isExceeded(): boolean {
    if (this.cap === null) return false;
    return this.spent > this.cap;
  }

  /**
   * Estimates the cost of processing `tokens` tokens.
   *
   * @param tokens - Token count to price.
   * @param isFreeModel - When `true` the estimate is `0`.
   * @param usdPerMillionTokens - Price applied per million tokens; defaults to `0.15`.
   * @returns `(tokens / 1_000_000) * usdPerMillionTokens`, or `0` for a free model.
   */
  estimateScenarioCost(
    tokens: number,
    isFreeModel: boolean,
    usdPerMillionTokens: number = 0.15
  ): number {
    if (isFreeModel) return 0;
    return (tokens / 1_000_000) * usdPerMillionTokens;
  }

  /** Cap minus spend, floored at `0`; `null` when there is no cap. */
  get remaining(): number | null {
    if (this.cap === null) return null;
    return Math.max(0, this.cap - this.spent);
  }
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { InferenceAdapter } from '../../types.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a`.
 * @returns `dot(a, b) / (|a| * |b|)`, or `0` when either vector has zero magnitude
 *          (including two empty vectors).
 * @throws RangeError when the vectors differ in length.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  // Without this guard a shorter `b` yields NaN (undefined * number) and a
  // shorter `a` silently ignores the extra dimensions of `b`.
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vector length mismatch (${a.length} vs ${b.length})`
    );
  }

  let dot = 0;
  let magA = 0;
  let magB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    magA += a[i] * a[i];
    magB += b[i] * b[i];
  }

  const denom = Math.sqrt(magA) * Math.sqrt(magB);
  // A zero vector has no direction; report "no similarity" instead of dividing by zero.
  return denom === 0 ? 0 : dot / denom;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Embeds both texts and compares them by cosine similarity.
 *
 * @param baseline - Text of the stored baseline output.
 * @param current - Text of the new output.
 * @param inference - Adapter whose `embed` method produces the vectors.
 * @param threshold - Minimum similarity that counts as a pass (inclusive).
 * @returns The similarity and whether it reached `threshold`.
 */
export async function embeddingCheck(
  baseline: string,
  current: string,
  inference: InferenceAdapter,
  threshold: number = 0.85
): Promise<{ similarity: number; pass: boolean }> {
  // Both embedding requests are started before either is awaited.
  const [baselineVector, currentVector] = await Promise.all([
    inference.embed(baseline),
    inference.embed(current),
  ]);

  const similarity = cosineSimilarity(baselineVector, currentVector);
  const pass = similarity >= threshold;
  return { similarity, pass };
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import type { InferenceAdapter, ComparisonVerdict } from '../../types.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Builds the prompt that asks the judge model whether two outputs are
 * semantically consistent.
 *
 * @param outputA - Text placed under the "OUTPUT A" heading.
 * @param outputB - Text placed under the "OUTPUT B" heading.
 * @returns The complete prompt, which requests a JSON-only verdict.
 */
export function buildJudgePrompt(outputA: string, outputB: string): string {
  const promptLines = [
    'You are an AI output comparison judge. Compare these two outputs and determine if they are semantically consistent (same meaning, same key information) or different (changed behavior, missing information, or contradictory content).',
    '',
    'OUTPUT A:',
    '---',
    `${outputA}`,
    '---',
    '',
    'OUTPUT B:',
    '---',
    `${outputB}`,
    '---',
    '',
    'Respond with JSON only: {"verdict": "consistent"} or {"verdict": "different"}',
  ];
  return promptLines.join('\n');
}
|
|
18
|
+
|
|
19
|
+
/** Outcome returned by `llmJudge`. */
interface JudgeResult {
  /** Verdict derived from the two judge orderings. */
  verdict: ComparisonVerdict;
  /** Human-readable explanation of how the verdict was reached. */
  details: string;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Extracts the verdict from a judge model reply.
 *
 * Accepts bare JSON as well as JSON wrapped in a Markdown code fence
 * (``` or ```json), the same tolerance `extractJSON` gives the generator.
 *
 * @param response - Raw text returned by the judge model.
 * @returns `'consistent'` or `'different'`, or `null` when the reply cannot be
 *          parsed or carries no recognised verdict.
 */
function parseJudgeResponse(response: string): 'consistent' | 'different' | null {
  // Unwrap a fenced block if present; otherwise use the reply as-is.
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(response);
  const candidate = (fenced?.[1] ?? response).trim();

  let parsed: unknown;
  try {
    parsed = JSON.parse(candidate);
  } catch {
    return null;
  }

  // JSON such as `null`, `42` or `"text"` parses successfully but has no verdict.
  if (typeof parsed !== 'object' || parsed === null) return null;

  const verdict = (parsed as { verdict?: unknown }).verdict;
  return verdict === 'consistent' || verdict === 'different' ? verdict : null;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Asks the judge model twice, once per ordering of the two texts, so the
 * caller can compare the verdict for (baseline, current) with the verdict for
 * (current, baseline).
 *
 * @returns The parsed verdict for each ordering; `null` where the reply could
 *          not be parsed.
 */
async function runJudgePair(
  baseline: string,
  current: string,
  inference: InferenceAdapter
): Promise<{ forward: string | null; reverse: string | null }> {
  // One request per ordering; temperature 0 and JSON output are requested for both.
  const ask = (first: string, second: string): Promise<string> =>
    inference.chat([{ role: 'user', content: buildJudgePrompt(first, second) }], {
      temperature: 0,
      responseFormat: 'json',
    });

  const [forwardReply, reverseReply] = await Promise.all([
    ask(baseline, current),
    ask(current, baseline),
  ]);

  return {
    forward: parseJudgeResponse(forwardReply),
    reverse: parseJudgeResponse(reverseReply),
  };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Compares two outputs with the judge model, querying both orderings.
 *
 * - Both orderings agree on "consistent": `pass`.
 * - Both orderings agree on "different": `regressed`.
 * - The orderings disagree: `inconclusive`.
 * - A reply cannot be parsed: the pair is retried once, then `inconclusive`.
 *
 * @param baseline - Stored baseline output.
 * @param current - New output to compare with the baseline.
 * @param inference - Adapter used for the chat requests.
 */
export async function llmJudge(
  baseline: string,
  current: string,
  inference: InferenceAdapter
): Promise<JudgeResult> {
  for (let attempt = 0; attempt < 2; attempt += 1) {
    const { forward, reverse } = await runJudgePair(baseline, current, inference);
    const unparseable = forward === null || reverse === null;

    // Only the first unparseable pair earns a retry.
    if (unparseable && attempt === 0) continue;
    if (unparseable) {
      return {
        verdict: 'inconclusive',
        details: 'LLM judge returned unparseable response after retry',
      };
    }

    if (forward !== reverse) {
      return {
        verdict: 'inconclusive',
        details: `LLM Judge: orderings disagree (forward=${forward}, reverse=${reverse})`,
      };
    }

    return {
      verdict: forward === 'consistent' ? 'pass' : 'regressed',
      details: `LLM Judge: both orderings agree — ${forward}`,
    };
  }

  return { verdict: 'inconclusive', details: 'LLM judge exhausted retries' };
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { InferenceAdapter, ComparisonResult } from '../../types.js';
|
|
2
|
+
import { schemaCheck } from './schema.js';
|
|
3
|
+
import { embeddingCheck } from './embedding.js';
|
|
4
|
+
import { llmJudge } from './judge.js';
|
|
5
|
+
|
|
6
|
+
/** Options accepted by `comparePipeline`. */
export interface PipelineOptions {
  /** Similarity threshold handed to the embedding tier. */
  threshold: number;
  /** When truthy, the embedding tier is skipped entirely. */
  skipEmbedding?: boolean;
}
|
|
10
|
+
|
|
11
|
+
/**
 * Compares a baseline output with a current output using up to three tiers,
 * stopping at the first tier that yields a pass:
 *
 * 1. structural schema match,
 * 2. embedding similarity (skipped when `options.skipEmbedding` is set, or
 *    when the embedding call throws),
 * 3. LLM judge, whose verdict is returned as-is.
 *
 * Every returned result carries `scenarioId: 0`.
 */
export async function comparePipeline(
  baseline: string,
  current: string,
  inference: InferenceAdapter,
  options: PipelineOptions
): Promise<ComparisonResult> {
  // Tier 1: Schema check (FREE)
  const sameShape = schemaCheck(baseline, current);
  if (sameShape) {
    return { scenarioId: 0, verdict: 'pass', tier: 1, details: 'Schema match' };
  }

  // Tier 2: Embedding similarity (CHEAP) — skip if unavailable
  const embeddingEnabled = !options.skipEmbedding;
  if (embeddingEnabled) {
    try {
      const { similarity, pass } = await embeddingCheck(
        baseline,
        current,
        inference,
        options.threshold
      );
      if (pass) {
        return {
          scenarioId: 0,
          verdict: 'pass',
          tier: 2,
          similarity,
          details: `Embedding similarity: ${similarity.toFixed(4)}`,
        };
      }
    } catch {
      // Embedding not available — fall through to Tier 3
    }
  }

  // Tier 3: LLM Judge (EXPENSIVE)
  const { verdict, details } = await llmJudge(baseline, current, inference);
  return { scenarioId: 0, verdict, tier: 3, details };
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
 * Reduces Markdown-like text to a structural skeleton.
 *
 * Each line is trimmed and replaced by a placeholder: headings keep their
 * `#` prefix followed by `[heading]`, bullet items become `- [item]`,
 * numbered items become `1. [item]`, fence lines become three backticks,
 * blank lines stay empty, and anything else becomes `[content]`. Runs of
 * consecutive `[content]` lines are collapsed to one.
 *
 * @param text - Text to summarise; a falsy value yields `''`.
 * @returns The trimmed skeleton.
 */
export function extractSchema(text: string): string {
  if (!text) return '';

  const headingPrefix = /^#{1,6}\s/;
  const bulletPrefix = /^[-*+]\s/;
  const numberedPrefix = /^\d+\.\s/;
  const fencePrefix = /^```/;

  const toPlaceholder = (line: string): string => {
    const trimmed = line.trim();
    if (trimmed === '') return '';
    if (headingPrefix.test(trimmed)) {
      return trimmed.replace(/^(#{1,6}\s).*/, '$1[heading]');
    }
    if (bulletPrefix.test(trimmed)) return '- [item]';
    if (numberedPrefix.test(trimmed)) return '1. [item]';
    if (fencePrefix.test(trimmed)) return '```';
    return '[content]';
  };

  const placeholders: string[] = [];
  for (const line of text.split('\n')) {
    placeholders.push(toPlaceholder(line));
  }

  const skeleton = placeholders.join('\n');
  const collapsed = skeleton
    .replace(/(\[content\]\n)+\[content\]/g, '[content]')
    .replace(/(\[content\]\n)+/g, '[content]\n');
  return collapsed.trim();
}
|
|
19
|
+
|
|
20
|
+
/**
 * Reports whether two texts reduce to the same structural skeleton
 * according to `extractSchema`.
 */
export function schemaCheck(baseline: string, current: string): boolean {
  const baselineShape = extractSchema(baseline);
  const currentShape = extractSchema(current);
  return baselineShape === currentShape;
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { VarianceEnvelope, VarianceEnvelopeRun } from '../../types.js';
|
|
2
|
+
import { cosineSimilarity } from './embedding.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Summarises a set of runs as a centroid embedding plus a radius.
 *
 * The centroid is the component-wise mean of the run embeddings. The radius
 * is the largest cosine distance (`1 - similarity`) between any run and the
 * centroid.
 *
 * @param scenarioId - Identifier stored on the envelope.
 * @param runs - Runs to summarise; must be non-empty with embeddings of equal length.
 * @returns The envelope for the scenario.
 * @throws RangeError when `runs` is empty or the embeddings differ in length.
 */
export function computeEnvelope(scenarioId: number, runs: VarianceEnvelopeRun[]): VarianceEnvelope {
  // An empty list previously failed with "cannot read properties of undefined".
  if (runs.length === 0) {
    throw new RangeError('computeEnvelope requires at least one run');
  }

  const dims = runs[0].embedding.length;
  // A shorter embedding would inject NaN into the centroid; a longer one would be truncated.
  for (const run of runs) {
    if (run.embedding.length !== dims) {
      throw new RangeError(
        `computeEnvelope: embedding length mismatch (${run.embedding.length} vs ${dims})`
      );
    }
  }

  const centroid = new Array<number>(dims).fill(0);
  for (const run of runs) {
    for (let i = 0; i < dims; i++) {
      centroid[i] += run.embedding[i];
    }
  }
  for (let i = 0; i < dims; i++) {
    centroid[i] /= runs.length;
  }

  let maxDistance = 0;
  for (const run of runs) {
    const distance = 1 - cosineSimilarity(run.embedding, centroid);
    if (distance > maxDistance) maxDistance = distance;
  }

  return { scenario_id: scenarioId, runs, centroid, radius: maxDistance };
}
|
|
23
|
+
|
|
24
|
+
/**
 * Tests whether an embedding is close enough to an envelope's centroid.
 *
 * The required similarity is `threshold` relaxed by the envelope's radius.
 *
 * @returns `true` when the cosine similarity to the centroid is at least
 *          `threshold - envelope.radius`.
 */
export function isWithinEnvelope(
  embedding: number[],
  envelope: VarianceEnvelope,
  threshold: number
): boolean {
  const requiredSimilarity = threshold - envelope.radius;
  const actualSimilarity = cosineSimilarity(embedding, envelope.centroid);
  return actualSimilarity >= requiredSimilarity;
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import type { InferenceAdapter, EvalsFile } from '../types.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Builds the prompt that asks the model to generate test scenarios for a skill.
 *
 * @param skillContent - Skill definition text, embedded between `---` markers.
 * @returns The complete prompt, including the expected JSON format and the
 *          requirements list.
 */
export function buildGeneratorPrompt(skillContent: string): string {
  const promptLines = [
    'You are a test case generator for AI skills. Read the following skill definition and generate 5-8 realistic test scenarios.',
    '',
    'SKILL DEFINITION:',
    '---',
    `${skillContent}`,
    '---',
    '',
    'Generate test scenarios as JSON with this exact format:',
    '{',
    '"skill_name": "<name from skill>",',
    '"evals": [',
    '{',
    '"id": 1,',
    '"prompt": "<realistic user prompt that would trigger this skill>",',
    '"expected_output": "<human-readable description of expected behavior>",',
    '"assertions": ["<verifiable statement about the output>"]',
    '}',
    ']',
    '}',
    '',
    'Requirements:',
    '- Include happy path scenarios (normal use cases)',
    '- Include edge cases (empty input, malformed input, boundary conditions)',
    '- Include at least one negative test (input the skill should handle gracefully)',
    '- Prompts should be realistic — the way a real user would type them',
    '- Each assertion should be specific and verifiable',
    '- Return ONLY the JSON, no markdown wrapping',
  ];
  return promptLines.join('\n');
}
|
|
32
|
+
|
|
33
|
+
/**
 * Returns the payload of a model reply with any Markdown fence removed.
 *
 * When the text contains a fenced block (``` or ```json) the trimmed
 * content of the first such block is returned; otherwise the whole text is
 * returned trimmed.
 */
function extractJSON(text: string): string {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(text);
  const payload = fenced ? fenced[1] : text;
  return payload.trim();
}
|
|
38
|
+
|
|
39
|
+
/**
 * Asks the model to generate eval scenarios for a skill and normalises the reply.
 *
 * @param skillContent - Skill definition text sent to the model.
 * @param skillName - Name used when the reply carries no usable `skill_name`.
 * @param inference - Adapter used for the chat request.
 * @returns The evals file built from the reply. A missing or falsy `id`
 *          becomes the 1-based position; a missing `expected_output`, `files`
 *          or `assertions` becomes `''` or `[]`.
 * @throws Error when the reply is not valid JSON, is not an object, has no
 *         `evals` array, or contains an entry without a string `prompt`.
 */
export async function generateEvals(
  skillContent: string,
  skillName: string,
  inference: InferenceAdapter
): Promise<EvalsFile> {
  const prompt = buildGeneratorPrompt(skillContent);
  const response = await inference.chat(
    [{ role: 'user', content: prompt }],
    { temperature: 0.7, responseFormat: 'json' }
  );

  let parsed: unknown;
  try {
    parsed = JSON.parse(extractJSON(response));
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Eval generator returned invalid JSON: ${reason}`);
  }
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error('Eval generator response is not a JSON object');
  }

  const { skill_name: generatedName, evals } = parsed as {
    skill_name?: unknown;
    evals?: unknown;
  };
  if (!Array.isArray(evals)) {
    throw new Error('Eval generator response is missing an "evals" array');
  }

  // Keeps only the string members of an array; any other value yields [].
  const stringsOf = (value: unknown): string[] =>
    Array.isArray(value) ? value.filter((item): item is string => typeof item === 'string') : [];

  return {
    skill_name: typeof generatedName === 'string' && generatedName !== '' ? generatedName : skillName,
    generated_by: 'snapeval v1.0.0',
    evals: evals.map((entry: unknown, index: number) => {
      if (typeof entry !== 'object' || entry === null) {
        throw new Error(`Eval generator entry #${index + 1} is not an object`);
      }
      const fields = entry as Record<string, unknown>;
      // An eval without a prompt cannot be run, so fail here rather than downstream.
      if (typeof fields.prompt !== 'string') {
        throw new Error(`Eval generator entry #${index + 1} has no string "prompt"`);
      }
      return {
        // An id of 0 or a non-numeric id falls back to the 1-based position.
        id: typeof fields.id === 'number' && fields.id ? fields.id : index + 1,
        prompt: fields.prompt,
        expected_output: typeof fields.expected_output === 'string' ? fields.expected_output : '',
        files: stringsOf(fields.files),
        assertions: stringsOf(fields.assertions),
      };
    }),
  };
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import * as crypto from 'node:crypto';
|
|
4
|
+
import type { SkillOutput, Snapshot } from '../types.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Reads and writes per-scenario snapshot files under `<evalsDir>/snapshots`.
 *
 * Each snapshot is stored as `scenario-<id>.snap.json`; approvals are also
 * appended to `.audit-log.jsonl` in the same directory.
 */
export class SnapshotManager {
  private snapshotsDir: string;

  /** @param evalsDir - Directory that contains (or will contain) the `snapshots` folder. */
  constructor(private evalsDir: string) {
    this.snapshotsDir = path.join(evalsDir, 'snapshots');
  }

  /** Absolute or relative path of the snapshot file for `scenarioId`. */
  private snapshotPath(scenarioId: number): string {
    const fileName = `scenario-${scenarioId}.snap.json`;
    return path.join(this.snapshotsDir, fileName);
  }

  /**
   * Writes (or overwrites) the snapshot for a scenario, creating the
   * snapshots directory when needed. `approved_by` is always stored as `null`.
   */
  saveSnapshot(scenarioId: number, prompt: string, output: SkillOutput, runs: number = 1): void {
    const snapshot: Snapshot = {
      scenario_id: scenarioId,
      prompt,
      output,
      captured_at: new Date().toISOString(),
      runs,
      approved_by: null,
    };
    fs.mkdirSync(this.snapshotsDir, { recursive: true });
    const serialized = JSON.stringify(snapshot, null, 2);
    fs.writeFileSync(this.snapshotPath(scenarioId), serialized);
  }

  /** Returns the stored snapshot for `scenarioId`, or `null` when no file exists. */
  loadSnapshot(scenarioId: number): Snapshot | null {
    const filePath = this.snapshotPath(scenarioId);
    if (!fs.existsSync(filePath)) {
      return null;
    }
    const contents = fs.readFileSync(filePath, 'utf-8');
    return JSON.parse(contents);
  }

  /**
   * Replaces the snapshot for a scenario with `newOutput` and appends an
   * audit entry holding the first 8 hex characters of the SHA-256 of the
   * previous and new raw outputs (`'none'` when there was no previous snapshot).
   */
  approve(scenarioId: number, prompt: string, newOutput: SkillOutput): void {
    const shortHash = (text: string): string =>
      crypto.createHash('sha256').update(text).digest('hex').slice(0, 8);

    const previous = this.loadSnapshot(scenarioId);
    const previousHash = previous ? shortHash(previous.output.raw) : 'none';
    const newHash = shortHash(newOutput.raw);

    this.saveSnapshot(scenarioId, prompt, newOutput);

    const auditEntry = {
      scenario_id: scenarioId,
      approved_at: new Date().toISOString(),
      previous_hash: previousHash,
      new_hash: newHash,
    };
    const auditPath = path.join(this.snapshotsDir, '.audit-log.jsonl');
    fs.appendFileSync(auditPath, JSON.stringify(auditEntry) + '\n');
  }

  /** Scenario ids that have a snapshot file, in ascending numeric order. */
  listSnapshotIds(): number[] {
    if (!fs.existsSync(this.snapshotsDir)) {
      return [];
    }
    const ids: number[] = [];
    for (const fileName of fs.readdirSync(this.snapshotsDir)) {
      const match = /^scenario-(\d+)\.snap\.json$/.exec(fileName);
      if (match) {
        ids.push(parseInt(match[1]));
      }
    }
    return ids.sort((left, right) => left - right);
  }
}
|
package/src/errors.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
 * Base class for snapeval errors. Carries the process exit code associated
 * with the failure.
 */
export class SnapevalError extends Error {
  /** Exit code attached to this error; `2` unless the constructor is told otherwise. */
  public exitCode: number;

  constructor(message: string, exitCode: number = 2) {
    super(message);
    this.exitCode = exitCode;
    this.name = 'SnapevalError';
  }
}
|
|
7
|
+
|
|
8
|
+
/** Raised when a required adapter cannot be used; the message includes an install hint. */
export class AdapterNotAvailableError extends SnapevalError {
  /**
   * @param adapterName - Name of the unavailable adapter.
   * @param installHint - Guidance appended to the message.
   */
  constructor(adapterName: string, installHint: string) {
    const message = `${adapterName} is not available. ${installHint}`;
    super(message);
    this.name = 'AdapterNotAvailableError';
  }
}
|
|
14
|
+
|
|
15
|
+
/** Raised when an adapter reports that its rate limit has been exceeded. */
export class RateLimitError extends SnapevalError {
  /** @param adapterName - Name of the adapter that hit its rate limit. */
  constructor(adapterName: string) {
    const message = `${adapterName} rate limit exceeded. Try again later or use a different adapter.`;
    super(message);
    this.name = 'RateLimitError';
  }
}
|
|
21
|
+
|
|
22
|
+
/** Raised when a scenario does not finish within its time limit. */
export class TimeoutError extends SnapevalError {
  /**
   * @param scenarioId - Identifier of the scenario that timed out.
   * @param timeoutMs - Time limit, in milliseconds, reported in the message.
   */
  constructor(scenarioId: number, timeoutMs: number) {
    const message = `Scenario ${scenarioId} timed out after ${timeoutMs}ms.`;
    super(message);
    this.name = 'TimeoutError';
  }
}
|
|
28
|
+
|
|
29
|
+
/** Raised when no baseline snapshots exist for a skill; uses exit code `2`. */
export class NoBaselineError extends SnapevalError {
  /** @param skillPath - Skill directory whose `evals/snapshots/` folder was expected. */
  constructor(skillPath: string) {
    const message = `No baselines found at ${skillPath}/evals/snapshots/. Run \`snapeval capture\` first.`;
    super(message, 2);
    this.name = 'NoBaselineError';
  }
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// === Adapter Interfaces ===
|
|
2
|
+
/** Result of one skill invocation: the raw text plus metadata about the call. */
export interface SkillOutput {
  /** Output text; this is the field `SnapshotManager.approve` hashes for the audit log. */
  raw: string;
  /** Bookkeeping recorded alongside the output. */
  metadata: {
    tokens: number;
    durationMs: number;
    model: string;
    adapter: string;
  };
}

/** Adapter that runs a skill and reports whether it can be used. */
export interface SkillAdapter {
  name: string;
  /** Runs the skill at `skillPath` with `prompt` and optional `files`. */
  invoke(skillPath: string, prompt: string, files?: string[]): Promise<SkillOutput>;
  isAvailable(): Promise<boolean>;
}

/** One chat message passed to `InferenceAdapter.chat`. */
export interface Message {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

/** Optional settings for a chat request. */
export interface ChatOptions {
  temperature?: number;
  maxTokens?: number;
  responseFormat?: 'text' | 'json';
}

/** Adapter providing chat completion, text embedding and cost estimation. */
export interface InferenceAdapter {
  name: string;
  /** Sends `messages` and resolves with the reply text. */
  chat(messages: Message[], options?: ChatOptions): Promise<string>;
  /** Resolves with the embedding vector for `text`. */
  embed(text: string): Promise<number[]>;
  estimateCost(tokens: number): number;
}

/** Complete result set handed to a `ReportAdapter`. */
export interface EvalResults {
  skillName: string;
  scenarios: ScenarioResult[];
  summary: BenchmarkSummary;
  timing: TimingData;
}

/** Adapter that presents or persists evaluation results. */
export interface ReportAdapter {
  name: string;
  report(results: EvalResults): Promise<void>;
}
|
|
47
|
+
|
|
48
|
+
// === Eval Format (agentskills.io) ===
|
|
49
|
+
/** One eval scenario, in the shape `generateEvals` produces. */
export interface EvalCase {
  id: number;
  /** Prompt text for the scenario. */
  prompt: string;
  expected_output: string;
  files?: string[];
  assertions?: string[];
}

/** Collection of eval scenarios for one skill. */
export interface EvalsFile {
  skill_name: string;
  /** Identifies the tool that produced the file. */
  generated_by: string;
  evals: EvalCase[];
}
|
|
62
|
+
|
|
63
|
+
// === Snapshot Format ===
|
|
64
|
+
/** Stored baseline for one scenario, as written by `SnapshotManager.saveSnapshot`. */
export interface Snapshot {
  scenario_id: number;
  prompt: string;
  output: SkillOutput;
  /** ISO-8601 timestamp taken when the snapshot was saved. */
  captured_at: string;
  runs: number;
  approved_by: string | null;
}

/** One run contributing to a variance envelope: its raw text and its embedding. */
export interface VarianceEnvelopeRun {
  raw: string;
  embedding: number[];
}

/** Envelope produced by `computeEnvelope` for a scenario. */
export interface VarianceEnvelope {
  scenario_id: number;
  runs: VarianceEnvelopeRun[];
  /** Component-wise mean of the run embeddings. */
  centroid: number[];
  /** Largest cosine distance between any run and the centroid. */
  radius: number;
}
|
|
84
|
+
|
|
85
|
+
// === Comparison Results ===
|
|
86
|
+
/** Possible outcomes of comparing a new output with its baseline. */
export type ComparisonVerdict =
  | 'pass'
  | 'regressed'
  | 'inconclusive'
  | 'error';

/** Result of one baseline-versus-current comparison. */
export interface ComparisonResult {
  scenarioId: number;
  verdict: ComparisonVerdict;
  /** Tier that decided the verdict in `comparePipeline`: 1 schema, 2 embedding, 3 LLM judge. */
  tier: 1 | 2 | 3;
  /** Embedding similarity; `comparePipeline` sets it only on a tier 2 pass. */
  similarity?: number;
  details: string;
}
|
|
95
|
+
|
|
96
|
+
// === Grading ===
|
|
97
|
+
/** Outcome of checking a single assertion. */
export interface AssertionResult {
  text: string;
  passed: boolean;
  evidence: string;
}

/** Aggregate counts over a set of assertion results. */
export interface GradingSummary {
  passed: number;
  failed: number;
  total: number;
  pass_rate: number;
}

/** Per-assertion results together with their summary. */
export interface GradingFile {
  assertion_results: AssertionResult[];
  summary: GradingSummary;
}
|
|
114
|
+
|
|
115
|
+
// === Timing & Benchmark ===
|
|
116
|
+
/** Token count and duration for a run or a set of runs. */
export interface TimingData {
  total_tokens: number;
  duration_ms: number;
}

/** Aggregate figures for a whole evaluation. */
export interface BenchmarkSummary {
  total_scenarios: number;
  passed: number;
  regressed: number;
  pass_rate: number;
  total_tokens: number;
  total_cost_usd: number;
  total_duration_ms: number;
  /** Number of scenarios counted against each comparison tier. */
  tier_breakdown: {
    tier1_schema: number;
    tier2_embedding: number;
    tier3_llm_judge: number;
  };
}
|
|
135
|
+
|
|
136
|
+
// === Scenario Result ===
|
|
137
|
+
/** Everything recorded for one evaluated scenario. */
export interface ScenarioResult {
  scenarioId: number;
  prompt: string;
  comparison: ComparisonResult;
  /** Present only when assertion grading was performed. */
  grading?: GradingFile;
  timing: TimingData;
  newOutput: SkillOutput;
}
|
|
145
|
+
|
|
146
|
+
// === Config ===
|
|
147
|
+
/** Effective configuration; `DEFAULT_CONFIG` supplies a value for every field. */
export interface SnapevalConfig {
  /** Skill adapter name (default `'copilot-cli'`). */
  adapter: string;
  /** Inference adapter name (default `'auto'`). */
  inference: string;
  /** Similarity threshold (default `0.85`). */
  threshold: number;
  /** Number of runs (default `3`). */
  runs: number;
  /** Budget string understood by `BudgetEngine`: `'unlimited'` or a numeric string. */
  budget: string;
}
|