snapeval 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/LICENSE +21 -0
  2. package/README.md +194 -0
  3. package/bin/snapeval.ts +226 -0
  4. package/dist/bin/snapeval.d.ts +2 -0
  5. package/dist/bin/snapeval.js +191 -0
  6. package/dist/bin/snapeval.js.map +1 -0
  7. package/dist/src/adapters/inference/copilot.d.ts +9 -0
  8. package/dist/src/adapters/inference/copilot.js +25 -0
  9. package/dist/src/adapters/inference/copilot.js.map +1 -0
  10. package/dist/src/adapters/inference/github-models.d.ts +9 -0
  11. package/dist/src/adapters/inference/github-models.js +62 -0
  12. package/dist/src/adapters/inference/github-models.js.map +1 -0
  13. package/dist/src/adapters/inference/resolve.d.ts +2 -0
  14. package/dist/src/adapters/inference/resolve.js +49 -0
  15. package/dist/src/adapters/inference/resolve.js.map +1 -0
  16. package/dist/src/adapters/report/json.d.ts +7 -0
  17. package/dist/src/adapters/report/json.js +39 -0
  18. package/dist/src/adapters/report/json.js.map +1 -0
  19. package/dist/src/adapters/report/terminal.d.ts +5 -0
  20. package/dist/src/adapters/report/terminal.js +42 -0
  21. package/dist/src/adapters/report/terminal.js.map +1 -0
  22. package/dist/src/adapters/skill/copilot-cli.d.ts +6 -0
  23. package/dist/src/adapters/skill/copilot-cli.js +51 -0
  24. package/dist/src/adapters/skill/copilot-cli.js.map +1 -0
  25. package/dist/src/commands/approve.d.ts +5 -0
  26. package/dist/src/commands/approve.js +40 -0
  27. package/dist/src/commands/approve.js.map +1 -0
  28. package/dist/src/commands/capture.d.ts +4 -0
  29. package/dist/src/commands/capture.js +18 -0
  30. package/dist/src/commands/capture.js.map +1 -0
  31. package/dist/src/commands/check.d.ts +6 -0
  32. package/dist/src/commands/check.js +68 -0
  33. package/dist/src/commands/check.js.map +1 -0
  34. package/dist/src/commands/init.d.ts +2 -0
  35. package/dist/src/commands/init.js +27 -0
  36. package/dist/src/commands/init.js.map +1 -0
  37. package/dist/src/commands/report.d.ts +4 -0
  38. package/dist/src/commands/report.js +26 -0
  39. package/dist/src/commands/report.js.map +1 -0
  40. package/dist/src/config.d.ts +3 -0
  41. package/dist/src/config.js +30 -0
  42. package/dist/src/config.js.map +1 -0
  43. package/dist/src/engine/budget.d.ts +10 -0
  44. package/dist/src/engine/budget.js +25 -0
  45. package/dist/src/engine/budget.js.map +1 -0
  46. package/dist/src/engine/comparison/embedding.d.ts +6 -0
  47. package/dist/src/engine/comparison/embedding.js +19 -0
  48. package/dist/src/engine/comparison/embedding.js.map +1 -0
  49. package/dist/src/engine/comparison/judge.d.ts +8 -0
  50. package/dist/src/engine/comparison/judge.js +64 -0
  51. package/dist/src/engine/comparison/judge.js.map +1 -0
  52. package/dist/src/engine/comparison/pipeline.d.ts +6 -0
  53. package/dist/src/engine/comparison/pipeline.js +31 -0
  54. package/dist/src/engine/comparison/pipeline.js.map +1 -0
  55. package/dist/src/engine/comparison/schema.d.ts +2 -0
  56. package/dist/src/engine/comparison/schema.js +28 -0
  57. package/dist/src/engine/comparison/schema.js.map +1 -0
  58. package/dist/src/engine/comparison/variance.d.ts +3 -0
  59. package/dist/src/engine/comparison/variance.js +26 -0
  60. package/dist/src/engine/comparison/variance.js.map +1 -0
  61. package/dist/src/engine/generator.d.ts +3 -0
  62. package/dist/src/engine/generator.js +52 -0
  63. package/dist/src/engine/generator.js.map +1 -0
  64. package/dist/src/engine/snapshot.d.ts +11 -0
  65. package/dist/src/engine/snapshot.js +46 -0
  66. package/dist/src/engine/snapshot.js.map +1 -0
  67. package/dist/src/errors.d.ts +16 -0
  68. package/dist/src/errors.js +33 -0
  69. package/dist/src/errors.js.map +1 -0
  70. package/dist/src/types.d.ts +125 -0
  71. package/dist/src/types.js +2 -0
  72. package/dist/src/types.js.map +1 -0
  73. package/package.json +53 -0
  74. package/plugin.json +9 -0
  75. package/scripts/snapeval-cli.sh +7 -0
  76. package/skills/snapeval/SKILL.md +51 -0
  77. package/src/adapters/inference/copilot.ts +30 -0
  78. package/src/adapters/inference/github-models.ts +74 -0
  79. package/src/adapters/inference/resolve.ts +70 -0
  80. package/src/adapters/report/json.ts +64 -0
  81. package/src/adapters/report/terminal.ts +59 -0
  82. package/src/adapters/skill/copilot-cli.ts +60 -0
  83. package/src/commands/approve.ts +58 -0
  84. package/src/commands/capture.ts +25 -0
  85. package/src/commands/check.ts +86 -0
  86. package/src/commands/init.ts +38 -0
  87. package/src/commands/report.ts +36 -0
  88. package/src/config.ts +37 -0
  89. package/src/engine/budget.ts +27 -0
  90. package/src/engine/comparison/embedding.ts +26 -0
  91. package/src/engine/comparison/judge.ts +78 -0
  92. package/src/engine/comparison/pipeline.ts +43 -0
  93. package/src/engine/comparison/schema.ts +22 -0
  94. package/src/engine/comparison/variance.ts +31 -0
  95. package/src/engine/generator.ts +61 -0
  96. package/src/engine/snapshot.ts +48 -0
  97. package/src/errors.ts +34 -0
  98. package/src/types.ts +153 -0
package/src/config.ts ADDED
@@ -0,0 +1,37 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import type { SnapevalConfig } from './types.js';
4
+
5
+ export const DEFAULT_CONFIG: SnapevalConfig = { // Lowest-precedence settings; resolveConfig spreads config-file and CLI values over these.
6
+ adapter: 'copilot-cli',
7
+ inference: 'auto',
8
+ threshold: 0.85, // same value as embeddingCheck's default similarity threshold
9
+ runs: 3,
10
+ budget: 'unlimited', // BudgetEngine treats this exact literal as "no cap"
11
+ };
12
+
13
/**
 * Reads `snapeval.config.json` from `dirPath`, if it exists.
 *
 * @param dirPath Directory expected to contain the config file.
 * @returns The parsed config object, or `null` when there is no config file
 *          (or the file contains the JSON literal `null`).
 * @throws SyntaxError when the file is not valid JSON; the message names the file.
 * @throws TypeError when the top-level JSON value is not an object.
 */
function loadConfigFile(dirPath: string): Partial<SnapevalConfig> | null {
  const configPath = path.join(dirPath, 'snapeval.config.json');

  let raw: string;
  try {
    // Read directly rather than existsSync + read, so a file that disappears
    // between the two calls cannot cause a spurious crash.
    raw = fs.readFileSync(configPath, 'utf-8');
  } catch (err: unknown) {
    const code = err instanceof Error ? (err as { code?: unknown }).code : undefined;
    if (code === 'ENOENT' || code === 'ENOTDIR') return null;
    throw err;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(`Invalid JSON in ${configPath}: ${reason}`);
  }

  if (parsed === null) return null;
  // Arrays and primitives would be spread into the final config as garbage keys.
  if (typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new TypeError(`${configPath} must contain a JSON object`);
  }
  return parsed as Partial<SnapevalConfig>;
}
19
+
20
/**
 * Produces the effective configuration by layering sources, lowest precedence
 * first: built-in defaults, the project-root config file, the skill-directory
 * config file, then CLI flags (flags whose value is `undefined` are ignored).
 *
 * @param cliFlags Values supplied on the command line.
 * @param projectRoot Directory searched for the project-level config file.
 * @param skillDir Optional directory searched for a skill-level config file.
 * @returns A fully populated configuration.
 */
export function resolveConfig(
  cliFlags: Partial<SnapevalConfig>,
  projectRoot: string,
  skillDir?: string
): SnapevalConfig {
  let skillLayer: Partial<SnapevalConfig> = {};
  if (skillDir) {
    skillLayer = loadConfigFile(skillDir) ?? {};
  }
  const projectLayer = loadConfigFile(projectRoot) ?? {};
  const cliLayer = stripUndefined(cliFlags);

  return { ...DEFAULT_CONFIG, ...projectLayer, ...skillLayer, ...cliLayer };
}
34
+
35
/**
 * Returns a shallow copy of `obj` without the keys whose value is `undefined`.
 * Other falsy values (`null`, `0`, `''`, `false`) are kept.
 */
function stripUndefined(obj: Record<string, unknown>): Record<string, unknown> {
  const kept: Array<[string, unknown]> = [];
  for (const entry of Object.entries(obj)) {
    if (entry[1] !== undefined) {
      kept.push(entry);
    }
  }
  return Object.fromEntries(kept);
}
package/src/engine/budget.ts ADDED
@@ -0,0 +1,27 @@
1
/**
 * Tracks accumulated cost against an optional spending cap.
 *
 * The cap is given as a string: the literal `'unlimited'` disables it, anything
 * else is parsed as a number.
 */
export class BudgetEngine {
  /** Price applied per one million tokens when estimating a paid-model scenario. */
  private static readonly COST_PER_MILLION_TOKENS = 0.15;

  private spent = 0;
  private readonly cap: number | null;

  /**
   * @param budget `'unlimited'` or a numeric string such as `'2.50'`.
   * @throws RangeError when `budget` is neither `'unlimited'` nor a non-negative
   *         number. Previously such input produced a `NaN` cap, which silently
   *         disabled budget enforcement.
   */
  constructor(budget: string) {
    if (budget === 'unlimited') {
      this.cap = null;
      return;
    }
    const parsed = parseFloat(budget);
    if (Number.isNaN(parsed) || parsed < 0) {
      throw new RangeError(
        `Invalid budget "${budget}": expected "unlimited" or a non-negative number`
      );
    }
    this.cap = parsed;
  }

  /** Total cost recorded so far via {@link addCost}. */
  get totalCost(): number {
    return this.spent;
  }

  /** Adds `amount` to the running total. */
  addCost(amount: number): void {
    this.spent += amount;
  }

  /** True once spending is strictly greater than the cap; never true when unlimited. */
  isExceeded(): boolean {
    if (this.cap === null) return false;
    return this.spent > this.cap;
  }

  /**
   * Estimates the cost of a scenario from its token count.
   *
   * @param tokens Number of tokens the scenario is expected to use.
   * @param isFreeModel When true the estimate is always 0.
   */
  estimateScenarioCost(tokens: number, isFreeModel: boolean): number {
    if (isFreeModel) return 0;
    return (tokens / 1_000_000) * BudgetEngine.COST_PER_MILLION_TOKENS;
  }

  /** Budget left before the cap is reached (floored at 0), or `null` when unlimited. */
  get remaining(): number | null {
    if (this.cap === null) return null;
    return Math.max(0, this.cap - this.spent);
  }
}
package/src/engine/comparison/embedding.ts ADDED
@@ -0,0 +1,26 @@
1
+ import type { InferenceAdapter } from '../../types.js';
2
+
3
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param a First vector.
 * @param b Second vector; must have the same length as `a`.
 * @returns `dot(a, b) / (|a| * |b|)`, or 0 when either vector has zero magnitude
 *          (including two empty vectors).
 * @throws RangeError when the lengths differ. Previously a mismatch produced
 *         `NaN` or silently ignored the tail of the longer vector.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vector length mismatch (${a.length} vs ${b.length})`
    );
  }

  let dot = 0;
  let magA = 0;
  let magB = 0;
  for (let i = 0; i < a.length; i++) {
    const x = a[i] as number;
    const y = b[i] as number;
    dot += x * y;
    magA += x * x;
    magB += y * y;
  }

  const denom = Math.sqrt(magA) * Math.sqrt(magB);
  // A zero vector has no direction; report "no similarity" instead of dividing by zero.
  return denom === 0 ? 0 : dot / denom;
}
13
+
14
/**
 * Embeds both texts with the given adapter and compares them by cosine similarity.
 *
 * @param baseline Reference text.
 * @param current Text being checked against the reference.
 * @param inference Adapter whose `embed` method produces the vectors.
 * @param threshold Minimum similarity (inclusive) that counts as a pass.
 * @returns The similarity score and whether it reached the threshold.
 */
export async function embeddingCheck(
  baseline: string,
  current: string,
  inference: InferenceAdapter,
  threshold: number = 0.85
): Promise<{ similarity: number; pass: boolean }> {
  // Start both requests before awaiting so they run concurrently.
  const pendingBaseline = inference.embed(baseline);
  const pendingCurrent = inference.embed(current);
  const [baselineVec, currentVec] = await Promise.all([pendingBaseline, pendingCurrent]);

  const similarity = cosineSimilarity(baselineVec, currentVec);
  const pass = similarity >= threshold;
  return { similarity, pass };
}
package/src/engine/comparison/judge.ts ADDED
@@ -0,0 +1,78 @@
1
+ import type { InferenceAdapter, ComparisonVerdict } from '../../types.js';
2
+
3
+ export function buildJudgePrompt(outputA: string, outputB: string): string { // Builds the judge prompt: both outputs are inlined verbatim between '---' delimiters and a JSON-only verdict is requested.
4
+ return `You are an AI output comparison judge. Compare these two outputs and determine if they are semantically consistent (same meaning, same key information) or different (changed behavior, missing information, or contradictory content).
5
+
6
+ OUTPUT A:
7
+ ---
8
+ ${outputA}
9
+ ---
10
+
11
+ OUTPUT B:
12
+ ---
13
+ ${outputB}
14
+ ---
15
+
16
+ Respond with JSON only: {"verdict": "consistent"} or {"verdict": "different"}`;
17
+ }
18
+
19
+ interface JudgeResult { // What llmJudge resolves to: a verdict plus a human-readable explanation.
20
+ verdict: ComparisonVerdict; // llmJudge only ever produces 'pass', 'regressed' or 'inconclusive'
21
+ details: string;
22
+ }
23
+
24
/**
 * Extracts the verdict from a judge model reply.
 *
 * Accepts bare JSON as well as JSON wrapped in a markdown code fence or padded
 * with whitespace, since models do not always honour "JSON only".
 *
 * @param response Raw text returned by the model.
 * @returns `'consistent'` or `'different'`, or `null` when the reply cannot be
 *          parsed or carries any other verdict value.
 */
function parseJudgeResponse(response: string): 'consistent' | 'different' | null {
  try {
    // Unwrap ```json ... ``` if present; otherwise parse the whole reply.
    const fenced = response.match(/```(?:json)?\s*([\s\S]*?)```/);
    const payload = (fenced?.[1] ?? response).trim();

    const parsed: unknown = JSON.parse(payload);
    if (typeof parsed !== 'object' || parsed === null) return null;

    const verdict = (parsed as { verdict?: unknown }).verdict;
    return verdict === 'consistent' || verdict === 'different' ? verdict : null;
  } catch {
    // Any parse failure is reported as "no usable verdict"; the caller decides whether to retry.
    return null;
  }
}
33
+
34
/**
 * Asks the judge model to compare the two texts in both orders, concurrently.
 *
 * @returns The parsed verdict for the (baseline, current) ordering as `forward`
 *          and for the swapped ordering as `reverse`; each is `null` when that
 *          reply could not be parsed.
 */
async function runJudgePair(
  baseline: string,
  current: string,
  inference: InferenceAdapter
): Promise<{ forward: string | null; reverse: string | null }> {
  const ask = (first: string, second: string): Promise<string> =>
    inference.chat([{ role: 'user', content: buildJudgePrompt(first, second) }], {
      temperature: 0,
      responseFormat: 'json',
    });

  const [forwardRaw, reverseRaw] = await Promise.all([
    ask(baseline, current),
    ask(current, baseline),
  ]);

  const forward = parseJudgeResponse(forwardRaw);
  const reverse = parseJudgeResponse(reverseRaw);
  return { forward, reverse };
}
51
+
52
/**
 * Judges whether `current` is semantically consistent with `baseline` using an LLM.
 *
 * The comparison is run in both orderings. If either reply is unparseable the
 * pair is retried once. When both orderings agree the result is `'pass'`
 * (consistent) or `'regressed'` (different); disagreement or a second
 * unparseable reply yields `'inconclusive'`.
 */
export async function llmJudge(
  baseline: string,
  current: string,
  inference: InferenceAdapter
): Promise<JudgeResult> {
  let attempt = 0;
  while (attempt < 2) {
    const pair = await runJudgePair(baseline, current, inference);
    const bothParsed = pair.forward !== null && pair.reverse !== null;

    if (!bothParsed) {
      if (attempt > 0) {
        return {
          verdict: 'inconclusive',
          details: 'LLM judge returned unparseable response after retry',
        };
      }
      attempt += 1;
      continue;
    }

    if (pair.forward !== pair.reverse) {
      return {
        verdict: 'inconclusive',
        details: `LLM Judge: orderings disagree (forward=${pair.forward}, reverse=${pair.reverse})`,
      };
    }

    const verdict: ComparisonVerdict = pair.forward === 'consistent' ? 'pass' : 'regressed';
    return {
      verdict,
      details: `LLM Judge: both orderings agree — ${pair.forward}`,
    };
  }
  // Not reachable with the loop above; kept so every code path returns a result.
  return { verdict: 'inconclusive', details: 'LLM judge exhausted retries' };
}
package/src/engine/comparison/pipeline.ts ADDED
@@ -0,0 +1,43 @@
1
+ import type { InferenceAdapter, ComparisonResult } from '../../types.js';
2
+ import { schemaCheck } from './schema.js';
3
+ import { embeddingCheck } from './embedding.js';
4
+ import { llmJudge } from './judge.js';
5
+
6
+ export interface PipelineOptions { // Options accepted by comparePipeline.
7
+ threshold: number; // forwarded to embeddingCheck as the minimum similarity for a Tier 2 pass
8
+ skipEmbedding?: boolean; // when truthy, Tier 2 is bypassed and a schema mismatch goes straight to the LLM judge
9
+ }
10
+
11
/**
 * Compares a baseline output with a current output using up to three tiers,
 * stopping at the first tier that passes:
 *
 *   1. structural schema equality (no inference calls),
 *   2. embedding similarity against `options.threshold` (skippable),
 *   3. LLM judge, whose verdict is returned as-is.
 *
 * The returned `scenarioId` is always 0.
 */
export async function comparePipeline(
  baseline: string,
  current: string,
  inference: InferenceAdapter,
  options: PipelineOptions
): Promise<ComparisonResult> {
  // Tier 1: Schema check (FREE)
  const sameShape = schemaCheck(baseline, current);
  if (sameShape) {
    return { scenarioId: 0, verdict: 'pass', tier: 1, details: 'Schema match' };
  }

  // Tier 2: Embedding similarity (CHEAP) — skip if unavailable
  const embeddingEnabled = !options.skipEmbedding;
  if (embeddingEnabled) {
    try {
      const { pass, similarity } = await embeddingCheck(
        baseline,
        current,
        inference,
        options.threshold
      );
      if (pass) {
        return {
          scenarioId: 0,
          verdict: 'pass',
          tier: 2,
          similarity,
          details: `Embedding similarity: ${similarity.toFixed(4)}`,
        };
      }
    } catch {
      // Embedding not available — fall through to Tier 3
    }
  }

  // Tier 3: LLM Judge (EXPENSIVE)
  const { verdict, details } = await llmJudge(baseline, current, inference);
  return { scenarioId: 0, verdict, tier: 3, details };
}
package/src/engine/comparison/schema.ts ADDED
@@ -0,0 +1,22 @@
1
/**
 * Reduces markdown-like text to a structural skeleton.
 *
 * Each line is replaced by a placeholder for its kind (heading with its level
 * kept, bullet item, numbered item, code fence, blank, or generic content),
 * consecutive content lines are collapsed into one, and the result is trimmed.
 *
 * @param text Text to summarise; empty input yields an empty string.
 */
export function extractSchema(text: string): string {
  if (!text) return '';

  const shapes: string[] = [];
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    let shape: string;
    if (/^#{1,6}\s/.test(line)) {
      shape = line.replace(/^(#{1,6}\s).*/, '$1[heading]');
    } else if (/^[-*+]\s/.test(line)) {
      shape = '- [item]';
    } else if (/^\d+\.\s/.test(line)) {
      shape = '1. [item]';
    } else if (/^```/.test(line)) {
      shape = '```';
    } else if (line === '') {
      shape = '';
    } else {
      shape = '[content]';
    }
    shapes.push(shape);
  }

  const joined = shapes.join('\n');
  // Collapse runs of adjacent content lines so paragraph length does not affect the schema.
  const collapsed = joined
    .replace(/(\[content\]\n)+\[content\]/g, '[content]')
    .replace(/(\[content\]\n)+/g, '[content]\n');
  return collapsed.trim();
}
19
+
20
/**
 * Reports whether two texts have the same structural skeleton, as computed by
 * `extractSchema`. The wording of the texts is not compared.
 */
export function schemaCheck(baseline: string, current: string): boolean {
  const baselineShape = extractSchema(baseline);
  const currentShape = extractSchema(current);
  return baselineShape === currentShape;
}
package/src/engine/comparison/variance.ts ADDED
@@ -0,0 +1,31 @@
1
+ import type { VarianceEnvelope, VarianceEnvelopeRun } from '../../types.js';
2
+ import { cosineSimilarity } from './embedding.js';
3
+
4
/**
 * Builds a variance envelope from several runs of the same scenario.
 *
 * The centroid is the element-wise mean of the run embeddings; the radius is
 * the largest cosine distance (1 - similarity) from any run to that centroid.
 *
 * @param scenarioId Identifier stored on the envelope.
 * @param runs Runs to summarise; must be non-empty with equal-length embeddings.
 * @throws RangeError when `runs` is empty or the embeddings differ in length
 *         (previously an opaque TypeError or a NaN-filled centroid).
 */
export function computeEnvelope(scenarioId: number, runs: VarianceEnvelopeRun[]): VarianceEnvelope {
  const first = runs[0];
  if (first === undefined) {
    throw new RangeError('computeEnvelope: at least one run is required');
  }
  const dims = first.embedding.length;

  const centroid = new Array<number>(dims).fill(0);
  for (const run of runs) {
    if (run.embedding.length !== dims) {
      throw new RangeError(
        `computeEnvelope: embedding length mismatch (expected ${dims}, got ${run.embedding.length})`
      );
    }
    for (let i = 0; i < dims; i++) {
      centroid[i] += run.embedding[i];
    }
  }
  for (let i = 0; i < dims; i++) {
    centroid[i] /= runs.length;
  }

  let maxDistance = 0;
  for (const run of runs) {
    const distance = 1 - cosineSimilarity(run.embedding, centroid);
    if (distance > maxDistance) maxDistance = distance;
  }

  return { scenario_id: scenarioId, runs, centroid, radius: maxDistance };
}
23
+
24
/**
 * Checks an embedding against a variance envelope.
 *
 * The required similarity to the centroid is `threshold` loosened by the
 * envelope's radius.
 *
 * @returns True when the similarity to the centroid is at least `threshold - radius`.
 */
export function isWithinEnvelope(
  embedding: number[],
  envelope: VarianceEnvelope,
  threshold: number
): boolean {
  const similarity = cosineSimilarity(embedding, envelope.centroid);
  const requiredSimilarity = threshold - envelope.radius;
  return similarity >= requiredSimilarity;
}
package/src/engine/generator.ts ADDED
@@ -0,0 +1,61 @@
1
+ import type { InferenceAdapter, EvalsFile } from '../types.js';
2
+
3
+ export function buildGeneratorPrompt(skillContent: string): string { // Builds the scenario-generation prompt: the skill text is inlined verbatim between '---' delimiters, followed by the JSON shape that generateEvals parses.
4
+ return `You are a test case generator for AI skills. Read the following skill definition and generate 5-8 realistic test scenarios.
5
+
6
+ SKILL DEFINITION:
7
+ ---
8
+ ${skillContent}
9
+ ---
10
+
11
+ Generate test scenarios as JSON with this exact format:
12
+ {
13
+ "skill_name": "<name from skill>",
14
+ "evals": [
15
+ {
16
+ "id": 1,
17
+ "prompt": "<realistic user prompt that would trigger this skill>",
18
+ "expected_output": "<human-readable description of expected behavior>",
19
+ "assertions": ["<verifiable statement about the output>"]
20
+ }
21
+ ]
22
+ }
23
+
24
+ Requirements:
25
+ - Include happy path scenarios (normal use cases)
26
+ - Include edge cases (empty input, malformed input, boundary conditions)
27
+ - Include at least one negative test (input the skill should handle gracefully)
28
+ - Prompts should be realistic — the way a real user would type them
29
+ - Each assertion should be specific and verifiable
30
+ - Return ONLY the JSON, no markdown wrapping`;
31
+ }
32
+
33
/**
 * Returns the JSON payload of a model reply.
 *
 * When the reply contains a markdown code fence (optionally tagged `json`), the
 * contents of the first fence are returned; otherwise the whole reply is. The
 * result is trimmed in both cases.
 */
function extractJSON(text: string): string {
  const fence = /```(?:json)?\s*([\s\S]*?)```/.exec(text);
  const payload = fence ? fence[1] : text;
  return payload.trim();
}
38
+
39
/**
 * Asks the inference adapter to generate eval scenarios for a skill and
 * normalises the reply into an `EvalsFile`.
 *
 * @param skillContent Full text of the skill definition, embedded in the prompt.
 * @param skillName Fallback name used when the reply has no usable `skill_name`.
 * @param inference Adapter used for the chat completion.
 * @returns The generated scenarios with defaults filled in for optional fields.
 * @throws SyntaxError when the reply is not valid JSON.
 * @throws TypeError when the reply is not an object with an `evals` array, or
 *         an entry is not an object with a string `prompt`.
 */
export async function generateEvals(
  skillContent: string,
  skillName: string,
  inference: InferenceAdapter
): Promise<EvalsFile> {
  const prompt = buildGeneratorPrompt(skillContent);
  const response = await inference.chat(
    [{ role: 'user', content: prompt }],
    { temperature: 0.7, responseFormat: 'json' }
  );

  let parsed: unknown;
  try {
    parsed = JSON.parse(extractJSON(response));
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(`Eval generator returned invalid JSON: ${reason}`);
  }

  // Model output is untrusted: check the shape before touching it.
  if (typeof parsed !== 'object' || parsed === null) {
    throw new TypeError('Eval generator response must be a JSON object');
  }
  const { skill_name: generatedName, evals } = parsed as { skill_name?: unknown; evals?: unknown };
  if (!Array.isArray(evals)) {
    throw new TypeError('Eval generator response is missing an "evals" array');
  }

  return {
    skill_name: typeof generatedName === 'string' && generatedName ? generatedName : skillName,
    // NOTE(review): version string is hard-coded and may lag the package version — confirm.
    generated_by: 'snapeval v1.0.0',
    evals: evals.map((entry: unknown, index: number) => {
      if (typeof entry !== 'object' || entry === null) {
        throw new TypeError(`Eval generator response: evals[${index}] is not an object`);
      }
      const candidate = entry as {
        id?: number;
        prompt?: unknown;
        expected_output?: string;
        files?: string[];
        assertions?: string[];
      };
      if (typeof candidate.prompt !== 'string') {
        throw new TypeError(`Eval generator response: evals[${index}] is missing a string "prompt"`);
      }
      return {
        // `||` (not `??`) is intentional: a missing or zero id falls back to the 1-based position.
        id: candidate.id || index + 1,
        prompt: candidate.prompt,
        expected_output: candidate.expected_output || '',
        files: candidate.files || [],
        assertions: candidate.assertions || [],
      };
    }),
  };
}
package/src/engine/snapshot.ts ADDED
@@ -0,0 +1,48 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import * as crypto from 'node:crypto';
4
+ import type { SkillOutput, Snapshot } from '../types.js';
5
+
6
/**
 * Stores and retrieves per-scenario baseline snapshots as JSON files under
 * `<evalsDir>/snapshots`, and records approvals in an append-only audit log.
 */
export class SnapshotManager {
  private snapshotsDir: string;

  /** @param evalsDir Directory that contains (or will contain) the `snapshots` folder. */
  constructor(private evalsDir: string) {
    this.snapshotsDir = path.join(evalsDir, 'snapshots');
  }

  /** First 8 hex characters of the SHA-256 digest of `text`. */
  private static shortHash(text: string): string {
    return crypto.createHash('sha256').update(text).digest('hex').slice(0, 8);
  }

  private snapshotPath(scenarioId: number): string {
    return path.join(this.snapshotsDir, `scenario-${scenarioId}.snap.json`);
  }

  /**
   * Writes (or overwrites) the snapshot for a scenario, creating the snapshots
   * directory if needed. `approved_by` is always stored as `null`.
   */
  saveSnapshot(scenarioId: number, prompt: string, output: SkillOutput, runs: number = 1): void {
    const snapshot: Snapshot = {
      scenario_id: scenarioId,
      prompt,
      output,
      captured_at: new Date().toISOString(),
      runs,
      approved_by: null,
    };
    fs.mkdirSync(this.snapshotsDir, { recursive: true });
    fs.writeFileSync(this.snapshotPath(scenarioId), JSON.stringify(snapshot, null, 2));
  }

  /**
   * Loads the snapshot for a scenario.
   *
   * @returns The stored snapshot, or `null` when none exists.
   * @throws SyntaxError when the file exists but is not valid JSON; the message names the file.
   */
  loadSnapshot(scenarioId: number): Snapshot | null {
    const file = this.snapshotPath(scenarioId);

    let raw: string;
    try {
      // Read directly rather than existsSync + read to avoid a check-then-use race.
      raw = fs.readFileSync(file, 'utf-8');
    } catch (err: unknown) {
      const code = err instanceof Error ? (err as { code?: unknown }).code : undefined;
      if (code === 'ENOENT' || code === 'ENOTDIR') return null;
      throw err;
    }

    try {
      return JSON.parse(raw) as Snapshot;
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`Corrupt snapshot file ${file}: ${reason}`);
    }
  }

  /**
   * Replaces a scenario's snapshot with `newOutput` and appends an audit entry
   * holding short hashes of the previous (or `'none'`) and new raw output.
   */
  approve(scenarioId: number, prompt: string, newOutput: SkillOutput): void {
    const old = this.loadSnapshot(scenarioId);
    const previousHash = old ? SnapshotManager.shortHash(old.output.raw) : 'none';
    const newHash = SnapshotManager.shortHash(newOutput.raw);

    // Saving first also guarantees the snapshots directory exists for the audit log.
    this.saveSnapshot(scenarioId, prompt, newOutput);

    const auditEntry = {
      scenario_id: scenarioId,
      approved_at: new Date().toISOString(),
      previous_hash: previousHash,
      new_hash: newHash,
    };
    const auditPath = path.join(this.snapshotsDir, '.audit-log.jsonl');
    fs.appendFileSync(auditPath, JSON.stringify(auditEntry) + '\n');
  }

  /** Scenario ids that have a snapshot file, in ascending numeric order. */
  listSnapshotIds(): number[] {
    if (!fs.existsSync(this.snapshotsDir)) return [];

    const ids: number[] = [];
    for (const file of fs.readdirSync(this.snapshotsDir)) {
      const idText = /^scenario-(\d+)\.snap\.json$/.exec(file)?.[1];
      if (idText !== undefined) {
        ids.push(Number.parseInt(idText, 10));
      }
    }
    return ids.sort((a, b) => a - b);
  }
}
package/src/errors.ts ADDED
@@ -0,0 +1,34 @@
1
/**
 * Base class for snapeval failures. Carries the process exit code that should
 * accompany the error (2 unless specified).
 */
export class SnapevalError extends Error {
  public exitCode: number;

  constructor(message: string, exitCode: number = 2) {
    super(message);
    this.exitCode = exitCode;
    this.name = 'SnapevalError';
  }
}
7
+
8
/** Raised when a required adapter cannot be used; the message includes an install hint. */
export class AdapterNotAvailableError extends SnapevalError {
  constructor(adapterName: string, installHint: string) {
    const message = `${adapterName} is not available. ${installHint}`;
    super(message);
    this.name = 'AdapterNotAvailableError';
  }
}
14
+
15
/** Raised when an adapter reports that its rate limit has been exceeded. */
export class RateLimitError extends SnapevalError {
  constructor(adapterName: string) {
    const message = `${adapterName} rate limit exceeded. Try again later or use a different adapter.`;
    super(message);
    this.name = 'RateLimitError';
  }
}
21
+
22
/** Raised when a scenario does not finish within its time limit. */
export class TimeoutError extends SnapevalError {
  constructor(scenarioId: number, timeoutMs: number) {
    const message = `Scenario ${scenarioId} timed out after ${timeoutMs}ms.`;
    super(message);
    this.name = 'TimeoutError';
  }
}
28
+
29
/** Raised when no baseline snapshots exist for a skill; uses exit code 2. */
export class NoBaselineError extends SnapevalError {
  constructor(skillPath: string) {
    const message = `No baselines found at ${skillPath}/evals/snapshots/. Run \`snapeval capture\` first.`;
    super(message, 2);
    this.name = 'NoBaselineError';
  }
}
package/src/types.ts ADDED
@@ -0,0 +1,153 @@
1
+ // === Adapter Interfaces ===
2
+ export interface SkillOutput { // What SkillAdapter.invoke resolves to; stored inside Snapshot.output.
3
+ raw: string; // output text; SnapshotManager.approve hashes this for the audit log
4
+ metadata: {
5
+ tokens: number;
6
+ durationMs: number; // presumably wall-clock milliseconds — confirm against adapter implementations
7
+ model: string;
8
+ adapter: string;
9
+ };
10
+ }
11
+
12
+ export interface SkillAdapter { // Contract for a backend that runs a skill against a prompt.
13
+ name: string;
14
+ invoke(skillPath: string, prompt: string, files?: string[]): Promise<SkillOutput>;
15
+ isAvailable(): Promise<boolean>; // presumably a preflight check that the backend is usable — verify against implementations
16
+ }
17
+
18
+ export interface Message { // One chat turn passed to InferenceAdapter.chat.
19
+ role: 'system' | 'user' | 'assistant'; // the judge and generator in this package only send 'user' messages
20
+ content: string;
21
+ }
22
+
23
+ export interface ChatOptions { // Optional per-call settings for InferenceAdapter.chat.
24
+ temperature?: number; // the judge uses 0, the eval generator uses 0.7
25
+ maxTokens?: number;
26
+ responseFormat?: 'text' | 'json'; // judge and generator both request 'json'
27
+ }
28
+
29
+ export interface InferenceAdapter { // Contract for a model backend used for judging, embedding and eval generation.
30
+ name: string;
31
+ chat(messages: Message[], options?: ChatOptions): Promise<string>; // resolves to the reply text; callers parse it as JSON themselves
32
+ embed(text: string): Promise<number[]>; // vector consumed by cosineSimilarity
33
+ estimateCost(tokens: number): number; // units not visible here — presumably USD; confirm
34
+ }
35
+
36
+ export interface EvalResults { // Payload handed to ReportAdapter.report.
37
+ skillName: string;
38
+ scenarios: ScenarioResult[];
39
+ summary: BenchmarkSummary;
40
+ timing: TimingData;
41
+ }
42
+
43
+ export interface ReportAdapter { // Contract for an output sink that consumes EvalResults.
44
+ name: string;
45
+ report(results: EvalResults): Promise<void>;
46
+ }
47
+
48
+ // === Eval Format (agentskills.io) ===
49
+ export interface EvalCase { // One scenario in an evals file.
50
+ id: number; // generateEvals falls back to the 1-based position when the model omits it
51
+ prompt: string;
52
+ expected_output: string; // generateEvals defaults this to ''
53
+ files?: string[]; // generateEvals defaults this to []
54
+ assertions?: string[]; // generateEvals defaults this to []
55
+ }
56
+
57
+ export interface EvalsFile { // Return shape of generateEvals.
58
+ skill_name: string;
59
+ generated_by: string; // tool/version tag; generateEvals writes 'snapeval v1.0.0'
60
+ evals: EvalCase[];
61
+ }
62
+
63
+ // === Snapshot Format ===
64
+ export interface Snapshot { // On-disk baseline written by SnapshotManager.saveSnapshot as scenario-<id>.snap.json.
65
+ scenario_id: number;
66
+ prompt: string;
67
+ output: SkillOutput;
68
+ captured_at: string; // ISO-8601 timestamp from Date.prototype.toISOString
69
+ runs: number; // saveSnapshot defaults this to 1
70
+ approved_by: string | null; // saveSnapshot always writes null
71
+ }
72
+
73
+ export interface VarianceEnvelopeRun { // One run contributing to a VarianceEnvelope.
74
+ raw: string;
75
+ embedding: number[]; // averaged element-wise into the centroid by computeEnvelope
76
+ }
77
+
78
+ export interface VarianceEnvelope { // Result of computeEnvelope; consumed by isWithinEnvelope.
79
+ scenario_id: number;
80
+ runs: VarianceEnvelopeRun[];
81
+ centroid: number[]; // element-wise mean of the run embeddings
82
+ radius: number; // largest (1 - cosine similarity) between any run and the centroid
83
+ }
84
+
85
+ // === Comparison Results ===
86
+ export type ComparisonVerdict = 'pass' | 'regressed' | 'inconclusive' | 'error'; // comparePipeline and llmJudge never produce 'error' themselves
87
+
88
+ export interface ComparisonResult { // Outcome of comparePipeline for one baseline/current pair.
89
+ scenarioId: number; // comparePipeline always sets 0; presumably overwritten by the caller — verify
90
+ verdict: ComparisonVerdict;
91
+ tier: 1 | 2 | 3; // 1 = schema match, 2 = embedding similarity, 3 = LLM judge
92
+ similarity?: number; // only set by comparePipeline on a tier 2 pass
93
+ details: string;
94
+ }
95
+
96
+ // === Grading ===
97
+ export interface AssertionResult { // Grading outcome for a single assertion; producer not visible in this view.
98
+ text: string; // presumably the assertion text from EvalCase.assertions — confirm
99
+ passed: boolean;
100
+ evidence: string;
101
+ }
102
+
103
+ export interface GradingSummary { // Aggregate counts over a set of AssertionResult entries.
104
+ passed: number;
105
+ failed: number;
106
+ total: number;
107
+ pass_rate: number; // scale (0-1 vs 0-100) is not visible here — TODO confirm
108
+ }
109
+
110
+ export interface GradingFile { // Per-assertion results plus their summary; optional on ScenarioResult.
111
+ assertion_results: AssertionResult[];
112
+ summary: GradingSummary;
113
+ }
114
+
115
+ // === Timing & Benchmark ===
116
+ export interface TimingData { // Token and duration totals attached to a scenario or a whole run.
117
+ total_tokens: number;
118
+ duration_ms: number;
119
+ }
120
+
121
+ export interface BenchmarkSummary { // Run-level aggregate carried in EvalResults.summary.
122
+ total_scenarios: number;
123
+ passed: number;
124
+ regressed: number;
125
+ pass_rate: number; // scale (0-1 vs 0-100) is not visible here — TODO confirm
126
+ total_tokens: number;
127
+ total_cost_usd: number;
128
+ total_duration_ms: number;
129
+ tier_breakdown: { // presumably how many scenarios were decided at each comparison tier — confirm
130
+ tier1_schema: number;
131
+ tier2_embedding: number;
132
+ tier3_llm_judge: number;
133
+ };
134
+ }
135
+
136
+ // === Scenario Result ===
137
+ export interface ScenarioResult { // Everything recorded for one scenario in EvalResults.scenarios.
138
+ scenarioId: number;
139
+ prompt: string;
140
+ comparison: ComparisonResult;
141
+ grading?: GradingFile;
142
+ timing: TimingData;
143
+ newOutput: SkillOutput;
144
+ }
145
+
146
+ // === Config ===
147
+ export interface SnapevalConfig { // Effective settings returned by resolveConfig; defaults live in DEFAULT_CONFIG.
148
+ adapter: string; // default 'copilot-cli'
149
+ inference: string; // default 'auto'
150
+ threshold: number; // default 0.85
151
+ runs: number; // default 3
152
+ budget: string; // default 'unlimited'; presumably the string handed to BudgetEngine — confirm
153
+ }