snapeval 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +194 -0
- package/bin/snapeval.ts +226 -0
- package/dist/bin/snapeval.d.ts +2 -0
- package/dist/bin/snapeval.js +191 -0
- package/dist/bin/snapeval.js.map +1 -0
- package/dist/src/adapters/inference/copilot.d.ts +9 -0
- package/dist/src/adapters/inference/copilot.js +25 -0
- package/dist/src/adapters/inference/copilot.js.map +1 -0
- package/dist/src/adapters/inference/github-models.d.ts +9 -0
- package/dist/src/adapters/inference/github-models.js +62 -0
- package/dist/src/adapters/inference/github-models.js.map +1 -0
- package/dist/src/adapters/inference/resolve.d.ts +2 -0
- package/dist/src/adapters/inference/resolve.js +49 -0
- package/dist/src/adapters/inference/resolve.js.map +1 -0
- package/dist/src/adapters/report/json.d.ts +7 -0
- package/dist/src/adapters/report/json.js +39 -0
- package/dist/src/adapters/report/json.js.map +1 -0
- package/dist/src/adapters/report/terminal.d.ts +5 -0
- package/dist/src/adapters/report/terminal.js +42 -0
- package/dist/src/adapters/report/terminal.js.map +1 -0
- package/dist/src/adapters/skill/copilot-cli.d.ts +6 -0
- package/dist/src/adapters/skill/copilot-cli.js +51 -0
- package/dist/src/adapters/skill/copilot-cli.js.map +1 -0
- package/dist/src/commands/approve.d.ts +5 -0
- package/dist/src/commands/approve.js +40 -0
- package/dist/src/commands/approve.js.map +1 -0
- package/dist/src/commands/capture.d.ts +4 -0
- package/dist/src/commands/capture.js +18 -0
- package/dist/src/commands/capture.js.map +1 -0
- package/dist/src/commands/check.d.ts +6 -0
- package/dist/src/commands/check.js +68 -0
- package/dist/src/commands/check.js.map +1 -0
- package/dist/src/commands/init.d.ts +2 -0
- package/dist/src/commands/init.js +27 -0
- package/dist/src/commands/init.js.map +1 -0
- package/dist/src/commands/report.d.ts +4 -0
- package/dist/src/commands/report.js +26 -0
- package/dist/src/commands/report.js.map +1 -0
- package/dist/src/config.d.ts +3 -0
- package/dist/src/config.js +30 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/engine/budget.d.ts +10 -0
- package/dist/src/engine/budget.js +25 -0
- package/dist/src/engine/budget.js.map +1 -0
- package/dist/src/engine/comparison/embedding.d.ts +6 -0
- package/dist/src/engine/comparison/embedding.js +19 -0
- package/dist/src/engine/comparison/embedding.js.map +1 -0
- package/dist/src/engine/comparison/judge.d.ts +8 -0
- package/dist/src/engine/comparison/judge.js +64 -0
- package/dist/src/engine/comparison/judge.js.map +1 -0
- package/dist/src/engine/comparison/pipeline.d.ts +6 -0
- package/dist/src/engine/comparison/pipeline.js +31 -0
- package/dist/src/engine/comparison/pipeline.js.map +1 -0
- package/dist/src/engine/comparison/schema.d.ts +2 -0
- package/dist/src/engine/comparison/schema.js +28 -0
- package/dist/src/engine/comparison/schema.js.map +1 -0
- package/dist/src/engine/comparison/variance.d.ts +3 -0
- package/dist/src/engine/comparison/variance.js +26 -0
- package/dist/src/engine/comparison/variance.js.map +1 -0
- package/dist/src/engine/generator.d.ts +3 -0
- package/dist/src/engine/generator.js +52 -0
- package/dist/src/engine/generator.js.map +1 -0
- package/dist/src/engine/snapshot.d.ts +11 -0
- package/dist/src/engine/snapshot.js +46 -0
- package/dist/src/engine/snapshot.js.map +1 -0
- package/dist/src/errors.d.ts +16 -0
- package/dist/src/errors.js +33 -0
- package/dist/src/errors.js.map +1 -0
- package/dist/src/types.d.ts +125 -0
- package/dist/src/types.js +2 -0
- package/dist/src/types.js.map +1 -0
- package/package.json +53 -0
- package/plugin.json +9 -0
- package/scripts/snapeval-cli.sh +7 -0
- package/skills/snapeval/SKILL.md +51 -0
- package/src/adapters/inference/copilot.ts +30 -0
- package/src/adapters/inference/github-models.ts +74 -0
- package/src/adapters/inference/resolve.ts +70 -0
- package/src/adapters/report/json.ts +64 -0
- package/src/adapters/report/terminal.ts +59 -0
- package/src/adapters/skill/copilot-cli.ts +60 -0
- package/src/commands/approve.ts +58 -0
- package/src/commands/capture.ts +25 -0
- package/src/commands/check.ts +86 -0
- package/src/commands/init.ts +38 -0
- package/src/commands/report.ts +36 -0
- package/src/config.ts +37 -0
- package/src/engine/budget.ts +27 -0
- package/src/engine/comparison/embedding.ts +26 -0
- package/src/engine/comparison/judge.ts +78 -0
- package/src/engine/comparison/pipeline.ts +43 -0
- package/src/engine/comparison/schema.ts +22 -0
- package/src/engine/comparison/variance.ts +31 -0
- package/src/engine/generator.ts +61 -0
- package/src/engine/snapshot.ts +48 -0
- package/src/errors.ts +34 -0
- package/src/types.ts +153 -0
package/plugin.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "snapeval",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.",
|
|
5
|
+
"author": "Matan Tsach",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"skills": ["skills/snapeval"],
|
|
8
|
+
"scripts": ["scripts/snapeval-cli.sh"]
|
|
9
|
+
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: snapeval
|
|
3
|
+
description: Evaluate AI skills through semantic snapshot testing. Generates test cases, captures baselines, and detects regressions.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
You are snapeval, a skill evaluation assistant. When the user asks you to evaluate, check, or approve a skill, follow this process:
|
|
7
|
+
|
|
8
|
+
## Commands
|
|
9
|
+
|
|
10
|
+
### evaluate / test (first-time capture)
|
|
11
|
+
|
|
12
|
+
1. Ask the user which skill they want to evaluate (or accept the path they provide)
|
|
13
|
+
2. Read the target skill's SKILL.md file using the Read tool
|
|
14
|
+
3. Analyze its purpose, inputs, expected behaviors, and edge cases
|
|
15
|
+
4. Generate 5-8 test scenarios covering:
|
|
16
|
+
- Happy path scenarios (normal use cases)
|
|
17
|
+
- Edge cases (empty input, unusual input)
|
|
18
|
+
- At least one negative test
|
|
19
|
+
5. Present the scenarios as a numbered list and ask: "Here are N test scenarios. Want to adjust any, or should I run them?"
|
|
20
|
+
6. Wait for user confirmation
|
|
21
|
+
7. On confirmation, run these commands:
|
|
22
|
+
```bash
|
|
23
|
+
npx snapeval init <skill-path>
|
|
24
|
+
npx snapeval capture <skill-path>
|
|
25
|
+
```
|
|
26
|
+
8. Report results: how many scenarios captured, total cost, location of snapshots
|
|
27
|
+
|
|
28
|
+
### check (regression detection)
|
|
29
|
+
|
|
30
|
+
1. Run: `npx snapeval check <skill-path> --threshold 0.85`
|
|
31
|
+
2. Parse the terminal output
|
|
32
|
+
3. Report conversationally:
|
|
33
|
+
- Which scenarios passed and at which tier (schema/embedding/judge)
|
|
34
|
+
- Which scenarios regressed with details about what changed
|
|
35
|
+
- Total cost and duration
|
|
36
|
+
4. If regressions found, present options:
|
|
37
|
+
- Fix the skill and re-check
|
|
38
|
+
- Run `@snapeval approve` to accept new behavior
|
|
39
|
+
|
|
40
|
+
### approve
|
|
41
|
+
|
|
42
|
+
1. Run: `npx snapeval approve --scenario <N>` (or without --scenario for all)
|
|
43
|
+
2. Confirm what was approved
|
|
44
|
+
3. Remind user to commit the updated snapshots
|
|
45
|
+
|
|
46
|
+
## Important
|
|
47
|
+
|
|
48
|
+
- Never ask the user to write evals.json or any config files manually
|
|
49
|
+
- Always read the target skill's SKILL.md before generating scenarios
|
|
50
|
+
- Report costs prominently (should be $0.00 for Copilot gpt-5-mini)
|
|
51
|
+
- When reporting regressions, explain what changed in plain language
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { execFileSync } from 'node:child_process';
|
|
2
|
+
import type { InferenceAdapter, Message, ChatOptions } from '../../types.js';
|
|
3
|
+
import { AdapterNotAvailableError } from '../../errors.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Inference adapter that delegates chat requests to the GitHub Copilot CLI
 * (`gh copilot`) and delegates embeddings to an optional fallback adapter.
 */
export class CopilotInference implements InferenceAdapter {
  readonly name = 'copilot';

  /**
   * @param fallback Optional adapter used for `embed`, which this adapter
   *   cannot serve itself.
   */
  constructor(private readonly fallback?: InferenceAdapter) {}

  /**
   * Sends the conversation to `gh copilot` as one prompt and returns the
   * trimmed stdout.
   *
   * Only each message's `content` is forwarded (joined with newlines); the
   * options argument is ignored.
   *
   * @throws Whatever `execFileSync` throws when the command cannot be
   *   spawned or exits with a non-zero status.
   */
  async chat(messages: Message[], _options?: ChatOptions): Promise<string> {
    // Concatenate messages into a single prompt string
    const prompt = messages.map((m) => m.content).join('\n');
    // Flags go after `--` so `gh` does not consume them, and `--silent` limits
    // output to the model's response — the same invocation CopilotCLIAdapter uses.
    const result = execFileSync('gh', ['copilot', '--', '-p', prompt, '--silent'], {
      encoding: 'utf-8',
      maxBuffer: 10 * 1024 * 1024,
    });
    return result.trim();
  }

  /**
   * Embeds `text` through the fallback adapter.
   *
   * @throws AdapterNotAvailableError when no fallback adapter was provided.
   */
  async embed(text: string): Promise<number[]> {
    if (this.fallback) {
      return this.fallback.embed(text);
    }
    throw new AdapterNotAvailableError(
      'copilot-embed',
      'Copilot CLI does not support embeddings. Provide a fallback InferenceAdapter (e.g. GitHubModelsInference).'
    );
  }

  /** Always reports a cost of 0, regardless of the token count. */
  estimateCost(_tokens: number): number {
    return 0;
  }
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import type { InferenceAdapter, Message, ChatOptions } from '../../types.js';
|
|
2
|
+
import { RateLimitError } from '../../errors.js';
|
|
3
|
+
|
|
4
|
+
const API_BASE = 'https://models.github.ai/inference';
const CHAT_MODEL = 'openai/gpt-4o-mini';
const EMBEDDING_MODEL = 'openai/text-embedding-3-small';

/**
 * Inference adapter backed by the GitHub Models HTTP API
 * (chat completions and embeddings).
 */
export class GitHubModelsInference implements InferenceAdapter {
  readonly name = 'github-models';

  /**
   * @param token Bearer token sent with every request; defaults to the
   *   `GITHUB_TOKEN` environment variable, or an empty string when unset.
   */
  constructor(private readonly token: string = process.env.GITHUB_TOKEN ?? '') {}

  /**
   * Requests a chat completion and returns the first choice's content.
   *
   * @param messages Conversation forwarded as-is in the request body.
   * @param options  Optional temperature, max token count, and response
   *   format; `responseFormat: 'json'` requests a JSON object response.
   * @throws RateLimitError on HTTP 429.
   * @throws Error on any other non-OK status or when the response has no choices.
   */
  async chat(messages: Message[], options?: ChatOptions): Promise<string> {
    const body: Record<string, unknown> = {
      model: CHAT_MODEL,
      messages,
    };
    if (options?.temperature !== undefined) body.temperature = options.temperature;
    if (options?.maxTokens !== undefined) body.max_tokens = options.maxTokens;
    if (options?.responseFormat === 'json') {
      body.response_format = { type: 'json_object' };
    }

    const data = await this.post<{
      choices?: Array<{ message?: { content?: string } }>;
    }>('/chat/completions', body);

    // Fail with a descriptive error instead of a TypeError on an empty response.
    const content = data.choices?.[0]?.message?.content;
    if (typeof content !== 'string') {
      throw new Error('GitHub Models API error: response contained no choices');
    }
    return content;
  }

  /**
   * Requests an embedding for `text` and returns the first embedding vector.
   *
   * @throws RateLimitError on HTTP 429.
   * @throws Error on any other non-OK status or when the response has no embedding.
   */
  async embed(text: string): Promise<number[]> {
    const data = await this.post<{
      data?: Array<{ embedding?: number[] }>;
    }>('/embeddings', { model: EMBEDDING_MODEL, input: text });

    const embedding = data.data?.[0]?.embedding;
    if (!Array.isArray(embedding)) {
      throw new Error('GitHub Models API error: response contained no embedding');
    }
    return embedding;
  }

  /** Always reports a cost of 0, regardless of the token count. */
  estimateCost(_tokens: number): number {
    return 0;
  }

  /**
   * POSTs a JSON body to `API_BASE + route` with the bearer token and returns
   * the parsed JSON response. Shared by `chat` and `embed` so both map HTTP
   * failures to errors the same way.
   */
  private async post<T>(route: string, body: unknown): Promise<T> {
    const response = await fetch(`${API_BASE}${route}`, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${this.token}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(body),
    });

    if (response.status === 429) {
      throw new RateLimitError(this.name);
    }

    if (!response.ok) {
      throw new Error(`GitHub Models API error: ${response.status} ${response.statusText}`);
    }

    return (await response.json()) as T;
  }
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import { execFileSync } from 'node:child_process';
|
|
2
|
+
import type { InferenceAdapter } from '../../types.js';
|
|
3
|
+
import { AdapterNotAvailableError } from '../../errors.js';
|
|
4
|
+
import { GitHubModelsInference } from './github-models.js';
|
|
5
|
+
import { CopilotInference } from './copilot.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Probes for the Copilot CLI by running `gh copilot --version`.
 *
 * @returns `true` when the command runs without throwing, `false` otherwise.
 */
function isCopilotAvailable(): boolean {
  let available = true;
  try {
    // Output is piped and discarded; only success or failure matters.
    execFileSync('gh', ['copilot', '--version'], { encoding: 'utf-8', stdio: 'pipe' });
  } catch {
    available = false;
  }
  return available;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Reports whether the `GITHUB_TOKEN` environment variable is set to a
 * non-empty value.
 */
function isGitHubTokenAvailable(): boolean {
  const token = process.env.GITHUB_TOKEN;
  return token ? true : false;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Builds the inference adapter matching `preference`.
 *
 * - `auto`: Copilot when its CLI is available (with GitHub Models as the
 *   embedding fallback when `GITHUB_TOKEN` is set); otherwise GitHub Models
 *   when the token is set.
 * - `copilot`: Copilot, with the same optional embedding fallback.
 * - `github-models`: GitHub Models; requires `GITHUB_TOKEN`.
 *
 * @throws AdapterNotAvailableError when the requested adapter (or, for
 *   `auto`, every adapter) is unavailable, or when `preference` is unknown.
 */
export function resolveInference(preference: string): InferenceAdapter {
  switch (preference) {
    case 'auto': {
      const hasCopilot = isCopilotAvailable();
      const hasToken = isGitHubTokenAvailable();

      if (hasCopilot) {
        // Copilot for chat; GitHub Models serves embeddings when a token exists.
        return hasToken
          ? new CopilotInference(new GitHubModelsInference())
          : new CopilotInference();
      }

      if (hasToken) {
        return new GitHubModelsInference();
      }

      throw new AdapterNotAvailableError(
        'inference',
        'No inference adapter available. Install GitHub Copilot CLI (`gh extension install github/gh-copilot`) or set GITHUB_TOKEN.'
      );
    }

    case 'copilot': {
      if (!isCopilotAvailable()) {
        throw new AdapterNotAvailableError(
          'copilot',
          'GitHub Copilot CLI is not available. Install with: gh extension install github/gh-copilot'
        );
      }
      const embedFallback = isGitHubTokenAvailable() ? new GitHubModelsInference() : undefined;
      return new CopilotInference(embedFallback);
    }

    case 'github-models': {
      if (!isGitHubTokenAvailable()) {
        throw new AdapterNotAvailableError(
          'github-models',
          'GITHUB_TOKEN environment variable is not set.'
        );
      }
      return new GitHubModelsInference();
    }

    default:
      throw new AdapterNotAvailableError(
        preference,
        `Unknown inference adapter "${preference}". Valid options: auto, copilot, github-models.`
      );
  }
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type {
|
|
4
|
+
ReportAdapter,
|
|
5
|
+
EvalResults,
|
|
6
|
+
GradingFile,
|
|
7
|
+
AssertionResult,
|
|
8
|
+
GradingSummary,
|
|
9
|
+
} from '../../types.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Flattens the assertion results of every graded scenario into a single
 * grading file with a pass/fail summary.
 *
 * Scenarios without a `grading` entry contribute nothing. `pass_rate` is 0
 * when there are no assertions at all.
 */
function buildGradingFile(results: EvalResults): GradingFile {
  const assertion_results: AssertionResult[] = results.scenarios.flatMap((scenario) =>
    scenario.grading ? scenario.grading.assertion_results : []
  );

  let passed = 0;
  let failed = 0;
  for (const assertion of assertion_results) {
    if (assertion.passed) {
      passed += 1;
    } else {
      failed += 1;
    }
  }

  const total = assertion_results.length;
  const summary: GradingSummary = {
    passed,
    failed,
    total,
    pass_rate: total > 0 ? passed / total : 0,
  };

  return { assertion_results, summary };
}
|
|
28
|
+
|
|
29
|
+
/**
 * Report adapter that writes `grading.json`, `timing.json` and
 * `benchmark.json` into a fixed output directory.
 */
export class JSONReporter implements ReportAdapter {
  readonly name = 'json';

  /** @param outputDir Directory the three report files are written to. */
  constructor(private readonly outputDir: string) {}

  /**
   * Creates the output directory (including parents) if needed and writes
   * the three report files, in the order grading, timing, benchmark.
   */
  async report(results: EvalResults): Promise<void> {
    fs.mkdirSync(this.outputDir, { recursive: true });

    this.writeJson('grading.json', buildGradingFile(results));

    const { total_tokens, duration_ms } = results.timing;
    this.writeJson('timing.json', { total_tokens, duration_ms });

    this.writeJson('benchmark.json', { run_summary: results.summary });
  }

  /** Serializes `payload` with 2-space indentation into `outputDir/fileName`. */
  private writeJson(fileName: string, payload: unknown): void {
    const target = path.join(this.outputDir, fileName);
    fs.writeFileSync(target, JSON.stringify(payload, null, 2), 'utf-8');
  }
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import chalk from 'chalk';
|
|
2
|
+
import type { ReportAdapter, EvalResults, ScenarioResult, ComparisonVerdict } from '../../types.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Maps a comparison verdict to a colored status glyph: green check for
 * `pass`, red cross for `regressed`, yellow question mark for anything else.
 */
function verdictIcon(verdict: ComparisonVerdict): string {
  if (verdict === 'pass') {
    return chalk.green('✓');
  }
  if (verdict === 'regressed') {
    return chalk.red('✗');
  }
  return chalk.yellow('?');
}
|
|
14
|
+
|
|
15
|
+
/**
 * Renders one scenario as a single report line: verdict icon, scenario id,
 * comparison tier, token count, duration in seconds, and, in parentheses,
 * the name of the adapter that produced the new output.
 */
function formatScenario(scenario: ScenarioResult): string {
  const { comparison, timing, newOutput, scenarioId } = scenario;
  const icon = verdictIcon(comparison.verdict);
  const tierLabel = `tier${comparison.tier}`;
  const seconds = (timing.duration_ms / 1000).toFixed(2);
  // The parenthesised suffix is the adapter name, not a monetary cost.
  const adapterName = newOutput.metadata.adapter;
  return ` ${icon} Scenario ${scenarioId} [${tierLabel}] — ${timing.total_tokens} tokens, ${seconds}s (${adapterName})`;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Report adapter that prints a human-readable run summary to stdout:
 * a header, one line per scenario, then totals, timing/cost and the
 * per-tier breakdown.
 */
export class TerminalReporter implements ReportAdapter {
  readonly name = 'terminal';

  /** Prints the full report for `results` via `console.log`. */
  async report(results: EvalResults): Promise<void> {
    const { skillName, scenarios, summary, timing } = results;
    const divider = (): string => chalk.dim('─'.repeat(50));

    console.log(chalk.bold(`\nSnapeval — ${skillName}`));
    console.log(divider());
    scenarios.forEach((scenario) => {
      console.log(formatScenario(scenario));
    });
    console.log(divider());

    // Regressions are highlighted in red only when there are any.
    const regressedLabel = `${summary.regressed} regressed`;
    const regressedStr = summary.regressed > 0 ? chalk.red(regressedLabel) : chalk.dim(regressedLabel);
    const passedStr = chalk.green(`${summary.passed} passed`);
    const totalStr = `${summary.total_scenarios} total`;
    const passRate = (summary.pass_rate * 100).toFixed(0);
    console.log(`${passedStr}, ${regressedStr}, ${totalStr} (${passRate}%)`);

    const seconds = (timing.duration_ms / 1000).toFixed(2);
    console.log(
      chalk.dim(
        `Tokens: ${timing.total_tokens} | Duration: ${seconds}s | Cost: $${summary.total_cost_usd.toFixed(4)}`
      )
    );

    const tiers = summary.tier_breakdown;
    console.log(
      chalk.dim(
        `Tier breakdown — schema: ${tiers.tier1_schema}, embedding: ${tiers.tier2_embedding}, llm: ${tiers.tier3_llm_judge}`
      )
    );
  }
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { execFile, execFileSync } from 'node:child_process';
|
|
2
|
+
import { readFile } from 'node:fs/promises';
|
|
3
|
+
import * as path from 'node:path';
|
|
4
|
+
import type { SkillAdapter, SkillOutput } from '../../types.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Skill adapter that runs a prompt through the GitHub Copilot CLI
 * (`gh copilot`), prefixing it with the skill's SKILL.md when one can be read.
 */
export class CopilotCLIAdapter implements SkillAdapter {
  readonly name = 'copilot-cli';

  /**
   * Invokes `gh copilot` with the prompt and resolves with the trimmed
   * stdout plus timing metadata. The reported duration also covers reading
   * SKILL.md. The token count is always reported as 0, and the `_files`
   * argument is not used.
   *
   * Rejects with the error from `execFile` when the command fails.
   */
  async invoke(skillPath: string, prompt: string, _files?: string[]): Promise<SkillOutput> {
    const startedAt = Date.now();

    const skillContext = await this.readSkillContext(skillPath);
    const finalPrompt = skillContext ? `${skillContext}\n\n${prompt}` : prompt;

    // Flags are passed after `--` so gh doesn't consume them; `--silent`
    // limits output to the model's response only.
    const args = ['copilot', '--', '-p', finalPrompt, '--silent'];
    const adapterName = this.name;

    return new Promise<SkillOutput>((resolve, reject) => {
      execFile(
        'gh',
        args,
        { encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 },
        (error, stdout) => {
          if (error) {
            reject(error);
            return;
          }
          const durationMs = Date.now() - startedAt;
          resolve({
            raw: stdout.trim(),
            metadata: {
              tokens: 0,
              durationMs,
              model: 'copilot',
              adapter: adapterName,
            },
          });
        }
      );
    });
  }

  /**
   * Checks availability by running `gh copilot --help`.
   *
   * @returns `true` when the command runs without throwing, `false` otherwise.
   */
  async isAvailable(): Promise<boolean> {
    let available = true;
    try {
      execFileSync('gh', ['copilot', '--help'], { encoding: 'utf-8', stdio: 'pipe' });
    } catch {
      available = false;
    }
    return available;
  }

  /**
   * Best-effort read of `<skillPath>/SKILL.md`; returns an empty string when
   * the file cannot be read for any reason.
   */
  private async readSkillContext(skillPath: string): Promise<string> {
    try {
      const skillFile = path.join(skillPath, 'SKILL.md');
      return await readFile(skillFile, { encoding: 'utf-8' });
    } catch {
      return '';
    }
  }
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type { SkillAdapter, EvalsFile, EvalResults } from '../types.js';
|
|
4
|
+
import { SnapshotManager } from '../engine/snapshot.js';
|
|
5
|
+
import { SnapevalError } from '../errors.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Re-runs the selected eval cases through `skillAdapter` and approves each
 * fresh output via the snapshot manager rooted at `<skillPath>/evals`.
 *
 * @param options.scenarioIds When non-empty, only eval cases whose `id` is
 *   listed are processed; otherwise every eval case is processed.
 * @throws SnapevalError when `<skillPath>/evals/evals.json` does not exist.
 */
export async function approveCommand(
  skillPath: string,
  skillAdapter: SkillAdapter,
  options: { scenarioIds?: number[] } = {}
): Promise<void> {
  const evalsPath = path.join(skillPath, 'evals', 'evals.json');
  if (!fs.existsSync(evalsPath)) {
    throw new SnapevalError(`No evals.json found at ${evalsPath}. Run \`snapeval init\` first.`);
  }

  const evalsFile: EvalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
  const manager = new SnapshotManager(path.join(skillPath, 'evals'));

  // An absent or empty id list means "approve everything".
  const requestedIds = options.scenarioIds ?? [];
  const targetCases =
    requestedIds.length > 0
      ? evalsFile.evals.filter((evalCase) => requestedIds.includes(evalCase.id))
      : evalsFile.evals;

  // Cases are processed one at a time, in file order.
  for (const { id, prompt, files } of targetCases) {
    const freshOutput = await skillAdapter.invoke(skillPath, prompt, files);
    manager.approve(id, prompt, freshOutput);
  }
}
|
|
30
|
+
|
|
31
|
+
/**
 * Approves outputs already captured in `results`, without re-invoking the
 * skill. Only scenarios whose verdict is `regressed` are considered.
 *
 * @param scenarioIds When non-empty, restricts approval to regressed
 *   scenarios with a listed id; otherwise all regressed scenarios are approved.
 * @throws SnapevalError when `<skillPath>/evals/evals.json` does not exist.
 */
export function approveFromResults(
  skillPath: string,
  results: EvalResults,
  scenarioIds?: number[]
): void {
  const evalsPath = path.join(skillPath, 'evals', 'evals.json');
  if (!fs.existsSync(evalsPath)) {
    throw new SnapevalError(`No evals.json found at ${evalsPath}.`);
  }

  const evalsFile: EvalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
  const manager = new SnapshotManager(path.join(skillPath, 'evals'));

  const restrictToIds = scenarioIds !== undefined && scenarioIds !== null && scenarioIds.length > 0;

  for (const scenario of results.scenarios) {
    if (scenario.comparison.verdict !== 'regressed') continue;
    if (restrictToIds && !scenarioIds.includes(scenario.scenarioId)) continue;

    // Scenarios with no matching eval case are skipped silently.
    const evalCase = evalsFile.evals.find((candidate) => candidate.id === scenario.scenarioId);
    if (evalCase) {
      manager.approve(scenario.scenarioId, evalCase.prompt, scenario.newOutput);
    }
  }
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type { SkillAdapter, EvalsFile } from '../types.js';
|
|
4
|
+
import { SnapshotManager } from '../engine/snapshot.js';
|
|
5
|
+
import { SnapevalError } from '../errors.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Invokes the skill once for every eval case in
 * `<skillPath>/evals/evals.json` and saves each output as a snapshot.
 *
 * @param options.runs Forwarded to `saveSnapshot`; defaults to 1.
 * @throws SnapevalError when the evals file does not exist.
 */
export async function captureCommand(
  skillPath: string,
  skillAdapter: SkillAdapter,
  options: { runs?: number } = {}
): Promise<void> {
  const evalsPath = path.join(skillPath, 'evals', 'evals.json');
  const evalsFileExists = fs.existsSync(evalsPath);
  if (!evalsFileExists) {
    throw new SnapevalError(`No evals.json found at ${evalsPath}. Run \`snapeval init\` first.`);
  }

  const evalsFile: EvalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
  const manager = new SnapshotManager(path.join(skillPath, 'evals'));
  const { runs = 1 } = options;

  // NOTE(review): each case is invoked exactly once; `runs` is only passed
  // through to saveSnapshot and does not repeat invocations — confirm intended.
  for (const { id, prompt, files } of evalsFile.evals) {
    const captured = await skillAdapter.invoke(skillPath, prompt, files);
    manager.saveSnapshot(id, prompt, captured, runs);
  }
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type {
|
|
4
|
+
SkillAdapter,
|
|
5
|
+
InferenceAdapter,
|
|
6
|
+
EvalsFile,
|
|
7
|
+
EvalResults,
|
|
8
|
+
ScenarioResult,
|
|
9
|
+
BenchmarkSummary,
|
|
10
|
+
} from '../types.js';
|
|
11
|
+
import { SnapshotManager } from '../engine/snapshot.js';
|
|
12
|
+
import { comparePipeline } from '../engine/comparison/pipeline.js';
|
|
13
|
+
import { NoBaselineError, SnapevalError } from '../errors.js';
|
|
14
|
+
import { BudgetEngine } from '../engine/budget.js';
|
|
15
|
+
|
|
16
|
+
/**
 * Re-runs every eval case that has a stored baseline, compares the new
 * output against it through the comparison pipeline, and aggregates the
 * results.
 *
 * Eval cases without a baseline snapshot are skipped and do not count toward
 * the totals. `pass_rate` is 1.0 when no scenario was compared.
 *
 * @param options.threshold     Forwarded to the comparison pipeline.
 * @param options.budget        Passed to the `BudgetEngine` constructor.
 * @param options.skipEmbedding Forwarded to the comparison pipeline.
 * @throws SnapevalError when `<skillPath>/evals/evals.json` does not exist.
 * @throws NoBaselineError when no snapshots exist at all.
 */
export async function checkCommand(
  skillPath: string,
  skillAdapter: SkillAdapter,
  inference: InferenceAdapter,
  options: { threshold: number; budget: string; skipEmbedding?: boolean }
): Promise<EvalResults> {
  const evalsPath = path.join(skillPath, 'evals', 'evals.json');
  if (!fs.existsSync(evalsPath)) {
    throw new SnapevalError(`No evals.json found at ${evalsPath}`);
  }
  const evalsFile: EvalsFile = JSON.parse(fs.readFileSync(evalsPath, 'utf-8'));
  const manager = new SnapshotManager(path.join(skillPath, 'evals'));
  const budget = new BudgetEngine(options.budget);
  const startTime = Date.now();

  if (manager.listSnapshotIds().length === 0) {
    throw new NoBaselineError(skillPath);
  }

  const scenarios: ScenarioResult[] = [];
  const tierBreakdown = { tier1_schema: 0, tier2_embedding: 0, tier3_llm_judge: 0 };
  let passed = 0;
  let regressed = 0;
  let totalTokens = 0;

  for (const evalCase of evalsFile.evals) {
    const baseline = manager.loadSnapshot(evalCase.id);
    if (!baseline) continue;

    const newOutput = await skillAdapter.invoke(skillPath, evalCase.prompt, evalCase.files);
    const comparison = await comparePipeline(
      baseline.output.raw,
      newOutput.raw,
      inference,
      { threshold: options.threshold, skipEmbedding: options.skipEmbedding }
    );
    comparison.scenarioId = evalCase.id;

    // Any tier other than 1 or 2 is counted as the LLM-judge tier.
    switch (comparison.tier) {
      case 1:
        tierBreakdown.tier1_schema++;
        break;
      case 2:
        tierBreakdown.tier2_embedding++;
        break;
      default:
        tierBreakdown.tier3_llm_judge++;
    }

    const { tokens, durationMs } = newOutput.metadata;
    budget.addCost(inference.estimateCost(tokens));

    if (comparison.verdict === 'pass') passed++;
    if (comparison.verdict === 'regressed') regressed++;
    totalTokens += tokens;

    scenarios.push({
      scenarioId: evalCase.id,
      prompt: evalCase.prompt,
      comparison,
      timing: { total_tokens: tokens, duration_ms: durationMs },
      newOutput,
    });
  }

  const scenarioCount = scenarios.length;
  const summary: BenchmarkSummary = {
    total_scenarios: scenarioCount,
    passed,
    regressed,
    pass_rate: scenarioCount > 0 ? passed / scenarioCount : 1.0,
    total_tokens: totalTokens,
    total_cost_usd: budget.totalCost,
    total_duration_ms: Date.now() - startTime,
    tier_breakdown: tierBreakdown,
  };

  return {
    skillName: evalsFile.skill_name,
    scenarios,
    summary,
    timing: {
      total_tokens: summary.total_tokens,
      duration_ms: summary.total_duration_ms,
    },
  };
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type { InferenceAdapter } from '../types.js';
|
|
4
|
+
import { generateEvals } from '../engine/generator.js';
|
|
5
|
+
import { SnapevalError } from '../errors.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Generates eval cases from the skill definition file and writes them to
 * `<skillPath>/evals/evals.json`, creating the directory if needed.
 *
 * The definition is looked up as `SKILL.md`, then `skill.md`; the skill name
 * passed to the generator is the base name of `skillPath`.
 *
 * @throws SnapevalError when neither definition file exists.
 */
export async function initCommand(
  skillPath: string,
  inference: InferenceAdapter
): Promise<void> {
  // First existing candidate wins; later names are not probed.
  const skillFilePath = ['SKILL.md', 'skill.md']
    .map((fileName) => path.join(skillPath, fileName))
    .find((candidate) => fs.existsSync(candidate));

  if (!skillFilePath) {
    throw new SnapevalError(
      `No SKILL.md found at ${skillPath}. Create a SKILL.md file to describe your skill.`
    );
  }

  const evalsFile = await generateEvals(
    fs.readFileSync(skillFilePath, 'utf-8'),
    path.basename(skillPath),
    inference
  );

  const evalsDir = path.join(skillPath, 'evals');
  fs.mkdirSync(evalsDir, { recursive: true });
  fs.writeFileSync(
    path.join(evalsDir, 'evals.json'),
    JSON.stringify(evalsFile, null, 2),
    'utf-8'
  );
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type { EvalResults } from '../types.js';
|
|
4
|
+
import { JSONReporter } from '../adapters/report/json.js';
|
|
5
|
+
import { TerminalReporter } from '../adapters/report/terminal.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Writes a JSON report for `results` into the next
 * `<skillPath>/evals/results/iteration-N` directory and, unless
 * `options.verbose` is explicitly `false`, prints the terminal report.
 *
 * N is one more than the highest existing `iteration-<number>` entry, or 1
 * when there is none.
 */
export async function reportCommand(
  skillPath: string,
  results: EvalResults,
  options: { verbose?: boolean } = {}
): Promise<void> {
  const resultsBaseDir = path.join(skillPath, 'evals', 'results');
  fs.mkdirSync(resultsBaseDir, { recursive: true });

  // Highest existing iteration number; 0 when no iteration directory exists.
  let highestIteration = 0;
  for (const entry of fs.readdirSync(resultsBaseDir)) {
    if (!/^iteration-\d+$/.test(entry)) continue;
    const iteration = parseInt(entry.replace('iteration-', ''), 10);
    if (iteration > highestIteration) {
      highestIteration = iteration;
    }
  }

  const iterationDir = path.join(resultsBaseDir, `iteration-${highestIteration + 1}`);

  await new JSONReporter(iterationDir).report(results);

  const printToTerminal = options.verbose !== false;
  if (printToTerminal) {
    await new TerminalReporter().report(results);
  }
}
|