snapeval 2.2.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -13
- package/bin/snapeval.ts +7 -44
- package/dist/bin/snapeval.d.ts +1 -1
- package/dist/bin/snapeval.js +6 -42
- package/dist/bin/snapeval.js.map +1 -1
- package/dist/src/adapters/copilot-sdk-client.d.ts +0 -4
- package/dist/src/adapters/copilot-sdk-client.js +2 -23
- package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
- package/dist/src/adapters/harness/copilot-cli.js +1 -0
- package/dist/src/adapters/harness/copilot-cli.js.map +1 -1
- package/dist/src/adapters/harness/copilot-sdk.d.ts +1 -0
- package/dist/src/adapters/harness/copilot-sdk.js +15 -16
- package/dist/src/adapters/harness/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/harness/resolve.js +1 -5
- package/dist/src/adapters/harness/resolve.js.map +1 -1
- package/dist/src/adapters/harness/skill-blocker.d.ts +17 -0
- package/dist/src/adapters/harness/skill-blocker.js +47 -0
- package/dist/src/adapters/harness/skill-blocker.js.map +1 -0
- package/dist/src/adapters/inference/copilot-sdk.d.ts +1 -1
- package/dist/src/adapters/inference/copilot-sdk.js +4 -2
- package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
- package/dist/src/adapters/inference/github-models.js +3 -0
- package/dist/src/adapters/inference/github-models.js.map +1 -1
- package/dist/src/adapters/inference/resolve.js +6 -32
- package/dist/src/adapters/inference/resolve.js.map +1 -1
- package/dist/src/commands/eval.d.ts +1 -0
- package/dist/src/commands/eval.js +8 -0
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/engine/runner.js +1 -0
- package/dist/src/engine/runner.js.map +1 -1
- package/dist/src/errors.d.ts +0 -6
- package/dist/src/errors.js +1 -13
- package/dist/src/errors.js.map +1 -1
- package/dist/src/types.d.ts +1 -1
- package/package.json +8 -11
- package/plugin.json +4 -2
- package/skills/create-evals/SKILL.md +152 -0
- package/skills/run-evals/SKILL.md +132 -0
- package/src/adapters/copilot-sdk-client.ts +2 -22
- package/src/adapters/harness/copilot-cli.ts +1 -0
- package/src/adapters/harness/copilot-sdk.ts +20 -17
- package/src/adapters/harness/resolve.ts +1 -8
- package/src/adapters/harness/skill-blocker.ts +61 -0
- package/src/adapters/inference/copilot-sdk.ts +4 -2
- package/src/adapters/inference/github-models.ts +3 -0
- package/src/adapters/inference/resolve.ts +8 -43
- package/src/commands/eval.ts +14 -1
- package/src/engine/runner.ts +1 -0
- package/src/errors.ts +1 -15
- package/src/types.ts +1 -1
- package/assets/ideation-viewer.html +0 -469
- package/dist/src/adapters/inference/copilot.d.ts +0 -5
- package/dist/src/adapters/inference/copilot.js +0 -10
- package/dist/src/adapters/inference/copilot.js.map +0 -1
- package/dist/src/commands/review.d.ts +0 -8
- package/dist/src/commands/review.js +0 -32
- package/dist/src/commands/review.js.map +0 -1
- package/src/adapters/inference/copilot.ts +0 -12
- package/src/commands/review.ts +0 -46
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
import * as fs from 'node:fs';
|
|
2
2
|
import * as path from 'node:path';
|
|
3
3
|
import type { Harness, HarnessRunResult } from '../../types.js';
|
|
4
|
-
import { getClient
|
|
4
|
+
import { getClient } from '../copilot-sdk-client.js';
|
|
5
|
+
import {
|
|
6
|
+
createSkillBlockingPermissionHandler,
|
|
7
|
+
createSkillBlockingHook,
|
|
8
|
+
} from './skill-blocker.js';
|
|
5
9
|
|
|
6
10
|
export class CopilotSDKHarness implements Harness {
|
|
7
11
|
readonly name = 'copilot-sdk';
|
|
8
12
|
|
|
9
13
|
async run(options: {
|
|
10
14
|
skillPath?: string;
|
|
15
|
+
blockedSkillPath?: string;
|
|
11
16
|
prompt: string;
|
|
12
17
|
files?: string[];
|
|
13
18
|
outputDir: string;
|
|
@@ -18,7 +23,6 @@ export class CopilotSDKHarness implements Harness {
|
|
|
18
23
|
fs.mkdirSync(options.outputDir, { recursive: true });
|
|
19
24
|
|
|
20
25
|
// Dynamically import SDK for approveAll
|
|
21
|
-
// @ts-ignore — module may not be installed (optional dep)
|
|
22
26
|
const { approveAll } = await import('@github/copilot-sdk');
|
|
23
27
|
|
|
24
28
|
// Build session config
|
|
@@ -31,7 +35,16 @@ export class CopilotSDKHarness implements Harness {
|
|
|
31
35
|
|
|
32
36
|
// Native skill loading: point skillDirectories at the skill's parent
|
|
33
37
|
if (options.skillPath) {
|
|
34
|
-
sessionConfig.skillDirectories = [options.skillPath];
|
|
38
|
+
sessionConfig.skillDirectories = [path.dirname(options.skillPath)];
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Skill blocking: replace approveAll with blocking handler + hook
|
|
42
|
+
if (options.blockedSkillPath) {
|
|
43
|
+
const blockedDir = path.dirname(options.blockedSkillPath);
|
|
44
|
+
sessionConfig.onPermissionRequest = createSkillBlockingPermissionHandler(blockedDir);
|
|
45
|
+
sessionConfig.hooks = {
|
|
46
|
+
onPreToolUse: createSkillBlockingHook(blockedDir),
|
|
47
|
+
};
|
|
35
48
|
}
|
|
36
49
|
|
|
37
50
|
const session = await client.createSession(sessionConfig);
|
|
@@ -62,8 +75,8 @@ export class CopilotSDKHarness implements Harness {
|
|
|
62
75
|
const events = await session.getMessages();
|
|
63
76
|
const transcript = buildTranscript(events);
|
|
64
77
|
|
|
65
|
-
//
|
|
66
|
-
const totalTokens =
|
|
78
|
+
// SDK assistant.usage events are ephemeral and not available via getMessages()
|
|
79
|
+
const totalTokens = 0;
|
|
67
80
|
|
|
68
81
|
const durationMs = Date.now() - startMs;
|
|
69
82
|
|
|
@@ -80,7 +93,7 @@ export class CopilotSDKHarness implements Harness {
|
|
|
80
93
|
}
|
|
81
94
|
|
|
82
95
|
async isAvailable(): Promise<boolean> {
|
|
83
|
-
return
|
|
96
|
+
return true;
|
|
84
97
|
}
|
|
85
98
|
}
|
|
86
99
|
|
|
@@ -98,7 +111,7 @@ function buildTranscript(events: any[]): string {
|
|
|
98
111
|
lines.push(`[tool:start] ${event.data?.toolName ?? 'unknown'}(${JSON.stringify(event.data?.arguments ?? {})})`);
|
|
99
112
|
break;
|
|
100
113
|
case 'tool.execution_complete':
|
|
101
|
-
lines.push(`[tool:done] ${event.data?.toolName ?? 'unknown'} → ${truncate(event.data?.result ?? '', 200)}`);
|
|
114
|
+
lines.push(`[tool:done] ${event.data?.toolName ?? 'unknown'} → ${truncate(event.data?.result?.content ?? '', 200)}`);
|
|
102
115
|
break;
|
|
103
116
|
case 'skill.invoked':
|
|
104
117
|
lines.push(`[skill] ${event.data?.name ?? 'unknown'} (${event.data?.path ?? ''})`);
|
|
@@ -111,16 +124,6 @@ function buildTranscript(events: any[]): string {
|
|
|
111
124
|
return lines.join('\n');
|
|
112
125
|
}
|
|
113
126
|
|
|
114
|
-
function extractTokenCount(events: any[]): number {
|
|
115
|
-
let total = 0;
|
|
116
|
-
for (const event of events) {
|
|
117
|
-
if (event.type === 'assistant.usage') {
|
|
118
|
-
total += (event.data?.inputTokens ?? 0) + (event.data?.outputTokens ?? 0);
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
return total;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
127
|
function truncate(str: string, max: number): string {
|
|
125
128
|
return str.length > max ? str.slice(0, max) + '...' : str;
|
|
126
129
|
}
|
|
@@ -1,17 +1,10 @@
|
|
|
1
1
|
import type { Harness } from '../../types.js';
|
|
2
2
|
import { CopilotCLIHarness } from './copilot-cli.js';
|
|
3
3
|
import { CopilotSDKHarness } from './copilot-sdk.js';
|
|
4
|
-
import {
|
|
5
|
-
import { isSDKInstalled } from '../copilot-sdk-client.js';
|
|
4
|
+
import { SnapevalError } from '../../errors.js';
|
|
6
5
|
|
|
7
6
|
export function resolveHarness(name: string): Harness {
|
|
8
7
|
if (name === 'copilot-sdk') {
|
|
9
|
-
if (!isSDKInstalled()) {
|
|
10
|
-
throw new AdapterNotAvailableError(
|
|
11
|
-
'copilot-sdk',
|
|
12
|
-
'@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
|
|
13
|
-
);
|
|
14
|
-
}
|
|
15
8
|
return new CopilotSDKHarness();
|
|
16
9
|
}
|
|
17
10
|
if (name === 'copilot-cli') {
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import type { PermissionHandler } from '@github/copilot-sdk';
|
|
4
|
+
|
|
5
|
+
// PreToolUseHandler is not re-exported from @github/copilot-sdk's public API,
|
|
6
|
+
// so we define the minimal type inline matching the SDK's internal definition.
|
|
7
|
+
type PreToolUseHookInput = { toolName: string; toolArgs: unknown; timestamp: number; cwd: string };
|
|
8
|
+
type PreToolUseHookOutput = { permissionDecision?: 'allow' | 'deny' | 'ask'; permissionDecisionReason?: string };
|
|
9
|
+
type PreToolUseHandler = (input: PreToolUseHookInput, invocation: { sessionId: string }) => PreToolUseHookOutput | void;
|
|
10
|
+
|
|
11
|
+
function resolveDir(dir: string): string {
|
|
12
|
+
try {
|
|
13
|
+
return fs.realpathSync(dir) + path.sep;
|
|
14
|
+
} catch {
|
|
15
|
+
return path.resolve(dir) + path.sep;
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export function createSkillBlockingHook(
|
|
20
|
+
blockedDir: string,
|
|
21
|
+
): PreToolUseHandler {
|
|
22
|
+
const resolved = resolveDir(blockedDir);
|
|
23
|
+
const raw = path.resolve(blockedDir) + path.sep;
|
|
24
|
+
|
|
25
|
+
return (input) => {
|
|
26
|
+
const argsStr = typeof input.toolArgs === 'string'
|
|
27
|
+
? input.toolArgs
|
|
28
|
+
: JSON.stringify(input.toolArgs ?? '');
|
|
29
|
+
|
|
30
|
+
if (argsStr.includes(resolved) || argsStr.includes(raw)) {
|
|
31
|
+
return {
|
|
32
|
+
permissionDecision: 'deny' as const,
|
|
33
|
+
permissionDecisionReason: `Blocked: tool "${input.toolName}" references blocked skill directory`,
|
|
34
|
+
};
|
|
35
|
+
}
|
|
36
|
+
return {};
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
export function createSkillBlockingPermissionHandler(
|
|
41
|
+
blockedDir: string,
|
|
42
|
+
): PermissionHandler {
|
|
43
|
+
const resolved = resolveDir(blockedDir);
|
|
44
|
+
|
|
45
|
+
return (request) => {
|
|
46
|
+
if (request.kind === 'read') {
|
|
47
|
+
const readPath = (request as { path?: unknown }).path;
|
|
48
|
+
if (typeof readPath === 'string') {
|
|
49
|
+
try {
|
|
50
|
+
const resolvedPath = fs.realpathSync(readPath);
|
|
51
|
+
if (resolvedPath === resolved.slice(0, -1) || resolvedPath.startsWith(resolved)) {
|
|
52
|
+
return { kind: 'denied-by-rules', rules: [{ reason: 'skill-access-blocked' }] };
|
|
53
|
+
}
|
|
54
|
+
} catch {
|
|
55
|
+
// Path doesn't exist — can't be the skill dir
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
return { kind: 'approved' };
|
|
60
|
+
};
|
|
61
|
+
}
|
|
@@ -4,10 +4,9 @@ import { getClient } from '../copilot-sdk-client.js';
|
|
|
4
4
|
export class CopilotSDKInference implements InferenceAdapter {
|
|
5
5
|
readonly name = 'copilot-sdk';
|
|
6
6
|
|
|
7
|
-
async chat(messages: Message[],
|
|
7
|
+
async chat(messages: Message[], options?: ChatOptions): Promise<string> {
|
|
8
8
|
const client = await getClient();
|
|
9
9
|
|
|
10
|
-
// @ts-ignore — module may not be installed (optional dep)
|
|
11
10
|
const { approveAll } = await import('@github/copilot-sdk');
|
|
12
11
|
|
|
13
12
|
const systemMessages = messages.filter((m) => m.role === 'system');
|
|
@@ -15,6 +14,9 @@ export class CopilotSDKInference implements InferenceAdapter {
|
|
|
15
14
|
const systemContent = systemMessages.map((m) => m.content).join('\n');
|
|
16
15
|
const userPrompt = nonSystemMessages.map((m) => m.content).join('\n');
|
|
17
16
|
|
|
17
|
+
// Note: ChatOptions (temperature, responseFormat) are not supported by the
|
|
18
|
+
// SDK's session config. The SDK controls these at the server level.
|
|
19
|
+
void options;
|
|
18
20
|
const session = await client.createSession({
|
|
19
21
|
model: 'gpt-4.1',
|
|
20
22
|
...(systemContent
|
|
@@ -40,6 +40,9 @@ export class GitHubModelsInference implements InferenceAdapter {
|
|
|
40
40
|
const data = (await response.json()) as {
|
|
41
41
|
choices: Array<{ message: { content: string } }>;
|
|
42
42
|
};
|
|
43
|
+
if (!data.choices?.length) {
|
|
44
|
+
throw new Error('GitHub Models API returned no choices');
|
|
45
|
+
}
|
|
43
46
|
return data.choices[0].message.content;
|
|
44
47
|
}
|
|
45
48
|
}
|
|
@@ -1,19 +1,7 @@
|
|
|
1
|
-
import { execFileSync } from 'node:child_process';
|
|
2
1
|
import type { InferenceAdapter } from '../../types.js';
|
|
3
2
|
import { AdapterNotAvailableError } from '../../errors.js';
|
|
4
3
|
import { GitHubModelsInference } from './github-models.js';
|
|
5
|
-
import { CopilotInference } from './copilot.js';
|
|
6
4
|
import { CopilotSDKInference } from './copilot-sdk.js';
|
|
7
|
-
import { isSDKInstalled } from '../copilot-sdk-client.js';
|
|
8
|
-
|
|
9
|
-
function isCopilotAvailable(): boolean {
|
|
10
|
-
try {
|
|
11
|
-
execFileSync('copilot', ['--version'], { encoding: 'utf-8', stdio: 'pipe' });
|
|
12
|
-
return true;
|
|
13
|
-
} catch {
|
|
14
|
-
return false;
|
|
15
|
-
}
|
|
16
|
-
}
|
|
17
5
|
|
|
18
6
|
function isGitHubTokenAvailable(): boolean {
|
|
19
7
|
return Boolean(process.env.GITHUB_TOKEN);
|
|
@@ -21,31 +9,18 @@ function isGitHubTokenAvailable(): boolean {
|
|
|
21
9
|
|
|
22
10
|
export function resolveInference(preference: string): InferenceAdapter {
|
|
23
11
|
if (preference === 'auto') {
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
if (copilotAvailable) {
|
|
28
|
-
return new CopilotInference();
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
if (tokenAvailable) {
|
|
32
|
-
return new GitHubModelsInference();
|
|
33
|
-
}
|
|
12
|
+
return new CopilotSDKInference();
|
|
13
|
+
}
|
|
34
14
|
|
|
15
|
+
if (preference === 'copilot') {
|
|
35
16
|
throw new AdapterNotAvailableError(
|
|
36
|
-
'
|
|
37
|
-
'
|
|
17
|
+
'copilot',
|
|
18
|
+
'The copilot CLI inference adapter has been removed. Use --inference copilot-sdk instead.'
|
|
38
19
|
);
|
|
39
20
|
}
|
|
40
21
|
|
|
41
|
-
if (preference === 'copilot') {
|
|
42
|
-
|
|
43
|
-
throw new AdapterNotAvailableError(
|
|
44
|
-
'copilot',
|
|
45
|
-
'GitHub Copilot CLI is not available. Install with: npm install -g @github/copilot'
|
|
46
|
-
);
|
|
47
|
-
}
|
|
48
|
-
return new CopilotInference();
|
|
22
|
+
if (preference === 'copilot-sdk') {
|
|
23
|
+
return new CopilotSDKInference();
|
|
49
24
|
}
|
|
50
25
|
|
|
51
26
|
if (preference === 'github-models') {
|
|
@@ -58,18 +33,8 @@ export function resolveInference(preference: string): InferenceAdapter {
|
|
|
58
33
|
return new GitHubModelsInference();
|
|
59
34
|
}
|
|
60
35
|
|
|
61
|
-
if (preference === 'copilot-sdk') {
|
|
62
|
-
if (!isSDKInstalled()) {
|
|
63
|
-
throw new AdapterNotAvailableError(
|
|
64
|
-
'copilot-sdk',
|
|
65
|
-
'@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
|
|
66
|
-
);
|
|
67
|
-
}
|
|
68
|
-
return new CopilotSDKInference();
|
|
69
|
-
}
|
|
70
|
-
|
|
71
36
|
throw new AdapterNotAvailableError(
|
|
72
37
|
preference,
|
|
73
|
-
`Unknown inference adapter "${preference}". Valid options: auto, copilot
|
|
38
|
+
`Unknown inference adapter "${preference}". Valid options: auto, copilot-sdk, github-models.`
|
|
74
39
|
);
|
|
75
40
|
}
|
package/src/commands/eval.ts
CHANGED
|
@@ -7,6 +7,7 @@ import type {
|
|
|
7
7
|
EvalResults,
|
|
8
8
|
EvalRunResult,
|
|
9
9
|
GradingResult,
|
|
10
|
+
FeedbackData,
|
|
10
11
|
} from '../types.js';
|
|
11
12
|
import { WorkspaceManager } from '../engine/workspace.js';
|
|
12
13
|
import { runEval } from '../engine/runner.js';
|
|
@@ -86,7 +87,7 @@ export async function evalCommand(
|
|
|
86
87
|
skillPath: string,
|
|
87
88
|
harness: Harness,
|
|
88
89
|
inference: InferenceAdapter,
|
|
89
|
-
options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number }
|
|
90
|
+
options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number; feedback?: boolean }
|
|
90
91
|
): Promise<EvalResults> {
|
|
91
92
|
const evalsPath = path.join(skillPath, 'evals', 'evals.json');
|
|
92
93
|
if (!fs.existsSync(evalsPath)) {
|
|
@@ -224,6 +225,18 @@ export async function evalCommand(
|
|
|
224
225
|
typeof value === 'number' ? Math.round(value * 10000) / 10000 : value, 2)
|
|
225
226
|
);
|
|
226
227
|
|
|
228
|
+
// Write feedback template if requested
|
|
229
|
+
if (options.feedback) {
|
|
230
|
+
const feedback: FeedbackData = {};
|
|
231
|
+
for (const run of evalRuns) {
|
|
232
|
+
feedback[`eval-${run.slug}`] = '';
|
|
233
|
+
}
|
|
234
|
+
fs.writeFileSync(
|
|
235
|
+
path.join(iterationDir, 'feedback.json'),
|
|
236
|
+
JSON.stringify(feedback, null, 2)
|
|
237
|
+
);
|
|
238
|
+
}
|
|
239
|
+
|
|
227
240
|
// Check threshold if set (for CI gating)
|
|
228
241
|
if (options.threshold !== undefined) {
|
|
229
242
|
const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
|
package/src/engine/runner.ts
CHANGED
package/src/errors.ts
CHANGED
|
@@ -35,21 +35,7 @@ export class AdapterNotAvailableError extends SnapevalError {
|
|
|
35
35
|
|
|
36
36
|
export class RateLimitError extends SnapevalError {
|
|
37
37
|
constructor(adapterName: string) {
|
|
38
|
-
super(`${adapterName} rate limit exceeded. Try again later or use a different adapter
|
|
38
|
+
super(`${adapterName} rate limit exceeded. Try again later or use a different adapter.`, 4);
|
|
39
39
|
this.name = 'RateLimitError';
|
|
40
40
|
}
|
|
41
41
|
}
|
|
42
|
-
|
|
43
|
-
export class TimeoutError extends SnapevalError {
|
|
44
|
-
constructor(evalId: number, timeoutMs: number) {
|
|
45
|
-
super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
|
|
46
|
-
this.name = 'TimeoutError';
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
export class GradingError extends SnapevalError {
|
|
51
|
-
constructor(evalId: number, detail: string) {
|
|
52
|
-
super(`Grading failed for eval ${evalId}: ${detail}`, 4);
|
|
53
|
-
this.name = 'GradingError';
|
|
54
|
-
}
|
|
55
|
-
}
|