snapeval 2.2.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/README.md +2 -13
  2. package/bin/snapeval.ts +7 -44
  3. package/dist/bin/snapeval.d.ts +1 -1
  4. package/dist/bin/snapeval.js +6 -42
  5. package/dist/bin/snapeval.js.map +1 -1
  6. package/dist/src/adapters/copilot-sdk-client.d.ts +0 -4
  7. package/dist/src/adapters/copilot-sdk-client.js +2 -23
  8. package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
  9. package/dist/src/adapters/harness/copilot-cli.js +1 -0
  10. package/dist/src/adapters/harness/copilot-cli.js.map +1 -1
  11. package/dist/src/adapters/harness/copilot-sdk.d.ts +1 -0
  12. package/dist/src/adapters/harness/copilot-sdk.js +15 -16
  13. package/dist/src/adapters/harness/copilot-sdk.js.map +1 -1
  14. package/dist/src/adapters/harness/resolve.js +1 -5
  15. package/dist/src/adapters/harness/resolve.js.map +1 -1
  16. package/dist/src/adapters/harness/skill-blocker.d.ts +17 -0
  17. package/dist/src/adapters/harness/skill-blocker.js +47 -0
  18. package/dist/src/adapters/harness/skill-blocker.js.map +1 -0
  19. package/dist/src/adapters/inference/copilot-sdk.d.ts +1 -1
  20. package/dist/src/adapters/inference/copilot-sdk.js +4 -2
  21. package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
  22. package/dist/src/adapters/inference/github-models.js +3 -0
  23. package/dist/src/adapters/inference/github-models.js.map +1 -1
  24. package/dist/src/adapters/inference/resolve.js +6 -32
  25. package/dist/src/adapters/inference/resolve.js.map +1 -1
  26. package/dist/src/commands/eval.d.ts +1 -0
  27. package/dist/src/commands/eval.js +8 -0
  28. package/dist/src/commands/eval.js.map +1 -1
  29. package/dist/src/engine/runner.js +1 -0
  30. package/dist/src/engine/runner.js.map +1 -1
  31. package/dist/src/errors.d.ts +0 -6
  32. package/dist/src/errors.js +1 -13
  33. package/dist/src/errors.js.map +1 -1
  34. package/dist/src/types.d.ts +1 -1
  35. package/package.json +8 -11
  36. package/plugin.json +4 -2
  37. package/skills/create-evals/SKILL.md +152 -0
  38. package/skills/run-evals/SKILL.md +132 -0
  39. package/src/adapters/copilot-sdk-client.ts +2 -22
  40. package/src/adapters/harness/copilot-cli.ts +1 -0
  41. package/src/adapters/harness/copilot-sdk.ts +20 -17
  42. package/src/adapters/harness/resolve.ts +1 -8
  43. package/src/adapters/harness/skill-blocker.ts +61 -0
  44. package/src/adapters/inference/copilot-sdk.ts +4 -2
  45. package/src/adapters/inference/github-models.ts +3 -0
  46. package/src/adapters/inference/resolve.ts +8 -43
  47. package/src/commands/eval.ts +14 -1
  48. package/src/engine/runner.ts +1 -0
  49. package/src/errors.ts +1 -15
  50. package/src/types.ts +1 -1
  51. package/assets/ideation-viewer.html +0 -469
  52. package/dist/src/adapters/inference/copilot.d.ts +0 -5
  53. package/dist/src/adapters/inference/copilot.js +0 -10
  54. package/dist/src/adapters/inference/copilot.js.map +0 -1
  55. package/dist/src/commands/review.d.ts +0 -8
  56. package/dist/src/commands/review.js +0 -32
  57. package/dist/src/commands/review.js.map +0 -1
  58. package/src/adapters/inference/copilot.ts +0 -12
  59. package/src/commands/review.ts +0 -46
@@ -1,13 +1,18 @@
1
1
  import * as fs from 'node:fs';
2
2
  import * as path from 'node:path';
3
3
  import type { Harness, HarnessRunResult } from '../../types.js';
4
- import { getClient, isSDKInstalled } from '../copilot-sdk-client.js';
4
+ import { getClient } from '../copilot-sdk-client.js';
5
+ import {
6
+ createSkillBlockingPermissionHandler,
7
+ createSkillBlockingHook,
8
+ } from './skill-blocker.js';
5
9
 
6
10
  export class CopilotSDKHarness implements Harness {
7
11
  readonly name = 'copilot-sdk';
8
12
 
9
13
  async run(options: {
10
14
  skillPath?: string;
15
+ blockedSkillPath?: string;
11
16
  prompt: string;
12
17
  files?: string[];
13
18
  outputDir: string;
@@ -18,7 +23,6 @@ export class CopilotSDKHarness implements Harness {
18
23
  fs.mkdirSync(options.outputDir, { recursive: true });
19
24
 
20
25
  // Dynamically import SDK for approveAll
21
- // @ts-ignore — module may not be installed (optional dep)
22
26
  const { approveAll } = await import('@github/copilot-sdk');
23
27
 
24
28
  // Build session config
@@ -31,7 +35,16 @@ export class CopilotSDKHarness implements Harness {
31
35
 
32
36
  // Native skill loading: point skillDirectories at the skill's parent
33
37
  if (options.skillPath) {
34
- sessionConfig.skillDirectories = [options.skillPath];
38
+ sessionConfig.skillDirectories = [path.dirname(options.skillPath)];
39
+ }
40
+
41
+ // Skill blocking: replace approveAll with blocking handler + hook
42
+ if (options.blockedSkillPath) {
43
+ const blockedDir = path.dirname(options.blockedSkillPath);
44
+ sessionConfig.onPermissionRequest = createSkillBlockingPermissionHandler(blockedDir);
45
+ sessionConfig.hooks = {
46
+ onPreToolUse: createSkillBlockingHook(blockedDir),
47
+ };
35
48
  }
36
49
 
37
50
  const session = await client.createSession(sessionConfig);
@@ -62,8 +75,8 @@ export class CopilotSDKHarness implements Harness {
62
75
  const events = await session.getMessages();
63
76
  const transcript = buildTranscript(events);
64
77
 
65
- // Extract token count from events if available
66
- const totalTokens = extractTokenCount(events);
78
+ // SDK assistant.usage events are ephemeral and not available via getMessages()
79
+ const totalTokens = 0;
67
80
 
68
81
  const durationMs = Date.now() - startMs;
69
82
 
@@ -80,7 +93,7 @@ export class CopilotSDKHarness implements Harness {
80
93
  }
81
94
 
82
95
  async isAvailable(): Promise<boolean> {
83
- return isSDKInstalled();
96
+ return true;
84
97
  }
85
98
  }
86
99
 
@@ -98,7 +111,7 @@ function buildTranscript(events: any[]): string {
98
111
  lines.push(`[tool:start] ${event.data?.toolName ?? 'unknown'}(${JSON.stringify(event.data?.arguments ?? {})})`);
99
112
  break;
100
113
  case 'tool.execution_complete':
101
- lines.push(`[tool:done] ${event.data?.toolName ?? 'unknown'} → ${truncate(event.data?.result ?? '', 200)}`);
114
+ lines.push(`[tool:done] ${event.data?.toolName ?? 'unknown'} → ${truncate(event.data?.result?.content ?? '', 200)}`);
102
115
  break;
103
116
  case 'skill.invoked':
104
117
  lines.push(`[skill] ${event.data?.name ?? 'unknown'} (${event.data?.path ?? ''})`);
@@ -111,16 +124,6 @@ function buildTranscript(events: any[]): string {
111
124
  return lines.join('\n');
112
125
  }
113
126
 
114
- function extractTokenCount(events: any[]): number {
115
- let total = 0;
116
- for (const event of events) {
117
- if (event.type === 'assistant.usage') {
118
- total += (event.data?.inputTokens ?? 0) + (event.data?.outputTokens ?? 0);
119
- }
120
- }
121
- return total;
122
- }
123
-
124
127
  function truncate(str: string, max: number): string {
125
128
  return str.length > max ? str.slice(0, max) + '...' : str;
126
129
  }
@@ -1,17 +1,10 @@
1
1
  import type { Harness } from '../../types.js';
2
2
  import { CopilotCLIHarness } from './copilot-cli.js';
3
3
  import { CopilotSDKHarness } from './copilot-sdk.js';
4
- import { AdapterNotAvailableError, SnapevalError } from '../../errors.js';
5
- import { isSDKInstalled } from '../copilot-sdk-client.js';
4
+ import { SnapevalError } from '../../errors.js';
6
5
 
7
6
  export function resolveHarness(name: string): Harness {
8
7
  if (name === 'copilot-sdk') {
9
- if (!isSDKInstalled()) {
10
- throw new AdapterNotAvailableError(
11
- 'copilot-sdk',
12
- '@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
13
- );
14
- }
15
8
  return new CopilotSDKHarness();
16
9
  }
17
10
  if (name === 'copilot-cli') {
@@ -0,0 +1,61 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import type { PermissionHandler } from '@github/copilot-sdk';
4
+
5
+ // PreToolUseHandler is not re-exported from @github/copilot-sdk's public API,
6
+ // so we define the minimal type inline matching the SDK's internal definition.
7
+ type PreToolUseHookInput = { toolName: string; toolArgs: unknown; timestamp: number; cwd: string };
8
+ type PreToolUseHookOutput = { permissionDecision?: 'allow' | 'deny' | 'ask'; permissionDecisionReason?: string };
9
+ type PreToolUseHandler = (input: PreToolUseHookInput, invocation: { sessionId: string }) => PreToolUseHookOutput | void;
10
+
11
+ function resolveDir(dir: string): string {
12
+ try {
13
+ return fs.realpathSync(dir) + path.sep;
14
+ } catch {
15
+ return path.resolve(dir) + path.sep;
16
+ }
17
+ }
18
+
19
+ export function createSkillBlockingHook(
20
+ blockedDir: string,
21
+ ): PreToolUseHandler {
22
+ const resolved = resolveDir(blockedDir);
23
+ const raw = path.resolve(blockedDir) + path.sep;
24
+
25
+ return (input) => {
26
+ const argsStr = typeof input.toolArgs === 'string'
27
+ ? input.toolArgs
28
+ : JSON.stringify(input.toolArgs ?? '');
29
+
30
+ if (argsStr.includes(resolved) || argsStr.includes(raw)) {
31
+ return {
32
+ permissionDecision: 'deny' as const,
33
+ permissionDecisionReason: `Blocked: tool "${input.toolName}" references blocked skill directory`,
34
+ };
35
+ }
36
+ return {};
37
+ };
38
+ }
39
+
40
+ export function createSkillBlockingPermissionHandler(
41
+ blockedDir: string,
42
+ ): PermissionHandler {
43
+ const resolved = resolveDir(blockedDir);
44
+
45
+ return (request) => {
46
+ if (request.kind === 'read') {
47
+ const readPath = (request as { path?: unknown }).path;
48
+ if (typeof readPath === 'string') {
49
+ try {
50
+ const resolvedPath = fs.realpathSync(readPath);
51
+ if (resolvedPath === resolved.slice(0, -1) || resolvedPath.startsWith(resolved)) {
52
+ return { kind: 'denied-by-rules', rules: [{ reason: 'skill-access-blocked' }] };
53
+ }
54
+ } catch {
55
+ // Path doesn't exist — can't be the skill dir
56
+ }
57
+ }
58
+ }
59
+ return { kind: 'approved' };
60
+ };
61
+ }
@@ -4,10 +4,9 @@ import { getClient } from '../copilot-sdk-client.js';
4
4
  export class CopilotSDKInference implements InferenceAdapter {
5
5
  readonly name = 'copilot-sdk';
6
6
 
7
- async chat(messages: Message[], _options?: ChatOptions): Promise<string> {
7
+ async chat(messages: Message[], options?: ChatOptions): Promise<string> {
8
8
  const client = await getClient();
9
9
 
10
- // @ts-ignore — module may not be installed (optional dep)
11
10
  const { approveAll } = await import('@github/copilot-sdk');
12
11
 
13
12
  const systemMessages = messages.filter((m) => m.role === 'system');
@@ -15,6 +14,9 @@ export class CopilotSDKInference implements InferenceAdapter {
15
14
  const systemContent = systemMessages.map((m) => m.content).join('\n');
16
15
  const userPrompt = nonSystemMessages.map((m) => m.content).join('\n');
17
16
 
17
+ // Note: ChatOptions (temperature, responseFormat) are not supported by the
18
+ // SDK's session config. The SDK controls these at the server level.
19
+ void options;
18
20
  const session = await client.createSession({
19
21
  model: 'gpt-4.1',
20
22
  ...(systemContent
@@ -40,6 +40,9 @@ export class GitHubModelsInference implements InferenceAdapter {
40
40
  const data = (await response.json()) as {
41
41
  choices: Array<{ message: { content: string } }>;
42
42
  };
43
+ if (!data.choices?.length) {
44
+ throw new Error('GitHub Models API returned no choices');
45
+ }
43
46
  return data.choices[0].message.content;
44
47
  }
45
48
  }
@@ -1,19 +1,7 @@
1
- import { execFileSync } from 'node:child_process';
2
1
  import type { InferenceAdapter } from '../../types.js';
3
2
  import { AdapterNotAvailableError } from '../../errors.js';
4
3
  import { GitHubModelsInference } from './github-models.js';
5
- import { CopilotInference } from './copilot.js';
6
4
  import { CopilotSDKInference } from './copilot-sdk.js';
7
- import { isSDKInstalled } from '../copilot-sdk-client.js';
8
-
9
- function isCopilotAvailable(): boolean {
10
- try {
11
- execFileSync('copilot', ['--version'], { encoding: 'utf-8', stdio: 'pipe' });
12
- return true;
13
- } catch {
14
- return false;
15
- }
16
- }
17
5
 
18
6
  function isGitHubTokenAvailable(): boolean {
19
7
  return Boolean(process.env.GITHUB_TOKEN);
@@ -21,31 +9,18 @@ function isGitHubTokenAvailable(): boolean {
21
9
 
22
10
  export function resolveInference(preference: string): InferenceAdapter {
23
11
  if (preference === 'auto') {
24
- const copilotAvailable = isCopilotAvailable();
25
- const tokenAvailable = isGitHubTokenAvailable();
26
-
27
- if (copilotAvailable) {
28
- return new CopilotInference();
29
- }
30
-
31
- if (tokenAvailable) {
32
- return new GitHubModelsInference();
33
- }
12
+ return new CopilotSDKInference();
13
+ }
34
14
 
15
+ if (preference === 'copilot') {
35
16
  throw new AdapterNotAvailableError(
36
- 'inference',
37
- 'No inference adapter available. Install GitHub Copilot CLI (`npm install -g @github/copilot`) or set GITHUB_TOKEN.'
17
+ 'copilot',
18
+ 'The copilot CLI inference adapter has been removed. Use --inference copilot-sdk instead.'
38
19
  );
39
20
  }
40
21
 
41
- if (preference === 'copilot') {
42
- if (!isCopilotAvailable()) {
43
- throw new AdapterNotAvailableError(
44
- 'copilot',
45
- 'GitHub Copilot CLI is not available. Install with: npm install -g @github/copilot'
46
- );
47
- }
48
- return new CopilotInference();
22
+ if (preference === 'copilot-sdk') {
23
+ return new CopilotSDKInference();
49
24
  }
50
25
 
51
26
  if (preference === 'github-models') {
@@ -58,18 +33,8 @@ export function resolveInference(preference: string): InferenceAdapter {
58
33
  return new GitHubModelsInference();
59
34
  }
60
35
 
61
- if (preference === 'copilot-sdk') {
62
- if (!isSDKInstalled()) {
63
- throw new AdapterNotAvailableError(
64
- 'copilot-sdk',
65
- '@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk'
66
- );
67
- }
68
- return new CopilotSDKInference();
69
- }
70
-
71
36
  throw new AdapterNotAvailableError(
72
37
  preference,
73
- `Unknown inference adapter "${preference}". Valid options: auto, copilot, copilot-sdk, github-models.`
38
+ `Unknown inference adapter "${preference}". Valid options: auto, copilot-sdk, github-models.`
74
39
  );
75
40
  }
@@ -7,6 +7,7 @@ import type {
7
7
  EvalResults,
8
8
  EvalRunResult,
9
9
  GradingResult,
10
+ FeedbackData,
10
11
  } from '../types.js';
11
12
  import { WorkspaceManager } from '../engine/workspace.js';
12
13
  import { runEval } from '../engine/runner.js';
@@ -86,7 +87,7 @@ export async function evalCommand(
86
87
  skillPath: string,
87
88
  harness: Harness,
88
89
  inference: InferenceAdapter,
89
- options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number }
90
+ options: { workspace?: string; runs?: number; oldSkill?: string; concurrency?: number; only?: number[]; threshold?: number; feedback?: boolean }
90
91
  ): Promise<EvalResults> {
91
92
  const evalsPath = path.join(skillPath, 'evals', 'evals.json');
92
93
  if (!fs.existsSync(evalsPath)) {
@@ -224,6 +225,18 @@ export async function evalCommand(
224
225
  typeof value === 'number' ? Math.round(value * 10000) / 10000 : value, 2)
225
226
  );
226
227
 
228
+ // Write feedback template if requested
229
+ if (options.feedback) {
230
+ const feedback: FeedbackData = {};
231
+ for (const run of evalRuns) {
232
+ feedback[`eval-${run.slug}`] = '';
233
+ }
234
+ fs.writeFileSync(
235
+ path.join(iterationDir, 'feedback.json'),
236
+ JSON.stringify(feedback, null, 2)
237
+ );
238
+ }
239
+
227
240
  // Check threshold if set (for CI gating)
228
241
  if (options.threshold !== undefined) {
229
242
  const passRate = benchmark.run_summary.with_skill.pass_rate.mean;
@@ -43,6 +43,7 @@ export async function runEval(
43
43
  }),
44
44
  harness.run({
45
45
  skillPath: oldSkillPath,
46
+ blockedSkillPath: skillPath,
46
47
  prompt: evalCase.prompt,
47
48
  files: evalCase.files,
48
49
  outputDir: path.join(baselineDir, 'outputs'),
package/src/errors.ts CHANGED
@@ -35,21 +35,7 @@ export class AdapterNotAvailableError extends SnapevalError {
35
35
 
36
36
  export class RateLimitError extends SnapevalError {
37
37
  constructor(adapterName: string) {
38
- super(`${adapterName} rate limit exceeded. Try again later or use a different adapter.`);
38
+ super(`${adapterName} rate limit exceeded. Try again later or use a different adapter.`, 4);
39
39
  this.name = 'RateLimitError';
40
40
  }
41
41
  }
42
-
43
- export class TimeoutError extends SnapevalError {
44
- constructor(evalId: number, timeoutMs: number) {
45
- super(`Eval ${evalId} timed out after ${timeoutMs}ms.`, 4);
46
- this.name = 'TimeoutError';
47
- }
48
- }
49
-
50
- export class GradingError extends SnapevalError {
51
- constructor(evalId: number, detail: string) {
52
- super(`Grading failed for eval ${evalId}: ${detail}`, 4);
53
- this.name = 'GradingError';
54
- }
55
- }
package/src/types.ts CHANGED
@@ -15,7 +15,7 @@ export interface Harness {
15
15
  prompt: string;
16
16
  files?: string[];
17
17
  outputDir: string;
18
- }): Promise<HarnessRunResult>;
18
+ } & Record<string, unknown>): Promise<HarnessRunResult>;
19
19
  isAvailable(): Promise<boolean>;
20
20
  }
21
21