snapeval 1.8.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/bin/snapeval.ts +30 -24
  2. package/dist/bin/snapeval.js +25 -22
  3. package/dist/bin/snapeval.js.map +1 -1
  4. package/dist/src/adapters/copilot-sdk-client.js +1 -1
  5. package/dist/src/adapters/copilot-sdk-client.js.map +1 -1
  6. package/dist/src/adapters/harness/copilot-sdk.d.ts +11 -0
  7. package/dist/src/adapters/harness/copilot-sdk.js +101 -0
  8. package/dist/src/adapters/harness/copilot-sdk.js.map +1 -0
  9. package/dist/src/adapters/harness/resolve.js +10 -2
  10. package/dist/src/adapters/harness/resolve.js.map +1 -1
  11. package/dist/src/adapters/inference/copilot-sdk.js +4 -1
  12. package/dist/src/adapters/inference/copilot-sdk.js.map +1 -1
  13. package/dist/src/adapters/report/terminal.js +89 -9
  14. package/dist/src/adapters/report/terminal.js.map +1 -1
  15. package/dist/src/commands/eval.d.ts +3 -0
  16. package/dist/src/commands/eval.js +106 -17
  17. package/dist/src/commands/eval.js.map +1 -1
  18. package/dist/src/commands/review.d.ts +1 -0
  19. package/dist/src/commands/review.js.map +1 -1
  20. package/dist/src/config.js +2 -1
  21. package/dist/src/config.js.map +1 -1
  22. package/dist/src/engine/grader.js +67 -9
  23. package/dist/src/engine/grader.js.map +1 -1
  24. package/dist/src/engine/runner.js +14 -12
  25. package/dist/src/engine/runner.js.map +1 -1
  26. package/dist/src/errors.d.ts +6 -0
  27. package/dist/src/errors.js +21 -3
  28. package/dist/src/errors.js.map +1 -1
  29. package/dist/src/types.d.ts +1 -0
  30. package/package.json +4 -1
  31. package/plugin.json +1 -1
  32. package/skills/snapeval/SKILL.md +33 -18
  33. package/src/adapters/copilot-sdk-client.ts +1 -1
  34. package/src/adapters/harness/copilot-sdk.ts +126 -0
  35. package/src/adapters/harness/resolve.ts +13 -2
  36. package/src/adapters/inference/copilot-sdk.ts +5 -1
  37. package/src/adapters/report/terminal.ts +100 -10
  38. package/src/commands/eval.ts +133 -31
  39. package/src/commands/review.ts +1 -1
  40. package/src/config.ts +2 -1
  41. package/src/engine/grader.ts +59 -8
  42. package/src/engine/runner.ts +14 -13
  43. package/src/errors.ts +24 -3
  44. package/src/types.ts +1 -0
  45. package/dist/src/commands/init.d.ts +0 -2
  46. package/dist/src/commands/init.js +0 -27
  47. package/dist/src/commands/init.js.map +0 -1
  48. package/dist/src/engine/generator.d.ts +0 -3
  49. package/dist/src/engine/generator.js +0 -51
  50. package/dist/src/engine/generator.js.map +0 -1
  51. package/src/commands/init.ts +0 -38
  52. package/src/engine/generator.ts +0 -60
package/bin/snapeval.ts CHANGED
@@ -3,11 +3,11 @@ import { Command } from 'commander';
3
3
  import { resolveConfig } from '../src/config.js';
4
4
  import { resolveInference } from '../src/adapters/inference/resolve.js';
5
5
  import { resolveHarness } from '../src/adapters/harness/resolve.js';
6
- import { initCommand } from '../src/commands/init.js';
7
6
  import { evalCommand } from '../src/commands/eval.js';
8
7
  import { reviewCommand } from '../src/commands/review.js';
9
8
  import { TerminalReporter } from '../src/adapters/report/terminal.js';
10
9
  import { SnapevalError } from '../src/errors.js';
10
+ import { stopClient } from '../src/adapters/copilot-sdk-client.js';
11
11
  import * as path from 'node:path';
12
12
 
13
13
  const program = new Command();
@@ -17,28 +17,6 @@ program
17
17
  .description('Harness-agnostic eval runner for agentskills.io skills')
18
18
  .version('2.0.0');
19
19
 
20
- // --- init ---
21
- program
22
- .command('init')
23
- .description('Generate evals.json from SKILL.md (prompts + expected outputs, no assertions)')
24
- .option('--harness <harness>', 'Harness to use')
25
- .option('--inference <inference>', 'Inference adapter to use')
26
- .option('--verbose', 'Verbose output')
27
- .argument('[skill-dir]', 'Path to skill directory', process.cwd())
28
- .action(async (skillDir: string, opts: Record<string, string | boolean>) => {
29
- try {
30
- const skillPath = path.resolve(skillDir);
31
- const config = resolveConfig(
32
- { harness: opts.harness as string, inference: opts.inference as string },
33
- process.cwd(), skillPath
34
- );
35
- const inference = resolveInference(config.inference);
36
- await initCommand(skillPath, inference);
37
- console.log(`Generated evals at ${path.join(skillPath, 'evals', 'evals.json')}`);
38
- process.exit(0);
39
- } catch (err) { handleError(err); }
40
- });
41
-
42
20
  // --- eval ---
43
21
  program
44
22
  .command('eval')
@@ -47,6 +25,9 @@ program
47
25
  .option('--inference <inference>', 'Inference adapter to use')
48
26
  .option('--workspace <path>', 'Workspace directory')
49
27
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
28
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
29
+ .option('--only <ids>', 'Run only specific eval IDs (comma-separated, e.g. --only 1,3,5)')
30
+ .option('--threshold <rate>', 'Minimum pass rate (0-1) for exit code 0. Below threshold exits with code 1.')
50
31
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
51
32
  .option('--verbose', 'Verbose output')
52
33
  .argument('[skill-dir]', 'Path to skill directory', process.cwd())
@@ -59,15 +40,26 @@ program
59
40
  inference: opts.inference as string,
60
41
  workspace: opts.workspace as string,
61
42
  runs: opts.runs ? parseInt(opts.runs as string, 10) : undefined,
43
+ concurrency: opts.concurrency ? parseInt(opts.concurrency as string, 10) : undefined,
62
44
  },
63
45
  process.cwd(), skillPath
64
46
  );
65
47
  const harness = resolveHarness(config.harness);
66
48
  const inference = resolveInference(config.inference);
67
49
 
50
+ const only = opts.only
51
+ ? (opts.only as string).split(',').map((s) => parseInt(s.trim(), 10))
52
+ : undefined;
53
+ const threshold = opts.threshold
54
+ ? parseFloat(opts.threshold as string)
55
+ : undefined;
56
+
68
57
  const results = await evalCommand(skillPath, harness, inference, {
69
58
  workspace: config.workspace,
70
59
  runs: config.runs,
60
+ concurrency: config.concurrency,
61
+ only,
62
+ threshold,
71
63
  oldSkill: opts.oldSkill as string | undefined,
72
64
  });
73
65
 
@@ -75,7 +67,15 @@ program
75
67
  await terminal.report(results);
76
68
  console.log(`Results at ${results.iterationDir}`);
77
69
  process.exit(0);
78
- } catch (err) { handleError(err); }
70
+ } catch (err: any) {
71
+ // ThresholdError has results attached — show them before failing
72
+ if (err.results) {
73
+ const terminal = new TerminalReporter();
74
+ await terminal.report(err.results);
75
+ console.log(`Results at ${err.results.iterationDir}`);
76
+ }
77
+ handleError(err);
78
+ }
79
79
  });
80
80
 
81
81
  // --- review ---
@@ -86,6 +86,7 @@ program
86
86
  .option('--inference <inference>', 'Inference adapter to use')
87
87
  .option('--workspace <path>', 'Workspace directory')
88
88
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
89
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
89
90
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
90
91
  .option('--no-open', 'Do not open browser')
91
92
  .option('--verbose', 'Verbose output')
@@ -99,6 +100,7 @@ program
99
100
  inference: opts.inference as string,
100
101
  workspace: opts.workspace as string,
101
102
  runs: opts.runs ? parseInt(opts.runs as string, 10) : undefined,
103
+ concurrency: opts.concurrency ? parseInt(opts.concurrency as string, 10) : undefined,
102
104
  },
103
105
  process.cwd(), skillPath
104
106
  );
@@ -108,6 +110,7 @@ program
108
110
  await reviewCommand(skillPath, harness, inference, {
109
111
  workspace: config.workspace,
110
112
  runs: config.runs,
113
+ concurrency: config.concurrency,
111
114
  oldSkill: opts.oldSkill as string | undefined,
112
115
  noOpen: opts.open === false,
113
116
  });
@@ -115,6 +118,9 @@ program
115
118
  } catch (err) { handleError(err); }
116
119
  });
117
120
 
121
+ // Clean up SDK client on exit (no-op if never started)
122
+ process.on('exit', () => { stopClient().catch(() => {}); });
123
+
118
124
  function handleError(err: unknown): never {
119
125
  if (err instanceof SnapevalError) {
120
126
  console.error(`Error: ${err.message}`);
@@ -3,38 +3,17 @@ import { Command } from 'commander';
3
3
  import { resolveConfig } from '../src/config.js';
4
4
  import { resolveInference } from '../src/adapters/inference/resolve.js';
5
5
  import { resolveHarness } from '../src/adapters/harness/resolve.js';
6
- import { initCommand } from '../src/commands/init.js';
7
6
  import { evalCommand } from '../src/commands/eval.js';
8
7
  import { reviewCommand } from '../src/commands/review.js';
9
8
  import { TerminalReporter } from '../src/adapters/report/terminal.js';
10
9
  import { SnapevalError } from '../src/errors.js';
10
+ import { stopClient } from '../src/adapters/copilot-sdk-client.js';
11
11
  import * as path from 'node:path';
12
12
  const program = new Command();
13
13
  program
14
14
  .name('snapeval')
15
15
  .description('Harness-agnostic eval runner for agentskills.io skills')
16
16
  .version('2.0.0');
17
- // --- init ---
18
- program
19
- .command('init')
20
- .description('Generate evals.json from SKILL.md (prompts + expected outputs, no assertions)')
21
- .option('--harness <harness>', 'Harness to use')
22
- .option('--inference <inference>', 'Inference adapter to use')
23
- .option('--verbose', 'Verbose output')
24
- .argument('[skill-dir]', 'Path to skill directory', process.cwd())
25
- .action(async (skillDir, opts) => {
26
- try {
27
- const skillPath = path.resolve(skillDir);
28
- const config = resolveConfig({ harness: opts.harness, inference: opts.inference }, process.cwd(), skillPath);
29
- const inference = resolveInference(config.inference);
30
- await initCommand(skillPath, inference);
31
- console.log(`Generated evals at ${path.join(skillPath, 'evals', 'evals.json')}`);
32
- process.exit(0);
33
- }
34
- catch (err) {
35
- handleError(err);
36
- }
37
- });
38
17
  // --- eval ---
39
18
  program
40
19
  .command('eval')
@@ -43,6 +22,9 @@ program
43
22
  .option('--inference <inference>', 'Inference adapter to use')
44
23
  .option('--workspace <path>', 'Workspace directory')
45
24
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
25
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
26
+ .option('--only <ids>', 'Run only specific eval IDs (comma-separated, e.g. --only 1,3,5)')
27
+ .option('--threshold <rate>', 'Minimum pass rate (0-1) for exit code 0. Below threshold exits with code 1.')
46
28
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
47
29
  .option('--verbose', 'Verbose output')
48
30
  .argument('[skill-dir]', 'Path to skill directory', process.cwd())
@@ -54,12 +36,22 @@ program
54
36
  inference: opts.inference,
55
37
  workspace: opts.workspace,
56
38
  runs: opts.runs ? parseInt(opts.runs, 10) : undefined,
39
+ concurrency: opts.concurrency ? parseInt(opts.concurrency, 10) : undefined,
57
40
  }, process.cwd(), skillPath);
58
41
  const harness = resolveHarness(config.harness);
59
42
  const inference = resolveInference(config.inference);
43
+ const only = opts.only
44
+ ? opts.only.split(',').map((s) => parseInt(s.trim(), 10))
45
+ : undefined;
46
+ const threshold = opts.threshold
47
+ ? parseFloat(opts.threshold)
48
+ : undefined;
60
49
  const results = await evalCommand(skillPath, harness, inference, {
61
50
  workspace: config.workspace,
62
51
  runs: config.runs,
52
+ concurrency: config.concurrency,
53
+ only,
54
+ threshold,
63
55
  oldSkill: opts.oldSkill,
64
56
  });
65
57
  const terminal = new TerminalReporter();
@@ -68,6 +60,12 @@ program
68
60
  process.exit(0);
69
61
  }
70
62
  catch (err) {
63
+ // ThresholdError has results attached — show them before failing
64
+ if (err.results) {
65
+ const terminal = new TerminalReporter();
66
+ await terminal.report(err.results);
67
+ console.log(`Results at ${err.results.iterationDir}`);
68
+ }
71
69
  handleError(err);
72
70
  }
73
71
  });
@@ -79,6 +77,7 @@ program
79
77
  .option('--inference <inference>', 'Inference adapter to use')
80
78
  .option('--workspace <path>', 'Workspace directory')
81
79
  .option('--runs <n>', 'Runs per eval for statistical significance', '1')
80
+ .option('--concurrency <n>', 'Number of eval cases to run in parallel (1-10)', '1')
82
81
  .option('--old-skill <path>', 'Compare against old skill version instead of no-skill')
83
82
  .option('--no-open', 'Do not open browser')
84
83
  .option('--verbose', 'Verbose output')
@@ -91,12 +90,14 @@ program
91
90
  inference: opts.inference,
92
91
  workspace: opts.workspace,
93
92
  runs: opts.runs ? parseInt(opts.runs, 10) : undefined,
93
+ concurrency: opts.concurrency ? parseInt(opts.concurrency, 10) : undefined,
94
94
  }, process.cwd(), skillPath);
95
95
  const harness = resolveHarness(config.harness);
96
96
  const inference = resolveInference(config.inference);
97
97
  await reviewCommand(skillPath, harness, inference, {
98
98
  workspace: config.workspace,
99
99
  runs: config.runs,
100
+ concurrency: config.concurrency,
100
101
  oldSkill: opts.oldSkill,
101
102
  noOpen: opts.open === false,
102
103
  });
@@ -106,6 +107,8 @@ program
106
107
  handleError(err);
107
108
  }
108
109
  });
110
+ // Clean up SDK client on exit (no-op if never started)
111
+ process.on('exit', () => { stopClient().catch(() => { }); });
109
112
  function handleError(err) {
110
113
  if (err instanceof SnapevalError) {
111
114
  console.error(`Error: ${err.message}`);
@@ -1 +1 @@
1
- {"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,gBAAgB,EAAE,MAAM,sCAAsC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,oCAAoC,CAAC;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wDAAwD,CAAC;KACrE,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,+EAA+E,CAAC;KAC5F,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B,EAAE,OAAO,EAAE,IAAI,CAAC,OAAiB,EAAE,SAAS,EAAE,IAAI,CAAC,SAAmB,EAAE,EACxE,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QACrD,MAAM,WAAW,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;QACxC,OAAO,CAAC,GAAG,CAAC,sBAAsB,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,EAAE,CAAC,CAAC;QACjF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,qEAAqE,CAAC;KAClF,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,
GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SAChE,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YAC/D,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,QAAQ,EAAE,IAAI,CAAC,QAA8B;SAC9C,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,qBAAqB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SAChE,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,aAAa,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YACjD,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,QAAQ,EAAE,IAAI,CAAC,QAA8B;YAC7C,MAA
M,EAAE,IAAI,CAAC,IAAI,KAAK,KAAK;SAC5B,CAAC,CAAC;QACH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,GAAG,YAAY,aAAa,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAClC,CAAC;IACD,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
1
+ {"version":3,"file":"snapeval.js","sourceRoot":"","sources":["../../bin/snapeval.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,gBAAgB,EAAE,MAAM,sCAAsC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,oCAAoC,CAAC;AACpE,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,aAAa,EAAE,MAAM,2BAA2B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AACtE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AACjD,OAAO,EAAE,UAAU,EAAE,MAAM,uCAAuC,CAAC;AACnE,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,UAAU,CAAC;KAChB,WAAW,CAAC,wDAAwD,CAAC;KACrE,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,eAAe;AACf,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,qEAAqE,CAAC;KAClF,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,cAAc,EAAE,iEAAiE,CAAC;KACzF,MAAM,CAAC,oBAAoB,EAAE,6EAA6E,CAAC;KAC3G,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI;YACpB,CAAC,CAAE,IAAI,CAAC,IAAe,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;YACrE,C
AAC,CAAC,SAAS,CAAC;QACd,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS;YAC9B,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,SAAmB,CAAC;YACtC,CAAC,CAAC,SAAS,CAAC;QAEd,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE;YAC/D,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,IAAI;YACJ,SAAS;YACT,QAAQ,EAAE,IAAI,CAAC,QAA8B;SAC9C,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/B,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,iEAAiE;QACjE,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;YAChB,MAAM,QAAQ,GAAG,IAAI,gBAAgB,EAAE,CAAC;YACxC,MAAM,QAAQ,CAAC,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,cAAc,GAAG,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,WAAW,CAAC,GAAG,CAAC,CAAC;IACnB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,iBAAiB;AACjB,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mDAAmD,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,gBAAgB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,oBAAoB,EAAE,qBAAqB,CAAC;KACnD,MAAM,CAAC,YAAY,EAAE,4CAA4C,EAAE,GAAG,CAAC;KACvE,MAAM,CAAC,mBAAmB,EAAE,gDAAgD,EAAE,GAAG,CAAC;KAClF,MAAM,CAAC,oBAAoB,EAAE,uDAAuD,CAAC;KACrF,MAAM,CAAC,WAAW,EAAE,qBAAqB,CAAC;KAC1C,MAAM,CAAC,WAAW,EAAE,gBAAgB,CAAC;KACrC,QAAQ,CAAC,aAAa,EAAE,yBAAyB,EAAE,OAAO,CAAC,GAAG,EAAE,CAAC;KACjE,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,IAAsC,EAAE,EAAE;IACzE,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACzC,MAAM,MAAM,GAAG,aAAa,CAC1B;YACE,OAAO,EAAE,IAAI,CAAC,OAAiB;YAC/B,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,SAAS,EAAE,IAAI,CAAC,SAAmB;YACnC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAc,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC/D,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAqB,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACrF,EACD,OAAO,CAAC,GAAG,EAAE,EAAE,SAAS,CACzB,CAAC;QACF,MAAM,OAAO,GAAG,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAErD,MAAM,aAAa,CAAC,SAAS,EA
AE,OAAO,EAAE,SAAS,EAAE;YACjD,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,QAAQ,EAAE,IAAI,CAAC,QAA8B;YAC7C,MAAM,EAAE,IAAI,CAAC,IAAI,KAAK,KAAK;SAC5B,CAAC,CAAC;QACH,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IAAC,CAAC;AACrC,CAAC,CAAC,CAAC;AAEL,uDAAuD;AACvD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,UAAU,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAE5D,SAAS,WAAW,CAAC,GAAY;IAC/B,IAAI,GAAG,YAAY,aAAa,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;IAClC,CAAC;IACD,IAAI,GAAG,YAAY,KAAK,EAAE,CAAC;QACzB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAC5C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
@@ -25,7 +25,7 @@ export async function getClient() {
25
25
  if (!CopilotClient) {
26
26
  throw new Error('Could not find CopilotClient export in @github/copilot-sdk. The package may have changed its API.');
27
27
  }
28
- clientInstance = new CopilotClient();
28
+ clientInstance = new CopilotClient({ logLevel: 'none' });
29
29
  await clientInstance.start();
30
30
  clientStarted = true;
31
31
  return clientInstance;
@@ -1 +1 @@
1
- {"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,cAAc,GAAG,IAAI,aAAa,EAAE,CAAC;IACrC,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;IAC7B,aAAa,GAAG,IAAI,CAAC;IACrB,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,IAAI,cAAc,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAC5B,aAAa,GAAG,KAAK,CAAC;QACtB,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,iEAAiE;IACjE,mEAAmE;IACnE,IAAI,GAAG,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IACxB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;QAC3F,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,MAAM,KAAK,GAAG;YAAE,MAAM;QAC1B,GAAG,GAAG,MAAM,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
1
+ {"version":3,"file":"copilot-sdk-client.js","sourceRoot":"","sources":["../../../src/adapters/copilot-sdk-client.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,iEAAiE;AACjE,4DAA4D;AAC5D,IAAI,cAAc,GAAQ,IAAI,CAAC;AAC/B,IAAI,aAAa,GAAG,KAAK,CAAC;AAE1B,MAAM,CAAC,KAAK,UAAU,SAAS;IAC7B,IAAI,cAAc,IAAI,aAAa;QAAE,OAAO,cAAc,CAAC;IAE3D,IAAI,GAAQ,CAAC;IACb,IAAI,CAAC;QACH,+DAA+D;QAC/D,GAAG,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,MAAM,aAAa,GAAG,GAAG,CAAC,aAAa,IAAI,GAAG,CAAC,OAAO,EAAE,aAAa,CAAC;IACtE,IAAI,CAAC,aAAa,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CACb,mGAAmG,CACpG,CAAC;IACJ,CAAC;IAED,cAAc,GAAG,IAAI,aAAa,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAC;IACzD,MAAM,cAAc,CAAC,KAAK,EAAE,CAAC;IAC7B,aAAa,GAAG,IAAI,CAAC;IACrB,OAAO,cAAc,CAAC;AACxB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,UAAU;IAC9B,IAAI,cAAc,IAAI,aAAa,EAAE,CAAC;QACpC,MAAM,cAAc,CAAC,IAAI,EAAE,CAAC;QAC5B,aAAa,GAAG,KAAK,CAAC;QACtB,cAAc,GAAG,IAAI,CAAC;IACxB,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc;IAC5B,iEAAiE;IACjE,mEAAmE;IACnE,IAAI,GAAG,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IACxB,OAAO,IAAI,EAAE,CAAC;QACZ,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,cAAc,EAAE,SAAS,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;QAC3F,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjC,IAAI,MAAM,KAAK,GAAG;YAAE,MAAM;QAC1B,GAAG,GAAG,MAAM,CAAC;IACf,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
@@ -0,0 +1,11 @@
1
+ import type { Harness, HarnessRunResult } from '../../types.js';
2
+ export declare class CopilotSDKHarness implements Harness {
3
+ readonly name = "copilot-sdk";
4
+ run(options: {
5
+ skillPath?: string;
6
+ prompt: string;
7
+ files?: string[];
8
+ outputDir: string;
9
+ }): Promise<HarnessRunResult>;
10
+ isAvailable(): Promise<boolean>;
11
+ }
@@ -0,0 +1,101 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import { getClient, isSDKInstalled } from '../copilot-sdk-client.js';
4
+ export class CopilotSDKHarness {
5
+ name = 'copilot-sdk';
6
+ async run(options) {
7
+ const startMs = Date.now();
8
+ const client = await getClient();
9
+ fs.mkdirSync(options.outputDir, { recursive: true });
10
+ // Dynamically import SDK for approveAll
11
+ // @ts-ignore — module may not be installed (optional dep)
12
+ const { approveAll } = await import('@github/copilot-sdk');
13
+ // Build session config
14
+ const sessionConfig = {
15
+ model: 'gpt-4.1',
16
+ onPermissionRequest: approveAll,
17
+ workingDirectory: options.outputDir,
18
+ infiniteSessions: { enabled: false },
19
+ };
20
+ // Native skill loading: point skillDirectories at the skill's parent
21
+ if (options.skillPath) {
22
+ sessionConfig.skillDirectories = [options.skillPath];
23
+ }
24
+ const session = await client.createSession(sessionConfig);
25
+ try {
26
+ // Attach input files if provided
27
+ const attachments = [];
28
+ if (options.files) {
29
+ for (const file of options.files) {
30
+ // Copy to outputDir for script assertions, and attach for the model
31
+ const dest = path.join(options.outputDir, path.basename(file));
32
+ fs.copyFileSync(file, dest);
33
+ attachments.push({ type: 'file', path: dest, displayName: path.basename(file) });
34
+ }
35
+ }
36
+ const response = await session.sendAndWait({
37
+ prompt: options.prompt,
38
+ ...(attachments.length > 0 ? { attachments } : {}),
39
+ }, 300_000);
40
+ const raw = response?.data?.content ?? '';
41
+ // Collect full transcript from session events
42
+ const events = await session.getMessages();
43
+ const transcript = buildTranscript(events);
44
+ // Extract token count from events if available
45
+ const totalTokens = extractTokenCount(events);
46
+ const durationMs = Date.now() - startMs;
47
+ return {
48
+ raw: raw.trim(),
49
+ transcript,
50
+ files: [],
51
+ total_tokens: totalTokens,
52
+ duration_ms: durationMs,
53
+ };
54
+ }
55
+ finally {
56
+ await session.disconnect();
57
+ }
58
+ }
59
+ async isAvailable() {
60
+ return isSDKInstalled();
61
+ }
62
+ }
63
+ function buildTranscript(events) {
64
+ const lines = [];
65
+ for (const event of events) {
66
+ switch (event.type) {
67
+ case 'user.message':
68
+ lines.push(`[user] ${event.data?.content ?? ''}`);
69
+ break;
70
+ case 'assistant.message':
71
+ lines.push(`[assistant] ${event.data?.content ?? ''}`);
72
+ break;
73
+ case 'tool.execution_start':
74
+ lines.push(`[tool:start] ${event.data?.toolName ?? 'unknown'}(${JSON.stringify(event.data?.arguments ?? {})})`);
75
+ break;
76
+ case 'tool.execution_complete':
77
+ lines.push(`[tool:done] ${event.data?.toolName ?? 'unknown'} → ${truncate(event.data?.result ?? '', 200)}`);
78
+ break;
79
+ case 'skill.invoked':
80
+ lines.push(`[skill] ${event.data?.name ?? 'unknown'} (${event.data?.path ?? ''})`);
81
+ break;
82
+ case 'session.error':
83
+ lines.push(`[error] ${event.data?.message ?? ''}`);
84
+ break;
85
+ }
86
+ }
87
+ return lines.join('\n');
88
+ }
89
+ function extractTokenCount(events) {
90
+ let total = 0;
91
+ for (const event of events) {
92
+ if (event.type === 'assistant.usage') {
93
+ total += (event.data?.inputTokens ?? 0) + (event.data?.outputTokens ?? 0);
94
+ }
95
+ }
96
+ return total;
97
+ }
98
+ function truncate(str, max) {
99
+ return str.length > max ? str.slice(0, max) + '...' : str;
100
+ }
101
+ //# sourceMappingURL=copilot-sdk.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"copilot-sdk.js","sourceRoot":"","sources":["../../../../src/adapters/harness/copilot-sdk.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAErE,MAAM,OAAO,iBAAiB;IACnB,IAAI,GAAG,aAAa,CAAC;IAE9B,KAAK,CAAC,GAAG,CAAC,OAKT;QACC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,MAAM,SAAS,EAAE,CAAC;QAEjC,EAAE,CAAC,SAAS,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAErD,wCAAwC;QACxC,0DAA0D;QAC1D,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;QAE3D,uBAAuB;QACvB,MAAM,aAAa,GAA4B;YAC7C,KAAK,EAAE,SAAS;YAChB,mBAAmB,EAAE,UAAU;YAC/B,gBAAgB,EAAE,OAAO,CAAC,SAAS;YACnC,gBAAgB,EAAE,EAAE,OAAO,EAAE,KAAK,EAAE;SACrC,CAAC;QAEF,qEAAqE;QACrE,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;YACtB,aAAa,CAAC,gBAAgB,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QACvD,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,aAAa,CAAC,CAAC;QAE1D,IAAI,CAAC;YACH,iCAAiC;YACjC,MAAM,WAAW,GAAgE,EAAE,CAAC;YACpF,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;gBAClB,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;oBACjC,oEAAoE;oBACpE,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;oBAC/D,EAAE,CAAC,YAAY,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;oBAC5B,WAAW,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACnF,CAAC;YACH,CAAC;YAED,MAAM,QAAQ,GAAG,MAAM,OAAO,CAAC,WAAW,CACxC;gBACE,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aACnD,EACD,OAAO,CACR,CAAC;YAEF,MAAM,GAAG,GAAG,QAAQ,EAAE,IAAI,EAAE,OAAO,IAAI,EAAE,CAAC;YAE1C,8CAA8C;YAC9C,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,WAAW,EAAE,CAAC;YAC3C,MAAM,UAAU,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC;YAE3C,+CAA+C;YAC/C,MAAM,WAAW,GAAG,iBAAiB,CAAC,MAAM,CAAC,CAAC;YAE9C,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,OAAO,CAAC;YAExC,OAAO;gBACL,GAAG,EAAE,GAAG,CAAC,IAAI,EAAE;gBACf,UAAU;gBACV,KAAK,EAAE,EAAE;gBACT,YAAY,EAAE,WAAW;gBACz
B,WAAW,EAAE,UAAU;aACxB,CAAC;QACJ,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;QAC7B,CAAC;IACH,CAAC;IAED,KAAK,CAAC,WAAW;QACf,OAAO,cAAc,EAAE,CAAC;IAC1B,CAAC;CACF;AAED,SAAS,eAAe,CAAC,MAAa;IACpC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,cAAc;gBACjB,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,CAAC,IAAI,EAAE,OAAO,IAAI,EAAE,EAAE,CAAC,CAAC;gBAClD,MAAM;YACR,KAAK,mBAAmB;gBACtB,KAAK,CAAC,IAAI,CAAC,eAAe,KAAK,CAAC,IAAI,EAAE,OAAO,IAAI,EAAE,EAAE,CAAC,CAAC;gBACvD,MAAM;YACR,KAAK,sBAAsB;gBACzB,KAAK,CAAC,IAAI,CAAC,gBAAgB,KAAK,CAAC,IAAI,EAAE,QAAQ,IAAI,SAAS,IAAI,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,EAAE,SAAS,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC;gBAChH,MAAM;YACR,KAAK,yBAAyB;gBAC5B,KAAK,CAAC,IAAI,CAAC,eAAe,KAAK,CAAC,IAAI,EAAE,QAAQ,IAAI,SAAS,MAAM,QAAQ,CAAC,KAAK,CAAC,IAAI,EAAE,MAAM,IAAI,EAAE,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;gBAC5G,MAAM;YACR,KAAK,eAAe;gBAClB,KAAK,CAAC,IAAI,CAAC,WAAW,KAAK,CAAC,IAAI,EAAE,IAAI,IAAI,SAAS,KAAK,KAAK,CAAC,IAAI,EAAE,IAAI,IAAI,EAAE,GAAG,CAAC,CAAC;gBACnF,MAAM;YACR,KAAK,eAAe;gBAClB,KAAK,CAAC,IAAI,CAAC,WAAW,KAAK,CAAC,IAAI,EAAE,OAAO,IAAI,EAAE,EAAE,CAAC,CAAC;gBACnD,MAAM;QACV,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,iBAAiB,CAAC,MAAa;IACtC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,KAAK,CAAC,IAAI,KAAK,iBAAiB,EAAE,CAAC;YACrC,KAAK,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,WAAW,IAAI,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,YAAY,IAAI,CAAC,CAAC,CAAC;QAC5E,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,QAAQ,CAAC,GAAW,EAAE,GAAW;IACxC,OAAO,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC;AAC5D,CAAC"}
@@ -1,9 +1,17 @@
1
1
  import { CopilotCLIHarness } from './copilot-cli.js';
2
- import { SnapevalError } from '../../errors.js';
2
+ import { CopilotSDKHarness } from './copilot-sdk.js';
3
+ import { AdapterNotAvailableError, SnapevalError } from '../../errors.js';
4
+ import { isSDKInstalled } from '../copilot-sdk-client.js';
3
5
  export function resolveHarness(name) {
6
+ if (name === 'copilot-sdk') {
7
+ if (!isSDKInstalled()) {
8
+ throw new AdapterNotAvailableError('copilot-sdk', '@github/copilot-sdk is not installed. Install with: npm install @github/copilot-sdk');
9
+ }
10
+ return new CopilotSDKHarness();
11
+ }
4
12
  if (name === 'copilot-cli') {
5
13
  return new CopilotCLIHarness();
6
14
  }
7
- throw new SnapevalError(`Unknown harness "${name}". Built-in options: copilot-cli.`);
15
+ throw new SnapevalError(`Unknown harness "${name}". Built-in options: copilot-sdk, copilot-cli.`);
8
16
  }
9
17
  //# sourceMappingURL=resolve.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"resolve.js","sourceRoot":"","sources":["../../../../src/adapters/harness/resolve.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAEhD,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,IAAI,KAAK,aAAa,EAAE,CAAC;QAC3B,OAAO,IAAI,iBAAiB,EAAE,CAAC;IACjC,CAAC;IACD,MAAM,IAAI,aAAa,CAAC,oBAAoB,IAAI,mCAAmC,CAAC,CAAC;AACvF,CAAC"}
1
+ {"version":3,"file":"resolve.js","sourceRoot":"","sources":["../../../../src/adapters/harness/resolve.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,wBAAwB,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC1E,OAAO,EAAE,cAAc,EAAE,MAAM,0BAA0B,CAAC;AAE1D,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,IAAI,KAAK,aAAa,EAAE,CAAC;QAC3B,IAAI,CAAC,cAAc,EAAE,EAAE,CAAC;YACtB,MAAM,IAAI,wBAAwB,CAChC,aAAa,EACb,qFAAqF,CACtF,CAAC;QACJ,CAAC;QACD,OAAO,IAAI,iBAAiB,EAAE,CAAC;IACjC,CAAC;IACD,IAAI,IAAI,KAAK,aAAa,EAAE,CAAC;QAC3B,OAAO,IAAI,iBAAiB,EAAE,CAAC;IACjC,CAAC;IACD,MAAM,IAAI,aAAa,CAAC,oBAAoB,IAAI,gDAAgD,CAAC,CAAC;AACpG,CAAC"}
@@ -3,6 +3,8 @@ export class CopilotSDKInference {
3
3
  name = 'copilot-sdk';
4
4
  async chat(messages, _options) {
5
5
  const client = await getClient();
6
+ // @ts-ignore — module may not be installed (optional dep)
7
+ const { approveAll } = await import('@github/copilot-sdk');
6
8
  const systemMessages = messages.filter((m) => m.role === 'system');
7
9
  const nonSystemMessages = messages.filter((m) => m.role !== 'system');
8
10
  const systemContent = systemMessages.map((m) => m.content).join('\n');
@@ -12,7 +14,8 @@ export class CopilotSDKInference {
12
14
  ...(systemContent
13
15
  ? { systemMessage: { content: systemContent } }
14
16
  : {}),
15
- onPermissionRequest: async () => ({ kind: 'approved' }),
17
+ onPermissionRequest: approveAll,
18
+ infiniteSessions: { enabled: false },
16
19
  });
17
20
  try {
18
21
  const response = await session.sendAndWait({ prompt: userPrompt });
@@ -1 +1 @@
1
- {"version":3,"file":"copilot-sdk.js","sourceRoot":"","sources":["../../../../src/adapters/inference/copilot-sdk.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AAErD,MAAM,OAAO,mBAAmB;IACrB,IAAI,GAAG,aAAa,CAAC;IAE9B,KAAK,CAAC,IAAI,CAAC,QAAmB,EAAE,QAAsB;QACpD,MAAM,MAAM,GAAG,MAAM,SAAS,EAAE,CAAC;QAEjC,MAAM,cAAc,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;QACnE,MAAM,iBAAiB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;QACtE,MAAM,aAAa,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtE,MAAM,UAAU,GAAG,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEtE,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC;YACzC,KAAK,EAAE,SAAS;YAChB,GAAG,CAAC,aAAa;gBACf,CAAC,CAAC,EAAE,aAAa,EAAE,EAAE,OAAO,EAAE,aAAa,EAAE,EAAE;gBAC/C,CAAC,CAAC,EAAE,CAAC;YACP,mBAAmB,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;SACxD,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,OAAO,CAAC,WAAW,CAAC,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;YACnE,OAAO,CAAC,QAAQ,EAAE,IAAI,EAAE,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QAChD,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;QAC7B,CAAC;IACH,CAAC;CACF"}
1
+ {"version":3,"file":"copilot-sdk.js","sourceRoot":"","sources":["../../../../src/adapters/inference/copilot-sdk.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AAErD,MAAM,OAAO,mBAAmB;IACrB,IAAI,GAAG,aAAa,CAAC;IAE9B,KAAK,CAAC,IAAI,CAAC,QAAmB,EAAE,QAAsB;QACpD,MAAM,MAAM,GAAG,MAAM,SAAS,EAAE,CAAC;QAEjC,0DAA0D;QAC1D,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,qBAAqB,CAAC,CAAC;QAE3D,MAAM,cAAc,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;QACnE,MAAM,iBAAiB,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC;QACtE,MAAM,aAAa,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACtE,MAAM,UAAU,GAAG,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEtE,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC;YACzC,KAAK,EAAE,SAAS;YAChB,GAAG,CAAC,aAAa;gBACf,CAAC,CAAC,EAAE,aAAa,EAAE,EAAE,OAAO,EAAE,aAAa,EAAE,EAAE;gBAC/C,CAAC,CAAC,EAAE,CAAC;YACP,mBAAmB,EAAE,UAAU;YAC/B,gBAAgB,EAAE,EAAE,OAAO,EAAE,KAAK,EAAE;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,OAAO,CAAC,WAAW,CAAC,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,CAAC;YACnE,OAAO,CAAC,QAAQ,EAAE,IAAI,EAAE,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QAChD,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC;QAC7B,CAAC;IACH,CAAC;CACF"}
@@ -1,25 +1,105 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
1
3
  import chalk from 'chalk';
4
+ function loadPreviousIteration(iterationDir) {
5
+ const workspaceDir = path.dirname(iterationDir);
6
+ const currentName = path.basename(iterationDir);
7
+ const currentNum = parseInt(currentName.replace('iteration-', ''), 10);
8
+ if (isNaN(currentNum) || currentNum <= 1)
9
+ return null;
10
+ const prevDir = path.join(workspaceDir, `iteration-${currentNum - 1}`);
11
+ const prevBenchmarkPath = path.join(prevDir, 'benchmark.json');
12
+ if (!fs.existsSync(prevBenchmarkPath))
13
+ return null;
14
+ try {
15
+ const benchmark = JSON.parse(fs.readFileSync(prevBenchmarkPath, 'utf-8'));
16
+ const gradings = new Map();
17
+ const evalDirs = fs.readdirSync(prevDir).filter(d => d.startsWith('eval-'));
18
+ for (const evalDir of evalDirs) {
19
+ const wsPath = path.join(prevDir, evalDir, 'with_skill', 'grading.json');
20
+ const wosPath = path.join(prevDir, evalDir, 'without_skill', 'grading.json');
21
+ const ws = fs.existsSync(wsPath) ? JSON.parse(fs.readFileSync(wsPath, 'utf-8')) : undefined;
22
+ const wos = fs.existsSync(wosPath) ? JSON.parse(fs.readFileSync(wosPath, 'utf-8')) : undefined;
23
+ gradings.set(evalDir, { withSkill: ws, withoutSkill: wos });
24
+ }
25
+ return { benchmark, gradings };
26
+ }
27
+ catch {
28
+ return null;
29
+ }
30
+ }
31
+ function evalLabel(run) {
32
+ // Use expected_output or slug as a readable label instead of truncated prompt
33
+ if (run.slug && run.slug !== `${run.evalId}`)
34
+ return run.slug;
35
+ // Truncate prompt but show first meaningful line
36
+ const firstLine = run.prompt.split('\n')[0].slice(0, 60);
37
+ return firstLine;
38
+ }
2
39
  export class TerminalReporter {
3
40
  name = 'terminal';
4
41
  async report(results) {
5
42
  const { skillName, evalRuns, benchmark } = results;
6
43
  console.log(chalk.bold(`\nsnapeval — ${skillName}`));
7
- console.log(chalk.dim('─'.repeat(50)));
44
+ console.log(chalk.dim(`Baseline = without SKILL.md (raw AI response)`));
45
+ console.log(chalk.dim('─'.repeat(60)));
46
+ const prev = loadPreviousIteration(results.iterationDir);
8
47
  for (const run of evalRuns) {
9
- const wsRate = run.withSkill.grading?.summary.pass_rate;
48
+ const wsGrading = run.withSkill.grading;
49
+ const wsRate = wsGrading?.summary.pass_rate;
10
50
  const wosRate = run.withoutSkill.grading?.summary.pass_rate;
11
51
  const wsLabel = wsRate !== undefined ? `${(wsRate * 100).toFixed(0)}%` : 'n/a';
12
52
  const wosLabel = wosRate !== undefined ? `${(wosRate * 100).toFixed(0)}%` : 'n/a';
13
- const tokens = run.withSkill.output.total_tokens;
14
- const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(2);
15
- console.log(` ${chalk.cyan(`#${run.evalId}`)} ${run.prompt.slice(0, 60)}`);
16
- console.log(` with_skill: ${wsLabel} | without_skill: ${wosLabel} | ${tokens} tokens, ${durationS}s`);
53
+ const wsColor = wsRate === 1 ? chalk.green : wsRate === 0 ? chalk.red : chalk.yellow;
54
+ const durationS = (run.withSkill.output.duration_ms / 1000).toFixed(1);
55
+ // Show per-eval delta from previous iteration
56
+ let perEvalDelta = '';
57
+ if (prev) {
58
+ const prevGrading = prev.gradings.get(`eval-${run.slug}`);
59
+ const prevRate = prevGrading?.withSkill?.summary.pass_rate;
60
+ if (prevRate !== undefined && wsRate !== undefined) {
61
+ const change = wsRate - prevRate;
62
+ if (change !== 0) {
63
+ const arrow = change > 0 ? chalk.green('↑') : chalk.red('↓');
64
+ perEvalDelta = ` ${arrow} was ${(prevRate * 100).toFixed(0)}%`;
65
+ }
66
+ }
67
+ }
68
+ console.log(` ${chalk.cyan(`#${run.evalId}`)} ${evalLabel(run)}`);
69
+ console.log(` Skill: ${wsColor(wsLabel)}${perEvalDelta} | Baseline: ${wosLabel} | ${durationS}s`);
70
+ // Show failed assertions inline
71
+ if (wsGrading) {
72
+ const failed = wsGrading.assertion_results.filter((a) => !a.passed);
73
+ for (const f of failed) {
74
+ console.log(chalk.red(` FAIL: ${f.text}`));
75
+ if (f.evidence) {
76
+ console.log(chalk.dim(` ${f.evidence.slice(0, 100)}`));
77
+ }
78
+ }
79
+ }
17
80
  }
18
- console.log(chalk.dim('─'.repeat(50)));
81
+ console.log(chalk.dim('─'.repeat(60)));
82
+ const ws = benchmark.run_summary.with_skill;
83
+ const wos = benchmark.run_summary.without_skill;
19
84
  const delta = benchmark.run_summary.delta;
20
85
  const deltaColor = delta.pass_rate > 0 ? chalk.green : delta.pass_rate < 0 ? chalk.red : chalk.dim;
21
- console.log(`Delta: ${deltaColor(`${(delta.pass_rate * 100).toFixed(1)}% pass rate`)} | ${delta.time_seconds.toFixed(1)}s time | ${delta.tokens.toFixed(0)} tokens`);
22
- console.log(chalk.dim(`with_skill avg: ${(benchmark.run_summary.with_skill.pass_rate.mean * 100).toFixed(1)}% | without_skill avg: ${(benchmark.run_summary.without_skill.pass_rate.mean * 100).toFixed(1)}%`));
86
+ console.log(chalk.bold('Summary:'));
87
+ console.log(` Skill pass rate: ${(ws.pass_rate.mean * 100).toFixed(1)}%`);
88
+ console.log(` Baseline pass rate: ${(wos.pass_rate.mean * 100).toFixed(1)}%`);
89
+ console.log(` Improvement: ${deltaColor(`${delta.pass_rate > 0 ? '+' : ''}${(delta.pass_rate * 100).toFixed(1)}%`)}`);
90
+ if (prev) {
91
+ const prevRate = prev.benchmark.run_summary.with_skill.pass_rate.mean;
92
+ const currRate = ws.pass_rate.mean;
93
+ const change = currRate - prevRate;
94
+ const changeColor = change > 0 ? chalk.green : change < 0 ? chalk.red : chalk.dim;
95
+ console.log(` vs previous: ${changeColor(`${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`)} (was ${(prevRate * 100).toFixed(1)}%)`);
96
+ // Note if eval set size changed
97
+ const prevEvalCount = prev.gradings.size;
98
+ const currEvalCount = evalRuns.length;
99
+ if (prevEvalCount !== currEvalCount) {
100
+ console.log(chalk.dim(` Note: eval set changed (${prevEvalCount} → ${currEvalCount} evals)`));
101
+ }
102
+ }
23
103
  }
24
104
  }
25
105
  //# sourceMappingURL=terminal.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"terminal.js","sourceRoot":"","sources":["../../../../src/adapters/report/terminal.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAG1B,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,UAAU,CAAC;IAE3B,KAAK,CAAC,MAAM,CAAC,OAAoB;QAC/B,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC;QAEnD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,SAAS,EAAE,CAAC,CAAC,CAAC;QACrD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,MAAM,GAAG,GAAG,CAAC,SAAS,CAAC,OAAO,EAAE,OAAO,CAAC,SAAS,CAAC;YACxD,MAAM,OAAO,GAAG,GAAG,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5D,MAAM,OAAO,GAAG,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAC/E,MAAM,QAAQ,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAClF,MAAM,MAAM,GAAG,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,YAAY,CAAC;YACjD,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YACvE,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;YAC5E,OAAO,CAAC,GAAG,CAAC,mBAAmB,OAAO,qBAAqB,QAAQ,MAAM,MAAM,YAAY,SAAS,GAAG,CAAC,CAAC;QAC3G,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1C,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QACnG,OAAO,CAAC,GAAG,CAAC,UAAU,UAAU,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,aAAa,CAAC,MAAM,KAAK,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACrK,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,mBAAmB,CAAC,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,C
AAC,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,CAAC,SAAS,CAAC,WAAW,CAAC,aAAa,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAClN,CAAC;CACF"}
1
+ {"version":3,"file":"terminal.js","sourceRoot":"","sources":["../../../../src/adapters/report/terminal.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,MAAM,OAAO,CAAC;AAQ1B,SAAS,qBAAqB,CAAC,YAAoB;IACjD,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;IAChD,MAAM,UAAU,GAAG,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAC,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;IACvE,IAAI,KAAK,CAAC,UAAU,CAAC,IAAI,UAAU,IAAI,CAAC;QAAE,OAAO,IAAI,CAAC;IACtD,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,aAAa,UAAU,GAAG,CAAC,EAAE,CAAC,CAAC;IACvE,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC/D,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,iBAAiB,CAAC;QAAE,OAAO,IAAI,CAAC;IACnD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC,CAAC;QAC1E,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAuE,CAAC;QAChG,MAAM,QAAQ,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC;QAC5E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,cAAc,CAAC,CAAC;YACzE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,eAAe,EAAE,cAAc,CAAC,CAAC;YAC7E,MAAM,EAAE,GAAG,EAAE,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5F,MAAM,GAAG,GAAG,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC/F,QAAQ,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,GAAG,EAAE,CAAC,CAAC;QAC9D,CAAC;QACD,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAC;IACjC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,GAAqD;IACtE,8EAA8E;IAC9E,IAAI,GAAG,CAAC,IAAI,IAAI,GAAG,CAAC,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,EAAE;QAAE,OAAO,GAAG,CAAC,IAAI,CAAC;IAC9D,iDAAiD;IACjD,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,
CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACzD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,UAAU,CAAC;IAE3B,KAAK,CAAC,MAAM,CAAC,OAAoB;QAC/B,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,OAAO,CAAC;QAEnD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,SAAS,EAAE,CAAC,CAAC,CAAC;QACrD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC,CAAC;QACxE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,IAAI,GAAG,qBAAqB,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QAEzD,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC;YACxC,MAAM,MAAM,GAAG,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5C,MAAM,OAAO,GAAG,GAAG,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,SAAS,CAAC;YAC5D,MAAM,OAAO,GAAG,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAC/E,MAAM,QAAQ,GAAG,OAAO,KAAK,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;YAClF,MAAM,OAAO,GAAG,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC;YACrF,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAEvE,8CAA8C;YAC9C,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAC1D,MAAM,QAAQ,GAAG,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,SAAS,CAAC;gBAC3D,IAAI,QAAQ,KAAK,SAAS,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;oBACnD,MAAM,MAAM,GAAG,MAAM,GAAG,QAAQ,CAAC;oBACjC,IAAI,MAAM,KAAK,CAAC,EAAE,CAAC;wBACjB,MAAM,KAAK,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;wBAC7D,YAAY,GAAG,IAAI,KAAK,QAAQ,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;oBACjE,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,MAAM,EAAE,CAAC,IAAI,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC
nE,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,OAAO,CAAC,GAAG,YAAY,gBAAgB,QAAQ,MAAM,SAAS,GAAG,CAAC,CAAC;YAErG,gCAAgC;YAChC,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,MAAM,GAAG,SAAS,CAAC,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;gBACpE,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;oBACvB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;oBAC9C,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;wBACf,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;oBAClE,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,MAAM,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC;QAC5C,MAAM,GAAG,GAAG,SAAS,CAAC,WAAW,CAAC,aAAa,CAAC;QAChD,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW,CAAC,KAAK,CAAC;QAC1C,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;QAEnG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC;QACpC,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC9E,OAAO,CAAC,GAAG,CAAC,yBAAyB,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC/E,OAAO,CAAC,GAAG,CAAC,yBAAyB,UAAU,CAAC,GAAG,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QAE9H,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC;YACtE,MAAM,QAAQ,GAAG,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC;YACnC,MAAM,MAAM,GAAG,QAAQ,GAAG,QAAQ,CAAC;YACnC,MAAM,WAAW,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC;YAClF,OAAO,CAAC,GAAG,CAAC,yBAAyB,WAAW,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OA
AO,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAEnJ,gCAAgC;YAChC,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;YACzC,MAAM,aAAa,GAAG,QAAQ,CAAC,MAAM,CAAC;YACtC,IAAI,aAAa,KAAK,aAAa,EAAE,CAAC;gBACpC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,aAAa,MAAM,aAAa,SAAS,CAAC,CAAC,CAAC;YACjG,CAAC;QACH,CAAC;IACH,CAAC;CACF"}
@@ -3,4 +3,7 @@ export declare function evalCommand(skillPath: string, harness: Harness, inferen
3
3
  workspace?: string;
4
4
  runs?: number;
5
5
  oldSkill?: string;
6
+ concurrency?: number;
7
+ only?: number[];
8
+ threshold?: number;
6
9
  }): Promise<EvalResults>;