sunpeak 0.19.10 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +3 -3
  2. package/bin/commands/inspect.mjs +321 -6
  3. package/bin/commands/new.mjs +85 -7
  4. package/bin/commands/test-init.mjs +190 -59
  5. package/bin/commands/test.mjs +60 -10
  6. package/bin/lib/eval/eval-providers.mjs +34 -0
  7. package/bin/lib/eval/eval-runner.mjs +54 -2
  8. package/bin/lib/eval/eval-vitest-plugin.mjs +3 -14
  9. package/bin/lib/eval/model-registry.mjs +2 -2
  10. package/bin/lib/inspect/inspect-config.mjs +16 -1
  11. package/bin/lib/inspect/inspect-server.d.mts +32 -0
  12. package/bin/lib/inspect/inspect-server.mjs +11 -0
  13. package/bin/lib/resolve-bin.mjs +39 -0
  14. package/bin/lib/test/base-config.mjs +3 -2
  15. package/bin/lib/test/matchers.mjs +2 -2
  16. package/bin/lib/test/test-config.mjs +18 -7
  17. package/bin/lib/test/test-fixtures.d.mts +52 -92
  18. package/bin/lib/test/test-fixtures.mjs +174 -147
  19. package/dist/chatgpt/index.cjs +1 -1
  20. package/dist/chatgpt/index.js +1 -1
  21. package/dist/claude/index.cjs +1 -1
  22. package/dist/claude/index.js +1 -1
  23. package/dist/host/chatgpt/index.cjs +1 -1
  24. package/dist/host/chatgpt/index.js +1 -1
  25. package/dist/index.cjs +4 -4
  26. package/dist/index.cjs.map +1 -1
  27. package/dist/index.js +3 -3
  28. package/dist/index.js.map +1 -1
  29. package/dist/inspector/index.cjs +1 -1
  30. package/dist/inspector/index.js +1 -1
  31. package/dist/{inspector-D5DckQuU.js → inspector-BBDa5yCm.js} +57 -23
  32. package/dist/inspector-BBDa5yCm.js.map +1 -0
  33. package/dist/{inspector-jY9O18z9.cjs → inspector-DAA1Wiyh.cjs} +58 -24
  34. package/dist/inspector-DAA1Wiyh.cjs.map +1 -0
  35. package/dist/lib/discovery-cli.cjs +1 -1
  36. package/dist/mcp/index.cjs +22 -25
  37. package/dist/mcp/index.cjs.map +1 -1
  38. package/dist/mcp/index.js +19 -22
  39. package/dist/mcp/index.js.map +1 -1
  40. package/dist/{use-app-Bfargfa3.js → use-app-Cr0auUa1.js} +2 -2
  41. package/dist/{use-app-Bfargfa3.js.map → use-app-Cr0auUa1.js.map} +1 -1
  42. package/dist/{use-app-CbsBEmwv.cjs → use-app-DPkj5Jp_.cjs} +2 -2
  43. package/dist/{use-app-CbsBEmwv.cjs.map → use-app-DPkj5Jp_.cjs.map} +1 -1
  44. package/package.json +27 -15
  45. package/template/README.md +1 -1
  46. package/template/dist/albums/albums.html +4 -4
  47. package/template/dist/albums/albums.json +1 -1
  48. package/template/dist/carousel/carousel.html +4 -4
  49. package/template/dist/carousel/carousel.json +1 -1
  50. package/template/dist/map/map.html +6 -6
  51. package/template/dist/map/map.json +1 -1
  52. package/template/dist/review/review.html +4 -4
  53. package/template/dist/review/review.json +1 -1
  54. package/template/node_modules/.bin/vite +2 -2
  55. package/template/node_modules/.bin/vitest +2 -2
  56. package/template/node_modules/.vite/deps/_metadata.json +4 -4
  57. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js +1 -1
  58. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js.map +1 -1
  59. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js +1 -1
  60. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js.map +1 -1
  61. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js +1 -1
  62. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js.map +1 -1
  63. package/template/node_modules/.vite-mcp/deps/@testing-library_react.js +4 -4
  64. package/template/node_modules/.vite-mcp/deps/@testing-library_react.js.map +1 -1
  65. package/template/node_modules/.vite-mcp/deps/_metadata.json +33 -33
  66. package/template/node_modules/.vite-mcp/deps/{client-CU1wWud4.js → client-B_5CX--u.js} +7 -7
  67. package/template/node_modules/.vite-mcp/deps/{client-CU1wWud4.js.map → client-B_5CX--u.js.map} +1 -1
  68. package/template/node_modules/.vite-mcp/deps/embla-carousel-react.js +1 -1
  69. package/template/node_modules/.vite-mcp/deps/embla-carousel-react.js.map +1 -1
  70. package/template/node_modules/.vite-mcp/deps/react-dom.js +3 -3
  71. package/template/node_modules/.vite-mcp/deps/react-dom.js.map +1 -1
  72. package/template/node_modules/.vite-mcp/deps/react-dom_client.js +1 -1
  73. package/template/node_modules/.vite-mcp/deps/react.js +3 -3
  74. package/template/node_modules/.vite-mcp/deps/react.js.map +1 -1
  75. package/template/node_modules/.vite-mcp/deps/react_jsx-dev-runtime.js +2 -2
  76. package/template/node_modules/.vite-mcp/deps/react_jsx-dev-runtime.js.map +1 -1
  77. package/template/node_modules/.vite-mcp/deps/react_jsx-runtime.js +2 -2
  78. package/template/node_modules/.vite-mcp/deps/react_jsx-runtime.js.map +1 -1
  79. package/template/node_modules/.vite-mcp/deps/vitest.js +1024 -622
  80. package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
  81. package/template/package.json +6 -6
  82. package/template/tests/e2e/albums.spec.ts +24 -52
  83. package/template/tests/e2e/carousel.spec.ts +36 -58
  84. package/template/tests/e2e/map.spec.ts +35 -56
  85. package/template/tests/e2e/review.spec.ts +56 -85
  86. package/template/tests/e2e/visual.spec.ts +14 -12
  87. package/template/tests/evals/albums.eval.ts +12 -9
  88. package/template/tests/evals/carousel.eval.ts +4 -14
  89. package/template/tests/evals/eval.config.ts +6 -6
  90. package/template/tests/evals/map.eval.ts +2 -9
  91. package/template/tests/evals/review.eval.ts +27 -22
  92. package/dist/inspector-D5DckQuU.js.map +0 -1
  93. package/dist/inspector-jY9O18z9.cjs.map +0 -1
  94. /package/template/tests/evals/{.env.example → _env.example} +0 -0
@@ -1,12 +1,38 @@
1
1
  import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
2
2
  import { execSync } from 'child_process';
3
3
  import { join, dirname } from 'path';
4
+ import { fileURLToPath } from 'url';
4
5
  import * as p from '@clack/prompts';
6
+ import { EVAL_PROVIDERS, generateModelLines } from '../lib/eval/eval-providers.mjs';
7
+ import { detectPackageManager } from '../utils.mjs';
8
+
9
/**
 * Resolve the version range used to pin `sunpeak` in scaffolded configs.
 *
 * Reads the `package.json` located two directories above this module and
 * returns its version prefixed with a caret. Any failure to read or parse the
 * file, or a manifest without a version, yields `'latest'` instead.
 *
 * @returns {string} A caret range such as `^1.2.3`, or `'latest'`.
 */
function getSunpeakVersion() {
  const fallback = 'latest';
  try {
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    const manifestPath = join(moduleDir, '..', '..', 'package.json');
    const { version } = JSON.parse(readFileSync(manifestPath, 'utf-8'));
    if (!version) {
      return fallback;
    }
    return `^${version}`;
  } catch {
    // Best effort: an unreadable or malformed manifest falls back to 'latest'.
    return fallback;
  }
}
5
20
 
6
21
/**
 * Prompt the user to pick which AI providers to configure for evals.
 *
 * Shows a multiselect built from EVAL_PROVIDERS with nothing preselected;
 * selecting nothing is allowed.
 *
 * @returns {Promise<Array>} The selected EVAL_PROVIDERS entries, or an empty
 *   array when the prompt was cancelled.
 */
async function defaultSelectProviders() {
  const selected = await p.multiselect({
    message: 'AI providers for evals (space to toggle, enter to skip)',
    options: EVAL_PROVIDERS.map((prov) => ({ value: prov, label: prov.label })),
    initialValues: [],
    required: false,
  });
  // A cancelled prompt is treated the same as "no providers selected".
  if (p.isCancel(selected)) return [];
  return selected;
}

/**
 * Default dependencies (real implementations).
 * Override in tests via the `deps` parameter.
 */
35
+
10
36
  export const defaultDeps = {
11
37
  existsSync,
12
38
  readFileSync,
@@ -21,6 +47,9 @@ export const defaultDeps = {
21
47
  select: p.select,
22
48
  text: p.text,
23
49
  log: p.log,
50
+ password: p.password,
51
+ selectProviders: defaultSelectProviders,
52
+ detectPackageManager,
24
53
  };
25
54
 
26
55
  /**
@@ -33,7 +62,7 @@ export const defaultDeps = {
33
62
  *
34
63
  * Scaffolds all 5 test types:
35
64
  * 1. E2E tests — Playwright-based inspector tests (mcp fixture)
36
- * 2. Visual regression — Screenshot comparison via mcp.screenshot()
65
+ * 2. Visual regression — Screenshot comparison via result.screenshot()
37
66
  * 3. Live tests — Test against real ChatGPT/Claude hosts
38
67
  * 4. Evals — Multi-model tool calling reliability tests
39
68
  * 5. Unit tests — Direct tool handler tests (JS/TS projects only)
@@ -60,6 +89,61 @@ export async function testInit(args = [], deps = defaultDeps) {
60
89
  await initExternalProject(cliServer, d);
61
90
  }
62
91
 
92
+ // Offer to configure eval providers
93
+ const providers = await d.selectProviders();
94
+ if (!d.isCancel(providers) && providers.length > 0) {
95
+ const pm = d.detectPackageManager();
96
+ const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
97
+ const installCmd = `${pm} add -D ${pkgsToInstall.join(' ')}`;
98
+ try {
99
+ d.execSync(installCmd, { cwd: d.cwd(), stdio: 'inherit' });
100
+ } catch {
101
+ d.log.info(`Provider install failed. Install manually: ${installCmd}`);
102
+ }
103
+
104
+ // Uncomment selected models in eval.config.ts
105
+ const evalDir = d.existsSync(join(d.cwd(), 'tests', 'evals'))
106
+ ? join(d.cwd(), 'tests', 'evals')
107
+ : d.existsSync(join(d.cwd(), 'tests', 'sunpeak', 'evals'))
108
+ ? join(d.cwd(), 'tests', 'sunpeak', 'evals')
109
+ : null;
110
+ if (evalDir) {
111
+ const configPath = join(evalDir, 'eval.config.ts');
112
+ if (d.existsSync(configPath)) {
113
+ let config = d.readFileSync(configPath, 'utf-8');
114
+ for (const prov of providers) {
115
+ for (const model of prov.models) {
116
+ config = config.replace(
117
+ new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
118
+ '$1$2'
119
+ );
120
+ }
121
+ }
122
+ d.writeFileSync(configPath, config);
123
+ }
124
+
125
+ // Prompt for API keys and write .env
126
+ const envLines = [];
127
+ const seen = new Set();
128
+ for (const prov of providers) {
129
+ if (seen.has(prov.envVar)) continue;
130
+ seen.add(prov.envVar);
131
+ const key = await d.password({
132
+ message: `${prov.envVar} (enter to skip)`,
133
+ mask: '*',
134
+ });
135
+ if (!d.isCancel(key) && key) {
136
+ envLines.push(`${prov.envVar}=${key}`);
137
+ }
138
+ }
139
+ if (envLines.length > 0 && evalDir) {
140
+ const relEnvPath = evalDir.startsWith(d.cwd()) ? evalDir.slice(d.cwd().length + 1) : evalDir;
141
+ d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
142
+ d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
143
+ }
144
+ }
145
+ }
146
+
63
147
  // Offer to install the testing skill
64
148
  const installSkill = await d.confirm({
65
149
  message: 'Install the test-mcp-server skill? (helps your coding agent write tests)',
@@ -67,12 +151,12 @@ export async function testInit(args = [], deps = defaultDeps) {
67
151
  });
68
152
  if (!d.isCancel(installSkill) && installSkill) {
69
153
  try {
70
- d.execSync('npx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
154
+ d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
71
155
  cwd: d.cwd(),
72
156
  stdio: 'inherit',
73
157
  });
74
158
  } catch {
75
- d.log.info('Skill install skipped. Install later: npx skills add Sunpeak-AI/sunpeak@test-mcp-server');
159
+ d.log.info('Skill install skipped. Install later: pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server');
76
160
  }
77
161
  }
78
162
 
@@ -141,11 +225,32 @@ async function getServerConfig(cliServer, d) {
141
225
 
142
226
  function generateServerConfigBlock(server, relativeTo = '.') {
143
227
  if (server.type === 'later') {
144
- return ` // TODO: Configure your MCP server connection
228
+ return ` // TODO: Configure your MCP server connection before running tests.
229
+ // Uncomment one of the options below:
230
+ //
231
+ // HTTP server (Python FastAPI, Go, etc.):
232
+ // server: { url: 'http://localhost:8000/mcp' },
233
+ //
234
+ // Python (uv):
235
+ // server: { command: 'uv', args: ['run', 'python', 'server.py'] },
236
+ //
237
+ // Python (venv):
238
+ // server: { command: '.venv/bin/python', args: ['server.py'] },
239
+ //
240
+ // Go:
241
+ // server: { command: 'go', args: ['run', './cmd/server'] },
242
+ //
243
+ // Node.js:
244
+ // server: { command: 'node', args: ['server.js'] },
245
+ //
246
+ // Optional server options:
145
247
  // server: {
146
- // command: 'python',
147
- // args: ['server.py'],
148
- // },`;
248
+ // command: 'python', args: ['server.py'],
249
+ // env: { API_KEY: 'test-key' }, // Extra environment variables
250
+ // cwd: './backend', // Working directory
251
+ // },
252
+ //
253
+ // timeout: 120_000, // Server startup timeout in ms (default: 60s)`;
149
254
  }
150
255
  if (server.type === 'url') {
151
256
  return ` server: {
@@ -205,15 +310,11 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
205
310
  "",
206
311
  " models: [",
207
312
  " // Uncomment models and install their provider packages:",
208
- " // 'gpt-4o', // OPENAI_API_KEY",
209
- " // 'gpt-4o-mini', // OPENAI_API_KEY",
210
- " // 'o4-mini', // OPENAI_API_KEY",
211
- " // 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY",
212
- " // 'gemini-2.0-flash', // GOOGLE_GENERATIVE_AI_API_KEY",
313
+ ...generateModelLines(),
213
314
  " ],",
214
315
  "",
215
316
  " defaults: {",
216
- " runs: 10, // Number of times to run each case per model",
317
+ " runs: 5, // Number of times to run each case per model",
217
318
  " maxSteps: 1, // Max tool call steps per run",
218
319
  " temperature: 0, // 0 for most deterministic results",
219
320
  " timeout: 30_000, // Timeout per run in ms",
@@ -244,7 +345,7 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
244
345
  *
245
346
  * To get started:
246
347
  * 1. Configure models in eval.config.ts (uncomment the ones you want)
247
- * 2. Install the AI SDK and provider packages: pnpm add ai @ai-sdk/openai
348
+ * 2. Install the AI SDK and provider packages (e.g. pnpm add ai @ai-sdk/openai)
248
349
  * 3. Copy .env.example to .env and add your API keys
249
350
  * 4. Replace this file with evals for your own tools
250
351
  * 5. Run: sunpeak test --eval
@@ -302,31 +403,31 @@ function scaffoldVisualTest(filePath, d) {
302
403
  * Uncomment the tests below and replace 'your-tool' with your tool name.
303
404
  */
304
405
 
305
- // test('tool renders correctly in light mode', async ({ mcp }) => {
306
- // const result = await mcp.callTool('your-tool', { key: 'value' }, { theme: 'light' });
406
+ // test('tool renders correctly in light mode', async ({ inspector }) => {
407
+ // const result = await inspector.renderTool('your-tool', { key: 'value' }, { theme: 'light' });
307
408
  // expect(result).not.toBeError();
308
409
  //
309
410
  // // Wait for UI to render, then screenshot:
310
411
  // // const app = result.app();
311
412
  // // await expect(app.getByText('Expected text')).toBeVisible();
312
- // // await mcp.screenshot('tool-light');
413
+ // // await result.screenshot('tool-light');
313
414
  // });
314
415
 
315
- // test('tool renders correctly in dark mode', async ({ mcp }) => {
316
- // const result = await mcp.callTool('your-tool', { key: 'value' }, { theme: 'dark' });
416
+ // test('tool renders correctly in dark mode', async ({ inspector }) => {
417
+ // const result = await inspector.renderTool('your-tool', { key: 'value' }, { theme: 'dark' });
317
418
  // expect(result).not.toBeError();
318
419
  //
319
420
  // // const app = result.app();
320
421
  // // await expect(app.getByText('Expected text')).toBeVisible();
321
- // // await mcp.screenshot('tool-dark');
422
+ // // await result.screenshot('tool-dark');
322
423
  // });
323
424
 
324
425
  // Full-page screenshot (captures the inspector chrome too):
325
- // test('full page renders correctly', async ({ mcp }) => {
326
- // const result = await mcp.callTool('your-tool', {}, { theme: 'light' });
426
+ // test('full page renders correctly', async ({ inspector }) => {
427
+ // const result = await inspector.renderTool('your-tool', {}, { theme: 'light' });
327
428
  // const app = result.app();
328
429
  // await expect(app.getByText('Expected text')).toBeVisible();
329
- // await mcp.screenshot('tool-page', { target: 'page', maxDiffPixelRatio: 0.02 });
430
+ // await result.screenshot('tool-page', { target: 'page', maxDiffPixelRatio: 0.02 });
330
431
  // });
331
432
  `
332
433
  );
@@ -489,7 +590,8 @@ async function initExternalProject(cliServer, d) {
489
590
  private: true,
490
591
  type: 'module',
491
592
  devDependencies: {
492
- sunpeak: 'latest',
593
+ '@types/node': 'latest',
594
+ sunpeak: getSunpeakVersion(),
493
595
  '@playwright/test': 'latest',
494
596
  },
495
597
  scripts: {
@@ -531,24 +633,28 @@ ${serverBlock}
531
633
  ) + '\n'
532
634
  );
533
635
 
534
- // 1. E2E test — smoke test, verifies the server is reachable
636
+ // 1. E2E test — smoke test, verifies the server exposes tools
535
637
  d.writeFileSync(
536
638
  join(testDir, 'smoke.test.ts'),
537
639
  `import { test, expect } from 'sunpeak/test';
538
640
 
539
- test('server is reachable and inspector loads', async ({ mcp }) => {
540
- // Verify the inspector page loads successfully
541
- await expect(mcp.page.locator('#root')).not.toBeEmpty();
641
+ test('server exposes tools', async ({ mcp }) => {
642
+ const tools = await mcp.listTools();
643
+ expect(tools.length).toBeGreaterThan(0);
542
644
  });
543
645
 
544
- // Uncomment and customize for your tools:
545
- // test('my tool renders correctly', async ({ mcp }) => {
646
+ // Protocol-level test (no UI rendering):
647
+ // test('my tool returns data', async ({ mcp }) => {
546
648
  // const result = await mcp.callTool('your-tool', { key: 'value' });
649
+ // expect(result.isError).toBeFalsy();
650
+ // });
651
+
652
+ // UI rendering test:
653
+ // test('my tool renders correctly', async ({ inspector }) => {
654
+ // const result = await inspector.renderTool('your-tool', { key: 'value' });
547
655
  // expect(result).not.toBeError();
548
- //
549
- // // If your tool has a UI:
550
- // // const app = result.app();
551
- // // await expect(app.getByText('Hello')).toBeVisible();
656
+ // const app = result.app();
657
+ // await expect(app.getByText('Hello')).toBeVisible();
552
658
  // });
553
659
  `
554
660
  );
@@ -563,15 +669,31 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
563
669
  scaffoldEvals(join(testDir, 'evals'), { server, d });
564
670
 
565
671
  d.log.success('Created tests/sunpeak/ with all test types.');
566
- d.log.step('Next steps:');
567
- d.log.message(' cd tests/sunpeak');
568
- d.log.message(' npm install');
569
- d.log.message(' npx playwright install chromium');
570
- d.log.message('');
571
- d.log.message(' npx sunpeak test # E2E tests');
572
- d.log.message(' npx sunpeak test --visual # Visual regression (generates baselines on first run)');
573
- d.log.message(' npx sunpeak test --live # Live tests against real hosts (requires login)');
574
- d.log.message(' npx sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)');
672
+ if (server.type === 'later') {
673
+ d.log.warn('Server not configured. Edit tests/sunpeak/playwright.config.ts before running tests.');
674
+ }
675
+
676
+ // Auto-install dependencies so users can run tests immediately
677
+ const pm = d.detectPackageManager();
678
+ d.log.step('Installing dependencies...');
679
+ try {
680
+ d.execSync(`${pm} install`, { cwd: testDir, stdio: 'inherit' });
681
+ } catch {
682
+ d.log.warn(`Dependency install failed. Run manually: cd tests/sunpeak && ${pm} install`);
683
+ }
684
+
685
+ d.log.step('Installing Playwright browser...');
686
+ try {
687
+ d.execSync(`${pm} exec playwright install chromium`, { cwd: testDir, stdio: 'inherit' });
688
+ } catch {
689
+ d.log.warn(`Browser install failed. Run manually: cd tests/sunpeak && ${pm} exec playwright install chromium`);
690
+ }
691
+
692
+ d.log.step('Ready! Run tests with:');
693
+ d.log.message(' sunpeak test # E2E tests');
694
+ d.log.message(' sunpeak test --visual # Visual regression (generates baselines on first run)');
695
+ d.log.message(' sunpeak test --live # Live tests against real hosts (requires login)');
696
+ d.log.message(' sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)');
575
697
  }
576
698
 
577
699
  async function initJsProject(cliServer, d) {
@@ -608,18 +730,23 @@ ${serverBlock}
608
730
  testPath,
609
731
  `import { test, expect } from 'sunpeak/test';
610
732
 
611
- test('server is reachable and inspector loads', async ({ mcp }) => {
612
- await expect(mcp.page.locator('#root')).not.toBeEmpty();
733
+ test('server exposes tools', async ({ mcp }) => {
734
+ const tools = await mcp.listTools();
735
+ expect(tools.length).toBeGreaterThan(0);
613
736
  });
614
737
 
615
- // Uncomment and customize for your tools:
616
- // test('my tool renders correctly', async ({ mcp }) => {
738
+ // Protocol-level test (no UI rendering):
739
+ // test('my tool returns data', async ({ mcp }) => {
617
740
  // const result = await mcp.callTool('your-tool', { key: 'value' });
741
+ // expect(result.isError).toBeFalsy();
742
+ // });
743
+
744
+ // UI rendering test:
745
+ // test('my tool renders correctly', async ({ inspector }) => {
746
+ // const result = await inspector.renderTool('your-tool', { key: 'value' });
618
747
  // expect(result).not.toBeError();
619
- //
620
- // // If your tool has a UI:
621
- // // const app = result.app();
622
- // // await expect(app.getByText('Hello')).toBeVisible();
748
+ // const app = result.app();
749
+ // await expect(app.getByText('Hello')).toBeVisible();
623
750
  // });
624
751
  `
625
752
  );
@@ -638,15 +765,19 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
638
765
  // 5. Unit test
639
766
  scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
640
767
 
768
+ if (server.type === 'later') {
769
+ d.log.warn('Server not configured. Edit playwright.config.ts before running tests.');
770
+ }
771
+ const pkgMgr = d.detectPackageManager();
641
772
  d.log.step('Next steps:');
642
- d.log.message(' npm install -D sunpeak @playwright/test vitest');
643
- d.log.message(' npx playwright install chromium');
773
+ d.log.message(` ${pkgMgr} add -D sunpeak @playwright/test vitest`);
774
+ d.log.message(` ${pkgMgr} exec playwright install chromium`);
644
775
  d.log.message('');
645
- d.log.message(' npx sunpeak test # E2E tests');
646
- d.log.message(' npx sunpeak test --unit # Unit tests (vitest)');
647
- d.log.message(' npx sunpeak test --visual # Visual regression');
648
- d.log.message(' npx sunpeak test --live # Live tests against real hosts');
649
- d.log.message(' npx sunpeak test --eval # Multi-model evals');
776
+ d.log.message(' sunpeak test # E2E tests');
777
+ d.log.message(' sunpeak test --unit # Unit tests (vitest)');
778
+ d.log.message(' sunpeak test --visual # Visual regression');
779
+ d.log.message(' sunpeak test --live # Live tests against real hosts');
780
+ d.log.message(' sunpeak test --eval # Multi-model evals');
650
781
  }
651
782
 
652
783
  async function initSunpeakProject(d) {
@@ -702,6 +833,6 @@ export default defineConfig();
702
833
  d.log.message(' Replace: import { test, expect } from "@playwright/test"');
703
834
  d.log.message(' With: import { test, expect } from "sunpeak/test"');
704
835
  d.log.message('');
705
- d.log.message(' Use the `mcp` fixture instead of raw page navigation.');
836
+ d.log.message(' Use the `mcp` and `inspector` fixtures instead of raw page navigation.');
706
837
  d.log.message(' See sunpeak docs for migration examples.');
707
838
  }
@@ -70,6 +70,9 @@ export async function runTest(args) {
70
70
  'playwright.config.js',
71
71
  'sunpeak.config.ts',
72
72
  'sunpeak.config.js',
73
+ // Fallback for non-JS projects: tests/sunpeak/ self-contained directory
74
+ 'tests/sunpeak/playwright.config.ts',
75
+ 'tests/sunpeak/playwright.config.js',
73
76
  ],
74
77
  visual: isVisual,
75
78
  updateSnapshots: isVisual && isUpdate,
@@ -82,6 +85,9 @@ export async function runTest(args) {
82
85
  configCandidates: [
83
86
  'tests/live/playwright.config.ts',
84
87
  'tests/live/playwright.config.js',
88
+ // Fallback for non-JS projects: tests/sunpeak/ self-contained directory
89
+ 'tests/sunpeak/live/playwright.config.ts',
90
+ 'tests/sunpeak/live/playwright.config.js',
85
91
  ],
86
92
  configRequired: true,
87
93
  configErrorMessage: 'No live test config found at tests/live/playwright.config.ts',
@@ -310,14 +316,43 @@ function validateApiKeys(models) {
310
316
  return warnings;
311
317
  }
312
318
 
319
/**
 * Report which AI SDK packages needed by the configured eval models cannot be
 * resolved from the current working directory.
 *
 * The core `ai` package is always checked. Each model's provider package (as
 * returned by `getProviderForModel`) is checked at most once; models with no
 * known provider are ignored.
 *
 * @param {string[]} models Model IDs from the eval config.
 * @returns {Promise<Array<{ pkg: string, reason: string }>>} Missing packages;
 *   `reason` is `'core AI SDK'` for `ai`, otherwise the first model ID that
 *   required the package.
 */
async function checkProviderPackages(models) {
  const { createRequire } = await import('module');
  // Anchor resolution at the project's package.json so lookups use the
  // project's node_modules, not the global CLI install.
  const projectRequire = createRequire(join(process.cwd(), 'package.json'));

  const canResolve = (name) => {
    try {
      projectRequire.resolve(name);
      return true;
    } catch {
      return false;
    }
  };

  const missing = canResolve('ai') ? [] : [{ pkg: 'ai', reason: 'core AI SDK' }];
  const seen = new Set();

  for (const modelId of models) {
    const pkg = getProviderForModel(modelId);
    if (!pkg || seen.has(pkg)) continue;
    seen.add(pkg);
    if (!canResolve(pkg)) {
      missing.push({ pkg, reason: modelId });
    }
  }

  return missing;
}
350
+
313
351
  /**
314
352
  * Run eval tests via vitest with the eval plugin.
315
353
  */
316
354
  async function runEvals(args) {
317
- const { dirname, resolve, basename } = await import('path');
318
- const { fileURLToPath } = await import('url');
319
-
320
- const __dirname = dirname(fileURLToPath(import.meta.url));
355
+ const { resolve, basename } = await import('path');
321
356
  const evalDir = findEvalDir();
322
357
 
323
358
  if (!evalDir) {
@@ -365,8 +400,19 @@ async function runEvals(args) {
365
400
  }
366
401
  }
367
402
 
368
- // Validate API keys for configured models
403
+ // Check for missing provider packages and API keys
369
404
  if (configModels && configModels.length > 0) {
405
+ const missingPkgs = await checkProviderPackages(configModels);
406
+ if (missingPkgs.length > 0) {
407
+ console.error('\nMissing required packages for eval models:\n');
408
+ for (const { pkg, reason } of missingPkgs) {
409
+ console.error(` ${pkg} (needed for ${reason})`);
410
+ }
411
+ const installCmd = missingPkgs.map((m) => m.pkg).join(' ');
412
+ console.error(`\nInstall with:\n\n pnpm add -D ${installCmd}\n`);
413
+ return 1;
414
+ }
415
+
370
416
  const warnings = validateApiKeys(configModels);
371
417
  if (warnings.length > 0) {
372
418
  console.log('');
@@ -416,8 +462,10 @@ async function runEvals(args) {
416
462
  // Use .ts extension so vitest handles TypeScript imports natively
417
463
  const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');
418
464
 
419
- const evalPluginPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-vitest-plugin.mjs');
420
- const evalReporterPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-reporter.mjs');
465
+ // Use package exports so vitest resolves from the project's node_modules,
466
+ // not the global CLI install. This ensures import('ai') finds project-local deps.
467
+ const evalPluginImport = 'sunpeak/eval/plugin';
468
+ const evalReporterImport = 'sunpeak/eval/reporter';
421
469
 
422
470
  // Clean up dev server and temp config
423
471
  const cleanupResources = () => {
@@ -442,17 +490,19 @@ async function runEvals(args) {
442
490
  ? './' + basename(configFile)
443
491
  : null;
444
492
 
445
- // Generate a vitest config that loads the eval config and plugin
493
+ // Generate a vitest config that loads the eval config and plugin.
494
+ // Imports use package names (sunpeak/eval/plugin) so vitest resolves from
495
+ // the project's node_modules, ensuring import('ai') finds project-local deps.
446
496
  const vitestConfig = `
447
497
  import { defineConfig } from 'vitest/config';
448
- import { evalVitestPlugin } from ${JSON.stringify(evalPluginPath)};
498
+ import { evalVitestPlugin } from '${evalPluginImport}';
449
499
  ${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}
450
500
 
451
501
  export default defineConfig({
452
502
  test: {
453
503
  globals: true,
454
504
  include: ['**/*.eval.ts', '**/*.eval.js'],
455
- reporters: ['default', ${JSON.stringify(evalReporterPath)}],
505
+ reporters: ['default', '${evalReporterImport}'],
456
506
  testTimeout: 600000,
457
507
  },
458
508
  plugins: [
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Shared eval provider definitions used by both `sunpeak new` and `sunpeak test init`.
3
+ * Single source of truth for provider packages, model IDs, and CLI labels.
4
+ */
5
+
6
/**
 * Eval providers offered in the CLI prompts. Each entry carries the AI SDK
 * package to install (`pkg`), the model IDs associated with it (`models`),
 * the label shown in the prompt (`label`), and the environment variable that
 * holds its API key (`envVar`).
 */
export const EVAL_PROVIDERS = [
  {
    pkg: '@ai-sdk/openai',
    models: ['gpt-4o', 'gpt-4o-mini'],
    label: 'OpenAI (gpt-4o, gpt-4o-mini)',
    envVar: 'OPENAI_API_KEY',
  },
  {
    pkg: '@ai-sdk/anthropic',
    models: ['claude-sonnet-4-20250514'],
    label: 'Anthropic (claude-sonnet-4)',
    envVar: 'ANTHROPIC_API_KEY',
  },
  {
    pkg: '@ai-sdk/google',
    models: ['gemini-2.0-flash'],
    label: 'Google (gemini-2.0-flash)',
    envVar: 'GOOGLE_GENERATIVE_AI_API_KEY',
  },
];
11
+
12
/**
 * Every model line emitted into eval.config.ts, in output order.
 * Each entry pairs a model ID (`id`) with the environment variable that
 * holds the API key for it (`envVar`).
 */
export const EVAL_CONFIG_MODELS = [
  ['gpt-4o', 'OPENAI_API_KEY'],
  ['gpt-4o-mini', 'OPENAI_API_KEY'],
  ['o4-mini', 'OPENAI_API_KEY'],
  ['claude-sonnet-4-20250514', 'ANTHROPIC_API_KEY'],
  ['gemini-2.0-flash', 'GOOGLE_GENERATIVE_AI_API_KEY'],
].map(([id, envVar]) => ({ id, envVar }));
23
+
24
/**
 * Generate the commented-out model lines for the `models` section of
 * eval.config.ts.
 *
 * Each line has the form ` // '<id>', // <ENV_VAR>`; the quoted ID is padded
 * so the trailing env-var comments line up across all lines.
 *
 * @param {Array<{ id: string, envVar: string }>} [models=EVAL_CONFIG_MODELS]
 *   Models to render, in output order. Defaults to the shared model list.
 * @returns {string[]} One line per model; an empty array for an empty list.
 */
export function generateModelLines(models = EVAL_CONFIG_MODELS) {
  // Width of the longest `'<id>',` token: the ID plus two quotes and a comma.
  // Folding from 0 keeps the width finite even when the list is empty.
  const width = models.reduce((max, { id }) => Math.max(max, id.length), 0) + 3;
  return models.map(({ id, envVar }) => {
    const padded = `'${id}',`.padEnd(width);
    return ` // ${padded} // ${envVar}`;
  });
}
@@ -8,6 +8,43 @@ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
8
8
  // Re-export for use in generated test code
9
9
  export { checkAiSdkInstalled };
10
10
 
11
/**
 * Strip the AI SDK retry wrapper from an error message for cleaner output.
 * "Failed after 3 attempts. Last error: <actual error>" → "<actual error>"
 *
 * Messages without the wrapper are returned unchanged. Non-string input is
 * coerced to a string first (`null`/`undefined` become `''`) so callers can
 * pass `err.message` without checking it.
 *
 * @param {string} message
 * @returns {string}
 */
export function cleanErrorMessage(message) {
  const text = typeof message === 'string' ? message : String(message ?? '');
  return text.replace(/^Failed after \d+ attempts?\. Last error: /i, '');
}
20
+
21
/**
 * Lowercase substrings that mark an API error as fatal: quota/billing
 * problems and authentication/authorization failures. Defined once at module
 * level so the list is not rebuilt on every call.
 */
const FATAL_API_ERROR_PATTERNS = Object.freeze([
  'exceeded your current quota',
  'credit balance is too low',
  'insufficient_quota',
  'billing_hard_limit_reached',
  'check your plan and billing details',
  'add a payment method',
  'invalid api key',
  'invalid_api_key',
  'incorrect api key',
  'unauthorized',
  'permission denied',
  'access denied',
  'authentication failed',
  'account deactivated',
  'account suspended',
]);

/**
 * Check if an error message indicates a fatal API error that won't resolve on retry.
 *
 * Matching is a case-insensitive substring search against a fixed pattern
 * list. Empty or non-string input is never fatal.
 *
 * @param {string} message
 * @returns {boolean}
 */
export function isFatalApiError(message) {
  if (typeof message !== 'string' || message.length === 0) return false;
  const lower = message.toLowerCase();
  return FATAL_API_ERROR_PATTERNS.some((pattern) => lower.includes(pattern));
}
47
+
11
48
  /**
12
49
  * Define an eval spec. Identity function for type safety.
13
50
  * @param {import('./eval-types.d.mts').EvalSpec} spec
@@ -112,6 +149,7 @@ export async function runSingleEval({ prompt, model, tools, maxSteps, temperatur
112
149
  prompt,
113
150
  maxSteps,
114
151
  temperature,
152
+ maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
115
153
  abortSignal: AbortSignal.timeout(timeout),
116
154
  });
117
155
 
@@ -268,10 +306,15 @@ export async function runEvalCaseAggregate({
268
306
  const model = await resolveModel(modelId);
269
307
  let passed = 0;
270
308
  let failed = 0;
309
+ let executedRuns = 0;
271
310
  let totalDurationMs = 0;
272
311
  const failureMap = new Map();
273
312
 
274
313
  for (let i = 0; i < runs; i++) {
314
+ // Small delay between runs to avoid rate limits (skip before first run)
315
+ if (i > 0) await new Promise((r) => setTimeout(r, 1000));
316
+
317
+ executedRuns++;
275
318
  const start = performance.now();
276
319
  try {
277
320
  const result = await runSingleEval({
@@ -286,8 +329,17 @@ export async function runEvalCaseAggregate({
286
329
  passed++;
287
330
  } catch (err) {
288
331
  failed++;
289
- const msg = err.message || String(err);
332
+ const msg = cleanErrorMessage(err.message || String(err));
290
333
  failureMap.set(msg, (failureMap.get(msg) || 0) + 1);
334
+
335
+ if (isFatalApiError(msg)) {
336
+ // Count remaining runs as failed and stop early
337
+ const remaining = runs - i - 1;
338
+ failed += remaining;
339
+ failureMap.set(msg, (failureMap.get(msg) || 0) + remaining);
340
+ totalDurationMs += performance.now() - start;
341
+ break;
342
+ }
291
343
  }
292
344
  totalDurationMs += performance.now() - start;
293
345
  }
@@ -304,7 +356,7 @@ export async function runEvalCaseAggregate({
304
356
  passed,
305
357
  failed,
306
358
  passRate: runs > 0 ? passed / runs : 0,
307
- avgDurationMs: runs > 0 ? totalDurationMs / runs : 0,
359
+ avgDurationMs: executedRuns > 0 ? totalDurationMs / executedRuns : 0,
308
360
  failures,
309
361
  };
310
362
  }