sunpeak 0.19.4 → 0.19.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +6 -4
  2. package/bin/commands/dev.mjs +1 -1
  3. package/bin/commands/inspect.mjs +1 -1
  4. package/bin/commands/new.mjs +92 -10
  5. package/bin/commands/start.mjs +3 -1
  6. package/bin/commands/test-init.mjs +548 -76
  7. package/bin/commands/test.mjs +401 -4
  8. package/bin/lib/eval/eval-providers.mjs +34 -0
  9. package/bin/lib/eval/eval-reporter.mjs +105 -0
  10. package/bin/lib/eval/eval-runner.mjs +362 -0
  11. package/bin/lib/eval/eval-types.d.mts +168 -0
  12. package/bin/lib/eval/eval-vitest-plugin.mjs +147 -0
  13. package/bin/lib/eval/model-registry.mjs +73 -0
  14. package/bin/lib/inspect/inspect-config.mjs +1 -1
  15. package/bin/lib/sandbox-server.mjs +5 -2
  16. package/bin/lib/test/test-config.mjs +1 -1
  17. package/bin/sunpeak.js +1 -0
  18. package/dist/chatgpt/index.cjs +1 -1
  19. package/dist/chatgpt/index.js +1 -1
  20. package/dist/claude/index.cjs +1 -1
  21. package/dist/claude/index.js +1 -1
  22. package/dist/host/chatgpt/index.cjs +1 -1
  23. package/dist/host/chatgpt/index.js +1 -1
  24. package/dist/index.cjs +2 -2
  25. package/dist/index.js +2 -2
  26. package/dist/inspector/index.cjs +1 -1
  27. package/dist/inspector/index.js +1 -1
  28. package/dist/{inspector-Bp9jrHIu.js → inspector-D5DckQuU.js} +19 -19
  29. package/dist/{inspector-Bp9jrHIu.js.map → inspector-D5DckQuU.js.map} +1 -1
  30. package/dist/{inspector-Cvq3yjNL.cjs → inspector-jY9O18z9.cjs} +19 -19
  31. package/dist/{inspector-Cvq3yjNL.cjs.map → inspector-jY9O18z9.cjs.map} +1 -1
  32. package/dist/mcp/index.cjs +2 -2
  33. package/dist/mcp/index.cjs.map +1 -1
  34. package/dist/mcp/index.js +2 -2
  35. package/dist/mcp/index.js.map +1 -1
  36. package/dist/{use-app-Ck5kR1Sf.js → use-app-Bfargfa3.js} +2 -2
  37. package/dist/{use-app-Ck5kR1Sf.js.map → use-app-Bfargfa3.js.map} +1 -1
  38. package/dist/{use-app-DHYiev3D.cjs → use-app-CbsBEmwv.cjs} +2 -2
  39. package/dist/{use-app-DHYiev3D.cjs.map → use-app-CbsBEmwv.cjs.map} +1 -1
  40. package/package.json +32 -2
  41. package/template/README.md +17 -7
  42. package/template/_gitignore +2 -0
  43. package/template/dist/albums/albums.html +1 -1
  44. package/template/dist/albums/albums.json +1 -1
  45. package/template/dist/carousel/carousel.html +1 -1
  46. package/template/dist/carousel/carousel.json +1 -1
  47. package/template/dist/map/map.html +1 -1
  48. package/template/dist/map/map.json +1 -1
  49. package/template/dist/review/review.html +1 -1
  50. package/template/dist/review/review.json +1 -1
  51. package/template/node_modules/.bin/vitest +2 -2
  52. package/template/node_modules/.vite/deps/_metadata.json +3 -3
  53. package/template/node_modules/.vite-mcp/deps/_metadata.json +20 -20
  54. package/template/node_modules/.vite-mcp/deps/vitest.js +7 -7
  55. package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
  56. package/template/tests/evals/_env.example +5 -0
  57. package/template/tests/evals/albums.eval.ts +31 -0
  58. package/template/tests/evals/carousel.eval.ts +16 -0
  59. package/template/tests/evals/eval.config.ts +26 -0
  60. package/template/tests/evals/map.eval.ts +16 -0
  61. package/template/tests/evals/review.eval.ts +53 -0
@@ -1,6 +1,43 @@
1
1
  import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
2
- import { join } from 'path';
2
+ import { execSync } from 'child_process';
3
+ import { join, dirname } from 'path';
3
4
  import * as p from '@clack/prompts';
5
+ import { EVAL_PROVIDERS, generateModelLines } from '../lib/eval/eval-providers.mjs';
6
+ import { detectPackageManager } from '../utils.mjs';
7
+
8
+ /**
9
+ * Default dependencies (real implementations).
10
+ * Override in tests via the `deps` parameter.
11
+ */
12
+ async function defaultSelectProviders() {
13
+ const selected = await p.multiselect({
14
+ message: 'AI providers for evals (space to toggle, enter to skip)',
15
+ options: EVAL_PROVIDERS.map((prov) => ({ value: prov, label: prov.label })),
16
+ initialValues: [],
17
+ required: false,
18
+ });
19
+ if (p.isCancel(selected)) return [];
20
+ return selected;
21
+ }
22
+
23
+ export const defaultDeps = {
24
+ existsSync,
25
+ readFileSync,
26
+ writeFileSync,
27
+ mkdirSync,
28
+ execSync,
29
+ cwd: () => process.cwd(),
30
+ intro: p.intro,
31
+ outro: p.outro,
32
+ confirm: p.confirm,
33
+ isCancel: p.isCancel,
34
+ select: p.select,
35
+ text: p.text,
36
+ log: p.log,
37
+ password: p.password,
38
+ selectProviders: defaultSelectProviders,
39
+ detectPackageManager,
40
+ };
4
41
 
5
42
  /**
6
43
  * sunpeak test init — Scaffold test infrastructure for MCP servers.
@@ -9,9 +46,18 @@ import * as p from '@clack/prompts';
9
46
  * - Non-JS projects: self-contained tests/sunpeak/ directory
10
47
  * - JS/TS projects: root-level config + test files
11
48
  * - sunpeak projects: migrate to defineConfig()
49
+ *
50
+ * Scaffolds all 5 test types:
51
+ * 1. E2E tests — Playwright-based inspector tests (mcp fixture)
52
+ * 2. Visual regression — Screenshot comparison via mcp.screenshot()
53
+ * 3. Live tests — Test against real ChatGPT/Claude hosts
54
+ * 4. Evals — Multi-model tool calling reliability tests
55
+ * 5. Unit tests — Direct tool handler tests (JS/TS projects only)
12
56
  */
13
- export async function testInit(args = []) {
14
- p.intro('Setting up sunpeak tests');
57
+ export async function testInit(args = [], deps = defaultDeps) {
58
+ const d = { ...defaultDeps, ...deps };
59
+
60
+ d.intro('Setting up sunpeak tests');
15
61
 
16
62
  // Parse --server flag from CLI args
17
63
  const serverIdx = args.indexOf('--server');
@@ -20,26 +66,97 @@ export async function testInit(args = []) {
20
66
  ? args[serverIdx + 1]
21
67
  : undefined;
22
68
 
23
- const projectType = detectProjectType();
69
+ const projectType = detectProjectType(d);
24
70
 
25
71
  if (projectType === 'sunpeak') {
26
- await initSunpeakProject();
72
+ await initSunpeakProject(d);
27
73
  } else if (projectType === 'js') {
28
- await initJsProject(cliServer);
74
+ await initJsProject(cliServer, d);
29
75
  } else {
30
- await initExternalProject(cliServer);
76
+ await initExternalProject(cliServer, d);
77
+ }
78
+
79
+ // Offer to configure eval providers
80
+ const providers = await d.selectProviders();
81
+ if (!d.isCancel(providers) && providers.length > 0) {
82
+ const pm = d.detectPackageManager();
83
+ const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
84
+ const installCmd = `${pm} add -D ${pkgsToInstall.join(' ')}`;
85
+ try {
86
+ d.execSync(installCmd, { cwd: d.cwd(), stdio: 'inherit' });
87
+ } catch {
88
+ d.log.info(`Provider install failed. Install manually: ${installCmd}`);
89
+ }
90
+
91
+ // Uncomment selected models in eval.config.ts
92
+ const evalDir = d.existsSync(join(d.cwd(), 'tests', 'evals'))
93
+ ? join(d.cwd(), 'tests', 'evals')
94
+ : d.existsSync(join(d.cwd(), 'tests', 'sunpeak', 'evals'))
95
+ ? join(d.cwd(), 'tests', 'sunpeak', 'evals')
96
+ : null;
97
+ if (evalDir) {
98
+ const configPath = join(evalDir, 'eval.config.ts');
99
+ if (d.existsSync(configPath)) {
100
+ let config = d.readFileSync(configPath, 'utf-8');
101
+ for (const prov of providers) {
102
+ for (const model of prov.models) {
103
+ config = config.replace(
104
+ new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
105
+ '$1$2'
106
+ );
107
+ }
108
+ }
109
+ d.writeFileSync(configPath, config);
110
+ }
111
+
112
+ // Prompt for API keys and write .env
113
+ const envLines = [];
114
+ const seen = new Set();
115
+ for (const prov of providers) {
116
+ if (seen.has(prov.envVar)) continue;
117
+ seen.add(prov.envVar);
118
+ const key = await d.password({
119
+ message: `${prov.envVar} (enter to skip)`,
120
+ mask: '*',
121
+ });
122
+ if (!d.isCancel(key) && key) {
123
+ envLines.push(`${prov.envVar}=${key}`);
124
+ }
125
+ }
126
+ if (envLines.length > 0 && evalDir) {
127
+ const relEnvPath = evalDir.startsWith(d.cwd()) ? evalDir.slice(d.cwd().length + 1) : evalDir;
128
+ d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
129
+ d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
130
+ }
131
+ }
31
132
  }
32
133
 
33
- p.outro('Done!');
134
+ // Offer to install the testing skill
135
+ const installSkill = await d.confirm({
136
+ message: 'Install the test-mcp-server skill? (helps your coding agent write tests)',
137
+ initialValue: true,
138
+ });
139
+ if (!d.isCancel(installSkill) && installSkill) {
140
+ try {
141
+ d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
142
+ cwd: d.cwd(),
143
+ stdio: 'inherit',
144
+ });
145
+ } catch {
146
+ d.log.info('Skill install skipped. Install later: pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server');
147
+ }
148
+ }
149
+
150
+ d.outro('Done!');
34
151
  }
35
152
 
36
- function detectProjectType() {
37
- const cwd = process.cwd();
153
+ function detectProjectType(d) {
154
+ const cwd = d.cwd();
38
155
  const pkgPath = join(cwd, 'package.json');
39
156
 
40
- if (existsSync(pkgPath)) {
157
+ if (d.existsSync(pkgPath)) {
41
158
  try {
42
- const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
159
+ const pkg = JSON.parse(d.readFileSync(pkgPath, 'utf-8'));
43
160
  const deps = { ...pkg.dependencies, ...pkg.devDependencies };
44
161
  if ('sunpeak' in deps) return 'sunpeak';
45
162
  return 'js';
@@ -52,7 +169,7 @@ function detectProjectType() {
52
169
  return 'external';
53
170
  }
54
171
 
55
- async function getServerConfig(cliServer) {
172
+ async function getServerConfig(cliServer, d) {
56
173
  // If provided via --server flag, detect type automatically
57
174
  if (cliServer) {
58
175
  if (cliServer.startsWith('http://') || cliServer.startsWith('https://')) {
@@ -61,7 +178,7 @@ async function getServerConfig(cliServer) {
61
178
  return { type: 'command', value: cliServer };
62
179
  }
63
180
 
64
- const serverType = await p.select({
181
+ const serverType = await d.select({
65
182
  message: 'How does your MCP server start?',
66
183
  options: [
67
184
  { value: 'command', label: 'Command (e.g., python server.py)' },
@@ -70,23 +187,23 @@ async function getServerConfig(cliServer) {
70
187
  ],
71
188
  });
72
189
 
73
- if (p.isCancel(serverType)) process.exit(0);
190
+ if (d.isCancel(serverType)) process.exit(0);
74
191
 
75
192
  if (serverType === 'command') {
76
- const command = await p.text({
193
+ const command = await d.text({
77
194
  message: 'Server start command:',
78
195
  placeholder: 'python src/server.py',
79
196
  });
80
- if (p.isCancel(command)) process.exit(0);
197
+ if (d.isCancel(command)) process.exit(0);
81
198
  return { type: 'command', value: command };
82
199
  }
83
200
 
84
201
  if (serverType === 'url') {
85
- const url = await p.text({
202
+ const url = await d.text({
86
203
  message: 'Server URL:',
87
204
  placeholder: 'http://localhost:8000/mcp',
88
205
  });
89
- if (p.isCancel(url)) process.exit(0);
206
+ if (d.isCancel(url)) process.exit(0);
90
207
  return { type: 'url', value: url };
91
208
  }
92
209
 
@@ -122,27 +239,324 @@ function generateServerConfigBlock(server, relativeTo = '.') {
122
239
  },`;
123
240
  }
124
241
 
125
- async function initExternalProject(cliServer) {
126
- p.log.info('Detected non-JS project. Creating self-contained test directory.');
242
+ /**
243
+ * Scaffold eval boilerplate into a directory.
244
+ * @param {string} evalsDir - Directory to create eval files in
245
+ * @param {{ server?: object, isSunpeak?: boolean, d?: object }} options
246
+ */
247
+ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
248
+ const d = deps || defaultDeps;
249
+ if (d.existsSync(join(evalsDir, 'eval.config.ts'))) {
250
+ d.log.info('Eval config already exists. Skipping eval scaffold.');
251
+ return;
252
+ }
253
+
254
+ d.mkdirSync(evalsDir, { recursive: true });
255
+
256
+ // Generate server line for eval config
257
+ let serverLine = ' // server: \'http://localhost:8000/mcp\',';
258
+ if (isSunpeak) {
259
+ serverLine = ' // Omit server for sunpeak projects (auto-detected).\n // server: \'http://localhost:8000/mcp\',';
260
+ } else if (server?.type === 'url') {
261
+ serverLine = ` server: '${server.value}',`;
262
+ } else if (server?.type === 'command') {
263
+ serverLine = ` server: '${server.value}',`;
264
+ }
265
+
266
+ // Build the eval config content
267
+ const configLines = [
268
+ "import { defineEvalConfig } from 'sunpeak/eval';",
269
+ "",
270
+ "// API keys are loaded automatically from .env in this directory (gitignored).",
271
+ "// See .env.example for the format.",
272
+ "",
273
+ "export default defineEvalConfig({",
274
+ " // MCP server to test.",
275
+ serverLine,
276
+ "",
277
+ " models: [",
278
+ " // Uncomment models and install their provider packages:",
279
+ ...generateModelLines(),
280
+ " ],",
281
+ "",
282
+ " defaults: {",
283
+ " runs: 5, // Number of times to run each case per model",
284
+ " maxSteps: 1, // Max tool call steps per run",
285
+ " temperature: 0, // 0 for most deterministic results",
286
+ " timeout: 30_000, // Timeout per run in ms",
287
+ " },",
288
+ "});",
289
+ "",
290
+ ];
291
+
292
+ d.writeFileSync(join(evalsDir, 'eval.config.ts'), configLines.join('\n'));
293
+
294
+ // Scaffold .env template
295
+ d.writeFileSync(
296
+ join(evalsDir, '.env.example'),
297
+ `# Copy this file to .env and fill in your API keys.
298
+ # .env is gitignored — never commit API keys.
299
+ # OPENAI_API_KEY=sk-...
300
+ # ANTHROPIC_API_KEY=sk-ant-...
301
+ # GOOGLE_GENERATIVE_AI_API_KEY=...
302
+ `
303
+ );
304
+
305
+ d.writeFileSync(
306
+ join(evalsDir, 'example.eval.ts'),
307
+ `import { defineEval } from 'sunpeak/eval';
308
+
309
+ /**
310
+ * Example eval — tests whether LLMs call your tools correctly.
311
+ *
312
+ * To get started:
313
+ * 1. Configure models in eval.config.ts (uncomment the ones you want)
314
+ * 2. Install the AI SDK and provider packages (e.g. pnpm add ai @ai-sdk/openai)
315
+ * 3. Copy .env.example to .env and add your API keys
316
+ * 4. Replace this file with evals for your own tools
317
+ * 5. Run: sunpeak test --eval
318
+ *
319
+ * Each case sends a prompt to every configured model and checks
320
+ * that the model calls the expected tool with the expected arguments.
321
+ * Cases run multiple times (configured via \`runs\` in eval.config.ts)
322
+ * to measure reliability across non-deterministic LLM responses.
323
+ */
324
+ export default defineEval({
325
+ // This eval is skipped when no models are configured.
326
+ // Delete this file and create your own evals to get started.
327
+ cases: [
328
+ {
329
+ name: 'example (replace me)',
330
+ prompt: 'Show me a demo',
331
+ // expect which tool gets called and (optionally) its arguments:
332
+ expect: {
333
+ tool: 'your-tool-name',
334
+ // args: { key: 'value' },
335
+ },
336
+ },
337
+ ],
338
+ });
339
+ `
340
+ );
341
+
342
+ d.log.success(`Created ${evalsDir}/ with eval config and example.`);
343
+ }
344
+
345
+ /**
346
+ * Scaffold a visual regression test file.
347
+ * @param {string} filePath - Full path to the visual test file
348
+ * @param {object} d - Dependencies
349
+ */
350
+ function scaffoldVisualTest(filePath, d) {
351
+ if (d.existsSync(filePath)) {
352
+ d.log.info('Visual test already exists. Skipping.');
353
+ return;
354
+ }
355
+
356
+ d.writeFileSync(
357
+ filePath,
358
+ `import { test, expect } from 'sunpeak/test';
359
+
360
+ /**
361
+ * Visual regression tests — compare screenshots against saved baselines.
362
+ *
363
+ * Screenshots only run with: sunpeak test --visual
364
+ * Update baselines with: sunpeak test --visual --update
365
+ *
366
+ * During normal \`sunpeak test\` runs, screenshot() calls are silently
367
+ * skipped so these tests still pass without baselines.
368
+ *
369
+ * Uncomment the tests below and replace 'your-tool' with your tool name.
370
+ */
371
+
372
+ // test('tool renders correctly in light mode', async ({ mcp }) => {
373
+ // const result = await mcp.callTool('your-tool', { key: 'value' }, { theme: 'light' });
374
+ // expect(result).not.toBeError();
375
+ //
376
+ // // Wait for UI to render, then screenshot:
377
+ // // const app = result.app();
378
+ // // await expect(app.getByText('Expected text')).toBeVisible();
379
+ // // await mcp.screenshot('tool-light');
380
+ // });
381
+
382
+ // test('tool renders correctly in dark mode', async ({ mcp }) => {
383
+ // const result = await mcp.callTool('your-tool', { key: 'value' }, { theme: 'dark' });
384
+ // expect(result).not.toBeError();
385
+ //
386
+ // // const app = result.app();
387
+ // // await expect(app.getByText('Expected text')).toBeVisible();
388
+ // // await mcp.screenshot('tool-dark');
389
+ // });
390
+
391
+ // Full-page screenshot (captures the inspector chrome too):
392
+ // test('full page renders correctly', async ({ mcp }) => {
393
+ // const result = await mcp.callTool('your-tool', {}, { theme: 'light' });
394
+ // const app = result.app();
395
+ // await expect(app.getByText('Expected text')).toBeVisible();
396
+ // await mcp.screenshot('tool-page', { target: 'page', maxDiffPixelRatio: 0.02 });
397
+ // });
398
+ `
399
+ );
400
+ d.log.success(`Created ${filePath}`);
401
+ }
402
+
403
+ /**
404
+ * Scaffold live test boilerplate (test against real ChatGPT/Claude).
405
+ * @param {string} liveDir - Directory to create live test files in
406
+ * @param {{ isSunpeak?: boolean, d: object }} options
407
+ */
408
+ function scaffoldLiveTests(liveDir, { isSunpeak, d } = {}) {
409
+ if (d.existsSync(join(liveDir, 'playwright.config.ts'))) {
410
+ d.log.info('Live test config already exists. Skipping live test scaffold.');
411
+ return;
412
+ }
413
+
414
+ d.mkdirSync(liveDir, { recursive: true });
415
+
416
+ // Live test playwright config
417
+ const liveConfigPreamble = `import { defineLiveConfig } from 'sunpeak/test/live/config';
418
+
419
+ /**
420
+ * Live tests run against real AI hosts (ChatGPT, Claude).
421
+ *
422
+ * Prerequisites:
423
+ * 1. Your MCP server must be accessible via a public URL (e.g., ngrok tunnel)
424
+ * 2. The server must be registered as an MCP action in the host
425
+ * 3. Run: sunpeak test --live
426
+ *
427
+ * On first run, a browser window opens for you to log in to the host.
428
+ * The session is saved for subsequent runs (typically lasts a few hours).`;
429
+
430
+ const liveConfigExport = `export default defineLiveConfig({
431
+ // hosts: ['chatgpt'], // Which hosts to test against
432
+ // colorScheme: 'light', // Default color scheme
433
+ // viewport: { width: 1280, height: 720 },
434
+ devOverlay: false,
435
+ });
436
+ `;
437
+
438
+ const configContent = isSunpeak
439
+ ? `${liveConfigPreamble}
440
+ */
441
+ ${liveConfigExport}`
442
+ : `${liveConfigPreamble}
443
+ *
444
+ * NOTE: defineLiveConfig() starts a local sunpeak dev server as its backend.
445
+ * If your MCP server is not a sunpeak project, you may need to customize the
446
+ * webServer option in the Playwright config below to start your own server,
447
+ * or remove webServer entirely if your server is already running.
448
+ */
449
+ ${liveConfigExport}`;
127
450
 
128
- const server = await getServerConfig(cliServer);
129
- const testDir = join(process.cwd(), 'tests', 'sunpeak');
451
+ d.writeFileSync(join(liveDir, 'playwright.config.ts'), configContent);
130
452
 
131
- if (existsSync(testDir)) {
132
- p.log.warn('tests/sunpeak/ already exists. Skipping scaffold.');
453
+ // Live test example
454
+ d.writeFileSync(
455
+ join(liveDir, 'example.test.ts'),
456
+ `import { test, expect } from 'sunpeak/test/live';
457
+
458
+ /**
459
+ * Live tests invoke tools through real AI hosts (ChatGPT, Claude).
460
+ *
461
+ * The \`live\` fixture provides:
462
+ * - live.invoke(toolName) — invoke a tool and get the app locator
463
+ * - live.setColorScheme('dark', app) — switch theme while app is visible
464
+ * - live.page — the underlying Playwright page
465
+ *
466
+ * Run with: sunpeak test --live
467
+ *
468
+ * These tests are excluded from normal \`sunpeak test\` runs because
469
+ * they require host accounts and cost API credits.
470
+ */
471
+
472
+ // Uncomment and replace 'your-tool' with the tool name as it appears in the host.
473
+ // test('tool renders in the host', async ({ live }) => {
474
+ // const app = await live.invoke('your-tool');
475
+ //
476
+ // await expect(app.getByText('Expected text')).toBeVisible({ timeout: 15_000 });
477
+ //
478
+ // // Test dark mode:
479
+ // await live.setColorScheme('dark', app);
480
+ // await expect(app.getByText('Expected text')).toBeVisible();
481
+ // });
482
+ `
483
+ );
484
+
485
+ d.log.success(`Created ${liveDir}/ with live test config and example.`);
486
+ }
487
+
488
+ /**
489
+ * Scaffold a unit test example for JS/TS projects.
490
+ * @param {string} filePath - Full path to the unit test file
491
+ * @param {object} d - Dependencies
492
+ */
493
+ function scaffoldUnitTest(filePath, d) {
494
+ if (d.existsSync(filePath)) {
495
+ d.log.info('Unit test already exists. Skipping.');
133
496
  return;
134
497
  }
135
498
 
136
- mkdirSync(testDir, { recursive: true });
499
+ d.mkdirSync(dirname(filePath), { recursive: true });
500
+
501
+ d.writeFileSync(
502
+ filePath,
503
+ `import { describe, it, expect } from 'vitest';
504
+
505
+ /**
506
+ * Unit tests for your MCP tool handlers.
507
+ *
508
+ * Import your tool handler directly and test its input/output
509
+ * without starting the MCP server or inspector.
510
+ *
511
+ * Run with: sunpeak test --unit
512
+ *
513
+ * To set up vitest, add it to your devDependencies:
514
+ * npm install -D vitest
515
+ *
516
+ * Uncomment and customize the tests below for your tools.
517
+ */
518
+
519
+ // import handler, { tool, schema } from '../../src/tools/your-tool';
520
+ // const extra = {} as Parameters<typeof handler>[1];
521
+
522
+ // describe('your tool', () => {
523
+ // it('returns expected output', async () => {
524
+ // const result = await handler({ key: 'value' }, extra);
525
+ // expect(result.structuredContent).toBeDefined();
526
+ // });
527
+ //
528
+ // it('exports correct tool config', () => {
529
+ // expect(tool.title).toBe('Your Tool');
530
+ // expect(tool.annotations?.readOnlyHint).toBe(true);
531
+ // });
532
+ // });
533
+ `
534
+ );
535
+ d.log.success(`Created ${filePath}`);
536
+ }
537
+
538
+ async function initExternalProject(cliServer, d) {
539
+ d.log.info('Detected non-JS project. Creating self-contained test directory.');
540
+
541
+ const server = await getServerConfig(cliServer, d);
542
+ const testDir = join(d.cwd(), 'tests', 'sunpeak');
543
+
544
+ if (d.existsSync(testDir)) {
545
+ d.log.warn('tests/sunpeak/ already exists. Skipping scaffold.');
546
+ return;
547
+ }
548
+
549
+ d.mkdirSync(testDir, { recursive: true });
137
550
 
138
551
  // package.json
139
- writeFileSync(
552
+ d.writeFileSync(
140
553
  join(testDir, 'package.json'),
141
554
  JSON.stringify(
142
555
  {
143
556
  private: true,
144
557
  type: 'module',
145
558
  devDependencies: {
559
+ '@types/node': 'latest',
146
560
  sunpeak: 'latest',
147
561
  '@playwright/test': 'latest',
148
562
  },
@@ -157,7 +571,7 @@ async function initExternalProject(cliServer) {
157
571
 
158
572
  // sunpeak.config.ts (used as playwright config)
159
573
  const serverBlock = generateServerConfigBlock(server, '../..');
160
- writeFileSync(
574
+ d.writeFileSync(
161
575
  join(testDir, 'playwright.config.ts'),
162
576
  `import { defineConfig } from 'sunpeak/test/config';
163
577
 
@@ -168,7 +582,7 @@ ${serverBlock}
168
582
  );
169
583
 
170
584
  // tsconfig.json
171
- writeFileSync(
585
+ d.writeFileSync(
172
586
  join(testDir, 'tsconfig.json'),
173
587
  JSON.stringify(
174
588
  {
@@ -185,8 +599,8 @@ ${serverBlock}
185
599
  ) + '\n'
186
600
  );
187
601
 
188
- // smoke test — runnable out of the box, verifies the server is reachable
189
- writeFileSync(
602
+ // 1. E2E test — smoke test, verifies the server is reachable
603
+ d.writeFileSync(
190
604
  join(testDir, 'smoke.test.ts'),
191
605
  `import { test, expect } from 'sunpeak/test';
192
606
 
@@ -207,27 +621,41 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
207
621
  `
208
622
  );
209
623
 
210
- p.log.success('Created tests/sunpeak/ with config and starter test.');
211
- p.log.step('Next steps:');
212
- p.log.message(' cd tests/sunpeak');
213
- p.log.message(' npm install');
214
- p.log.message(' npx playwright install chromium');
215
- p.log.message(' npx sunpeak test');
624
+ // 2. Visual regression test
625
+ scaffoldVisualTest(join(testDir, 'visual.test.ts'), d);
626
+
627
+ // 3. Live tests
628
+ scaffoldLiveTests(join(testDir, 'live'), { isSunpeak: false, d });
629
+
630
+ // 4. Eval boilerplate
631
+ scaffoldEvals(join(testDir, 'evals'), { server, d });
632
+
633
+ d.log.success('Created tests/sunpeak/ with all test types.');
634
+ d.log.step('Next steps:');
635
+ const pm = d.detectPackageManager();
636
+ d.log.message(' cd tests/sunpeak');
637
+ d.log.message(` ${pm} install`);
638
+ d.log.message(` ${pm} exec playwright install chromium`);
639
+ d.log.message('');
640
+ d.log.message(' sunpeak test # E2E tests');
641
+ d.log.message(' sunpeak test --visual # Visual regression (generates baselines on first run)');
642
+ d.log.message(' sunpeak test --live # Live tests against real hosts (requires login)');
643
+ d.log.message(' sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)');
216
644
  }
217
645
 
218
- async function initJsProject(cliServer) {
219
- p.log.info('Detected JS/TS project. Adding test config at project root.');
646
+ async function initJsProject(cliServer, d) {
647
+ d.log.info('Detected JS/TS project. Adding test config at project root.');
220
648
 
221
- const server = await getServerConfig(cliServer);
222
- const cwd = process.cwd();
649
+ const server = await getServerConfig(cliServer, d);
650
+ const cwd = d.cwd();
223
651
 
224
652
  // Create playwright.config.ts
225
653
  const configPath = join(cwd, 'playwright.config.ts');
226
- if (existsSync(configPath)) {
227
- p.log.warn('playwright.config.ts already exists. Skipping config creation.');
654
+ if (d.existsSync(configPath)) {
655
+ d.log.warn('playwright.config.ts already exists. Skipping config creation.');
228
656
  } else {
229
657
  const serverBlock = generateServerConfigBlock(server);
230
- writeFileSync(
658
+ d.writeFileSync(
231
659
  configPath,
232
660
  `import { defineConfig } from 'sunpeak/test/config';
233
661
 
@@ -236,16 +664,16 @@ ${serverBlock}
236
664
  });
237
665
  `
238
666
  );
239
- p.log.success('Created playwright.config.ts');
667
+ d.log.success('Created playwright.config.ts');
240
668
  }
241
669
 
242
- // Create test directory and smoke test
243
- const testDir = join(cwd, 'tests', 'e2e');
244
- mkdirSync(testDir, { recursive: true });
670
+ // 1. E2E test smoke test
671
+ const e2eDir = join(cwd, 'tests', 'e2e');
672
+ d.mkdirSync(e2eDir, { recursive: true });
245
673
 
246
- const testPath = join(testDir, 'smoke.test.ts');
247
- if (!existsSync(testPath)) {
248
- writeFileSync(
674
+ const testPath = join(e2eDir, 'smoke.test.ts');
675
+ if (!d.existsSync(testPath)) {
676
+ d.writeFileSync(
249
677
  testPath,
250
678
  `import { test, expect } from 'sunpeak/test';
251
679
 
@@ -264,42 +692,86 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
264
692
  // });
265
693
  `
266
694
  );
267
- p.log.success('Created tests/e2e/smoke.test.ts');
695
+ d.log.success('Created tests/e2e/smoke.test.ts');
268
696
  }
269
697
 
270
- p.log.step('Next steps:');
271
- p.log.message(' npm install -D sunpeak @playwright/test');
272
- p.log.message(' npx playwright install chromium');
273
- p.log.message(' npx sunpeak test');
698
+ // 2. Visual regression test
699
+ scaffoldVisualTest(join(e2eDir, 'visual.test.ts'), d);
700
+
701
+ // 3. Live tests
702
+ scaffoldLiveTests(join(cwd, 'tests', 'live'), { isSunpeak: false, d });
703
+
704
+ // 4. Eval boilerplate
705
+ scaffoldEvals(join(cwd, 'tests', 'evals'), { server, d });
706
+
707
+ // 5. Unit test
708
+ scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
709
+
710
+ const pkgMgr = d.detectPackageManager();
711
+ d.log.step('Next steps:');
712
+ d.log.message(` ${pkgMgr} add -D sunpeak @playwright/test vitest`);
713
+ d.log.message(` ${pkgMgr} exec playwright install chromium`);
714
+ d.log.message('');
715
+ d.log.message(' sunpeak test # E2E tests');
716
+ d.log.message(' sunpeak test --unit # Unit tests (vitest)');
717
+ d.log.message(' sunpeak test --visual # Visual regression');
718
+ d.log.message(' sunpeak test --live # Live tests against real hosts');
719
+ d.log.message(' sunpeak test --eval # Multi-model evals');
274
720
  }
275
721
 
276
- async function initSunpeakProject() {
277
- p.log.info('Detected sunpeak project. Updating config to use defineConfig().');
722
+ async function initSunpeakProject(d) {
723
+ d.log.info('Detected sunpeak project. Updating config to use defineConfig().');
278
724
 
279
- const cwd = process.cwd();
725
+ const cwd = d.cwd();
280
726
  const configPath = join(cwd, 'playwright.config.ts');
281
727
 
282
- if (existsSync(configPath)) {
283
- const content = readFileSync(configPath, 'utf-8');
728
+ if (d.existsSync(configPath)) {
729
+ const content = d.readFileSync(configPath, 'utf-8');
284
730
  if (content.includes('sunpeak/test/config')) {
285
- p.log.info('Config already uses sunpeak/test/config. Nothing to do.');
286
- return;
731
+ d.log.info('Config already uses sunpeak/test/config. Nothing to do.');
732
+ } else {
733
+ d.log.warn('playwright.config.ts exists but does not use sunpeak/test/config.');
734
+ d.log.message(' To migrate, replace your config with:');
735
+ d.log.message(" import { defineConfig } from 'sunpeak/test/config';");
736
+ d.log.message(' export default defineConfig();');
287
737
  }
288
- }
289
-
290
- writeFileSync(
291
- configPath,
292
- `import { defineConfig } from 'sunpeak/test/config';
738
+ } else {
739
+ d.writeFileSync(
740
+ configPath,
741
+ `import { defineConfig } from 'sunpeak/test/config';
293
742
 
294
743
  export default defineConfig();
295
744
  `
296
- );
745
+ );
746
+ d.log.success('Updated playwright.config.ts to use defineConfig()');
747
+ }
297
748
 
298
- p.log.success('Updated playwright.config.ts to use defineConfig()');
299
- p.log.step('Migrate test files:');
300
- p.log.message(' Replace: import { test, expect } from "@playwright/test"');
301
- p.log.message(' With: import { test, expect } from "sunpeak/test"');
302
- p.log.message('');
303
- p.log.message(' Use the `mcp` fixture instead of raw page navigation.');
304
- p.log.message(' See sunpeak docs for migration examples.');
749
+ // Scaffold missing test types
750
+
751
+ // 1. Visual regression test
752
+ const e2eDir = join(cwd, 'tests', 'e2e');
753
+ d.mkdirSync(e2eDir, { recursive: true });
754
+ scaffoldVisualTest(join(e2eDir, 'visual.test.ts'), d);
755
+
756
+ // 2. Live tests
757
+ scaffoldLiveTests(join(cwd, 'tests', 'live'), { isSunpeak: true, d });
758
+
759
+ // 3. Eval boilerplate
760
+ scaffoldEvals(join(cwd, 'tests', 'evals'), { isSunpeak: true, d });
761
+
762
+ // 4. Unit test
763
+ scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
764
+
765
+ d.log.step('Scaffolded test types:');
766
+ d.log.message(' tests/e2e/visual.test.ts — Visual regression (sunpeak test --visual)');
767
+ d.log.message(' tests/live/ — Live host tests (sunpeak test --live)');
768
+ d.log.message(' tests/evals/ — Multi-model evals (sunpeak test --eval)');
769
+ d.log.message(' tests/unit/example.test.ts — Unit tests (sunpeak test --unit)');
770
+ d.log.message('');
771
+ d.log.message(' Migrate existing e2e tests:');
772
+ d.log.message(' Replace: import { test, expect } from "@playwright/test"');
773
+ d.log.message(' With: import { test, expect } from "sunpeak/test"');
774
+ d.log.message('');
775
+ d.log.message(' Use the `mcp` fixture instead of raw page navigation.');
776
+ d.log.message(' See sunpeak docs for migration examples.');
305
777
  }