sunpeak 0.19.10 → 0.20.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bin/commands/inspect.mjs +321 -6
- package/bin/commands/new.mjs +85 -7
- package/bin/commands/test-init.mjs +190 -59
- package/bin/commands/test.mjs +60 -10
- package/bin/lib/eval/eval-providers.mjs +34 -0
- package/bin/lib/eval/eval-runner.mjs +54 -2
- package/bin/lib/eval/eval-vitest-plugin.mjs +3 -14
- package/bin/lib/eval/model-registry.mjs +2 -2
- package/bin/lib/inspect/inspect-config.mjs +16 -1
- package/bin/lib/inspect/inspect-server.d.mts +32 -0
- package/bin/lib/inspect/inspect-server.mjs +11 -0
- package/bin/lib/resolve-bin.mjs +39 -0
- package/bin/lib/test/base-config.mjs +3 -2
- package/bin/lib/test/matchers.mjs +2 -2
- package/bin/lib/test/test-config.mjs +18 -7
- package/bin/lib/test/test-fixtures.d.mts +52 -92
- package/bin/lib/test/test-fixtures.mjs +174 -147
- package/dist/chatgpt/index.cjs +1 -1
- package/dist/chatgpt/index.js +1 -1
- package/dist/claude/index.cjs +1 -1
- package/dist/claude/index.js +1 -1
- package/dist/host/chatgpt/index.cjs +1 -1
- package/dist/host/chatgpt/index.js +1 -1
- package/dist/index.cjs +4 -4
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +3 -3
- package/dist/index.js.map +1 -1
- package/dist/inspector/index.cjs +1 -1
- package/dist/inspector/index.js +1 -1
- package/dist/{inspector-D5DckQuU.js → inspector-BBDa5yCm.js} +57 -23
- package/dist/inspector-BBDa5yCm.js.map +1 -0
- package/dist/{inspector-jY9O18z9.cjs → inspector-DAA1Wiyh.cjs} +58 -24
- package/dist/inspector-DAA1Wiyh.cjs.map +1 -0
- package/dist/lib/discovery-cli.cjs +1 -1
- package/dist/mcp/index.cjs +22 -25
- package/dist/mcp/index.cjs.map +1 -1
- package/dist/mcp/index.js +19 -22
- package/dist/mcp/index.js.map +1 -1
- package/dist/{use-app-Bfargfa3.js → use-app-Cr0auUa1.js} +2 -2
- package/dist/{use-app-Bfargfa3.js.map → use-app-Cr0auUa1.js.map} +1 -1
- package/dist/{use-app-CbsBEmwv.cjs → use-app-DPkj5Jp_.cjs} +2 -2
- package/dist/{use-app-CbsBEmwv.cjs.map → use-app-DPkj5Jp_.cjs.map} +1 -1
- package/package.json +27 -15
- package/template/README.md +1 -1
- package/template/dist/albums/albums.html +4 -4
- package/template/dist/albums/albums.json +1 -1
- package/template/dist/carousel/carousel.html +4 -4
- package/template/dist/carousel/carousel.json +1 -1
- package/template/dist/map/map.html +6 -6
- package/template/dist/map/map.json +1 -1
- package/template/dist/review/review.html +4 -4
- package/template/dist/review/review.json +1 -1
- package/template/node_modules/.bin/vite +2 -2
- package/template/node_modules/.bin/vitest +2 -2
- package/template/node_modules/.vite/deps/_metadata.json +4 -4
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js +1 -1
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js +1 -1
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js +1 -1
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/@testing-library_react.js +4 -4
- package/template/node_modules/.vite-mcp/deps/@testing-library_react.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/_metadata.json +33 -33
- package/template/node_modules/.vite-mcp/deps/{client-CU1wWud4.js → client-B_5CX--u.js} +7 -7
- package/template/node_modules/.vite-mcp/deps/{client-CU1wWud4.js.map → client-B_5CX--u.js.map} +1 -1
- package/template/node_modules/.vite-mcp/deps/embla-carousel-react.js +1 -1
- package/template/node_modules/.vite-mcp/deps/embla-carousel-react.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/react-dom.js +3 -3
- package/template/node_modules/.vite-mcp/deps/react-dom.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/react-dom_client.js +1 -1
- package/template/node_modules/.vite-mcp/deps/react.js +3 -3
- package/template/node_modules/.vite-mcp/deps/react.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/react_jsx-dev-runtime.js +2 -2
- package/template/node_modules/.vite-mcp/deps/react_jsx-dev-runtime.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/react_jsx-runtime.js +2 -2
- package/template/node_modules/.vite-mcp/deps/react_jsx-runtime.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/vitest.js +1024 -622
- package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
- package/template/package.json +6 -6
- package/template/tests/e2e/albums.spec.ts +24 -52
- package/template/tests/e2e/carousel.spec.ts +36 -58
- package/template/tests/e2e/map.spec.ts +35 -56
- package/template/tests/e2e/review.spec.ts +56 -85
- package/template/tests/e2e/visual.spec.ts +14 -12
- package/template/tests/evals/albums.eval.ts +12 -9
- package/template/tests/evals/carousel.eval.ts +4 -14
- package/template/tests/evals/eval.config.ts +6 -6
- package/template/tests/evals/map.eval.ts +2 -9
- package/template/tests/evals/review.eval.ts +27 -22
- package/dist/inspector-D5DckQuU.js.map +0 -1
- package/dist/inspector-jY9O18z9.cjs.map +0 -1
- /package/template/tests/evals/{.env.example → _env.example} +0 -0
|
@@ -1,12 +1,38 @@
|
|
|
1
1
|
import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
|
2
2
|
import { execSync } from 'child_process';
|
|
3
3
|
import { join, dirname } from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
4
5
|
import * as p from '@clack/prompts';
|
|
6
|
+
import { EVAL_PROVIDERS, generateModelLines } from '../lib/eval/eval-providers.mjs';
|
|
7
|
+
import { detectPackageManager } from '../utils.mjs';
|
|
8
|
+
|
|
9
|
+
/**
 * Read the current sunpeak package version for pinning in scaffolded configs.
 *
 * @param {string} [pkgPath] - Path of the package.json to read. When omitted,
 *   the package.json two directories above this module is used.
 * @returns {string} `^<version>` when the manifest has a non-empty string
 *   `version`; otherwise `'latest'`.
 */
function getSunpeakVersion(pkgPath) {
  try {
    // The default path is computed inside the try so a failure to resolve it
    // falls back to 'latest' exactly like a failed read or parse does.
    const manifestPath =
      pkgPath ?? join(dirname(fileURLToPath(import.meta.url)), '..', '..', 'package.json');
    const { version } = JSON.parse(readFileSync(manifestPath, 'utf-8'));
    // Only pin to a real version string; any other truthy value would be
    // interpolated into a bogus range such as "^[object Object]".
    return typeof version === 'string' && version !== '' ? `^${version}` : 'latest';
  } catch {
    // Best effort: an unreadable or malformed manifest must not break scaffolding.
    return 'latest';
  }
}
|
|
5
20
|
|
|
6
21
|
/**
|
|
7
22
|
* Default dependencies (real implementations).
|
|
8
23
|
* Override in tests via the `deps` parameter.
|
|
9
24
|
*/
|
|
25
|
+
/**
 * Default provider picker: shows a multiselect prompt built from EVAL_PROVIDERS.
 *
 * @returns {Promise<Array>} The value returned by the prompt, or an empty
 *   array when the prompt was cancelled.
 */
async function defaultSelectProviders() {
  // Each option carries the whole provider entry as its value.
  const providerOptions = EVAL_PROVIDERS.map((provider) => {
    return { value: provider, label: provider.label };
  });

  const answer = await p.multiselect({
    message: 'AI providers for evals (space to toggle, enter to skip)',
    options: providerOptions,
    initialValues: [],
    required: false,
  });

  // A cancelled prompt is reported as "nothing selected".
  return p.isCancel(answer) ? [] : answer;
}
|
|
35
|
+
|
|
10
36
|
export const defaultDeps = {
|
|
11
37
|
existsSync,
|
|
12
38
|
readFileSync,
|
|
@@ -21,6 +47,9 @@ export const defaultDeps = {
|
|
|
21
47
|
select: p.select,
|
|
22
48
|
text: p.text,
|
|
23
49
|
log: p.log,
|
|
50
|
+
password: p.password,
|
|
51
|
+
selectProviders: defaultSelectProviders,
|
|
52
|
+
detectPackageManager,
|
|
24
53
|
};
|
|
25
54
|
|
|
26
55
|
/**
|
|
@@ -33,7 +62,7 @@ export const defaultDeps = {
|
|
|
33
62
|
*
|
|
34
63
|
* Scaffolds all 5 test types:
|
|
35
64
|
* 1. E2E tests — Playwright-based inspector tests (mcp fixture)
|
|
36
|
-
* 2. Visual regression — Screenshot comparison via
|
|
65
|
+
* 2. Visual regression — Screenshot comparison via result.screenshot()
|
|
37
66
|
* 3. Live tests — Test against real ChatGPT/Claude hosts
|
|
38
67
|
* 4. Evals — Multi-model tool calling reliability tests
|
|
39
68
|
* 5. Unit tests — Direct tool handler tests (JS/TS projects only)
|
|
@@ -60,6 +89,61 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
60
89
|
await initExternalProject(cliServer, d);
|
|
61
90
|
}
|
|
62
91
|
|
|
92
|
+
// Offer to configure eval providers
|
|
93
|
+
const providers = await d.selectProviders();
|
|
94
|
+
if (!d.isCancel(providers) && providers.length > 0) {
|
|
95
|
+
const pm = d.detectPackageManager();
|
|
96
|
+
const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
|
|
97
|
+
const installCmd = `${pm} add -D ${pkgsToInstall.join(' ')}`;
|
|
98
|
+
try {
|
|
99
|
+
d.execSync(installCmd, { cwd: d.cwd(), stdio: 'inherit' });
|
|
100
|
+
} catch {
|
|
101
|
+
d.log.info(`Provider install failed. Install manually: ${installCmd}`);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Uncomment selected models in eval.config.ts
|
|
105
|
+
const evalDir = d.existsSync(join(d.cwd(), 'tests', 'evals'))
|
|
106
|
+
? join(d.cwd(), 'tests', 'evals')
|
|
107
|
+
: d.existsSync(join(d.cwd(), 'tests', 'sunpeak', 'evals'))
|
|
108
|
+
? join(d.cwd(), 'tests', 'sunpeak', 'evals')
|
|
109
|
+
: null;
|
|
110
|
+
if (evalDir) {
|
|
111
|
+
const configPath = join(evalDir, 'eval.config.ts');
|
|
112
|
+
if (d.existsSync(configPath)) {
|
|
113
|
+
let config = d.readFileSync(configPath, 'utf-8');
|
|
114
|
+
for (const prov of providers) {
|
|
115
|
+
for (const model of prov.models) {
|
|
116
|
+
config = config.replace(
|
|
117
|
+
new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
|
|
118
|
+
'$1$2'
|
|
119
|
+
);
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
d.writeFileSync(configPath, config);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
// Prompt for API keys and write .env
|
|
126
|
+
const envLines = [];
|
|
127
|
+
const seen = new Set();
|
|
128
|
+
for (const prov of providers) {
|
|
129
|
+
if (seen.has(prov.envVar)) continue;
|
|
130
|
+
seen.add(prov.envVar);
|
|
131
|
+
const key = await d.password({
|
|
132
|
+
message: `${prov.envVar} (enter to skip)`,
|
|
133
|
+
mask: '*',
|
|
134
|
+
});
|
|
135
|
+
if (!d.isCancel(key) && key) {
|
|
136
|
+
envLines.push(`${prov.envVar}=${key}`);
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
if (envLines.length > 0 && evalDir) {
|
|
140
|
+
const relEnvPath = evalDir.startsWith(d.cwd()) ? evalDir.slice(d.cwd().length + 1) : evalDir;
|
|
141
|
+
d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
|
|
142
|
+
d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
63
147
|
// Offer to install the testing skill
|
|
64
148
|
const installSkill = await d.confirm({
|
|
65
149
|
message: 'Install the test-mcp-server skill? (helps your coding agent write tests)',
|
|
@@ -67,12 +151,12 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
67
151
|
});
|
|
68
152
|
if (!d.isCancel(installSkill) && installSkill) {
|
|
69
153
|
try {
|
|
70
|
-
d.execSync('
|
|
154
|
+
d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
|
|
71
155
|
cwd: d.cwd(),
|
|
72
156
|
stdio: 'inherit',
|
|
73
157
|
});
|
|
74
158
|
} catch {
|
|
75
|
-
d.log.info('Skill install skipped. Install later:
|
|
159
|
+
d.log.info('Skill install skipped. Install later: pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server');
|
|
76
160
|
}
|
|
77
161
|
}
|
|
78
162
|
|
|
@@ -141,11 +225,32 @@ async function getServerConfig(cliServer, d) {
|
|
|
141
225
|
|
|
142
226
|
function generateServerConfigBlock(server, relativeTo = '.') {
|
|
143
227
|
if (server.type === 'later') {
|
|
144
|
-
return ` // TODO: Configure your MCP server connection
|
|
228
|
+
return ` // TODO: Configure your MCP server connection before running tests.
|
|
229
|
+
// Uncomment one of the options below:
|
|
230
|
+
//
|
|
231
|
+
// HTTP server (Python FastAPI, Go, etc.):
|
|
232
|
+
// server: { url: 'http://localhost:8000/mcp' },
|
|
233
|
+
//
|
|
234
|
+
// Python (uv):
|
|
235
|
+
// server: { command: 'uv', args: ['run', 'python', 'server.py'] },
|
|
236
|
+
//
|
|
237
|
+
// Python (venv):
|
|
238
|
+
// server: { command: '.venv/bin/python', args: ['server.py'] },
|
|
239
|
+
//
|
|
240
|
+
// Go:
|
|
241
|
+
// server: { command: 'go', args: ['run', './cmd/server'] },
|
|
242
|
+
//
|
|
243
|
+
// Node.js:
|
|
244
|
+
// server: { command: 'node', args: ['server.js'] },
|
|
245
|
+
//
|
|
246
|
+
// Optional server options:
|
|
145
247
|
// server: {
|
|
146
|
-
// command: 'python',
|
|
147
|
-
//
|
|
148
|
-
//
|
|
248
|
+
// command: 'python', args: ['server.py'],
|
|
249
|
+
// env: { API_KEY: 'test-key' }, // Extra environment variables
|
|
250
|
+
// cwd: './backend', // Working directory
|
|
251
|
+
// },
|
|
252
|
+
//
|
|
253
|
+
// timeout: 120_000, // Server startup timeout in ms (default: 60s)`;
|
|
149
254
|
}
|
|
150
255
|
if (server.type === 'url') {
|
|
151
256
|
return ` server: {
|
|
@@ -205,15 +310,11 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
205
310
|
"",
|
|
206
311
|
" models: [",
|
|
207
312
|
" // Uncomment models and install their provider packages:",
|
|
208
|
-
|
|
209
|
-
" // 'gpt-4o-mini', // OPENAI_API_KEY",
|
|
210
|
-
" // 'o4-mini', // OPENAI_API_KEY",
|
|
211
|
-
" // 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY",
|
|
212
|
-
" // 'gemini-2.0-flash', // GOOGLE_GENERATIVE_AI_API_KEY",
|
|
313
|
+
...generateModelLines(),
|
|
213
314
|
" ],",
|
|
214
315
|
"",
|
|
215
316
|
" defaults: {",
|
|
216
|
-
" runs:
|
|
317
|
+
" runs: 5, // Number of times to run each case per model",
|
|
217
318
|
" maxSteps: 1, // Max tool call steps per run",
|
|
218
319
|
" temperature: 0, // 0 for most deterministic results",
|
|
219
320
|
" timeout: 30_000, // Timeout per run in ms",
|
|
@@ -244,7 +345,7 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
244
345
|
*
|
|
245
346
|
* To get started:
|
|
246
347
|
* 1. Configure models in eval.config.ts (uncomment the ones you want)
|
|
247
|
-
* 2. Install the AI SDK and provider packages
|
|
348
|
+
* 2. Install the AI SDK and provider packages (e.g. pnpm add ai @ai-sdk/openai)
|
|
248
349
|
* 3. Copy .env.example to .env and add your API keys
|
|
249
350
|
* 4. Replace this file with evals for your own tools
|
|
250
351
|
* 5. Run: sunpeak test --eval
|
|
@@ -302,31 +403,31 @@ function scaffoldVisualTest(filePath, d) {
|
|
|
302
403
|
* Uncomment the tests below and replace 'your-tool' with your tool name.
|
|
303
404
|
*/
|
|
304
405
|
|
|
305
|
-
// test('tool renders correctly in light mode', async ({
|
|
306
|
-
// const result = await
|
|
406
|
+
// test('tool renders correctly in light mode', async ({ inspector }) => {
|
|
407
|
+
// const result = await inspector.renderTool('your-tool', { key: 'value' }, { theme: 'light' });
|
|
307
408
|
// expect(result).not.toBeError();
|
|
308
409
|
//
|
|
309
410
|
// // Wait for UI to render, then screenshot:
|
|
310
411
|
// // const app = result.app();
|
|
311
412
|
// // await expect(app.getByText('Expected text')).toBeVisible();
|
|
312
|
-
// // await
|
|
413
|
+
// // await result.screenshot('tool-light');
|
|
313
414
|
// });
|
|
314
415
|
|
|
315
|
-
// test('tool renders correctly in dark mode', async ({
|
|
316
|
-
// const result = await
|
|
416
|
+
// test('tool renders correctly in dark mode', async ({ inspector }) => {
|
|
417
|
+
// const result = await inspector.renderTool('your-tool', { key: 'value' }, { theme: 'dark' });
|
|
317
418
|
// expect(result).not.toBeError();
|
|
318
419
|
//
|
|
319
420
|
// // const app = result.app();
|
|
320
421
|
// // await expect(app.getByText('Expected text')).toBeVisible();
|
|
321
|
-
// // await
|
|
422
|
+
// // await result.screenshot('tool-dark');
|
|
322
423
|
// });
|
|
323
424
|
|
|
324
425
|
// Full-page screenshot (captures the inspector chrome too):
|
|
325
|
-
// test('full page renders correctly', async ({
|
|
326
|
-
// const result = await
|
|
426
|
+
// test('full page renders correctly', async ({ inspector }) => {
|
|
427
|
+
// const result = await inspector.renderTool('your-tool', {}, { theme: 'light' });
|
|
327
428
|
// const app = result.app();
|
|
328
429
|
// await expect(app.getByText('Expected text')).toBeVisible();
|
|
329
|
-
// await
|
|
430
|
+
// await result.screenshot('tool-page', { target: 'page', maxDiffPixelRatio: 0.02 });
|
|
330
431
|
// });
|
|
331
432
|
`
|
|
332
433
|
);
|
|
@@ -489,7 +590,8 @@ async function initExternalProject(cliServer, d) {
|
|
|
489
590
|
private: true,
|
|
490
591
|
type: 'module',
|
|
491
592
|
devDependencies: {
|
|
492
|
-
|
|
593
|
+
'@types/node': 'latest',
|
|
594
|
+
sunpeak: getSunpeakVersion(),
|
|
493
595
|
'@playwright/test': 'latest',
|
|
494
596
|
},
|
|
495
597
|
scripts: {
|
|
@@ -531,24 +633,28 @@ ${serverBlock}
|
|
|
531
633
|
) + '\n'
|
|
532
634
|
);
|
|
533
635
|
|
|
534
|
-
// 1. E2E test — smoke test, verifies the server
|
|
636
|
+
// 1. E2E test — smoke test, verifies the server exposes tools
|
|
535
637
|
d.writeFileSync(
|
|
536
638
|
join(testDir, 'smoke.test.ts'),
|
|
537
639
|
`import { test, expect } from 'sunpeak/test';
|
|
538
640
|
|
|
539
|
-
test('server
|
|
540
|
-
|
|
541
|
-
|
|
641
|
+
test('server exposes tools', async ({ mcp }) => {
|
|
642
|
+
const tools = await mcp.listTools();
|
|
643
|
+
expect(tools.length).toBeGreaterThan(0);
|
|
542
644
|
});
|
|
543
645
|
|
|
544
|
-
//
|
|
545
|
-
// test('my tool
|
|
646
|
+
// Protocol-level test (no UI rendering):
|
|
647
|
+
// test('my tool returns data', async ({ mcp }) => {
|
|
546
648
|
// const result = await mcp.callTool('your-tool', { key: 'value' });
|
|
649
|
+
// expect(result.isError).toBeFalsy();
|
|
650
|
+
// });
|
|
651
|
+
|
|
652
|
+
// UI rendering test:
|
|
653
|
+
// test('my tool renders correctly', async ({ inspector }) => {
|
|
654
|
+
// const result = await inspector.renderTool('your-tool', { key: 'value' });
|
|
547
655
|
// expect(result).not.toBeError();
|
|
548
|
-
//
|
|
549
|
-
//
|
|
550
|
-
// // const app = result.app();
|
|
551
|
-
// // await expect(app.getByText('Hello')).toBeVisible();
|
|
656
|
+
// const app = result.app();
|
|
657
|
+
// await expect(app.getByText('Hello')).toBeVisible();
|
|
552
658
|
// });
|
|
553
659
|
`
|
|
554
660
|
);
|
|
@@ -563,15 +669,31 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
|
|
|
563
669
|
scaffoldEvals(join(testDir, 'evals'), { server, d });
|
|
564
670
|
|
|
565
671
|
d.log.success('Created tests/sunpeak/ with all test types.');
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
d.
|
|
572
|
-
d.log.
|
|
573
|
-
|
|
574
|
-
|
|
672
|
+
if (server.type === 'later') {
|
|
673
|
+
d.log.warn('Server not configured. Edit tests/sunpeak/playwright.config.ts before running tests.');
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
// Auto-install dependencies so users can run tests immediately
|
|
677
|
+
const pm = d.detectPackageManager();
|
|
678
|
+
d.log.step('Installing dependencies...');
|
|
679
|
+
try {
|
|
680
|
+
d.execSync(`${pm} install`, { cwd: testDir, stdio: 'inherit' });
|
|
681
|
+
} catch {
|
|
682
|
+
d.log.warn(`Dependency install failed. Run manually: cd tests/sunpeak && ${pm} install`);
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
d.log.step('Installing Playwright browser...');
|
|
686
|
+
try {
|
|
687
|
+
d.execSync(`${pm} exec playwright install chromium`, { cwd: testDir, stdio: 'inherit' });
|
|
688
|
+
} catch {
|
|
689
|
+
d.log.warn(`Browser install failed. Run manually: cd tests/sunpeak && ${pm} exec playwright install chromium`);
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
d.log.step('Ready! Run tests with:');
|
|
693
|
+
d.log.message(' sunpeak test # E2E tests');
|
|
694
|
+
d.log.message(' sunpeak test --visual # Visual regression (generates baselines on first run)');
|
|
695
|
+
d.log.message(' sunpeak test --live # Live tests against real hosts (requires login)');
|
|
696
|
+
d.log.message(' sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)');
|
|
575
697
|
}
|
|
576
698
|
|
|
577
699
|
async function initJsProject(cliServer, d) {
|
|
@@ -608,18 +730,23 @@ ${serverBlock}
|
|
|
608
730
|
testPath,
|
|
609
731
|
`import { test, expect } from 'sunpeak/test';
|
|
610
732
|
|
|
611
|
-
test('server
|
|
612
|
-
await
|
|
733
|
+
test('server exposes tools', async ({ mcp }) => {
|
|
734
|
+
const tools = await mcp.listTools();
|
|
735
|
+
expect(tools.length).toBeGreaterThan(0);
|
|
613
736
|
});
|
|
614
737
|
|
|
615
|
-
//
|
|
616
|
-
// test('my tool
|
|
738
|
+
// Protocol-level test (no UI rendering):
|
|
739
|
+
// test('my tool returns data', async ({ mcp }) => {
|
|
617
740
|
// const result = await mcp.callTool('your-tool', { key: 'value' });
|
|
741
|
+
// expect(result.isError).toBeFalsy();
|
|
742
|
+
// });
|
|
743
|
+
|
|
744
|
+
// UI rendering test:
|
|
745
|
+
// test('my tool renders correctly', async ({ inspector }) => {
|
|
746
|
+
// const result = await inspector.renderTool('your-tool', { key: 'value' });
|
|
618
747
|
// expect(result).not.toBeError();
|
|
619
|
-
//
|
|
620
|
-
//
|
|
621
|
-
// // const app = result.app();
|
|
622
|
-
// // await expect(app.getByText('Hello')).toBeVisible();
|
|
748
|
+
// const app = result.app();
|
|
749
|
+
// await expect(app.getByText('Hello')).toBeVisible();
|
|
623
750
|
// });
|
|
624
751
|
`
|
|
625
752
|
);
|
|
@@ -638,15 +765,19 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
|
|
|
638
765
|
// 5. Unit test
|
|
639
766
|
scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
|
|
640
767
|
|
|
768
|
+
if (server.type === 'later') {
|
|
769
|
+
d.log.warn('Server not configured. Edit playwright.config.ts before running tests.');
|
|
770
|
+
}
|
|
771
|
+
const pkgMgr = d.detectPackageManager();
|
|
641
772
|
d.log.step('Next steps:');
|
|
642
|
-
d.log.message(
|
|
643
|
-
d.log.message(
|
|
773
|
+
d.log.message(` ${pkgMgr} add -D sunpeak @playwright/test vitest`);
|
|
774
|
+
d.log.message(` ${pkgMgr} exec playwright install chromium`);
|
|
644
775
|
d.log.message('');
|
|
645
|
-
d.log.message('
|
|
646
|
-
d.log.message('
|
|
647
|
-
d.log.message('
|
|
648
|
-
d.log.message('
|
|
649
|
-
d.log.message('
|
|
776
|
+
d.log.message(' sunpeak test # E2E tests');
|
|
777
|
+
d.log.message(' sunpeak test --unit # Unit tests (vitest)');
|
|
778
|
+
d.log.message(' sunpeak test --visual # Visual regression');
|
|
779
|
+
d.log.message(' sunpeak test --live # Live tests against real hosts');
|
|
780
|
+
d.log.message(' sunpeak test --eval # Multi-model evals');
|
|
650
781
|
}
|
|
651
782
|
|
|
652
783
|
async function initSunpeakProject(d) {
|
|
@@ -702,6 +833,6 @@ export default defineConfig();
|
|
|
702
833
|
d.log.message(' Replace: import { test, expect } from "@playwright/test"');
|
|
703
834
|
d.log.message(' With: import { test, expect } from "sunpeak/test"');
|
|
704
835
|
d.log.message('');
|
|
705
|
-
d.log.message(' Use the `mcp`
|
|
836
|
+
d.log.message(' Use the `mcp` and `inspector` fixtures instead of raw page navigation.');
|
|
706
837
|
d.log.message(' See sunpeak docs for migration examples.');
|
|
707
838
|
}
|
package/bin/commands/test.mjs
CHANGED
|
@@ -70,6 +70,9 @@ export async function runTest(args) {
|
|
|
70
70
|
'playwright.config.js',
|
|
71
71
|
'sunpeak.config.ts',
|
|
72
72
|
'sunpeak.config.js',
|
|
73
|
+
// Fallback for non-JS projects: tests/sunpeak/ self-contained directory
|
|
74
|
+
'tests/sunpeak/playwright.config.ts',
|
|
75
|
+
'tests/sunpeak/playwright.config.js',
|
|
73
76
|
],
|
|
74
77
|
visual: isVisual,
|
|
75
78
|
updateSnapshots: isVisual && isUpdate,
|
|
@@ -82,6 +85,9 @@ export async function runTest(args) {
|
|
|
82
85
|
configCandidates: [
|
|
83
86
|
'tests/live/playwright.config.ts',
|
|
84
87
|
'tests/live/playwright.config.js',
|
|
88
|
+
// Fallback for non-JS projects: tests/sunpeak/ self-contained directory
|
|
89
|
+
'tests/sunpeak/live/playwright.config.ts',
|
|
90
|
+
'tests/sunpeak/live/playwright.config.js',
|
|
85
91
|
],
|
|
86
92
|
configRequired: true,
|
|
87
93
|
configErrorMessage: 'No live test config found at tests/live/playwright.config.ts',
|
|
@@ -310,14 +316,43 @@ function validateApiKeys(models) {
|
|
|
310
316
|
return warnings;
|
|
311
317
|
}
|
|
312
318
|
|
|
319
|
+
/**
 * Check that required AI SDK provider packages are installed for the configured models.
 *
 * Packages are resolved relative to the package.json in the current working
 * directory. The core `ai` package is always checked; each provider package
 * returned by getProviderForModel is checked once.
 *
 * @param {string[]} models - Model IDs from the eval config.
 * @returns {Promise<Array<{ pkg: string, reason: string }>>} One entry per
 *   missing package; `reason` is 'core AI SDK' for `ai`, otherwise the first
 *   model ID that needed the package.
 */
async function checkProviderPackages(models) {
  const { createRequire } = await import('module');
  // Resolve from the project's node_modules, not the global CLI install
  const projectRequire = createRequire(join(process.cwd(), 'package.json'));

  // True when the package can be located from the project directory.
  const isInstalled = (pkg) => {
    try {
      projectRequire.resolve(pkg);
      return true;
    } catch (err) {
      // ERR_PACKAGE_PATH_NOT_EXPORTED means the package directory was found but
      // exposes no require-compatible entry point: it is installed, not missing.
      return err?.code === 'ERR_PACKAGE_PATH_NOT_EXPORTED';
    }
  };

  const missing = [];
  if (!isInstalled('ai')) {
    missing.push({ pkg: 'ai', reason: 'core AI SDK' });
  }

  // Several models can map to the same provider package; report each package once.
  const checked = new Set();
  for (const modelId of models ?? []) {
    const pkg = getProviderForModel(modelId);
    if (!pkg || checked.has(pkg)) continue;
    checked.add(pkg);
    if (!isInstalled(pkg)) {
      missing.push({ pkg, reason: modelId });
    }
  }

  return missing;
}
|
|
350
|
+
|
|
313
351
|
/**
|
|
314
352
|
* Run eval tests via vitest with the eval plugin.
|
|
315
353
|
*/
|
|
316
354
|
async function runEvals(args) {
|
|
317
|
-
const {
|
|
318
|
-
const { fileURLToPath } = await import('url');
|
|
319
|
-
|
|
320
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
355
|
+
const { resolve, basename } = await import('path');
|
|
321
356
|
const evalDir = findEvalDir();
|
|
322
357
|
|
|
323
358
|
if (!evalDir) {
|
|
@@ -365,8 +400,19 @@ async function runEvals(args) {
|
|
|
365
400
|
}
|
|
366
401
|
}
|
|
367
402
|
|
|
368
|
-
//
|
|
403
|
+
// Check for missing provider packages and API keys
|
|
369
404
|
if (configModels && configModels.length > 0) {
|
|
405
|
+
const missingPkgs = await checkProviderPackages(configModels);
|
|
406
|
+
if (missingPkgs.length > 0) {
|
|
407
|
+
console.error('\nMissing required packages for eval models:\n');
|
|
408
|
+
for (const { pkg, reason } of missingPkgs) {
|
|
409
|
+
console.error(` ${pkg} (needed for ${reason})`);
|
|
410
|
+
}
|
|
411
|
+
const installCmd = missingPkgs.map((m) => m.pkg).join(' ');
|
|
412
|
+
console.error(`\nInstall with:\n\n pnpm add -D ${installCmd}\n`);
|
|
413
|
+
return 1;
|
|
414
|
+
}
|
|
415
|
+
|
|
370
416
|
const warnings = validateApiKeys(configModels);
|
|
371
417
|
if (warnings.length > 0) {
|
|
372
418
|
console.log('');
|
|
@@ -416,8 +462,10 @@ async function runEvals(args) {
|
|
|
416
462
|
// Use .ts extension so vitest handles TypeScript imports natively
|
|
417
463
|
const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');
|
|
418
464
|
|
|
419
|
-
|
|
420
|
-
|
|
465
|
+
// Use package exports so vitest resolves from the project's node_modules,
|
|
466
|
+
// not the global CLI install. This ensures import('ai') finds project-local deps.
|
|
467
|
+
const evalPluginImport = 'sunpeak/eval/plugin';
|
|
468
|
+
const evalReporterImport = 'sunpeak/eval/reporter';
|
|
421
469
|
|
|
422
470
|
// Clean up dev server and temp config
|
|
423
471
|
const cleanupResources = () => {
|
|
@@ -442,17 +490,19 @@ async function runEvals(args) {
|
|
|
442
490
|
? './' + basename(configFile)
|
|
443
491
|
: null;
|
|
444
492
|
|
|
445
|
-
// Generate a vitest config that loads the eval config and plugin
|
|
493
|
+
// Generate a vitest config that loads the eval config and plugin.
|
|
494
|
+
// Imports use package names (sunpeak/eval/plugin) so vitest resolves from
|
|
495
|
+
// the project's node_modules, ensuring import('ai') finds project-local deps.
|
|
446
496
|
const vitestConfig = `
|
|
447
497
|
import { defineConfig } from 'vitest/config';
|
|
448
|
-
import { evalVitestPlugin } from ${
|
|
498
|
+
import { evalVitestPlugin } from '${evalPluginImport}';
|
|
449
499
|
${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}
|
|
450
500
|
|
|
451
501
|
export default defineConfig({
|
|
452
502
|
test: {
|
|
453
503
|
globals: true,
|
|
454
504
|
include: ['**/*.eval.ts', '**/*.eval.js'],
|
|
455
|
-
reporters: ['default', ${
|
|
505
|
+
reporters: ['default', '${evalReporterImport}'],
|
|
456
506
|
testTimeout: 600000,
|
|
457
507
|
},
|
|
458
508
|
plugins: [
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared eval provider definitions used by both `sunpeak new` and `sunpeak test init`.
|
|
3
|
+
* Single source of truth for provider packages, model IDs, and CLI labels.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
export const EVAL_PROVIDERS = [
|
|
7
|
+
{ pkg: '@ai-sdk/openai', models: ['gpt-4o', 'gpt-4o-mini'], label: 'OpenAI (gpt-4o, gpt-4o-mini)', envVar: 'OPENAI_API_KEY' },
|
|
8
|
+
{ pkg: '@ai-sdk/anthropic', models: ['claude-sonnet-4-20250514'], label: 'Anthropic (claude-sonnet-4)', envVar: 'ANTHROPIC_API_KEY' },
|
|
9
|
+
{ pkg: '@ai-sdk/google', models: ['gemini-2.0-flash'], label: 'Google (gemini-2.0-flash)', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
|
|
10
|
+
];
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* All model lines that appear in eval.config.ts, in order.
|
|
14
|
+
* Used by scaffoldEvals to generate the config and by the uncomment logic.
|
|
15
|
+
*/
|
|
16
|
+
export const EVAL_CONFIG_MODELS = [
|
|
17
|
+
{ id: 'gpt-4o', envVar: 'OPENAI_API_KEY' },
|
|
18
|
+
{ id: 'gpt-4o-mini', envVar: 'OPENAI_API_KEY' },
|
|
19
|
+
{ id: 'o4-mini', envVar: 'OPENAI_API_KEY' },
|
|
20
|
+
{ id: 'claude-sonnet-4-20250514', envVar: 'ANTHROPIC_API_KEY' },
|
|
21
|
+
{ id: 'gemini-2.0-flash', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
|
|
22
|
+
];
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* Generate the models section lines for eval.config.ts.
|
|
26
|
+
* @returns {string[]}
|
|
27
|
+
*/
|
|
28
|
+
export function generateModelLines() {
|
|
29
|
+
const maxIdLen = Math.max(...EVAL_CONFIG_MODELS.map((m) => m.id.length));
|
|
30
|
+
return EVAL_CONFIG_MODELS.map((m) => {
|
|
31
|
+
const padded = `'${m.id}',`.padEnd(maxIdLen + 3);
|
|
32
|
+
return ` // ${padded} // ${m.envVar}`;
|
|
33
|
+
});
|
|
34
|
+
}
|
|
@@ -8,6 +8,43 @@ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
|
|
|
8
8
|
// Re-export for use in generated test code
|
|
9
9
|
export { checkAiSdkInstalled };
|
|
10
10
|
|
|
11
|
+
/**
 * Strip AI SDK retry wrapper from error messages for cleaner output.
 * "Failed after 3 attempts. Last error: <actual error>" → "<actual error>"
 *
 * Non-string input is converted with String() first (null/undefined become
 * an empty string) so the function never throws on an unexpected value.
 *
 * @param {string} message
 * @returns {string}
 */
export function cleanErrorMessage(message) {
  const text = typeof message === 'string' ? message : String(message ?? '');
  // Anchored at the start: only a leading retry wrapper is removed.
  return text.replace(/^Failed after \d+ attempts?\. Last error: /i, '');
}
|
|
20
|
+
|
|
21
|
+
/**
 * Lower-case substrings that mark an API error as fatal (quota/billing,
 * credential, permission or account problems). Built once at module load
 * instead of on every call.
 */
const FATAL_API_ERROR_PATTERNS = Object.freeze([
  'exceeded your current quota',
  'credit balance is too low',
  'insufficient_quota',
  'billing_hard_limit_reached',
  'check your plan and billing details',
  'add a payment method',
  'invalid api key',
  'invalid_api_key',
  'incorrect api key',
  'unauthorized',
  'permission denied',
  'access denied',
  'authentication failed',
  'account deactivated',
  'account suspended',
]);

/**
 * Check if an error message indicates a fatal API error that won't resolve on retry.
 *
 * Matching is a case-insensitive substring test against the fatal pattern
 * list. Non-string input is converted with String() first (null/undefined
 * become an empty string), so the function never throws.
 *
 * @param {string} message
 * @returns {boolean}
 */
export function isFatalApiError(message) {
  const lower = (typeof message === 'string' ? message : String(message ?? '')).toLowerCase();
  return FATAL_API_ERROR_PATTERNS.some((pattern) => lower.includes(pattern));
}
|
|
47
|
+
|
|
11
48
|
/**
|
|
12
49
|
* Define an eval spec. Identity function for type safety.
|
|
13
50
|
* @param {import('./eval-types.d.mts').EvalSpec} spec
|
|
@@ -112,6 +149,7 @@ export async function runSingleEval({ prompt, model, tools, maxSteps, temperatur
|
|
|
112
149
|
prompt,
|
|
113
150
|
maxSteps,
|
|
114
151
|
temperature,
|
|
152
|
+
maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
|
|
115
153
|
abortSignal: AbortSignal.timeout(timeout),
|
|
116
154
|
});
|
|
117
155
|
|
|
@@ -268,10 +306,15 @@ export async function runEvalCaseAggregate({
|
|
|
268
306
|
const model = await resolveModel(modelId);
|
|
269
307
|
let passed = 0;
|
|
270
308
|
let failed = 0;
|
|
309
|
+
let executedRuns = 0;
|
|
271
310
|
let totalDurationMs = 0;
|
|
272
311
|
const failureMap = new Map();
|
|
273
312
|
|
|
274
313
|
for (let i = 0; i < runs; i++) {
|
|
314
|
+
// Small delay between runs to avoid rate limits (skip before first run)
|
|
315
|
+
if (i > 0) await new Promise((r) => setTimeout(r, 1000));
|
|
316
|
+
|
|
317
|
+
executedRuns++;
|
|
275
318
|
const start = performance.now();
|
|
276
319
|
try {
|
|
277
320
|
const result = await runSingleEval({
|
|
@@ -286,8 +329,17 @@ export async function runEvalCaseAggregate({
|
|
|
286
329
|
passed++;
|
|
287
330
|
} catch (err) {
|
|
288
331
|
failed++;
|
|
289
|
-
const msg = err.message || String(err);
|
|
332
|
+
const msg = cleanErrorMessage(err.message || String(err));
|
|
290
333
|
failureMap.set(msg, (failureMap.get(msg) || 0) + 1);
|
|
334
|
+
|
|
335
|
+
if (isFatalApiError(msg)) {
|
|
336
|
+
// Count remaining runs as failed and stop early
|
|
337
|
+
const remaining = runs - i - 1;
|
|
338
|
+
failed += remaining;
|
|
339
|
+
failureMap.set(msg, (failureMap.get(msg) || 0) + remaining);
|
|
340
|
+
totalDurationMs += performance.now() - start;
|
|
341
|
+
break;
|
|
342
|
+
}
|
|
291
343
|
}
|
|
292
344
|
totalDurationMs += performance.now() - start;
|
|
293
345
|
}
|
|
@@ -304,7 +356,7 @@ export async function runEvalCaseAggregate({
|
|
|
304
356
|
passed,
|
|
305
357
|
failed,
|
|
306
358
|
passRate: runs > 0 ? passed / runs : 0,
|
|
307
|
-
avgDurationMs:
|
|
359
|
+
avgDurationMs: executedRuns > 0 ? totalDurationMs / executedRuns : 0,
|
|
308
360
|
failures,
|
|
309
361
|
};
|
|
310
362
|
}
|