sunpeak 0.19.10 → 0.19.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/bin/commands/new.mjs +85 -7
- package/bin/commands/test-init.mjs +92 -22
- package/bin/commands/test.mjs +54 -10
- package/bin/lib/eval/eval-providers.mjs +34 -0
- package/bin/lib/eval/eval-runner.mjs +54 -2
- package/bin/lib/eval/eval-vitest-plugin.mjs +3 -14
- package/bin/lib/eval/model-registry.mjs +2 -2
- package/bin/lib/inspect/inspect-config.mjs +1 -1
- package/bin/lib/test/test-config.mjs +1 -1
- package/package.json +11 -5
- package/template/README.md +1 -1
- package/template/dist/albums/albums.json +1 -1
- package/template/dist/carousel/carousel.json +1 -1
- package/template/dist/map/map.json +1 -1
- package/template/dist/review/review.json +1 -1
- package/template/node_modules/.vite/deps/_metadata.json +3 -3
- package/template/node_modules/.vite-mcp/deps/_metadata.json +19 -19
- package/template/tests/evals/albums.eval.ts +12 -9
- package/template/tests/evals/carousel.eval.ts +4 -14
- package/template/tests/evals/eval.config.ts +6 -6
- package/template/tests/evals/map.eval.ts +2 -9
- package/template/tests/evals/review.eval.ts +27 -22
- /package/template/tests/evals/{.env.example → _env.example} +0 -0
package/README.md
CHANGED
|
@@ -132,7 +132,7 @@ sunpeak new
|
|
|
132
132
|
Install the sunpeak skills to give your coding agent (Claude Code, Cursor, etc.) built-in knowledge of sunpeak patterns, hooks, and testing:
|
|
133
133
|
|
|
134
134
|
```bash
|
|
135
|
-
|
|
135
|
+
pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
|
|
136
136
|
```
|
|
137
137
|
|
|
138
138
|
## Troubleshooting
|
package/bin/commands/new.mjs
CHANGED
|
@@ -9,6 +9,7 @@ const execAsync = promisify(exec);
|
|
|
9
9
|
import * as clack from '@clack/prompts';
|
|
10
10
|
import { discoverResources } from '../lib/patterns.mjs';
|
|
11
11
|
import { detectPackageManager } from '../utils.mjs';
|
|
12
|
+
import { EVAL_PROVIDERS } from '../lib/eval/eval-providers.mjs';
|
|
12
13
|
|
|
13
14
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
14
15
|
|
|
@@ -57,6 +58,21 @@ async function defaultSelectResources(availableResources) {
|
|
|
57
58
|
return selected;
|
|
58
59
|
}
|
|
59
60
|
|
|
61
|
+
/**
|
|
62
|
+
* Default prompt for eval provider selection.
|
|
63
|
+
* @returns {Promise<Array<{ pkg: string, models: string[] }>>}
|
|
64
|
+
*/
|
|
65
|
+
async function defaultSelectProviders() {
|
|
66
|
+
const selected = await clack.multiselect({
|
|
67
|
+
message: 'AI providers for evals (space to toggle, enter to skip)',
|
|
68
|
+
options: EVAL_PROVIDERS.map((p) => ({ value: p, label: p.label })),
|
|
69
|
+
initialValues: [],
|
|
70
|
+
required: false,
|
|
71
|
+
});
|
|
72
|
+
if (clack.isCancel(selected)) return [];
|
|
73
|
+
return selected;
|
|
74
|
+
}
|
|
75
|
+
|
|
60
76
|
/**
|
|
61
77
|
* Default dependencies (real implementations)
|
|
62
78
|
*/
|
|
@@ -73,6 +89,8 @@ export const defaultDeps = {
|
|
|
73
89
|
execAsync,
|
|
74
90
|
promptName: defaultPromptName,
|
|
75
91
|
selectResources: defaultSelectResources,
|
|
92
|
+
selectProviders: defaultSelectProviders,
|
|
93
|
+
password: clack.password,
|
|
76
94
|
confirm: clack.confirm,
|
|
77
95
|
intro: clack.intro,
|
|
78
96
|
outro: clack.outro,
|
|
@@ -233,6 +251,15 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
|
|
|
233
251
|
d.renameSync(srcPath, destPath);
|
|
234
252
|
}
|
|
235
253
|
}
|
|
254
|
+
// Rename nested dotfiles (underscore convention for npm compatibility)
|
|
255
|
+
const nestedDotfiles = [['tests/evals/_env.example', 'tests/evals/.env.example']];
|
|
256
|
+
for (const [from, to] of nestedDotfiles) {
|
|
257
|
+
const srcPath = join(targetDir, from);
|
|
258
|
+
const destPath = join(targetDir, to);
|
|
259
|
+
if (d.existsSync(srcPath)) {
|
|
260
|
+
d.renameSync(srcPath, destPath);
|
|
261
|
+
}
|
|
262
|
+
}
|
|
236
263
|
|
|
237
264
|
// Read sunpeak version from root package.json
|
|
238
265
|
const rootPkg = JSON.parse(d.readFileSync(d.rootPkgPath, 'utf-8'));
|
|
@@ -282,6 +309,56 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
|
|
|
282
309
|
s.stop(`Install failed. You can try running "${pm} install" manually.`);
|
|
283
310
|
}
|
|
284
311
|
|
|
312
|
+
// Offer to configure eval providers (only in interactive mode)
|
|
313
|
+
if (resourcesArg === undefined) {
|
|
314
|
+
const providers = await d.selectProviders();
|
|
315
|
+
if (!clack.isCancel(providers) && providers.length > 0) {
|
|
316
|
+
// Install AI SDK core + selected provider packages
|
|
317
|
+
const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
|
|
318
|
+
try {
|
|
319
|
+
await d.execAsync(`${pm} add -D ${pkgsToInstall.join(' ')}`, { cwd: targetDir });
|
|
320
|
+
} catch {
|
|
321
|
+
d.console.log(`Provider install failed. Install manually: ${pm} add -D ${pkgsToInstall.join(' ')}`);
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
// Uncomment selected models in eval.config.ts
|
|
325
|
+
const evalConfigPath = join(targetDir, 'tests', 'evals', 'eval.config.ts');
|
|
326
|
+
if (d.existsSync(evalConfigPath)) {
|
|
327
|
+
let config = d.readFileSync(evalConfigPath, 'utf-8');
|
|
328
|
+
for (const p of providers) {
|
|
329
|
+
for (const model of p.models) {
|
|
330
|
+
// Uncomment lines matching this model (e.g., " // 'gpt-4o'," → " 'gpt-4o',")
|
|
331
|
+
config = config.replace(
|
|
332
|
+
new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
|
|
333
|
+
'$1$2'
|
|
334
|
+
);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
d.writeFileSync(evalConfigPath, config);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Prompt for API keys and write .env
|
|
341
|
+
const envLines = [];
|
|
342
|
+
const seen = new Set();
|
|
343
|
+
for (const p of providers) {
|
|
344
|
+
if (seen.has(p.envVar)) continue;
|
|
345
|
+
seen.add(p.envVar);
|
|
346
|
+
const key = await d.password({
|
|
347
|
+
message: `${p.envVar} (enter to skip)`,
|
|
348
|
+
mask: '*',
|
|
349
|
+
});
|
|
350
|
+
if (!clack.isCancel(key) && key) {
|
|
351
|
+
envLines.push(`${p.envVar}=${key}`);
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
const envPath = join(targetDir, 'tests', 'evals', '.env');
|
|
355
|
+
if (envLines.length > 0) {
|
|
356
|
+
d.writeFileSync(envPath, envLines.join('\n') + '\n');
|
|
357
|
+
clack.log.info(`API keys saved to tests/evals/.env (gitignored)`);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
|
|
285
362
|
// Offer to install the sunpeak skills (only in interactive mode)
|
|
286
363
|
if (resourcesArg === undefined) {
|
|
287
364
|
const installSkill = await d.confirm({
|
|
@@ -290,18 +367,16 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
|
|
|
290
367
|
});
|
|
291
368
|
if (!clack.isCancel(installSkill) && installSkill) {
|
|
292
369
|
try {
|
|
293
|
-
d.execSync('
|
|
370
|
+
d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server', {
|
|
294
371
|
cwd: targetDir,
|
|
295
372
|
stdio: 'inherit',
|
|
296
373
|
});
|
|
297
374
|
} catch {
|
|
298
|
-
d.console.log('Skill install skipped. You can install later with:
|
|
375
|
+
d.console.log('Skill install skipped. You can install later with: pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server');
|
|
299
376
|
}
|
|
300
377
|
}
|
|
301
378
|
}
|
|
302
379
|
|
|
303
|
-
const runCmd = pm === 'npm' ? 'npm run' : pm;
|
|
304
|
-
|
|
305
380
|
d.outro(`Done! To get started:
|
|
306
381
|
|
|
307
382
|
cd ${projectName}
|
|
@@ -309,9 +384,12 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
|
|
|
309
384
|
|
|
310
385
|
Your project commands:
|
|
311
386
|
|
|
312
|
-
sunpeak dev
|
|
313
|
-
sunpeak build
|
|
314
|
-
|
|
387
|
+
sunpeak dev # Start dev server + MCP endpoint
|
|
388
|
+
sunpeak build # Build for production
|
|
389
|
+
sunpeak test # Run unit + e2e tests
|
|
390
|
+
sunpeak test --eval # Run LLM evals (configure models in tests/evals/eval.config.ts)
|
|
391
|
+
sunpeak test --visual # Run visual regression tests
|
|
392
|
+
sunpeak test --live # Run live tests against real AI hosts`);
|
|
315
393
|
}
|
|
316
394
|
|
|
317
395
|
// Allow running directly
|
|
@@ -2,11 +2,24 @@ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
|
|
|
2
2
|
import { execSync } from 'child_process';
|
|
3
3
|
import { join, dirname } from 'path';
|
|
4
4
|
import * as p from '@clack/prompts';
|
|
5
|
+
import { EVAL_PROVIDERS, generateModelLines } from '../lib/eval/eval-providers.mjs';
|
|
6
|
+
import { detectPackageManager } from '../utils.mjs';
|
|
5
7
|
|
|
6
8
|
/**
|
|
7
9
|
* Default dependencies (real implementations).
|
|
8
10
|
* Override in tests via the `deps` parameter.
|
|
9
11
|
*/
|
|
12
|
+
async function defaultSelectProviders() {
|
|
13
|
+
const selected = await p.multiselect({
|
|
14
|
+
message: 'AI providers for evals (space to toggle, enter to skip)',
|
|
15
|
+
options: EVAL_PROVIDERS.map((prov) => ({ value: prov, label: prov.label })),
|
|
16
|
+
initialValues: [],
|
|
17
|
+
required: false,
|
|
18
|
+
});
|
|
19
|
+
if (p.isCancel(selected)) return [];
|
|
20
|
+
return selected;
|
|
21
|
+
}
|
|
22
|
+
|
|
10
23
|
export const defaultDeps = {
|
|
11
24
|
existsSync,
|
|
12
25
|
readFileSync,
|
|
@@ -21,6 +34,9 @@ export const defaultDeps = {
|
|
|
21
34
|
select: p.select,
|
|
22
35
|
text: p.text,
|
|
23
36
|
log: p.log,
|
|
37
|
+
password: p.password,
|
|
38
|
+
selectProviders: defaultSelectProviders,
|
|
39
|
+
detectPackageManager,
|
|
24
40
|
};
|
|
25
41
|
|
|
26
42
|
/**
|
|
@@ -60,6 +76,61 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
60
76
|
await initExternalProject(cliServer, d);
|
|
61
77
|
}
|
|
62
78
|
|
|
79
|
+
// Offer to configure eval providers
|
|
80
|
+
const providers = await d.selectProviders();
|
|
81
|
+
if (!d.isCancel(providers) && providers.length > 0) {
|
|
82
|
+
const pm = d.detectPackageManager();
|
|
83
|
+
const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
|
|
84
|
+
const installCmd = `${pm} add -D ${pkgsToInstall.join(' ')}`;
|
|
85
|
+
try {
|
|
86
|
+
d.execSync(installCmd, { cwd: d.cwd(), stdio: 'inherit' });
|
|
87
|
+
} catch {
|
|
88
|
+
d.log.info(`Provider install failed. Install manually: ${installCmd}`);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Uncomment selected models in eval.config.ts
|
|
92
|
+
const evalDir = d.existsSync(join(d.cwd(), 'tests', 'evals'))
|
|
93
|
+
? join(d.cwd(), 'tests', 'evals')
|
|
94
|
+
: d.existsSync(join(d.cwd(), 'tests', 'sunpeak', 'evals'))
|
|
95
|
+
? join(d.cwd(), 'tests', 'sunpeak', 'evals')
|
|
96
|
+
: null;
|
|
97
|
+
if (evalDir) {
|
|
98
|
+
const configPath = join(evalDir, 'eval.config.ts');
|
|
99
|
+
if (d.existsSync(configPath)) {
|
|
100
|
+
let config = d.readFileSync(configPath, 'utf-8');
|
|
101
|
+
for (const prov of providers) {
|
|
102
|
+
for (const model of prov.models) {
|
|
103
|
+
config = config.replace(
|
|
104
|
+
new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
|
|
105
|
+
'$1$2'
|
|
106
|
+
);
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
d.writeFileSync(configPath, config);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// Prompt for API keys and write .env
|
|
113
|
+
const envLines = [];
|
|
114
|
+
const seen = new Set();
|
|
115
|
+
for (const prov of providers) {
|
|
116
|
+
if (seen.has(prov.envVar)) continue;
|
|
117
|
+
seen.add(prov.envVar);
|
|
118
|
+
const key = await d.password({
|
|
119
|
+
message: `${prov.envVar} (enter to skip)`,
|
|
120
|
+
mask: '*',
|
|
121
|
+
});
|
|
122
|
+
if (!d.isCancel(key) && key) {
|
|
123
|
+
envLines.push(`${prov.envVar}=${key}`);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
if (envLines.length > 0 && evalDir) {
|
|
127
|
+
const relEnvPath = evalDir.startsWith(d.cwd()) ? evalDir.slice(d.cwd().length + 1) : evalDir;
|
|
128
|
+
d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
|
|
129
|
+
d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
63
134
|
// Offer to install the testing skill
|
|
64
135
|
const installSkill = await d.confirm({
|
|
65
136
|
message: 'Install the test-mcp-server skill? (helps your coding agent write tests)',
|
|
@@ -67,12 +138,12 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
67
138
|
});
|
|
68
139
|
if (!d.isCancel(installSkill) && installSkill) {
|
|
69
140
|
try {
|
|
70
|
-
d.execSync('
|
|
141
|
+
d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
|
|
71
142
|
cwd: d.cwd(),
|
|
72
143
|
stdio: 'inherit',
|
|
73
144
|
});
|
|
74
145
|
} catch {
|
|
75
|
-
d.log.info('Skill install skipped. Install later:
|
|
146
|
+
d.log.info('Skill install skipped. Install later: pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server');
|
|
76
147
|
}
|
|
77
148
|
}
|
|
78
149
|
|
|
@@ -205,15 +276,11 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
205
276
|
"",
|
|
206
277
|
" models: [",
|
|
207
278
|
" // Uncomment models and install their provider packages:",
|
|
208
|
-
|
|
209
|
-
" // 'gpt-4o-mini', // OPENAI_API_KEY",
|
|
210
|
-
" // 'o4-mini', // OPENAI_API_KEY",
|
|
211
|
-
" // 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY",
|
|
212
|
-
" // 'gemini-2.0-flash', // GOOGLE_GENERATIVE_AI_API_KEY",
|
|
279
|
+
...generateModelLines(),
|
|
213
280
|
" ],",
|
|
214
281
|
"",
|
|
215
282
|
" defaults: {",
|
|
216
|
-
" runs:
|
|
283
|
+
" runs: 5, // Number of times to run each case per model",
|
|
217
284
|
" maxSteps: 1, // Max tool call steps per run",
|
|
218
285
|
" temperature: 0, // 0 for most deterministic results",
|
|
219
286
|
" timeout: 30_000, // Timeout per run in ms",
|
|
@@ -244,7 +311,7 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
244
311
|
*
|
|
245
312
|
* To get started:
|
|
246
313
|
* 1. Configure models in eval.config.ts (uncomment the ones you want)
|
|
247
|
-
* 2. Install the AI SDK and provider packages
|
|
314
|
+
* 2. Install the AI SDK and provider packages (e.g. pnpm add ai @ai-sdk/openai)
|
|
248
315
|
* 3. Copy .env.example to .env and add your API keys
|
|
249
316
|
* 4. Replace this file with evals for your own tools
|
|
250
317
|
* 5. Run: sunpeak test --eval
|
|
@@ -489,6 +556,7 @@ async function initExternalProject(cliServer, d) {
|
|
|
489
556
|
private: true,
|
|
490
557
|
type: 'module',
|
|
491
558
|
devDependencies: {
|
|
559
|
+
'@types/node': 'latest',
|
|
492
560
|
sunpeak: 'latest',
|
|
493
561
|
'@playwright/test': 'latest',
|
|
494
562
|
},
|
|
@@ -564,14 +632,15 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
|
|
|
564
632
|
|
|
565
633
|
d.log.success('Created tests/sunpeak/ with all test types.');
|
|
566
634
|
d.log.step('Next steps:');
|
|
635
|
+
const pm = d.detectPackageManager();
|
|
567
636
|
d.log.message(' cd tests/sunpeak');
|
|
568
|
-
d.log.message(
|
|
569
|
-
d.log.message(
|
|
637
|
+
d.log.message(` ${pm} install`);
|
|
638
|
+
d.log.message(` ${pm} exec playwright install chromium`);
|
|
570
639
|
d.log.message('');
|
|
571
|
-
d.log.message('
|
|
572
|
-
d.log.message('
|
|
573
|
-
d.log.message('
|
|
574
|
-
d.log.message('
|
|
640
|
+
d.log.message(' sunpeak test # E2E tests');
|
|
641
|
+
d.log.message(' sunpeak test --visual # Visual regression (generates baselines on first run)');
|
|
642
|
+
d.log.message(' sunpeak test --live # Live tests against real hosts (requires login)');
|
|
643
|
+
d.log.message(' sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)');
|
|
575
644
|
}
|
|
576
645
|
|
|
577
646
|
async function initJsProject(cliServer, d) {
|
|
@@ -638,15 +707,16 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
|
|
|
638
707
|
// 5. Unit test
|
|
639
708
|
scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
|
|
640
709
|
|
|
710
|
+
const pkgMgr = d.detectPackageManager();
|
|
641
711
|
d.log.step('Next steps:');
|
|
642
|
-
d.log.message(
|
|
643
|
-
d.log.message(
|
|
712
|
+
d.log.message(` ${pkgMgr} add -D sunpeak @playwright/test vitest`);
|
|
713
|
+
d.log.message(` ${pkgMgr} exec playwright install chromium`);
|
|
644
714
|
d.log.message('');
|
|
645
|
-
d.log.message('
|
|
646
|
-
d.log.message('
|
|
647
|
-
d.log.message('
|
|
648
|
-
d.log.message('
|
|
649
|
-
d.log.message('
|
|
715
|
+
d.log.message(' sunpeak test # E2E tests');
|
|
716
|
+
d.log.message(' sunpeak test --unit # Unit tests (vitest)');
|
|
717
|
+
d.log.message(' sunpeak test --visual # Visual regression');
|
|
718
|
+
d.log.message(' sunpeak test --live # Live tests against real hosts');
|
|
719
|
+
d.log.message(' sunpeak test --eval # Multi-model evals');
|
|
650
720
|
}
|
|
651
721
|
|
|
652
722
|
async function initSunpeakProject(d) {
|
package/bin/commands/test.mjs
CHANGED
|
@@ -310,14 +310,43 @@ function validateApiKeys(models) {
|
|
|
310
310
|
return warnings;
|
|
311
311
|
}
|
|
312
312
|
|
|
313
|
+
/**
|
|
314
|
+
* Check that required AI SDK provider packages are installed for the configured models.
|
|
315
|
+
* @param {string[]} models
|
|
316
|
+
* @returns {Promise<Array<{ pkg: string, reason: string }>>}
|
|
317
|
+
*/
|
|
318
|
+
async function checkProviderPackages(models) {
|
|
319
|
+
const { createRequire } = await import('module');
|
|
320
|
+
// Resolve from the project's node_modules, not the global CLI install
|
|
321
|
+
const require = createRequire(join(process.cwd(), 'package.json'));
|
|
322
|
+
const missing = [];
|
|
323
|
+
const checked = new Set();
|
|
324
|
+
|
|
325
|
+
try {
|
|
326
|
+
require.resolve('ai');
|
|
327
|
+
} catch {
|
|
328
|
+
missing.push({ pkg: 'ai', reason: 'core AI SDK' });
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
for (const modelId of models) {
|
|
332
|
+
const pkg = getProviderForModel(modelId);
|
|
333
|
+
if (!pkg || checked.has(pkg)) continue;
|
|
334
|
+
checked.add(pkg);
|
|
335
|
+
try {
|
|
336
|
+
require.resolve(pkg);
|
|
337
|
+
} catch {
|
|
338
|
+
missing.push({ pkg, reason: modelId });
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
return missing;
|
|
343
|
+
}
|
|
344
|
+
|
|
313
345
|
/**
|
|
314
346
|
* Run eval tests via vitest with the eval plugin.
|
|
315
347
|
*/
|
|
316
348
|
async function runEvals(args) {
|
|
317
|
-
const {
|
|
318
|
-
const { fileURLToPath } = await import('url');
|
|
319
|
-
|
|
320
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
349
|
+
const { resolve, basename } = await import('path');
|
|
321
350
|
const evalDir = findEvalDir();
|
|
322
351
|
|
|
323
352
|
if (!evalDir) {
|
|
@@ -365,8 +394,19 @@ async function runEvals(args) {
|
|
|
365
394
|
}
|
|
366
395
|
}
|
|
367
396
|
|
|
368
|
-
//
|
|
397
|
+
// Check for missing provider packages and API keys
|
|
369
398
|
if (configModels && configModels.length > 0) {
|
|
399
|
+
const missingPkgs = await checkProviderPackages(configModels);
|
|
400
|
+
if (missingPkgs.length > 0) {
|
|
401
|
+
console.error('\nMissing required packages for eval models:\n');
|
|
402
|
+
for (const { pkg, reason } of missingPkgs) {
|
|
403
|
+
console.error(` ${pkg} (needed for ${reason})`);
|
|
404
|
+
}
|
|
405
|
+
const installCmd = missingPkgs.map((m) => m.pkg).join(' ');
|
|
406
|
+
console.error(`\nInstall with:\n\n pnpm add -D ${installCmd}\n`);
|
|
407
|
+
return 1;
|
|
408
|
+
}
|
|
409
|
+
|
|
370
410
|
const warnings = validateApiKeys(configModels);
|
|
371
411
|
if (warnings.length > 0) {
|
|
372
412
|
console.log('');
|
|
@@ -416,8 +456,10 @@ async function runEvals(args) {
|
|
|
416
456
|
// Use .ts extension so vitest handles TypeScript imports natively
|
|
417
457
|
const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');
|
|
418
458
|
|
|
419
|
-
|
|
420
|
-
|
|
459
|
+
// Use package exports so vitest resolves from the project's node_modules,
|
|
460
|
+
// not the global CLI install. This ensures import('ai') finds project-local deps.
|
|
461
|
+
const evalPluginImport = 'sunpeak/eval/plugin';
|
|
462
|
+
const evalReporterImport = 'sunpeak/eval/reporter';
|
|
421
463
|
|
|
422
464
|
// Clean up dev server and temp config
|
|
423
465
|
const cleanupResources = () => {
|
|
@@ -442,17 +484,19 @@ async function runEvals(args) {
|
|
|
442
484
|
? './' + basename(configFile)
|
|
443
485
|
: null;
|
|
444
486
|
|
|
445
|
-
// Generate a vitest config that loads the eval config and plugin
|
|
487
|
+
// Generate a vitest config that loads the eval config and plugin.
|
|
488
|
+
// Imports use package names (sunpeak/eval/plugin) so vitest resolves from
|
|
489
|
+
// the project's node_modules, ensuring import('ai') finds project-local deps.
|
|
446
490
|
const vitestConfig = `
|
|
447
491
|
import { defineConfig } from 'vitest/config';
|
|
448
|
-
import { evalVitestPlugin } from ${
|
|
492
|
+
import { evalVitestPlugin } from '${evalPluginImport}';
|
|
449
493
|
${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}
|
|
450
494
|
|
|
451
495
|
export default defineConfig({
|
|
452
496
|
test: {
|
|
453
497
|
globals: true,
|
|
454
498
|
include: ['**/*.eval.ts', '**/*.eval.js'],
|
|
455
|
-
reporters: ['default', ${
|
|
499
|
+
reporters: ['default', '${evalReporterImport}'],
|
|
456
500
|
testTimeout: 600000,
|
|
457
501
|
},
|
|
458
502
|
plugins: [
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared eval provider definitions used by both `sunpeak new` and `sunpeak test init`.
|
|
3
|
+
* Single source of truth for provider packages, model IDs, and CLI labels.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
export const EVAL_PROVIDERS = [
|
|
7
|
+
{ pkg: '@ai-sdk/openai', models: ['gpt-4o', 'gpt-4o-mini'], label: 'OpenAI (gpt-4o, gpt-4o-mini)', envVar: 'OPENAI_API_KEY' },
|
|
8
|
+
{ pkg: '@ai-sdk/anthropic', models: ['claude-sonnet-4-20250514'], label: 'Anthropic (claude-sonnet-4)', envVar: 'ANTHROPIC_API_KEY' },
|
|
9
|
+
{ pkg: '@ai-sdk/google', models: ['gemini-2.0-flash'], label: 'Google (gemini-2.0-flash)', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
|
|
10
|
+
];
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* All model lines that appear in eval.config.ts, in order.
|
|
14
|
+
* Used by scaffoldEvals to generate the config and by the uncomment logic.
|
|
15
|
+
*/
|
|
16
|
+
export const EVAL_CONFIG_MODELS = [
|
|
17
|
+
{ id: 'gpt-4o', envVar: 'OPENAI_API_KEY' },
|
|
18
|
+
{ id: 'gpt-4o-mini', envVar: 'OPENAI_API_KEY' },
|
|
19
|
+
{ id: 'o4-mini', envVar: 'OPENAI_API_KEY' },
|
|
20
|
+
{ id: 'claude-sonnet-4-20250514', envVar: 'ANTHROPIC_API_KEY' },
|
|
21
|
+
{ id: 'gemini-2.0-flash', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
|
|
22
|
+
];
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* Generate the models section lines for eval.config.ts.
|
|
26
|
+
* @returns {string[]}
|
|
27
|
+
*/
|
|
28
|
+
export function generateModelLines() {
|
|
29
|
+
const maxIdLen = Math.max(...EVAL_CONFIG_MODELS.map((m) => m.id.length));
|
|
30
|
+
return EVAL_CONFIG_MODELS.map((m) => {
|
|
31
|
+
const padded = `'${m.id}',`.padEnd(maxIdLen + 3);
|
|
32
|
+
return ` // ${padded} // ${m.envVar}`;
|
|
33
|
+
});
|
|
34
|
+
}
|
|
@@ -8,6 +8,43 @@ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
|
|
|
8
8
|
// Re-export for use in generated test code
|
|
9
9
|
export { checkAiSdkInstalled };
|
|
10
10
|
|
|
11
|
+
/**
|
|
12
|
+
* Strip AI SDK retry wrapper from error messages for cleaner output.
|
|
13
|
+
* "Failed after 3 attempts. Last error: <actual error>" → "<actual error>"
|
|
14
|
+
* @param {string} message
|
|
15
|
+
* @returns {string}
|
|
16
|
+
*/
|
|
17
|
+
export function cleanErrorMessage(message) {
|
|
18
|
+
return message.replace(/^Failed after \d+ attempts?\. Last error: /i, '');
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* Check if an error message indicates a fatal API error that won't resolve on retry.
|
|
23
|
+
* @param {string} message
|
|
24
|
+
* @returns {boolean}
|
|
25
|
+
*/
|
|
26
|
+
export function isFatalApiError(message) {
|
|
27
|
+
const lower = message.toLowerCase();
|
|
28
|
+
const patterns = [
|
|
29
|
+
'exceeded your current quota',
|
|
30
|
+
'credit balance is too low',
|
|
31
|
+
'insufficient_quota',
|
|
32
|
+
'billing_hard_limit_reached',
|
|
33
|
+
'check your plan and billing details',
|
|
34
|
+
'add a payment method',
|
|
35
|
+
'invalid api key',
|
|
36
|
+
'invalid_api_key',
|
|
37
|
+
'incorrect api key',
|
|
38
|
+
'unauthorized',
|
|
39
|
+
'permission denied',
|
|
40
|
+
'access denied',
|
|
41
|
+
'authentication failed',
|
|
42
|
+
'account deactivated',
|
|
43
|
+
'account suspended',
|
|
44
|
+
];
|
|
45
|
+
return patterns.some((p) => lower.includes(p));
|
|
46
|
+
}
|
|
47
|
+
|
|
11
48
|
/**
|
|
12
49
|
* Define an eval spec. Identity function for type safety.
|
|
13
50
|
* @param {import('./eval-types.d.mts').EvalSpec} spec
|
|
@@ -112,6 +149,7 @@ export async function runSingleEval({ prompt, model, tools, maxSteps, temperatur
|
|
|
112
149
|
prompt,
|
|
113
150
|
maxSteps,
|
|
114
151
|
temperature,
|
|
152
|
+
maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
|
|
115
153
|
abortSignal: AbortSignal.timeout(timeout),
|
|
116
154
|
});
|
|
117
155
|
|
|
@@ -268,10 +306,15 @@ export async function runEvalCaseAggregate({
|
|
|
268
306
|
const model = await resolveModel(modelId);
|
|
269
307
|
let passed = 0;
|
|
270
308
|
let failed = 0;
|
|
309
|
+
let executedRuns = 0;
|
|
271
310
|
let totalDurationMs = 0;
|
|
272
311
|
const failureMap = new Map();
|
|
273
312
|
|
|
274
313
|
for (let i = 0; i < runs; i++) {
|
|
314
|
+
// Small delay between runs to avoid rate limits (skip before first run)
|
|
315
|
+
if (i > 0) await new Promise((r) => setTimeout(r, 1000));
|
|
316
|
+
|
|
317
|
+
executedRuns++;
|
|
275
318
|
const start = performance.now();
|
|
276
319
|
try {
|
|
277
320
|
const result = await runSingleEval({
|
|
@@ -286,8 +329,17 @@ export async function runEvalCaseAggregate({
|
|
|
286
329
|
passed++;
|
|
287
330
|
} catch (err) {
|
|
288
331
|
failed++;
|
|
289
|
-
const msg = err.message || String(err);
|
|
332
|
+
const msg = cleanErrorMessage(err.message || String(err));
|
|
290
333
|
failureMap.set(msg, (failureMap.get(msg) || 0) + 1);
|
|
334
|
+
|
|
335
|
+
if (isFatalApiError(msg)) {
|
|
336
|
+
// Count remaining runs as failed and stop early
|
|
337
|
+
const remaining = runs - i - 1;
|
|
338
|
+
failed += remaining;
|
|
339
|
+
failureMap.set(msg, (failureMap.get(msg) || 0) + remaining);
|
|
340
|
+
totalDurationMs += performance.now() - start;
|
|
341
|
+
break;
|
|
342
|
+
}
|
|
291
343
|
}
|
|
292
344
|
totalDurationMs += performance.now() - start;
|
|
293
345
|
}
|
|
@@ -304,7 +356,7 @@ export async function runEvalCaseAggregate({
|
|
|
304
356
|
passed,
|
|
305
357
|
failed,
|
|
306
358
|
passRate: runs > 0 ? passed / runs : 0,
|
|
307
|
-
avgDurationMs:
|
|
359
|
+
avgDurationMs: executedRuns > 0 ? totalDurationMs / executedRuns : 0,
|
|
308
360
|
failures,
|
|
309
361
|
};
|
|
310
362
|
}
|
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
|
|
15
15
|
import { readFileSync } from 'fs';
|
|
16
16
|
import { basename } from 'path';
|
|
17
|
-
import { fileURLToPath } from 'url';
|
|
18
17
|
|
|
19
18
|
const EVAL_RE = /\.eval\.[tj]s$/;
|
|
20
19
|
const VIRTUAL_PREFIX = '\0sunpeak-eval-spec:';
|
|
@@ -61,11 +60,12 @@ export function evalVitestPlugin({ server, models, defaults }) {
|
|
|
61
60
|
virtualToReal.set(virtualId, id);
|
|
62
61
|
|
|
63
62
|
const testName = basename(id).replace(EVAL_RE, '');
|
|
64
|
-
const runnerPath = resolveRunnerPath();
|
|
65
63
|
|
|
64
|
+
// Import from 'sunpeak/eval' (package export) so vitest resolves from
|
|
65
|
+
// the project's node_modules, not the global CLI install.
|
|
66
66
|
const transformed = `
|
|
67
67
|
import { describe, it, beforeAll, afterAll } from 'vitest';
|
|
68
|
-
import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from '
|
|
68
|
+
import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from 'sunpeak/eval';
|
|
69
69
|
|
|
70
70
|
// Import the original eval spec via virtual module (bypasses this transform)
|
|
71
71
|
import evalSpec from ${JSON.stringify(virtualId)};
|
|
@@ -145,14 +145,3 @@ describe.skipIf(shouldSkip)(${JSON.stringify(testName)}, () => {
|
|
|
145
145
|
};
|
|
146
146
|
}
|
|
147
147
|
|
|
148
|
-
/**
|
|
149
|
-
* Get the absolute path to the eval-runner module.
|
|
150
|
-
*/
|
|
151
|
-
function resolveRunnerPath() {
|
|
152
|
-
const url = new URL('./eval-runner.mjs', import.meta.url);
|
|
153
|
-
// fileURLToPath requires file:// scheme; fall back to pathname for other schemes (e.g., vitest)
|
|
154
|
-
if (url.protocol === 'file:') {
|
|
155
|
-
return fileURLToPath(url);
|
|
156
|
-
}
|
|
157
|
-
return url.pathname;
|
|
158
|
-
}
|
|
@@ -36,7 +36,7 @@ export async function resolveModel(modelId) {
|
|
|
36
36
|
provider = await import(pkg);
|
|
37
37
|
} catch {
|
|
38
38
|
throw new Error(
|
|
39
|
-
`Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n
|
|
39
|
+
`Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n pnpm add -D ${pkg}\n`
|
|
40
40
|
);
|
|
41
41
|
}
|
|
42
42
|
|
|
@@ -67,7 +67,7 @@ export async function checkAiSdkInstalled() {
|
|
|
67
67
|
await import('ai');
|
|
68
68
|
} catch {
|
|
69
69
|
throw new Error(
|
|
70
|
-
'The "ai" package is not installed. Install it to use evals:\n\n
|
|
70
|
+
'The "ai" package is not installed. Install it to use evals:\n\n pnpm add -D ai\n'
|
|
71
71
|
);
|
|
72
72
|
}
|
|
73
73
|
}
|
|
@@ -49,7 +49,7 @@ export function defineInspectConfig(options) {
|
|
|
49
49
|
const serverArg = server.includes(' ') ? `"${server}"` : server;
|
|
50
50
|
const command = [
|
|
51
51
|
`SUNPEAK_SANDBOX_PORT=${sandboxPort}`,
|
|
52
|
-
'
|
|
52
|
+
'sunpeak inspect',
|
|
53
53
|
`--server ${serverArg}`,
|
|
54
54
|
...(simulationsDir ? [`--simulations ${simulationsDir}`] : []),
|
|
55
55
|
`--port ${port}`,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sunpeak",
|
|
3
|
-
"version": "0.19.
|
|
3
|
+
"version": "0.19.12",
|
|
4
4
|
"description": "Inspector, testing framework, and app framework for MCP Apps.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -102,6 +102,12 @@
|
|
|
102
102
|
"default": "./bin/lib/eval/eval-runner.mjs"
|
|
103
103
|
}
|
|
104
104
|
},
|
|
105
|
+
"./eval/plugin": {
|
|
106
|
+
"import": "./bin/lib/eval/eval-vitest-plugin.mjs"
|
|
107
|
+
},
|
|
108
|
+
"./eval/reporter": {
|
|
109
|
+
"import": "./bin/lib/eval/eval-reporter.mjs"
|
|
110
|
+
},
|
|
105
111
|
"./package.json": "./package.json"
|
|
106
112
|
},
|
|
107
113
|
"bin": {
|
|
@@ -138,10 +144,10 @@
|
|
|
138
144
|
"peerDependencies": {
|
|
139
145
|
"react": "^18.0.0 || ^19.0.0",
|
|
140
146
|
"react-dom": "^18.0.0 || ^19.0.0",
|
|
141
|
-
"ai": "^4.0.0",
|
|
142
|
-
"@ai-sdk/openai": "^1.0.0",
|
|
143
|
-
"@ai-sdk/anthropic": "^1.0.0",
|
|
144
|
-
"@ai-sdk/google": "^1.0.0"
|
|
147
|
+
"ai": "^4.0.0 || ^5.0.0 || ^6.0.0",
|
|
148
|
+
"@ai-sdk/openai": "^1.0.0 || ^2.0.0 || ^3.0.0",
|
|
149
|
+
"@ai-sdk/anthropic": "^1.0.0 || ^2.0.0 || ^3.0.0",
|
|
150
|
+
"@ai-sdk/google": "^1.0.0 || ^2.0.0 || ^3.0.0"
|
|
145
151
|
},
|
|
146
152
|
"peerDependenciesMeta": {
|
|
147
153
|
"ai": {
|
package/template/README.md
CHANGED
|
@@ -155,7 +155,7 @@ Then create a tool file in `src/tools/` and simulation file(s) in `tests/simulat
|
|
|
155
155
|
Install the sunpeak skills to give your coding agent built-in knowledge of sunpeak patterns, hooks, and testing:
|
|
156
156
|
|
|
157
157
|
```bash
|
|
158
|
-
|
|
158
|
+
pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
|
|
159
159
|
```
|
|
160
160
|
|
|
161
161
|
## Troubleshooting
|
|
@@ -1,103 +1,103 @@
|
|
|
1
1
|
{
|
|
2
|
-
"hash": "
|
|
2
|
+
"hash": "9e921320",
|
|
3
3
|
"configHash": "b5321072",
|
|
4
|
-
"lockfileHash": "
|
|
5
|
-
"browserHash": "
|
|
4
|
+
"lockfileHash": "0deaaac3",
|
|
5
|
+
"browserHash": "68384581",
|
|
6
6
|
"optimized": {
|
|
7
7
|
"@modelcontextprotocol/ext-apps": {
|
|
8
8
|
"src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/app.js",
|
|
9
9
|
"file": "@modelcontextprotocol_ext-apps.js",
|
|
10
|
-
"fileHash": "
|
|
10
|
+
"fileHash": "e681afc2",
|
|
11
11
|
"needsInterop": false
|
|
12
12
|
},
|
|
13
13
|
"@modelcontextprotocol/ext-apps/app-bridge": {
|
|
14
14
|
"src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/app-bridge.js",
|
|
15
15
|
"file": "@modelcontextprotocol_ext-apps_app-bridge.js",
|
|
16
|
-
"fileHash": "
|
|
16
|
+
"fileHash": "71abff0a",
|
|
17
17
|
"needsInterop": false
|
|
18
18
|
},
|
|
19
19
|
"@modelcontextprotocol/ext-apps/react": {
|
|
20
20
|
"src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/react/index.js",
|
|
21
21
|
"file": "@modelcontextprotocol_ext-apps_react.js",
|
|
22
|
-
"fileHash": "
|
|
22
|
+
"fileHash": "a6d7f230",
|
|
23
23
|
"needsInterop": false
|
|
24
24
|
},
|
|
25
25
|
"@testing-library/react": {
|
|
26
26
|
"src": "../../../../../../node_modules/.pnpm/@testing-library+react@16.3.2_@testing-library+dom@10.4.1_@types+react-dom@19.2.3_@type_893f466751a7d66081fd06e9edb9241a/node_modules/@testing-library/react/dist/@testing-library/react.esm.js",
|
|
27
27
|
"file": "@testing-library_react.js",
|
|
28
|
-
"fileHash": "
|
|
28
|
+
"fileHash": "2b1ba0cb",
|
|
29
29
|
"needsInterop": false
|
|
30
30
|
},
|
|
31
31
|
"clsx": {
|
|
32
32
|
"src": "../../../../../../node_modules/.pnpm/clsx@2.1.1/node_modules/clsx/dist/clsx.mjs",
|
|
33
33
|
"file": "clsx.js",
|
|
34
|
-
"fileHash": "
|
|
34
|
+
"fileHash": "207734d0",
|
|
35
35
|
"needsInterop": false
|
|
36
36
|
},
|
|
37
37
|
"embla-carousel-react": {
|
|
38
38
|
"src": "../../../../../../node_modules/.pnpm/embla-carousel-react@8.6.0_react@19.2.4/node_modules/embla-carousel-react/esm/embla-carousel-react.esm.js",
|
|
39
39
|
"file": "embla-carousel-react.js",
|
|
40
|
-
"fileHash": "
|
|
40
|
+
"fileHash": "20692215",
|
|
41
41
|
"needsInterop": false
|
|
42
42
|
},
|
|
43
43
|
"embla-carousel-wheel-gestures": {
|
|
44
44
|
"src": "../../../../../../node_modules/.pnpm/embla-carousel-wheel-gestures@8.1.0_embla-carousel@8.6.0/node_modules/embla-carousel-wheel-gestures/dist/embla-carousel-wheel-gestures.esm.js",
|
|
45
45
|
"file": "embla-carousel-wheel-gestures.js",
|
|
46
|
-
"fileHash": "
|
|
46
|
+
"fileHash": "c4fef2a7",
|
|
47
47
|
"needsInterop": false
|
|
48
48
|
},
|
|
49
49
|
"mapbox-gl": {
|
|
50
50
|
"src": "../../../../../../node_modules/.pnpm/mapbox-gl@3.21.0/node_modules/mapbox-gl/dist/mapbox-gl.js",
|
|
51
51
|
"file": "mapbox-gl.js",
|
|
52
|
-
"fileHash": "
|
|
52
|
+
"fileHash": "ec7a0faf",
|
|
53
53
|
"needsInterop": true
|
|
54
54
|
},
|
|
55
55
|
"react-dom": {
|
|
56
56
|
"src": "../../../../../../node_modules/.pnpm/react-dom@19.2.4_react@19.2.4/node_modules/react-dom/index.js",
|
|
57
57
|
"file": "react-dom.js",
|
|
58
|
-
"fileHash": "
|
|
58
|
+
"fileHash": "ea2e1e20",
|
|
59
59
|
"needsInterop": true
|
|
60
60
|
},
|
|
61
61
|
"react-dom/client": {
|
|
62
62
|
"src": "../../../../../../node_modules/.pnpm/react-dom@19.2.4_react@19.2.4/node_modules/react-dom/client.js",
|
|
63
63
|
"file": "react-dom_client.js",
|
|
64
|
-
"fileHash": "
|
|
64
|
+
"fileHash": "28137cda",
|
|
65
65
|
"needsInterop": true
|
|
66
66
|
},
|
|
67
67
|
"react": {
|
|
68
68
|
"src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/index.js",
|
|
69
69
|
"file": "react.js",
|
|
70
|
-
"fileHash": "
|
|
70
|
+
"fileHash": "7af1c490",
|
|
71
71
|
"needsInterop": true
|
|
72
72
|
},
|
|
73
73
|
"react/jsx-dev-runtime": {
|
|
74
74
|
"src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/jsx-dev-runtime.js",
|
|
75
75
|
"file": "react_jsx-dev-runtime.js",
|
|
76
|
-
"fileHash": "
|
|
76
|
+
"fileHash": "9f5cf3bd",
|
|
77
77
|
"needsInterop": true
|
|
78
78
|
},
|
|
79
79
|
"react/jsx-runtime": {
|
|
80
80
|
"src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/jsx-runtime.js",
|
|
81
81
|
"file": "react_jsx-runtime.js",
|
|
82
|
-
"fileHash": "
|
|
82
|
+
"fileHash": "b414b6b3",
|
|
83
83
|
"needsInterop": true
|
|
84
84
|
},
|
|
85
85
|
"tailwind-merge": {
|
|
86
86
|
"src": "../../../../../../node_modules/.pnpm/tailwind-merge@3.5.0/node_modules/tailwind-merge/dist/bundle-mjs.mjs",
|
|
87
87
|
"file": "tailwind-merge.js",
|
|
88
|
-
"fileHash": "
|
|
88
|
+
"fileHash": "999e54e9",
|
|
89
89
|
"needsInterop": false
|
|
90
90
|
},
|
|
91
91
|
"vitest": {
|
|
92
92
|
"src": "../../../../../../node_modules/.pnpm/vitest@4.1.2_@opentelemetry+api@1.9.0_@types+node@25.5.2_happy-dom@18.0.1_jsdom@29.0.1__cf001624456b7837f7b99c4ef7b4c061/node_modules/vitest/dist/index.js",
|
|
93
93
|
"file": "vitest.js",
|
|
94
|
-
"fileHash": "
|
|
94
|
+
"fileHash": "a4111eed",
|
|
95
95
|
"needsInterop": false
|
|
96
96
|
},
|
|
97
97
|
"zod": {
|
|
98
98
|
"src": "../../../../../../node_modules/.pnpm/zod@4.3.6/node_modules/zod/index.js",
|
|
99
99
|
"file": "zod.js",
|
|
100
|
-
"fileHash": "
|
|
100
|
+
"fileHash": "eb1d38b2",
|
|
101
101
|
"needsInterop": false
|
|
102
102
|
}
|
|
103
103
|
},
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { expect } from 'vitest';
|
|
2
1
|
import { defineEval } from 'sunpeak/eval';
|
|
3
2
|
|
|
4
3
|
export default defineEval({
|
|
@@ -11,18 +10,22 @@ export default defineEval({
|
|
|
11
10
|
{
|
|
12
11
|
name: 'asks for food photos',
|
|
13
12
|
prompt: 'Show me photos from my Austin pizza tour',
|
|
14
|
-
expect: {
|
|
15
|
-
tool: 'show-albums',
|
|
16
|
-
args: { search: expect.stringMatching(/pizza|austin/i) },
|
|
17
|
-
},
|
|
13
|
+
expect: { tool: 'show-albums' },
|
|
18
14
|
},
|
|
19
15
|
{
|
|
20
16
|
name: 'asks for a specific category',
|
|
21
17
|
prompt: 'Show me my travel photos',
|
|
22
|
-
expect: {
|
|
23
|
-
tool: 'show-albums',
|
|
24
|
-
args: { category: expect.stringMatching(/travel/i) },
|
|
25
|
-
},
|
|
18
|
+
expect: { tool: 'show-albums' },
|
|
26
19
|
},
|
|
20
|
+
// To also check argument extraction, add args expectations
|
|
21
|
+
// (import { expect } from 'vitest' to use matchers):
|
|
22
|
+
// {
|
|
23
|
+
// name: 'passes search term',
|
|
24
|
+
// prompt: 'Show me photos from my Austin pizza tour',
|
|
25
|
+
// expect: {
|
|
26
|
+
// tool: 'show-albums',
|
|
27
|
+
// args: { search: expect.stringMatching(/pizza|austin/i) },
|
|
28
|
+
// },
|
|
29
|
+
// },
|
|
27
30
|
],
|
|
28
31
|
});
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { expect } from 'vitest';
|
|
2
1
|
import { defineEval } from 'sunpeak/eval';
|
|
3
2
|
|
|
4
3
|
export default defineEval({
|
|
@@ -6,21 +5,12 @@ export default defineEval({
|
|
|
6
5
|
{
|
|
7
6
|
name: 'asks for popular places',
|
|
8
7
|
prompt: 'Show me popular places to visit in Tokyo',
|
|
9
|
-
expect: {
|
|
10
|
-
tool: 'show-carousel',
|
|
11
|
-
args: { city: expect.stringMatching(/tokyo/i) },
|
|
12
|
-
},
|
|
8
|
+
expect: { tool: 'show-carousel' },
|
|
13
9
|
},
|
|
14
10
|
{
|
|
15
|
-
name: 'asks for
|
|
16
|
-
prompt: '
|
|
17
|
-
expect: {
|
|
18
|
-
tool: 'show-carousel',
|
|
19
|
-
args: {
|
|
20
|
-
city: expect.stringMatching(/paris/i),
|
|
21
|
-
categories: expect.arrayContaining([expect.stringMatching(/restaurant|food|dining/i)]),
|
|
22
|
-
},
|
|
23
|
-
},
|
|
11
|
+
name: 'asks for places to eat',
|
|
12
|
+
prompt: 'Where should I eat in Paris?',
|
|
13
|
+
expect: { tool: 'show-carousel' },
|
|
24
14
|
},
|
|
25
15
|
],
|
|
26
16
|
});
|
|
@@ -10,15 +10,15 @@ export default defineEvalConfig({
|
|
|
10
10
|
|
|
11
11
|
models: [
|
|
12
12
|
// Uncomment models and install their provider packages:
|
|
13
|
-
// 'gpt-4o',
|
|
14
|
-
// 'gpt-4o-mini',
|
|
15
|
-
// 'o4-mini',
|
|
16
|
-
// 'claude-sonnet-4-20250514',
|
|
17
|
-
// 'gemini-2.0-flash',
|
|
13
|
+
// 'gpt-4o', // OPENAI_API_KEY
|
|
14
|
+
// 'gpt-4o-mini', // OPENAI_API_KEY
|
|
15
|
+
// 'o4-mini', // OPENAI_API_KEY
|
|
16
|
+
// 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY
|
|
17
|
+
// 'gemini-2.0-flash', // GOOGLE_GENERATIVE_AI_API_KEY
|
|
18
18
|
],
|
|
19
19
|
|
|
20
20
|
defaults: {
|
|
21
|
-
runs:
|
|
21
|
+
runs: 5, // Number of times to run each case per model
|
|
22
22
|
maxSteps: 1, // Max tool call steps per run
|
|
23
23
|
temperature: 0, // 0 for most deterministic results
|
|
24
24
|
timeout: 30_000, // Timeout per run in ms
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { expect } from 'vitest';
|
|
2
1
|
import { defineEval } from 'sunpeak/eval';
|
|
3
2
|
|
|
4
3
|
export default defineEval({
|
|
@@ -6,18 +5,12 @@ export default defineEval({
|
|
|
6
5
|
{
|
|
7
6
|
name: 'asks for a map',
|
|
8
7
|
prompt: 'Show me a map of coffee shops near downtown Austin',
|
|
9
|
-
expect: {
|
|
10
|
-
tool: 'show-map',
|
|
11
|
-
args: { query: expect.stringMatching(/coffee/i) },
|
|
12
|
-
},
|
|
8
|
+
expect: { tool: 'show-map' },
|
|
13
9
|
},
|
|
14
10
|
{
|
|
15
11
|
name: 'asks for nearby places',
|
|
16
12
|
prompt: 'Find me some parks nearby',
|
|
17
|
-
expect: {
|
|
18
|
-
tool: 'show-map',
|
|
19
|
-
args: { query: expect.stringMatching(/park/i) },
|
|
20
|
-
},
|
|
13
|
+
expect: { tool: 'show-map' },
|
|
21
14
|
},
|
|
22
15
|
],
|
|
23
16
|
});
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { expect } from 'vitest';
|
|
2
1
|
import { defineEval } from 'sunpeak/eval';
|
|
3
2
|
|
|
4
3
|
export default defineEval({
|
|
@@ -11,10 +10,7 @@ export default defineEval({
|
|
|
11
10
|
{
|
|
12
11
|
name: 'asks to draft a social post',
|
|
13
12
|
prompt: 'Write a launch announcement for X and LinkedIn',
|
|
14
|
-
expect: {
|
|
15
|
-
tool: 'review-post',
|
|
16
|
-
args: { platforms: expect.arrayContaining([expect.stringMatching(/x|twitter/i)]) },
|
|
17
|
-
},
|
|
13
|
+
expect: { tool: 'review-post' },
|
|
18
14
|
},
|
|
19
15
|
{
|
|
20
16
|
name: 'asks to review a purchase',
|
|
@@ -22,27 +18,36 @@ export default defineEval({
|
|
|
22
18
|
expect: { tool: 'review-purchase' },
|
|
23
19
|
},
|
|
24
20
|
|
|
25
|
-
//
|
|
21
|
+
// To also check argument extraction, add args expectations
|
|
22
|
+
// (import { expect } from 'vitest' to use matchers):
|
|
23
|
+
// {
|
|
24
|
+
// name: 'passes platforms',
|
|
25
|
+
// prompt: 'Write a launch announcement for X and LinkedIn',
|
|
26
|
+
// expect: {
|
|
27
|
+
// tool: 'review-post',
|
|
28
|
+
// args: { platforms: expect.arrayContaining([expect.stringMatching(/x|twitter/i)]) },
|
|
29
|
+
// },
|
|
30
|
+
// },
|
|
26
31
|
//
|
|
27
32
|
// Multi-step (ordered tool call sequence):
|
|
28
|
-
//
|
|
29
|
-
//
|
|
30
|
-
//
|
|
31
|
-
//
|
|
32
|
-
//
|
|
33
|
-
//
|
|
34
|
-
//
|
|
35
|
-
//
|
|
36
|
-
//
|
|
33
|
+
// {
|
|
34
|
+
// name: 'multi-step flow',
|
|
35
|
+
// prompt: 'Draft a post and then review it',
|
|
36
|
+
// maxSteps: 3,
|
|
37
|
+
// expect: [
|
|
38
|
+
// { tool: 'review-post' },
|
|
39
|
+
// { tool: 'publish-post' },
|
|
40
|
+
// ],
|
|
41
|
+
// },
|
|
37
42
|
//
|
|
38
43
|
// Custom assertion (full access to result):
|
|
39
|
-
//
|
|
40
|
-
//
|
|
41
|
-
//
|
|
42
|
-
//
|
|
43
|
-
//
|
|
44
|
-
//
|
|
45
|
-
// },
|
|
44
|
+
// {
|
|
45
|
+
// name: 'custom check',
|
|
46
|
+
// prompt: 'Show me my recent reviews',
|
|
47
|
+
// assert: (result) => {
|
|
48
|
+
// expect(result.toolCalls).toHaveLength(1);
|
|
49
|
+
// expect(result.toolCalls[0].name).toBe('review-diff');
|
|
46
50
|
// },
|
|
51
|
+
// },
|
|
47
52
|
],
|
|
48
53
|
});
|
|
File without changes
|