sunpeak 0.19.10 → 0.19.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -132,7 +132,7 @@ sunpeak new
132
132
  Install the sunpeak skills to give your coding agent (Claude Code, Cursor, etc.) built-in knowledge of sunpeak patterns, hooks, and testing:
133
133
 
134
134
  ```bash
135
- npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
135
+ pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
136
136
  ```
137
137
 
138
138
  ## Troubleshooting
@@ -9,6 +9,7 @@ const execAsync = promisify(exec);
9
9
  import * as clack from '@clack/prompts';
10
10
  import { discoverResources } from '../lib/patterns.mjs';
11
11
  import { detectPackageManager } from '../utils.mjs';
12
+ import { EVAL_PROVIDERS } from '../lib/eval/eval-providers.mjs';
12
13
 
13
14
  const __dirname = dirname(fileURLToPath(import.meta.url));
14
15
 
@@ -57,6 +58,21 @@ async function defaultSelectResources(availableResources) {
57
58
  return selected;
58
59
  }
59
60
 
61
+ /**
62
+ * Default prompt for eval provider selection.
63
+ * @returns {Promise<Array<{ pkg: string, models: string[] }>>}
64
+ */
65
+ async function defaultSelectProviders() {
66
+ const selected = await clack.multiselect({
67
+ message: 'AI providers for evals (space to toggle, enter to skip)',
68
+ options: EVAL_PROVIDERS.map((p) => ({ value: p, label: p.label })),
69
+ initialValues: [],
70
+ required: false,
71
+ });
72
+ if (clack.isCancel(selected)) return [];
73
+ return selected;
74
+ }
75
+
60
76
  /**
61
77
  * Default dependencies (real implementations)
62
78
  */
@@ -73,6 +89,8 @@ export const defaultDeps = {
73
89
  execAsync,
74
90
  promptName: defaultPromptName,
75
91
  selectResources: defaultSelectResources,
92
+ selectProviders: defaultSelectProviders,
93
+ password: clack.password,
76
94
  confirm: clack.confirm,
77
95
  intro: clack.intro,
78
96
  outro: clack.outro,
@@ -233,6 +251,15 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
233
251
  d.renameSync(srcPath, destPath);
234
252
  }
235
253
  }
254
+ // Rename nested dotfiles (underscore convention for npm compatibility)
255
+ const nestedDotfiles = [['tests/evals/_env.example', 'tests/evals/.env.example']];
256
+ for (const [from, to] of nestedDotfiles) {
257
+ const srcPath = join(targetDir, from);
258
+ const destPath = join(targetDir, to);
259
+ if (d.existsSync(srcPath)) {
260
+ d.renameSync(srcPath, destPath);
261
+ }
262
+ }
236
263
 
237
264
  // Read sunpeak version from root package.json
238
265
  const rootPkg = JSON.parse(d.readFileSync(d.rootPkgPath, 'utf-8'));
@@ -282,6 +309,56 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
282
309
  s.stop(`Install failed. You can try running "${pm} install" manually.`);
283
310
  }
284
311
 
312
+ // Offer to configure eval providers (only in interactive mode)
313
+ if (resourcesArg === undefined) {
314
+ const providers = await d.selectProviders();
315
+ if (!clack.isCancel(providers) && providers.length > 0) {
316
+ // Install AI SDK core + selected provider packages
317
+ const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
318
+ try {
319
+ await d.execAsync(`${pm} add -D ${pkgsToInstall.join(' ')}`, { cwd: targetDir });
320
+ } catch {
321
+ d.console.log(`Provider install failed. Install manually: ${pm} add -D ${pkgsToInstall.join(' ')}`);
322
+ }
323
+
324
+ // Uncomment selected models in eval.config.ts
325
+ const evalConfigPath = join(targetDir, 'tests', 'evals', 'eval.config.ts');
326
+ if (d.existsSync(evalConfigPath)) {
327
+ let config = d.readFileSync(evalConfigPath, 'utf-8');
328
+ for (const p of providers) {
329
+ for (const model of p.models) {
330
+ // Uncomment lines matching this model (e.g., " // 'gpt-4o'," → " 'gpt-4o',")
331
+ config = config.replace(
332
+ new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
333
+ '$1$2'
334
+ );
335
+ }
336
+ }
337
+ d.writeFileSync(evalConfigPath, config);
338
+ }
339
+
340
+ // Prompt for API keys and write .env
341
+ const envLines = [];
342
+ const seen = new Set();
343
+ for (const p of providers) {
344
+ if (seen.has(p.envVar)) continue;
345
+ seen.add(p.envVar);
346
+ const key = await d.password({
347
+ message: `${p.envVar} (enter to skip)`,
348
+ mask: '*',
349
+ });
350
+ if (!clack.isCancel(key) && key) {
351
+ envLines.push(`${p.envVar}=${key}`);
352
+ }
353
+ }
354
+ const envPath = join(targetDir, 'tests', 'evals', '.env');
355
+ if (envLines.length > 0) {
356
+ d.writeFileSync(envPath, envLines.join('\n') + '\n');
357
+ clack.log.info(`API keys saved to tests/evals/.env (gitignored)`);
358
+ }
359
+ }
360
+ }
361
+
285
362
  // Offer to install the sunpeak skills (only in interactive mode)
286
363
  if (resourcesArg === undefined) {
287
364
  const installSkill = await d.confirm({
@@ -290,18 +367,16 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
290
367
  });
291
368
  if (!clack.isCancel(installSkill) && installSkill) {
292
369
  try {
293
- d.execSync('npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server', {
370
+ d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server', {
294
371
  cwd: targetDir,
295
372
  stdio: 'inherit',
296
373
  });
297
374
  } catch {
298
- d.console.log('Skill install skipped. You can install later with: npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server');
375
+ d.console.log('Skill install skipped. You can install later with: pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server');
299
376
  }
300
377
  }
301
378
  }
302
379
 
303
- const runCmd = pm === 'npm' ? 'npm run' : pm;
304
-
305
380
  d.outro(`Done! To get started:
306
381
 
307
382
  cd ${projectName}
@@ -309,9 +384,12 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
309
384
 
310
385
  Your project commands:
311
386
 
312
- sunpeak dev # Start dev server + MCP endpoint
313
- sunpeak build # Build for production
314
- ${runCmd} test # Run tests`);
387
+ sunpeak dev # Start dev server + MCP endpoint
388
+ sunpeak build # Build for production
389
+ sunpeak test # Run unit + e2e tests
390
+ sunpeak test --eval # Run LLM evals (configure models in tests/evals/eval.config.ts)
391
+ sunpeak test --visual # Run visual regression tests
392
+ sunpeak test --live # Run live tests against real AI hosts`);
315
393
  }
316
394
 
317
395
  // Allow running directly
@@ -2,11 +2,24 @@ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
2
2
  import { execSync } from 'child_process';
3
3
  import { join, dirname } from 'path';
4
4
  import * as p from '@clack/prompts';
5
+ import { EVAL_PROVIDERS, generateModelLines } from '../lib/eval/eval-providers.mjs';
6
+ import { detectPackageManager } from '../utils.mjs';
5
7
 
6
8
  /**
7
9
  * Default dependencies (real implementations).
8
10
  * Override in tests via the `deps` parameter.
9
11
  */
12
+ async function defaultSelectProviders() {
13
+ const selected = await p.multiselect({
14
+ message: 'AI providers for evals (space to toggle, enter to skip)',
15
+ options: EVAL_PROVIDERS.map((prov) => ({ value: prov, label: prov.label })),
16
+ initialValues: [],
17
+ required: false,
18
+ });
19
+ if (p.isCancel(selected)) return [];
20
+ return selected;
21
+ }
22
+
10
23
  export const defaultDeps = {
11
24
  existsSync,
12
25
  readFileSync,
@@ -21,6 +34,9 @@ export const defaultDeps = {
21
34
  select: p.select,
22
35
  text: p.text,
23
36
  log: p.log,
37
+ password: p.password,
38
+ selectProviders: defaultSelectProviders,
39
+ detectPackageManager,
24
40
  };
25
41
 
26
42
  /**
@@ -60,6 +76,61 @@ export async function testInit(args = [], deps = defaultDeps) {
60
76
  await initExternalProject(cliServer, d);
61
77
  }
62
78
 
79
+ // Offer to configure eval providers
80
+ const providers = await d.selectProviders();
81
+ if (!d.isCancel(providers) && providers.length > 0) {
82
+ const pm = d.detectPackageManager();
83
+ const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
84
+ const installCmd = `${pm} add -D ${pkgsToInstall.join(' ')}`;
85
+ try {
86
+ d.execSync(installCmd, { cwd: d.cwd(), stdio: 'inherit' });
87
+ } catch {
88
+ d.log.info(`Provider install failed. Install manually: ${installCmd}`);
89
+ }
90
+
91
+ // Uncomment selected models in eval.config.ts
92
+ const evalDir = d.existsSync(join(d.cwd(), 'tests', 'evals'))
93
+ ? join(d.cwd(), 'tests', 'evals')
94
+ : d.existsSync(join(d.cwd(), 'tests', 'sunpeak', 'evals'))
95
+ ? join(d.cwd(), 'tests', 'sunpeak', 'evals')
96
+ : null;
97
+ if (evalDir) {
98
+ const configPath = join(evalDir, 'eval.config.ts');
99
+ if (d.existsSync(configPath)) {
100
+ let config = d.readFileSync(configPath, 'utf-8');
101
+ for (const prov of providers) {
102
+ for (const model of prov.models) {
103
+ config = config.replace(
104
+ new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
105
+ '$1$2'
106
+ );
107
+ }
108
+ }
109
+ d.writeFileSync(configPath, config);
110
+ }
111
+
112
+ // Prompt for API keys and write .env
113
+ const envLines = [];
114
+ const seen = new Set();
115
+ for (const prov of providers) {
116
+ if (seen.has(prov.envVar)) continue;
117
+ seen.add(prov.envVar);
118
+ const key = await d.password({
119
+ message: `${prov.envVar} (enter to skip)`,
120
+ mask: '*',
121
+ });
122
+ if (!d.isCancel(key) && key) {
123
+ envLines.push(`${prov.envVar}=${key}`);
124
+ }
125
+ }
126
+ if (envLines.length > 0 && evalDir) {
127
+ const relEnvPath = evalDir.startsWith(d.cwd()) ? evalDir.slice(d.cwd().length + 1) : evalDir;
128
+ d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
129
+ d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
130
+ }
131
+ }
132
+ }
133
+
63
134
  // Offer to install the testing skill
64
135
  const installSkill = await d.confirm({
65
136
  message: 'Install the test-mcp-server skill? (helps your coding agent write tests)',
@@ -67,12 +138,12 @@ export async function testInit(args = [], deps = defaultDeps) {
67
138
  });
68
139
  if (!d.isCancel(installSkill) && installSkill) {
69
140
  try {
70
- d.execSync('npx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
141
+ d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
71
142
  cwd: d.cwd(),
72
143
  stdio: 'inherit',
73
144
  });
74
145
  } catch {
75
- d.log.info('Skill install skipped. Install later: npx skills add Sunpeak-AI/sunpeak@test-mcp-server');
146
+ d.log.info('Skill install skipped. Install later: pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server');
76
147
  }
77
148
  }
78
149
 
@@ -205,15 +276,11 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
205
276
  "",
206
277
  " models: [",
207
278
  " // Uncomment models and install their provider packages:",
208
- " // 'gpt-4o', // OPENAI_API_KEY",
209
- " // 'gpt-4o-mini', // OPENAI_API_KEY",
210
- " // 'o4-mini', // OPENAI_API_KEY",
211
- " // 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY",
212
- " // 'gemini-2.0-flash', // GOOGLE_GENERATIVE_AI_API_KEY",
279
+ ...generateModelLines(),
213
280
  " ],",
214
281
  "",
215
282
  " defaults: {",
216
- " runs: 10, // Number of times to run each case per model",
283
+ " runs: 5, // Number of times to run each case per model",
217
284
  " maxSteps: 1, // Max tool call steps per run",
218
285
  " temperature: 0, // 0 for most deterministic results",
219
286
  " timeout: 30_000, // Timeout per run in ms",
@@ -244,7 +311,7 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
244
311
  *
245
312
  * To get started:
246
313
  * 1. Configure models in eval.config.ts (uncomment the ones you want)
247
- * 2. Install the AI SDK and provider packages: pnpm add ai @ai-sdk/openai
314
+ * 2. Install the AI SDK and provider packages (e.g. pnpm add ai @ai-sdk/openai)
248
315
  * 3. Copy .env.example to .env and add your API keys
249
316
  * 4. Replace this file with evals for your own tools
250
317
  * 5. Run: sunpeak test --eval
@@ -489,6 +556,7 @@ async function initExternalProject(cliServer, d) {
489
556
  private: true,
490
557
  type: 'module',
491
558
  devDependencies: {
559
+ '@types/node': 'latest',
492
560
  sunpeak: 'latest',
493
561
  '@playwright/test': 'latest',
494
562
  },
@@ -564,14 +632,15 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
564
632
 
565
633
  d.log.success('Created tests/sunpeak/ with all test types.');
566
634
  d.log.step('Next steps:');
635
+ const pm = d.detectPackageManager();
567
636
  d.log.message(' cd tests/sunpeak');
568
- d.log.message(' npm install');
569
- d.log.message(' npx playwright install chromium');
637
+ d.log.message(` ${pm} install`);
638
+ d.log.message(` ${pm} exec playwright install chromium`);
570
639
  d.log.message('');
571
- d.log.message(' npx sunpeak test # E2E tests');
572
- d.log.message(' npx sunpeak test --visual # Visual regression (generates baselines on first run)');
573
- d.log.message(' npx sunpeak test --live # Live tests against real hosts (requires login)');
574
- d.log.message(' npx sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)');
640
+ d.log.message(' sunpeak test # E2E tests');
641
+ d.log.message(' sunpeak test --visual # Visual regression (generates baselines on first run)');
642
+ d.log.message(' sunpeak test --live # Live tests against real hosts (requires login)');
643
+ d.log.message(' sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)');
575
644
  }
576
645
 
577
646
  async function initJsProject(cliServer, d) {
@@ -638,15 +707,16 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
638
707
  // 5. Unit test
639
708
  scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
640
709
 
710
+ const pkgMgr = d.detectPackageManager();
641
711
  d.log.step('Next steps:');
642
- d.log.message(' npm install -D sunpeak @playwright/test vitest');
643
- d.log.message(' npx playwright install chromium');
712
+ d.log.message(` ${pkgMgr} add -D sunpeak @playwright/test vitest`);
713
+ d.log.message(` ${pkgMgr} exec playwright install chromium`);
644
714
  d.log.message('');
645
- d.log.message(' npx sunpeak test # E2E tests');
646
- d.log.message(' npx sunpeak test --unit # Unit tests (vitest)');
647
- d.log.message(' npx sunpeak test --visual # Visual regression');
648
- d.log.message(' npx sunpeak test --live # Live tests against real hosts');
649
- d.log.message(' npx sunpeak test --eval # Multi-model evals');
715
+ d.log.message(' sunpeak test # E2E tests');
716
+ d.log.message(' sunpeak test --unit # Unit tests (vitest)');
717
+ d.log.message(' sunpeak test --visual # Visual regression');
718
+ d.log.message(' sunpeak test --live # Live tests against real hosts');
719
+ d.log.message(' sunpeak test --eval # Multi-model evals');
650
720
  }
651
721
 
652
722
  async function initSunpeakProject(d) {
@@ -310,14 +310,43 @@ function validateApiKeys(models) {
310
310
  return warnings;
311
311
  }
312
312
 
313
+ /**
314
+ * Check that required AI SDK provider packages are installed for the configured models.
315
+ * @param {string[]} models
316
+ * @returns {Promise<Array<{ pkg: string, reason: string }>>}
317
+ */
318
+ async function checkProviderPackages(models) {
319
+ const { createRequire } = await import('module');
320
+ // Resolve from the project's node_modules, not the global CLI install
321
+ const require = createRequire(join(process.cwd(), 'package.json'));
322
+ const missing = [];
323
+ const checked = new Set();
324
+
325
+ try {
326
+ require.resolve('ai');
327
+ } catch {
328
+ missing.push({ pkg: 'ai', reason: 'core AI SDK' });
329
+ }
330
+
331
+ for (const modelId of models) {
332
+ const pkg = getProviderForModel(modelId);
333
+ if (!pkg || checked.has(pkg)) continue;
334
+ checked.add(pkg);
335
+ try {
336
+ require.resolve(pkg);
337
+ } catch {
338
+ missing.push({ pkg, reason: modelId });
339
+ }
340
+ }
341
+
342
+ return missing;
343
+ }
344
+
313
345
  /**
314
346
  * Run eval tests via vitest with the eval plugin.
315
347
  */
316
348
  async function runEvals(args) {
317
- const { dirname, resolve, basename } = await import('path');
318
- const { fileURLToPath } = await import('url');
319
-
320
- const __dirname = dirname(fileURLToPath(import.meta.url));
349
+ const { resolve, basename } = await import('path');
321
350
  const evalDir = findEvalDir();
322
351
 
323
352
  if (!evalDir) {
@@ -365,8 +394,19 @@ async function runEvals(args) {
365
394
  }
366
395
  }
367
396
 
368
- // Validate API keys for configured models
397
+ // Check for missing provider packages and API keys
369
398
  if (configModels && configModels.length > 0) {
399
+ const missingPkgs = await checkProviderPackages(configModels);
400
+ if (missingPkgs.length > 0) {
401
+ console.error('\nMissing required packages for eval models:\n');
402
+ for (const { pkg, reason } of missingPkgs) {
403
+ console.error(` ${pkg} (needed for ${reason})`);
404
+ }
405
+ const installCmd = missingPkgs.map((m) => m.pkg).join(' ');
406
+ console.error(`\nInstall with:\n\n pnpm add -D ${installCmd}\n`);
407
+ return 1;
408
+ }
409
+
370
410
  const warnings = validateApiKeys(configModels);
371
411
  if (warnings.length > 0) {
372
412
  console.log('');
@@ -416,8 +456,10 @@ async function runEvals(args) {
416
456
  // Use .ts extension so vitest handles TypeScript imports natively
417
457
  const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');
418
458
 
419
- const evalPluginPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-vitest-plugin.mjs');
420
- const evalReporterPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-reporter.mjs');
459
+ // Use package exports so vitest resolves from the project's node_modules,
460
+ // not the global CLI install. This ensures import('ai') finds project-local deps.
461
+ const evalPluginImport = 'sunpeak/eval/plugin';
462
+ const evalReporterImport = 'sunpeak/eval/reporter';
421
463
 
422
464
  // Clean up dev server and temp config
423
465
  const cleanupResources = () => {
@@ -442,17 +484,19 @@ async function runEvals(args) {
442
484
  ? './' + basename(configFile)
443
485
  : null;
444
486
 
445
- // Generate a vitest config that loads the eval config and plugin
487
+ // Generate a vitest config that loads the eval config and plugin.
488
+ // Imports use package names (sunpeak/eval/plugin) so vitest resolves from
489
+ // the project's node_modules, ensuring import('ai') finds project-local deps.
446
490
  const vitestConfig = `
447
491
  import { defineConfig } from 'vitest/config';
448
- import { evalVitestPlugin } from ${JSON.stringify(evalPluginPath)};
492
+ import { evalVitestPlugin } from '${evalPluginImport}';
449
493
  ${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}
450
494
 
451
495
  export default defineConfig({
452
496
  test: {
453
497
  globals: true,
454
498
  include: ['**/*.eval.ts', '**/*.eval.js'],
455
- reporters: ['default', ${JSON.stringify(evalReporterPath)}],
499
+ reporters: ['default', '${evalReporterImport}'],
456
500
  testTimeout: 600000,
457
501
  },
458
502
  plugins: [
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Shared eval provider definitions used by both `sunpeak new` and `sunpeak test init`.
3
+ * Single source of truth for provider packages, model IDs, and CLI labels.
4
+ */
5
+
6
+ export const EVAL_PROVIDERS = [
7
+ { pkg: '@ai-sdk/openai', models: ['gpt-4o', 'gpt-4o-mini'], label: 'OpenAI (gpt-4o, gpt-4o-mini)', envVar: 'OPENAI_API_KEY' },
8
+ { pkg: '@ai-sdk/anthropic', models: ['claude-sonnet-4-20250514'], label: 'Anthropic (claude-sonnet-4)', envVar: 'ANTHROPIC_API_KEY' },
9
+ { pkg: '@ai-sdk/google', models: ['gemini-2.0-flash'], label: 'Google (gemini-2.0-flash)', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
10
+ ];
11
+
12
+ /**
13
+ * All model lines that appear in eval.config.ts, in order.
14
+ * Used by scaffoldEvals to generate the config and by the uncomment logic.
15
+ */
16
+ export const EVAL_CONFIG_MODELS = [
17
+ { id: 'gpt-4o', envVar: 'OPENAI_API_KEY' },
18
+ { id: 'gpt-4o-mini', envVar: 'OPENAI_API_KEY' },
19
+ { id: 'o4-mini', envVar: 'OPENAI_API_KEY' },
20
+ { id: 'claude-sonnet-4-20250514', envVar: 'ANTHROPIC_API_KEY' },
21
+ { id: 'gemini-2.0-flash', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
22
+ ];
23
+
24
+ /**
25
+ * Generate the models section lines for eval.config.ts.
26
+ * @returns {string[]}
27
+ */
28
+ export function generateModelLines() {
29
+ const maxIdLen = Math.max(...EVAL_CONFIG_MODELS.map((m) => m.id.length));
30
+ return EVAL_CONFIG_MODELS.map((m) => {
31
+ const padded = `'${m.id}',`.padEnd(maxIdLen + 3);
32
+ return ` // ${padded} // ${m.envVar}`;
33
+ });
34
+ }
@@ -8,6 +8,43 @@ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
8
8
  // Re-export for use in generated test code
9
9
  export { checkAiSdkInstalled };
10
10
 
11
+ /**
12
+ * Strip AI SDK retry wrapper from error messages for cleaner output.
13
+ * "Failed after 3 attempts. Last error: <actual error>" → "<actual error>"
14
+ * @param {string} message
15
+ * @returns {string}
16
+ */
17
+ export function cleanErrorMessage(message) {
18
+ return message.replace(/^Failed after \d+ attempts?\. Last error: /i, '');
19
+ }
20
+
21
+ /**
22
+ * Check if an error message indicates a fatal API error that won't resolve on retry.
23
+ * @param {string} message
24
+ * @returns {boolean}
25
+ */
26
+ export function isFatalApiError(message) {
27
+ const lower = message.toLowerCase();
28
+ const patterns = [
29
+ 'exceeded your current quota',
30
+ 'credit balance is too low',
31
+ 'insufficient_quota',
32
+ 'billing_hard_limit_reached',
33
+ 'check your plan and billing details',
34
+ 'add a payment method',
35
+ 'invalid api key',
36
+ 'invalid_api_key',
37
+ 'incorrect api key',
38
+ 'unauthorized',
39
+ 'permission denied',
40
+ 'access denied',
41
+ 'authentication failed',
42
+ 'account deactivated',
43
+ 'account suspended',
44
+ ];
45
+ return patterns.some((p) => lower.includes(p));
46
+ }
47
+
11
48
  /**
12
49
  * Define an eval spec. Identity function for type safety.
13
50
  * @param {import('./eval-types.d.mts').EvalSpec} spec
@@ -112,6 +149,7 @@ export async function runSingleEval({ prompt, model, tools, maxSteps, temperatur
112
149
  prompt,
113
150
  maxSteps,
114
151
  temperature,
152
+ maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
115
153
  abortSignal: AbortSignal.timeout(timeout),
116
154
  });
117
155
 
@@ -268,10 +306,15 @@ export async function runEvalCaseAggregate({
268
306
  const model = await resolveModel(modelId);
269
307
  let passed = 0;
270
308
  let failed = 0;
309
+ let executedRuns = 0;
271
310
  let totalDurationMs = 0;
272
311
  const failureMap = new Map();
273
312
 
274
313
  for (let i = 0; i < runs; i++) {
314
+ // Small delay between runs to avoid rate limits (skip before first run)
315
+ if (i > 0) await new Promise((r) => setTimeout(r, 1000));
316
+
317
+ executedRuns++;
275
318
  const start = performance.now();
276
319
  try {
277
320
  const result = await runSingleEval({
@@ -286,8 +329,17 @@ export async function runEvalCaseAggregate({
286
329
  passed++;
287
330
  } catch (err) {
288
331
  failed++;
289
- const msg = err.message || String(err);
332
+ const msg = cleanErrorMessage(err.message || String(err));
290
333
  failureMap.set(msg, (failureMap.get(msg) || 0) + 1);
334
+
335
+ if (isFatalApiError(msg)) {
336
+ // Count remaining runs as failed and stop early
337
+ const remaining = runs - i - 1;
338
+ failed += remaining;
339
+ failureMap.set(msg, (failureMap.get(msg) || 0) + remaining);
340
+ totalDurationMs += performance.now() - start;
341
+ break;
342
+ }
291
343
  }
292
344
  totalDurationMs += performance.now() - start;
293
345
  }
@@ -304,7 +356,7 @@ export async function runEvalCaseAggregate({
304
356
  passed,
305
357
  failed,
306
358
  passRate: runs > 0 ? passed / runs : 0,
307
- avgDurationMs: runs > 0 ? totalDurationMs / runs : 0,
359
+ avgDurationMs: executedRuns > 0 ? totalDurationMs / executedRuns : 0,
308
360
  failures,
309
361
  };
310
362
  }
@@ -14,7 +14,6 @@
14
14
 
15
15
  import { readFileSync } from 'fs';
16
16
  import { basename } from 'path';
17
- import { fileURLToPath } from 'url';
18
17
 
19
18
  const EVAL_RE = /\.eval\.[tj]s$/;
20
19
  const VIRTUAL_PREFIX = '\0sunpeak-eval-spec:';
@@ -61,11 +60,12 @@ export function evalVitestPlugin({ server, models, defaults }) {
61
60
  virtualToReal.set(virtualId, id);
62
61
 
63
62
  const testName = basename(id).replace(EVAL_RE, '');
64
- const runnerPath = resolveRunnerPath();
65
63
 
64
+ // Import from 'sunpeak/eval' (package export) so vitest resolves from
65
+ // the project's node_modules, not the global CLI install.
66
66
  const transformed = `
67
67
  import { describe, it, beforeAll, afterAll } from 'vitest';
68
- import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from '${runnerPath}';
68
+ import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from 'sunpeak/eval';
69
69
 
70
70
  // Import the original eval spec via virtual module (bypasses this transform)
71
71
  import evalSpec from ${JSON.stringify(virtualId)};
@@ -145,14 +145,3 @@ describe.skipIf(shouldSkip)(${JSON.stringify(testName)}, () => {
145
145
  };
146
146
  }
147
147
 
148
- /**
149
- * Get the absolute path to the eval-runner module.
150
- */
151
- function resolveRunnerPath() {
152
- const url = new URL('./eval-runner.mjs', import.meta.url);
153
- // fileURLToPath requires file:// scheme; fall back to pathname for other schemes (e.g., vitest)
154
- if (url.protocol === 'file:') {
155
- return fileURLToPath(url);
156
- }
157
- return url.pathname;
158
- }
@@ -36,7 +36,7 @@ export async function resolveModel(modelId) {
36
36
  provider = await import(pkg);
37
37
  } catch {
38
38
  throw new Error(
39
- `Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n npm install ${pkg} (or pnpm add / yarn add)\n`
39
+ `Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n pnpm add -D ${pkg}\n`
40
40
  );
41
41
  }
42
42
 
@@ -67,7 +67,7 @@ export async function checkAiSdkInstalled() {
67
67
  await import('ai');
68
68
  } catch {
69
69
  throw new Error(
70
- 'The "ai" package is not installed. Install it to use evals:\n\n npm install ai (or pnpm add / yarn add)\n'
70
+ 'The "ai" package is not installed. Install it to use evals:\n\n pnpm add -D ai\n'
71
71
  );
72
72
  }
73
73
  }
@@ -49,7 +49,7 @@ export function defineInspectConfig(options) {
49
49
  const serverArg = server.includes(' ') ? `"${server}"` : server;
50
50
  const command = [
51
51
  `SUNPEAK_SANDBOX_PORT=${sandboxPort}`,
52
- 'npx sunpeak inspect',
52
+ 'sunpeak inspect',
53
53
  `--server ${serverArg}`,
54
54
  ...(simulationsDir ? [`--simulations ${simulationsDir}`] : []),
55
55
  `--port ${port}`,
@@ -103,7 +103,7 @@ function buildInspectCommand({ server, port, sandboxPort, simulationsDir }) {
103
103
  }
104
104
  }
105
105
 
106
- parts.push('npx sunpeak inspect');
106
+ parts.push('sunpeak inspect');
107
107
 
108
108
  if (server.url) {
109
109
  parts.push(`--server ${server.url}`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sunpeak",
3
- "version": "0.19.10",
3
+ "version": "0.19.12",
4
4
  "description": "Inspector, testing framework, and app framework for MCP Apps.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -102,6 +102,12 @@
102
102
  "default": "./bin/lib/eval/eval-runner.mjs"
103
103
  }
104
104
  },
105
+ "./eval/plugin": {
106
+ "import": "./bin/lib/eval/eval-vitest-plugin.mjs"
107
+ },
108
+ "./eval/reporter": {
109
+ "import": "./bin/lib/eval/eval-reporter.mjs"
110
+ },
105
111
  "./package.json": "./package.json"
106
112
  },
107
113
  "bin": {
@@ -138,10 +144,10 @@
138
144
  "peerDependencies": {
139
145
  "react": "^18.0.0 || ^19.0.0",
140
146
  "react-dom": "^18.0.0 || ^19.0.0",
141
- "ai": "^4.0.0",
142
- "@ai-sdk/openai": "^1.0.0",
143
- "@ai-sdk/anthropic": "^1.0.0",
144
- "@ai-sdk/google": "^1.0.0"
147
+ "ai": "^4.0.0 || ^5.0.0 || ^6.0.0",
148
+ "@ai-sdk/openai": "^1.0.0 || ^2.0.0 || ^3.0.0",
149
+ "@ai-sdk/anthropic": "^1.0.0 || ^2.0.0 || ^3.0.0",
150
+ "@ai-sdk/google": "^1.0.0 || ^2.0.0 || ^3.0.0"
145
151
  },
146
152
  "peerDependenciesMeta": {
147
153
  "ai": {
@@ -155,7 +155,7 @@ Then create a tool file in `src/tools/` and simulation file(s) in `tests/simulat
155
155
  Install the sunpeak skills to give your coding agent built-in knowledge of sunpeak patterns, hooks, and testing:
156
156
 
157
157
  ```bash
158
- npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
158
+ pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
159
159
  ```
160
160
 
161
161
  ## Troubleshooting
@@ -12,5 +12,5 @@
12
12
  }
13
13
  },
14
14
  "name": "albums",
15
- "uri": "ui://albums-mnszmwah"
15
+ "uri": "ui://albums-mntel8dx"
16
16
  }
@@ -12,5 +12,5 @@
12
12
  }
13
13
  },
14
14
  "name": "carousel",
15
- "uri": "ui://carousel-mnszmwah"
15
+ "uri": "ui://carousel-mntel8dx"
16
16
  }
@@ -18,5 +18,5 @@
18
18
  }
19
19
  },
20
20
  "name": "map",
21
- "uri": "ui://map-mnszmwah"
21
+ "uri": "ui://map-mntel8dx"
22
22
  }
@@ -12,5 +12,5 @@
12
12
  }
13
13
  },
14
14
  "name": "review",
15
- "uri": "ui://review-mnszmwah"
15
+ "uri": "ui://review-mntel8dx"
16
16
  }
@@ -1,8 +1,8 @@
1
1
  {
2
- "hash": "c19ea8a1",
2
+ "hash": "545a8631",
3
3
  "configHash": "a598103f",
4
- "lockfileHash": "18567c78",
5
- "browserHash": "34bbafa0",
4
+ "lockfileHash": "0deaaac3",
5
+ "browserHash": "a695eb93",
6
6
  "optimized": {},
7
7
  "chunks": {}
8
8
  }
@@ -1,103 +1,103 @@
1
1
  {
2
- "hash": "4280f302",
2
+ "hash": "9e921320",
3
3
  "configHash": "b5321072",
4
- "lockfileHash": "18567c78",
5
- "browserHash": "cea8bcfe",
4
+ "lockfileHash": "0deaaac3",
5
+ "browserHash": "68384581",
6
6
  "optimized": {
7
7
  "@modelcontextprotocol/ext-apps": {
8
8
  "src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/app.js",
9
9
  "file": "@modelcontextprotocol_ext-apps.js",
10
- "fileHash": "073455d6",
10
+ "fileHash": "e681afc2",
11
11
  "needsInterop": false
12
12
  },
13
13
  "@modelcontextprotocol/ext-apps/app-bridge": {
14
14
  "src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/app-bridge.js",
15
15
  "file": "@modelcontextprotocol_ext-apps_app-bridge.js",
16
- "fileHash": "b1b64fca",
16
+ "fileHash": "71abff0a",
17
17
  "needsInterop": false
18
18
  },
19
19
  "@modelcontextprotocol/ext-apps/react": {
20
20
  "src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/react/index.js",
21
21
  "file": "@modelcontextprotocol_ext-apps_react.js",
22
- "fileHash": "2138e39d",
22
+ "fileHash": "a6d7f230",
23
23
  "needsInterop": false
24
24
  },
25
25
  "@testing-library/react": {
26
26
  "src": "../../../../../../node_modules/.pnpm/@testing-library+react@16.3.2_@testing-library+dom@10.4.1_@types+react-dom@19.2.3_@type_893f466751a7d66081fd06e9edb9241a/node_modules/@testing-library/react/dist/@testing-library/react.esm.js",
27
27
  "file": "@testing-library_react.js",
28
- "fileHash": "351b9ceb",
28
+ "fileHash": "2b1ba0cb",
29
29
  "needsInterop": false
30
30
  },
31
31
  "clsx": {
32
32
  "src": "../../../../../../node_modules/.pnpm/clsx@2.1.1/node_modules/clsx/dist/clsx.mjs",
33
33
  "file": "clsx.js",
34
- "fileHash": "21018c6f",
34
+ "fileHash": "207734d0",
35
35
  "needsInterop": false
36
36
  },
37
37
  "embla-carousel-react": {
38
38
  "src": "../../../../../../node_modules/.pnpm/embla-carousel-react@8.6.0_react@19.2.4/node_modules/embla-carousel-react/esm/embla-carousel-react.esm.js",
39
39
  "file": "embla-carousel-react.js",
40
- "fileHash": "debc6259",
40
+ "fileHash": "20692215",
41
41
  "needsInterop": false
42
42
  },
43
43
  "embla-carousel-wheel-gestures": {
44
44
  "src": "../../../../../../node_modules/.pnpm/embla-carousel-wheel-gestures@8.1.0_embla-carousel@8.6.0/node_modules/embla-carousel-wheel-gestures/dist/embla-carousel-wheel-gestures.esm.js",
45
45
  "file": "embla-carousel-wheel-gestures.js",
46
- "fileHash": "b8f353af",
46
+ "fileHash": "c4fef2a7",
47
47
  "needsInterop": false
48
48
  },
49
49
  "mapbox-gl": {
50
50
  "src": "../../../../../../node_modules/.pnpm/mapbox-gl@3.21.0/node_modules/mapbox-gl/dist/mapbox-gl.js",
51
51
  "file": "mapbox-gl.js",
52
- "fileHash": "926d0d9a",
52
+ "fileHash": "ec7a0faf",
53
53
  "needsInterop": true
54
54
  },
55
55
  "react-dom": {
56
56
  "src": "../../../../../../node_modules/.pnpm/react-dom@19.2.4_react@19.2.4/node_modules/react-dom/index.js",
57
57
  "file": "react-dom.js",
58
- "fileHash": "9347dadf",
58
+ "fileHash": "ea2e1e20",
59
59
  "needsInterop": true
60
60
  },
61
61
  "react-dom/client": {
62
62
  "src": "../../../../../../node_modules/.pnpm/react-dom@19.2.4_react@19.2.4/node_modules/react-dom/client.js",
63
63
  "file": "react-dom_client.js",
64
- "fileHash": "77abbe7c",
64
+ "fileHash": "28137cda",
65
65
  "needsInterop": true
66
66
  },
67
67
  "react": {
68
68
  "src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/index.js",
69
69
  "file": "react.js",
70
- "fileHash": "d75fddcd",
70
+ "fileHash": "7af1c490",
71
71
  "needsInterop": true
72
72
  },
73
73
  "react/jsx-dev-runtime": {
74
74
  "src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/jsx-dev-runtime.js",
75
75
  "file": "react_jsx-dev-runtime.js",
76
- "fileHash": "d8e94b8c",
76
+ "fileHash": "9f5cf3bd",
77
77
  "needsInterop": true
78
78
  },
79
79
  "react/jsx-runtime": {
80
80
  "src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/jsx-runtime.js",
81
81
  "file": "react_jsx-runtime.js",
82
- "fileHash": "a61a105d",
82
+ "fileHash": "b414b6b3",
83
83
  "needsInterop": true
84
84
  },
85
85
  "tailwind-merge": {
86
86
  "src": "../../../../../../node_modules/.pnpm/tailwind-merge@3.5.0/node_modules/tailwind-merge/dist/bundle-mjs.mjs",
87
87
  "file": "tailwind-merge.js",
88
- "fileHash": "0608e35a",
88
+ "fileHash": "999e54e9",
89
89
  "needsInterop": false
90
90
  },
91
91
  "vitest": {
92
92
  "src": "../../../../../../node_modules/.pnpm/vitest@4.1.2_@opentelemetry+api@1.9.0_@types+node@25.5.2_happy-dom@18.0.1_jsdom@29.0.1__cf001624456b7837f7b99c4ef7b4c061/node_modules/vitest/dist/index.js",
93
93
  "file": "vitest.js",
94
- "fileHash": "11d2208b",
94
+ "fileHash": "a4111eed",
95
95
  "needsInterop": false
96
96
  },
97
97
  "zod": {
98
98
  "src": "../../../../../../node_modules/.pnpm/zod@4.3.6/node_modules/zod/index.js",
99
99
  "file": "zod.js",
100
- "fileHash": "06047af6",
100
+ "fileHash": "eb1d38b2",
101
101
  "needsInterop": false
102
102
  }
103
103
  },
@@ -1,4 +1,3 @@
1
- import { expect } from 'vitest';
2
1
  import { defineEval } from 'sunpeak/eval';
3
2
 
4
3
  export default defineEval({
@@ -11,18 +10,22 @@ export default defineEval({
11
10
  {
12
11
  name: 'asks for food photos',
13
12
  prompt: 'Show me photos from my Austin pizza tour',
14
- expect: {
15
- tool: 'show-albums',
16
- args: { search: expect.stringMatching(/pizza|austin/i) },
17
- },
13
+ expect: { tool: 'show-albums' },
18
14
  },
19
15
  {
20
16
  name: 'asks for a specific category',
21
17
  prompt: 'Show me my travel photos',
22
- expect: {
23
- tool: 'show-albums',
24
- args: { category: expect.stringMatching(/travel/i) },
25
- },
18
+ expect: { tool: 'show-albums' },
26
19
  },
20
+ // To also check argument extraction, add args expectations
21
+ // (import { expect } from 'vitest' to use matchers):
22
+ // {
23
+ // name: 'passes search term',
24
+ // prompt: 'Show me photos from my Austin pizza tour',
25
+ // expect: {
26
+ // tool: 'show-albums',
27
+ // args: { search: expect.stringMatching(/pizza|austin/i) },
28
+ // },
29
+ // },
27
30
  ],
28
31
  });
@@ -1,4 +1,3 @@
1
- import { expect } from 'vitest';
2
1
  import { defineEval } from 'sunpeak/eval';
3
2
 
4
3
  export default defineEval({
@@ -6,21 +5,12 @@ export default defineEval({
6
5
  {
7
6
  name: 'asks for popular places',
8
7
  prompt: 'Show me popular places to visit in Tokyo',
9
- expect: {
10
- tool: 'show-carousel',
11
- args: { city: expect.stringMatching(/tokyo/i) },
12
- },
8
+ expect: { tool: 'show-carousel' },
13
9
  },
14
10
  {
15
- name: 'asks for restaurants',
16
- prompt: 'What are some good restaurants in Paris?',
17
- expect: {
18
- tool: 'show-carousel',
19
- args: {
20
- city: expect.stringMatching(/paris/i),
21
- categories: expect.arrayContaining([expect.stringMatching(/restaurant|food|dining/i)]),
22
- },
23
- },
11
+ name: 'asks for places to eat',
12
+ prompt: 'Where should I eat in Paris?',
13
+ expect: { tool: 'show-carousel' },
24
14
  },
25
15
  ],
26
16
  });
@@ -10,15 +10,15 @@ export default defineEvalConfig({
10
10
 
11
11
  models: [
12
12
  // Uncomment models and install their provider packages:
13
- // 'gpt-4o', // OPENAI_API_KEY
14
- // 'gpt-4o-mini', // OPENAI_API_KEY
15
- // 'o4-mini', // OPENAI_API_KEY
16
- // 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY
17
- // 'gemini-2.0-flash', // GOOGLE_GENERATIVE_AI_API_KEY
13
+ // 'gpt-4o', // OPENAI_API_KEY
14
+ // 'gpt-4o-mini', // OPENAI_API_KEY
15
+ // 'o4-mini', // OPENAI_API_KEY
16
+ // 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY
17
+ // 'gemini-2.0-flash', // GOOGLE_GENERATIVE_AI_API_KEY
18
18
  ],
19
19
 
20
20
  defaults: {
21
- runs: 10, // Number of times to run each case per model
21
+ runs: 5, // Number of times to run each case per model
22
22
  maxSteps: 1, // Max tool call steps per run
23
23
  temperature: 0, // 0 for most deterministic results
24
24
  timeout: 30_000, // Timeout per run in ms
@@ -1,4 +1,3 @@
1
- import { expect } from 'vitest';
2
1
  import { defineEval } from 'sunpeak/eval';
3
2
 
4
3
  export default defineEval({
@@ -6,18 +5,12 @@ export default defineEval({
6
5
  {
7
6
  name: 'asks for a map',
8
7
  prompt: 'Show me a map of coffee shops near downtown Austin',
9
- expect: {
10
- tool: 'show-map',
11
- args: { query: expect.stringMatching(/coffee/i) },
12
- },
8
+ expect: { tool: 'show-map' },
13
9
  },
14
10
  {
15
11
  name: 'asks for nearby places',
16
12
  prompt: 'Find me some parks nearby',
17
- expect: {
18
- tool: 'show-map',
19
- args: { query: expect.stringMatching(/park/i) },
20
- },
13
+ expect: { tool: 'show-map' },
21
14
  },
22
15
  ],
23
16
  });
@@ -1,4 +1,3 @@
1
- import { expect } from 'vitest';
2
1
  import { defineEval } from 'sunpeak/eval';
3
2
 
4
3
  export default defineEval({
@@ -11,10 +10,7 @@ export default defineEval({
11
10
  {
12
11
  name: 'asks to draft a social post',
13
12
  prompt: 'Write a launch announcement for X and LinkedIn',
14
- expect: {
15
- tool: 'review-post',
16
- args: { platforms: expect.arrayContaining([expect.stringMatching(/x|twitter/i)]) },
17
- },
13
+ expect: { tool: 'review-post' },
18
14
  },
19
15
  {
20
16
  name: 'asks to review a purchase',
@@ -22,27 +18,36 @@ export default defineEval({
22
18
  expect: { tool: 'review-purchase' },
23
19
  },
24
20
 
25
- // --- Other assertion patterns ---
21
+ // To also check argument extraction, add args expectations
22
+ // (import { expect } from 'vitest' to use matchers):
23
+ // {
24
+ // name: 'passes platforms',
25
+ // prompt: 'Write a launch announcement for X and LinkedIn',
26
+ // expect: {
27
+ // tool: 'review-post',
28
+ // args: { platforms: expect.arrayContaining([expect.stringMatching(/x|twitter/i)]) },
29
+ // },
30
+ // },
26
31
  //
27
32
  // Multi-step (ordered tool call sequence):
28
- // {
29
- // name: 'multi-step flow',
30
- // prompt: 'Draft a post and then review it',
31
- // maxSteps: 3,
32
- // expect: [
33
- // { tool: 'review-post' },
34
- // { tool: 'publish-post' },
35
- // ],
36
- // },
33
+ // {
34
+ // name: 'multi-step flow',
35
+ // prompt: 'Draft a post and then review it',
36
+ // maxSteps: 3,
37
+ // expect: [
38
+ // { tool: 'review-post' },
39
+ // { tool: 'publish-post' },
40
+ // ],
41
+ // },
37
42
  //
38
43
  // Custom assertion (full access to result):
39
- // {
40
- // name: 'custom check',
41
- // prompt: 'Show me my recent reviews',
42
- // assert: (result) => {
43
- // expect(result.toolCalls).toHaveLength(1);
44
- // expect(result.toolCalls[0].name).toBe('review-diff');
45
- // },
44
+ // {
45
+ // name: 'custom check',
46
+ // prompt: 'Show me my recent reviews',
47
+ // assert: (result) => {
48
+ // expect(result.toolCalls).toHaveLength(1);
49
+ // expect(result.toolCalls[0].name).toBe('review-diff');
46
50
  // },
51
+ // },
47
52
  ],
48
53
  });