npm - sunpeak - Versions diffs - 0.19.10 → 0.19.12 - Mend

sunpeak 0.19.10 → 0.19.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/README.md +1 -1
package/bin/commands/new.mjs +85 -7
package/bin/commands/test-init.mjs +92 -22
package/bin/commands/test.mjs +54 -10
package/bin/lib/eval/eval-providers.mjs +34 -0
package/bin/lib/eval/eval-runner.mjs +54 -2
package/bin/lib/eval/eval-vitest-plugin.mjs +3 -14
package/bin/lib/eval/model-registry.mjs +2 -2
package/bin/lib/inspect/inspect-config.mjs +1 -1
package/bin/lib/test/test-config.mjs +1 -1
package/package.json +11 -5
package/template/README.md +1 -1
package/template/dist/albums/albums.json +1 -1
package/template/dist/carousel/carousel.json +1 -1
package/template/dist/map/map.json +1 -1
package/template/dist/review/review.json +1 -1
package/template/node_modules/.vite/deps/_metadata.json +3 -3
package/template/node_modules/.vite-mcp/deps/_metadata.json +19 -19
package/template/tests/evals/albums.eval.ts +12 -9
package/template/tests/evals/carousel.eval.ts +4 -14
package/template/tests/evals/eval.config.ts +6 -6
package/template/tests/evals/map.eval.ts +2 -9
package/template/tests/evals/review.eval.ts +27 -22
/package/template/tests/evals/{.env.example → _env.example} +0 -0

package/README.md CHANGED Viewed

@@ -132,7 +132,7 @@ sunpeak new
 Install the sunpeak skills to give your coding agent (Claude Code, Cursor, etc.) built-in knowledge of sunpeak patterns, hooks, and testing:
 ```bash
-npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
+pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
 ```
 ## Troubleshooting

package/bin/commands/new.mjs CHANGED Viewed

@@ -9,6 +9,7 @@ const execAsync = promisify(exec);
 import * as clack from '@clack/prompts';
 import { discoverResources } from '../lib/patterns.mjs';
 import { detectPackageManager } from '../utils.mjs';
+import { EVAL_PROVIDERS } from '../lib/eval/eval-providers.mjs';
 const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -57,6 +58,21 @@ async function defaultSelectResources(availableResources) {
   return selected;
 }
+/**
+ * Default prompt for eval provider selection.
+ * @returns {Promise<Array<{ pkg: string, models: string[] }>>}
+ */
+async function defaultSelectProviders() {
+  const selected = await clack.multiselect({
+    message: 'AI providers for evals (space to toggle, enter to skip)',
+    options: EVAL_PROVIDERS.map((p) => ({ value: p, label: p.label })),
+    initialValues: [],
+    required: false,
+  });
+  if (clack.isCancel(selected)) return [];
+  return selected;
+}
 /**
  * Default dependencies (real implementations)
  */
@@ -73,6 +89,8 @@ export const defaultDeps = {
   execAsync,
   promptName: defaultPromptName,
   selectResources: defaultSelectResources,
+  selectProviders: defaultSelectProviders,
+  password: clack.password,
   confirm: clack.confirm,
   intro: clack.intro,
   outro: clack.outro,
@@ -233,6 +251,15 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
       d.renameSync(srcPath, destPath);
     }
   }
+  // Rename nested dotfiles (underscore convention for npm compatibility)
+  const nestedDotfiles = [['tests/evals/_env.example', 'tests/evals/.env.example']];
+  for (const [from, to] of nestedDotfiles) {
+    const srcPath = join(targetDir, from);
+    const destPath = join(targetDir, to);
+    if (d.existsSync(srcPath)) {
+      d.renameSync(srcPath, destPath);
+    }
+  }
   // Read sunpeak version from root package.json
   const rootPkg = JSON.parse(d.readFileSync(d.rootPkgPath, 'utf-8'));
@@ -282,6 +309,56 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
     s.stop(`Install failed. You can try running "${pm} install" manually.`);
   }
+  // Offer to configure eval providers (only in interactive mode)
+  if (resourcesArg === undefined) {
+    const providers = await d.selectProviders();
+    if (!clack.isCancel(providers) && providers.length > 0) {
+      // Install AI SDK core + selected provider packages
+      const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
+      try {
+        await d.execAsync(`${pm} add -D ${pkgsToInstall.join(' ')}`, { cwd: targetDir });
+      } catch {
+        d.console.log(`Provider install failed. Install manually: ${pm} add -D ${pkgsToInstall.join(' ')}`);
+      }
+      // Uncomment selected models in eval.config.ts
+      const evalConfigPath = join(targetDir, 'tests', 'evals', 'eval.config.ts');
+      if (d.existsSync(evalConfigPath)) {
+        let config = d.readFileSync(evalConfigPath, 'utf-8');
+        for (const p of providers) {
+          for (const model of p.models) {
+            // Uncomment lines matching this model (e.g., "    // 'gpt-4o'," → "    'gpt-4o',")
+            config = config.replace(
+              new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
+              '$1$2'
+            );
+          }
+        }
+        d.writeFileSync(evalConfigPath, config);
+      }
+      // Prompt for API keys and write .env
+      const envLines = [];
+      const seen = new Set();
+      for (const p of providers) {
+        if (seen.has(p.envVar)) continue;
+        seen.add(p.envVar);
+        const key = await d.password({
+          message: `${p.envVar} (enter to skip)`,
+          mask: '*',
+        });
+        if (!clack.isCancel(key) && key) {
+          envLines.push(`${p.envVar}=${key}`);
+        }
+      }
+      const envPath = join(targetDir, 'tests', 'evals', '.env');
+      if (envLines.length > 0) {
+        d.writeFileSync(envPath, envLines.join('\n') + '\n');
+        clack.log.info(`API keys saved to tests/evals/.env (gitignored)`);
+      }
+    }
+  }
   // Offer to install the sunpeak skills (only in interactive mode)
   if (resourcesArg === undefined) {
     const installSkill = await d.confirm({
@@ -290,18 +367,16 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
     });
     if (!clack.isCancel(installSkill) && installSkill) {
       try {
-        d.execSync('npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server', {
+        d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server', {
           cwd: targetDir,
           stdio: 'inherit',
         });
       } catch {
-        d.console.log('Skill install skipped. You can install later with: npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server');
+        d.console.log('Skill install skipped. You can install later with: pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server');
       }
     }
   }
-  const runCmd = pm === 'npm' ? 'npm run' : pm;
   d.outro(`Done! To get started:
   cd ${projectName}
@@ -309,9 +384,12 @@ export async function init(projectName, resourcesArg, deps = defaultDeps) {
 Your project commands:
-  sunpeak dev       # Start dev server + MCP endpoint
-  sunpeak build     # Build for production
-  ${runCmd} test         # Run tests`);
+  sunpeak dev                # Start dev server + MCP endpoint
+  sunpeak build              # Build for production
+  sunpeak test               # Run unit + e2e tests
+  sunpeak test --eval        # Run LLM evals (configure models in tests/evals/eval.config.ts)
+  sunpeak test --visual      # Run visual regression tests
+  sunpeak test --live        # Run live tests against real AI hosts`);
 }
 // Allow running directly

package/bin/commands/test-init.mjs CHANGED Viewed

@@ -2,11 +2,24 @@ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
 import { execSync } from 'child_process';
 import { join, dirname } from 'path';
 import * as p from '@clack/prompts';
+import { EVAL_PROVIDERS, generateModelLines } from '../lib/eval/eval-providers.mjs';
+import { detectPackageManager } from '../utils.mjs';
 /**
  * Default dependencies (real implementations).
  * Override in tests via the `deps` parameter.
  */
+async function defaultSelectProviders() {
+  const selected = await p.multiselect({
+    message: 'AI providers for evals (space to toggle, enter to skip)',
+    options: EVAL_PROVIDERS.map((prov) => ({ value: prov, label: prov.label })),
+    initialValues: [],
+    required: false,
+  });
+  if (p.isCancel(selected)) return [];
+  return selected;
+}
 export const defaultDeps = {
   existsSync,
   readFileSync,
@@ -21,6 +34,9 @@ export const defaultDeps = {
   select: p.select,
   text: p.text,
   log: p.log,
+  password: p.password,
+  selectProviders: defaultSelectProviders,
+  detectPackageManager,
 };
 /**
@@ -60,6 +76,61 @@ export async function testInit(args = [], deps = defaultDeps) {
     await initExternalProject(cliServer, d);
   }
+  // Offer to configure eval providers
+  const providers = await d.selectProviders();
+  if (!d.isCancel(providers) && providers.length > 0) {
+    const pm = d.detectPackageManager();
+    const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
+    const installCmd = `${pm} add -D ${pkgsToInstall.join(' ')}`;
+    try {
+      d.execSync(installCmd, { cwd: d.cwd(), stdio: 'inherit' });
+    } catch {
+      d.log.info(`Provider install failed. Install manually: ${installCmd}`);
+    }
+    // Uncomment selected models in eval.config.ts
+    const evalDir = d.existsSync(join(d.cwd(), 'tests', 'evals'))
+      ? join(d.cwd(), 'tests', 'evals')
+      : d.existsSync(join(d.cwd(), 'tests', 'sunpeak', 'evals'))
+        ? join(d.cwd(), 'tests', 'sunpeak', 'evals')
+        : null;
+    if (evalDir) {
+      const configPath = join(evalDir, 'eval.config.ts');
+      if (d.existsSync(configPath)) {
+        let config = d.readFileSync(configPath, 'utf-8');
+        for (const prov of providers) {
+          for (const model of prov.models) {
+            config = config.replace(
+              new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
+              '$1$2'
+            );
+          }
+        }
+        d.writeFileSync(configPath, config);
+      }
+      // Prompt for API keys and write .env
+      const envLines = [];
+      const seen = new Set();
+      for (const prov of providers) {
+        if (seen.has(prov.envVar)) continue;
+        seen.add(prov.envVar);
+        const key = await d.password({
+          message: `${prov.envVar} (enter to skip)`,
+          mask: '*',
+        });
+        if (!d.isCancel(key) && key) {
+          envLines.push(`${prov.envVar}=${key}`);
+        }
+      }
+      if (envLines.length > 0 && evalDir) {
+        const relEnvPath = evalDir.startsWith(d.cwd()) ? evalDir.slice(d.cwd().length + 1) : evalDir;
+        d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
+        d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
+      }
+    }
+  }
   // Offer to install the testing skill
   const installSkill = await d.confirm({
     message: 'Install the test-mcp-server skill? (helps your coding agent write tests)',
@@ -67,12 +138,12 @@ export async function testInit(args = [], deps = defaultDeps) {
   });
   if (!d.isCancel(installSkill) && installSkill) {
     try {
-      d.execSync('npx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
+      d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
         cwd: d.cwd(),
         stdio: 'inherit',
       });
     } catch {
-      d.log.info('Skill install skipped. Install later: npx skills add Sunpeak-AI/sunpeak@test-mcp-server');
+      d.log.info('Skill install skipped. Install later: pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server');
     }
   }
@@ -205,15 +276,11 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
     "",
     "  models: [",
     "    // Uncomment models and install their provider packages:",
-    "    // 'gpt-4o',                      // OPENAI_API_KEY",
-    "    // 'gpt-4o-mini',                 // OPENAI_API_KEY",
-    "    // 'o4-mini',                     // OPENAI_API_KEY",
-    "    // 'claude-sonnet-4-20250514',    // ANTHROPIC_API_KEY",
-    "    // 'gemini-2.0-flash',            // GOOGLE_GENERATIVE_AI_API_KEY",
+    ...generateModelLines(),
     "  ],",
     "",
     "  defaults: {",
-    "    runs: 10,          // Number of times to run each case per model",
+    "    runs: 5,           // Number of times to run each case per model",
     "    maxSteps: 1,       // Max tool call steps per run",
     "    temperature: 0,    // 0 for most deterministic results",
     "    timeout: 30_000,   // Timeout per run in ms",
@@ -244,7 +311,7 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
  *
  * To get started:
  * 1. Configure models in eval.config.ts (uncomment the ones you want)
- * 2. Install the AI SDK and provider packages: pnpm add ai @ai-sdk/openai
+ * 2. Install the AI SDK and provider packages (e.g. pnpm add ai @ai-sdk/openai)
  * 3. Copy .env.example to .env and add your API keys
  * 4. Replace this file with evals for your own tools
  * 5. Run: sunpeak test --eval
@@ -489,6 +556,7 @@ async function initExternalProject(cliServer, d) {
         private: true,
         type: 'module',
         devDependencies: {
+          '@types/node': 'latest',
           sunpeak: 'latest',
           '@playwright/test': 'latest',
         },
@@ -564,14 +632,15 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
   d.log.success('Created tests/sunpeak/ with all test types.');
   d.log.step('Next steps:');
+  const pm = d.detectPackageManager();
   d.log.message('  cd tests/sunpeak');
-  d.log.message('  npm install');
-  d.log.message('  npx playwright install chromium');
+  d.log.message(`  ${pm} install`);
+  d.log.message(`  ${pm} exec playwright install chromium`);
   d.log.message('');
-  d.log.message('  npx sunpeak test              # E2E tests');
-  d.log.message('  npx sunpeak test --visual      # Visual regression (generates baselines on first run)');
-  d.log.message('  npx sunpeak test --live         # Live tests against real hosts (requires login)');
-  d.log.message('  npx sunpeak test --eval         # Multi-model evals (configure models in evals/eval.config.ts)');
+  d.log.message('  sunpeak test              # E2E tests');
+  d.log.message('  sunpeak test --visual      # Visual regression (generates baselines on first run)');
+  d.log.message('  sunpeak test --live         # Live tests against real hosts (requires login)');
+  d.log.message('  sunpeak test --eval         # Multi-model evals (configure models in evals/eval.config.ts)');
 }
 async function initJsProject(cliServer, d) {
@@ -638,15 +707,16 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
   // 5. Unit test
   scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
+  const pkgMgr = d.detectPackageManager();
   d.log.step('Next steps:');
-  d.log.message('  npm install -D sunpeak @playwright/test vitest');
-  d.log.message('  npx playwright install chromium');
+  d.log.message(`  ${pkgMgr} add -D sunpeak @playwright/test vitest`);
+  d.log.message(`  ${pkgMgr} exec playwright install chromium`);
   d.log.message('');
-  d.log.message('  npx sunpeak test              # E2E tests');
-  d.log.message('  npx sunpeak test --unit        # Unit tests (vitest)');
-  d.log.message('  npx sunpeak test --visual      # Visual regression');
-  d.log.message('  npx sunpeak test --live         # Live tests against real hosts');
-  d.log.message('  npx sunpeak test --eval         # Multi-model evals');
+  d.log.message('  sunpeak test              # E2E tests');
+  d.log.message('  sunpeak test --unit        # Unit tests (vitest)');
+  d.log.message('  sunpeak test --visual      # Visual regression');
+  d.log.message('  sunpeak test --live         # Live tests against real hosts');
+  d.log.message('  sunpeak test --eval         # Multi-model evals');
 }
 async function initSunpeakProject(d) {

package/bin/commands/test.mjs CHANGED Viewed

@@ -310,14 +310,43 @@ function validateApiKeys(models) {
   return warnings;
 }
+/**
+ * Check that required AI SDK provider packages are installed for the configured models.
+ * @param {string[]} models
+ * @returns {Promise<Array<{ pkg: string, reason: string }>>}
+ */
+async function checkProviderPackages(models) {
+  const { createRequire } = await import('module');
+  // Resolve from the project's node_modules, not the global CLI install
+  const require = createRequire(join(process.cwd(), 'package.json'));
+  const missing = [];
+  const checked = new Set();
+  try {
+    require.resolve('ai');
+  } catch {
+    missing.push({ pkg: 'ai', reason: 'core AI SDK' });
+  }
+  for (const modelId of models) {
+    const pkg = getProviderForModel(modelId);
+    if (!pkg || checked.has(pkg)) continue;
+    checked.add(pkg);
+    try {
+      require.resolve(pkg);
+    } catch {
+      missing.push({ pkg, reason: modelId });
+    }
+  }
+  return missing;
+}
 /**
  * Run eval tests via vitest with the eval plugin.
  */
 async function runEvals(args) {
-  const { dirname, resolve, basename } = await import('path');
-  const { fileURLToPath } = await import('url');
-  const __dirname = dirname(fileURLToPath(import.meta.url));
+  const { resolve, basename } = await import('path');
   const evalDir = findEvalDir();
   if (!evalDir) {
@@ -365,8 +394,19 @@ async function runEvals(args) {
     }
   }
-  // Validate API keys for configured models
+  // Check for missing provider packages and API keys
   if (configModels && configModels.length > 0) {
+    const missingPkgs = await checkProviderPackages(configModels);
+    if (missingPkgs.length > 0) {
+      console.error('\nMissing required packages for eval models:\n');
+      for (const { pkg, reason } of missingPkgs) {
+        console.error(`  ${pkg}  (needed for ${reason})`);
+      }
+      const installCmd = missingPkgs.map((m) => m.pkg).join(' ');
+      console.error(`\nInstall with:\n\n  pnpm add -D ${installCmd}\n`);
+      return 1;
+    }
     const warnings = validateApiKeys(configModels);
     if (warnings.length > 0) {
       console.log('');
@@ -416,8 +456,10 @@ async function runEvals(args) {
   // Use .ts extension so vitest handles TypeScript imports natively
   const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');
-  const evalPluginPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-vitest-plugin.mjs');
-  const evalReporterPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-reporter.mjs');
+  // Use package exports so vitest resolves from the project's node_modules,
+  // not the global CLI install. This ensures import('ai') finds project-local deps.
+  const evalPluginImport = 'sunpeak/eval/plugin';
+  const evalReporterImport = 'sunpeak/eval/reporter';
   // Clean up dev server and temp config
   const cleanupResources = () => {
@@ -442,17 +484,19 @@ async function runEvals(args) {
     ? './' + basename(configFile)
     : null;
-  // Generate a vitest config that loads the eval config and plugin
+  // Generate a vitest config that loads the eval config and plugin.
+  // Imports use package names (sunpeak/eval/plugin) so vitest resolves from
+  // the project's node_modules, ensuring import('ai') finds project-local deps.
   const vitestConfig = `
 import { defineConfig } from 'vitest/config';
-import { evalVitestPlugin } from ${JSON.stringify(evalPluginPath)};
+import { evalVitestPlugin } from '${evalPluginImport}';
 ${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}
 export default defineConfig({
   test: {
     globals: true,
     include: ['**/*.eval.ts', '**/*.eval.js'],
-    reporters: ['default', ${JSON.stringify(evalReporterPath)}],
+    reporters: ['default', '${evalReporterImport}'],
     testTimeout: 600000,
   },
   plugins: [

package/bin/lib/eval/eval-providers.mjs ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * Shared eval provider definitions used by both `sunpeak new` and `sunpeak test init`.
+ * Single source of truth for provider packages, model IDs, and CLI labels.
+ */
+export const EVAL_PROVIDERS = [
+  { pkg: '@ai-sdk/openai', models: ['gpt-4o', 'gpt-4o-mini'], label: 'OpenAI       (gpt-4o, gpt-4o-mini)', envVar: 'OPENAI_API_KEY' },
+  { pkg: '@ai-sdk/anthropic', models: ['claude-sonnet-4-20250514'], label: 'Anthropic    (claude-sonnet-4)', envVar: 'ANTHROPIC_API_KEY' },
+  { pkg: '@ai-sdk/google', models: ['gemini-2.0-flash'], label: 'Google       (gemini-2.0-flash)', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
+];
+/**
+ * All model lines that appear in eval.config.ts, in order.
+ * Used by scaffoldEvals to generate the config and by the uncomment logic.
+ */
+export const EVAL_CONFIG_MODELS = [
+  { id: 'gpt-4o', envVar: 'OPENAI_API_KEY' },
+  { id: 'gpt-4o-mini', envVar: 'OPENAI_API_KEY' },
+  { id: 'o4-mini', envVar: 'OPENAI_API_KEY' },
+  { id: 'claude-sonnet-4-20250514', envVar: 'ANTHROPIC_API_KEY' },
+  { id: 'gemini-2.0-flash', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
+];
+/**
+ * Generate the models section lines for eval.config.ts.
+ * @returns {string[]}
+ */
+export function generateModelLines() {
+  const maxIdLen = Math.max(...EVAL_CONFIG_MODELS.map((m) => m.id.length));
+  return EVAL_CONFIG_MODELS.map((m) => {
+    const padded = `'${m.id}',`.padEnd(maxIdLen + 3);
+    return `    // ${padded} // ${m.envVar}`;
+  });
+}

package/bin/lib/eval/eval-runner.mjs CHANGED Viewed

@@ -8,6 +8,43 @@ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
 // Re-export for use in generated test code
 export { checkAiSdkInstalled };
+/**
+ * Strip AI SDK retry wrapper from error messages for cleaner output.
+ * "Failed after 3 attempts. Last error: <actual error>" → "<actual error>"
+ * @param {string} message
+ * @returns {string}
+ */
+export function cleanErrorMessage(message) {
+  return message.replace(/^Failed after \d+ attempts?\. Last error: /i, '');
+}
+/**
+ * Check if an error message indicates a fatal API error that won't resolve on retry.
+ * @param {string} message
+ * @returns {boolean}
+ */
+export function isFatalApiError(message) {
+  const lower = message.toLowerCase();
+  const patterns = [
+    'exceeded your current quota',
+    'credit balance is too low',
+    'insufficient_quota',
+    'billing_hard_limit_reached',
+    'check your plan and billing details',
+    'add a payment method',
+    'invalid api key',
+    'invalid_api_key',
+    'incorrect api key',
+    'unauthorized',
+    'permission denied',
+    'access denied',
+    'authentication failed',
+    'account deactivated',
+    'account suspended',
+  ];
+  return patterns.some((p) => lower.includes(p));
+}
 /**
  * Define an eval spec. Identity function for type safety.
  * @param {import('./eval-types.d.mts').EvalSpec} spec
@@ -112,6 +149,7 @@ export async function runSingleEval({ prompt, model, tools, maxSteps, temperatur
     prompt,
     maxSteps,
     temperature,
+    maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
     abortSignal: AbortSignal.timeout(timeout),
   });
@@ -268,10 +306,15 @@ export async function runEvalCaseAggregate({
   const model = await resolveModel(modelId);
   let passed = 0;
   let failed = 0;
+  let executedRuns = 0;
   let totalDurationMs = 0;
   const failureMap = new Map();
   for (let i = 0; i < runs; i++) {
+    // Small delay between runs to avoid rate limits (skip before first run)
+    if (i > 0) await new Promise((r) => setTimeout(r, 1000));
+    executedRuns++;
     const start = performance.now();
     try {
       const result = await runSingleEval({
@@ -286,8 +329,17 @@ export async function runEvalCaseAggregate({
       passed++;
     } catch (err) {
       failed++;
-      const msg = err.message || String(err);
+      const msg = cleanErrorMessage(err.message || String(err));
       failureMap.set(msg, (failureMap.get(msg) || 0) + 1);
+      if (isFatalApiError(msg)) {
+        // Count remaining runs as failed and stop early
+        const remaining = runs - i - 1;
+        failed += remaining;
+        failureMap.set(msg, (failureMap.get(msg) || 0) + remaining);
+        totalDurationMs += performance.now() - start;
+        break;
+      }
     }
     totalDurationMs += performance.now() - start;
   }
@@ -304,7 +356,7 @@ export async function runEvalCaseAggregate({
     passed,
     failed,
     passRate: runs > 0 ? passed / runs : 0,
-    avgDurationMs: runs > 0 ? totalDurationMs / runs : 0,
+    avgDurationMs: executedRuns > 0 ? totalDurationMs / executedRuns : 0,
     failures,
   };
 }

package/bin/lib/eval/eval-vitest-plugin.mjs CHANGED Viewed

@@ -14,7 +14,6 @@
 import { readFileSync } from 'fs';
 import { basename } from 'path';
-import { fileURLToPath } from 'url';
 const EVAL_RE = /\.eval\.[tj]s$/;
 const VIRTUAL_PREFIX = '\0sunpeak-eval-spec:';
@@ -61,11 +60,12 @@ export function evalVitestPlugin({ server, models, defaults }) {
       virtualToReal.set(virtualId, id);
       const testName = basename(id).replace(EVAL_RE, '');
-      const runnerPath = resolveRunnerPath();
+      // Import from 'sunpeak/eval' (package export) so vitest resolves from
+      // the project's node_modules, not the global CLI install.
       const transformed = `
 import { describe, it, beforeAll, afterAll } from 'vitest';
-import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from '${runnerPath}';
+import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from 'sunpeak/eval';
 // Import the original eval spec via virtual module (bypasses this transform)
 import evalSpec from ${JSON.stringify(virtualId)};
@@ -145,14 +145,3 @@ describe.skipIf(shouldSkip)(${JSON.stringify(testName)}, () => {
   };
 }
-/**
- * Get the absolute path to the eval-runner module.
- */
-function resolveRunnerPath() {
-  const url = new URL('./eval-runner.mjs', import.meta.url);
-  // fileURLToPath requires file:// scheme; fall back to pathname for other schemes (e.g., vitest)
-  if (url.protocol === 'file:') {
-    return fileURLToPath(url);
-  }
-  return url.pathname;
-}

package/bin/lib/eval/model-registry.mjs CHANGED Viewed

@@ -36,7 +36,7 @@ export async function resolveModel(modelId) {
     provider = await import(pkg);
   } catch {
     throw new Error(
-      `Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n  npm install ${pkg}  (or pnpm add / yarn add)\n`
+      `Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n  pnpm add -D ${pkg}\n`
     );
   }
@@ -67,7 +67,7 @@ export async function checkAiSdkInstalled() {
     await import('ai');
   } catch {
     throw new Error(
-      'The "ai" package is not installed. Install it to use evals:\n\n  npm install ai  (or pnpm add / yarn add)\n'
+      'The "ai" package is not installed. Install it to use evals:\n\n  pnpm add -D ai\n'
     );
   }
 }

package/bin/lib/inspect/inspect-config.mjs CHANGED Viewed

@@ -49,7 +49,7 @@ export function defineInspectConfig(options) {
   const serverArg = server.includes(' ') ? `"${server}"` : server;
   const command = [
     `SUNPEAK_SANDBOX_PORT=${sandboxPort}`,
-    'npx sunpeak inspect',
+    'sunpeak inspect',
     `--server ${serverArg}`,
     ...(simulationsDir ? [`--simulations ${simulationsDir}`] : []),
     `--port ${port}`,

package/bin/lib/test/test-config.mjs CHANGED Viewed

@@ -103,7 +103,7 @@ function buildInspectCommand({ server, port, sandboxPort, simulationsDir }) {
     }
   }
-  parts.push('npx sunpeak inspect');
+  parts.push('sunpeak inspect');
   if (server.url) {
     parts.push(`--server ${server.url}`);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sunpeak",
-  "version": "0.19.10",
+  "version": "0.19.12",
   "description": "Inspector, testing framework, and app framework for MCP Apps.",
   "type": "module",
   "main": "./dist/index.cjs",
@@ -102,6 +102,12 @@
         "default": "./bin/lib/eval/eval-runner.mjs"
       }
     },
+    "./eval/plugin": {
+      "import": "./bin/lib/eval/eval-vitest-plugin.mjs"
+    },
+    "./eval/reporter": {
+      "import": "./bin/lib/eval/eval-reporter.mjs"
+    },
     "./package.json": "./package.json"
   },
   "bin": {
@@ -138,10 +144,10 @@
   "peerDependencies": {
     "react": "^18.0.0 || ^19.0.0",
     "react-dom": "^18.0.0 || ^19.0.0",
-    "ai": "^4.0.0",
-    "@ai-sdk/openai": "^1.0.0",
-    "@ai-sdk/anthropic": "^1.0.0",
-    "@ai-sdk/google": "^1.0.0"
+    "ai": "^4.0.0 || ^5.0.0 || ^6.0.0",
+    "@ai-sdk/openai": "^1.0.0 || ^2.0.0 || ^3.0.0",
+    "@ai-sdk/anthropic": "^1.0.0 || ^2.0.0 || ^3.0.0",
+    "@ai-sdk/google": "^1.0.0 || ^2.0.0 || ^3.0.0"
   },
   "peerDependenciesMeta": {
     "ai": {

package/template/README.md CHANGED Viewed

@@ -155,7 +155,7 @@ Then create a tool file in `src/tools/` and simulation file(s) in `tests/simulat
 Install the sunpeak skills to give your coding agent built-in knowledge of sunpeak patterns, hooks, and testing:
 ```bash
-npx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
+pnpm dlx skills add Sunpeak-AI/sunpeak@create-sunpeak-app Sunpeak-AI/sunpeak@test-mcp-server
 ```
 ## Troubleshooting

package/template/dist/albums/albums.json CHANGED Viewed

@@ -12,5 +12,5 @@
     }
   },
   "name": "albums",
-  "uri": "ui://albums-mnszmwah"
+  "uri": "ui://albums-mntel8dx"
 }

package/template/dist/carousel/carousel.json CHANGED Viewed

@@ -12,5 +12,5 @@
     }
   },
   "name": "carousel",
-  "uri": "ui://carousel-mnszmwah"
+  "uri": "ui://carousel-mntel8dx"
 }

package/template/dist/map/map.json CHANGED Viewed

@@ -18,5 +18,5 @@
     }
   },
   "name": "map",
-  "uri": "ui://map-mnszmwah"
+  "uri": "ui://map-mntel8dx"
 }

package/template/dist/review/review.json CHANGED Viewed

@@ -12,5 +12,5 @@
     }
   },
   "name": "review",
-  "uri": "ui://review-mnszmwah"
+  "uri": "ui://review-mntel8dx"
 }

package/template/node_modules/.vite/deps/_metadata.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-  "hash": "c19ea8a1",
+  "hash": "545a8631",
   "configHash": "a598103f",
-  "lockfileHash": "18567c78",
-  "browserHash": "34bbafa0",
+  "lockfileHash": "0deaaac3",
+  "browserHash": "a695eb93",
   "optimized": {},
   "chunks": {}
 }

package/template/node_modules/.vite-mcp/deps/_metadata.json CHANGED Viewed

@@ -1,103 +1,103 @@
 {
-  "hash": "4280f302",
+  "hash": "9e921320",
   "configHash": "b5321072",
-  "lockfileHash": "18567c78",
-  "browserHash": "cea8bcfe",
+  "lockfileHash": "0deaaac3",
+  "browserHash": "68384581",
   "optimized": {
     "@modelcontextprotocol/ext-apps": {
       "src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/app.js",
       "file": "@modelcontextprotocol_ext-apps.js",
-      "fileHash": "073455d6",
+      "fileHash": "e681afc2",
       "needsInterop": false
     },
     "@modelcontextprotocol/ext-apps/app-bridge": {
       "src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/app-bridge.js",
       "file": "@modelcontextprotocol_ext-apps_app-bridge.js",
-      "fileHash": "b1b64fca",
+      "fileHash": "71abff0a",
       "needsInterop": false
     },
     "@modelcontextprotocol/ext-apps/react": {
       "src": "../../../../../../node_modules/.pnpm/@modelcontextprotocol+ext-apps@1.5.0_@modelcontextprotocol+sdk@1.29.0_zod@4.3.6__react-_f4871531d9cf52c692eb6edc1ee416ef/node_modules/@modelcontextprotocol/ext-apps/dist/src/react/index.js",
       "file": "@modelcontextprotocol_ext-apps_react.js",
-      "fileHash": "2138e39d",
+      "fileHash": "a6d7f230",
       "needsInterop": false
     },
     "@testing-library/react": {
       "src": "../../../../../../node_modules/.pnpm/@testing-library+react@16.3.2_@testing-library+dom@10.4.1_@types+react-dom@19.2.3_@type_893f466751a7d66081fd06e9edb9241a/node_modules/@testing-library/react/dist/@testing-library/react.esm.js",
       "file": "@testing-library_react.js",
-      "fileHash": "351b9ceb",
+      "fileHash": "2b1ba0cb",
       "needsInterop": false
     },
     "clsx": {
       "src": "../../../../../../node_modules/.pnpm/clsx@2.1.1/node_modules/clsx/dist/clsx.mjs",
       "file": "clsx.js",
-      "fileHash": "21018c6f",
+      "fileHash": "207734d0",
       "needsInterop": false
     },
     "embla-carousel-react": {
       "src": "../../../../../../node_modules/.pnpm/embla-carousel-react@8.6.0_react@19.2.4/node_modules/embla-carousel-react/esm/embla-carousel-react.esm.js",
       "file": "embla-carousel-react.js",
-      "fileHash": "debc6259",
+      "fileHash": "20692215",
       "needsInterop": false
     },
     "embla-carousel-wheel-gestures": {
       "src": "../../../../../../node_modules/.pnpm/embla-carousel-wheel-gestures@8.1.0_embla-carousel@8.6.0/node_modules/embla-carousel-wheel-gestures/dist/embla-carousel-wheel-gestures.esm.js",
       "file": "embla-carousel-wheel-gestures.js",
-      "fileHash": "b8f353af",
+      "fileHash": "c4fef2a7",
       "needsInterop": false
     },
     "mapbox-gl": {
       "src": "../../../../../../node_modules/.pnpm/mapbox-gl@3.21.0/node_modules/mapbox-gl/dist/mapbox-gl.js",
       "file": "mapbox-gl.js",
-      "fileHash": "926d0d9a",
+      "fileHash": "ec7a0faf",
       "needsInterop": true
     },
     "react-dom": {
       "src": "../../../../../../node_modules/.pnpm/react-dom@19.2.4_react@19.2.4/node_modules/react-dom/index.js",
       "file": "react-dom.js",
-      "fileHash": "9347dadf",
+      "fileHash": "ea2e1e20",
       "needsInterop": true
     },
     "react-dom/client": {
       "src": "../../../../../../node_modules/.pnpm/react-dom@19.2.4_react@19.2.4/node_modules/react-dom/client.js",
       "file": "react-dom_client.js",
-      "fileHash": "77abbe7c",
+      "fileHash": "28137cda",
       "needsInterop": true
     },
     "react": {
       "src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/index.js",
       "file": "react.js",
-      "fileHash": "d75fddcd",
+      "fileHash": "7af1c490",
       "needsInterop": true
     },
     "react/jsx-dev-runtime": {
       "src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/jsx-dev-runtime.js",
       "file": "react_jsx-dev-runtime.js",
-      "fileHash": "d8e94b8c",
+      "fileHash": "9f5cf3bd",
       "needsInterop": true
     },
     "react/jsx-runtime": {
       "src": "../../../../../../node_modules/.pnpm/react@19.2.4/node_modules/react/jsx-runtime.js",
       "file": "react_jsx-runtime.js",
-      "fileHash": "a61a105d",
+      "fileHash": "b414b6b3",
       "needsInterop": true
     },
     "tailwind-merge": {
       "src": "../../../../../../node_modules/.pnpm/tailwind-merge@3.5.0/node_modules/tailwind-merge/dist/bundle-mjs.mjs",
       "file": "tailwind-merge.js",
-      "fileHash": "0608e35a",
+      "fileHash": "999e54e9",
       "needsInterop": false
     },
     "vitest": {
       "src": "../../../../../../node_modules/.pnpm/vitest@4.1.2_@opentelemetry+api@1.9.0_@types+node@25.5.2_happy-dom@18.0.1_jsdom@29.0.1__cf001624456b7837f7b99c4ef7b4c061/node_modules/vitest/dist/index.js",
       "file": "vitest.js",
-      "fileHash": "11d2208b",
+      "fileHash": "a4111eed",
       "needsInterop": false
     },
     "zod": {
       "src": "../../../../../../node_modules/.pnpm/zod@4.3.6/node_modules/zod/index.js",
       "file": "zod.js",
-      "fileHash": "06047af6",
+      "fileHash": "eb1d38b2",
       "needsInterop": false
     }
   },

package/template/tests/evals/albums.eval.ts CHANGED Viewed

@@ -1,4 +1,3 @@
-import { expect } from 'vitest';
 import { defineEval } from 'sunpeak/eval';
 export default defineEval({
@@ -11,18 +10,22 @@ export default defineEval({
     {
       name: 'asks for food photos',
       prompt: 'Show me photos from my Austin pizza tour',
-      expect: {
-        tool: 'show-albums',
-        args: { search: expect.stringMatching(/pizza|austin/i) },
-      },
+      expect: { tool: 'show-albums' },
     },
     {
       name: 'asks for a specific category',
       prompt: 'Show me my travel photos',
-      expect: {
-        tool: 'show-albums',
-        args: { category: expect.stringMatching(/travel/i) },
-      },
+      expect: { tool: 'show-albums' },
     },
+    // To also check argument extraction, add args expectations
+    // (import { expect } from 'vitest' to use matchers):
+    // {
+    //   name: 'passes search term',
+    //   prompt: 'Show me photos from my Austin pizza tour',
+    //   expect: {
+    //     tool: 'show-albums',
+    //     args: { search: expect.stringMatching(/pizza|austin/i) },
+    //   },
+    // },
   ],
 });

package/template/tests/evals/carousel.eval.ts CHANGED Viewed

@@ -1,4 +1,3 @@
-import { expect } from 'vitest';
 import { defineEval } from 'sunpeak/eval';
 export default defineEval({
@@ -6,21 +5,12 @@ export default defineEval({
     {
       name: 'asks for popular places',
       prompt: 'Show me popular places to visit in Tokyo',
-      expect: {
-        tool: 'show-carousel',
-        args: { city: expect.stringMatching(/tokyo/i) },
-      },
+      expect: { tool: 'show-carousel' },
     },
     {
-      name: 'asks for restaurants',
-      prompt: 'What are some good restaurants in Paris?',
-      expect: {
-        tool: 'show-carousel',
-        args: {
-          city: expect.stringMatching(/paris/i),
-          categories: expect.arrayContaining([expect.stringMatching(/restaurant|food|dining/i)]),
-        },
-      },
+      name: 'asks for places to eat',
+      prompt: 'Where should I eat in Paris?',
+      expect: { tool: 'show-carousel' },
     },
   ],
 });

package/template/tests/evals/eval.config.ts CHANGED Viewed

@@ -10,15 +10,15 @@ export default defineEvalConfig({
   models: [
     // Uncomment models and install their provider packages:
-    // 'gpt-4o',                      // OPENAI_API_KEY
-    // 'gpt-4o-mini',                 // OPENAI_API_KEY
-    // 'o4-mini',                     // OPENAI_API_KEY
-    // 'claude-sonnet-4-20250514',    // ANTHROPIC_API_KEY
-    // 'gemini-2.0-flash',            // GOOGLE_GENERATIVE_AI_API_KEY
+    // 'gpt-4o',                   // OPENAI_API_KEY
+    // 'gpt-4o-mini',              // OPENAI_API_KEY
+    // 'o4-mini',                  // OPENAI_API_KEY
+    // 'claude-sonnet-4-20250514', // ANTHROPIC_API_KEY
+    // 'gemini-2.0-flash',         // GOOGLE_GENERATIVE_AI_API_KEY
   ],
   defaults: {
-    runs: 10, // Number of times to run each case per model
+    runs: 5, // Number of times to run each case per model
     maxSteps: 1, // Max tool call steps per run
     temperature: 0, // 0 for most deterministic results
     timeout: 30_000, // Timeout per run in ms

package/template/tests/evals/map.eval.ts CHANGED Viewed

@@ -1,4 +1,3 @@
-import { expect } from 'vitest';
 import { defineEval } from 'sunpeak/eval';
 export default defineEval({
@@ -6,18 +5,12 @@ export default defineEval({
     {
       name: 'asks for a map',
       prompt: 'Show me a map of coffee shops near downtown Austin',
-      expect: {
-        tool: 'show-map',
-        args: { query: expect.stringMatching(/coffee/i) },
-      },
+      expect: { tool: 'show-map' },
     },
     {
       name: 'asks for nearby places',
       prompt: 'Find me some parks nearby',
-      expect: {
-        tool: 'show-map',
-        args: { query: expect.stringMatching(/park/i) },
-      },
+      expect: { tool: 'show-map' },
     },
   ],
 });

package/template/tests/evals/review.eval.ts CHANGED Viewed

@@ -1,4 +1,3 @@
-import { expect } from 'vitest';
 import { defineEval } from 'sunpeak/eval';
 export default defineEval({
@@ -11,10 +10,7 @@ export default defineEval({
     {
       name: 'asks to draft a social post',
       prompt: 'Write a launch announcement for X and LinkedIn',
-      expect: {
-        tool: 'review-post',
-        args: { platforms: expect.arrayContaining([expect.stringMatching(/x|twitter/i)]) },
-      },
+      expect: { tool: 'review-post' },
     },
     {
       name: 'asks to review a purchase',
@@ -22,27 +18,36 @@ export default defineEval({
       expect: { tool: 'review-purchase' },
     },
-    // --- Other assertion patterns ---
+    // To also check argument extraction, add args expectations
+    // (import { expect } from 'vitest' to use matchers):
+    // {
+    //   name: 'passes platforms',
+    //   prompt: 'Write a launch announcement for X and LinkedIn',
+    //   expect: {
+    //     tool: 'review-post',
+    //     args: { platforms: expect.arrayContaining([expect.stringMatching(/x|twitter/i)]) },
+    //   },
+    // },
     //
     // Multi-step (ordered tool call sequence):
-    //   {
-    //     name: 'multi-step flow',
-    //     prompt: 'Draft a post and then review it',
-    //     maxSteps: 3,
-    //     expect: [
-    //       { tool: 'review-post' },
-    //       { tool: 'publish-post' },
-    //     ],
-    //   },
+    // {
+    //   name: 'multi-step flow',
+    //   prompt: 'Draft a post and then review it',
+    //   maxSteps: 3,
+    //   expect: [
+    //     { tool: 'review-post' },
+    //     { tool: 'publish-post' },
+    //   ],
+    // },
     //
     // Custom assertion (full access to result):
-    //   {
-    //     name: 'custom check',
-    //     prompt: 'Show me my recent reviews',
-    //     assert: (result) => {
-    //       expect(result.toolCalls).toHaveLength(1);
-    //       expect(result.toolCalls[0].name).toBe('review-diff');
-    //     },
+    // {
+    //   name: 'custom check',
+    //   prompt: 'Show me my recent reviews',
+    //   assert: (result) => {
+    //     expect(result.toolCalls).toHaveLength(1);
+    //     expect(result.toolCalls[0].name).toBe('review-diff');
     //   },
+    // },
   ],
 });

/package/template/tests/evals/{.env.example → _env.example} RENAMED Viewed

File without changes