npm - sunpeak - Versions diffs - 0.19.10 → 0.20.1 - Mend

sunpeak 0.19.10 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94)

package/README.md +3 -3
package/bin/commands/inspect.mjs +321 -6
package/bin/commands/new.mjs +85 -7
package/bin/commands/test-init.mjs +190 -59
package/bin/commands/test.mjs +60 -10
package/bin/lib/eval/eval-providers.mjs +34 -0
package/bin/lib/eval/eval-runner.mjs +54 -2
package/bin/lib/eval/eval-vitest-plugin.mjs +3 -14
package/bin/lib/eval/model-registry.mjs +2 -2
package/bin/lib/inspect/inspect-config.mjs +16 -1
package/bin/lib/inspect/inspect-server.d.mts +32 -0
package/bin/lib/inspect/inspect-server.mjs +11 -0
package/bin/lib/resolve-bin.mjs +39 -0
package/bin/lib/test/base-config.mjs +3 -2
package/bin/lib/test/matchers.mjs +2 -2
package/bin/lib/test/test-config.mjs +18 -7
package/bin/lib/test/test-fixtures.d.mts +52 -92
package/bin/lib/test/test-fixtures.mjs +174 -147
package/dist/chatgpt/index.cjs +1 -1
package/dist/chatgpt/index.js +1 -1
package/dist/claude/index.cjs +1 -1
package/dist/claude/index.js +1 -1
package/dist/host/chatgpt/index.cjs +1 -1
package/dist/host/chatgpt/index.js +1 -1
package/dist/index.cjs +4 -4
package/dist/index.cjs.map +1 -1
package/dist/index.js +3 -3
package/dist/index.js.map +1 -1
package/dist/inspector/index.cjs +1 -1
package/dist/inspector/index.js +1 -1
package/dist/{inspector-D5DckQuU.js → inspector-BBDa5yCm.js} +57 -23
package/dist/inspector-BBDa5yCm.js.map +1 -0
package/dist/{inspector-jY9O18z9.cjs → inspector-DAA1Wiyh.cjs} +58 -24
package/dist/inspector-DAA1Wiyh.cjs.map +1 -0
package/dist/lib/discovery-cli.cjs +1 -1
package/dist/mcp/index.cjs +22 -25
package/dist/mcp/index.cjs.map +1 -1
package/dist/mcp/index.js +19 -22
package/dist/mcp/index.js.map +1 -1
package/dist/{use-app-Bfargfa3.js → use-app-Cr0auUa1.js} +2 -2
package/dist/{use-app-Bfargfa3.js.map → use-app-Cr0auUa1.js.map} +1 -1
package/dist/{use-app-CbsBEmwv.cjs → use-app-DPkj5Jp_.cjs} +2 -2
package/dist/{use-app-CbsBEmwv.cjs.map → use-app-DPkj5Jp_.cjs.map} +1 -1
package/package.json +27 -15
package/template/README.md +1 -1
package/template/dist/albums/albums.html +4 -4
package/template/dist/albums/albums.json +1 -1
package/template/dist/carousel/carousel.html +4 -4
package/template/dist/carousel/carousel.json +1 -1
package/template/dist/map/map.html +6 -6
package/template/dist/map/map.json +1 -1
package/template/dist/review/review.html +4 -4
package/template/dist/review/review.json +1 -1
package/template/node_modules/.bin/vite +2 -2
package/template/node_modules/.bin/vitest +2 -2
package/template/node_modules/.vite/deps/_metadata.json +4 -4
package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js +1 -1
package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js +1 -1
package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js +1 -1
package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/@testing-library_react.js +4 -4
package/template/node_modules/.vite-mcp/deps/@testing-library_react.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/_metadata.json +33 -33
package/template/node_modules/.vite-mcp/deps/{client-CU1wWud4.js → client-B_5CX--u.js} +7 -7
package/template/node_modules/.vite-mcp/deps/{client-CU1wWud4.js.map → client-B_5CX--u.js.map} +1 -1
package/template/node_modules/.vite-mcp/deps/embla-carousel-react.js +1 -1
package/template/node_modules/.vite-mcp/deps/embla-carousel-react.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/react-dom.js +3 -3
package/template/node_modules/.vite-mcp/deps/react-dom.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/react-dom_client.js +1 -1
package/template/node_modules/.vite-mcp/deps/react.js +3 -3
package/template/node_modules/.vite-mcp/deps/react.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/react_jsx-dev-runtime.js +2 -2
package/template/node_modules/.vite-mcp/deps/react_jsx-dev-runtime.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/react_jsx-runtime.js +2 -2
package/template/node_modules/.vite-mcp/deps/react_jsx-runtime.js.map +1 -1
package/template/node_modules/.vite-mcp/deps/vitest.js +1024 -622
package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
package/template/package.json +6 -6
package/template/tests/e2e/albums.spec.ts +24 -52
package/template/tests/e2e/carousel.spec.ts +36 -58
package/template/tests/e2e/map.spec.ts +35 -56
package/template/tests/e2e/review.spec.ts +56 -85
package/template/tests/e2e/visual.spec.ts +14 -12
package/template/tests/evals/albums.eval.ts +12 -9
package/template/tests/evals/carousel.eval.ts +4 -14
package/template/tests/evals/eval.config.ts +6 -6
package/template/tests/evals/map.eval.ts +2 -9
package/template/tests/evals/review.eval.ts +27 -22
package/dist/inspector-D5DckQuU.js.map +0 -1
package/dist/inspector-jY9O18z9.cjs.map +0 -1
package/template/tests/evals/{.env.example → _env.example} +0 -0

package/bin/commands/test-init.mjs CHANGED Viewed

@@ -1,12 +1,38 @@
 import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
 import { execSync } from 'child_process';
 import { join, dirname } from 'path';
+import { fileURLToPath } from 'url';
 import * as p from '@clack/prompts';
+import { EVAL_PROVIDERS, generateModelLines } from '../lib/eval/eval-providers.mjs';
+import { detectPackageManager } from '../utils.mjs';
+/** Read the current sunpeak package version for pinning in scaffolded configs. */
+function getSunpeakVersion() {
+  try {
+    const __dirname = dirname(fileURLToPath(import.meta.url));
+    const pkgPath = join(__dirname, '..', '..', 'package.json');
+    const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
+    return pkg.version ? `^${pkg.version}` : 'latest';
+  } catch {
+    return 'latest';
+  }
+}
 /**
  * Default dependencies (real implementations).
  * Override in tests via the `deps` parameter.
  */
+async function defaultSelectProviders() {
+  const selected = await p.multiselect({
+    message: 'AI providers for evals (space to toggle, enter to skip)',
+    options: EVAL_PROVIDERS.map((prov) => ({ value: prov, label: prov.label })),
+    initialValues: [],
+    required: false,
+  });
+  if (p.isCancel(selected)) return [];
+  return selected;
+}
 export const defaultDeps = {
   existsSync,
   readFileSync,
@@ -21,6 +47,9 @@ export const defaultDeps = {
   select: p.select,
   text: p.text,
   log: p.log,
+  password: p.password,
+  selectProviders: defaultSelectProviders,
+  detectPackageManager,
 };
 /**
@@ -33,7 +62,7 @@ export const defaultDeps = {
  *
  * Scaffolds all 5 test types:
  * 1. E2E tests — Playwright-based inspector tests (mcp fixture)
- * 2. Visual regression — Screenshot comparison via mcp.screenshot()
+ * 2. Visual regression — Screenshot comparison via result.screenshot()
  * 3. Live tests — Test against real ChatGPT/Claude hosts
  * 4. Evals — Multi-model tool calling reliability tests
  * 5. Unit tests — Direct tool handler tests (JS/TS projects only)
@@ -60,6 +89,61 @@ export async function testInit(args = [], deps = defaultDeps) {
     await initExternalProject(cliServer, d);
   }
+  // Offer to configure eval providers
+  const providers = await d.selectProviders();
+  if (!d.isCancel(providers) && providers.length > 0) {
+    const pm = d.detectPackageManager();
+    const pkgsToInstall = ['ai', ...providers.map((p) => p.pkg)];
+    const installCmd = `${pm} add -D ${pkgsToInstall.join(' ')}`;
+    try {
+      d.execSync(installCmd, { cwd: d.cwd(), stdio: 'inherit' });
+    } catch {
+      d.log.info(`Provider install failed. Install manually: ${installCmd}`);
+    }
+    // Uncomment selected models in eval.config.ts
+    const evalDir = d.existsSync(join(d.cwd(), 'tests', 'evals'))
+      ? join(d.cwd(), 'tests', 'evals')
+      : d.existsSync(join(d.cwd(), 'tests', 'sunpeak', 'evals'))
+        ? join(d.cwd(), 'tests', 'sunpeak', 'evals')
+        : null;
+    if (evalDir) {
+      const configPath = join(evalDir, 'eval.config.ts');
+      if (d.existsSync(configPath)) {
+        let config = d.readFileSync(configPath, 'utf-8');
+        for (const prov of providers) {
+          for (const model of prov.models) {
+            config = config.replace(
+              new RegExp(`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`, 'm'),
+              '$1$2'
+            );
+          }
+        }
+        d.writeFileSync(configPath, config);
+      }
+      // Prompt for API keys and write .env
+      const envLines = [];
+      const seen = new Set();
+      for (const prov of providers) {
+        if (seen.has(prov.envVar)) continue;
+        seen.add(prov.envVar);
+        const key = await d.password({
+          message: `${prov.envVar} (enter to skip)`,
+          mask: '*',
+        });
+        if (!d.isCancel(key) && key) {
+          envLines.push(`${prov.envVar}=${key}`);
+        }
+      }
+      if (envLines.length > 0 && evalDir) {
+        const relEnvPath = evalDir.startsWith(d.cwd()) ? evalDir.slice(d.cwd().length + 1) : evalDir;
+        d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
+        d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
+      }
+    }
+  }
   // Offer to install the testing skill
   const installSkill = await d.confirm({
     message: 'Install the test-mcp-server skill? (helps your coding agent write tests)',
@@ -67,12 +151,12 @@ export async function testInit(args = [], deps = defaultDeps) {
   });
   if (!d.isCancel(installSkill) && installSkill) {
     try {
-      d.execSync('npx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
+      d.execSync('pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server', {
         cwd: d.cwd(),
         stdio: 'inherit',
       });
     } catch {
-      d.log.info('Skill install skipped. Install later: npx skills add Sunpeak-AI/sunpeak@test-mcp-server');
+      d.log.info('Skill install skipped. Install later: pnpm dlx skills add Sunpeak-AI/sunpeak@test-mcp-server');
     }
   }
@@ -141,11 +225,32 @@ async function getServerConfig(cliServer, d) {
 function generateServerConfigBlock(server, relativeTo = '.') {
   if (server.type === 'later') {
-    return `  // TODO: Configure your MCP server connection
+    return `  // TODO: Configure your MCP server connection before running tests.
+  // Uncomment one of the options below:
+  //
+  // HTTP server (Python FastAPI, Go, etc.):
+  // server: { url: 'http://localhost:8000/mcp' },
+  //
+  // Python (uv):
+  // server: { command: 'uv', args: ['run', 'python', 'server.py'] },
+  //
+  // Python (venv):
+  // server: { command: '.venv/bin/python', args: ['server.py'] },
+  //
+  // Go:
+  // server: { command: 'go', args: ['run', './cmd/server'] },
+  //
+  // Node.js:
+  // server: { command: 'node', args: ['server.js'] },
+  //
+  // Optional server options:
   // server: {
-  //   command: 'python',
-  //   args: ['server.py'],
-  // },`;
+  //   command: 'python', args: ['server.py'],
+  //   env: { API_KEY: 'test-key' },  // Extra environment variables
+  //   cwd: './backend',               // Working directory
+  // },
+  //
+  // timeout: 120_000,  // Server startup timeout in ms (default: 60s)`;
   }
   if (server.type === 'url') {
     return `  server: {
@@ -205,15 +310,11 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
     "",
     "  models: [",
     "    // Uncomment models and install their provider packages:",
-    "    // 'gpt-4o',                      // OPENAI_API_KEY",
-    "    // 'gpt-4o-mini',                 // OPENAI_API_KEY",
-    "    // 'o4-mini',                     // OPENAI_API_KEY",
-    "    // 'claude-sonnet-4-20250514',    // ANTHROPIC_API_KEY",
-    "    // 'gemini-2.0-flash',            // GOOGLE_GENERATIVE_AI_API_KEY",
+    ...generateModelLines(),
     "  ],",
     "",
     "  defaults: {",
-    "    runs: 10,          // Number of times to run each case per model",
+    "    runs: 5,           // Number of times to run each case per model",
     "    maxSteps: 1,       // Max tool call steps per run",
     "    temperature: 0,    // 0 for most deterministic results",
     "    timeout: 30_000,   // Timeout per run in ms",
@@ -244,7 +345,7 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
  *
  * To get started:
  * 1. Configure models in eval.config.ts (uncomment the ones you want)
- * 2. Install the AI SDK and provider packages: pnpm add ai @ai-sdk/openai
+ * 2. Install the AI SDK and provider packages (e.g. pnpm add ai @ai-sdk/openai)
  * 3. Copy .env.example to .env and add your API keys
  * 4. Replace this file with evals for your own tools
  * 5. Run: sunpeak test --eval
@@ -302,31 +403,31 @@ function scaffoldVisualTest(filePath, d) {
  * Uncomment the tests below and replace 'your-tool' with your tool name.
  */
-// test('tool renders correctly in light mode', async ({ mcp }) => {
-//   const result = await mcp.callTool('your-tool', { key: 'value' }, { theme: 'light' });
+// test('tool renders correctly in light mode', async ({ inspector }) => {
+//   const result = await inspector.renderTool('your-tool', { key: 'value' }, { theme: 'light' });
 //   expect(result).not.toBeError();
 //
 //   // Wait for UI to render, then screenshot:
 //   // const app = result.app();
 //   // await expect(app.getByText('Expected text')).toBeVisible();
-//   // await mcp.screenshot('tool-light');
+//   // await result.screenshot('tool-light');
 // });
-// test('tool renders correctly in dark mode', async ({ mcp }) => {
-//   const result = await mcp.callTool('your-tool', { key: 'value' }, { theme: 'dark' });
+// test('tool renders correctly in dark mode', async ({ inspector }) => {
+//   const result = await inspector.renderTool('your-tool', { key: 'value' }, { theme: 'dark' });
 //   expect(result).not.toBeError();
 //
 //   // const app = result.app();
 //   // await expect(app.getByText('Expected text')).toBeVisible();
-//   // await mcp.screenshot('tool-dark');
+//   // await result.screenshot('tool-dark');
 // });
 // Full-page screenshot (captures the inspector chrome too):
-// test('full page renders correctly', async ({ mcp }) => {
-//   const result = await mcp.callTool('your-tool', {}, { theme: 'light' });
+// test('full page renders correctly', async ({ inspector }) => {
+//   const result = await inspector.renderTool('your-tool', {}, { theme: 'light' });
 //   const app = result.app();
 //   await expect(app.getByText('Expected text')).toBeVisible();
-//   await mcp.screenshot('tool-page', { target: 'page', maxDiffPixelRatio: 0.02 });
+//   await result.screenshot('tool-page', { target: 'page', maxDiffPixelRatio: 0.02 });
 // });
 `
   );
@@ -489,7 +590,8 @@ async function initExternalProject(cliServer, d) {
         private: true,
         type: 'module',
         devDependencies: {
-          sunpeak: 'latest',
+          '@types/node': 'latest',
+          sunpeak: getSunpeakVersion(),
           '@playwright/test': 'latest',
         },
         scripts: {
@@ -531,24 +633,28 @@ ${serverBlock}
     ) + '\n'
   );
-  // 1. E2E test — smoke test, verifies the server is reachable
+  // 1. E2E test — smoke test, verifies the server exposes tools
   d.writeFileSync(
     join(testDir, 'smoke.test.ts'),
     `import { test, expect } from 'sunpeak/test';
-test('server is reachable and inspector loads', async ({ mcp }) => {
-  // Verify the inspector page loads successfully
-  await expect(mcp.page.locator('#root')).not.toBeEmpty();
+test('server exposes tools', async ({ mcp }) => {
+  const tools = await mcp.listTools();
+  expect(tools.length).toBeGreaterThan(0);
 });
-// Uncomment and customize for your tools:
-// test('my tool renders correctly', async ({ mcp }) => {
+// Protocol-level test (no UI rendering):
+// test('my tool returns data', async ({ mcp }) => {
 //   const result = await mcp.callTool('your-tool', { key: 'value' });
+//   expect(result.isError).toBeFalsy();
+// });
+// UI rendering test:
+// test('my tool renders correctly', async ({ inspector }) => {
+//   const result = await inspector.renderTool('your-tool', { key: 'value' });
 //   expect(result).not.toBeError();
-//
-//   // If your tool has a UI:
-//   // const app = result.app();
-//   // await expect(app.getByText('Hello')).toBeVisible();
+//   const app = result.app();
+//   await expect(app.getByText('Hello')).toBeVisible();
 // });
 `
   );
@@ -563,15 +669,31 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
   scaffoldEvals(join(testDir, 'evals'), { server, d });
   d.log.success('Created tests/sunpeak/ with all test types.');
-  d.log.step('Next steps:');
-  d.log.message('  cd tests/sunpeak');
-  d.log.message('  npm install');
-  d.log.message('  npx playwright install chromium');
-  d.log.message('');
-  d.log.message('  npx sunpeak test              # E2E tests');
-  d.log.message('  npx sunpeak test --visual      # Visual regression (generates baselines on first run)');
-  d.log.message('  npx sunpeak test --live         # Live tests against real hosts (requires login)');
-  d.log.message('  npx sunpeak test --eval         # Multi-model evals (configure models in evals/eval.config.ts)');
+  if (server.type === 'later') {
+    d.log.warn('Server not configured. Edit tests/sunpeak/playwright.config.ts before running tests.');
+  }
+  // Auto-install dependencies so users can run tests immediately
+  const pm = d.detectPackageManager();
+  d.log.step('Installing dependencies...');
+  try {
+    d.execSync(`${pm} install`, { cwd: testDir, stdio: 'inherit' });
+  } catch {
+    d.log.warn(`Dependency install failed. Run manually: cd tests/sunpeak && ${pm} install`);
+  }
+  d.log.step('Installing Playwright browser...');
+  try {
+    d.execSync(`${pm} exec playwright install chromium`, { cwd: testDir, stdio: 'inherit' });
+  } catch {
+    d.log.warn(`Browser install failed. Run manually: cd tests/sunpeak && ${pm} exec playwright install chromium`);
+  }
+  d.log.step('Ready! Run tests with:');
+  d.log.message('  sunpeak test              # E2E tests');
+  d.log.message('  sunpeak test --visual      # Visual regression (generates baselines on first run)');
+  d.log.message('  sunpeak test --live         # Live tests against real hosts (requires login)');
+  d.log.message('  sunpeak test --eval         # Multi-model evals (configure models in evals/eval.config.ts)');
 }
 async function initJsProject(cliServer, d) {
@@ -608,18 +730,23 @@ ${serverBlock}
       testPath,
       `import { test, expect } from 'sunpeak/test';
-test('server is reachable and inspector loads', async ({ mcp }) => {
-  await expect(mcp.page.locator('#root')).not.toBeEmpty();
+test('server exposes tools', async ({ mcp }) => {
+  const tools = await mcp.listTools();
+  expect(tools.length).toBeGreaterThan(0);
 });
-// Uncomment and customize for your tools:
-// test('my tool renders correctly', async ({ mcp }) => {
+// Protocol-level test (no UI rendering):
+// test('my tool returns data', async ({ mcp }) => {
 //   const result = await mcp.callTool('your-tool', { key: 'value' });
+//   expect(result.isError).toBeFalsy();
+// });
+// UI rendering test:
+// test('my tool renders correctly', async ({ inspector }) => {
+//   const result = await inspector.renderTool('your-tool', { key: 'value' });
 //   expect(result).not.toBeError();
-//
-//   // If your tool has a UI:
-//   // const app = result.app();
-//   // await expect(app.getByText('Hello')).toBeVisible();
+//   const app = result.app();
+//   await expect(app.getByText('Hello')).toBeVisible();
 // });
 `
     );
@@ -638,15 +765,19 @@ test('server is reachable and inspector loads', async ({ mcp }) => {
   // 5. Unit test
   scaffoldUnitTest(join(cwd, 'tests', 'unit', 'example.test.ts'), d);
+  if (server.type === 'later') {
+    d.log.warn('Server not configured. Edit playwright.config.ts before running tests.');
+  }
+  const pkgMgr = d.detectPackageManager();
   d.log.step('Next steps:');
-  d.log.message('  npm install -D sunpeak @playwright/test vitest');
-  d.log.message('  npx playwright install chromium');
+  d.log.message(`  ${pkgMgr} add -D sunpeak @playwright/test vitest`);
+  d.log.message(`  ${pkgMgr} exec playwright install chromium`);
   d.log.message('');
-  d.log.message('  npx sunpeak test              # E2E tests');
-  d.log.message('  npx sunpeak test --unit        # Unit tests (vitest)');
-  d.log.message('  npx sunpeak test --visual      # Visual regression');
-  d.log.message('  npx sunpeak test --live         # Live tests against real hosts');
-  d.log.message('  npx sunpeak test --eval         # Multi-model evals');
+  d.log.message('  sunpeak test              # E2E tests');
+  d.log.message('  sunpeak test --unit        # Unit tests (vitest)');
+  d.log.message('  sunpeak test --visual      # Visual regression');
+  d.log.message('  sunpeak test --live         # Live tests against real hosts');
+  d.log.message('  sunpeak test --eval         # Multi-model evals');
 }
 async function initSunpeakProject(d) {
@@ -702,6 +833,6 @@ export default defineConfig();
   d.log.message('  Replace: import { test, expect } from "@playwright/test"');
   d.log.message('  With:    import { test, expect } from "sunpeak/test"');
   d.log.message('');
-  d.log.message('  Use the `mcp` fixture instead of raw page navigation.');
+  d.log.message('  Use the `mcp` and `inspector` fixtures instead of raw page navigation.');
   d.log.message('  See sunpeak docs for migration examples.');
 }

package/bin/commands/test.mjs CHANGED Viewed

@@ -70,6 +70,9 @@ export async function runTest(args) {
         'playwright.config.js',
         'sunpeak.config.ts',
         'sunpeak.config.js',
+        // Fallback for non-JS projects: tests/sunpeak/ self-contained directory
+        'tests/sunpeak/playwright.config.ts',
+        'tests/sunpeak/playwright.config.js',
       ],
       visual: isVisual,
       updateSnapshots: isVisual && isUpdate,
@@ -82,6 +85,9 @@ export async function runTest(args) {
       configCandidates: [
         'tests/live/playwright.config.ts',
         'tests/live/playwright.config.js',
+        // Fallback for non-JS projects: tests/sunpeak/ self-contained directory
+        'tests/sunpeak/live/playwright.config.ts',
+        'tests/sunpeak/live/playwright.config.js',
       ],
       configRequired: true,
       configErrorMessage: 'No live test config found at tests/live/playwright.config.ts',
@@ -310,14 +316,43 @@ function validateApiKeys(models) {
   return warnings;
 }
+/**
+ * Check that required AI SDK provider packages are installed for the configured models.
+ * @param {string[]} models
+ * @returns {Promise<Array<{ pkg: string, reason: string }>>}
+ */
+async function checkProviderPackages(models) {
+  const { createRequire } = await import('module');
+  // Resolve from the project's node_modules, not the global CLI install
+  const require = createRequire(join(process.cwd(), 'package.json'));
+  const missing = [];
+  const checked = new Set();
+  try {
+    require.resolve('ai');
+  } catch {
+    missing.push({ pkg: 'ai', reason: 'core AI SDK' });
+  }
+  for (const modelId of models) {
+    const pkg = getProviderForModel(modelId);
+    if (!pkg || checked.has(pkg)) continue;
+    checked.add(pkg);
+    try {
+      require.resolve(pkg);
+    } catch {
+      missing.push({ pkg, reason: modelId });
+    }
+  }
+  return missing;
+}
 /**
  * Run eval tests via vitest with the eval plugin.
  */
 async function runEvals(args) {
-  const { dirname, resolve, basename } = await import('path');
-  const { fileURLToPath } = await import('url');
-  const __dirname = dirname(fileURLToPath(import.meta.url));
+  const { resolve, basename } = await import('path');
   const evalDir = findEvalDir();
   if (!evalDir) {
@@ -365,8 +400,19 @@ async function runEvals(args) {
     }
   }
-  // Validate API keys for configured models
+  // Check for missing provider packages and API keys
   if (configModels && configModels.length > 0) {
+    const missingPkgs = await checkProviderPackages(configModels);
+    if (missingPkgs.length > 0) {
+      console.error('\nMissing required packages for eval models:\n');
+      for (const { pkg, reason } of missingPkgs) {
+        console.error(`  ${pkg}  (needed for ${reason})`);
+      }
+      const installCmd = missingPkgs.map((m) => m.pkg).join(' ');
+      console.error(`\nInstall with:\n\n  pnpm add -D ${installCmd}\n`);
+      return 1;
+    }
     const warnings = validateApiKeys(configModels);
     if (warnings.length > 0) {
       console.log('');
@@ -416,8 +462,10 @@ async function runEvals(args) {
   // Use .ts extension so vitest handles TypeScript imports natively
   const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');
-  const evalPluginPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-vitest-plugin.mjs');
-  const evalReporterPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-reporter.mjs');
+  // Use package exports so vitest resolves from the project's node_modules,
+  // not the global CLI install. This ensures import('ai') finds project-local deps.
+  const evalPluginImport = 'sunpeak/eval/plugin';
+  const evalReporterImport = 'sunpeak/eval/reporter';
   // Clean up dev server and temp config
   const cleanupResources = () => {
@@ -442,17 +490,19 @@ async function runEvals(args) {
     ? './' + basename(configFile)
     : null;
-  // Generate a vitest config that loads the eval config and plugin
+  // Generate a vitest config that loads the eval config and plugin.
+  // Imports use package names (sunpeak/eval/plugin) so vitest resolves from
+  // the project's node_modules, ensuring import('ai') finds project-local deps.
   const vitestConfig = `
 import { defineConfig } from 'vitest/config';
-import { evalVitestPlugin } from ${JSON.stringify(evalPluginPath)};
+import { evalVitestPlugin } from '${evalPluginImport}';
 ${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}
 export default defineConfig({
   test: {
     globals: true,
     include: ['**/*.eval.ts', '**/*.eval.js'],
-    reporters: ['default', ${JSON.stringify(evalReporterPath)}],
+    reporters: ['default', '${evalReporterImport}'],
     testTimeout: 600000,
   },
   plugins: [

package/bin/lib/eval/eval-providers.mjs ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * Shared eval provider definitions used by both `sunpeak new` and `sunpeak test init`.
+ * Single source of truth for provider packages, model IDs, and CLI labels.
+ */
+export const EVAL_PROVIDERS = [
+  { pkg: '@ai-sdk/openai', models: ['gpt-4o', 'gpt-4o-mini'], label: 'OpenAI       (gpt-4o, gpt-4o-mini)', envVar: 'OPENAI_API_KEY' },
+  { pkg: '@ai-sdk/anthropic', models: ['claude-sonnet-4-20250514'], label: 'Anthropic    (claude-sonnet-4)', envVar: 'ANTHROPIC_API_KEY' },
+  { pkg: '@ai-sdk/google', models: ['gemini-2.0-flash'], label: 'Google       (gemini-2.0-flash)', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
+];
+/**
+ * All model lines that appear in eval.config.ts, in order.
+ * Used by scaffoldEvals to generate the config and by the uncomment logic.
+ */
+export const EVAL_CONFIG_MODELS = [
+  { id: 'gpt-4o', envVar: 'OPENAI_API_KEY' },
+  { id: 'gpt-4o-mini', envVar: 'OPENAI_API_KEY' },
+  { id: 'o4-mini', envVar: 'OPENAI_API_KEY' },
+  { id: 'claude-sonnet-4-20250514', envVar: 'ANTHROPIC_API_KEY' },
+  { id: 'gemini-2.0-flash', envVar: 'GOOGLE_GENERATIVE_AI_API_KEY' },
+];
+/**
+ * Generate the models section lines for eval.config.ts.
+ * @returns {string[]}
+ */
+export function generateModelLines() {
+  const maxIdLen = Math.max(...EVAL_CONFIG_MODELS.map((m) => m.id.length));
+  return EVAL_CONFIG_MODELS.map((m) => {
+    const padded = `'${m.id}',`.padEnd(maxIdLen + 3);
+    return `    // ${padded} // ${m.envVar}`;
+  });
+}

package/bin/lib/eval/eval-runner.mjs CHANGED Viewed

@@ -8,6 +8,43 @@ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
 // Re-export for use in generated test code
 export { checkAiSdkInstalled };
+/**
+ * Strip AI SDK retry wrapper from error messages for cleaner output.
+ * "Failed after 3 attempts. Last error: <actual error>" → "<actual error>"
+ * @param {string} message
+ * @returns {string}
+ */
+export function cleanErrorMessage(message) {
+  return message.replace(/^Failed after \d+ attempts?\. Last error: /i, '');
+}
+/**
+ * Check if an error message indicates a fatal API error that won't resolve on retry.
+ * @param {string} message
+ * @returns {boolean}
+ */
+export function isFatalApiError(message) {
+  const lower = message.toLowerCase();
+  const patterns = [
+    'exceeded your current quota',
+    'credit balance is too low',
+    'insufficient_quota',
+    'billing_hard_limit_reached',
+    'check your plan and billing details',
+    'add a payment method',
+    'invalid api key',
+    'invalid_api_key',
+    'incorrect api key',
+    'unauthorized',
+    'permission denied',
+    'access denied',
+    'authentication failed',
+    'account deactivated',
+    'account suspended',
+  ];
+  return patterns.some((p) => lower.includes(p));
+}
 /**
  * Define an eval spec. Identity function for type safety.
  * @param {import('./eval-types.d.mts').EvalSpec} spec
@@ -112,6 +149,7 @@ export async function runSingleEval({ prompt, model, tools, maxSteps, temperatur
     prompt,
     maxSteps,
     temperature,
+    maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
     abortSignal: AbortSignal.timeout(timeout),
   });
@@ -268,10 +306,15 @@ export async function runEvalCaseAggregate({
   const model = await resolveModel(modelId);
   let passed = 0;
   let failed = 0;
+  let executedRuns = 0;
   let totalDurationMs = 0;
   const failureMap = new Map();
   for (let i = 0; i < runs; i++) {
+    // Small delay between runs to avoid rate limits (skip before first run)
+    if (i > 0) await new Promise((r) => setTimeout(r, 1000));
+    executedRuns++;
     const start = performance.now();
     try {
       const result = await runSingleEval({
@@ -286,8 +329,17 @@ export async function runEvalCaseAggregate({
       passed++;
     } catch (err) {
       failed++;
-      const msg = err.message || String(err);
+      const msg = cleanErrorMessage(err.message || String(err));
       failureMap.set(msg, (failureMap.get(msg) || 0) + 1);
+      if (isFatalApiError(msg)) {
+        // Count remaining runs as failed and stop early
+        const remaining = runs - i - 1;
+        failed += remaining;
+        failureMap.set(msg, (failureMap.get(msg) || 0) + remaining);
+        totalDurationMs += performance.now() - start;
+        break;
+      }
     }
     totalDurationMs += performance.now() - start;
   }
@@ -304,7 +356,7 @@ export async function runEvalCaseAggregate({
     passed,
     failed,
     passRate: runs > 0 ? passed / runs : 0,
-    avgDurationMs: runs > 0 ? totalDurationMs / runs : 0,
+    avgDurationMs: executedRuns > 0 ? totalDurationMs / executedRuns : 0,
     failures,
   };
 }