sunpeak 0.19.2 → 0.19.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +6 -4
  2. package/bin/commands/dev.mjs +1 -1
  3. package/bin/commands/inspect.mjs +1 -1
  4. package/bin/commands/new.mjs +9 -5
  5. package/bin/commands/start.mjs +3 -1
  6. package/bin/commands/test-init.mjs +478 -76
  7. package/bin/commands/test.mjs +357 -4
  8. package/bin/lib/eval/eval-reporter.mjs +105 -0
  9. package/bin/lib/eval/eval-runner.mjs +310 -0
  10. package/bin/lib/eval/eval-types.d.mts +168 -0
  11. package/bin/lib/eval/eval-vitest-plugin.mjs +158 -0
  12. package/bin/lib/eval/model-registry.mjs +73 -0
  13. package/bin/lib/sandbox-server.mjs +5 -2
  14. package/bin/sunpeak.js +1 -0
  15. package/dist/chatgpt/index.cjs +1 -1
  16. package/dist/chatgpt/index.js +1 -1
  17. package/dist/claude/index.cjs +1 -1
  18. package/dist/claude/index.js +1 -1
  19. package/dist/host/chatgpt/index.cjs +1 -1
  20. package/dist/host/chatgpt/index.js +1 -1
  21. package/dist/index.cjs +134 -124
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.d.ts +3 -1
  24. package/dist/index.js +71 -62
  25. package/dist/index.js.map +1 -1
  26. package/dist/inspector/index.cjs +1 -1
  27. package/dist/inspector/index.js +1 -1
  28. package/dist/{inspector-Cdo5BK2D.js → inspector-D5DckQuU.js} +236 -98
  29. package/dist/inspector-D5DckQuU.js.map +1 -0
  30. package/dist/{inspector-8nPV2A-z.cjs → inspector-jY9O18z9.cjs} +237 -99
  31. package/dist/inspector-jY9O18z9.cjs.map +1 -0
  32. package/dist/mcp/index.cjs +237 -140
  33. package/dist/mcp/index.cjs.map +1 -1
  34. package/dist/mcp/index.d.ts +1 -1
  35. package/dist/mcp/index.js +230 -134
  36. package/dist/mcp/index.js.map +1 -1
  37. package/dist/mcp/production-server.d.ts +31 -0
  38. package/dist/{protocol-C7kTcBr_.cjs → protocol-C8pFDmcy.cjs} +8194 -8187
  39. package/dist/protocol-C8pFDmcy.cjs.map +1 -0
  40. package/dist/{protocol-BfAACnv0.js → protocol-CRqiPTLT.js} +8186 -8185
  41. package/dist/protocol-CRqiPTLT.js.map +1 -0
  42. package/dist/{use-app-CfP9VypY.js → use-app-Bfargfa3.js} +194 -94
  43. package/dist/use-app-Bfargfa3.js.map +1 -0
  44. package/dist/{use-app-CzcYw1Kz.cjs → use-app-CbsBEmwv.cjs} +254 -148
  45. package/dist/use-app-CbsBEmwv.cjs.map +1 -0
  46. package/package.json +27 -3
  47. package/template/README.md +17 -7
  48. package/template/_gitignore +2 -0
  49. package/template/dist/albums/albums.html +15 -15
  50. package/template/dist/albums/albums.json +1 -1
  51. package/template/dist/carousel/carousel.html +19 -19
  52. package/template/dist/carousel/carousel.json +1 -1
  53. package/template/dist/map/map.html +14 -14
  54. package/template/dist/map/map.json +1 -1
  55. package/template/dist/review/review.html +11 -11
  56. package/template/dist/review/review.json +1 -1
  57. package/template/node_modules/.bin/vitest +2 -2
  58. package/template/node_modules/.vite/deps/_metadata.json +3 -3
  59. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js +192 -91
  60. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js.map +1 -1
  61. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js +231 -92
  62. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js.map +1 -1
  63. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js +208 -105
  64. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js.map +1 -1
  65. package/template/node_modules/.vite-mcp/deps/_metadata.json +25 -25
  66. package/template/node_modules/.vite-mcp/deps/{protocol-B_qKkui_.js → protocol-BqGB4zBx.js} +45 -45
  67. package/template/node_modules/.vite-mcp/deps/protocol-BqGB4zBx.js.map +1 -0
  68. package/template/node_modules/.vite-mcp/deps/vitest.js +7 -7
  69. package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
  70. package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-chatgpt-darwin.png +0 -0
  71. package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-claude-darwin.png +0 -0
  72. package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-darwin.png +0 -0
  73. package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-darwin.png +0 -0
  74. package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-chatgpt-darwin.png +0 -0
  75. package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-claude-darwin.png +0 -0
  76. package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-chatgpt-darwin.png +0 -0
  77. package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-claude-darwin.png +0 -0
  78. package/template/tests/evals/.env.example +5 -0
  79. package/template/tests/evals/albums.eval.ts +28 -0
  80. package/template/tests/evals/carousel.eval.ts +26 -0
  81. package/template/tests/evals/eval.config.ts +26 -0
  82. package/template/tests/evals/map.eval.ts +23 -0
  83. package/template/tests/evals/review.eval.ts +48 -0
  84. package/dist/inspector-8nPV2A-z.cjs.map +0 -1
  85. package/dist/inspector-Cdo5BK2D.js.map +0 -1
  86. package/dist/protocol-BfAACnv0.js.map +0 -1
  87. package/dist/protocol-C7kTcBr_.cjs.map +0 -1
  88. package/dist/use-app-CfP9VypY.js.map +0 -1
  89. package/dist/use-app-CzcYw1Kz.cjs.map +0 -1
  90. package/template/node_modules/.vite-mcp/deps/protocol-B_qKkui_.js.map +0 -1
@@ -1,5 +1,5 @@
1
1
  import { spawn } from 'child_process';
2
- import { existsSync } from 'fs';
2
+ import { existsSync, readFileSync, writeFileSync, unlinkSync } from 'fs';
3
3
  import { join } from 'path';
4
4
 
5
5
  /**
@@ -10,13 +10,15 @@ import { join } from 'path';
10
10
  * sunpeak test --unit Run unit tests (vitest)
11
11
  * sunpeak test --e2e Run e2e tests (Playwright)
12
12
  * sunpeak test --live Run live tests against real hosts
13
+ * sunpeak test --eval Run evals against LLM models
13
14
  * sunpeak test --visual Run e2e tests with visual regression comparison
14
15
  * sunpeak test --visual --update Update visual regression baselines
15
16
  * sunpeak test [pattern] Pass through to the relevant runner
16
17
  *
17
- * Flags are additive: --unit --e2e --live runs all three.
18
+ * Flags are additive: --unit --e2e --live --eval runs all four.
18
19
  * --visual implies --e2e and enables screenshot comparison.
19
20
  * --update implies --visual.
21
+ * --eval and --live are never included in the default run (they cost money).
20
22
  */
21
23
  export async function runTest(args) {
22
24
  // Handle `sunpeak test init` subcommand
@@ -29,16 +31,17 @@ export async function runTest(args) {
29
31
  const isUnit = args.includes('--unit');
30
32
  const isE2e = args.includes('--e2e');
31
33
  const isLive = args.includes('--live');
34
+ const isEval = args.includes('--eval');
32
35
  let isVisual = args.includes('--visual');
33
36
  const isUpdate = args.includes('--update');
34
37
  const filteredArgs = args.filter(
35
- (a) => !['--unit', '--e2e', '--live', '--visual', '--update'].includes(a)
38
+ (a) => !['--unit', '--e2e', '--live', '--eval', '--visual', '--update'].includes(a)
36
39
  );
37
40
 
38
41
  // --update implies --visual (no point updating without enabling visual)
39
42
  if (isUpdate) isVisual = true;
40
43
 
41
- const hasAnyScope = isUnit || isE2e || isLive || isVisual;
44
+ const hasAnyScope = isUnit || isE2e || isLive || isEval || isVisual;
42
45
 
43
46
  // When extra args are present (file patterns, etc.) and no scope flags given,
44
47
  // default to e2e only — passing Playwright file patterns to vitest would fail.
@@ -47,9 +50,11 @@ export async function runTest(args) {
47
50
  // Determine which suites to run.
48
51
  // No scope flags → unit + e2e (unless extra args narrow to e2e).
49
52
  // --visual implies e2e.
53
+ // --eval and --live are never in the default run (they cost money).
50
54
  const runUnit = hasAnyScope ? isUnit : !hasExtraArgs;
51
55
  const runE2e = hasAnyScope ? (isE2e || isVisual) : true;
52
56
  const runLive = isLive;
57
+ const runEval = isEval;
53
58
 
54
59
  const results = [];
55
60
 
@@ -84,6 +89,11 @@ export async function runTest(args) {
84
89
  results.push({ suite: 'live', code });
85
90
  }
86
91
 
92
+ if (runEval) {
93
+ const code = await runEvals(filteredArgs);
94
+ results.push({ suite: 'eval', code });
95
+ }
96
+
87
97
  // Exit with the first non-zero code, or 0 if all passed
88
98
  const failed = results.find((r) => r.code !== 0);
89
99
  process.exit(failed ? failed.code : 0);
@@ -142,3 +152,346 @@ function findConfig(candidates) {
142
152
  }
143
153
  return null;
144
154
  }
155
+
/**
 * Report whether the current working directory looks like a sunpeak app
 * project, i.e. one that has tools the dev server can serve.
 *
 * Two conditions must both hold:
 *   1. `package.json` lists `sunpeak` under `dependencies` or `devDependencies`.
 *   2. A `src/tools` directory exists (the sunpeak app convention).
 *
 * Depending on sunpeak alone is not sufficient, because the testing framework
 * can be used against any MCP server.
 *
 * @returns {boolean} `true` only when both conditions hold; `false` when the
 *   manifest is missing, unreadable, or not valid JSON.
 */
function isSunpeakProject() {
  const projectRoot = process.cwd();
  const manifestPath = join(projectRoot, 'package.json');

  if (!existsSync(manifestPath)) {
    return false;
  }

  try {
    const manifest = JSON.parse(readFileSync(manifestPath, 'utf-8'));
    const declaresSunpeak =
      'sunpeak' in { ...manifest.dependencies, ...manifest.devDependencies };

    // The src/tools directory is what distinguishes an app from a plain consumer.
    return declaresSunpeak && existsSync(join(projectRoot, 'src', 'tools'));
  } catch {
    // Unreadable or malformed manifest: treat as "not a sunpeak project".
    return false;
  }
}
174
+
/**
 * Start the dev server and wait for it to be ready.
 *
 * If something already answers `GET /health` with an OK status on `mcpPort`,
 * nothing is spawned and `null` is returned. Otherwise `pnpm exec sunpeak dev`
 * is launched with `PORT` set to `inspectorPort` and `SUNPEAK_MCP_PORT` set to
 * `mcpPort`, and both health endpoints are polled every 500 ms for up to 60 s.
 *
 * @param {{ inspectorPort: number, mcpPort: number }} ports
 * @returns {Promise<{ process: import('child_process').ChildProcess } | null>}
 *   The spawned child, or `null` when an existing server was detected.
 * @throws {Error} When the child cannot be launched, exits before becoming
 *   ready, or does not become ready within the timeout.
 */
async function startDevServer({ inspectorPort, mcpPort }) {
  // Resolves to the response, or null when the endpoint cannot be reached in 2 s.
  const probe = async (port) => {
    try {
      return await fetch(`http://localhost:${port}/health`, {
        signal: AbortSignal.timeout(2000),
      });
    } catch {
      return null;
    }
  };

  // Check if the MCP server is already running
  const existing = await probe(mcpPort);
  if (existing?.ok) return null; // Server already running

  console.log('Starting dev server for evals...');
  const child = spawn(
    'pnpm',
    ['exec', 'sunpeak', 'dev', '--', '--prod-tools'],
    {
      stdio: ['ignore', 'pipe', 'pipe'],
      env: {
        ...process.env,
        PORT: String(inspectorPort),
        SUNPEAK_MCP_PORT: String(mcpPort),
        SUNPEAK_DEV_OVERLAY: 'false',
      },
    }
  );

  // Detect early exit (build errors, port conflicts, etc.)
  let exited = false;
  let spawnError = null;
  const stderrChunks = [];
  child.stderr?.on('data', (chunk) => stderrChunks.push(chunk.toString()));
  // stdout is piped but unused: keep it flowing so the child never blocks on a
  // full pipe buffer.
  child.stdout?.resume();
  // Without an 'error' listener a failed spawn (e.g. pnpm not installed) is an
  // unhandled event that crashes the process instead of reaching the caller.
  child.on('error', (err) => {
    spawnError = err;
    exited = true;
  });
  child.on('exit', () => {
    exited = true;
  });

  const failIfExited = () => {
    if (!exited) return;
    if (spawnError) {
      throw new Error(`Failed to launch dev server: ${spawnError.message}`, {
        cause: spawnError,
      });
    }
    throw new Error(`Dev server exited before becoming ready.\n${stderrChunks.join('')}`);
  };

  // Wait for the inspector health endpoint
  const timeout = 60_000;
  const start = Date.now();
  while (Date.now() - start < timeout) {
    failIfExited();

    const inspector = await probe(inspectorPort);
    // The inspector must report OK; the MCP server only has to be reachable.
    if (inspector?.ok && (await probe(mcpPort)) !== null) {
      console.log('Dev server ready');
      return { process: child };
    }

    // Surface a crash right away rather than after the next sleep.
    failIfExited();
    await new Promise((r) => setTimeout(r, 500));
  }

  child.kill();
  throw new Error(`Dev server did not start within ${timeout / 1000}s`);
}
249
+
/**
 * Load the `.env` file from a directory into `process.env`.
 *
 * A variable is only written when it is currently unset or empty, so values
 * exported in the shell take precedence over the file. Entries with an empty
 * value are skipped. Does nothing when the file does not exist.
 *
 * Supported line syntax:
 *   - `KEY=value` and `export KEY=value`
 *   - values wrapped in matching single or double quotes (quotes are removed)
 *   - blank lines and lines starting with `#` are ignored
 *   - a trailing ` # comment` after a quoted or unquoted value is dropped;
 *     a `#` inside quotes is kept
 *
 * @param {string} dir - Directory containing .env file
 * @returns {void}
 */
function loadEnvFile(dir) {
  const envPath = join(dir, '.env');
  if (!existsSync(envPath)) return;

  const content = readFileSync(envPath, 'utf-8');
  for (const line of content.split('\n')) {
    // trim() also removes the '\r' left behind by CRLF line endings.
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;
    const eqIdx = trimmed.indexOf('=');
    if (eqIdx === -1) continue;

    // Accept shell-style `export KEY=value` lines.
    const key = trimmed.slice(0, eqIdx).trim().replace(/^export\s+/, '');
    const value = parseEnvValue(trimmed.slice(eqIdx + 1).trim());
    if (key && value && !process.env[key]) {
      process.env[key] = value;
    }
  }
}

/**
 * Extract the value from the right-hand side of a `.env` assignment:
 * strip surrounding quotes and any trailing ` # comment`.
 * @param {string} raw - Trimmed text after the first `=`
 * @returns {string}
 */
function parseEnvValue(raw) {
  // "value" # comment  -> the comment starts after the closing quote.
  const quotedWithComment = /^(['"])(.*?)\1\s+#.*$/.exec(raw);
  if (quotedWithComment) return quotedWithComment[2];

  // Strip surrounding quotes
  const quoted = /^(['"])(.*)\1$/.exec(raw);
  if (quoted) return quoted[2];

  // Unquoted: a '#' preceded by whitespace starts a comment.
  return raw.replace(/\s+#.*$/, '');
}
273
+
/**
 * Lookup table: AI SDK provider package name -> the environment variable
 * that must hold its API key.
 */
const PROVIDER_ENV_VARS = {
  '@ai-sdk/openai': 'OPENAI_API_KEY',
  '@ai-sdk/anthropic': 'ANTHROPIC_API_KEY',
  '@ai-sdk/google': 'GOOGLE_GENERATIVE_AI_API_KEY',
};
282
+
/**
 * Resolve a model ID to the provider package that serves it, based on the
 * ID's prefix.
 *
 * @param {string} modelId - Model identifier, e.g. `gpt-4o` or `claude-...`
 * @returns {string | null} Provider package name, or `null` when the ID
 *   matches none of the known prefixes.
 */
function getProviderForModel(modelId) {
  // Checked in order; the first matching pattern wins.
  const providerPatterns = [
    ['@ai-sdk/openai', /^(gpt-|o[134]-|o[134]$|chatgpt-)/],
    ['@ai-sdk/anthropic', /^claude-/],
    ['@ai-sdk/google', /^(gemini-|models\/gemini-)/],
  ];

  for (const [provider, pattern] of providerPatterns) {
    if (pattern.test(modelId)) {
      return provider;
    }
  }
  return null;
}
292
+
/**
 * Check that the API key environment variable for each configured model's
 * provider is present.
 *
 * Each provider is inspected once, so a missing key yields a single warning
 * that names the first model needing it. Models with no recognized provider
 * are skipped.
 *
 * @param {string[]} models - Configured model IDs
 * @returns {string[]} Array of warning messages (empty if all good)
 */
function validateApiKeys(models) {
  const seenProviders = new Set();
  const warnings = [];

  for (const modelId of models) {
    const provider = getProviderForModel(modelId);
    const alreadyHandled = !provider || seenProviders.has(provider);
    if (alreadyHandled) continue;
    seenProviders.add(provider);

    const requiredVar = PROVIDER_ENV_VARS[provider];
    const keyMissing = requiredVar && !process.env[requiredVar];
    if (keyMissing) {
      warnings.push(`${requiredVar} not set (required for ${modelId}). Add it to tests/evals/.env or export it in your shell.`);
    }
  }

  return warnings;
}
312
+
/**
 * Run eval tests via vitest with the eval plugin.
 *
 * Steps:
 *   1. Locate the eval directory and load its `.env` into `process.env`.
 *   2. Read `eval.config.ts` / `eval.config.js` as text to find configured
 *      models. This is a heuristic: the config is not evaluated. If the
 *      `models` array is empty, print setup instructions and return 0.
 *   3. Warn about missing API keys for the configured models.
 *   4. Unless `SUNPEAK_EVAL_SERVER` is set: auto-start the dev server for
 *      sunpeak projects, or check that a server answers on port 8000 otherwise.
 *   5. Write a temporary vitest config next to the evals, run vitest, and
 *      always clean up the temp file and any dev server that was started.
 *
 * @param {string[]} args - Extra CLI arguments forwarded to vitest
 * @returns {Promise<number>} Exit code: 1 on setup failure, 0 when no models
 *   are configured, otherwise whatever `runChild` resolves to for vitest.
 */
async function runEvals(args) {
  const { dirname, resolve, basename } = await import('path');
  const { fileURLToPath } = await import('url');

  const __dirname = dirname(fileURLToPath(import.meta.url));
  const evalDir = findEvalDir();

  if (!evalDir) {
    console.error('No eval directory found. Run "sunpeak test init" to scaffold eval tests.');
    return 1;
  }

  const absEvalDir = resolve(process.cwd(), evalDir);

  // Load .env from eval directory before anything else
  loadEnvFile(absEvalDir);

  // Find eval config and load it to check models before launching vitest
  const configFile = findConfig([
    join(evalDir, 'eval.config.ts'),
    join(evalDir, 'eval.config.js'),
  ]);

  // Quick check: load the config to see if models are configured.
  // We do this by reading the file and checking for non-empty models array.
  // For a proper check, we'd need to evaluate the TS, but a quick heuristic
  // is to warn if we can detect an empty array.
  let configModels = null;
  if (configFile) {
    const configContent = readFileSync(configFile, 'utf-8');
    // Check if models array appears to be empty (all lines commented out)
    const modelsMatch = configContent.match(/models:\s*\[([\s\S]*?)\]/);
    if (modelsMatch) {
      const modelsBody = modelsMatch[1].trim();
      // Remove comments and whitespace to check if anything is actually configured
      const uncommented = modelsBody.replace(/\/\/.*$/gm, '').replace(/\/\*[\s\S]*?\*\//g, '').trim();
      if (!uncommented || uncommented === ',') {
        console.log(`\nNo models configured in ${configFile}.`);
        console.log('To run evals, uncomment at least one model in your eval config:\n');
        console.log(' models: [');
        console.log(" 'gpt-4o', // requires OPENAI_API_KEY");
        console.log(" 'claude-sonnet-4-20250514', // requires ANTHROPIC_API_KEY");
        console.log(" 'gemini-2.0-flash', // requires GOOGLE_GENERATIVE_AI_API_KEY");
        console.log(' ],\n');
        return 0;
      }
      // Extract actual model strings for API key validation
      const modelStrings = [...uncommented.matchAll(/['"]([^'"]+)['"]/g)].map(m => m[1]);
      configModels = modelStrings;
    }
  }

  // Validate API keys for configured models
  if (configModels && configModels.length > 0) {
    const warnings = validateApiKeys(configModels);
    if (warnings.length > 0) {
      console.log('');
      for (const w of warnings) {
        console.warn(`⚠ ${w}`);
      }
      console.log('');
    }
  }

  // Use non-standard ports for auto-start to avoid conflicts with a running dev server.
  const autoStartMcpPort = 18920;
  const autoStartInspectorPort = 18921;
  // Default server for non-sunpeak projects (standard MCP port)
  const defaultMcpPort = 8000;
  let devServerHandle = null;
  let mcpPort = defaultMcpPort;

  if (isSunpeakProject() && !process.env.SUNPEAK_EVAL_SERVER) {
    // Auto-start dev server for sunpeak projects on non-standard ports
    mcpPort = autoStartMcpPort;
    try {
      devServerHandle = await startDevServer({
        inspectorPort: autoStartInspectorPort,
        mcpPort: autoStartMcpPort,
      });
    } catch (err) {
      console.error('Failed to start dev server:', err.message);
      return 1;
    }
  } else if (!process.env.SUNPEAK_EVAL_SERVER) {
    // For non-sunpeak projects, check if the server is running
    try {
      await fetch(`http://localhost:${defaultMcpPort}/health`, {
        signal: AbortSignal.timeout(3000),
      });
    } catch {
      const defaultServer = `http://localhost:${defaultMcpPort}/mcp`;
      console.error(`MCP server not reachable at ${defaultServer}`);
      console.error('Make sure your MCP server is running, or set the server URL in eval.config.ts.\n');
      return 1;
    }
  }

  const defaultServer = `http://localhost:${mcpPort}/mcp`;

  // Use .ts extension so vitest handles TypeScript imports natively
  const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');

  const evalPluginPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-vitest-plugin.mjs');
  const evalReporterPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-reporter.mjs');

  // Clean up dev server and temp config
  const cleanupResources = () => {
    if (devServerHandle?.process) {
      devServerHandle.process.kill();
      devServerHandle = null;
    }
    // Best effort: the temp config may never have been written.
    try { unlinkSync(vitestConfigPath); } catch {}
  };
  // On unexpected signal, clean up and re-exit with conventional signal code
  const onSignal = (signal) => {
    cleanupResources();
    process.exit(signal === 'SIGINT' ? 130 : 143);
  };
  const onSigInt = () => onSignal('SIGINT');
  const onSigTerm = () => onSignal('SIGTERM');
  process.on('SIGINT', onSigInt);
  process.on('SIGTERM', onSigTerm);

  // Resolve config import path relative to the generated config file location
  const configImportPath = configFile
    ? './' + basename(configFile)
    : null;

  // Generate a vitest config that loads the eval config and plugin
  const vitestConfig = `
import { defineConfig } from 'vitest/config';
import { evalVitestPlugin } from ${JSON.stringify(evalPluginPath)};
${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}

export default defineConfig({
test: {
globals: true,
include: ['**/*.eval.ts', '**/*.eval.js'],
reporters: ['default', ${JSON.stringify(evalReporterPath)}],
testTimeout: 600000,
},
plugins: [
evalVitestPlugin({
server: evalConfig.server || process.env.SUNPEAK_EVAL_SERVER || ${JSON.stringify(defaultServer)},
models: evalConfig.models || [],
defaults: evalConfig.defaults || {},
}),
],
});
`;

  try {
    // Written inside the try so that a failed write (e.g. read-only directory)
    // still stops the dev server and removes the signal handlers below.
    writeFileSync(vitestConfigPath, vitestConfig);

    const code = await runChild(
      'pnpm',
      ['exec', 'vitest', 'run', '--config', vitestConfigPath, ...args]
    );
    return code;
  } finally {
    cleanupResources();
    process.removeListener('SIGINT', onSigInt);
    process.removeListener('SIGTERM', onSigTerm);
  }
}
482
+
/**
 * Locate the eval tests directory under the current working directory.
 *
 * Known locations are tried in order: `tests/evals`, then
 * `tests/sunpeak/evals`.
 *
 * @returns {string | null} The first existing location as a path relative to
 *   the working directory, or `null` when neither exists.
 */
function findEvalDir() {
  const root = process.cwd();
  const knownLocations = ['tests/evals', 'tests/sunpeak/evals'];

  const found = knownLocations.find((relativeDir) => existsSync(join(root, relativeDir)));
  return found ?? null;
}
@@ -0,0 +1,105 @@
/**
 * Custom vitest reporter for eval results.
 *
 * Parses __SUNPEAK_EVAL__ log lines from test console output and prints
 * a statistical summary with pass/fail counts per model per case.
 */

const EVAL_PREFIX = '__SUNPEAK_EVAL__';

/**
 * Parse one JSON payload that followed the eval prefix.
 * @param {string} text
 * @returns {{ ok: boolean, data?: any }} `ok` is false when `text` is not valid JSON.
 */
function parseEvalPayload(text) {
  try {
    return { ok: true, data: JSON.parse(text) };
  } catch {
    return { ok: false };
  }
}

export default class EvalReporter {
  /** @type {import('./eval-types.d.mts').EvalCaseResult[]} */
  results = [];
  /** @type {boolean} Guards against printing twice when both end hooks fire. */
  printed = false;

  /**
   * Called for each console.log from test code.
   *
   * Collects every payload of type `eval-result` that follows the eval prefix.
   * A single log entry may carry several lines (more than one result, or
   * unrelated output before or after a result), so when the text after the
   * first prefix is not one valid JSON document, each line is parsed on its
   * own. Lines that do not parse are ignored.
   *
   * @param {{ content: string, type: 'stdout' | 'stderr' }} log
   */
  onUserConsoleLog(log) {
    if (log.type !== 'stdout') return;
    const content = String(log.content);
    const idx = content.indexOf(EVAL_PREFIX);
    if (idx === -1) return;

    // Fast path: everything after the prefix is a single JSON document.
    const whole = parseEvalPayload(content.slice(idx + EVAL_PREFIX.length));
    if (whole.ok) {
      if (whole.data?.type === 'eval-result') this.results.push(whole.data);
      return;
    }

    // Fallback: the entry holds several lines; handle each prefixed line alone
    // so one unparsable line does not discard the others.
    for (const line of content.split('\n')) {
      const at = line.indexOf(EVAL_PREFIX);
      if (at === -1) continue;
      const parsed = parseEvalPayload(line.slice(at + EVAL_PREFIX.length));
      if (parsed.ok && parsed.data?.type === 'eval-result') {
        this.results.push(parsed.data);
      }
    }
  }

  /**
   * Called when the test run ends (vitest v4+).
   */
  onTestRunEnd() {
    this.printSummary();
  }

  /**
   * Fallback for older vitest versions.
   */
  onFinished() {
    this.printSummary();
  }

  /**
   * Print the collected results grouped by case, one row per model, followed
   * by an overall summary line. Prints at most once, and nothing at all when
   * no results were collected.
   */
  printSummary() {
    if (this.results.length === 0 || this.printed) return;
    this.printed = true;

    console.log('\n' + '='.repeat(60));
    console.log('Eval Results');
    console.log('='.repeat(60));

    // Group by case name
    const byCase = new Map();
    for (const r of this.results) {
      if (!byCase.has(r.caseName)) byCase.set(r.caseName, []);
      byCase.get(r.caseName).push(r);
    }

    let totalPassed = 0;
    let totalRuns = 0;

    for (const [caseName, caseResults] of byCase) {
      console.log(`\n ${caseName}`);

      // Pad model IDs to a common width so the columns line up within a case.
      const maxLen = Math.max(...caseResults.map((r) => r.modelId.length));

      for (const r of caseResults) {
        totalPassed += r.passed;
        totalRuns += r.runs;

        const pct = (r.passRate * 100).toFixed(0);
        const avgMs = r.avgDurationMs.toFixed(0);
        // Green check at 100%, yellow tilde at >= 80%, red cross below that.
        const status =
          r.passRate === 1
            ? '\x1b[32m\u2713\x1b[0m'
            : r.passRate >= 0.8
              ? '\x1b[33m~\x1b[0m'
              : '\x1b[31m\u2717\x1b[0m';

        console.log(
          ` ${status} ${r.modelId.padEnd(maxLen)} ${r.passed}/${r.runs} passed (${pct}%) avg ${avgMs}ms`
        );

        // Tolerate results emitted without a failures list.
        for (const f of r.failures ?? []) {
          console.log(` \x1b[2m\u2514 ${f.error} (${f.count}x)\x1b[0m`);
        }
      }
    }

    const totalPct = totalRuns > 0 ? ((totalPassed / totalRuns) * 100).toFixed(0) : 0;
    const modelCount = new Set(this.results.map((r) => r.modelId)).size;
    console.log(
      `\n Summary: ${totalPassed}/${totalRuns} passed (${totalPct}%) across ${modelCount} model(s)`
    );
    console.log('='.repeat(60) + '\n');
  }
}