sunpeak 0.19.2 → 0.19.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/bin/commands/dev.mjs +1 -1
- package/bin/commands/inspect.mjs +1 -1
- package/bin/commands/new.mjs +9 -5
- package/bin/commands/start.mjs +3 -1
- package/bin/commands/test-init.mjs +478 -76
- package/bin/commands/test.mjs +357 -4
- package/bin/lib/eval/eval-reporter.mjs +105 -0
- package/bin/lib/eval/eval-runner.mjs +310 -0
- package/bin/lib/eval/eval-types.d.mts +168 -0
- package/bin/lib/eval/eval-vitest-plugin.mjs +158 -0
- package/bin/lib/eval/model-registry.mjs +73 -0
- package/bin/lib/sandbox-server.mjs +5 -2
- package/bin/sunpeak.js +1 -0
- package/dist/chatgpt/index.cjs +1 -1
- package/dist/chatgpt/index.js +1 -1
- package/dist/claude/index.cjs +1 -1
- package/dist/claude/index.js +1 -1
- package/dist/host/chatgpt/index.cjs +1 -1
- package/dist/host/chatgpt/index.js +1 -1
- package/dist/index.cjs +134 -124
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +3 -1
- package/dist/index.js +71 -62
- package/dist/index.js.map +1 -1
- package/dist/inspector/index.cjs +1 -1
- package/dist/inspector/index.js +1 -1
- package/dist/{inspector-Cdo5BK2D.js → inspector-D5DckQuU.js} +236 -98
- package/dist/inspector-D5DckQuU.js.map +1 -0
- package/dist/{inspector-8nPV2A-z.cjs → inspector-jY9O18z9.cjs} +237 -99
- package/dist/inspector-jY9O18z9.cjs.map +1 -0
- package/dist/mcp/index.cjs +237 -140
- package/dist/mcp/index.cjs.map +1 -1
- package/dist/mcp/index.d.ts +1 -1
- package/dist/mcp/index.js +230 -134
- package/dist/mcp/index.js.map +1 -1
- package/dist/mcp/production-server.d.ts +31 -0
- package/dist/{protocol-C7kTcBr_.cjs → protocol-C8pFDmcy.cjs} +8194 -8187
- package/dist/protocol-C8pFDmcy.cjs.map +1 -0
- package/dist/{protocol-BfAACnv0.js → protocol-CRqiPTLT.js} +8186 -8185
- package/dist/protocol-CRqiPTLT.js.map +1 -0
- package/dist/{use-app-CfP9VypY.js → use-app-Bfargfa3.js} +194 -94
- package/dist/use-app-Bfargfa3.js.map +1 -0
- package/dist/{use-app-CzcYw1Kz.cjs → use-app-CbsBEmwv.cjs} +254 -148
- package/dist/use-app-CbsBEmwv.cjs.map +1 -0
- package/package.json +27 -3
- package/template/README.md +17 -7
- package/template/_gitignore +2 -0
- package/template/dist/albums/albums.html +15 -15
- package/template/dist/albums/albums.json +1 -1
- package/template/dist/carousel/carousel.html +19 -19
- package/template/dist/carousel/carousel.json +1 -1
- package/template/dist/map/map.html +14 -14
- package/template/dist/map/map.json +1 -1
- package/template/dist/review/review.html +11 -11
- package/template/dist/review/review.json +1 -1
- package/template/node_modules/.bin/vitest +2 -2
- package/template/node_modules/.vite/deps/_metadata.json +3 -3
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js +192 -91
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js +231 -92
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js +208 -105
- package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js.map +1 -1
- package/template/node_modules/.vite-mcp/deps/_metadata.json +25 -25
- package/template/node_modules/.vite-mcp/deps/{protocol-B_qKkui_.js → protocol-BqGB4zBx.js} +45 -45
- package/template/node_modules/.vite-mcp/deps/protocol-BqGB4zBx.js.map +1 -0
- package/template/node_modules/.vite-mcp/deps/vitest.js +7 -7
- package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-chatgpt-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-claude-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-chatgpt-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-claude-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-chatgpt-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-claude-darwin.png +0 -0
- package/template/tests/evals/.env.example +5 -0
- package/template/tests/evals/albums.eval.ts +28 -0
- package/template/tests/evals/carousel.eval.ts +26 -0
- package/template/tests/evals/eval.config.ts +26 -0
- package/template/tests/evals/map.eval.ts +23 -0
- package/template/tests/evals/review.eval.ts +48 -0
- package/dist/inspector-8nPV2A-z.cjs.map +0 -1
- package/dist/inspector-Cdo5BK2D.js.map +0 -1
- package/dist/protocol-BfAACnv0.js.map +0 -1
- package/dist/protocol-C7kTcBr_.cjs.map +0 -1
- package/dist/use-app-CfP9VypY.js.map +0 -1
- package/dist/use-app-CzcYw1Kz.cjs.map +0 -1
- package/template/node_modules/.vite-mcp/deps/protocol-B_qKkui_.js.map +0 -1
package/bin/commands/test.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { spawn } from 'child_process';
|
|
2
|
-
import { existsSync } from 'fs';
|
|
2
|
+
import { existsSync, readFileSync, writeFileSync, unlinkSync } from 'fs';
|
|
3
3
|
import { join } from 'path';
|
|
4
4
|
|
|
5
5
|
/**
|
|
@@ -10,13 +10,15 @@ import { join } from 'path';
|
|
|
10
10
|
* sunpeak test --unit Run unit tests (vitest)
|
|
11
11
|
* sunpeak test --e2e Run e2e tests (Playwright)
|
|
12
12
|
* sunpeak test --live Run live tests against real hosts
|
|
13
|
+
* sunpeak test --eval Run evals against LLM models
|
|
13
14
|
* sunpeak test --visual Run e2e tests with visual regression comparison
|
|
14
15
|
* sunpeak test --visual --update Update visual regression baselines
|
|
15
16
|
* sunpeak test [pattern] Pass through to the relevant runner
|
|
16
17
|
*
|
|
17
|
-
* Flags are additive: --unit --e2e --live runs all
|
|
18
|
+
* Flags are additive: --unit --e2e --live --eval runs all four.
|
|
18
19
|
* --visual implies --e2e and enables screenshot comparison.
|
|
19
20
|
* --update implies --visual.
|
|
21
|
+
* --eval and --live are never included in the default run (they cost money).
|
|
20
22
|
*/
|
|
21
23
|
export async function runTest(args) {
|
|
22
24
|
// Handle `sunpeak test init` subcommand
|
|
@@ -29,16 +31,17 @@ export async function runTest(args) {
|
|
|
29
31
|
const isUnit = args.includes('--unit');
|
|
30
32
|
const isE2e = args.includes('--e2e');
|
|
31
33
|
const isLive = args.includes('--live');
|
|
34
|
+
const isEval = args.includes('--eval');
|
|
32
35
|
let isVisual = args.includes('--visual');
|
|
33
36
|
const isUpdate = args.includes('--update');
|
|
34
37
|
const filteredArgs = args.filter(
|
|
35
|
-
(a) => !['--unit', '--e2e', '--live', '--visual', '--update'].includes(a)
|
|
38
|
+
(a) => !['--unit', '--e2e', '--live', '--eval', '--visual', '--update'].includes(a)
|
|
36
39
|
);
|
|
37
40
|
|
|
38
41
|
// --update implies --visual (no point updating without enabling visual)
|
|
39
42
|
if (isUpdate) isVisual = true;
|
|
40
43
|
|
|
41
|
-
const hasAnyScope = isUnit || isE2e || isLive || isVisual;
|
|
44
|
+
const hasAnyScope = isUnit || isE2e || isLive || isEval || isVisual;
|
|
42
45
|
|
|
43
46
|
// When extra args are present (file patterns, etc.) and no scope flags given,
|
|
44
47
|
// default to e2e only — passing Playwright file patterns to vitest would fail.
|
|
@@ -47,9 +50,11 @@ export async function runTest(args) {
|
|
|
47
50
|
// Determine which suites to run.
|
|
48
51
|
// No scope flags → unit + e2e (unless extra args narrow to e2e).
|
|
49
52
|
// --visual implies e2e.
|
|
53
|
+
// --eval and --live are never in the default run (they cost money).
|
|
50
54
|
const runUnit = hasAnyScope ? isUnit : !hasExtraArgs;
|
|
51
55
|
const runE2e = hasAnyScope ? (isE2e || isVisual) : true;
|
|
52
56
|
const runLive = isLive;
|
|
57
|
+
const runEval = isEval;
|
|
53
58
|
|
|
54
59
|
const results = [];
|
|
55
60
|
|
|
@@ -84,6 +89,11 @@ export async function runTest(args) {
|
|
|
84
89
|
results.push({ suite: 'live', code });
|
|
85
90
|
}
|
|
86
91
|
|
|
92
|
+
if (runEval) {
|
|
93
|
+
const code = await runEvals(filteredArgs);
|
|
94
|
+
results.push({ suite: 'eval', code });
|
|
95
|
+
}
|
|
96
|
+
|
|
87
97
|
// Exit with the first non-zero code, or 0 if all passed
|
|
88
98
|
const failed = results.find((r) => r.code !== 0);
|
|
89
99
|
process.exit(failed ? failed.code : 0);
|
|
@@ -142,3 +152,346 @@ function findConfig(candidates) {
|
|
|
142
152
|
}
|
|
143
153
|
return null;
|
|
144
154
|
}
|
|
155
|
+
|
|
156
|
+
/**
 * Report whether the working directory looks like a sunpeak app project.
 *
 * Both of the following must hold:
 *   1. package.json lists `sunpeak` under dependencies or devDependencies.
 *   2. A src/tools/ directory exists (the sunpeak app convention).
 *
 * The dependency on its own is not sufficient, because the testing framework
 * can be used against any MCP server.
 *
 * @returns {boolean} true when both conditions hold; false when package.json
 *   is missing, cannot be read, or cannot be parsed.
 */
function isSunpeakProject() {
  const root = process.cwd();
  const manifestPath = join(root, 'package.json');

  if (!existsSync(manifestPath)) {
    return false;
  }

  try {
    const { dependencies, devDependencies } = JSON.parse(readFileSync(manifestPath, 'utf-8'));
    const declared = { ...dependencies, ...devDependencies };
    // A declared dependency only counts when the tools directory is present too.
    return 'sunpeak' in declared && existsSync(join(root, 'src', 'tools'));
  } catch {
    // Unreadable or malformed manifest: treat as "not a sunpeak project".
    return false;
  }
}
|
|
174
|
+
|
|
175
|
+
/**
 * Start the dev server and wait for it to be ready.
 *
 * If something already answers OK on `http://localhost:<mcpPort>/health`,
 * nothing is spawned and `null` is returned. Otherwise `sunpeak dev` is
 * launched through pnpm with PORT / SUNPEAK_MCP_PORT set from the arguments,
 * and both health endpoints are polled every 500ms for up to 60 seconds.
 *
 * @param {{ inspectorPort: number, mcpPort: number }} ports
 * @returns {Promise<{ process: import('child_process').ChildProcess } | null>}
 *   Handle to the spawned child, or null when a server was already running.
 * @throws {Error} If the child fails to spawn, exits before becoming ready,
 *   or is not ready within the timeout (the child is killed in that case).
 */
async function startDevServer({ inspectorPort, mcpPort }) {
  // Check if the MCP server is already running
  try {
    const resp = await fetch(`http://localhost:${mcpPort}/health`, {
      signal: AbortSignal.timeout(2000),
    });
    if (resp.ok) return null; // Server already running
  } catch {
    // Not running, start it
  }

  console.log('Starting dev server for evals...');
  const child = spawn(
    'pnpm',
    ['exec', 'sunpeak', 'dev', '--', '--prod-tools'],
    {
      stdio: ['ignore', 'pipe', 'pipe'],
      env: {
        ...process.env,
        PORT: String(inspectorPort),
        SUNPEAK_MCP_PORT: String(mcpPort),
        SUNPEAK_DEV_OVERLAY: 'false',
      },
    }
  );

  // Detect early exit (build errors, port conflicts, etc.)
  let exited = false;
  let spawnError = null;
  const stderrChunks = [];
  const collectStderr = (chunk) => stderrChunks.push(chunk.toString());
  child.stderr.on('data', collectStderr);
  // stdout is piped but its content is never used. Drain it so the pipe
  // buffer cannot fill up and block the child on write.
  child.stdout.resume();
  // Without an 'error' listener a failed spawn (e.g. pnpm not on PATH) would
  // be thrown as an unhandled event instead of reaching the caller's catch.
  child.on('error', (err) => {
    spawnError = err;
    exited = true;
  });
  child.on('exit', () => {
    exited = true;
  });

  const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

  // Wait for the inspector health endpoint
  const timeout = 60_000;
  const start = Date.now();
  while (Date.now() - start < timeout) {
    if (exited) {
      const detail = spawnError ? spawnError.message : stderrChunks.join('');
      throw new Error(`Dev server exited before becoming ready.\n${detail}`);
    }
    try {
      const resp = await fetch(`http://localhost:${inspectorPort}/health`, {
        signal: AbortSignal.timeout(2000),
      });
      if (resp.ok) {
        // Also verify the MCP server is reachable. A rejection here lands in
        // the catch below, which waits and polls again.
        await fetch(`http://localhost:${mcpPort}/health`, {
          signal: AbortSignal.timeout(2000),
        });
        console.log('Dev server ready');
        // Startup diagnostics are no longer needed: stop buffering stderr
        // (it would grow for the whole eval run) but keep draining the pipe.
        child.stderr.off('data', collectStderr);
        child.stderr.resume();
        return { process: child };
      }
    } catch {
      // Not ready yet
    }
    await sleep(500);
  }

  child.kill();
  throw new Error(`Dev server did not start within ${timeout / 1000}s`);
}
|
|
249
|
+
|
|
250
|
+
/**
 * Load a `.env` file from a directory into process.env.
 *
 * Parsing rules:
 *   - Blank lines and lines starting with `#` are ignored.
 *   - Lines without `=` are ignored.
 *   - A leading shell-style `export ` is stripped, so `export KEY=value`
 *     defines `KEY` rather than a variable literally named `export KEY`.
 *   - One pair of matching surrounding quotes (single or double) is removed
 *     from the value.
 *   - Entries with an empty value are skipped.
 *   - A variable that already has a non-empty value in process.env is never
 *     overwritten.
 *
 * Does nothing when the directory has no `.env` file.
 *
 * @param {string} dir - Directory containing .env file
 * @returns {void}
 */
function loadEnvFile(dir) {
  const envPath = join(dir, '.env');
  if (!existsSync(envPath)) return;

  const content = readFileSync(envPath, 'utf-8');
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.trim().replace(/^export\s+/, '');
    if (!line || line.startsWith('#')) continue;

    const eqIdx = line.indexOf('=');
    if (eqIdx === -1) continue;

    const key = line.slice(0, eqIdx).trim();
    // Strip surrounding quotes
    const value = line
      .slice(eqIdx + 1)
      .trim()
      .replace(/^(['"])(.*)\1$/, '$2');

    if (key && value && !process.env[key]) {
      process.env[key] = value;
    }
  }
}
|
|
273
|
+
|
|
274
|
+
/**
 * Lookup table: AI SDK provider package name -> name of the environment
 * variable that must hold that provider's API key.
 */
const PROVIDER_ENV_VARS = Object.fromEntries([
  ['@ai-sdk/openai', 'OPENAI_API_KEY'],
  ['@ai-sdk/anthropic', 'ANTHROPIC_API_KEY'],
  ['@ai-sdk/google', 'GOOGLE_GENERATIVE_AI_API_KEY'],
]);
|
|
282
|
+
|
|
283
|
+
/**
 * Resolve the AI SDK provider package for a model ID from its prefix.
 *
 * Patterns are tried in order: OpenAI (`gpt-`, `o1`/`o3`/`o4` exactly or
 * followed by `-`, `chatgpt-`), Anthropic (`claude-`), then Google
 * (`gemini-`, `models/gemini-`).
 *
 * @param {string} modelId - Model identifier from the eval config.
 * @returns {string | null} Provider package name, or null when no pattern
 *   matches.
 */
function getProviderForModel(modelId) {
  const prefixTable = [
    [/^(gpt-|o[134]-|o[134]$|chatgpt-)/, '@ai-sdk/openai'],
    [/^claude-/, '@ai-sdk/anthropic'],
    [/^(gemini-|models\/gemini-)/, '@ai-sdk/google'],
  ];

  for (const [pattern, providerPkg] of prefixTable) {
    if (pattern.test(modelId)) {
      return providerPkg;
    }
  }
  return null;
}
|
|
292
|
+
|
|
293
|
+
/**
 * Check that the API key environment variable is set for every provider
 * used by the given models.
 *
 * Each provider is checked once, using the first model that maps to it.
 * Models with no recognised provider are ignored.
 *
 * @param {string[]} models - Model IDs taken from the eval config.
 * @returns {string[]} One warning per provider whose key is missing or
 *   empty; an empty array when nothing is missing.
 */
function validateApiKeys(models) {
  const seenProviders = new Set();
  const messages = [];

  for (const modelId of models) {
    const provider = getProviderForModel(modelId);
    if (!provider || seenProviders.has(provider)) {
      continue;
    }
    seenProviders.add(provider);

    const keyName = PROVIDER_ENV_VARS[provider];
    const keyMissing = Boolean(keyName) && !process.env[keyName];
    if (keyMissing) {
      messages.push(`${keyName} not set (required for ${modelId}). Add it to tests/evals/.env or export it in your shell.`);
    }
  }

  return messages;
}
|
|
312
|
+
|
|
313
|
+
/**
 * Run eval tests via vitest with the eval plugin.
 *
 * Steps:
 *   1. Locate the eval directory and load its `.env` into process.env.
 *   2. Scan eval.config.ts/js textually for the `models: [...]` array; when
 *      the array is effectively empty, print setup instructions and stop.
 *   3. Warn about missing API keys for the detected models.
 *   4. For sunpeak projects (unless SUNPEAK_EVAL_SERVER is set), auto-start
 *      the dev server on dedicated ports; otherwise probe the default MCP port.
 *   5. Write a temporary vitest config into the eval directory, run vitest
 *      with it, then remove the config and stop any dev server started here.
 *
 * @param {string[]} args - Extra CLI arguments forwarded to vitest.
 * @returns {Promise<number>} 1 when the eval directory is missing, the dev
 *   server fails to start, or the default MCP server is unreachable; 0 when
 *   no models are configured; otherwise whatever `runChild` resolves with
 *   (presumably the vitest exit code — defined elsewhere in this file).
 */
async function runEvals(args) {
  const { dirname, resolve, basename } = await import('path');
  const { fileURLToPath } = await import('url');

  const __dirname = dirname(fileURLToPath(import.meta.url));
  const evalDir = findEvalDir();

  if (!evalDir) {
    console.error('No eval directory found. Run "sunpeak test init" to scaffold eval tests.');
    return 1;
  }

  const absEvalDir = resolve(process.cwd(), evalDir);

  // Load .env from eval directory before anything else
  loadEnvFile(absEvalDir);

  // Find eval config and load it to check models before launching vitest
  const configFile = findConfig([
    join(evalDir, 'eval.config.ts'),
    join(evalDir, 'eval.config.js'),
  ]);

  // Quick check: read the config as text and look for a non-empty models
  // array. Evaluating the TS would be the proper check; this heuristic only
  // detects the "everything commented out" case.
  let configModels = null;
  if (configFile) {
    const configContent = readFileSync(configFile, 'utf-8');
    // Check if models array appears to be empty (all lines commented out)
    const modelsMatch = configContent.match(/models:\s*\[([\s\S]*?)\]/);
    if (modelsMatch) {
      const modelsBody = modelsMatch[1].trim();
      // Remove comments and whitespace to check if anything is actually configured
      const uncommented = modelsBody.replace(/\/\/.*$/gm, '').replace(/\/\*[\s\S]*?\*\//g, '').trim();
      if (!uncommented || uncommented === ',') {
        console.log(`\nNo models configured in ${configFile}.`);
        console.log('To run evals, uncomment at least one model in your eval config:\n');
        console.log(' models: [');
        console.log(" 'gpt-4o', // requires OPENAI_API_KEY");
        console.log(" 'claude-sonnet-4-20250514', // requires ANTHROPIC_API_KEY");
        console.log(" 'gemini-2.0-flash', // requires GOOGLE_GENERATIVE_AI_API_KEY");
        console.log(' ],\n');
        return 0;
      }
      // Extract actual model strings for API key validation
      const modelStrings = [...uncommented.matchAll(/['"]([^'"]+)['"]/g)].map(m => m[1]);
      configModels = modelStrings;
    }
  }

  // Validate API keys for configured models
  if (configModels && configModels.length > 0) {
    const warnings = validateApiKeys(configModels);
    if (warnings.length > 0) {
      console.log('');
      for (const w of warnings) {
        console.warn(`⚠ ${w}`);
      }
      console.log('');
    }
  }

  // Use non-standard ports for auto-start to avoid conflicts with a running dev server.
  const autoStartMcpPort = 18920;
  const autoStartInspectorPort = 18921;
  // Default server for non-sunpeak projects (standard MCP port)
  const defaultMcpPort = 8000;
  let devServerHandle = null;
  let mcpPort = defaultMcpPort;

  if (isSunpeakProject() && !process.env.SUNPEAK_EVAL_SERVER) {
    // Auto-start dev server for sunpeak projects on non-standard ports
    mcpPort = autoStartMcpPort;
    try {
      devServerHandle = await startDevServer({
        inspectorPort: autoStartInspectorPort,
        mcpPort: autoStartMcpPort,
      });
    } catch (err) {
      console.error('Failed to start dev server:', err.message);
      return 1;
    }
  } else if (!process.env.SUNPEAK_EVAL_SERVER) {
    // For non-sunpeak projects, check if the server is running.
    // NOTE(review): this probes the default port even when eval.config sets
    // its own `server`; confirm whether that case should skip the probe.
    try {
      await fetch(`http://localhost:${defaultMcpPort}/health`, {
        signal: AbortSignal.timeout(3000),
      });
    } catch {
      const defaultServer = `http://localhost:${defaultMcpPort}/mcp`;
      console.error(`MCP server not reachable at ${defaultServer}`);
      console.error('Make sure your MCP server is running, or set the server URL in eval.config.ts.\n');
      return 1;
    }
  }

  const defaultServer = `http://localhost:${mcpPort}/mcp`;

  // Use .ts extension so vitest handles TypeScript imports natively
  const vitestConfigPath = join(absEvalDir, '.eval-vitest.config.ts');

  const evalPluginPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-vitest-plugin.mjs');
  const evalReporterPath = resolve(__dirname, '..', 'lib', 'eval', 'eval-reporter.mjs');

  // Clean up dev server and temp config
  const cleanupResources = () => {
    if (devServerHandle?.process) {
      devServerHandle.process.kill();
      devServerHandle = null;
    }
    // Best effort: the temp config may not exist (never written, or already removed).
    try { unlinkSync(vitestConfigPath); } catch {}
  };
  // On unexpected signal, clean up and re-exit with conventional signal code
  const onSignal = (signal) => {
    cleanupResources();
    process.exit(signal === 'SIGINT' ? 130 : 143);
  };
  const onSigInt = () => onSignal('SIGINT');
  const onSigTerm = () => onSignal('SIGTERM');
  process.on('SIGINT', onSigInt);
  process.on('SIGTERM', onSigTerm);

  // Resolve config import path relative to the generated config file location
  const configImportPath = configFile
    ? './' + basename(configFile)
    : null;

  // Generate a vitest config that loads the eval config and plugin
  const vitestConfig = `
import { defineConfig } from 'vitest/config';
import { evalVitestPlugin } from ${JSON.stringify(evalPluginPath)};
${configImportPath ? `import evalConfig from ${JSON.stringify(configImportPath)};` : 'const evalConfig = { models: [], defaults: {} };'}

export default defineConfig({
  test: {
    globals: true,
    include: ['**/*.eval.ts', '**/*.eval.js'],
    reporters: ['default', ${JSON.stringify(evalReporterPath)}],
    testTimeout: 600000,
  },
  plugins: [
    evalVitestPlugin({
      server: evalConfig.server || process.env.SUNPEAK_EVAL_SERVER || ${JSON.stringify(defaultServer)},
      models: evalConfig.models || [],
      defaults: evalConfig.defaults || {},
    }),
  ],
});
`;

  try {
    // Written inside the try so that a failed write still stops the dev
    // server and removes the signal handlers registered above.
    writeFileSync(vitestConfigPath, vitestConfig);

    const code = await runChild(
      'pnpm',
      ['exec', 'vitest', 'run', '--config', vitestConfigPath, ...args]
    );
    return code;
  } finally {
    cleanupResources();
    process.removeListener('SIGINT', onSigInt);
    process.removeListener('SIGTERM', onSigTerm);
  }
}
|
|
482
|
+
|
|
483
|
+
/**
 * Locate the eval tests directory under the current working directory.
 *
 * Checks `tests/evals` first, then `tests/sunpeak/evals`.
 *
 * @returns {string | null} The first candidate that exists, as a path
 *   relative to the working directory, or null when neither exists.
 */
function findEvalDir() {
  const match = ['tests/evals', 'tests/sunpeak/evals'].find((relPath) =>
    existsSync(join(process.cwd(), relPath))
  );
  return match ?? null;
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
 * Custom vitest reporter for eval results.
 *
 * Parses __SUNPEAK_EVAL__ log lines from test console output and prints
 * a statistical summary with pass/fail counts per model per case.
 */

const EVAL_PREFIX = '__SUNPEAK_EVAL__';

export default class EvalReporter {
  /** @type {import('./eval-types.d.mts').EvalCaseResult[]} */
  results = [];
  /** @type {boolean} True once the summary has been printed, so it prints only once. */
  printed = false;

  /**
   * Called for each console.log from test code.
   *
   * Only stdout chunks are inspected. The text after the first prefix is
   * parsed as one JSON document. If that fails, the chunk is re-scanned line
   * by line, so a chunk carrying several eval lines still yields every
   * result instead of none. Payloads that are not valid JSON, or whose
   * `type` is not 'eval-result', are ignored.
   *
   * @param {{ content: string, type: 'stdout' | 'stderr' }} log
   */
  onUserConsoleLog(log) {
    if (log.type !== 'stdout') return;
    const idx = log.content.indexOf(EVAL_PREFIX);
    if (idx === -1) return;

    if (this.#collect(log.content.slice(idx + EVAL_PREFIX.length))) return;

    // Fallback: the chunk may hold several prefixed lines (or trailing output).
    for (const line of log.content.split('\n')) {
      const lineIdx = line.indexOf(EVAL_PREFIX);
      if (lineIdx !== -1) {
        this.#collect(line.slice(lineIdx + EVAL_PREFIX.length));
      }
    }
  }

  /**
   * Parse one JSON payload and record it when it is an eval result.
   * @param {string} json - Text following the eval prefix.
   * @returns {boolean} true when the text parsed as JSON (recorded or not).
   */
  #collect(json) {
    try {
      const data = JSON.parse(json);
      if (data?.type === 'eval-result') {
        this.results.push(data);
      }
      return true;
    } catch {
      // Not valid JSON; the caller decides whether to retry line by line.
      return false;
    }
  }

  /**
   * Called when the test run ends (vitest v4+).
   */
  onTestRunEnd() {
    this.printSummary();
  }

  /**
   * Fallback for older vitest versions.
   */
  onFinished() {
    this.printSummary();
  }

  /**
   * Print the collected results grouped by case name, followed by overall
   * totals. Does nothing when no results were collected or when the summary
   * was already printed (both end-of-run hooks may fire).
   */
  printSummary() {
    if (this.results.length === 0 || this.printed) return;
    this.printed = true;

    console.log('\n' + '='.repeat(60));
    console.log('Eval Results');
    console.log('='.repeat(60));

    // Group by case name
    const byCase = new Map();
    for (const r of this.results) {
      if (!byCase.has(r.caseName)) byCase.set(r.caseName, []);
      byCase.get(r.caseName).push(r);
    }

    let totalPassed = 0;
    let totalRuns = 0;

    for (const [caseName, caseResults] of byCase) {
      console.log(`\n ${caseName}`);

      // Pad model IDs to a common width so the columns line up.
      const maxLen = Math.max(...caseResults.map((r) => r.modelId.length));

      for (const r of caseResults) {
        totalPassed += r.passed;
        totalRuns += r.runs;

        const pct = (r.passRate * 100).toFixed(0);
        const avgMs = r.avgDurationMs.toFixed(0);
        // Green check for a perfect rate, yellow tilde from 80%, red cross below.
        const status =
          r.passRate === 1
            ? '\x1b[32m\u2713\x1b[0m'
            : r.passRate >= 0.8
              ? '\x1b[33m~\x1b[0m'
              : '\x1b[31m\u2717\x1b[0m';

        console.log(
          ` ${status} ${r.modelId.padEnd(maxLen)} ${r.passed}/${r.runs} passed (${pct}%) avg ${avgMs}ms`
        );

        // Tolerate results that carry no failures array.
        for (const f of r.failures ?? []) {
          console.log(` \x1b[2m\u2514 ${f.error} (${f.count}x)\x1b[0m`);
        }
      }
    }

    const totalPct = totalRuns > 0 ? ((totalPassed / totalRuns) * 100).toFixed(0) : '0';
    const modelCount = new Set(this.results.map((r) => r.modelId)).size;
    console.log(
      `\n Summary: ${totalPassed}/${totalRuns} passed (${totalPct}%) across ${modelCount} model(s)`
    );
    console.log('='.repeat(60) + '\n');
  }
}
|