sunpeak 0.20.36 → 0.20.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -2
- package/bin/commands/inspect.mjs +291 -64
- package/bin/commands/test-init.mjs +6 -0
- package/bin/lib/eval/eval-runner.mjs +53 -1
- package/bin/lib/eval/eval-types.d.mts +27 -0
- package/bin/lib/eval/model-registry.mjs +2 -2
- package/dist/chatgpt/index.cjs +1 -1
- package/dist/chatgpt/index.js +1 -1
- package/dist/claude/index.cjs +1 -1
- package/dist/claude/index.js +1 -1
- package/dist/embed.css +1 -1
- package/dist/hooks/tool-data-store.d.ts +26 -0
- package/dist/hooks/use-tool-data.d.ts +3 -9
- package/dist/host/chatgpt/index.cjs +1 -1
- package/dist/host/chatgpt/index.js +1 -1
- package/dist/index.cjs +35 -21
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +35 -21
- package/dist/index.js.map +1 -1
- package/dist/inspector/index.cjs +1 -1
- package/dist/inspector/index.js +1 -1
- package/dist/inspector/inspector.d.ts +7 -0
- package/dist/inspector/use-inspector-state.d.ts +28 -0
- package/dist/{inspector-CiuT_2yA.js → inspector-BSha-CAW.js} +216 -75
- package/dist/inspector-BSha-CAW.js.map +1 -0
- package/dist/{inspector-BNWla95w.cjs → inspector-Chhc2GNO.cjs} +216 -75
- package/dist/inspector-Chhc2GNO.cjs.map +1 -0
- package/dist/lib/utils.d.ts +8 -7
- package/dist/mcp/index.cjs +5 -3
- package/dist/mcp/index.cjs.map +1 -1
- package/dist/mcp/index.js +5 -3
- package/dist/mcp/index.js.map +1 -1
- package/dist/mcp/server.d.ts +12 -1
- package/dist/style.css +22 -0
- package/dist/{use-app-Duar2Ipu.js → use-app-CtKy52kw.js} +62 -1
- package/dist/use-app-CtKy52kw.js.map +1 -0
- package/dist/{use-app-DUdnDLP5.cjs → use-app-xaiN0HAd.cjs} +62 -1
- package/dist/use-app-xaiN0HAd.cjs.map +1 -0
- package/package.json +8 -8
- package/template/dist/albums/albums.html +3 -3
- package/template/dist/albums/albums.json +1 -1
- package/template/dist/carousel/carousel.html +3 -3
- package/template/dist/carousel/carousel.json +1 -1
- package/template/dist/map/map.html +4 -4
- package/template/dist/map/map.json +1 -1
- package/template/dist/review/review.html +3 -3
- package/template/dist/review/review.json +1 -1
- package/template/node_modules/.bin/tsc +2 -2
- package/template/node_modules/.bin/tsserver +2 -2
- package/template/node_modules/.bin/vitest +2 -2
- package/template/node_modules/.vite/deps/_metadata.json +3 -3
- package/template/node_modules/.vite-mcp/deps/_metadata.json +20 -20
- package/template/node_modules/.vite-mcp/deps/vitest.js +7 -7
- package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
- package/template/package.json +1 -1
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-linux.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-linux.png +0 -0
- package/template/tsconfig.json +2 -0
- package/dist/inspector-BNWla95w.cjs.map +0 -1
- package/dist/inspector-CiuT_2yA.js.map +0 -1
- package/dist/use-app-DUdnDLP5.cjs.map +0 -1
- package/dist/use-app-Duar2Ipu.js.map +0 -1
package/README.md
CHANGED
|
@@ -41,7 +41,9 @@ sunpeak replicates the ChatGPT and Claude runtimes locally so you can:
|
|
|
41
41
|
- Pin tool states with simulation fixtures so UI regressions can't ship.
|
|
42
42
|
- Automate the real-host loop with live tests: scripts that open your browser, prompt ChatGPT, and assert against the rendered app so you stop click-testing by hand.
|
|
43
43
|
|
|
44
|
-
sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions and
|
|
44
|
+
sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions, schemas, and model-visible App Context hold up on cheaper models, not just the flagship ones.
|
|
45
|
+
|
|
46
|
+
Eval cases can seed App Context with `appContext`, which lets you test follow-up prompts such as "Book this one" against state the app has shared through `updateModelContext`.
|
|
45
47
|
|
|
46
48
|
<div align="center">
|
|
47
49
|
<a href="https://sunpeak.ai/docs/testing/evals">
|
|
@@ -99,7 +101,7 @@ npx sunpeak test
|
|
|
99
101
|
|
|
100
102
|
Playwright fixtures handle inspector startup, MCP connection, iframe traversal, and host switching. Works with Python, Go, TypeScript, Rust, or any language.
|
|
101
103
|
|
|
102
|
-
Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions and
|
|
104
|
+
Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions, schemas, and model-visible App Context work reliably across smaller and cheaper models, not just the flagship ones.
|
|
103
105
|
|
|
104
106
|
```ts
|
|
105
107
|
import { test, expect } from 'sunpeak/test';
|
package/bin/commands/inspect.mjs
CHANGED
|
@@ -308,7 +308,7 @@ async function negotiateOAuth(serverUrl) {
|
|
|
308
308
|
|
|
309
309
|
// Try the anonymous/auto-approved path first: follow the authorization URL
|
|
310
310
|
// without a browser and see if it immediately redirects with a code.
|
|
311
|
-
const code = await tryAnonymousOAuth(authUrl.toString(), callbackUrl);
|
|
311
|
+
const code = await tryAnonymousOAuth(authUrl.toString(), callbackUrl, oauthState.stateParam);
|
|
312
312
|
if (code) {
|
|
313
313
|
// Complete the flow with the authorization code.
|
|
314
314
|
const tokenResult = await auth(provider, {
|
|
@@ -347,15 +347,17 @@ async function negotiateOAuth(serverUrl) {
|
|
|
347
347
|
*
|
|
348
348
|
* @param {string} authUrl - The authorization URL
|
|
349
349
|
* @param {string} callbackUrl - The expected callback URL prefix
|
|
350
|
+
* @param {string} [expectedState] - OAuth state value that must be echoed by the callback
|
|
351
|
+
* @param {typeof fetch} [fetchFn]
|
|
350
352
|
* @returns {Promise<string | null>}
|
|
351
353
|
*/
|
|
352
|
-
async function tryAnonymousOAuth(authUrl, callbackUrl) {
|
|
354
|
+
async function tryAnonymousOAuth(authUrl, callbackUrl, expectedState, fetchFn = fetch) {
|
|
353
355
|
// Follow redirects manually to detect when the server redirects back
|
|
354
356
|
// to our callback URL with a code parameter.
|
|
355
357
|
let url = authUrl;
|
|
356
358
|
const maxRedirects = 10;
|
|
357
359
|
for (let i = 0; i < maxRedirects; i++) {
|
|
358
|
-
const response = await fetch(url, { redirect: 'manual' });
|
|
360
|
+
const response = await fetchFn(url, { redirect: 'manual' });
|
|
359
361
|
const location = response.headers.get('location');
|
|
360
362
|
|
|
361
363
|
if (!location) {
|
|
@@ -366,11 +368,21 @@ async function tryAnonymousOAuth(authUrl, callbackUrl) {
|
|
|
366
368
|
}
|
|
367
369
|
|
|
368
370
|
// Resolve relative redirects.
|
|
369
|
-
const
|
|
371
|
+
const resolvedUrl = new URL(location, url);
|
|
372
|
+
if (resolvedUrl.protocol !== 'http:' && resolvedUrl.protocol !== 'https:') {
|
|
373
|
+
throw new Error(
|
|
374
|
+
`OAuth authorization redirect has unsupported scheme: ${resolvedUrl.protocol}`
|
|
375
|
+
);
|
|
376
|
+
}
|
|
377
|
+
const resolved = resolvedUrl.toString();
|
|
370
378
|
|
|
371
379
|
// Check if the redirect goes to our callback URL.
|
|
372
380
|
if (resolved.startsWith(callbackUrl)) {
|
|
373
381
|
const params = new URL(resolved).searchParams;
|
|
382
|
+
const state = params.get('state');
|
|
383
|
+
if (expectedState && state !== expectedState) {
|
|
384
|
+
throw new Error('OAuth state mismatch — callback rejected');
|
|
385
|
+
}
|
|
374
386
|
const code = params.get('code');
|
|
375
387
|
if (code) return code;
|
|
376
388
|
const error = params.get('error');
|
|
@@ -906,45 +918,68 @@ async function discoverSimulations(client) {
|
|
|
906
918
|
|
|
907
919
|
/**
|
|
908
920
|
* Load simulation JSON fixtures from a directory and merge into discovered simulations.
|
|
921
|
+
*
|
|
922
|
+
* Each fixture becomes a simulation keyed by its filename, so a tool can have
|
|
923
|
+
* multiple fixtures (e.g. `show-albums.json` and `show-albums-empty.json`
|
|
924
|
+
* both targeting tool `show-albums`). Auto-discovered slots are kept only for
|
|
925
|
+
* tools that have no fixture file.
|
|
926
|
+
*
|
|
909
927
|
* @param {string} dir - Simulation directory path
|
|
910
928
|
* @param {Record<string, object>} simulations - Discovered simulations to merge into
|
|
911
929
|
*/
|
|
912
|
-
function mergeSimulationFixtures(dir, simulations) {
|
|
930
|
+
export function mergeSimulationFixtures(dir, simulations) {
|
|
913
931
|
if (!existsSync(dir)) return;
|
|
914
932
|
|
|
915
933
|
const files = readdirSync(dir).filter((f) => f.endsWith('.json'));
|
|
934
|
+
|
|
935
|
+
// Load every fixture first so we can group by tool name. We need the grouping
|
|
936
|
+
// to decide whether to keep the auto-discovered slot (no fixtures) or replace
|
|
937
|
+
// it with one entry per fixture file (one or more fixtures).
|
|
938
|
+
const fixtures = [];
|
|
916
939
|
for (const file of files) {
|
|
917
940
|
try {
|
|
918
941
|
const fixture = JSON.parse(readFileSync(join(dir, file), 'utf-8'));
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
// Find matching simulation by tool name
|
|
923
|
-
const sim = simulations[toolName];
|
|
924
|
-
if (sim) {
|
|
925
|
-
// Merge fixture data into discovered simulation
|
|
926
|
-
if (fixture.toolInput !== undefined) sim.toolInput = fixture.toolInput;
|
|
927
|
-
if (fixture.toolResult !== undefined) sim.toolResult = fixture.toolResult;
|
|
928
|
-
if (fixture.serverTools !== undefined) sim.serverTools = fixture.serverTools;
|
|
929
|
-
if (fixture.userMessage !== undefined) sim.userMessage = fixture.userMessage;
|
|
930
|
-
if (fixture.hostContext !== undefined) sim.hostContext = fixture.hostContext;
|
|
931
|
-
} else {
|
|
932
|
-
// Create a new simulation from the fixture (tool not on server, but user wants to mock it)
|
|
933
|
-
const simName = file.replace(/\.json$/, '');
|
|
934
|
-
simulations[simName] = {
|
|
935
|
-
name: simName,
|
|
936
|
-
tool: { name: toolName, inputSchema: { type: 'object' } },
|
|
937
|
-
toolInput: fixture.toolInput,
|
|
938
|
-
toolResult: fixture.toolResult,
|
|
939
|
-
serverTools: fixture.serverTools,
|
|
940
|
-
userMessage: fixture.userMessage,
|
|
941
|
-
hostContext: fixture.hostContext,
|
|
942
|
-
};
|
|
943
|
-
}
|
|
942
|
+
if (!fixture.tool) continue;
|
|
943
|
+
fixtures.push({ file, fixture });
|
|
944
944
|
} catch (err) {
|
|
945
945
|
console.warn(`Warning: Failed to parse simulation fixture ${file}:`, err.message);
|
|
946
946
|
}
|
|
947
947
|
}
|
|
948
|
+
|
|
949
|
+
const byTool = new Map();
|
|
950
|
+
for (const item of fixtures) {
|
|
951
|
+
const tool = item.fixture.tool;
|
|
952
|
+
if (!byTool.has(tool)) byTool.set(tool, []);
|
|
953
|
+
byTool.get(tool).push(item);
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
for (const [toolName, items] of byTool) {
|
|
957
|
+
const discovered = simulations[toolName];
|
|
958
|
+
|
|
959
|
+
// Drop the auto-discovered slot if none of the fixtures will reuse its
|
|
960
|
+
// key (filename === tool name). Otherwise the named fixture overwrites
|
|
961
|
+
// it in place below.
|
|
962
|
+
const reusesSlot = items.some(({ file }) => file.replace(/\.json$/, '') === toolName);
|
|
963
|
+
if (discovered && !reusesSlot) {
|
|
964
|
+
delete simulations[toolName];
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
for (const { file, fixture } of items) {
|
|
968
|
+
const simName = file.replace(/\.json$/, '');
|
|
969
|
+
const sim = discovered
|
|
970
|
+
? { ...discovered, name: simName }
|
|
971
|
+
: {
|
|
972
|
+
name: simName,
|
|
973
|
+
tool: { name: toolName, inputSchema: { type: 'object' } },
|
|
974
|
+
};
|
|
975
|
+
if (fixture.toolInput !== undefined) sim.toolInput = fixture.toolInput;
|
|
976
|
+
if (fixture.toolResult !== undefined) sim.toolResult = fixture.toolResult;
|
|
977
|
+
if (fixture.serverTools !== undefined) sim.serverTools = fixture.serverTools;
|
|
978
|
+
if (fixture.userMessage !== undefined) sim.userMessage = fixture.userMessage;
|
|
979
|
+
if (fixture.hostContext !== undefined) sim.hostContext = fixture.hostContext;
|
|
980
|
+
simulations[simName] = sim;
|
|
981
|
+
}
|
|
982
|
+
}
|
|
948
983
|
}
|
|
949
984
|
|
|
950
985
|
const MODEL_PROVIDERS = new Set(['openai', 'anthropic']);
|
|
@@ -1363,9 +1398,43 @@ function toolRendersApp(tool) {
|
|
|
1363
1398
|
return !!(tool?._meta?.ui?.resourceUri ?? tool?._meta?.['ui/resourceUri']);
|
|
1364
1399
|
}
|
|
1365
1400
|
|
|
1366
|
-
function sanitizeAiSdkSchema(schema) {
|
|
1367
|
-
|
|
1401
|
+
function sanitizeAiSdkSchemaNode(schema) {
|
|
1402
|
+
if (Array.isArray(schema)) {
|
|
1403
|
+
return schema.map((item) => sanitizeAiSdkSchemaNode(item));
|
|
1404
|
+
}
|
|
1405
|
+
if (!schema || typeof schema !== 'object') return schema;
|
|
1406
|
+
|
|
1407
|
+
const clean = { ...schema };
|
|
1368
1408
|
delete clean.$schema;
|
|
1409
|
+
if (
|
|
1410
|
+
clean.properties &&
|
|
1411
|
+
typeof clean.properties === 'object' &&
|
|
1412
|
+
!Array.isArray(clean.properties)
|
|
1413
|
+
) {
|
|
1414
|
+
clean.properties = Object.fromEntries(
|
|
1415
|
+
Object.entries(clean.properties).map(([key, value]) => [
|
|
1416
|
+
key,
|
|
1417
|
+
sanitizeAiSdkSchemaNode(value),
|
|
1418
|
+
])
|
|
1419
|
+
);
|
|
1420
|
+
}
|
|
1421
|
+
if (clean.items !== undefined) {
|
|
1422
|
+
clean.items = sanitizeAiSdkSchemaNode(clean.items);
|
|
1423
|
+
}
|
|
1424
|
+
for (const key of ['anyOf', 'allOf', 'oneOf']) {
|
|
1425
|
+
if (Array.isArray(clean[key])) {
|
|
1426
|
+
clean[key] = clean[key].map((item) => sanitizeAiSdkSchemaNode(item));
|
|
1427
|
+
}
|
|
1428
|
+
}
|
|
1429
|
+
|
|
1430
|
+
const isObjectSchema = clean.type === 'object' || clean.properties != null;
|
|
1431
|
+
if (isObjectSchema) {
|
|
1432
|
+
if (!clean.type) clean.type = 'object';
|
|
1433
|
+
if (!clean.properties) clean.properties = {};
|
|
1434
|
+
clean.additionalProperties = false;
|
|
1435
|
+
return clean;
|
|
1436
|
+
}
|
|
1437
|
+
|
|
1369
1438
|
if (
|
|
1370
1439
|
clean.additionalProperties != null &&
|
|
1371
1440
|
typeof clean.additionalProperties === 'object' &&
|
|
@@ -1373,8 +1442,14 @@ function sanitizeAiSdkSchema(schema) {
|
|
|
1373
1442
|
) {
|
|
1374
1443
|
delete clean.additionalProperties;
|
|
1375
1444
|
}
|
|
1445
|
+
return clean;
|
|
1446
|
+
}
|
|
1447
|
+
|
|
1448
|
+
export function sanitizeAiSdkSchema(schema) {
|
|
1449
|
+
const clean = sanitizeAiSdkSchemaNode(schema || { type: 'object', properties: {} });
|
|
1376
1450
|
if (!clean.type) clean.type = 'object';
|
|
1377
1451
|
if (!clean.properties) clean.properties = {};
|
|
1452
|
+
clean.additionalProperties = false;
|
|
1378
1453
|
return clean;
|
|
1379
1454
|
}
|
|
1380
1455
|
|
|
@@ -1392,15 +1467,39 @@ function normalizeModelId(modelId) {
|
|
|
1392
1467
|
return trimmed;
|
|
1393
1468
|
}
|
|
1394
1469
|
|
|
1470
|
+
function normalizeModelProviderModelId(provider, modelId) {
|
|
1471
|
+
const normalizedModelId = normalizeModelId(modelId);
|
|
1472
|
+
if (
|
|
1473
|
+
provider === 'anthropic' &&
|
|
1474
|
+
/^claude-\d+(?:-\d+)+-(opus|sonnet|haiku)$/i.test(normalizedModelId)
|
|
1475
|
+
) {
|
|
1476
|
+
throw new Error(
|
|
1477
|
+
`Unsupported Anthropic model ID "${normalizedModelId}". Use an Anthropic API model ID such as "claude-sonnet-4-20250514".`
|
|
1478
|
+
);
|
|
1479
|
+
}
|
|
1480
|
+
return normalizedModelId;
|
|
1481
|
+
}
|
|
1482
|
+
|
|
1483
|
+
function normalizeModelConversationId(conversationId) {
|
|
1484
|
+
if (typeof conversationId !== 'string') return undefined;
|
|
1485
|
+
const trimmed = conversationId.trim();
|
|
1486
|
+
if (!trimmed) return undefined;
|
|
1487
|
+
if (trimmed.length > 200 || /[\u0000-\u001f\u007f]/.test(trimmed)) {
|
|
1488
|
+
throw new Error('Invalid model conversation ID.');
|
|
1489
|
+
}
|
|
1490
|
+
return trimmed;
|
|
1491
|
+
}
|
|
1492
|
+
|
|
1395
1493
|
async function createModelInstance(provider, modelId, apiKey) {
|
|
1396
1494
|
assertModelProvider(provider);
|
|
1397
|
-
const normalizedModelId = normalizeModelId(modelId);
|
|
1495
|
+
const normalizedModelId = normalizeModelProviderModelId(provider, modelId);
|
|
1398
1496
|
if (provider === 'openai') {
|
|
1399
1497
|
const { createOpenAI } = await import('@ai-sdk/openai');
|
|
1400
1498
|
const openai = createOpenAI({ apiKey });
|
|
1499
|
+
const settings = { structuredOutputs: false };
|
|
1401
1500
|
return typeof openai.chat === 'function'
|
|
1402
|
-
? openai.chat(normalizedModelId)
|
|
1403
|
-
: openai(normalizedModelId);
|
|
1501
|
+
? openai.chat(normalizedModelId, settings)
|
|
1502
|
+
: openai(normalizedModelId, settings);
|
|
1404
1503
|
}
|
|
1405
1504
|
const { createAnthropic } = await import('@ai-sdk/anthropic');
|
|
1406
1505
|
return createAnthropic({ apiKey })(normalizedModelId);
|
|
@@ -1414,6 +1513,24 @@ function formatJsonForModel(value) {
|
|
|
1414
1513
|
return `${json.slice(0, MODEL_VISIBLE_JSON_LIMIT_BYTES)}...`;
|
|
1415
1514
|
}
|
|
1416
1515
|
|
|
1516
|
+
function normalizeModelChatHost(host) {
|
|
1517
|
+
if (host === 'chatgpt' || host === 'claude') return host;
|
|
1518
|
+
return 'generic';
|
|
1519
|
+
}
|
|
1520
|
+
|
|
1521
|
+
function isPlainObject(value) {
|
|
1522
|
+
return (
|
|
1523
|
+
value !== null &&
|
|
1524
|
+
typeof value === 'object' &&
|
|
1525
|
+
!Array.isArray(value) &&
|
|
1526
|
+
Object.getPrototypeOf(value) === Object.prototype
|
|
1527
|
+
);
|
|
1528
|
+
}
|
|
1529
|
+
|
|
1530
|
+
function normalizeToolArguments(args) {
|
|
1531
|
+
return isPlainObject(args) ? args : {};
|
|
1532
|
+
}
|
|
1533
|
+
|
|
1417
1534
|
function normalizeModelAppContext(appContext) {
|
|
1418
1535
|
if (!appContext || typeof appContext !== 'object') return undefined;
|
|
1419
1536
|
const normalized = {};
|
|
@@ -1432,15 +1549,67 @@ function formatSharedAppContextForModel(appContext) {
|
|
|
1432
1549
|
return formatJsonForModel(normalized);
|
|
1433
1550
|
}
|
|
1434
1551
|
|
|
1435
|
-
function formatModelVisibleToolResult(tool, result) {
|
|
1552
|
+
function normalizeModelChatMessages(messages) {
|
|
1553
|
+
if (!Array.isArray(messages)) return [];
|
|
1554
|
+
return messages
|
|
1555
|
+
.filter((message) => message?.role === 'user' || message?.role === 'assistant')
|
|
1556
|
+
.map((message) => ({
|
|
1557
|
+
role: message.role,
|
|
1558
|
+
content: String(message.content ?? '').slice(0, 20000).trim(),
|
|
1559
|
+
}))
|
|
1560
|
+
.filter((message) => message.content.length > 0);
|
|
1561
|
+
}
|
|
1562
|
+
|
|
1563
|
+
function getToolErrorText(tool, result) {
|
|
1564
|
+
const toolName = tool?.name || 'MCP tool';
|
|
1565
|
+
const text = (result?.content || [])
|
|
1566
|
+
.filter((part) => part && typeof part === 'object' && part.type === 'text')
|
|
1567
|
+
.map((part) => String(part.text ?? ''))
|
|
1568
|
+
.join('\n')
|
|
1569
|
+
.trim();
|
|
1570
|
+
if (text) return text;
|
|
1571
|
+
if (result?.structuredContent !== undefined) {
|
|
1572
|
+
return formatJsonForModel({ structuredContent: result.structuredContent });
|
|
1573
|
+
}
|
|
1574
|
+
return text || `${toolName} returned an error.`;
|
|
1575
|
+
}
|
|
1576
|
+
|
|
1577
|
+
function formatModelVisibleToolError(tool, result, { host, arguments: args, toolCallId } = {}) {
|
|
1578
|
+
const toolName = tool?.name || 'MCP tool';
|
|
1579
|
+
const errorText = getToolErrorText(tool, result);
|
|
1580
|
+
const id = typeof toolCallId === 'string' && toolCallId.trim() ? toolCallId : toolName;
|
|
1581
|
+
|
|
1582
|
+
switch (normalizeModelChatHost(host)) {
|
|
1583
|
+
case 'chatgpt':
|
|
1584
|
+
return {
|
|
1585
|
+
type: 'mcp_call',
|
|
1586
|
+
id,
|
|
1587
|
+
name: toolName,
|
|
1588
|
+
arguments: normalizeToolArguments(args),
|
|
1589
|
+
error: errorText,
|
|
1590
|
+
output: null,
|
|
1591
|
+
status: 'failed',
|
|
1592
|
+
};
|
|
1593
|
+
case 'claude':
|
|
1594
|
+
return {
|
|
1595
|
+
type: 'mcp_tool_result',
|
|
1596
|
+
tool_use_id: id,
|
|
1597
|
+
is_error: true,
|
|
1598
|
+
content: [{ type: 'text', text: errorText }],
|
|
1599
|
+
};
|
|
1600
|
+
default:
|
|
1601
|
+
return {
|
|
1602
|
+
isError: true,
|
|
1603
|
+
content: [{ type: 'text', text: errorText }],
|
|
1604
|
+
};
|
|
1605
|
+
}
|
|
1606
|
+
}
|
|
1607
|
+
|
|
1608
|
+
function formatModelVisibleToolResult(tool, result, options = {}) {
|
|
1436
1609
|
const toolName = tool?.name || 'MCP tool';
|
|
1437
1610
|
if (result?.isError) {
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
.map((part) => part.text)
|
|
1441
|
-
.join('\n')
|
|
1442
|
-
.trim();
|
|
1443
|
-
return text || `${toolName} returned an error.`;
|
|
1611
|
+
if (options.host) return formatModelVisibleToolError(tool, result, options);
|
|
1612
|
+
return getToolErrorText(tool, result);
|
|
1444
1613
|
}
|
|
1445
1614
|
|
|
1446
1615
|
const visibleResult = {};
|
|
@@ -1457,17 +1626,63 @@ function formatModelVisibleToolResult(tool, result) {
|
|
|
1457
1626
|
: `${toolName} completed.`;
|
|
1458
1627
|
}
|
|
1459
1628
|
|
|
1460
|
-
|
|
1461
|
-
|
|
1629
|
+
function errorToMessage(error) {
|
|
1630
|
+
if (error instanceof Error && error.message) return error.message;
|
|
1631
|
+
return String(error || 'Unknown MCP tool error');
|
|
1632
|
+
}
|
|
1633
|
+
|
|
1634
|
+
function createModelChatToolErrorResult(error) {
|
|
1635
|
+
const message = errorToMessage(error);
|
|
1462
1636
|
return {
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
source: 'mcp',
|
|
1637
|
+
content: [{ type: 'text', text: message }],
|
|
1638
|
+
isError: true,
|
|
1466
1639
|
};
|
|
1467
1640
|
}
|
|
1468
1641
|
|
|
1469
|
-
async function runModelChat({ client, provider, modelId, messages, apiKey, appContext }) {
|
|
1642
|
+
async function executeModelChatToolCall({ client, name, arguments: args }) {
|
|
1643
|
+
const safeArgs = normalizeToolArguments(args);
|
|
1644
|
+
try {
|
|
1645
|
+
return {
|
|
1646
|
+
arguments: safeArgs,
|
|
1647
|
+
result: await client.callTool({ name, arguments: safeArgs }),
|
|
1648
|
+
source: 'mcp',
|
|
1649
|
+
};
|
|
1650
|
+
} catch (error) {
|
|
1651
|
+
return {
|
|
1652
|
+
arguments: safeArgs,
|
|
1653
|
+
result: createModelChatToolErrorResult(error),
|
|
1654
|
+
source: 'mcp',
|
|
1655
|
+
};
|
|
1656
|
+
}
|
|
1657
|
+
}
|
|
1658
|
+
|
|
1659
|
+
function getModelChatHostInstructions(host) {
|
|
1660
|
+
switch (normalizeModelChatHost(host)) {
|
|
1661
|
+
case 'chatgpt':
|
|
1662
|
+
return 'ChatGPT surfaces failed MCP calls as mcp_call items with an error field. When a tool result object has type "mcp_call", status "failed", or a non-empty error field, treat it as a failed MCP call, not a successful result.';
|
|
1663
|
+
case 'claude':
|
|
1664
|
+
return 'Claude surfaces failed MCP calls as mcp_tool_result blocks with is_error=true. When a tool result object has type "mcp_tool_result" and is_error is true, treat it as a failed MCP call, not a successful result.';
|
|
1665
|
+
default:
|
|
1666
|
+
return 'MCP tool failures are model-visible tool results. When a tool result has isError=true or describes a failed MCP call, treat it as a failed tool call, not a successful result.';
|
|
1667
|
+
}
|
|
1668
|
+
}
|
|
1669
|
+
|
|
1670
|
+
function getModelChatRetryInstructions() {
|
|
1671
|
+
return 'After a failed MCP tool call, use the error text to decide the next step. Retry with corrected arguments for validation or business-logic errors. For transient service, timeout, or connectivity errors, you may retry once if the user request still needs the tool. Do not repeat the same failing tool call with the same arguments more than once.';
|
|
1672
|
+
}
|
|
1673
|
+
|
|
1674
|
+
async function runModelChat({
|
|
1675
|
+
client,
|
|
1676
|
+
provider,
|
|
1677
|
+
modelId,
|
|
1678
|
+
messages,
|
|
1679
|
+
apiKey,
|
|
1680
|
+
appContext,
|
|
1681
|
+
host,
|
|
1682
|
+
conversationId,
|
|
1683
|
+
}) {
|
|
1470
1684
|
assertModelProvider(provider);
|
|
1685
|
+
const normalizedHost = normalizeModelChatHost(host);
|
|
1471
1686
|
const { generateText, tool: aiTool, jsonSchema } = await import('ai');
|
|
1472
1687
|
const model = await createModelInstance(provider, modelId, apiKey);
|
|
1473
1688
|
const { tools: mcpTools } = await client.listTools();
|
|
@@ -1479,14 +1694,23 @@ async function runModelChat({ client, provider, modelId, messages, apiKey, appCo
|
|
|
1479
1694
|
description: mcpTool.description || mcpTool.title || '',
|
|
1480
1695
|
inputSchema: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
|
|
1481
1696
|
parameters: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
|
|
1482
|
-
execute: async (args) => {
|
|
1697
|
+
execute: async (args, options) => {
|
|
1483
1698
|
const { arguments: safeArgs, result } = await executeModelChatToolCall({
|
|
1484
1699
|
client,
|
|
1485
1700
|
name: mcpTool.name,
|
|
1486
1701
|
arguments: args,
|
|
1487
1702
|
});
|
|
1488
|
-
capturedToolCalls.push({
|
|
1489
|
-
|
|
1703
|
+
capturedToolCalls.push({
|
|
1704
|
+
name: mcpTool.name,
|
|
1705
|
+
arguments: safeArgs,
|
|
1706
|
+
result,
|
|
1707
|
+
isError: !!result?.isError,
|
|
1708
|
+
});
|
|
1709
|
+
return formatModelVisibleToolResult(mcpTool, result, {
|
|
1710
|
+
host: normalizedHost,
|
|
1711
|
+
arguments: safeArgs,
|
|
1712
|
+
toolCallId: options?.toolCallId,
|
|
1713
|
+
});
|
|
1490
1714
|
},
|
|
1491
1715
|
});
|
|
1492
1716
|
}
|
|
@@ -1497,22 +1721,25 @@ async function runModelChat({ client, provider, modelId, messages, apiKey, appCo
|
|
|
1497
1721
|
model,
|
|
1498
1722
|
tools,
|
|
1499
1723
|
system: [
|
|
1500
|
-
'You are chatting inside the
|
|
1724
|
+
'You are chatting inside the sunpeak Inspector. When you call an MCP tool that renders an app, the host will render the app below your message. Do not repeat raw tool output, JSON, image URLs, markdown image lists, or full item inventories. Keep any narration brief and let the app carry the visual result.',
|
|
1725
|
+
getModelChatHostInstructions(normalizedHost),
|
|
1726
|
+
getModelChatRetryInstructions(),
|
|
1501
1727
|
sharedAppContext
|
|
1502
1728
|
? `Shared MCP App context from the currently rendered app, available for this turn:\n${sharedAppContext}`
|
|
1503
1729
|
: '',
|
|
1504
1730
|
]
|
|
1505
1731
|
.filter(Boolean)
|
|
1506
1732
|
.join('\n\n'),
|
|
1507
|
-
messages: messages
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
maxSteps: 5,
|
|
1733
|
+
messages: normalizeModelChatMessages(messages),
|
|
1734
|
+
// AI SDK v4 can send an empty assistant text block to Anthropic when a
|
|
1735
|
+
// tool-only response is followed by another model step. We only need the
|
|
1736
|
+
// tool result for inspector rendering, so skip that follow-up call.
|
|
1737
|
+
maxSteps: provider === 'anthropic' ? 1 : 5,
|
|
1512
1738
|
maxRetries: 0,
|
|
1513
1739
|
});
|
|
1514
1740
|
|
|
1515
1741
|
return {
|
|
1742
|
+
...(conversationId ? { conversationId } : {}),
|
|
1516
1743
|
text: result.text || '',
|
|
1517
1744
|
toolCalls: capturedToolCalls,
|
|
1518
1745
|
finishReason: result.finishReason,
|
|
@@ -2513,18 +2740,13 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
|
|
|
2513
2740
|
res.end(JSON.stringify({ error: `No ${provider} API key saved.` }));
|
|
2514
2741
|
return;
|
|
2515
2742
|
}
|
|
2516
|
-
const
|
|
2517
|
-
const safeMessages = messages
|
|
2518
|
-
.filter((message) => message?.role === 'user' || message?.role === 'assistant')
|
|
2519
|
-
.map((message) => ({
|
|
2520
|
-
role: message.role,
|
|
2521
|
-
content: String(message.content ?? '').slice(0, 20000),
|
|
2522
|
-
}));
|
|
2743
|
+
const safeMessages = normalizeModelChatMessages(parsed.messages);
|
|
2523
2744
|
if (safeMessages.length === 0) {
|
|
2524
2745
|
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
2525
2746
|
res.end(JSON.stringify({ error: 'Missing chat messages.' }));
|
|
2526
2747
|
return;
|
|
2527
2748
|
}
|
|
2749
|
+
const conversationId = normalizeModelConversationId(parsed.conversationId);
|
|
2528
2750
|
|
|
2529
2751
|
const result = await withModelChatClient((client) =>
|
|
2530
2752
|
runModelChat({
|
|
@@ -2533,7 +2755,9 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
|
|
|
2533
2755
|
modelId: parsed.modelId,
|
|
2534
2756
|
messages: safeMessages,
|
|
2535
2757
|
apiKey,
|
|
2758
|
+
host: parsed.host,
|
|
2536
2759
|
appContext: normalizeModelAppContext(parsed.appContext),
|
|
2760
|
+
conversationId,
|
|
2537
2761
|
})
|
|
2538
2762
|
);
|
|
2539
2763
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
@@ -2678,12 +2902,15 @@ export const _securityTestExports = {
|
|
|
2678
2902
|
formatModelVisibleToolResult,
|
|
2679
2903
|
formatSharedAppContextForModel,
|
|
2680
2904
|
normalizeApiKey,
|
|
2905
|
+
normalizeModelChatMessages,
|
|
2681
2906
|
normalizeModelAppContext,
|
|
2682
2907
|
normalizeModelId,
|
|
2908
|
+
normalizeModelProviderModelId,
|
|
2683
2909
|
quoteSecurityInteractiveArg,
|
|
2684
2910
|
readRequestBody,
|
|
2685
2911
|
resolveHttpRedirectsForMcp,
|
|
2686
2912
|
shouldAllowPrivateServerUrls,
|
|
2913
|
+
tryAnonymousOAuth,
|
|
2687
2914
|
};
|
|
2688
2915
|
|
|
2689
2916
|
/**
|
|
package/bin/commands/test-init.mjs
CHANGED
|
@@ -374,6 +374,8 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
374
374
|
*
|
|
375
375
|
* Each case sends a prompt to every configured model and checks
|
|
376
376
|
* that the model calls the expected tool with the expected arguments.
|
|
377
|
+
* Add appContext to test follow-up prompts that depend on state shared
|
|
378
|
+
* by the rendered MCP App through updateModelContext or useAppState.
|
|
377
379
|
* Cases run multiple times (configured via \`runs\` in eval.config.ts)
|
|
378
380
|
* to measure reliability across non-deterministic LLM responses.
|
|
379
381
|
*/
|
|
@@ -384,6 +386,8 @@ export default defineEval({
|
|
|
384
386
|
{
|
|
385
387
|
name: 'example (replace me)',
|
|
386
388
|
prompt: 'Show me a demo',
|
|
389
|
+
// Optional: seed model-visible App Context for follow-up prompts.
|
|
390
|
+
// appContext: { structuredContent: { selectedItem: { id: 'demo' } } },
|
|
387
391
|
// expect which tool gets called and (optionally) its arguments:
|
|
388
392
|
expect: {
|
|
389
393
|
tool: 'your-tool-name',
|
|
@@ -592,8 +596,10 @@ ${serverBlock}
|
|
|
592
596
|
{
|
|
593
597
|
compilerOptions: {
|
|
594
598
|
target: 'ES2022',
|
|
599
|
+
lib: ['ESNext', 'DOM'],
|
|
595
600
|
module: 'ESNext',
|
|
596
601
|
moduleResolution: 'bundler',
|
|
602
|
+
types: ['node'],
|
|
597
603
|
strict: true,
|
|
598
604
|
esModuleInterop: true,
|
|
599
605
|
},
|
|
package/bin/lib/eval/eval-runner.mjs
CHANGED
|
@@ -161,6 +161,42 @@ export async function discoverAndConvertTools(client) {
|
|
|
161
161
|
return tools;
|
|
162
162
|
}
|
|
163
163
|
|
|
164
|
+
const MODEL_VISIBLE_JSON_LIMIT_BYTES = 20000;
|
|
165
|
+
|
|
166
|
+
/**
|
|
167
|
+
* Normalize MCP App Context into the same shape hosts expose to the model.
|
|
168
|
+
* Empty context is treated as absent.
|
|
169
|
+
* @param {unknown} appContext
|
|
170
|
+
* @returns {{ content?: unknown[], structuredContent?: unknown } | undefined}
|
|
171
|
+
*/
|
|
172
|
+
export function normalizeEvalAppContext(appContext) {
|
|
173
|
+
if (!appContext || typeof appContext !== 'object') return undefined;
|
|
174
|
+
const normalized = {};
|
|
175
|
+
if (Array.isArray(appContext.content) && appContext.content.length > 0) {
|
|
176
|
+
normalized.content = appContext.content;
|
|
177
|
+
}
|
|
178
|
+
if (appContext.structuredContent !== undefined) {
|
|
179
|
+
normalized.structuredContent = appContext.structuredContent;
|
|
180
|
+
}
|
|
181
|
+
return Object.keys(normalized).length > 0 ? normalized : undefined;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/**
|
|
185
|
+
* Build the system prompt fragment that makes app context visible to the model.
|
|
186
|
+
* @param {unknown} appContext
|
|
187
|
+
* @returns {string | undefined}
|
|
188
|
+
*/
|
|
189
|
+
export function formatEvalAppContextForModel(appContext) {
|
|
190
|
+
const normalized = normalizeEvalAppContext(appContext);
|
|
191
|
+
if (!normalized) return undefined;
|
|
192
|
+
const json = JSON.stringify(normalized);
|
|
193
|
+
const visibleJson =
|
|
194
|
+
json.length <= MODEL_VISIBLE_JSON_LIMIT_BYTES
|
|
195
|
+
? json
|
|
196
|
+
: `${json.slice(0, MODEL_VISIBLE_JSON_LIMIT_BYTES)}...`;
|
|
197
|
+
return `Shared MCP App context from the currently rendered app, available for this turn:\n${visibleJson}`;
|
|
198
|
+
}
|
|
199
|
+
|
|
164
200
|
/**
|
|
165
201
|
* Run a single eval case once against a model.
|
|
166
202
|
* @param {object} params
|
|
@@ -170,15 +206,30 @@ export async function discoverAndConvertTools(client) {
|
|
|
170
206
|
* @param {number} params.maxSteps
|
|
171
207
|
* @param {number} params.temperature
|
|
172
208
|
* @param {number} params.timeout
|
|
209
|
+
* @param {{ content?: unknown[], structuredContent?: unknown }} [params.appContext]
|
|
173
210
|
* @returns {Promise<import('./eval-types.d.mts').EvalRunResult>}
|
|
174
211
|
*/
|
|
175
|
-
export async function runSingleEval({
|
|
212
|
+
export async function runSingleEval({
|
|
213
|
+
prompt,
|
|
214
|
+
model,
|
|
215
|
+
tools,
|
|
216
|
+
maxSteps,
|
|
217
|
+
temperature,
|
|
218
|
+
timeout,
|
|
219
|
+
appContext,
|
|
220
|
+
}) {
|
|
176
221
|
const { generateText } = await import('ai');
|
|
222
|
+
const system = formatEvalAppContextForModel(appContext);
|
|
223
|
+
const providerOptions = model?.provider?.startsWith('openai.')
|
|
224
|
+
? { openai: { strictJsonSchema: false } }
|
|
225
|
+
: undefined;
|
|
177
226
|
|
|
178
227
|
const result = await generateText({
|
|
179
228
|
model,
|
|
180
229
|
tools,
|
|
181
230
|
prompt,
|
|
231
|
+
...(system ? { system } : {}),
|
|
232
|
+
...(providerOptions ? { providerOptions } : {}),
|
|
182
233
|
maxSteps,
|
|
183
234
|
temperature,
|
|
184
235
|
maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
|
|
@@ -356,6 +407,7 @@ export async function runEvalCaseAggregate({
|
|
|
356
407
|
maxSteps: evalCase.maxSteps ?? maxSteps,
|
|
357
408
|
temperature,
|
|
358
409
|
timeout,
|
|
410
|
+
appContext: evalCase.appContext,
|
|
359
411
|
});
|
|
360
412
|
checkExpectations(result, evalCase);
|
|
361
413
|
passed++;
|