sunpeak 0.20.36 → 0.20.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -2
- package/bin/commands/inspect.mjs +225 -34
- package/bin/commands/test-init.mjs +4 -0
- package/bin/lib/eval/eval-runner.mjs +49 -1
- package/bin/lib/eval/eval-types.d.mts +27 -0
- package/bin/lib/eval/model-registry.mjs +6 -3
- package/dist/chatgpt/index.cjs +1 -1
- package/dist/chatgpt/index.js +1 -1
- package/dist/claude/index.cjs +1 -1
- package/dist/claude/index.js +1 -1
- package/dist/embed.css +1 -1
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/inspector/index.cjs +1 -1
- package/dist/inspector/index.js +1 -1
- package/dist/inspector/inspector.d.ts +7 -0
- package/dist/inspector/use-inspector-state.d.ts +28 -0
- package/dist/{inspector-CiuT_2yA.js → inspector-C6n8zap3.js} +172 -57
- package/dist/{inspector-CiuT_2yA.js.map → inspector-C6n8zap3.js.map} +1 -1
- package/dist/{inspector-BNWla95w.cjs → inspector-DOmiG64-.cjs} +172 -57
- package/dist/{inspector-BNWla95w.cjs.map → inspector-DOmiG64-.cjs.map} +1 -1
- package/dist/style.css +22 -0
- package/package.json +1 -1
- package/template/dist/albums/albums.html +1 -1
- package/template/dist/albums/albums.json +1 -1
- package/template/dist/carousel/carousel.html +1 -1
- package/template/dist/carousel/carousel.json +1 -1
- package/template/dist/map/map.html +1 -1
- package/template/dist/map/map.json +1 -1
- package/template/dist/review/review.html +1 -1
- package/template/dist/review/review.json +1 -1
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-linux.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-linux.png +0 -0
package/README.md
CHANGED
|
@@ -41,7 +41,9 @@ sunpeak replicates the ChatGPT and Claude runtimes locally so you can:
|
|
|
41
41
|
- Pin tool states with simulation fixtures so UI regressions can't ship.
|
|
42
42
|
- Automate the real-host loop with live tests: scripts that open your browser, prompt ChatGPT, and assert against the rendered app so you stop click-testing by hand.
|
|
43
43
|
|
|
44
|
-
sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions and
|
|
44
|
+
sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions, schemas, and model-visible App Context hold up on cheaper models, not just the flagship ones.
|
|
45
|
+
|
|
46
|
+
Eval cases can seed App Context with `appContext`, which lets you test follow-up prompts such as "Book this one" against state the app has shared through `updateModelContext`.
|
|
45
47
|
|
|
46
48
|
<div align="center">
|
|
47
49
|
<a href="https://sunpeak.ai/docs/testing/evals">
|
|
@@ -99,7 +101,7 @@ npx sunpeak test
|
|
|
99
101
|
|
|
100
102
|
Playwright fixtures handle inspector startup, MCP connection, iframe traversal, and host switching. Works with Python, Go, TypeScript, Rust, or any language.
|
|
101
103
|
|
|
102
|
-
Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions and
|
|
104
|
+
Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions, schemas, and model-visible App Context work reliably across smaller and cheaper models, not just the flagship ones.
|
|
103
105
|
|
|
104
106
|
```ts
|
|
105
107
|
import { test, expect } from 'sunpeak/test';
|
package/bin/commands/inspect.mjs
CHANGED
|
@@ -1363,9 +1363,43 @@ function toolRendersApp(tool) {
|
|
|
1363
1363
|
return !!(tool?._meta?.ui?.resourceUri ?? tool?._meta?.['ui/resourceUri']);
|
|
1364
1364
|
}
|
|
1365
1365
|
|
|
1366
|
-
function
|
|
1367
|
-
|
|
1366
|
+
function sanitizeAiSdkSchemaNode(schema) {
|
|
1367
|
+
if (Array.isArray(schema)) {
|
|
1368
|
+
return schema.map((item) => sanitizeAiSdkSchemaNode(item));
|
|
1369
|
+
}
|
|
1370
|
+
if (!schema || typeof schema !== 'object') return schema;
|
|
1371
|
+
|
|
1372
|
+
const clean = { ...schema };
|
|
1368
1373
|
delete clean.$schema;
|
|
1374
|
+
if (
|
|
1375
|
+
clean.properties &&
|
|
1376
|
+
typeof clean.properties === 'object' &&
|
|
1377
|
+
!Array.isArray(clean.properties)
|
|
1378
|
+
) {
|
|
1379
|
+
clean.properties = Object.fromEntries(
|
|
1380
|
+
Object.entries(clean.properties).map(([key, value]) => [
|
|
1381
|
+
key,
|
|
1382
|
+
sanitizeAiSdkSchemaNode(value),
|
|
1383
|
+
])
|
|
1384
|
+
);
|
|
1385
|
+
}
|
|
1386
|
+
if (clean.items !== undefined) {
|
|
1387
|
+
clean.items = sanitizeAiSdkSchemaNode(clean.items);
|
|
1388
|
+
}
|
|
1389
|
+
for (const key of ['anyOf', 'allOf', 'oneOf']) {
|
|
1390
|
+
if (Array.isArray(clean[key])) {
|
|
1391
|
+
clean[key] = clean[key].map((item) => sanitizeAiSdkSchemaNode(item));
|
|
1392
|
+
}
|
|
1393
|
+
}
|
|
1394
|
+
|
|
1395
|
+
const isObjectSchema = clean.type === 'object' || clean.properties != null;
|
|
1396
|
+
if (isObjectSchema) {
|
|
1397
|
+
if (!clean.type) clean.type = 'object';
|
|
1398
|
+
if (!clean.properties) clean.properties = {};
|
|
1399
|
+
clean.additionalProperties = false;
|
|
1400
|
+
return clean;
|
|
1401
|
+
}
|
|
1402
|
+
|
|
1369
1403
|
if (
|
|
1370
1404
|
clean.additionalProperties != null &&
|
|
1371
1405
|
typeof clean.additionalProperties === 'object' &&
|
|
@@ -1373,8 +1407,14 @@ function sanitizeAiSdkSchema(schema) {
|
|
|
1373
1407
|
) {
|
|
1374
1408
|
delete clean.additionalProperties;
|
|
1375
1409
|
}
|
|
1410
|
+
return clean;
|
|
1411
|
+
}
|
|
1412
|
+
|
|
1413
|
+
/**
 * Prepare an MCP tool input schema for use as an AI SDK tool schema.
 *
 * The schema is passed through sanitizeAiSdkSchemaNode and the top-level
 * result is then forced into a closed object schema: `type` defaults to
 * 'object', `properties` defaults to `{}`, and `additionalProperties` is
 * always set to false.
 *
 * Any input that is not a non-array object (missing, boolean, string, number,
 * array) is replaced by an empty object schema. Without this guard a truthy
 * primitive is handed back unchanged by the node sanitizer, and the property
 * assignments below throw a TypeError in strict-mode module code.
 *
 * @param {unknown} schema - Tool input schema as reported by the MCP server.
 * @returns {object} Sanitized top-level object schema.
 */
export function sanitizeAiSdkSchema(schema) {
  const isSchemaObject =
    schema !== null && typeof schema === 'object' && !Array.isArray(schema);
  const clean = sanitizeAiSdkSchemaNode(
    isSchemaObject ? schema : { type: 'object', properties: {} }
  );
  if (!clean.type) clean.type = 'object';
  if (!clean.properties) clean.properties = {};
  clean.additionalProperties = false;
  return clean;
}
|
|
1380
1420
|
|
|
@@ -1392,15 +1432,39 @@ function normalizeModelId(modelId) {
|
|
|
1392
1432
|
return trimmed;
|
|
1393
1433
|
}
|
|
1394
1434
|
|
|
1435
|
+
/**
 * Normalize a model ID and validate it against provider-specific rules.
 *
 * For the 'anthropic' provider, an ID of the form "claude-", then two or more
 * hyphen-separated numeric groups, then "-opus", "-sonnet" or "-haiku"
 * (case-insensitive, nothing after the family name) is rejected.
 *
 * @param {string} provider - Model provider key.
 * @param {unknown} modelId - Raw model ID; normalized via normalizeModelId.
 * @returns {string} The normalized model ID.
 * @throws {Error} When the Anthropic ID matches the rejected pattern.
 */
function normalizeModelProviderModelId(provider, modelId) {
  const candidate = normalizeModelId(modelId);

  if (provider !== 'anthropic') return candidate;

  const rejectedAnthropicShape = /^claude-\d+(?:-\d+)+-(opus|sonnet|haiku)$/i;
  if (!rejectedAnthropicShape.test(candidate)) return candidate;

  throw new Error(
    `Unsupported Anthropic model ID "${candidate}". Use an Anthropic API model ID such as "claude-sonnet-4-20250514".`
  );
}
|
|
1447
|
+
|
|
1448
|
+
/**
 * Validate and trim a client-supplied conversation ID.
 *
 * @param {unknown} conversationId - Raw value from the request.
 * @returns {string | undefined} The trimmed ID, or undefined when the value
 *   is not a string or is blank after trimming.
 * @throws {Error} When the trimmed ID is longer than 200 UTF-16 code units or
 *   contains a control character (U+0000–U+001F or U+007F).
 */
function normalizeModelConversationId(conversationId) {
  const id = typeof conversationId === 'string' ? conversationId.trim() : '';
  if (id === '') return undefined;

  const tooLong = id.length > 200;
  if (tooLong || /[\u0000-\u001f\u007f]/.test(id)) {
    throw new Error('Invalid model conversation ID.');
  }
  return id;
}
|
|
1457
|
+
|
|
1395
1458
|
async function createModelInstance(provider, modelId, apiKey) {
|
|
1396
1459
|
assertModelProvider(provider);
|
|
1397
|
-
const normalizedModelId =
|
|
1460
|
+
const normalizedModelId = normalizeModelProviderModelId(provider, modelId);
|
|
1398
1461
|
if (provider === 'openai') {
|
|
1399
1462
|
const { createOpenAI } = await import('@ai-sdk/openai');
|
|
1400
1463
|
const openai = createOpenAI({ apiKey });
|
|
1464
|
+
const settings = { structuredOutputs: false };
|
|
1401
1465
|
return typeof openai.chat === 'function'
|
|
1402
|
-
? openai.chat(normalizedModelId)
|
|
1403
|
-
: openai(normalizedModelId);
|
|
1466
|
+
? openai.chat(normalizedModelId, settings)
|
|
1467
|
+
: openai(normalizedModelId, settings);
|
|
1404
1468
|
}
|
|
1405
1469
|
const { createAnthropic } = await import('@ai-sdk/anthropic');
|
|
1406
1470
|
return createAnthropic({ apiKey })(normalizedModelId);
|
|
@@ -1414,6 +1478,24 @@ function formatJsonForModel(value) {
|
|
|
1414
1478
|
return `${json.slice(0, MODEL_VISIBLE_JSON_LIMIT_BYTES)}...`;
|
|
1415
1479
|
}
|
|
1416
1480
|
|
|
1481
|
+
/**
 * Map a requested chat host onto one of the supported host keys.
 *
 * @param {unknown} host - Requested host identifier.
 * @returns {'chatgpt' | 'claude' | 'generic'} The host itself when it is
 *   exactly 'chatgpt' or 'claude'; 'generic' for every other value.
 */
function normalizeModelChatHost(host) {
  switch (host) {
    case 'chatgpt':
    case 'claude':
      return host;
    default:
      return 'generic';
  }
}
|
|
1485
|
+
|
|
1486
|
+
/**
 * Report whether a value is a plain key/value record.
 *
 * A value qualifies when it is a non-null, non-array object whose prototype
 * is either Object.prototype (object literals, JSON.parse output) or null
 * (dictionaries built with Object.create(null)). Class instances, Dates,
 * Maps and other exotic objects do not qualify.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isPlainObject(value) {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return false;
  }
  const proto = Object.getPrototypeOf(value);
  // Null-prototype dictionaries are plain records too.
  return proto === Object.prototype || proto === null;
}
|
|
1494
|
+
|
|
1495
|
+
/**
 * Coerce tool-call arguments into an object that is safe to forward.
 *
 * @param {unknown} args - Arguments produced for a tool call.
 * @returns {object} `args` itself when isPlainObject accepts it, otherwise a
 *   fresh empty object.
 */
function normalizeToolArguments(args) {
  if (isPlainObject(args)) {
    return args;
  }
  return {};
}
|
|
1498
|
+
|
|
1417
1499
|
function normalizeModelAppContext(appContext) {
|
|
1418
1500
|
if (!appContext || typeof appContext !== 'object') return undefined;
|
|
1419
1501
|
const normalized = {};
|
|
@@ -1432,15 +1514,67 @@ function formatSharedAppContextForModel(appContext) {
|
|
|
1432
1514
|
return formatJsonForModel(normalized);
|
|
1433
1515
|
}
|
|
1434
1516
|
|
|
1435
|
-
function
|
|
1517
|
+
/**
 * Reduce client-supplied chat messages to the form sent to the model.
 *
 * Only entries whose role is 'user' or 'assistant' are kept. Content is
 * stringified (nullish content becomes ''), capped at 20000 UTF-16 code
 * units, trimmed, and entries that end up empty are dropped.
 *
 * The cap never splits a surrogate pair: when the cut would fall between the
 * two halves of an astral character, that character is dropped entirely so
 * the result does not end in a lone high surrogate.
 *
 * @param {unknown} messages - Raw `messages` value from the request.
 * @returns {{ role: 'user' | 'assistant', content: string }[]} Normalized
 *   messages; an empty array when `messages` is not an array.
 */
function normalizeModelChatMessages(messages) {
  if (!Array.isArray(messages)) return [];

  const maxContentLength = 20000;
  const normalized = [];

  for (const message of messages) {
    const role = message?.role;
    if (role !== 'user' && role !== 'assistant') continue;

    let content = String(message.content ?? '');
    if (content.length > maxContentLength) {
      const lastKeptUnit = content.charCodeAt(maxContentLength - 1);
      // A high surrogate in the last kept slot means its low half is being cut.
      const endsOnHighSurrogate = lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff;
      content = content.slice(
        0,
        endsOnHighSurrogate ? maxContentLength - 1 : maxContentLength
      );
    }
    content = content.trim();

    if (content.length > 0) normalized.push({ role, content });
  }

  return normalized;
}
|
|
1527
|
+
|
|
1528
|
+
/**
 * Extract human-readable error text from a failed MCP tool result.
 *
 * Resolution order:
 *   1. The text of every `{ type: 'text' }` content part, joined with
 *      newlines and trimmed.
 *   2. The JSON form of `structuredContent` (via formatJsonForModel) when it
 *      is present.
 *   3. A generic "<tool name> returned an error." message.
 *
 * A `content` value that is not an array is treated as having no parts
 * rather than causing a TypeError.
 *
 * @param {{ name?: string } | null | undefined} tool - Tool descriptor; only
 *   `name` is read, falling back to 'MCP tool'.
 * @param {{ content?: unknown, structuredContent?: unknown } | null | undefined} result
 * @returns {string}
 */
function getToolErrorText(tool, result) {
  const parts = Array.isArray(result?.content) ? result.content : [];
  const text = parts
    .filter((part) => part && typeof part === 'object' && part.type === 'text')
    .map((part) => String(part.text ?? ''))
    .join('\n')
    .trim();
  if (text) return text;

  if (result?.structuredContent !== undefined) {
    return formatJsonForModel({ structuredContent: result.structuredContent });
  }

  const toolName = tool?.name || 'MCP tool';
  return `${toolName} returned an error.`;
}
|
|
1541
|
+
|
|
1542
|
+
/**
 * Build the model-visible representation of a failed MCP tool call, shaped
 * per host.
 *
 * - 'chatgpt': an `mcp_call` item with `status: 'failed'`, the error text in
 *   `error`, and `output: null`.
 * - 'claude': an `mcp_tool_result` block with `is_error: true`.
 * - any other host: `{ isError: true, content: [...] }`.
 *
 * The identifier is `toolCallId` when it is a non-blank string, otherwise the
 * tool name (which itself falls back to 'MCP tool').
 *
 * @param {{ name?: string } | null | undefined} tool
 * @param {object | null | undefined} result - Failed tool result; the error
 *   text is derived from it with getToolErrorText.
 * @param {{ host?: unknown, arguments?: unknown, toolCallId?: unknown }} [options]
 * @returns {object}
 */
function formatModelVisibleToolError(tool, result, { host, arguments: args, toolCallId } = {}) {
  const toolName = tool?.name || 'MCP tool';
  const errorText = getToolErrorText(tool, result);
  const hasCallId = typeof toolCallId === 'string' && toolCallId.trim();
  const id = hasCallId ? toolCallId : toolName;
  const targetHost = normalizeModelChatHost(host);

  if (targetHost === 'chatgpt') {
    return {
      type: 'mcp_call',
      id,
      name: toolName,
      arguments: normalizeToolArguments(args),
      error: errorText,
      output: null,
      status: 'failed',
    };
  }

  if (targetHost === 'claude') {
    return {
      type: 'mcp_tool_result',
      tool_use_id: id,
      is_error: true,
      content: [{ type: 'text', text: errorText }],
    };
  }

  return {
    isError: true,
    content: [{ type: 'text', text: errorText }],
  };
}
|
|
1572
|
+
|
|
1573
|
+
function formatModelVisibleToolResult(tool, result, options = {}) {
|
|
1436
1574
|
const toolName = tool?.name || 'MCP tool';
|
|
1437
1575
|
if (result?.isError) {
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
.map((part) => part.text)
|
|
1441
|
-
.join('\n')
|
|
1442
|
-
.trim();
|
|
1443
|
-
return text || `${toolName} returned an error.`;
|
|
1576
|
+
if (options.host) return formatModelVisibleToolError(tool, result, options);
|
|
1577
|
+
return getToolErrorText(tool, result);
|
|
1444
1578
|
}
|
|
1445
1579
|
|
|
1446
1580
|
const visibleResult = {};
|
|
@@ -1457,17 +1591,63 @@ function formatModelVisibleToolResult(tool, result) {
|
|
|
1457
1591
|
: `${toolName} completed.`;
|
|
1458
1592
|
}
|
|
1459
1593
|
|
|
1460
|
-
|
|
1461
|
-
|
|
1594
|
+
/**
 * Turn a thrown value into a message string.
 *
 * - Error instances with a non-empty message yield that message.
 * - Other objects carrying a non-blank string `message` (for example a
 *   JSON-RPC style `{ code, message }` error object) yield that message
 *   instead of the uninformative "[object Object]".
 * - Anything else is stringified; falsy values become
 *   'Unknown MCP tool error'.
 *
 * @param {unknown} error - Value caught from a failed tool call.
 * @returns {string}
 */
function errorToMessage(error) {
  if (error instanceof Error && error.message) return error.message;
  if (
    error !== null &&
    typeof error === 'object' &&
    typeof error.message === 'string' &&
    error.message.trim()
  ) {
    return error.message;
  }
  return String(error || 'Unknown MCP tool error');
}
|
|
1598
|
+
|
|
1599
|
+
/**
 * Wrap a thrown value as an MCP-style error tool result.
 *
 * @param {unknown} error - Value caught while calling the tool.
 * @returns {{ content: { type: 'text', text: string }[], isError: true }}
 *   A result with a single text part holding errorToMessage(error).
 */
function createModelChatToolErrorResult(error) {
  const textPart = { type: 'text', text: errorToMessage(error) };
  return { content: [textPart], isError: true };
}
|
|
1468
1606
|
|
|
1469
|
-
async function
|
|
1607
|
+
/**
 * Call an MCP tool on behalf of the model without ever rejecting.
 *
 * Arguments are normalized first. A failure of `client.callTool` is caught
 * and converted into an error result via createModelChatToolErrorResult, so
 * the returned promise resolves in both the success and failure cases.
 *
 * @param {object} params
 * @param {{ callTool: Function }} params.client - MCP client.
 * @param {string} params.name - Tool name.
 * @param {unknown} params.arguments - Raw arguments supplied for the call.
 * @returns {Promise<{ arguments: object, result: unknown, source: 'mcp' }>}
 */
async function executeModelChatToolCall({ client, name, arguments: args }) {
  const safeArgs = normalizeToolArguments(args);

  let result;
  try {
    result = await client.callTool({ name, arguments: safeArgs });
  } catch (error) {
    result = createModelChatToolErrorResult(error);
  }

  return { arguments: safeArgs, result, source: 'mcp' };
}
|
|
1623
|
+
|
|
1624
|
+
/**
 * Return the system-prompt sentence that tells the model how failed MCP
 * calls are represented for the given host.
 *
 * @param {unknown} host - Requested host; normalized with
 *   normalizeModelChatHost before selection.
 * @returns {string} Host-specific instructions, or the generic wording for
 *   any host other than 'chatgpt' and 'claude'.
 */
function getModelChatHostInstructions(host) {
  const targetHost = normalizeModelChatHost(host);

  if (targetHost === 'chatgpt') {
    return 'ChatGPT surfaces failed MCP calls as mcp_call items with an error field. When a tool result object has type "mcp_call", status "failed", or a non-empty error field, treat it as a failed MCP call, not a successful result.';
  }

  if (targetHost === 'claude') {
    return 'Claude surfaces failed MCP calls as mcp_tool_result blocks with is_error=true. When a tool result object has type "mcp_tool_result" and is_error is true, treat it as a failed MCP call, not a successful result.';
  }

  return 'MCP tool failures are model-visible tool results. When a tool result has isError=true or describes a failed MCP call, treat it as a failed tool call, not a successful result.';
}
|
|
1634
|
+
|
|
1635
|
+
/**
 * Return the system-prompt guidance on how the model should react to a
 * failed MCP tool call (when to retry, and when not to).
 *
 * @returns {string} A single line of instructions; the same text on every call.
 */
function getModelChatRetryInstructions() {
  const sentences = [
    'After a failed MCP tool call, use the error text to decide the next step.',
    'Retry with corrected arguments for validation or business-logic errors.',
    'For transient service, timeout, or connectivity errors, you may retry once if the user request still needs the tool.',
    'Do not repeat the same failing tool call with the same arguments more than once.',
  ];
  return sentences.join(' ');
}
|
|
1638
|
+
|
|
1639
|
+
async function runModelChat({
|
|
1640
|
+
client,
|
|
1641
|
+
provider,
|
|
1642
|
+
modelId,
|
|
1643
|
+
messages,
|
|
1644
|
+
apiKey,
|
|
1645
|
+
appContext,
|
|
1646
|
+
host,
|
|
1647
|
+
conversationId,
|
|
1648
|
+
}) {
|
|
1470
1649
|
assertModelProvider(provider);
|
|
1650
|
+
const normalizedHost = normalizeModelChatHost(host);
|
|
1471
1651
|
const { generateText, tool: aiTool, jsonSchema } = await import('ai');
|
|
1472
1652
|
const model = await createModelInstance(provider, modelId, apiKey);
|
|
1473
1653
|
const { tools: mcpTools } = await client.listTools();
|
|
@@ -1479,14 +1659,23 @@ async function runModelChat({ client, provider, modelId, messages, apiKey, appCo
|
|
|
1479
1659
|
description: mcpTool.description || mcpTool.title || '',
|
|
1480
1660
|
inputSchema: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
|
|
1481
1661
|
parameters: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
|
|
1482
|
-
execute: async (args) => {
|
|
1662
|
+
execute: async (args, options) => {
|
|
1483
1663
|
const { arguments: safeArgs, result } = await executeModelChatToolCall({
|
|
1484
1664
|
client,
|
|
1485
1665
|
name: mcpTool.name,
|
|
1486
1666
|
arguments: args,
|
|
1487
1667
|
});
|
|
1488
|
-
capturedToolCalls.push({
|
|
1489
|
-
|
|
1668
|
+
capturedToolCalls.push({
|
|
1669
|
+
name: mcpTool.name,
|
|
1670
|
+
arguments: safeArgs,
|
|
1671
|
+
result,
|
|
1672
|
+
isError: !!result?.isError,
|
|
1673
|
+
});
|
|
1674
|
+
return formatModelVisibleToolResult(mcpTool, result, {
|
|
1675
|
+
host: normalizedHost,
|
|
1676
|
+
arguments: safeArgs,
|
|
1677
|
+
toolCallId: options?.toolCallId,
|
|
1678
|
+
});
|
|
1490
1679
|
},
|
|
1491
1680
|
});
|
|
1492
1681
|
}
|
|
@@ -1497,22 +1686,25 @@ async function runModelChat({ client, provider, modelId, messages, apiKey, appCo
|
|
|
1497
1686
|
model,
|
|
1498
1687
|
tools,
|
|
1499
1688
|
system: [
|
|
1500
|
-
'You are chatting inside the
|
|
1689
|
+
'You are chatting inside the sunpeak Inspector. When you call an MCP tool that renders an app, the host will render the app below your message. Do not repeat raw tool output, JSON, image URLs, markdown image lists, or full item inventories. Keep any narration brief and let the app carry the visual result.',
|
|
1690
|
+
getModelChatHostInstructions(normalizedHost),
|
|
1691
|
+
getModelChatRetryInstructions(),
|
|
1501
1692
|
sharedAppContext
|
|
1502
1693
|
? `Shared MCP App context from the currently rendered app, available for this turn:\n${sharedAppContext}`
|
|
1503
1694
|
: '',
|
|
1504
1695
|
]
|
|
1505
1696
|
.filter(Boolean)
|
|
1506
1697
|
.join('\n\n'),
|
|
1507
|
-
messages: messages
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
maxSteps: 5,
|
|
1698
|
+
messages: normalizeModelChatMessages(messages),
|
|
1699
|
+
// AI SDK v4 can send an empty assistant text block to Anthropic when a
|
|
1700
|
+
// tool-only response is followed by another model step. We only need the
|
|
1701
|
+
// tool result for inspector rendering, so skip that follow-up call.
|
|
1702
|
+
maxSteps: provider === 'anthropic' ? 1 : 5,
|
|
1512
1703
|
maxRetries: 0,
|
|
1513
1704
|
});
|
|
1514
1705
|
|
|
1515
1706
|
return {
|
|
1707
|
+
...(conversationId ? { conversationId } : {}),
|
|
1516
1708
|
text: result.text || '',
|
|
1517
1709
|
toolCalls: capturedToolCalls,
|
|
1518
1710
|
finishReason: result.finishReason,
|
|
@@ -2513,18 +2705,13 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
|
|
|
2513
2705
|
res.end(JSON.stringify({ error: `No ${provider} API key saved.` }));
|
|
2514
2706
|
return;
|
|
2515
2707
|
}
|
|
2516
|
-
const
|
|
2517
|
-
const safeMessages = messages
|
|
2518
|
-
.filter((message) => message?.role === 'user' || message?.role === 'assistant')
|
|
2519
|
-
.map((message) => ({
|
|
2520
|
-
role: message.role,
|
|
2521
|
-
content: String(message.content ?? '').slice(0, 20000),
|
|
2522
|
-
}));
|
|
2708
|
+
const safeMessages = normalizeModelChatMessages(parsed.messages);
|
|
2523
2709
|
if (safeMessages.length === 0) {
|
|
2524
2710
|
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
2525
2711
|
res.end(JSON.stringify({ error: 'Missing chat messages.' }));
|
|
2526
2712
|
return;
|
|
2527
2713
|
}
|
|
2714
|
+
const conversationId = normalizeModelConversationId(parsed.conversationId);
|
|
2528
2715
|
|
|
2529
2716
|
const result = await withModelChatClient((client) =>
|
|
2530
2717
|
runModelChat({
|
|
@@ -2533,7 +2720,9 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
|
|
|
2533
2720
|
modelId: parsed.modelId,
|
|
2534
2721
|
messages: safeMessages,
|
|
2535
2722
|
apiKey,
|
|
2723
|
+
host: parsed.host,
|
|
2536
2724
|
appContext: normalizeModelAppContext(parsed.appContext),
|
|
2725
|
+
conversationId,
|
|
2537
2726
|
})
|
|
2538
2727
|
);
|
|
2539
2728
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
@@ -2678,8 +2867,10 @@ export const _securityTestExports = {
|
|
|
2678
2867
|
formatModelVisibleToolResult,
|
|
2679
2868
|
formatSharedAppContextForModel,
|
|
2680
2869
|
normalizeApiKey,
|
|
2870
|
+
normalizeModelChatMessages,
|
|
2681
2871
|
normalizeModelAppContext,
|
|
2682
2872
|
normalizeModelId,
|
|
2873
|
+
normalizeModelProviderModelId,
|
|
2683
2874
|
quoteSecurityInteractiveArg,
|
|
2684
2875
|
readRequestBody,
|
|
2685
2876
|
resolveHttpRedirectsForMcp,
|
|
@@ -374,6 +374,8 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
374
374
|
*
|
|
375
375
|
* Each case sends a prompt to every configured model and checks
|
|
376
376
|
* that the model calls the expected tool with the expected arguments.
|
|
377
|
+
* Add appContext to test follow-up prompts that depend on state shared
|
|
378
|
+
* by the rendered MCP App through updateModelContext or useAppState.
|
|
377
379
|
* Cases run multiple times (configured via \`runs\` in eval.config.ts)
|
|
378
380
|
* to measure reliability across non-deterministic LLM responses.
|
|
379
381
|
*/
|
|
@@ -384,6 +386,8 @@ export default defineEval({
|
|
|
384
386
|
{
|
|
385
387
|
name: 'example (replace me)',
|
|
386
388
|
prompt: 'Show me a demo',
|
|
389
|
+
// Optional: seed model-visible App Context for follow-up prompts.
|
|
390
|
+
// appContext: { structuredContent: { selectedItem: { id: 'demo' } } },
|
|
387
391
|
// expect which tool gets called and (optionally) its arguments:
|
|
388
392
|
expect: {
|
|
389
393
|
tool: 'your-tool-name',
|
|
@@ -161,6 +161,42 @@ export async function discoverAndConvertTools(client) {
|
|
|
161
161
|
return tools;
|
|
162
162
|
}
|
|
163
163
|
|
|
164
|
+
const MODEL_VISIBLE_JSON_LIMIT_BYTES = 20000;
|
|
165
|
+
|
|
166
|
+
/**
 * Reduce an eval case's App Context to the fields exposed to the model.
 *
 * Only `content` (when it is a non-empty array) and `structuredContent`
 * (when it is not undefined) are carried over; every other key is dropped.
 * A context with neither field is reported as absent.
 *
 * @param {unknown} appContext
 * @returns {{ content?: unknown[], structuredContent?: unknown } | undefined}
 */
export function normalizeEvalAppContext(appContext) {
  if (!appContext || typeof appContext !== 'object') return undefined;

  const { content, structuredContent } = appContext;
  const hasContent = Array.isArray(content) && content.length > 0;
  const hasStructuredContent = structuredContent !== undefined;
  if (!hasContent && !hasStructuredContent) return undefined;

  return {
    ...(hasContent ? { content } : {}),
    ...(hasStructuredContent ? { structuredContent } : {}),
  };
}
|
|
183
|
+
|
|
184
|
+
/**
 * Render App Context as the system-prompt fragment shown to the model.
 *
 * The context is normalized with normalizeEvalAppContext and serialized as
 * JSON. When the JSON string is longer than MODEL_VISIBLE_JSON_LIMIT_BYTES
 * (compared against the string's `length`), it is cut to that many
 * characters and "..." is appended.
 *
 * @param {unknown} appContext
 * @returns {string | undefined} The prompt fragment, or undefined when there
 *   is no usable context.
 */
export function formatEvalAppContextForModel(appContext) {
  const context = normalizeEvalAppContext(appContext);
  if (!context) return undefined;

  let serialized = JSON.stringify(context);
  if (serialized.length > MODEL_VISIBLE_JSON_LIMIT_BYTES) {
    serialized = `${serialized.slice(0, MODEL_VISIBLE_JSON_LIMIT_BYTES)}...`;
  }

  return `Shared MCP App context from the currently rendered app, available for this turn:\n${serialized}`;
}
|
|
199
|
+
|
|
164
200
|
/**
|
|
165
201
|
* Run a single eval case once against a model.
|
|
166
202
|
* @param {object} params
|
|
@@ -170,15 +206,26 @@ export async function discoverAndConvertTools(client) {
|
|
|
170
206
|
* @param {number} params.maxSteps
|
|
171
207
|
* @param {number} params.temperature
|
|
172
208
|
* @param {number} params.timeout
|
|
209
|
+
* @param {{ content?: unknown[], structuredContent?: unknown }} [params.appContext]
|
|
173
210
|
* @returns {Promise<import('./eval-types.d.mts').EvalRunResult>}
|
|
174
211
|
*/
|
|
175
|
-
export async function runSingleEval({
|
|
212
|
+
export async function runSingleEval({
|
|
213
|
+
prompt,
|
|
214
|
+
model,
|
|
215
|
+
tools,
|
|
216
|
+
maxSteps,
|
|
217
|
+
temperature,
|
|
218
|
+
timeout,
|
|
219
|
+
appContext,
|
|
220
|
+
}) {
|
|
176
221
|
const { generateText } = await import('ai');
|
|
222
|
+
const system = formatEvalAppContextForModel(appContext);
|
|
177
223
|
|
|
178
224
|
const result = await generateText({
|
|
179
225
|
model,
|
|
180
226
|
tools,
|
|
181
227
|
prompt,
|
|
228
|
+
...(system ? { system } : {}),
|
|
182
229
|
maxSteps,
|
|
183
230
|
temperature,
|
|
184
231
|
maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
|
|
@@ -356,6 +403,7 @@ export async function runEvalCaseAggregate({
|
|
|
356
403
|
maxSteps: evalCase.maxSteps ?? maxSteps,
|
|
357
404
|
temperature,
|
|
358
405
|
timeout,
|
|
406
|
+
appContext: evalCase.appContext,
|
|
359
407
|
});
|
|
360
408
|
checkExpectations(result, evalCase);
|
|
361
409
|
passed++;
|
|
@@ -37,6 +37,16 @@ export interface ToolExpectation {
|
|
|
37
37
|
args?: Record<string, unknown>;
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
+
/**
|
|
41
|
+
* MCP App Context made visible to the model before the eval prompt.
|
|
42
|
+
*/
|
|
43
|
+
export interface EvalAppContext {
|
|
44
|
+
/** Text or multimodal content blocks shared from the app. */
|
|
45
|
+
content?: unknown[];
|
|
46
|
+
/** Structured app state shared from the app. */
|
|
47
|
+
structuredContent?: unknown;
|
|
48
|
+
}
|
|
49
|
+
|
|
40
50
|
/**
|
|
41
51
|
* A single eval test case.
|
|
42
52
|
*/
|
|
@@ -45,6 +55,8 @@ export interface EvalCase {
|
|
|
45
55
|
name: string;
|
|
46
56
|
/** The prompt to send to the model. */
|
|
47
57
|
prompt: string;
|
|
58
|
+
/** MCP App Context to expose to the model for this prompt. */
|
|
59
|
+
appContext?: EvalAppContext;
|
|
48
60
|
/** Maximum tool call steps (default: from config or 1). */
|
|
49
61
|
maxSteps?: number;
|
|
50
62
|
/** Expected tool call (single). */
|
|
@@ -142,6 +154,20 @@ export declare function discoverAndConvertTools(
|
|
|
142
154
|
client: unknown,
|
|
143
155
|
): Promise<Record<string, unknown>>;
|
|
144
156
|
|
|
157
|
+
/**
|
|
158
|
+
* Normalize MCP App Context into the model-visible context shape.
|
|
159
|
+
*/
|
|
160
|
+
export declare function normalizeEvalAppContext(
|
|
161
|
+
appContext: unknown,
|
|
162
|
+
): EvalAppContext | undefined;
|
|
163
|
+
|
|
164
|
+
/**
|
|
165
|
+
* Format MCP App Context as a system prompt fragment for the model.
|
|
166
|
+
*/
|
|
167
|
+
export declare function formatEvalAppContextForModel(
|
|
168
|
+
appContext: unknown,
|
|
169
|
+
): string | undefined;
|
|
170
|
+
|
|
145
171
|
/**
|
|
146
172
|
* Run a single eval case against a model, returning the normalized result.
|
|
147
173
|
*/
|
|
@@ -152,6 +178,7 @@ export declare function runSingleEval(params: {
|
|
|
152
178
|
maxSteps: number;
|
|
153
179
|
temperature: number;
|
|
154
180
|
timeout: number;
|
|
181
|
+
appContext?: EvalAppContext;
|
|
155
182
|
}): Promise<EvalRunResult>;
|
|
156
183
|
|
|
157
184
|
/**
|
|
@@ -47,9 +47,12 @@ export async function resolveModel(modelId) {
|
|
|
47
47
|
// @ai-sdk/openai v3 defaults to the Responses API, which requires strict
|
|
48
48
|
// JSON Schema (additionalProperties: false at every level, all properties
|
|
49
49
|
// required) — incompatible with arbitrary MCP server schemas. Use .chat()
|
|
50
|
-
// (Chat Completions API) when available
|
|
51
|
-
//
|
|
52
|
-
|
|
50
|
+
// (Chat Completions API) when available and disable structured outputs,
|
|
51
|
+
// because reasoning models also enable strict function schemas by default.
|
|
52
|
+
const settings = { structuredOutputs: false };
|
|
53
|
+
return typeof openai.chat === 'function'
|
|
54
|
+
? openai.chat(modelId, settings)
|
|
55
|
+
: openai(modelId, settings);
|
|
53
56
|
}
|
|
54
57
|
if (pkg === '@ai-sdk/anthropic') {
|
|
55
58
|
const { anthropic } = provider;
|
package/dist/chatgpt/index.cjs
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
|
|
2
2
|
const require_chunk = require("../chunk-Cek0wNdY.cjs");
|
|
3
|
-
const require_inspector = require("../inspector-
|
|
3
|
+
const require_inspector = require("../inspector-DOmiG64-.cjs");
|
|
4
4
|
const require_inspector_url = require("../inspector-url-BxScdDag.cjs");
|
|
5
5
|
const require_discovery = require("../discovery-31_n0zcu.cjs");
|
|
6
6
|
//#region src/chatgpt/index.ts
|
package/dist/chatgpt/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Ct as __exportAll } from "../protocol-bhrz2H_E.js";
|
|
2
|
-
import { _ as extractResourceCSP, f as ThemeProvider, g as IframeResource, p as useThemeContext, r as resolveServerToolResult, t as Inspector, v as McpAppHost, y as SCREEN_WIDTHS } from "../inspector-
|
|
2
|
+
import { _ as extractResourceCSP, f as ThemeProvider, g as IframeResource, p as useThemeContext, r as resolveServerToolResult, t as Inspector, v as McpAppHost, y as SCREEN_WIDTHS } from "../inspector-C6n8zap3.js";
|
|
3
3
|
import { t as createInspectorUrl } from "../inspector-url-xUMGbWis.js";
|
|
4
4
|
import { c as toPascalCase, i as findResourceKey, n as extractSimulationKey, r as findResourceDirs, s as getComponentName, t as extractResourceKey } from "../discovery-DOVner--.js";
|
|
5
5
|
//#region src/chatgpt/index.ts
|
package/dist/claude/index.cjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
|
|
2
2
|
require("../chunk-Cek0wNdY.cjs");
|
|
3
|
-
const require_inspector = require("../inspector-
|
|
3
|
+
const require_inspector = require("../inspector-DOmiG64-.cjs");
|
|
4
4
|
exports.Inspector = require_inspector.Inspector;
|
package/dist/claude/index.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { t as Inspector } from "../inspector-
|
|
1
|
+
import { t as Inspector } from "../inspector-C6n8zap3.js";
|
|
2
2
|
export { Inspector };
|