sunpeak 0.20.35 → 0.20.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -2
- package/bin/commands/inspect.mjs +241 -48
- package/bin/commands/test-init.mjs +61 -44
- package/bin/lib/eval/eval-runner.mjs +49 -1
- package/bin/lib/eval/eval-types.d.mts +27 -0
- package/bin/lib/eval/model-registry.mjs +6 -3
- package/bin/lib/test/test-fixtures.d.mts +9 -1
- package/bin/lib/test/test-fixtures.mjs +25 -7
- package/dist/chatgpt/index.cjs +1 -1
- package/dist/chatgpt/index.js +1 -1
- package/dist/claude/index.cjs +1 -1
- package/dist/claude/index.js +1 -1
- package/dist/embed.css +1 -1
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/inspector/app-types.d.ts +6 -4
- package/dist/inspector/index.cjs +1 -1
- package/dist/inspector/index.js +1 -1
- package/dist/inspector/inspector.d.ts +12 -4
- package/dist/inspector/simple-sidebar.d.ts +20 -6
- package/dist/inspector/use-inspector-state.d.ts +32 -0
- package/dist/{inspector-D0TWNx_T.js → inspector-C6n8zap3.js} +432 -171
- package/dist/{inspector-D0TWNx_T.js.map → inspector-C6n8zap3.js.map} +1 -1
- package/dist/{inspector-DQ_vv1wj.cjs → inspector-DOmiG64-.cjs} +432 -171
- package/dist/{inspector-DQ_vv1wj.cjs.map → inspector-DOmiG64-.cjs.map} +1 -1
- package/dist/style.css +18 -14
- package/package.json +1 -1
- package/template/dist/albums/albums.html +1 -1
- package/template/dist/albums/albums.json +1 -1
- package/template/dist/carousel/carousel.html +1 -1
- package/template/dist/carousel/carousel.json +1 -1
- package/template/dist/map/map.html +1 -1
- package/template/dist/map/map.json +1 -1
- package/template/dist/review/review.html +1 -1
- package/template/dist/review/review.json +1 -1
- package/template/tests/e2e/visual.spec.ts +0 -8
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-linux.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-linux.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-chatgpt-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-chatgpt-linux.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-claude-darwin.png +0 -0
- package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-claude-linux.png +0 -0
package/README.md
CHANGED
|
@@ -41,7 +41,9 @@ sunpeak replicates the ChatGPT and Claude runtimes locally so you can:
|
|
|
41
41
|
- Pin tool states with simulation fixtures so UI regressions can't ship.
|
|
42
42
|
- Automate the real-host loop with live tests: scripts that open your browser, prompt ChatGPT, and assert against the rendered app so you stop click-testing by hand.
|
|
43
43
|
|
|
44
|
-
sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions and
|
|
44
|
+
sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions, schemas, and model-visible App Context hold up on cheaper models, not just the flagship ones.
|
|
45
|
+
|
|
46
|
+
Eval cases can seed App Context with `appContext`, which lets you test follow-up prompts such as "Book this one" against state the app has shared through `updateModelContext`.
|
|
45
47
|
|
|
46
48
|
<div align="center">
|
|
47
49
|
<a href="https://sunpeak.ai/docs/testing/evals">
|
|
@@ -99,7 +101,7 @@ npx sunpeak test
|
|
|
99
101
|
|
|
100
102
|
Playwright fixtures handle inspector startup, MCP connection, iframe traversal, and host switching. Works with Python, Go, TypeScript, Rust, or any language.
|
|
101
103
|
|
|
102
|
-
Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions and
|
|
104
|
+
Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions, schemas, and model-visible App Context work reliably across smaller and cheaper models, not just the flagship ones.
|
|
103
105
|
|
|
104
106
|
```ts
|
|
105
107
|
import { test, expect } from 'sunpeak/test';
|
package/bin/commands/inspect.mjs
CHANGED
|
@@ -1349,19 +1349,57 @@ async function getApiKeyStatus(provider) {
|
|
|
1349
1349
|
};
|
|
1350
1350
|
}
|
|
1351
1351
|
|
|
1352
|
-
function toolResourceUri(tool) {
|
|
1353
|
-
return tool?._meta?.ui?.resourceUri ?? tool?._meta?.['ui/resourceUri'];
|
|
1354
|
-
}
|
|
1355
|
-
|
|
1356
1352
|
/**
 * Reports whether a tool may be offered to the model.
 *
 * Visibility is read from `_meta.ui.visibility`, falling back to the flat
 * `_meta['ui/visibility']` key. A tool that declares no visibility at all is
 * treated as visible; otherwise the declaration must be an array that
 * contains the string 'model'.
 *
 * @param {object} [tool] - Tool descriptor (may be nullish).
 * @returns {boolean} True when the model is allowed to see the tool.
 */
function isToolVisibleToModel(tool) {
  const meta = tool?._meta;
  const declared = meta?.ui?.visibility ?? meta?.['ui/visibility'];
  if (declared === null || declared === undefined) {
    return true;
  }
  if (!Array.isArray(declared)) {
    return false;
  }
  return declared.includes('model');
}
|
|
1361
1357
|
|
|
1362
|
-
function
|
|
1363
|
-
|
|
1358
|
+
/**
 * Selects the tools the model is allowed to call.
 *
 * @param {object[]} tools - Tool descriptors to screen.
 * @returns {object[]} The subset for which isToolVisibleToModel() is true.
 */
function getModelCallableTools(tools) {
  const keepVisible = (candidate) => isToolVisibleToModel(candidate);
  return tools.filter(keepVisible);
}
|
|
1361
|
+
|
|
1362
|
+
/**
 * Reports whether a tool declares a UI resource.
 *
 * The resource URI is read from `_meta.ui.resourceUri`, falling back to the
 * flat `_meta['ui/resourceUri']` key.
 *
 * @param {object} [tool] - Tool descriptor (may be nullish).
 * @returns {boolean} True when a truthy resource URI is present.
 */
function toolRendersApp(tool) {
  const meta = tool?._meta;
  const resourceUri = meta?.ui?.resourceUri ?? meta?.['ui/resourceUri'];
  return Boolean(resourceUri);
}
|
|
1365
|
+
|
|
1366
|
+
function sanitizeAiSdkSchemaNode(schema) {
|
|
1367
|
+
if (Array.isArray(schema)) {
|
|
1368
|
+
return schema.map((item) => sanitizeAiSdkSchemaNode(item));
|
|
1369
|
+
}
|
|
1370
|
+
if (!schema || typeof schema !== 'object') return schema;
|
|
1371
|
+
|
|
1372
|
+
const clean = { ...schema };
|
|
1364
1373
|
delete clean.$schema;
|
|
1374
|
+
if (
|
|
1375
|
+
clean.properties &&
|
|
1376
|
+
typeof clean.properties === 'object' &&
|
|
1377
|
+
!Array.isArray(clean.properties)
|
|
1378
|
+
) {
|
|
1379
|
+
clean.properties = Object.fromEntries(
|
|
1380
|
+
Object.entries(clean.properties).map(([key, value]) => [
|
|
1381
|
+
key,
|
|
1382
|
+
sanitizeAiSdkSchemaNode(value),
|
|
1383
|
+
])
|
|
1384
|
+
);
|
|
1385
|
+
}
|
|
1386
|
+
if (clean.items !== undefined) {
|
|
1387
|
+
clean.items = sanitizeAiSdkSchemaNode(clean.items);
|
|
1388
|
+
}
|
|
1389
|
+
for (const key of ['anyOf', 'allOf', 'oneOf']) {
|
|
1390
|
+
if (Array.isArray(clean[key])) {
|
|
1391
|
+
clean[key] = clean[key].map((item) => sanitizeAiSdkSchemaNode(item));
|
|
1392
|
+
}
|
|
1393
|
+
}
|
|
1394
|
+
|
|
1395
|
+
const isObjectSchema = clean.type === 'object' || clean.properties != null;
|
|
1396
|
+
if (isObjectSchema) {
|
|
1397
|
+
if (!clean.type) clean.type = 'object';
|
|
1398
|
+
if (!clean.properties) clean.properties = {};
|
|
1399
|
+
clean.additionalProperties = false;
|
|
1400
|
+
return clean;
|
|
1401
|
+
}
|
|
1402
|
+
|
|
1365
1403
|
if (
|
|
1366
1404
|
clean.additionalProperties != null &&
|
|
1367
1405
|
typeof clean.additionalProperties === 'object' &&
|
|
@@ -1369,8 +1407,14 @@ function sanitizeAiSdkSchema(schema) {
|
|
|
1369
1407
|
) {
|
|
1370
1408
|
delete clean.additionalProperties;
|
|
1371
1409
|
}
|
|
1410
|
+
return clean;
|
|
1411
|
+
}
|
|
1412
|
+
|
|
1413
|
+
/**
 * Produces a sanitized copy of a tool input schema whose root is always an
 * object schema with a `properties` map and `additionalProperties: false`.
 *
 * Anything that is not a non-array object (missing, primitive, or array) is
 * replaced by an empty object schema. Previously only falsy values were
 * replaced, so a truthy primitive caused a property assignment on a primitive
 * (a TypeError in module code) and an array was returned with stray
 * properties attached.
 *
 * @param {object} [schema] - JSON Schema for a tool's input.
 * @returns {object} Sanitized schema; the input object is not modified here.
 */
export function sanitizeAiSdkSchema(schema) {
  const isSchemaObject =
    schema !== null && typeof schema === 'object' && !Array.isArray(schema);
  const clean = sanitizeAiSdkSchemaNode(
    isSchemaObject ? schema : { type: 'object', properties: {} }
  );
  if (!clean.type) clean.type = 'object';
  if (!clean.properties) clean.properties = {};
  clean.additionalProperties = false;
  return clean;
}
|
|
1376
1420
|
|
|
@@ -1388,15 +1432,39 @@ function normalizeModelId(modelId) {
|
|
|
1388
1432
|
return trimmed;
|
|
1389
1433
|
}
|
|
1390
1434
|
|
|
1435
|
+
/**
 * Normalizes a model ID and rejects one Anthropic ID shape.
 *
 * For the 'anthropic' provider, IDs of the form
 * `claude-<digits>(-<digits>)+-<opus|sonnet|haiku>` (case-insensitive) are
 * refused with an explanatory error. Every other ID is returned as
 * normalizeModelId() produced it.
 *
 * @param {string} provider - Provider key.
 * @param {string} modelId - Raw model ID.
 * @returns {string} The normalized model ID.
 * @throws {Error} When the provider is 'anthropic' and the ID has the rejected shape.
 */
function normalizeModelProviderModelId(provider, modelId) {
  const normalizedModelId = normalizeModelId(modelId);
  const rejectedAnthropicShape = /^claude-\d+(?:-\d+)+-(opus|sonnet|haiku)$/i;

  if (provider !== 'anthropic' || !rejectedAnthropicShape.test(normalizedModelId)) {
    return normalizedModelId;
  }

  throw new Error(
    `Unsupported Anthropic model ID "${normalizedModelId}". Use an Anthropic API model ID such as "claude-sonnet-4-20250514".`
  );
}
|
|
1447
|
+
|
|
1448
|
+
/**
 * Validates and trims a conversation ID.
 *
 * @param {unknown} conversationId - Candidate ID.
 * @returns {string|undefined} The trimmed ID, or undefined when the input is
 *   not a string or is blank after trimming.
 * @throws {Error} When the trimmed ID is longer than 200 characters or
 *   contains a control character (U+0000–U+001F or U+007F).
 */
function normalizeModelConversationId(conversationId) {
  const candidate = typeof conversationId === 'string' ? conversationId.trim() : '';
  if (candidate === '') {
    return undefined;
  }

  const hasControlCharacter = /[\u0000-\u001f\u007f]/.test(candidate);
  if (hasControlCharacter || candidate.length > 200) {
    throw new Error('Invalid model conversation ID.');
  }
  return candidate;
}
|
|
1457
|
+
|
|
1391
1458
|
async function createModelInstance(provider, modelId, apiKey) {
|
|
1392
1459
|
assertModelProvider(provider);
|
|
1393
|
-
const normalizedModelId =
|
|
1460
|
+
const normalizedModelId = normalizeModelProviderModelId(provider, modelId);
|
|
1394
1461
|
if (provider === 'openai') {
|
|
1395
1462
|
const { createOpenAI } = await import('@ai-sdk/openai');
|
|
1396
1463
|
const openai = createOpenAI({ apiKey });
|
|
1464
|
+
const settings = { structuredOutputs: false };
|
|
1397
1465
|
return typeof openai.chat === 'function'
|
|
1398
|
-
? openai.chat(normalizedModelId)
|
|
1399
|
-
: openai(normalizedModelId);
|
|
1466
|
+
? openai.chat(normalizedModelId, settings)
|
|
1467
|
+
: openai(normalizedModelId, settings);
|
|
1400
1468
|
}
|
|
1401
1469
|
const { createAnthropic } = await import('@ai-sdk/anthropic');
|
|
1402
1470
|
return createAnthropic({ apiKey })(normalizedModelId);
|
|
@@ -1410,6 +1478,24 @@ function formatJsonForModel(value) {
|
|
|
1410
1478
|
return `${json.slice(0, MODEL_VISIBLE_JSON_LIMIT_BYTES)}...`;
|
|
1411
1479
|
}
|
|
1412
1480
|
|
|
1481
|
+
/**
 * Maps a host value onto one of the three supported host kinds.
 *
 * @param {unknown} host - Requested host.
 * @returns {'chatgpt'|'claude'|'generic'} The host itself when it is exactly
 *   'chatgpt' or 'claude'; 'generic' for anything else.
 */
function normalizeModelChatHost(host) {
  switch (host) {
    case 'chatgpt':
    case 'claude':
      return host;
    default:
      return 'generic';
  }
}
|
|
1485
|
+
|
|
1486
|
+
/**
 * Tests whether a value is an object whose prototype is exactly
 * Object.prototype.
 *
 * Arrays, null, primitives, class instances and null-prototype objects all
 * yield false.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True only for plain `{}`-style objects.
 */
function isPlainObject(value) {
  if (value === null || typeof value !== 'object') {
    return false;
  }
  if (Array.isArray(value)) {
    return false;
  }
  return Object.getPrototypeOf(value) === Object.prototype;
}
|
|
1494
|
+
|
|
1495
|
+
/**
 * Coerces tool-call arguments to a plain object.
 *
 * @param {unknown} args - Arguments supplied for a tool call.
 * @returns {object} `args` itself when isPlainObject() accepts it, otherwise a
 *   new empty object.
 */
function normalizeToolArguments(args) {
  if (isPlainObject(args)) {
    return args;
  }
  return {};
}
|
|
1498
|
+
|
|
1413
1499
|
function normalizeModelAppContext(appContext) {
|
|
1414
1500
|
if (!appContext || typeof appContext !== 'object') return undefined;
|
|
1415
1501
|
const normalized = {};
|
|
@@ -1428,65 +1514,168 @@ function formatSharedAppContextForModel(appContext) {
|
|
|
1428
1514
|
return formatJsonForModel(normalized);
|
|
1429
1515
|
}
|
|
1430
1516
|
|
|
1431
|
-
function
|
|
1517
|
+
/**
 * Extracts the textual part of a chat message's `content`.
 *
 * Strings pass through. Arrays are read as a list of parts: string entries
 * and `{ type: 'text', text: string }` entries are joined with newlines and
 * everything else is ignored. Other primitives are stringified. Any other
 * object contributes no text, rather than the "[object Object]" that plain
 * String() coercion would produce.
 *
 * @param {unknown} content - Raw message content.
 * @returns {string} Text for the message (possibly empty).
 */
function modelChatMessageText(content) {
  if (typeof content === 'string') return content;
  if (content === null || content === undefined) return '';
  if (Array.isArray(content)) {
    return content
      .map((part) => {
        if (typeof part === 'string') return part;
        const isTextPart =
          part !== null &&
          typeof part === 'object' &&
          part.type === 'text' &&
          typeof part.text === 'string';
        return isTextPart ? part.text : '';
      })
      .filter((text) => text.length > 0)
      .join('\n');
  }
  if (typeof content === 'object') return '';
  return String(content);
}

/**
 * Reduces untrusted chat messages to `{ role, content }` pairs.
 *
 * Only 'user' and 'assistant' messages are kept. Content is reduced to text,
 * capped at 20000 characters, then trimmed; messages left with no text are
 * dropped.
 *
 * @param {unknown} messages - Candidate message list.
 * @returns {{ role: string, content: string }[]} Sanitized messages; an empty
 *   array when `messages` is not an array.
 */
function normalizeModelChatMessages(messages) {
  if (!Array.isArray(messages)) return [];
  return messages
    .filter((message) => message?.role === 'user' || message?.role === 'assistant')
    .map((message) => ({
      role: message.role,
      // Cap before trimming so the length limit applies to the raw text.
      content: modelChatMessageText(message.content).slice(0, 20000).trim(),
    }))
    .filter((message) => message.content.length > 0);
}
|
|
1527
|
+
|
|
1528
|
+
/**
 * Builds the error text for a failed tool result.
 *
 * Preference order:
 *   1. the joined, trimmed text of every `{ type: 'text' }` content part;
 *   2. the JSON rendering of `structuredContent`, when present;
 *   3. a generic "<tool name> returned an error." sentence.
 *
 * `content` is only read when it is an array, so a malformed result (for
 * example a string `content`) falls through to the later options instead of
 * throwing.
 *
 * @param {object} [tool] - Tool descriptor; only `name` is read.
 * @param {object} [result] - Tool result.
 * @returns {string} Error text for the model.
 */
function getToolErrorText(tool, result) {
  const parts = Array.isArray(result?.content) ? result.content : [];
  const text = parts
    .filter((part) => part && typeof part === 'object' && part.type === 'text')
    .map((part) => String(part.text ?? ''))
    .join('\n')
    .trim();
  if (text) return text;
  if (result?.structuredContent !== undefined) {
    return formatJsonForModel({ structuredContent: result.structuredContent });
  }
  return `${tool?.name || 'MCP tool'} returned an error.`;
}
|
|
1541
|
+
|
|
1542
|
+
/**
 * Shapes a failed tool result into the object the model sees, using a
 * host-specific layout.
 *
 * - 'chatgpt': an `mcp_call` item with `status: 'failed'` and an `error` field.
 * - 'claude': an `mcp_tool_result` block with `is_error: true`.
 * - any other host: `{ isError: true, content: [...] }`.
 *
 * The call ID falls back to the tool name when `toolCallId` is not a
 * non-blank string.
 *
 * @param {object} [tool] - Tool descriptor; only `name` is read here.
 * @param {object} [result] - Failed tool result.
 * @param {object} [options]
 * @param {string} [options.host] - Host to format for.
 * @param {unknown} [options.arguments] - Arguments the tool was called with.
 * @param {string} [options.toolCallId] - ID of the tool call.
 * @returns {object} Host-shaped error payload.
 */
function formatModelVisibleToolError(tool, result, { host, arguments: args, toolCallId } = {}) {
  const toolName = tool?.name || 'MCP tool';
  const errorText = getToolErrorText(tool, result);
  const hasUsableCallId = typeof toolCallId === 'string' && toolCallId.trim() !== '';
  const id = hasUsableCallId ? toolCallId : toolName;
  const hostKind = normalizeModelChatHost(host);

  if (hostKind === 'chatgpt') {
    return {
      type: 'mcp_call',
      id,
      name: toolName,
      arguments: normalizeToolArguments(args),
      error: errorText,
      output: null,
      status: 'failed',
    };
  }

  if (hostKind === 'claude') {
    return {
      type: 'mcp_tool_result',
      tool_use_id: id,
      is_error: true,
      content: [{ type: 'text', text: errorText }],
    };
  }

  return {
    isError: true,
    content: [{ type: 'text', text: errorText }],
  };
}
|
|
1572
|
+
|
|
1573
|
+
/**
 * Converts a tool result into what the model is shown.
 *
 * Failed results (`result.isError` truthy) become a host-shaped error object
 * when `options.host` is set, otherwise plain error text. Successful results
 * are rendered as JSON containing non-empty `content` and/or defined
 * `structuredContent`. When neither is present a short completion sentence is
 * returned, with a different wording for tools that render an app.
 *
 * @param {object} [tool] - Tool descriptor.
 * @param {object} [result] - Tool result.
 * @param {object} [options] - Passed through to formatModelVisibleToolError().
 * @returns {string|object} Model-visible representation of the result.
 */
function formatModelVisibleToolResult(tool, result, options = {}) {
  const toolName = tool?.name || 'MCP tool';

  if (result?.isError) {
    return options.host
      ? formatModelVisibleToolError(tool, result, options)
      : getToolErrorText(tool, result);
  }

  const visibleResult = {};
  const hasContent = Array.isArray(result?.content) && result.content.length > 0;
  if (hasContent) {
    visibleResult.content = result.content;
  }
  if (result?.structuredContent !== undefined) {
    visibleResult.structuredContent = result.structuredContent;
  }

  if (Object.keys(visibleResult).length > 0) {
    return formatJsonForModel(visibleResult);
  }
  if (toolRendersApp(tool)) {
    return `${toolName} completed. The MCP App is ready to render.`;
  }
  return `${toolName} completed.`;
}
|
|
1457
1593
|
|
|
1458
|
-
|
|
1459
|
-
|
|
1594
|
+
/**
 * Derives a readable message from a thrown value.
 *
 * - Falsy values yield the generic fallback.
 * - Anything carrying a non-empty string `message` (Error instances and
 *   error-like objects alike) yields that message.
 * - Error instances without a message yield their default string form.
 * - Other objects yield the fallback. String() would produce
 *   "[object Object]" for them, or throw for null-prototype objects.
 * - Remaining primitives are stringified.
 *
 * @param {unknown} error - The thrown value.
 * @returns {string} A non-empty message.
 */
function errorToMessage(error) {
  const fallback = 'Unknown MCP tool error';
  if (!error) return fallback;
  if (typeof error.message === 'string' && error.message) return error.message;
  if (error instanceof Error) return String(error);
  if (typeof error === 'object') return fallback;
  return String(error);
}
|
|
1598
|
+
|
|
1599
|
+
/**
 * Wraps a thrown value as a tool result flagged as an error.
 *
 * @param {unknown} error - Value thrown while calling the tool.
 * @returns {{ content: { type: string, text: string }[], isError: boolean }}
 *   A result with one text part holding the error message and `isError: true`.
 */
function createModelChatToolErrorResult(error) {
  const errorPart = { type: 'text', text: errorToMessage(error) };
  return {
    content: [errorPart],
    isError: true,
  };
}
|
|
1466
1606
|
|
|
1467
|
-
async function
|
|
1607
|
+
/**
 * Calls a tool through the given client and never rejects on a tool failure.
 *
 * Arguments are normalized first. If `client.callTool` throws or rejects, the
 * failure is converted into an error-flagged result instead of propagating.
 *
 * @param {object} params
 * @param {object} params.client - Client exposing `callTool({ name, arguments })`.
 * @param {string} params.name - Tool name.
 * @param {unknown} params.arguments - Raw arguments for the call.
 * @returns {Promise<{ arguments: object, result: object, source: string }>}
 *   The normalized arguments, the result (or error result), and the
 *   constant source tag 'mcp'.
 */
async function executeModelChatToolCall({ client, name, arguments: args }) {
  const safeArgs = normalizeToolArguments(args);

  let result;
  try {
    result = await client.callTool({ name, arguments: safeArgs });
  } catch (error) {
    result = createModelChatToolErrorResult(error);
  }

  return {
    arguments: safeArgs,
    result,
    source: 'mcp',
  };
}
|
|
1623
|
+
|
|
1624
|
+
/**
 * Returns the system-prompt sentence telling the model how failed tool calls
 * are represented for the given host.
 *
 * @param {unknown} host - Requested host; normalized before use.
 * @returns {string} Instruction text for 'chatgpt', 'claude', or the generic case.
 */
function getModelChatHostInstructions(host) {
  const hostKind = normalizeModelChatHost(host);

  if (hostKind === 'chatgpt') {
    return 'ChatGPT surfaces failed MCP calls as mcp_call items with an error field. When a tool result object has type "mcp_call", status "failed", or a non-empty error field, treat it as a failed MCP call, not a successful result.';
  }
  if (hostKind === 'claude') {
    return 'Claude surfaces failed MCP calls as mcp_tool_result blocks with is_error=true. When a tool result object has type "mcp_tool_result" and is_error is true, treat it as a failed MCP call, not a successful result.';
  }
  return 'MCP tool failures are model-visible tool results. When a tool result has isError=true or describes a failed MCP call, treat it as a failed tool call, not a successful result.';
}
|
|
1634
|
+
|
|
1635
|
+
/**
 * Returns the system-prompt paragraph describing how the model should react
 * to a failed tool call.
 *
 * @returns {string} The retry guidance as one space-separated paragraph.
 */
function getModelChatRetryInstructions() {
  const sentences = [
    'After a failed MCP tool call, use the error text to decide the next step.',
    'Retry with corrected arguments for validation or business-logic errors.',
    'For transient service, timeout, or connectivity errors, you may retry once if the user request still needs the tool.',
    'Do not repeat the same failing tool call with the same arguments more than once.',
  ];
  return sentences.join(' ');
}
|
|
1638
|
+
|
|
1639
|
+
async function runModelChat({
|
|
1640
|
+
client,
|
|
1641
|
+
provider,
|
|
1642
|
+
modelId,
|
|
1643
|
+
messages,
|
|
1644
|
+
apiKey,
|
|
1645
|
+
appContext,
|
|
1646
|
+
host,
|
|
1647
|
+
conversationId,
|
|
1648
|
+
}) {
|
|
1468
1649
|
assertModelProvider(provider);
|
|
1650
|
+
const normalizedHost = normalizeModelChatHost(host);
|
|
1469
1651
|
const { generateText, tool: aiTool, jsonSchema } = await import('ai');
|
|
1470
1652
|
const model = await createModelInstance(provider, modelId, apiKey);
|
|
1471
1653
|
const { tools: mcpTools } = await client.listTools();
|
|
1472
1654
|
const capturedToolCalls = [];
|
|
1473
1655
|
const tools = {};
|
|
1474
1656
|
|
|
1475
|
-
for (const mcpTool of mcpTools
|
|
1476
|
-
(tool) => !!toolResourceUri(tool) && isToolVisibleToModel(tool)
|
|
1477
|
-
)) {
|
|
1657
|
+
for (const mcpTool of getModelCallableTools(mcpTools)) {
|
|
1478
1658
|
tools[mcpTool.name] = aiTool({
|
|
1479
1659
|
description: mcpTool.description || mcpTool.title || '',
|
|
1480
1660
|
inputSchema: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
|
|
1481
1661
|
parameters: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
|
|
1482
|
-
execute: async (args) => {
|
|
1662
|
+
execute: async (args, options) => {
|
|
1483
1663
|
const { arguments: safeArgs, result } = await executeModelChatToolCall({
|
|
1484
1664
|
client,
|
|
1485
1665
|
name: mcpTool.name,
|
|
1486
1666
|
arguments: args,
|
|
1487
1667
|
});
|
|
1488
|
-
capturedToolCalls.push({
|
|
1489
|
-
|
|
1668
|
+
capturedToolCalls.push({
|
|
1669
|
+
name: mcpTool.name,
|
|
1670
|
+
arguments: safeArgs,
|
|
1671
|
+
result,
|
|
1672
|
+
isError: !!result?.isError,
|
|
1673
|
+
});
|
|
1674
|
+
return formatModelVisibleToolResult(mcpTool, result, {
|
|
1675
|
+
host: normalizedHost,
|
|
1676
|
+
arguments: safeArgs,
|
|
1677
|
+
toolCallId: options?.toolCallId,
|
|
1678
|
+
});
|
|
1490
1679
|
},
|
|
1491
1680
|
});
|
|
1492
1681
|
}
|
|
@@ -1497,22 +1686,25 @@ async function runModelChat({ client, provider, modelId, messages, apiKey, appCo
|
|
|
1497
1686
|
model,
|
|
1498
1687
|
tools,
|
|
1499
1688
|
system: [
|
|
1500
|
-
'You are chatting inside the
|
|
1689
|
+
'You are chatting inside the sunpeak Inspector. When you call an MCP tool that renders an app, the host will render the app below your message. Do not repeat raw tool output, JSON, image URLs, markdown image lists, or full item inventories. Keep any narration brief and let the app carry the visual result.',
|
|
1690
|
+
getModelChatHostInstructions(normalizedHost),
|
|
1691
|
+
getModelChatRetryInstructions(),
|
|
1501
1692
|
sharedAppContext
|
|
1502
1693
|
? `Shared MCP App context from the currently rendered app, available for this turn:\n${sharedAppContext}`
|
|
1503
1694
|
: '',
|
|
1504
1695
|
]
|
|
1505
1696
|
.filter(Boolean)
|
|
1506
1697
|
.join('\n\n'),
|
|
1507
|
-
messages: messages
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
maxSteps: 5,
|
|
1698
|
+
messages: normalizeModelChatMessages(messages),
|
|
1699
|
+
// AI SDK v4 can send an empty assistant text block to Anthropic when a
|
|
1700
|
+
// tool-only response is followed by another model step. We only need the
|
|
1701
|
+
// tool result for inspector rendering, so skip that follow-up call.
|
|
1702
|
+
maxSteps: provider === 'anthropic' ? 1 : 5,
|
|
1512
1703
|
maxRetries: 0,
|
|
1513
1704
|
});
|
|
1514
1705
|
|
|
1515
1706
|
return {
|
|
1707
|
+
...(conversationId ? { conversationId } : {}),
|
|
1516
1708
|
text: result.text || '',
|
|
1517
1709
|
toolCalls: capturedToolCalls,
|
|
1518
1710
|
finishReason: result.finishReason,
|
|
@@ -2513,18 +2705,13 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
|
|
|
2513
2705
|
res.end(JSON.stringify({ error: `No ${provider} API key saved.` }));
|
|
2514
2706
|
return;
|
|
2515
2707
|
}
|
|
2516
|
-
const
|
|
2517
|
-
const safeMessages = messages
|
|
2518
|
-
.filter((message) => message?.role === 'user' || message?.role === 'assistant')
|
|
2519
|
-
.map((message) => ({
|
|
2520
|
-
role: message.role,
|
|
2521
|
-
content: String(message.content ?? '').slice(0, 20000),
|
|
2522
|
-
}));
|
|
2708
|
+
const safeMessages = normalizeModelChatMessages(parsed.messages);
|
|
2523
2709
|
if (safeMessages.length === 0) {
|
|
2524
2710
|
res.writeHead(400, { 'Content-Type': 'application/json' });
|
|
2525
2711
|
res.end(JSON.stringify({ error: 'Missing chat messages.' }));
|
|
2526
2712
|
return;
|
|
2527
2713
|
}
|
|
2714
|
+
const conversationId = normalizeModelConversationId(parsed.conversationId);
|
|
2528
2715
|
|
|
2529
2716
|
const result = await withModelChatClient((client) =>
|
|
2530
2717
|
runModelChat({
|
|
@@ -2533,7 +2720,9 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
|
|
|
2533
2720
|
modelId: parsed.modelId,
|
|
2534
2721
|
messages: safeMessages,
|
|
2535
2722
|
apiKey,
|
|
2723
|
+
host: parsed.host,
|
|
2536
2724
|
appContext: normalizeModelAppContext(parsed.appContext),
|
|
2725
|
+
conversationId,
|
|
2537
2726
|
})
|
|
2538
2727
|
);
|
|
2539
2728
|
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
@@ -2672,12 +2861,16 @@ export const _securityTestExports = {
|
|
|
2672
2861
|
isLoopbackRemoteAddress,
|
|
2673
2862
|
isPrivateNetworkAddress,
|
|
2674
2863
|
isToolVisibleToModel,
|
|
2864
|
+
getModelCallableTools,
|
|
2865
|
+
toolRendersApp,
|
|
2675
2866
|
executeModelChatToolCall,
|
|
2676
2867
|
formatModelVisibleToolResult,
|
|
2677
2868
|
formatSharedAppContextForModel,
|
|
2678
2869
|
normalizeApiKey,
|
|
2870
|
+
normalizeModelChatMessages,
|
|
2679
2871
|
normalizeModelAppContext,
|
|
2680
2872
|
normalizeModelId,
|
|
2873
|
+
normalizeModelProviderModelId,
|
|
2681
2874
|
quoteSecurityInteractiveArg,
|
|
2682
2875
|
readRequestBody,
|
|
2683
2876
|
resolveHttpRedirectsForMcp,
|
|
@@ -77,10 +77,7 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
77
77
|
|
|
78
78
|
// Parse --server flag from CLI args
|
|
79
79
|
const serverIdx = args.indexOf('--server');
|
|
80
|
-
const cliServer =
|
|
81
|
-
serverIdx !== -1 && args[serverIdx + 1]
|
|
82
|
-
? args[serverIdx + 1]
|
|
83
|
-
: undefined;
|
|
80
|
+
const cliServer = serverIdx !== -1 && args[serverIdx + 1] ? args[serverIdx + 1] : undefined;
|
|
84
81
|
|
|
85
82
|
const projectType = detectProjectType(d);
|
|
86
83
|
const interactive = d.isTTY();
|
|
@@ -119,7 +116,10 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
119
116
|
for (const prov of providers) {
|
|
120
117
|
for (const model of prov.models) {
|
|
121
118
|
config = config.replace(
|
|
122
|
-
new RegExp(
|
|
119
|
+
new RegExp(
|
|
120
|
+
`^(\\s*)// ('${model.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}',?.*)$`,
|
|
121
|
+
'm'
|
|
122
|
+
),
|
|
123
123
|
'$1$2'
|
|
124
124
|
);
|
|
125
125
|
}
|
|
@@ -139,12 +139,16 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
139
139
|
});
|
|
140
140
|
if (!d.isCancel(key) && key) {
|
|
141
141
|
// Strip newlines so a pasted value can't inject extra .env entries.
|
|
142
|
-
const cleanKey = String(key)
|
|
142
|
+
const cleanKey = String(key)
|
|
143
|
+
.replace(/[\r\n]+/g, '')
|
|
144
|
+
.trim();
|
|
143
145
|
if (cleanKey) envLines.push(`${prov.envVar}=${cleanKey}`);
|
|
144
146
|
}
|
|
145
147
|
}
|
|
146
148
|
if (envLines.length > 0 && evalDir) {
|
|
147
|
-
const relEnvPath = evalDir.startsWith(d.cwd())
|
|
149
|
+
const relEnvPath = evalDir.startsWith(d.cwd())
|
|
150
|
+
? evalDir.slice(d.cwd().length + 1)
|
|
151
|
+
: evalDir;
|
|
148
152
|
d.writeFileSync(join(evalDir, '.env'), envLines.join('\n') + '\n');
|
|
149
153
|
d.log.info(`API keys saved to ${relEnvPath}/.env (gitignored)`);
|
|
150
154
|
}
|
|
@@ -165,7 +169,9 @@ export async function testInit(args = [], deps = defaultDeps) {
|
|
|
165
169
|
stdio: 'inherit',
|
|
166
170
|
});
|
|
167
171
|
} catch {
|
|
168
|
-
d.log.info(
|
|
172
|
+
d.log.info(
|
|
173
|
+
`Skill install skipped. Install later: ${dlx} skills add Sunpeak-AI/sunpeak@test-mcp-server`
|
|
174
|
+
);
|
|
169
175
|
}
|
|
170
176
|
}
|
|
171
177
|
}
|
|
@@ -303,9 +309,10 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
303
309
|
d.mkdirSync(evalsDir, { recursive: true });
|
|
304
310
|
|
|
305
311
|
// Generate server line for eval config
|
|
306
|
-
let serverLine =
|
|
312
|
+
let serverLine = " // server: 'http://localhost:8000/mcp',";
|
|
307
313
|
if (isSunpeak) {
|
|
308
|
-
serverLine =
|
|
314
|
+
serverLine =
|
|
315
|
+
" // Omit server for sunpeak projects (auto-detected).\n // server: 'http://localhost:8000/mcp',";
|
|
309
316
|
} else if (server?.type === 'url') {
|
|
310
317
|
serverLine = ` server: ${JSON.stringify(server.value)},`;
|
|
311
318
|
} else if (server?.type === 'command') {
|
|
@@ -315,27 +322,27 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
315
322
|
// Build the eval config content
|
|
316
323
|
const configLines = [
|
|
317
324
|
"import { defineEvalConfig } from 'sunpeak/eval';",
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
325
|
+
'',
|
|
326
|
+
'// API keys are loaded automatically from .env in this directory (gitignored).',
|
|
327
|
+
'// See .env.example for the format.',
|
|
328
|
+
'',
|
|
329
|
+
'export default defineEvalConfig({',
|
|
330
|
+
' // MCP server to test.',
|
|
324
331
|
serverLine,
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
332
|
+
'',
|
|
333
|
+
' models: [',
|
|
334
|
+
' // Uncomment models and install their provider packages:',
|
|
328
335
|
...generateModelLines(),
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
336
|
+
' ],',
|
|
337
|
+
'',
|
|
338
|
+
' defaults: {',
|
|
339
|
+
' runs: 5, // Number of times to run each case per model',
|
|
340
|
+
' maxSteps: 1, // Max tool call steps per run',
|
|
341
|
+
' temperature: 0, // 0 for most deterministic results',
|
|
342
|
+
' timeout: 30_000, // Timeout per run in ms',
|
|
343
|
+
' },',
|
|
344
|
+
'});',
|
|
345
|
+
'',
|
|
339
346
|
];
|
|
340
347
|
|
|
341
348
|
d.writeFileSync(join(evalsDir, 'eval.config.ts'), configLines.join('\n'));
|
|
@@ -367,6 +374,8 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
|
|
|
367
374
|
*
|
|
368
375
|
* Each case sends a prompt to every configured model and checks
|
|
369
376
|
* that the model calls the expected tool with the expected arguments.
|
|
377
|
+
* Add appContext to test follow-up prompts that depend on state shared
|
|
378
|
+
* by the rendered MCP App through updateModelContext or useAppState.
|
|
370
379
|
* Cases run multiple times (configured via \`runs\` in eval.config.ts)
|
|
371
380
|
* to measure reliability across non-deterministic LLM responses.
|
|
372
381
|
*/
|
|
@@ -377,6 +386,8 @@ export default defineEval({
|
|
|
377
386
|
{
|
|
378
387
|
name: 'example (replace me)',
|
|
379
388
|
prompt: 'Show me a demo',
|
|
389
|
+
// Optional: seed model-visible App Context for follow-up prompts.
|
|
390
|
+
// appContext: { structuredContent: { selectedItem: { id: 'demo' } } },
|
|
380
391
|
// expect which tool gets called and (optionally) its arguments:
|
|
381
392
|
expect: {
|
|
382
393
|
tool: 'your-tool-name',
|
|
@@ -437,13 +448,8 @@ function scaffoldVisualTest(filePath, d) {
|
|
|
437
448
|
// // await result.screenshot('tool-dark');
|
|
438
449
|
// });
|
|
439
450
|
|
|
440
|
-
//
|
|
441
|
-
//
|
|
442
|
-
// const result = await inspector.renderTool('your-tool', {}, { theme: 'light' });
|
|
443
|
-
// const app = result.app();
|
|
444
|
-
// await expect(app.getByText('Expected text')).toBeVisible();
|
|
445
|
-
// await result.screenshot('tool-page', { target: 'page', maxDiffPixelRatio: 0.02 });
|
|
446
|
-
// });
|
|
451
|
+
// renderTool() hides inspector sidebars by default, and screenshot() captures only
|
|
452
|
+
// the MCP App iframe area, so inspector UI changes do not break your visual baselines.
|
|
447
453
|
`
|
|
448
454
|
);
|
|
449
455
|
d.log.success(`Created ${filePath}`);
|
|
@@ -485,9 +491,10 @@ function scaffoldLiveTests(liveDir, { isSunpeak, server, d } = {}) {
|
|
|
485
491
|
const parts = server.value.split(/\s+/);
|
|
486
492
|
const cmd = parts[0];
|
|
487
493
|
const args = parts.slice(1);
|
|
488
|
-
serverOption =
|
|
489
|
-
|
|
490
|
-
|
|
494
|
+
serverOption =
|
|
495
|
+
args.length > 0
|
|
496
|
+
? `\n server: { command: ${JSON.stringify(cmd)}, args: [${args.map((a) => JSON.stringify(a)).join(', ')}] },`
|
|
497
|
+
: `\n server: { command: ${JSON.stringify(cmd)} },`;
|
|
491
498
|
}
|
|
492
499
|
|
|
493
500
|
const configContent = `${liveConfigPreamble}
|
|
@@ -637,7 +644,9 @@ test('server exposes tools', async ({ mcp }) => {
|
|
|
637
644
|
|
|
638
645
|
d.log.success('Created tests/sunpeak/ with all test types.');
|
|
639
646
|
if (server.type === 'later') {
|
|
640
|
-
d.log.warn(
|
|
647
|
+
d.log.warn(
|
|
648
|
+
'Server not configured. Edit tests/sunpeak/playwright.config.ts before running tests.'
|
|
649
|
+
);
|
|
641
650
|
}
|
|
642
651
|
|
|
643
652
|
// Auto-install dependencies so users can run tests immediately
|
|
@@ -653,14 +662,22 @@ test('server exposes tools', async ({ mcp }) => {
|
|
|
653
662
|
try {
|
|
654
663
|
d.execSync(`${pm} exec playwright install chromium`, { cwd: testDir, stdio: 'inherit' });
|
|
655
664
|
} catch {
|
|
656
|
-
d.log.warn(
|
|
665
|
+
d.log.warn(
|
|
666
|
+
`Browser install failed. Run manually: cd tests/sunpeak && ${pm} exec playwright install chromium`
|
|
667
|
+
);
|
|
657
668
|
}
|
|
658
669
|
|
|
659
670
|
d.log.step('Ready! Run tests with:');
|
|
660
671
|
d.log.message(' npx sunpeak test # E2E tests');
|
|
661
|
-
d.log.message(
|
|
662
|
-
|
|
663
|
-
|
|
672
|
+
d.log.message(
|
|
673
|
+
' npx sunpeak test --visual # Visual regression (generates baselines on first run)'
|
|
674
|
+
);
|
|
675
|
+
d.log.message(
|
|
676
|
+
' npx sunpeak test --live # Live tests against real hosts (requires login)'
|
|
677
|
+
);
|
|
678
|
+
d.log.message(
|
|
679
|
+
' npx sunpeak test --eval # Multi-model evals (configure models in evals/eval.config.ts)'
|
|
680
|
+
);
|
|
664
681
|
}
|
|
665
682
|
|
|
666
683
|
async function initJsProject(cliServer, d) {
|