npm - sunpeak - Versions diffs - 0.20.36 → 0.20.47 - Mend

sunpeak 0.20.36 → 0.20.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

package/README.md +4 -2
package/bin/commands/inspect.mjs +291 -64
package/bin/commands/test-init.mjs +6 -0
package/bin/lib/eval/eval-runner.mjs +53 -1
package/bin/lib/eval/eval-types.d.mts +27 -0
package/bin/lib/eval/model-registry.mjs +2 -2
package/dist/chatgpt/index.cjs +1 -1
package/dist/chatgpt/index.js +1 -1
package/dist/claude/index.cjs +1 -1
package/dist/claude/index.js +1 -1
package/dist/embed.css +1 -1
package/dist/hooks/tool-data-store.d.ts +26 -0
package/dist/hooks/use-tool-data.d.ts +3 -9
package/dist/host/chatgpt/index.cjs +1 -1
package/dist/host/chatgpt/index.js +1 -1
package/dist/index.cjs +35 -21
package/dist/index.cjs.map +1 -1
package/dist/index.js +35 -21
package/dist/index.js.map +1 -1
package/dist/inspector/index.cjs +1 -1
package/dist/inspector/index.js +1 -1
package/dist/inspector/inspector.d.ts +7 -0
package/dist/inspector/use-inspector-state.d.ts +28 -0
package/dist/{inspector-CiuT_2yA.js → inspector-BSha-CAW.js} +216 -75
package/dist/inspector-BSha-CAW.js.map +1 -0
package/dist/{inspector-BNWla95w.cjs → inspector-Chhc2GNO.cjs} +216 -75
package/dist/inspector-Chhc2GNO.cjs.map +1 -0
package/dist/lib/utils.d.ts +8 -7
package/dist/mcp/index.cjs +5 -3
package/dist/mcp/index.cjs.map +1 -1
package/dist/mcp/index.js +5 -3
package/dist/mcp/index.js.map +1 -1
package/dist/mcp/server.d.ts +12 -1
package/dist/style.css +22 -0
package/dist/{use-app-Duar2Ipu.js → use-app-CtKy52kw.js} +62 -1
package/dist/use-app-CtKy52kw.js.map +1 -0
package/dist/{use-app-DUdnDLP5.cjs → use-app-xaiN0HAd.cjs} +62 -1
package/dist/use-app-xaiN0HAd.cjs.map +1 -0
package/package.json +8 -8
package/template/dist/albums/albums.html +3 -3
package/template/dist/albums/albums.json +1 -1
package/template/dist/carousel/carousel.html +3 -3
package/template/dist/carousel/carousel.json +1 -1
package/template/dist/map/map.html +4 -4
package/template/dist/map/map.json +1 -1
package/template/dist/review/review.html +3 -3
package/template/dist/review/review.json +1 -1
package/template/node_modules/.bin/tsc +2 -2
package/template/node_modules/.bin/tsserver +2 -2
package/template/node_modules/.bin/vitest +2 -2
package/template/node_modules/.vite/deps/_metadata.json +3 -3
package/template/node_modules/.vite-mcp/deps/_metadata.json +20 -20
package/template/node_modules/.vite-mcp/deps/vitest.js +7 -7
package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
package/template/package.json +1 -1
package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-darwin.png +0 -0
package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-linux.png +0 -0
package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-darwin.png +0 -0
package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-linux.png +0 -0
package/template/tsconfig.json +2 -0
package/dist/inspector-BNWla95w.cjs.map +0 -1
package/dist/inspector-CiuT_2yA.js.map +0 -1
package/dist/use-app-DUdnDLP5.cjs.map +0 -1
package/dist/use-app-Duar2Ipu.js.map +0 -1

package/README.md CHANGED

@@ -41,7 +41,9 @@ sunpeak replicates the ChatGPT and Claude runtimes locally so you can:
 - Pin tool states with simulation fixtures so UI regressions can't ship.
 - Automate the real-host loop with live tests: scripts that open your browser, prompt ChatGPT, and assert against the rendered app so you stop click-testing by hand.
-sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions and schemas hold up on cheaper models, not just the flagship ones.
+sunpeak also runs evals against your MCP server across multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model, so you can prove your tool descriptions, schemas, and model-visible App Context hold up on cheaper models, not just the flagship ones.
+Eval cases can seed App Context with `appContext`, which lets you test follow-up prompts such as "Book this one" against state the app has shared through `updateModelContext`.
 <div align="center">
   <a href="https://sunpeak.ai/docs/testing/evals">
@@ -99,7 +101,7 @@ npx sunpeak test
 Playwright fixtures handle inspector startup, MCP connection, iframe traversal, and host switching. Works with Python, Go, TypeScript, Rust, or any language.
-Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions and schemas work reliably across smaller and cheaper models, not just the flagship ones.
+Evals add a second dimension: model compatibility. The eval framework connects to your MCP server via the MCP protocol, discovers its tools, and sends prompts to multiple models (GPT-4o, GPT-4o-mini, o4-mini, Claude Sonnet, Gemini 2.0 Flash) via the Vercel AI SDK. Each case runs N times per model and reports pass/fail counts, so you can measure whether your tool descriptions, schemas, and model-visible App Context work reliably across smaller and cheaper models, not just the flagship ones.
 ```ts
 import { test, expect } from 'sunpeak/test';

package/bin/commands/inspect.mjs CHANGED

@@ -308,7 +308,7 @@ async function negotiateOAuth(serverUrl) {
   // Try the anonymous/auto-approved path first: follow the authorization URL
   // without a browser and see if it immediately redirects with a code.
-  const code = await tryAnonymousOAuth(authUrl.toString(), callbackUrl);
+  const code = await tryAnonymousOAuth(authUrl.toString(), callbackUrl, oauthState.stateParam);
   if (code) {
     // Complete the flow with the authorization code.
     const tokenResult = await auth(provider, {
@@ -347,15 +347,17 @@ async function negotiateOAuth(serverUrl) {
  *
  * @param {string} authUrl - The authorization URL
  * @param {string} callbackUrl - The expected callback URL prefix
+ * @param {string} [expectedState] - OAuth state value that must be echoed by the callback
+ * @param {typeof fetch} [fetchFn]
  * @returns {Promise<string | null>}
  */
-async function tryAnonymousOAuth(authUrl, callbackUrl) {
+async function tryAnonymousOAuth(authUrl, callbackUrl, expectedState, fetchFn = fetch) {
   // Follow redirects manually to detect when the server redirects back
   // to our callback URL with a code parameter.
   let url = authUrl;
   const maxRedirects = 10;
   for (let i = 0; i < maxRedirects; i++) {
-    const response = await fetch(url, { redirect: 'manual' });
+    const response = await fetchFn(url, { redirect: 'manual' });
     const location = response.headers.get('location');
     if (!location) {
@@ -366,11 +368,21 @@ async function tryAnonymousOAuth(authUrl, callbackUrl) {
     }
     // Resolve relative redirects.
-    const resolved = new URL(location, url).toString();
+    const resolvedUrl = new URL(location, url);
+    if (resolvedUrl.protocol !== 'http:' && resolvedUrl.protocol !== 'https:') {
+      throw new Error(
+        `OAuth authorization redirect has unsupported scheme: ${resolvedUrl.protocol}`
+      );
+    }
+    const resolved = resolvedUrl.toString();
     // Check if the redirect goes to our callback URL.
     if (resolved.startsWith(callbackUrl)) {
       const params = new URL(resolved).searchParams;
+      const state = params.get('state');
+      if (expectedState && state !== expectedState) {
+        throw new Error('OAuth state mismatch — callback rejected');
+      }
       const code = params.get('code');
       if (code) return code;
       const error = params.get('error');
@@ -906,45 +918,68 @@ async function discoverSimulations(client) {
 /**
  * Load simulation JSON fixtures from a directory and merge into discovered simulations.
+ *
+ * Each fixture becomes a simulation keyed by its filename, so a tool can have
+ * multiple fixtures (e.g. `show-albums.json` and `show-albums-empty.json`
+ * both targeting tool `show-albums`). Auto-discovered slots are kept only for
+ * tools that have no fixture file.
+ *
  * @param {string} dir - Simulation directory path
  * @param {Record<string, object>} simulations - Discovered simulations to merge into
  */
-function mergeSimulationFixtures(dir, simulations) {
+export function mergeSimulationFixtures(dir, simulations) {
   if (!existsSync(dir)) return;
   const files = readdirSync(dir).filter((f) => f.endsWith('.json'));
+  // Load every fixture first so we can group by tool name. We need the grouping
+  // to decide whether to keep the auto-discovered slot (no fixtures) or replace
+  // it with one entry per fixture file (one or more fixtures).
+  const fixtures = [];
   for (const file of files) {
     try {
       const fixture = JSON.parse(readFileSync(join(dir, file), 'utf-8'));
-      const toolName = fixture.tool;
-      if (!toolName) continue;
-      // Find matching simulation by tool name
-      const sim = simulations[toolName];
-      if (sim) {
-        // Merge fixture data into discovered simulation
-        if (fixture.toolInput !== undefined) sim.toolInput = fixture.toolInput;
-        if (fixture.toolResult !== undefined) sim.toolResult = fixture.toolResult;
-        if (fixture.serverTools !== undefined) sim.serverTools = fixture.serverTools;
-        if (fixture.userMessage !== undefined) sim.userMessage = fixture.userMessage;
-        if (fixture.hostContext !== undefined) sim.hostContext = fixture.hostContext;
-      } else {
-        // Create a new simulation from the fixture (tool not on server, but user wants to mock it)
-        const simName = file.replace(/\.json$/, '');
-        simulations[simName] = {
-          name: simName,
-          tool: { name: toolName, inputSchema: { type: 'object' } },
-          toolInput: fixture.toolInput,
-          toolResult: fixture.toolResult,
-          serverTools: fixture.serverTools,
-          userMessage: fixture.userMessage,
-          hostContext: fixture.hostContext,
-        };
-      }
+      if (!fixture.tool) continue;
+      fixtures.push({ file, fixture });
     } catch (err) {
       console.warn(`Warning: Failed to parse simulation fixture ${file}:`, err.message);
     }
   }
+  const byTool = new Map();
+  for (const item of fixtures) {
+    const tool = item.fixture.tool;
+    if (!byTool.has(tool)) byTool.set(tool, []);
+    byTool.get(tool).push(item);
+  }
+  for (const [toolName, items] of byTool) {
+    const discovered = simulations[toolName];
+    // Drop the auto-discovered slot if none of the fixtures will reuse its
+    // key (filename === tool name). Otherwise the named fixture overwrites
+    // it in place below.
+    const reusesSlot = items.some(({ file }) => file.replace(/\.json$/, '') === toolName);
+    if (discovered && !reusesSlot) {
+      delete simulations[toolName];
+    }
+    for (const { file, fixture } of items) {
+      const simName = file.replace(/\.json$/, '');
+      const sim = discovered
+        ? { ...discovered, name: simName }
+        : {
+            name: simName,
+            tool: { name: toolName, inputSchema: { type: 'object' } },
+          };
+      if (fixture.toolInput !== undefined) sim.toolInput = fixture.toolInput;
+      if (fixture.toolResult !== undefined) sim.toolResult = fixture.toolResult;
+      if (fixture.serverTools !== undefined) sim.serverTools = fixture.serverTools;
+      if (fixture.userMessage !== undefined) sim.userMessage = fixture.userMessage;
+      if (fixture.hostContext !== undefined) sim.hostContext = fixture.hostContext;
+      simulations[simName] = sim;
+    }
+  }
 }
 const MODEL_PROVIDERS = new Set(['openai', 'anthropic']);
@@ -1363,9 +1398,43 @@ function toolRendersApp(tool) {
   return !!(tool?._meta?.ui?.resourceUri ?? tool?._meta?.['ui/resourceUri']);
 }
-function sanitizeAiSdkSchema(schema) {
-  const clean = { ...(schema || { type: 'object', properties: {} }) };
+function sanitizeAiSdkSchemaNode(schema) {
+  if (Array.isArray(schema)) {
+    return schema.map((item) => sanitizeAiSdkSchemaNode(item));
+  }
+  if (!schema || typeof schema !== 'object') return schema;
+  const clean = { ...schema };
   delete clean.$schema;
+  if (
+    clean.properties &&
+    typeof clean.properties === 'object' &&
+    !Array.isArray(clean.properties)
+  ) {
+    clean.properties = Object.fromEntries(
+      Object.entries(clean.properties).map(([key, value]) => [
+        key,
+        sanitizeAiSdkSchemaNode(value),
+      ])
+    );
+  }
+  if (clean.items !== undefined) {
+    clean.items = sanitizeAiSdkSchemaNode(clean.items);
+  }
+  for (const key of ['anyOf', 'allOf', 'oneOf']) {
+    if (Array.isArray(clean[key])) {
+      clean[key] = clean[key].map((item) => sanitizeAiSdkSchemaNode(item));
+    }
+  }
+  const isObjectSchema = clean.type === 'object' || clean.properties != null;
+  if (isObjectSchema) {
+    if (!clean.type) clean.type = 'object';
+    if (!clean.properties) clean.properties = {};
+    clean.additionalProperties = false;
+    return clean;
+  }
   if (
     clean.additionalProperties != null &&
     typeof clean.additionalProperties === 'object' &&
@@ -1373,8 +1442,14 @@ function sanitizeAiSdkSchema(schema) {
   ) {
     delete clean.additionalProperties;
   }
+  return clean;
+}
+export function sanitizeAiSdkSchema(schema) {
+  const clean = sanitizeAiSdkSchemaNode(schema || { type: 'object', properties: {} });
   if (!clean.type) clean.type = 'object';
   if (!clean.properties) clean.properties = {};
+  clean.additionalProperties = false;
   return clean;
 }
@@ -1392,15 +1467,39 @@ function normalizeModelId(modelId) {
   return trimmed;
 }
+function normalizeModelProviderModelId(provider, modelId) {
+  const normalizedModelId = normalizeModelId(modelId);
+  if (
+    provider === 'anthropic' &&
+    /^claude-\d+(?:-\d+)+-(opus|sonnet|haiku)$/i.test(normalizedModelId)
+  ) {
+    throw new Error(
+      `Unsupported Anthropic model ID "${normalizedModelId}". Use an Anthropic API model ID such as "claude-sonnet-4-20250514".`
+    );
+  }
+  return normalizedModelId;
+}
+function normalizeModelConversationId(conversationId) {
+  if (typeof conversationId !== 'string') return undefined;
+  const trimmed = conversationId.trim();
+  if (!trimmed) return undefined;
+  if (trimmed.length > 200 || /[\u0000-\u001f\u007f]/.test(trimmed)) {
+    throw new Error('Invalid model conversation ID.');
+  }
+  return trimmed;
+}
 async function createModelInstance(provider, modelId, apiKey) {
   assertModelProvider(provider);
-  const normalizedModelId = normalizeModelId(modelId);
+  const normalizedModelId = normalizeModelProviderModelId(provider, modelId);
   if (provider === 'openai') {
     const { createOpenAI } = await import('@ai-sdk/openai');
     const openai = createOpenAI({ apiKey });
+    const settings = { structuredOutputs: false };
     return typeof openai.chat === 'function'
-      ? openai.chat(normalizedModelId)
-      : openai(normalizedModelId);
+      ? openai.chat(normalizedModelId, settings)
+      : openai(normalizedModelId, settings);
   }
   const { createAnthropic } = await import('@ai-sdk/anthropic');
   return createAnthropic({ apiKey })(normalizedModelId);
@@ -1414,6 +1513,24 @@ function formatJsonForModel(value) {
   return `${json.slice(0, MODEL_VISIBLE_JSON_LIMIT_BYTES)}...`;
 }
+function normalizeModelChatHost(host) {
+  if (host === 'chatgpt' || host === 'claude') return host;
+  return 'generic';
+}
+function isPlainObject(value) {
+  return (
+    value !== null &&
+    typeof value === 'object' &&
+    !Array.isArray(value) &&
+    Object.getPrototypeOf(value) === Object.prototype
+  );
+}
+function normalizeToolArguments(args) {
+  return isPlainObject(args) ? args : {};
+}
 function normalizeModelAppContext(appContext) {
   if (!appContext || typeof appContext !== 'object') return undefined;
   const normalized = {};
@@ -1432,15 +1549,67 @@ function formatSharedAppContextForModel(appContext) {
   return formatJsonForModel(normalized);
 }
-function formatModelVisibleToolResult(tool, result) {
+function normalizeModelChatMessages(messages) {
+  if (!Array.isArray(messages)) return [];
+  return messages
+    .filter((message) => message?.role === 'user' || message?.role === 'assistant')
+    .map((message) => ({
+      role: message.role,
+      content: String(message.content ?? '').slice(0, 20000).trim(),
+    }))
+    .filter((message) => message.content.length > 0);
+}
+function getToolErrorText(tool, result) {
+  const toolName = tool?.name || 'MCP tool';
+  const text = (result?.content || [])
+    .filter((part) => part && typeof part === 'object' && part.type === 'text')
+    .map((part) => String(part.text ?? ''))
+    .join('\n')
+    .trim();
+  if (text) return text;
+  if (result?.structuredContent !== undefined) {
+    return formatJsonForModel({ structuredContent: result.structuredContent });
+  }
+  return text || `${toolName} returned an error.`;
+}
+function formatModelVisibleToolError(tool, result, { host, arguments: args, toolCallId } = {}) {
+  const toolName = tool?.name || 'MCP tool';
+  const errorText = getToolErrorText(tool, result);
+  const id = typeof toolCallId === 'string' && toolCallId.trim() ? toolCallId : toolName;
+  switch (normalizeModelChatHost(host)) {
+    case 'chatgpt':
+      return {
+        type: 'mcp_call',
+        id,
+        name: toolName,
+        arguments: normalizeToolArguments(args),
+        error: errorText,
+        output: null,
+        status: 'failed',
+      };
+    case 'claude':
+      return {
+        type: 'mcp_tool_result',
+        tool_use_id: id,
+        is_error: true,
+        content: [{ type: 'text', text: errorText }],
+      };
+    default:
+      return {
+        isError: true,
+        content: [{ type: 'text', text: errorText }],
+      };
+  }
+}
+function formatModelVisibleToolResult(tool, result, options = {}) {
   const toolName = tool?.name || 'MCP tool';
   if (result?.isError) {
-    const text = (result.content || [])
-      .filter((part) => part.type === 'text')
-      .map((part) => part.text)
-      .join('\n')
-      .trim();
-    return text || `${toolName} returned an error.`;
+    if (options.host) return formatModelVisibleToolError(tool, result, options);
+    return getToolErrorText(tool, result);
   }
   const visibleResult = {};
@@ -1457,17 +1626,63 @@ function formatModelVisibleToolResult(tool, result) {
       : `${toolName} completed.`;
 }
-async function executeModelChatToolCall({ client, name, arguments: args }) {
-  const safeArgs = args && typeof args === 'object' ? args : {};
+function errorToMessage(error) {
+  if (error instanceof Error && error.message) return error.message;
+  return String(error || 'Unknown MCP tool error');
+}
+function createModelChatToolErrorResult(error) {
+  const message = errorToMessage(error);
   return {
-    arguments: safeArgs,
-    result: await client.callTool({ name, arguments: safeArgs }),
-    source: 'mcp',
+    content: [{ type: 'text', text: message }],
+    isError: true,
   };
 }
-async function runModelChat({ client, provider, modelId, messages, apiKey, appContext }) {
+async function executeModelChatToolCall({ client, name, arguments: args }) {
+  const safeArgs = normalizeToolArguments(args);
+  try {
+    return {
+      arguments: safeArgs,
+      result: await client.callTool({ name, arguments: safeArgs }),
+      source: 'mcp',
+    };
+  } catch (error) {
+    return {
+      arguments: safeArgs,
+      result: createModelChatToolErrorResult(error),
+      source: 'mcp',
+    };
+  }
+}
+function getModelChatHostInstructions(host) {
+  switch (normalizeModelChatHost(host)) {
+    case 'chatgpt':
+      return 'ChatGPT surfaces failed MCP calls as mcp_call items with an error field. When a tool result object has type "mcp_call", status "failed", or a non-empty error field, treat it as a failed MCP call, not a successful result.';
+    case 'claude':
+      return 'Claude surfaces failed MCP calls as mcp_tool_result blocks with is_error=true. When a tool result object has type "mcp_tool_result" and is_error is true, treat it as a failed MCP call, not a successful result.';
+    default:
+      return 'MCP tool failures are model-visible tool results. When a tool result has isError=true or describes a failed MCP call, treat it as a failed tool call, not a successful result.';
+  }
+}
+function getModelChatRetryInstructions() {
+  return 'After a failed MCP tool call, use the error text to decide the next step. Retry with corrected arguments for validation or business-logic errors. For transient service, timeout, or connectivity errors, you may retry once if the user request still needs the tool. Do not repeat the same failing tool call with the same arguments more than once.';
+}
+async function runModelChat({
+  client,
+  provider,
+  modelId,
+  messages,
+  apiKey,
+  appContext,
+  host,
+  conversationId,
+}) {
   assertModelProvider(provider);
+  const normalizedHost = normalizeModelChatHost(host);
   const { generateText, tool: aiTool, jsonSchema } = await import('ai');
   const model = await createModelInstance(provider, modelId, apiKey);
   const { tools: mcpTools } = await client.listTools();
@@ -1479,14 +1694,23 @@ async function runModelChat({ client, provider, modelId, messages, apiKey, appCo
       description: mcpTool.description || mcpTool.title || '',
       inputSchema: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
       parameters: jsonSchema(sanitizeAiSdkSchema(mcpTool.inputSchema)),
-      execute: async (args) => {
+      execute: async (args, options) => {
         const { arguments: safeArgs, result } = await executeModelChatToolCall({
           client,
           name: mcpTool.name,
           arguments: args,
         });
-        capturedToolCalls.push({ name: mcpTool.name, arguments: safeArgs, result });
-        return formatModelVisibleToolResult(mcpTool, result);
+        capturedToolCalls.push({
+          name: mcpTool.name,
+          arguments: safeArgs,
+          result,
+          isError: !!result?.isError,
+        });
+        return formatModelVisibleToolResult(mcpTool, result, {
+          host: normalizedHost,
+          arguments: safeArgs,
+          toolCallId: options?.toolCallId,
+        });
       },
     });
   }
@@ -1497,22 +1721,25 @@ async function runModelChat({ client, provider, modelId, messages, apiKey, appCo
     model,
     tools,
     system: [
-      'You are chatting inside the Sunpeak Inspector. When you call an MCP tool that renders an app, the host will render the app below your message. Do not repeat raw tool output, JSON, image URLs, markdown image lists, or full item inventories. Keep any narration brief and let the app carry the visual result.',
+      'You are chatting inside the sunpeak Inspector. When you call an MCP tool that renders an app, the host will render the app below your message. Do not repeat raw tool output, JSON, image URLs, markdown image lists, or full item inventories. Keep any narration brief and let the app carry the visual result.',
+      getModelChatHostInstructions(normalizedHost),
+      getModelChatRetryInstructions(),
       sharedAppContext
         ? `Shared MCP App context from the currently rendered app, available for this turn:\n${sharedAppContext}`
         : '',
     ]
       .filter(Boolean)
       .join('\n\n'),
-    messages: messages.map((message) => ({
-      role: message.role,
-      content: String(message.content ?? ''),
-    })),
-    maxSteps: 5,
+    messages: normalizeModelChatMessages(messages),
+    // AI SDK v4 can send an empty assistant text block to Anthropic when a
+    // tool-only response is followed by another model step. We only need the
+    // tool result for inspector rendering, so skip that follow-up call.
+    maxSteps: provider === 'anthropic' ? 1 : 5,
     maxRetries: 0,
   });
   return {
+    ...(conversationId ? { conversationId } : {}),
     text: result.text || '',
     toolCalls: capturedToolCalls,
     finishReason: result.finishReason,
@@ -2513,18 +2740,13 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
             res.end(JSON.stringify({ error: `No ${provider} API key saved.` }));
             return;
           }
-          const messages = Array.isArray(parsed.messages) ? parsed.messages : [];
-          const safeMessages = messages
-            .filter((message) => message?.role === 'user' || message?.role === 'assistant')
-            .map((message) => ({
-              role: message.role,
-              content: String(message.content ?? '').slice(0, 20000),
-            }));
+          const safeMessages = normalizeModelChatMessages(parsed.messages);
           if (safeMessages.length === 0) {
             res.writeHead(400, { 'Content-Type': 'application/json' });
             res.end(JSON.stringify({ error: 'Missing chat messages.' }));
             return;
           }
+          const conversationId = normalizeModelConversationId(parsed.conversationId);
           const result = await withModelChatClient((client) =>
             runModelChat({
@@ -2533,7 +2755,9 @@ function sunpeakInspectEndpointsPlugin(getClient, setClient, pluginOpts = {}) {
               modelId: parsed.modelId,
               messages: safeMessages,
               apiKey,
+              host: parsed.host,
               appContext: normalizeModelAppContext(parsed.appContext),
+              conversationId,
             })
           );
           res.writeHead(200, { 'Content-Type': 'application/json' });
@@ -2678,12 +2902,15 @@ export const _securityTestExports = {
   formatModelVisibleToolResult,
   formatSharedAppContextForModel,
   normalizeApiKey,
+  normalizeModelChatMessages,
   normalizeModelAppContext,
   normalizeModelId,
+  normalizeModelProviderModelId,
   quoteSecurityInteractiveArg,
   readRequestBody,
   resolveHttpRedirectsForMcp,
   shouldAllowPrivateServerUrls,
+  tryAnonymousOAuth,
 };
 /**

package/bin/commands/test-init.mjs CHANGED

@@ -374,6 +374,8 @@ function scaffoldEvals(evalsDir, { server, isSunpeak, d: deps } = {}) {
  *
  * Each case sends a prompt to every configured model and checks
  * that the model calls the expected tool with the expected arguments.
+ * Add appContext to test follow-up prompts that depend on state shared
+ * by the rendered MCP App through updateModelContext or useAppState.
  * Cases run multiple times (configured via \`runs\` in eval.config.ts)
  * to measure reliability across non-deterministic LLM responses.
  */
@@ -384,6 +386,8 @@ export default defineEval({
     {
       name: 'example (replace me)',
       prompt: 'Show me a demo',
+      // Optional: seed model-visible App Context for follow-up prompts.
+      // appContext: { structuredContent: { selectedItem: { id: 'demo' } } },
       // expect which tool gets called and (optionally) its arguments:
       expect: {
         tool: 'your-tool-name',
@@ -592,8 +596,10 @@ ${serverBlock}
       {
         compilerOptions: {
           target: 'ES2022',
+          lib: ['ESNext', 'DOM'],
           module: 'ESNext',
           moduleResolution: 'bundler',
+          types: ['node'],
           strict: true,
           esModuleInterop: true,
         },

package/bin/lib/eval/eval-runner.mjs CHANGED

@@ -161,6 +161,42 @@ export async function discoverAndConvertTools(client) {
   return tools;
 }
+const MODEL_VISIBLE_JSON_LIMIT_BYTES = 20000;
+/**
+ * Normalize MCP App Context into the same shape hosts expose to the model.
+ * Empty context is treated as absent.
+ * @param {unknown} appContext
+ * @returns {{ content?: unknown[], structuredContent?: unknown } | undefined}
+ */
+export function normalizeEvalAppContext(appContext) {
+  if (!appContext || typeof appContext !== 'object') return undefined;
+  const normalized = {};
+  if (Array.isArray(appContext.content) && appContext.content.length > 0) {
+    normalized.content = appContext.content;
+  }
+  if (appContext.structuredContent !== undefined) {
+    normalized.structuredContent = appContext.structuredContent;
+  }
+  return Object.keys(normalized).length > 0 ? normalized : undefined;
+}
+/**
+ * Build the system prompt fragment that makes app context visible to the model.
+ * @param {unknown} appContext
+ * @returns {string | undefined}
+ */
+export function formatEvalAppContextForModel(appContext) {
+  const normalized = normalizeEvalAppContext(appContext);
+  if (!normalized) return undefined;
+  const json = JSON.stringify(normalized);
+  const visibleJson =
+    json.length <= MODEL_VISIBLE_JSON_LIMIT_BYTES
+      ? json
+      : `${json.slice(0, MODEL_VISIBLE_JSON_LIMIT_BYTES)}...`;
+  return `Shared MCP App context from the currently rendered app, available for this turn:\n${visibleJson}`;
+}
 /**
  * Run a single eval case once against a model.
  * @param {object} params
@@ -170,15 +206,30 @@ export async function discoverAndConvertTools(client) {
  * @param {number} params.maxSteps
  * @param {number} params.temperature
  * @param {number} params.timeout
+ * @param {{ content?: unknown[], structuredContent?: unknown }} [params.appContext]
  * @returns {Promise<import('./eval-types.d.mts').EvalRunResult>}
  */
-export async function runSingleEval({ prompt, model, tools, maxSteps, temperature, timeout }) {
+export async function runSingleEval({
+  prompt,
+  model,
+  tools,
+  maxSteps,
+  temperature,
+  timeout,
+  appContext,
+}) {
   const { generateText } = await import('ai');
+  const system = formatEvalAppContextForModel(appContext);
+  const providerOptions = model?.provider?.startsWith('openai.')
+    ? { openai: { strictJsonSchema: false } }
+    : undefined;
   const result = await generateText({
     model,
     tools,
     prompt,
+    ...(system ? { system } : {}),
+    ...(providerOptions ? { providerOptions } : {}),
     maxSteps,
     temperature,
     maxRetries: 0, // We manage runs ourselves; AI SDK retries compound rate limits
@@ -356,6 +407,7 @@ export async function runEvalCaseAggregate({
         maxSteps: evalCase.maxSteps ?? maxSteps,
         temperature,
         timeout,
+        appContext: evalCase.appContext,
       });
       checkExpectations(result, evalCase);
       passed++;