sunpeak 0.19.2 → 0.19.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +6 -4
  2. package/bin/commands/dev.mjs +1 -1
  3. package/bin/commands/inspect.mjs +1 -1
  4. package/bin/commands/new.mjs +9 -5
  5. package/bin/commands/start.mjs +3 -1
  6. package/bin/commands/test-init.mjs +478 -76
  7. package/bin/commands/test.mjs +357 -4
  8. package/bin/lib/eval/eval-reporter.mjs +105 -0
  9. package/bin/lib/eval/eval-runner.mjs +310 -0
  10. package/bin/lib/eval/eval-types.d.mts +168 -0
  11. package/bin/lib/eval/eval-vitest-plugin.mjs +158 -0
  12. package/bin/lib/eval/model-registry.mjs +73 -0
  13. package/bin/lib/sandbox-server.mjs +5 -2
  14. package/bin/sunpeak.js +1 -0
  15. package/dist/chatgpt/index.cjs +1 -1
  16. package/dist/chatgpt/index.js +1 -1
  17. package/dist/claude/index.cjs +1 -1
  18. package/dist/claude/index.js +1 -1
  19. package/dist/host/chatgpt/index.cjs +1 -1
  20. package/dist/host/chatgpt/index.js +1 -1
  21. package/dist/index.cjs +134 -124
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.d.ts +3 -1
  24. package/dist/index.js +71 -62
  25. package/dist/index.js.map +1 -1
  26. package/dist/inspector/index.cjs +1 -1
  27. package/dist/inspector/index.js +1 -1
  28. package/dist/{inspector-Cdo5BK2D.js → inspector-D5DckQuU.js} +236 -98
  29. package/dist/inspector-D5DckQuU.js.map +1 -0
  30. package/dist/{inspector-8nPV2A-z.cjs → inspector-jY9O18z9.cjs} +237 -99
  31. package/dist/inspector-jY9O18z9.cjs.map +1 -0
  32. package/dist/mcp/index.cjs +237 -140
  33. package/dist/mcp/index.cjs.map +1 -1
  34. package/dist/mcp/index.d.ts +1 -1
  35. package/dist/mcp/index.js +230 -134
  36. package/dist/mcp/index.js.map +1 -1
  37. package/dist/mcp/production-server.d.ts +31 -0
  38. package/dist/{protocol-C7kTcBr_.cjs → protocol-C8pFDmcy.cjs} +8194 -8187
  39. package/dist/protocol-C8pFDmcy.cjs.map +1 -0
  40. package/dist/{protocol-BfAACnv0.js → protocol-CRqiPTLT.js} +8186 -8185
  41. package/dist/protocol-CRqiPTLT.js.map +1 -0
  42. package/dist/{use-app-CfP9VypY.js → use-app-Bfargfa3.js} +194 -94
  43. package/dist/use-app-Bfargfa3.js.map +1 -0
  44. package/dist/{use-app-CzcYw1Kz.cjs → use-app-CbsBEmwv.cjs} +254 -148
  45. package/dist/use-app-CbsBEmwv.cjs.map +1 -0
  46. package/package.json +27 -3
  47. package/template/README.md +17 -7
  48. package/template/_gitignore +2 -0
  49. package/template/dist/albums/albums.html +15 -15
  50. package/template/dist/albums/albums.json +1 -1
  51. package/template/dist/carousel/carousel.html +19 -19
  52. package/template/dist/carousel/carousel.json +1 -1
  53. package/template/dist/map/map.html +14 -14
  54. package/template/dist/map/map.json +1 -1
  55. package/template/dist/review/review.html +11 -11
  56. package/template/dist/review/review.json +1 -1
  57. package/template/node_modules/.bin/vitest +2 -2
  58. package/template/node_modules/.vite/deps/_metadata.json +3 -3
  59. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js +192 -91
  60. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js.map +1 -1
  61. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js +231 -92
  62. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js.map +1 -1
  63. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js +208 -105
  64. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js.map +1 -1
  65. package/template/node_modules/.vite-mcp/deps/_metadata.json +25 -25
  66. package/template/node_modules/.vite-mcp/deps/{protocol-B_qKkui_.js → protocol-BqGB4zBx.js} +45 -45
  67. package/template/node_modules/.vite-mcp/deps/protocol-BqGB4zBx.js.map +1 -0
  68. package/template/node_modules/.vite-mcp/deps/vitest.js +7 -7
  69. package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
  70. package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-chatgpt-darwin.png +0 -0
  71. package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-claude-darwin.png +0 -0
  72. package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-darwin.png +0 -0
  73. package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-darwin.png +0 -0
  74. package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-chatgpt-darwin.png +0 -0
  75. package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-claude-darwin.png +0 -0
  76. package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-chatgpt-darwin.png +0 -0
  77. package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-claude-darwin.png +0 -0
  78. package/template/tests/evals/.env.example +5 -0
  79. package/template/tests/evals/albums.eval.ts +28 -0
  80. package/template/tests/evals/carousel.eval.ts +26 -0
  81. package/template/tests/evals/eval.config.ts +26 -0
  82. package/template/tests/evals/map.eval.ts +23 -0
  83. package/template/tests/evals/review.eval.ts +48 -0
  84. package/dist/inspector-8nPV2A-z.cjs.map +0 -1
  85. package/dist/inspector-Cdo5BK2D.js.map +0 -1
  86. package/dist/protocol-BfAACnv0.js.map +0 -1
  87. package/dist/protocol-C7kTcBr_.cjs.map +0 -1
  88. package/dist/use-app-CfP9VypY.js.map +0 -1
  89. package/dist/use-app-CzcYw1Kz.cjs.map +0 -1
  90. package/template/node_modules/.vite-mcp/deps/protocol-B_qKkui_.js.map +0 -1
@@ -0,0 +1,310 @@
1
+ /**
2
+ * Core eval runner — connects to MCP server, converts tools to AI SDK format,
3
+ * runs eval cases against models, and collects results.
4
+ */
5
+
6
+ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
7
+
8
+ // Re-export for use in generated test code
9
+ export { checkAiSdkInstalled };
10
+
11
/**
 * Declare an eval spec. This is an identity helper: it exists purely so
 * editors and type-checkers can infer the EvalSpec shape for the literal.
 * @param {import('./eval-types.d.mts').EvalSpec} spec - The eval spec to declare.
 * @returns {import('./eval-types.d.mts').EvalSpec} The same spec, unchanged.
 */
export const defineEval = (spec) => spec;
19
+
20
/**
 * Declare eval configuration. Identity helper mirroring defineEval(): its
 * only job is to surface the EvalConfig shape to tooling.
 * @param {import('./eval-types.d.mts').EvalConfig} config - The configuration object.
 * @returns {import('./eval-types.d.mts').EvalConfig} The same config, unchanged.
 */
export const defineEvalConfig = (config) => config;
28
+
29
/**
 * Open a connection to an MCP server.
 * Mirrors the createMcpConnection pattern used by inspect.mjs.
 * SDK modules are imported lazily so this file stays cheap to load.
 * @param {string} serverArg - An http(s) URL, or a stdio command string
 *   (command followed by whitespace-separated args; shell quoting is not
 *   supported — TODO confirm whether quoted args are ever needed here).
 * @returns {Promise<{ client: import('@modelcontextprotocol/sdk/client/index.js').Client, transport: import('@modelcontextprotocol/sdk/types.js').Transport }>}
 */
export async function createMcpConnection(serverArg) {
  const { Client } = await import('@modelcontextprotocol/sdk/client/index.js');
  const client = new Client({ name: 'sunpeak-eval', version: '1.0.0' });

  let transport;
  if (/^https?:\/\//.test(serverArg)) {
    // URL form → streamable HTTP transport.
    const { StreamableHTTPClientTransport } = await import(
      '@modelcontextprotocol/sdk/client/streamableHttp.js'
    );
    transport = new StreamableHTTPClientTransport(new URL(serverArg));
  } else {
    // Anything else is treated as a stdio command line.
    const [command, ...cmdArgs] = serverArg.split(/\s+/);
    const { StdioClientTransport } = await import(
      '@modelcontextprotocol/sdk/client/stdio.js'
    );
    transport = new StdioClientTransport({ command, args: cmdArgs });
  }

  await client.connect(transport);
  return { client, transport };
}
58
+
59
/**
 * Discover tools from an MCP server and convert them to AI SDK format.
 *
 * Each MCP tool becomes an AI SDK tool whose `execute` forwards the call to
 * the server and returns a simplified value for the model to consume:
 * structured content when present, otherwise the joined text content,
 * otherwise a generic success message.
 *
 * @param {import('@modelcontextprotocol/sdk/client/index.js').Client} client
 * @returns {Promise<Record<string, import('ai').CoreTool>>}
 */
export async function discoverAndConvertTools(client) {
  // One dynamic import instead of two separate import('ai') calls.
  const { tool: aiTool, jsonSchema } = await import('ai');

  const { tools: mcpTools } = await client.listTools();
  const tools = {};

  for (const t of mcpTools) {
    tools[t.name] = aiTool({
      description: t.description || '',
      // Tools without an input schema get an empty object schema.
      parameters: jsonSchema(t.inputSchema || { type: 'object', properties: {} }),
      execute: async (args) => {
        const result = await client.callTool({ name: t.name, arguments: args });
        // Prefer structured content when the server provides it.
        if (result.structuredContent) {
          return result.structuredContent;
        }
        if (result.content && result.content.length > 0) {
          const textParts = result.content
            .filter((c) => c.type === 'text')
            .map((c) => c.text);
          // Fall back to raw JSON when no text parts exist.
          return textParts.join('\n') || JSON.stringify(result.content);
        }
        return 'Tool executed successfully.';
      },
    });
  }

  return tools;
}
94
+
95
/**
 * Run a single eval case once against a model and normalize the AI SDK
 * result into the EvalRunResult shape.
 * @param {object} params
 * @param {string} params.prompt
 * @param {import('ai').LanguageModel} params.model
 * @param {Record<string, import('ai').CoreTool>} params.tools
 * @param {number} params.maxSteps
 * @param {number} params.temperature
 * @param {number} params.timeout - Per-run timeout in milliseconds (enforced via AbortSignal).
 * @returns {Promise<import('./eval-types.d.mts').EvalRunResult>}
 */
export async function runSingleEval({ prompt, model, tools, maxSteps, temperature, timeout }) {
  const { generateText } = await import('ai');

  const raw = await generateText({
    model,
    tools,
    prompt,
    maxSteps,
    temperature,
    abortSignal: AbortSignal.timeout(timeout),
  });

  // Normalize each step; the flat toolCalls/toolResults views are derived
  // from the per-step arrays so both share the same element objects.
  const steps = (raw.steps || []).map((step) => ({
    toolCalls: (step.toolCalls || []).map((tc) => ({ name: tc.toolName, args: tc.args })),
    toolResults: (step.toolResults || []).map((tr) => tr.result),
    text: step.text || '',
  }));

  return {
    toolCalls: steps.flatMap((s) => s.toolCalls),
    toolResults: steps.flatMap((s) => s.toolResults),
    text: raw.text || '',
    steps,
    usage: {
      promptTokens: raw.usage?.promptTokens || 0,
      completionTokens: raw.usage?.completionTokens || 0,
      totalTokens: raw.usage?.totalTokens || 0,
    },
    finishReason: raw.finishReason || 'unknown',
  };
}
152
+
153
/**
 * Validate an eval run result against a case's expectations.
 * A custom `assert` callback, when present, takes precedence over the
 * declarative `expect` form. Extra tool calls beyond the expected list are
 * tolerated; only the first N calls are matched.
 * @param {import('./eval-types.d.mts').EvalRunResult} result
 * @param {import('./eval-types.d.mts').EvalCase} evalCase
 * @throws {Error} when the result does not satisfy the expectations
 */
export function checkExpectations(result, evalCase) {
  const { assert: customAssert, expect } = evalCase;

  if (customAssert) {
    customAssert(result);
    return;
  }
  if (!expect) return;

  const expectations = Array.isArray(expect) ? expect : [expect];
  const { toolCalls } = result;

  if (toolCalls.length < expectations.length) {
    let msg = `Expected ${expectations.length} tool call(s), but got ${toolCalls.length}`;
    // When the model made no calls at all, surface what it said instead.
    if (toolCalls.length === 0 && result.text) {
      const truncated = result.text.length > 200 ? result.text.slice(0, 200) + '...' : result.text;
      msg += `. Model responded with text: "${truncated}"`;
    }
    throw new Error(msg);
  }

  expectations.forEach((expected, i) => {
    const actual = toolCalls[i];

    if (expected.tool !== actual.name) {
      throw new Error(
        `Step ${i + 1}: expected tool "${expected.tool}", got "${actual.name}"`
      );
    }

    if (expected.args) {
      checkPartialMatch(expected.args, actual.args, `Step ${i + 1} args`);
    }
  });
}
193
+
194
/**
 * Assert that `actual` deep-partially matches `expected`: every key in
 * `expected` must exist in `actual` with a matching value; extra keys in
 * `actual` are ignored. Array expectations are matched element-by-element
 * as a prefix, and vitest asymmetric matchers (expect.stringContaining,
 * expect.any, …) are honored at any depth.
 * @param {Record<string, unknown>} expected
 * @param {Record<string, unknown>} actual
 * @param {string} path - Dotted path prefix used in failure messages.
 * @throws {Error} on the first mismatch found
 */
function checkPartialMatch(expected, actual, path) {
  for (const key of Object.keys(expected)) {
    const want = expected[key];
    const got = actual?.[key];
    const here = `${path}.${key}`;

    // Vitest asymmetric matchers are objects exposing asymmetricMatch().
    const isMatcher =
      want !== null && typeof want === 'object' && typeof want.asymmetricMatch === 'function';

    if (isMatcher) {
      if (!want.asymmetricMatch(got)) {
        throw new Error(
          `${here}: expected ${want.toString()}, got ${JSON.stringify(got)}`
        );
      }
    } else if (Array.isArray(want)) {
      if (!Array.isArray(got)) {
        throw new Error(
          `${here}: expected array, got ${JSON.stringify(got)}`
        );
      }
      // Prefix match: every expected element must exist at the same index.
      want.forEach((wantItem, i) => {
        if (i >= got.length) {
          throw new Error(
            `${here}[${i}]: expected ${JSON.stringify(wantItem)}, but array only has ${got.length} elements`
          );
        }
        if (wantItem !== null && typeof wantItem === 'object') {
          checkPartialMatch(wantItem, got[i], `${here}[${i}]`);
        } else if (wantItem !== got[i]) {
          throw new Error(
            `${here}[${i}]: expected ${JSON.stringify(wantItem)}, got ${JSON.stringify(got[i])}`
          );
        }
      });
    } else if (want !== null && typeof want === 'object') {
      // Nested object → recurse with an extended path.
      checkPartialMatch(want, got, here);
    } else if (want !== got) {
      throw new Error(
        `${here}: expected ${JSON.stringify(want)}, got ${JSON.stringify(got)}`
      );
    }
  }
}
246
+
247
/**
 * Run one eval case `runs` times against a single model and aggregate the
 * outcomes: pass/fail counts, pass rate, mean duration, and a deduplicated
 * list of failure messages with occurrence counts.
 * @param {object} params
 * @param {import('./eval-types.d.mts').EvalCase} params.evalCase
 * @param {string} params.modelId
 * @param {Record<string, import('ai').CoreTool>} params.tools
 * @param {number} params.runs
 * @param {number} params.maxSteps - Default, overridable per-case via evalCase.maxSteps.
 * @param {number} params.temperature
 * @param {number} params.timeout
 * @returns {Promise<import('./eval-types.d.mts').EvalCaseResult>}
 */
export async function runEvalCaseAggregate({
  evalCase,
  modelId,
  tools,
  runs,
  maxSteps,
  temperature,
  timeout,
}) {
  const model = await resolveModel(modelId);
  const failureCounts = new Map();
  let passed = 0;
  let totalDurationMs = 0;

  for (let attempt = 0; attempt < runs; attempt++) {
    const startedAt = performance.now();
    try {
      const runResult = await runSingleEval({
        prompt: evalCase.prompt,
        model,
        tools,
        maxSteps: evalCase.maxSteps ?? maxSteps,
        temperature,
        timeout,
      });
      checkExpectations(runResult, evalCase);
      passed += 1;
    } catch (err) {
      // Group identical failure messages so repeated errors are counted once.
      const msg = err.message || String(err);
      failureCounts.set(msg, (failureCounts.get(msg) || 0) + 1);
    } finally {
      totalDurationMs += performance.now() - startedAt;
    }
  }

  // Every attempt either passed or recorded a failure.
  const failed = runs - passed;
  const failures = [...failureCounts].map(([error, count]) => ({ error, count }));

  return {
    caseName: evalCase.name,
    modelId,
    runs,
    passed,
    failed,
    passRate: runs > 0 ? passed / runs : 0,
    avgDurationMs: runs > 0 ? totalDurationMs / runs : 0,
    failures,
  };
}
@@ -0,0 +1,168 @@
1
/**
 * Result from a single eval run, containing the model's tool calls and response.
 */
export interface EvalRunResult {
  /** All tool calls the model made, flattened across all steps in order. */
  toolCalls: Array<{
    name: string;
    args: Record<string, unknown>;
  }>;
  /** All tool results returned, in the same order as `toolCalls`. */
  toolResults: Array<unknown>;
  /** Final text response from the model. */
  text: string;
  /** Per-step breakdown of tool calls, tool results, and text. */
  steps: Array<{
    toolCalls: Array<{ name: string; args: Record<string, unknown> }>;
    toolResults: Array<unknown>;
    text: string;
  }>;
  /** Token usage totals for the run. */
  usage: {
    promptTokens: number;
    completionTokens: number;
    totalTokens: number;
  };
  /** Why the model stopped, as reported by the AI SDK ('unknown' when absent). */
  finishReason: string;
}
29
+
30
/**
 * Expected tool call assertion.
 */
export interface ToolExpectation {
  /** Expected tool name (exact match). */
  tool: string;
  /** Expected arguments (deep partial match — extra keys in the actual call are allowed). */
  args?: Record<string, unknown>;
}
39
+
40
/**
 * A single eval test case.
 */
export interface EvalCase {
  /** Name of this test case. */
  name: string;
  /** The prompt to send to the model. */
  prompt: string;
  /** Maximum tool call steps (default: from config or 1). */
  maxSteps?: number;
  /** Expected tool call(s). An array is matched in order against the model's calls. */
  expect?: ToolExpectation | ToolExpectation[];
  /** Custom assertion; takes precedence over `expect`. Should throw on failure. */
  assert?: (result: EvalRunResult) => void;
}
55
+
56
/**
 * An eval spec defined via defineEval().
 */
export interface EvalSpec {
  /** Override the model list for this eval (defaults to the config's `models`). */
  models?: string[];
  /** Override the run count for this eval (defaults to the config's `defaults.runs`). */
  runs?: number;
  /** Pass-rate threshold (0-1). Defaults to config or 1.0. */
  threshold?: number;
  /** Test cases to run. */
  cases: EvalCase[];
}
69
+
70
/**
 * Eval configuration defined via defineEvalConfig().
 */
export interface EvalConfig {
  /** MCP server URL or stdio command. Omit for sunpeak projects (auto-detected). */
  server?: string;
  /** Model IDs to test against (e.g., 'gpt-4o', 'claude-sonnet-4-20250514'). */
  models: string[];
  /** Default settings applied to every eval unless overridden per-spec or per-case. */
  defaults?: {
    /** Number of times to run each case per model. Default: 10. */
    runs?: number;
    /** Maximum tool call steps. Default: 1. */
    maxSteps?: number;
    /** Model temperature. Default: 0. */
    temperature?: number;
    /** Timeout per run in milliseconds. Default: 30000. */
    timeout?: number;
    /** Pass-rate threshold (0-1). Default: 1.0. */
    threshold?: number;
  };
}
92
+
93
/**
 * Aggregated results for one eval case across all runs of one model.
 */
export interface EvalCaseResult {
  /** Name of the eval case. */
  caseName: string;
  /** Model the case was run against. */
  modelId: string;
  /** Number of runs attempted. */
  runs: number;
  /** Runs whose result satisfied the case's expectations. */
  passed: number;
  /** Runs that failed (runs - passed). */
  failed: number;
  /** passed / runs (0 when runs is 0). */
  passRate: number;
  /** Mean wall-clock duration per run, in milliseconds. */
  avgDurationMs: number;
  /** Distinct failure messages with occurrence counts. */
  failures: Array<{
    error: string;
    count: number;
  }>;
}
109
+
110
/**
 * Define an eval spec. Identity function for type safety.
 */
export declare function defineEval(spec: EvalSpec): EvalSpec;

/**
 * Define eval configuration. Identity function for type safety.
 */
export declare function defineEvalConfig(config: EvalConfig): EvalConfig;

/**
 * Check expectations against an eval run result.
 * Throws if the result does not match the expected tool calls.
 * A case's custom `assert` callback, when present, is used instead of `expect`.
 */
export declare function checkExpectations(
  result: EvalRunResult,
  evalCase: EvalCase,
): void;

/**
 * Connect to an MCP server and return a client + transport.
 * @param serverUrl - MCP server URL (e.g., 'http://localhost:8000/mcp'), or a
 *   stdio command string (command plus whitespace-separated args).
 */
export declare function createMcpConnection(
  serverUrl: string,
): Promise<{ client: unknown; transport: { close?: () => Promise<void> } }>;

/**
 * Discover tools from an MCP server client and convert them to AI SDK tool format.
 * @param client - MCP SDK Client instance (from createMcpConnection)
 */
export declare function discoverAndConvertTools(
  client: unknown,
): Promise<Record<string, unknown>>;

/**
 * Run a single eval case against a model, returning the normalized result.
 */
export declare function runSingleEval(params: {
  prompt: string;
  model: unknown;
  tools: Record<string, unknown>;
  maxSteps: number;
  temperature: number;
  timeout: number;
}): Promise<EvalRunResult>;

/**
 * Run an eval case multiple times against a model and return aggregated results.
 */
export declare function runEvalCaseAggregate(params: {
  evalCase: EvalCase;
  modelId: string;
  tools: Record<string, unknown>;
  runs: number;
  maxSteps: number;
  temperature: number;
  timeout: number;
}): Promise<EvalCaseResult>;
@@ -0,0 +1,158 @@
1
+ /**
2
+ * Vitest plugin that transforms .eval.ts files into runnable test suites.
3
+ *
4
+ * Each eval spec file gets transformed into a vitest test module that:
5
+ * 1. Connects to the MCP server
6
+ * 2. Discovers and converts tools
7
+ * 3. Runs each case × model × N runs
8
+ * 4. Reports aggregate pass/fail counts
9
+ *
10
+ * The original eval spec is re-imported via a virtual module (\0 prefix)
11
+ * to avoid circular transformation. The virtual ID ends in .eval-spec.ts
12
+ * so Vite's esbuild transform recognizes it as TypeScript.
13
+ */
14
+
15
+ import { readFileSync } from 'fs';
16
+ import { basename } from 'path';
17
+ import { fileURLToPath } from 'url';
18
+
19
// Matches eval spec files: *.eval.ts / *.eval.js
const EVAL_RE = /\.eval\.[tj]s$/;
// Rollup/Vite convention: a leading \0 marks virtual modules so other plugins skip them.
const VIRTUAL_PREFIX = '\0sunpeak-eval-spec:';

/**
 * Create the vitest plugin for eval files.
 *
 * Each matching file is replaced by generated test code that connects to the
 * MCP server, discovers tools, and runs every case × model × N runs; the
 * original spec is re-imported through a virtual module so this transform
 * does not recurse into it.
 *
 * @param {object} options
 * @param {string} options.server - MCP server URL or command
 * @param {string[]} options.models - Model IDs to test
 * @param {object} options.defaults - Default settings
 * @returns {import('vite').Plugin}
 */
export function evalVitestPlugin({ server, models, defaults }) {
  // Map virtual IDs back to real file paths
  const virtualToReal = new Map();

  return {
    name: 'sunpeak-eval',
    enforce: 'pre',

    resolveId(id) {
      // Resolve virtual spec imports — these bypass the transform
      if (id.startsWith(VIRTUAL_PREFIX)) {
        return id;
      }
      return null;
    },

    load(id) {
      // Serve the original spec source for virtual IDs registered in transform().
      if (!id.startsWith(VIRTUAL_PREFIX)) return null;
      const realPath = virtualToReal.get(id);
      if (!realPath) return null;
      return readFileSync(realPath, 'utf-8');
    },

    transform(code, id) {
      // Don't transform virtual modules
      if (id.startsWith(VIRTUAL_PREFIX)) return null;
      // Only transform eval spec files
      if (!EVAL_RE.test(id)) return null;

      // Register the virtual module mapping (use .ts extension so esbuild handles it)
      const virtualId = VIRTUAL_PREFIX + id;
      virtualToReal.set(virtualId, id);

      const testName = basename(id).replace(EVAL_RE, '');
      const runnerPath = resolveRunnerPath();

      // NOTE: the template below is the generated test module; server/models/
      // defaults are baked in via JSON.stringify at transform time.
      const transformed = `
import { describe, it, beforeAll, afterAll } from 'vitest';
import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from '${runnerPath}';

// Import the original eval spec via virtual module (bypasses this transform)
import evalSpec from ${JSON.stringify(virtualId)};

if (!evalSpec || !evalSpec.cases) {
  throw new Error('Eval file must use: export default defineEval({ cases: [...] })');
}

const SERVER = ${JSON.stringify(server)};
const MODELS = ${JSON.stringify(models)};
const DEFAULTS = ${JSON.stringify(defaults)};

// Use the eval-level model override, or fall back to config
const activeModels = evalSpec.models || MODELS;

// Skip entirely if no models configured
const shouldSkip = !activeModels || activeModels.length === 0;

describe.skipIf(shouldSkip)(${JSON.stringify(testName)}, () => {
  let client;
  let transport;
  let tools;

  beforeAll(async () => {
    await checkAiSdkInstalled();
    const conn = await createMcpConnection(SERVER);
    client = conn.client;
    transport = conn.transport;
    tools = await discoverAndConvertTools(client);
  });

  afterAll(async () => {
    try { await transport?.close?.(); } catch {}
  });

  for (const evalCase of evalSpec.cases) {
    describe(evalCase.name, () => {
      for (const modelId of activeModels) {
        const runs = evalSpec.runs ?? DEFAULTS.runs ?? 10;
        const threshold = evalSpec.threshold ?? DEFAULTS.threshold ?? 1.0;

        it(\`\${modelId} (\${runs} runs)\`, async () => {
          const result = await runEvalCaseAggregate({
            evalCase,
            modelId,
            tools,
            runs,
            maxSteps: DEFAULTS.maxSteps ?? 1,
            temperature: DEFAULTS.temperature ?? 0,
            timeout: DEFAULTS.timeout ?? 30000,
          });

          // Log statistical results for the reporter
          console.log('__SUNPEAK_EVAL__' + JSON.stringify({
            type: 'eval-result',
            ...result,
          }));

          // Assert pass rate meets threshold
          if (result.passRate < threshold) {
            const failureSummary = result.failures
              .map((f) => \`  \${f.error} (\${f.count}x)\`)
              .join('\\n');
            throw new Error(
              \`\${result.passed}/\${result.runs} passed (\${(result.passRate * 100).toFixed(0)}%), threshold \${(threshold * 100).toFixed(0)}%\\nFailures:\\n\${failureSummary}\`
            );
          }
        }, (DEFAULTS.timeout ?? 30000) * (evalSpec.runs ?? DEFAULTS.runs ?? 10) + 10000);
      }
    });
  }
});
`;

      return { code: transformed, map: null };
    },
  };
}
147
+
148
/**
 * Resolve the absolute filesystem path of the eval-runner module, located
 * next to this plugin file.
 * @returns {string} Path suitable for use in a generated import statement.
 */
function resolveRunnerPath() {
  const runnerUrl = new URL('./eval-runner.mjs', import.meta.url);
  // fileURLToPath requires file:// scheme; fall back to pathname for other schemes (e.g., vitest)
  return runnerUrl.protocol === 'file:' ? fileURLToPath(runnerUrl) : runnerUrl.pathname;
}
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Model registry — maps model ID strings to AI SDK provider instances.
3
+ *
4
+ * Provider packages are dynamically imported so users only need to install
5
+ * the providers they actually use.
6
+ */
7
+
8
+ /**
9
+ * @typedef {{ modelId: string, providerPackage: string }} ModelMapping
10
+ */
11
+
12
/**
 * Detect which provider package a model ID belongs to.
 * @param {string} modelId
 * @returns {string} Provider package name
 * @throws {Error} when the model ID matches no known provider prefix
 */
function getProviderPackage(modelId) {
  // Order matters only for readability; the patterns are mutually exclusive.
  const providerPatterns = [
    [/^(gpt-|o[134]-|o[134]$|chatgpt-)/, '@ai-sdk/openai'],
    [/^claude-/, '@ai-sdk/anthropic'],
    [/^(gemini-|models\/gemini-)/, '@ai-sdk/google'],
  ];
  for (const [pattern, pkg] of providerPatterns) {
    if (pattern.test(modelId)) return pkg;
  }
  throw new Error(
    `Unknown model: "${modelId}". Expected a recognized prefix (gpt-, claude-, gemini-, o1-, o3-, o4-).`
  );
}

/**
 * Resolve a model ID string to an AI SDK LanguageModel instance.
 * The provider package is imported lazily so users only need the providers
 * they actually reference installed.
 * @param {string} modelId - e.g., 'gpt-4o', 'claude-sonnet-4-20250514', 'gemini-2.0-flash'
 * @returns {Promise<import('ai').LanguageModel>}
 * @throws {Error} when the model is unknown or its provider package is missing
 */
export async function resolveModel(modelId) {
  const pkg = getProviderPackage(modelId);

  let provider;
  try {
    provider = await import(pkg);
  } catch {
    throw new Error(
      `Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n npm install ${pkg} (or pnpm add / yarn add)\n`
    );
  }

  // Each provider package exports a named factory that creates model
  // instances: openai('gpt-4o'), anthropic('claude-...'), google('gemini-...')
  const factoryByPackage = {
    '@ai-sdk/openai': 'openai',
    '@ai-sdk/anthropic': 'anthropic',
    '@ai-sdk/google': 'google',
  };
  const factoryName = factoryByPackage[pkg];
  if (!factoryName) {
    throw new Error(`No provider factory found for ${pkg}`);
  }
  return provider[factoryName](modelId);
}
60
+
61
/**
 * Verify that the core `ai` package can be imported, failing with an
 * actionable install hint otherwise.
 * @returns {Promise<void>}
 * @throws {Error} when the `ai` package is not installed
 */
export async function checkAiSdkInstalled() {
  const installed = await import('ai').then(
    () => true,
    () => false
  );
  if (!installed) {
    throw new Error(
      'The "ai" package is not installed. Install it to use evals:\n\n npm install ai (or pnpm add / yarn add)\n'
    );
  }
}