sunpeak 0.19.2 → 0.19.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +6 -4
  2. package/bin/commands/dev.mjs +1 -1
  3. package/bin/commands/inspect.mjs +1 -1
  4. package/bin/commands/new.mjs +9 -5
  5. package/bin/commands/start.mjs +3 -1
  6. package/bin/commands/test-init.mjs +478 -76
  7. package/bin/commands/test.mjs +357 -4
  8. package/bin/lib/eval/eval-reporter.mjs +105 -0
  9. package/bin/lib/eval/eval-runner.mjs +310 -0
  10. package/bin/lib/eval/eval-types.d.mts +168 -0
  11. package/bin/lib/eval/eval-vitest-plugin.mjs +158 -0
  12. package/bin/lib/eval/model-registry.mjs +73 -0
  13. package/bin/lib/sandbox-server.mjs +5 -2
  14. package/bin/sunpeak.js +1 -0
  15. package/dist/chatgpt/index.cjs +1 -1
  16. package/dist/chatgpt/index.js +1 -1
  17. package/dist/claude/index.cjs +1 -1
  18. package/dist/claude/index.js +1 -1
  19. package/dist/host/chatgpt/index.cjs +1 -1
  20. package/dist/host/chatgpt/index.js +1 -1
  21. package/dist/index.cjs +134 -124
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.d.ts +3 -1
  24. package/dist/index.js +71 -62
  25. package/dist/index.js.map +1 -1
  26. package/dist/inspector/index.cjs +1 -1
  27. package/dist/inspector/index.js +1 -1
  28. package/dist/{inspector-Cdo5BK2D.js → inspector-D5DckQuU.js} +236 -98
  29. package/dist/inspector-D5DckQuU.js.map +1 -0
  30. package/dist/{inspector-8nPV2A-z.cjs → inspector-jY9O18z9.cjs} +237 -99
  31. package/dist/inspector-jY9O18z9.cjs.map +1 -0
  32. package/dist/mcp/index.cjs +237 -140
  33. package/dist/mcp/index.cjs.map +1 -1
  34. package/dist/mcp/index.d.ts +1 -1
  35. package/dist/mcp/index.js +230 -134
  36. package/dist/mcp/index.js.map +1 -1
  37. package/dist/mcp/production-server.d.ts +31 -0
  38. package/dist/{protocol-C7kTcBr_.cjs → protocol-C8pFDmcy.cjs} +8194 -8187
  39. package/dist/protocol-C8pFDmcy.cjs.map +1 -0
  40. package/dist/{protocol-BfAACnv0.js → protocol-CRqiPTLT.js} +8186 -8185
  41. package/dist/protocol-CRqiPTLT.js.map +1 -0
  42. package/dist/{use-app-CfP9VypY.js → use-app-Bfargfa3.js} +194 -94
  43. package/dist/use-app-Bfargfa3.js.map +1 -0
  44. package/dist/{use-app-CzcYw1Kz.cjs → use-app-CbsBEmwv.cjs} +254 -148
  45. package/dist/use-app-CbsBEmwv.cjs.map +1 -0
  46. package/package.json +27 -3
  47. package/template/README.md +17 -7
  48. package/template/_gitignore +2 -0
  49. package/template/dist/albums/albums.html +15 -15
  50. package/template/dist/albums/albums.json +1 -1
  51. package/template/dist/carousel/carousel.html +19 -19
  52. package/template/dist/carousel/carousel.json +1 -1
  53. package/template/dist/map/map.html +14 -14
  54. package/template/dist/map/map.json +1 -1
  55. package/template/dist/review/review.html +11 -11
  56. package/template/dist/review/review.json +1 -1
  57. package/template/node_modules/.bin/vitest +2 -2
  58. package/template/node_modules/.vite/deps/_metadata.json +3 -3
  59. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js +192 -91
  60. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps.js.map +1 -1
  61. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js +231 -92
  62. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_app-bridge.js.map +1 -1
  63. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js +208 -105
  64. package/template/node_modules/.vite-mcp/deps/@modelcontextprotocol_ext-apps_react.js.map +1 -1
  65. package/template/node_modules/.vite-mcp/deps/_metadata.json +25 -25
  66. package/template/node_modules/.vite-mcp/deps/{protocol-B_qKkui_.js → protocol-BqGB4zBx.js} +45 -45
  67. package/template/node_modules/.vite-mcp/deps/protocol-BqGB4zBx.js.map +1 -0
  68. package/template/node_modules/.vite-mcp/deps/vitest.js +7 -7
  69. package/template/node_modules/.vite-mcp/deps/vitest.js.map +1 -1
  70. package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-chatgpt-darwin.png +0 -0
  71. package/template/tests/e2e/visual.spec.ts-snapshots/albums-dark-claude-darwin.png +0 -0
  72. package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-chatgpt-darwin.png +0 -0
  73. package/template/tests/e2e/visual.spec.ts-snapshots/albums-fullscreen-claude-darwin.png +0 -0
  74. package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-chatgpt-darwin.png +0 -0
  75. package/template/tests/e2e/visual.spec.ts-snapshots/albums-light-claude-darwin.png +0 -0
  76. package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-chatgpt-darwin.png +0 -0
  77. package/template/tests/e2e/visual.spec.ts-snapshots/albums-page-light-claude-darwin.png +0 -0
  78. package/template/tests/evals/.env.example +5 -0
  79. package/template/tests/evals/albums.eval.ts +28 -0
  80. package/template/tests/evals/carousel.eval.ts +26 -0
  81. package/template/tests/evals/eval.config.ts +26 -0
  82. package/template/tests/evals/map.eval.ts +23 -0
  83. package/template/tests/evals/review.eval.ts +48 -0
  84. package/dist/inspector-8nPV2A-z.cjs.map +0 -1
  85. package/dist/inspector-Cdo5BK2D.js.map +0 -1
  86. package/dist/protocol-BfAACnv0.js.map +0 -1
  87. package/dist/protocol-C7kTcBr_.cjs.map +0 -1
  88. package/dist/use-app-CfP9VypY.js.map +0 -1
  89. package/dist/use-app-CzcYw1Kz.cjs.map +0 -1
  90. package/template/node_modules/.vite-mcp/deps/protocol-B_qKkui_.js.map +0 -1
@@ -0,0 +1,310 @@
1
+ /**
2
+ * Core eval runner — connects to MCP server, converts tools to AI SDK format,
3
+ * runs eval cases against models, and collects results.
4
+ */
5
+
6
+ import { resolveModel, checkAiSdkInstalled } from './model-registry.mjs';
7
+
8
+ // Re-export for use in generated test code
9
+ export { checkAiSdkInstalled };
10
+
11
/**
 * Declare an eval spec. This is an identity helper: it exists purely so
 * editors and type-checkers can infer the EvalSpec shape for the literal.
 * @param {import('./eval-types.d.mts').EvalSpec} spec - The eval spec to declare.
 * @returns {import('./eval-types.d.mts').EvalSpec} The same spec, unchanged.
 */
export const defineEval = (spec) => spec;
19
+
20
/**
 * Declare eval configuration. Identity helper mirroring defineEval(): its
 * only job is to surface the EvalConfig shape to tooling.
 * @param {import('./eval-types.d.mts').EvalConfig} config - The configuration object.
 * @returns {import('./eval-types.d.mts').EvalConfig} The same config, unchanged.
 */
export const defineEvalConfig = (config) => config;
28
+
29
/**
 * Open a connection to an MCP server.
 * Mirrors the createMcpConnection pattern used by inspect.mjs.
 * SDK modules are imported lazily so this file stays cheap to load.
 * @param {string} serverArg - An http(s) URL, or a stdio command string
 *   (command followed by whitespace-separated args; shell quoting is not
 *   supported — TODO confirm whether quoted args are ever needed here).
 * @returns {Promise<{ client: import('@modelcontextprotocol/sdk/client/index.js').Client, transport: import('@modelcontextprotocol/sdk/types.js').Transport }>}
 */
export async function createMcpConnection(serverArg) {
  const { Client } = await import('@modelcontextprotocol/sdk/client/index.js');
  const client = new Client({ name: 'sunpeak-eval', version: '1.0.0' });

  let transport;
  if (/^https?:\/\//.test(serverArg)) {
    // URL form → streamable HTTP transport.
    const { StreamableHTTPClientTransport } = await import(
      '@modelcontextprotocol/sdk/client/streamableHttp.js'
    );
    transport = new StreamableHTTPClientTransport(new URL(serverArg));
  } else {
    // Anything else is treated as a stdio command line.
    const [command, ...cmdArgs] = serverArg.split(/\s+/);
    const { StdioClientTransport } = await import(
      '@modelcontextprotocol/sdk/client/stdio.js'
    );
    transport = new StdioClientTransport({ command, args: cmdArgs });
  }

  await client.connect(transport);
  return { client, transport };
}
58
+
59
/**
 * Discover tools from an MCP server and convert them to AI SDK format.
 *
 * Each MCP tool becomes an AI SDK tool whose `execute` forwards the call to
 * the server and returns a simplified value for the model to consume:
 * structured content when present, otherwise the joined text content,
 * otherwise a generic success message.
 *
 * @param {import('@modelcontextprotocol/sdk/client/index.js').Client} client
 * @returns {Promise<Record<string, import('ai').CoreTool>>}
 */
export async function discoverAndConvertTools(client) {
  // One dynamic import instead of two separate import('ai') calls.
  const { tool: aiTool, jsonSchema } = await import('ai');

  const { tools: mcpTools } = await client.listTools();
  const tools = {};

  for (const t of mcpTools) {
    tools[t.name] = aiTool({
      description: t.description || '',
      // Tools without an input schema get an empty object schema.
      parameters: jsonSchema(t.inputSchema || { type: 'object', properties: {} }),
      execute: async (args) => {
        const result = await client.callTool({ name: t.name, arguments: args });
        // Prefer structured content when the server provides it.
        if (result.structuredContent) {
          return result.structuredContent;
        }
        if (result.content && result.content.length > 0) {
          const textParts = result.content
            .filter((c) => c.type === 'text')
            .map((c) => c.text);
          // Fall back to raw JSON when no text parts exist.
          return textParts.join('\n') || JSON.stringify(result.content);
        }
        return 'Tool executed successfully.';
      },
    });
  }

  return tools;
}
94
+
95
/**
 * Run a single eval case once against a model and normalize the AI SDK
 * result into the EvalRunResult shape.
 * @param {object} params
 * @param {string} params.prompt
 * @param {import('ai').LanguageModel} params.model
 * @param {Record<string, import('ai').CoreTool>} params.tools
 * @param {number} params.maxSteps
 * @param {number} params.temperature
 * @param {number} params.timeout - Per-run timeout in milliseconds (enforced via AbortSignal).
 * @returns {Promise<import('./eval-types.d.mts').EvalRunResult>}
 */
export async function runSingleEval({ prompt, model, tools, maxSteps, temperature, timeout }) {
  const { generateText } = await import('ai');

  const raw = await generateText({
    model,
    tools,
    prompt,
    maxSteps,
    temperature,
    abortSignal: AbortSignal.timeout(timeout),
  });

  // Normalize each step; the flat toolCalls/toolResults views are derived
  // from the per-step arrays so both share the same element objects.
  const steps = (raw.steps || []).map((step) => ({
    toolCalls: (step.toolCalls || []).map((tc) => ({ name: tc.toolName, args: tc.args })),
    toolResults: (step.toolResults || []).map((tr) => tr.result),
    text: step.text || '',
  }));

  return {
    toolCalls: steps.flatMap((s) => s.toolCalls),
    toolResults: steps.flatMap((s) => s.toolResults),
    text: raw.text || '',
    steps,
    usage: {
      promptTokens: raw.usage?.promptTokens || 0,
      completionTokens: raw.usage?.completionTokens || 0,
      totalTokens: raw.usage?.totalTokens || 0,
    },
    finishReason: raw.finishReason || 'unknown',
  };
}
152
+
153
/**
 * Validate an eval run result against a case's expectations.
 * A custom `assert` callback, when present, takes precedence over the
 * declarative `expect` form. Extra tool calls beyond the expected list are
 * tolerated; only the first N calls are matched.
 * @param {import('./eval-types.d.mts').EvalRunResult} result
 * @param {import('./eval-types.d.mts').EvalCase} evalCase
 * @throws {Error} when the result does not satisfy the expectations
 */
export function checkExpectations(result, evalCase) {
  const { assert: customAssert, expect } = evalCase;

  if (customAssert) {
    customAssert(result);
    return;
  }
  if (!expect) return;

  const expectations = Array.isArray(expect) ? expect : [expect];
  const { toolCalls } = result;

  if (toolCalls.length < expectations.length) {
    let msg = `Expected ${expectations.length} tool call(s), but got ${toolCalls.length}`;
    // When the model made no calls at all, surface what it said instead.
    if (toolCalls.length === 0 && result.text) {
      const truncated = result.text.length > 200 ? result.text.slice(0, 200) + '...' : result.text;
      msg += `. Model responded with text: "${truncated}"`;
    }
    throw new Error(msg);
  }

  expectations.forEach((expected, i) => {
    const actual = toolCalls[i];

    if (expected.tool !== actual.name) {
      throw new Error(
        `Step ${i + 1}: expected tool "${expected.tool}", got "${actual.name}"`
      );
    }

    if (expected.args) {
      checkPartialMatch(expected.args, actual.args, `Step ${i + 1} args`);
    }
  });
}
193
+
194
/**
 * Assert that `actual` deep-partially matches `expected`: every key in
 * `expected` must exist in `actual` with a matching value; extra keys in
 * `actual` are ignored. Array expectations are matched element-by-element
 * as a prefix, and vitest asymmetric matchers (expect.stringContaining,
 * expect.any, …) are honored at any depth.
 * @param {Record<string, unknown>} expected
 * @param {Record<string, unknown>} actual
 * @param {string} path - Dotted path prefix used in failure messages.
 * @throws {Error} on the first mismatch found
 */
function checkPartialMatch(expected, actual, path) {
  for (const key of Object.keys(expected)) {
    const want = expected[key];
    const got = actual?.[key];
    const here = `${path}.${key}`;

    // Vitest asymmetric matchers are objects exposing asymmetricMatch().
    const isMatcher =
      want !== null && typeof want === 'object' && typeof want.asymmetricMatch === 'function';

    if (isMatcher) {
      if (!want.asymmetricMatch(got)) {
        throw new Error(
          `${here}: expected ${want.toString()}, got ${JSON.stringify(got)}`
        );
      }
    } else if (Array.isArray(want)) {
      if (!Array.isArray(got)) {
        throw new Error(
          `${here}: expected array, got ${JSON.stringify(got)}`
        );
      }
      // Prefix match: every expected element must exist at the same index.
      want.forEach((wantItem, i) => {
        if (i >= got.length) {
          throw new Error(
            `${here}[${i}]: expected ${JSON.stringify(wantItem)}, but array only has ${got.length} elements`
          );
        }
        if (wantItem !== null && typeof wantItem === 'object') {
          checkPartialMatch(wantItem, got[i], `${here}[${i}]`);
        } else if (wantItem !== got[i]) {
          throw new Error(
            `${here}[${i}]: expected ${JSON.stringify(wantItem)}, got ${JSON.stringify(got[i])}`
          );
        }
      });
    } else if (want !== null && typeof want === 'object') {
      // Nested object → recurse with an extended path.
      checkPartialMatch(want, got, here);
    } else if (want !== got) {
      throw new Error(
        `${here}: expected ${JSON.stringify(want)}, got ${JSON.stringify(got)}`
      );
    }
  }
}
246
+
247
/**
 * Run one eval case `runs` times against a single model and aggregate the
 * outcomes: pass/fail counts, pass rate, mean duration, and a deduplicated
 * list of failure messages with occurrence counts.
 * @param {object} params
 * @param {import('./eval-types.d.mts').EvalCase} params.evalCase
 * @param {string} params.modelId
 * @param {Record<string, import('ai').CoreTool>} params.tools
 * @param {number} params.runs
 * @param {number} params.maxSteps - Default, overridable per-case via evalCase.maxSteps.
 * @param {number} params.temperature
 * @param {number} params.timeout
 * @returns {Promise<import('./eval-types.d.mts').EvalCaseResult>}
 */
export async function runEvalCaseAggregate({
  evalCase,
  modelId,
  tools,
  runs,
  maxSteps,
  temperature,
  timeout,
}) {
  const model = await resolveModel(modelId);
  const failureCounts = new Map();
  let passed = 0;
  let totalDurationMs = 0;

  for (let attempt = 0; attempt < runs; attempt++) {
    const startedAt = performance.now();
    try {
      const runResult = await runSingleEval({
        prompt: evalCase.prompt,
        model,
        tools,
        maxSteps: evalCase.maxSteps ?? maxSteps,
        temperature,
        timeout,
      });
      checkExpectations(runResult, evalCase);
      passed += 1;
    } catch (err) {
      // Group identical failure messages so repeated errors are counted once.
      const msg = err.message || String(err);
      failureCounts.set(msg, (failureCounts.get(msg) || 0) + 1);
    } finally {
      totalDurationMs += performance.now() - startedAt;
    }
  }

  // Every attempt either passed or recorded a failure.
  const failed = runs - passed;
  const failures = [...failureCounts].map(([error, count]) => ({ error, count }));

  return {
    caseName: evalCase.name,
    modelId,
    runs,
    passed,
    failed,
    passRate: runs > 0 ? passed / runs : 0,
    avgDurationMs: runs > 0 ? totalDurationMs / runs : 0,
    failures,
  };
}
@@ -0,0 +1,168 @@
1
/**
 * Result from a single eval run, containing the model's tool calls and response.
 */
export interface EvalRunResult {
  /** All tool calls the model made, flattened across all steps in order. */
  toolCalls: Array<{
    name: string;
    args: Record<string, unknown>;
  }>;
  /** All tool results returned, in the same order as `toolCalls`. */
  toolResults: Array<unknown>;
  /** Final text response from the model. */
  text: string;
  /** Per-step breakdown of tool calls, tool results, and text. */
  steps: Array<{
    toolCalls: Array<{ name: string; args: Record<string, unknown> }>;
    toolResults: Array<unknown>;
    text: string;
  }>;
  /** Token usage totals for the run. */
  usage: {
    promptTokens: number;
    completionTokens: number;
    totalTokens: number;
  };
  /** Why the model stopped, as reported by the AI SDK ('unknown' when absent). */
  finishReason: string;
}
29
+
30
/**
 * Expected tool call assertion.
 */
export interface ToolExpectation {
  /** Expected tool name (exact match). */
  tool: string;
  /** Expected arguments (deep partial match — extra keys in the actual call are allowed). */
  args?: Record<string, unknown>;
}
39
+
40
/**
 * A single eval test case.
 */
export interface EvalCase {
  /** Name of this test case. */
  name: string;
  /** The prompt to send to the model. */
  prompt: string;
  /** Maximum tool call steps (default: from config or 1). */
  maxSteps?: number;
  /** Expected tool call(s). An array is matched in order against the model's calls. */
  expect?: ToolExpectation | ToolExpectation[];
  /** Custom assertion; takes precedence over `expect`. Should throw on failure. */
  assert?: (result: EvalRunResult) => void;
}
55
+
56
/**
 * An eval spec defined via defineEval().
 */
export interface EvalSpec {
  /** Override the model list for this eval (defaults to the config's `models`). */
  models?: string[];
  /** Override the run count for this eval (defaults to the config's `defaults.runs`). */
  runs?: number;
  /** Pass-rate threshold (0-1). Defaults to config or 1.0. */
  threshold?: number;
  /** Test cases to run. */
  cases: EvalCase[];
}
69
+
70
/**
 * Eval configuration defined via defineEvalConfig().
 */
export interface EvalConfig {
  /** MCP server URL or stdio command. Omit for sunpeak projects (auto-detected). */
  server?: string;
  /** Model IDs to test against (e.g., 'gpt-4o', 'claude-sonnet-4-20250514'). */
  models: string[];
  /** Default settings applied to every eval unless overridden per-spec or per-case. */
  defaults?: {
    /** Number of times to run each case per model. Default: 10. */
    runs?: number;
    /** Maximum tool call steps. Default: 1. */
    maxSteps?: number;
    /** Model temperature. Default: 0. */
    temperature?: number;
    /** Timeout per run in milliseconds. Default: 30000. */
    timeout?: number;
    /** Pass-rate threshold (0-1). Default: 1.0. */
    threshold?: number;
  };
}
92
+
93
/**
 * Aggregated results for one eval case across all runs of one model.
 */
export interface EvalCaseResult {
  /** Name of the eval case. */
  caseName: string;
  /** Model the case was run against. */
  modelId: string;
  /** Number of runs attempted. */
  runs: number;
  /** Runs whose result satisfied the case's expectations. */
  passed: number;
  /** Runs that failed (runs - passed). */
  failed: number;
  /** passed / runs (0 when runs is 0). */
  passRate: number;
  /** Mean wall-clock duration per run, in milliseconds. */
  avgDurationMs: number;
  /** Distinct failure messages with occurrence counts. */
  failures: Array<{
    error: string;
    count: number;
  }>;
}
109
+
110
/**
 * Define an eval spec. Identity function for type safety.
 */
export declare function defineEval(spec: EvalSpec): EvalSpec;

/**
 * Define eval configuration. Identity function for type safety.
 */
export declare function defineEvalConfig(config: EvalConfig): EvalConfig;

/**
 * Check expectations against an eval run result.
 * Throws if the result does not match the expected tool calls.
 * A case's custom `assert` callback, when present, is used instead of `expect`.
 */
export declare function checkExpectations(
  result: EvalRunResult,
  evalCase: EvalCase,
): void;

/**
 * Connect to an MCP server and return a client + transport.
 * @param serverUrl - MCP server URL (e.g., 'http://localhost:8000/mcp'), or a
 *   stdio command string (command plus whitespace-separated args).
 */
export declare function createMcpConnection(
  serverUrl: string,
): Promise<{ client: unknown; transport: { close?: () => Promise<void> } }>;

/**
 * Discover tools from an MCP server client and convert them to AI SDK tool format.
 * @param client - MCP SDK Client instance (from createMcpConnection)
 */
export declare function discoverAndConvertTools(
  client: unknown,
): Promise<Record<string, unknown>>;

/**
 * Run a single eval case against a model, returning the normalized result.
 */
export declare function runSingleEval(params: {
  prompt: string;
  model: unknown;
  tools: Record<string, unknown>;
  maxSteps: number;
  temperature: number;
  timeout: number;
}): Promise<EvalRunResult>;

/**
 * Run an eval case multiple times against a model and return aggregated results.
 */
export declare function runEvalCaseAggregate(params: {
  evalCase: EvalCase;
  modelId: string;
  tools: Record<string, unknown>;
  runs: number;
  maxSteps: number;
  temperature: number;
  timeout: number;
}): Promise<EvalCaseResult>;
@@ -0,0 +1,158 @@
1
+ /**
2
+ * Vitest plugin that transforms .eval.ts files into runnable test suites.
3
+ *
4
+ * Each eval spec file gets transformed into a vitest test module that:
5
+ * 1. Connects to the MCP server
6
+ * 2. Discovers and converts tools
7
+ * 3. Runs each case × model × N runs
8
+ * 4. Reports aggregate pass/fail counts
9
+ *
10
+ * The original eval spec is re-imported via a virtual module (\0 prefix)
11
+ * to avoid circular transformation. The virtual ID ends in .eval-spec.ts
12
+ * so Vite's esbuild transform recognizes it as TypeScript.
13
+ */
14
+
15
+ import { readFileSync } from 'fs';
16
+ import { basename } from 'path';
17
+ import { fileURLToPath } from 'url';
18
+
19
// Matches eval spec files: *.eval.ts / *.eval.js
const EVAL_RE = /\.eval\.[tj]s$/;
// Rollup/Vite convention: a leading \0 marks virtual modules so other plugins skip them.
const VIRTUAL_PREFIX = '\0sunpeak-eval-spec:';

/**
 * Create the vitest plugin for eval files.
 *
 * Each matching file is replaced by generated test code that connects to the
 * MCP server, discovers tools, and runs every case × model × N runs; the
 * original spec is re-imported through a virtual module so this transform
 * does not recurse into it.
 *
 * @param {object} options
 * @param {string} options.server - MCP server URL or command
 * @param {string[]} options.models - Model IDs to test
 * @param {object} options.defaults - Default settings
 * @returns {import('vite').Plugin}
 */
export function evalVitestPlugin({ server, models, defaults }) {
  // Map virtual IDs back to real file paths
  const virtualToReal = new Map();

  return {
    name: 'sunpeak-eval',
    enforce: 'pre',

    resolveId(id) {
      // Resolve virtual spec imports — these bypass the transform
      if (id.startsWith(VIRTUAL_PREFIX)) {
        return id;
      }
      return null;
    },

    load(id) {
      // Serve the original spec source for virtual IDs registered in transform().
      if (!id.startsWith(VIRTUAL_PREFIX)) return null;
      const realPath = virtualToReal.get(id);
      if (!realPath) return null;
      return readFileSync(realPath, 'utf-8');
    },

    transform(code, id) {
      // Don't transform virtual modules
      if (id.startsWith(VIRTUAL_PREFIX)) return null;
      // Only transform eval spec files
      if (!EVAL_RE.test(id)) return null;

      // Register the virtual module mapping (use .ts extension so esbuild handles it)
      const virtualId = VIRTUAL_PREFIX + id;
      virtualToReal.set(virtualId, id);

      const testName = basename(id).replace(EVAL_RE, '');
      const runnerPath = resolveRunnerPath();

      // NOTE: the template below is the generated test module; server/models/
      // defaults are baked in via JSON.stringify at transform time.
      const transformed = `
import { describe, it, beforeAll, afterAll } from 'vitest';
import { createMcpConnection, discoverAndConvertTools, runEvalCaseAggregate, checkAiSdkInstalled } from '${runnerPath}';

// Import the original eval spec via virtual module (bypasses this transform)
import evalSpec from ${JSON.stringify(virtualId)};

if (!evalSpec || !evalSpec.cases) {
  throw new Error('Eval file must use: export default defineEval({ cases: [...] })');
}

const SERVER = ${JSON.stringify(server)};
const MODELS = ${JSON.stringify(models)};
const DEFAULTS = ${JSON.stringify(defaults)};

// Use the eval-level model override, or fall back to config
const activeModels = evalSpec.models || MODELS;

// Skip entirely if no models configured
const shouldSkip = !activeModels || activeModels.length === 0;

describe.skipIf(shouldSkip)(${JSON.stringify(testName)}, () => {
  let client;
  let transport;
  let tools;

  beforeAll(async () => {
    await checkAiSdkInstalled();
    const conn = await createMcpConnection(SERVER);
    client = conn.client;
    transport = conn.transport;
    tools = await discoverAndConvertTools(client);
  });

  afterAll(async () => {
    try { await transport?.close?.(); } catch {}
  });

  for (const evalCase of evalSpec.cases) {
    describe(evalCase.name, () => {
      for (const modelId of activeModels) {
        const runs = evalSpec.runs ?? DEFAULTS.runs ?? 10;
        const threshold = evalSpec.threshold ?? DEFAULTS.threshold ?? 1.0;

        it(\`\${modelId} (\${runs} runs)\`, async () => {
          const result = await runEvalCaseAggregate({
            evalCase,
            modelId,
            tools,
            runs,
            maxSteps: DEFAULTS.maxSteps ?? 1,
            temperature: DEFAULTS.temperature ?? 0,
            timeout: DEFAULTS.timeout ?? 30000,
          });

          // Log statistical results for the reporter
          console.log('__SUNPEAK_EVAL__' + JSON.stringify({
            type: 'eval-result',
            ...result,
          }));

          // Assert pass rate meets threshold
          if (result.passRate < threshold) {
            const failureSummary = result.failures
              .map((f) => \`  \${f.error} (\${f.count}x)\`)
              .join('\\n');
            throw new Error(
              \`\${result.passed}/\${result.runs} passed (\${(result.passRate * 100).toFixed(0)}%), threshold \${(threshold * 100).toFixed(0)}%\\nFailures:\\n\${failureSummary}\`
            );
          }
        }, (DEFAULTS.timeout ?? 30000) * (evalSpec.runs ?? DEFAULTS.runs ?? 10) + 10000);
      }
    });
  }
});
`;

      return { code: transformed, map: null };
    },
  };
}
147
+
148
/**
 * Resolve the absolute filesystem path of the eval-runner module, located
 * next to this plugin file.
 * @returns {string} Path suitable for use in a generated import statement.
 */
function resolveRunnerPath() {
  const runnerUrl = new URL('./eval-runner.mjs', import.meta.url);
  // fileURLToPath requires file:// scheme; fall back to pathname for other schemes (e.g., vitest)
  return runnerUrl.protocol === 'file:' ? fileURLToPath(runnerUrl) : runnerUrl.pathname;
}
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Model registry — maps model ID strings to AI SDK provider instances.
3
+ *
4
+ * Provider packages are dynamically imported so users only need to install
5
+ * the providers they actually use.
6
+ */
7
+
8
+ /**
9
+ * @typedef {{ modelId: string, providerPackage: string }} ModelMapping
10
+ */
11
+
12
/**
 * Detect which provider package a model ID belongs to.
 * @param {string} modelId
 * @returns {string} Provider package name
 * @throws {Error} when the model ID matches no known provider prefix
 */
function getProviderPackage(modelId) {
  // Order matters only for readability; the patterns are mutually exclusive.
  const providerPatterns = [
    [/^(gpt-|o[134]-|o[134]$|chatgpt-)/, '@ai-sdk/openai'],
    [/^claude-/, '@ai-sdk/anthropic'],
    [/^(gemini-|models\/gemini-)/, '@ai-sdk/google'],
  ];
  for (const [pattern, pkg] of providerPatterns) {
    if (pattern.test(modelId)) return pkg;
  }
  throw new Error(
    `Unknown model: "${modelId}". Expected a recognized prefix (gpt-, claude-, gemini-, o1-, o3-, o4-).`
  );
}

/**
 * Resolve a model ID string to an AI SDK LanguageModel instance.
 * The provider package is imported lazily so users only need the providers
 * they actually reference installed.
 * @param {string} modelId - e.g., 'gpt-4o', 'claude-sonnet-4-20250514', 'gemini-2.0-flash'
 * @returns {Promise<import('ai').LanguageModel>}
 * @throws {Error} when the model is unknown or its provider package is missing
 */
export async function resolveModel(modelId) {
  const pkg = getProviderPackage(modelId);

  let provider;
  try {
    provider = await import(pkg);
  } catch {
    throw new Error(
      `Provider package "${pkg}" is not installed. Install it to use ${modelId}:\n\n npm install ${pkg} (or pnpm add / yarn add)\n`
    );
  }

  // Each provider package exports a named factory that creates model
  // instances: openai('gpt-4o'), anthropic('claude-...'), google('gemini-...')
  const factoryByPackage = {
    '@ai-sdk/openai': 'openai',
    '@ai-sdk/anthropic': 'anthropic',
    '@ai-sdk/google': 'google',
  };
  const factoryName = factoryByPackage[pkg];
  if (!factoryName) {
    throw new Error(`No provider factory found for ${pkg}`);
  }
  return provider[factoryName](modelId);
}
60
+
61
/**
 * Verify that the core `ai` package can be imported, failing with an
 * actionable install hint otherwise.
 * @returns {Promise<void>}
 * @throws {Error} when the `ai` package is not installed
 */
export async function checkAiSdkInstalled() {
  const installed = await import('ai').then(
    () => true,
    () => false
  );
  if (!installed) {
    throw new Error(
      'The "ai" package is not installed. Install it to use evals:\n\n npm install ai (or pnpm add / yarn add)\n'
    );
  }
}