@phi-code-admin/phi-code 0.56.6 → 0.57.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,490 +1,703 @@
1
1
  /**
2
- * Benchmark Extension - Integrated model performance testing
2
+ * Benchmark Extension - Production-grade model performance testing
3
3
  *
4
- * Provides automated benchmarking capabilities to test and compare different
5
- * AI models on coding tasks. Currently includes a simple Fibonacci generation
6
- * test with plans to expand to additional test categories.
7
- *
8
- * Features:
9
- * - /benchmark command for interactive testing
10
- * - Model selection from available models
11
- * - Code generation testing (Fibonacci function)
12
- * - Performance metrics (time, quality, tokens)
13
- * - Results persistence in ~/.phi/benchmark/results.json
14
- * - Ranking and comparison display
4
+ * Tests AI models across 6 categories:
5
+ * 1. Code Generation — Write a function from a spec
6
+ * 2. Debugging — Find and fix a bug
7
+ * 3. Planning — Create an implementation plan
8
+ * 4. Tool Calling — Generate structured JSON output
9
+ * 5. Speed — Response latency measurement
10
+ * 6. Orchestration — Multi-step reasoning task
15
11
  *
16
12
  * Usage:
17
- * 1. Copy to packages/coding-agent/extensions/phi/benchmark.ts
18
- * 2. Use /benchmark to start interactive testing
19
- * 3. Results saved in ~/.phi/benchmark/results.json
13
+ * - /benchmark — Run benchmark on current model
14
+ * - /benchmark all — Run on all available models
15
+ * - /benchmark results — Show saved results
16
+ * - /benchmark compare — Side-by-side model comparison
17
+ * - /benchmark clear — Clear all results
20
18
  */
21
19
 
22
- import type { ExtensionAPI } from "phi-code";
20
+ import type { ExtensionAPI, ExtensionContext } from "phi-code";
23
21
  import { writeFile, mkdir, readFile, access } from "node:fs/promises";
24
22
  import { join } from "node:path";
25
23
  import { homedir } from "node:os";
26
24
 
27
- interface BenchmarkResult {
25
+ // ─── Types ───────────────────────────────────────────────────────────────
26
+
27
+ interface TestCase {
28
+ category: "code-gen" | "debug" | "planning" | "tool-calling" | "speed" | "orchestration";
29
+ name: string;
30
+ prompt: string;
31
+ validate: (response: string) => TestResult;
32
+ weight: number; // Score weight (1-3)
33
+ }
34
+
35
+ interface TestResult {
36
+ passed: boolean;
37
+ score: number; // 0-100
38
+ details: string;
39
+ }
40
+
41
+ interface ModelBenchmark {
42
+ modelId: string;
28
43
  modelName: string;
29
- testType: string;
44
+ provider: string;
30
45
  timestamp: string;
31
- timeMs: number;
32
- tokensUsed?: number;
33
- quality: "pass" | "fail" | "partial";
34
- score: number; // 0-100
35
- details: {
36
- prompt: string;
37
- response: string;
38
- compilable: boolean;
39
- testsPassed: number;
40
- totalTests: number;
41
- errors?: string[];
46
+ categories: {
47
+ [key: string]: {
48
+ score: number;
49
+ timeMs: number;
50
+ details: string;
51
+ };
42
52
  };
53
+ totalScore: number;
54
+ totalTimeMs: number;
55
+ avgTimeMs: number;
43
56
  }
44
57
 
45
- interface BenchmarkSummary {
46
- testRuns: BenchmarkResult[];
58
+ interface BenchmarkStore {
59
+ version: 2;
60
+ results: ModelBenchmark[];
47
61
  lastUpdated: string;
48
62
  }
49
63
 
50
- export default function benchmarkExtension(pi: ExtensionAPI) {
51
- const benchmarkDir = join(homedir(), ".phi", "benchmark");
52
- const resultsPath = join(benchmarkDir, "results.json");
64
+ // ─── Test Suite ──────────────────────────────────────────────────────────
65
+
66
+ function createTestSuite(): TestCase[] {
67
+ return [
68
+ // 1. CODE GENERATION
69
+ {
70
+ category: "code-gen",
71
+ name: "Fibonacci Function",
72
+ weight: 2,
73
+ prompt: `Write a TypeScript function called 'fibonacci' that:
74
+ - Takes a number n as parameter
75
+ - Returns the nth Fibonacci number
76
+ - Handles edge cases (n <= 0 returns 0, n = 1 returns 1)
77
+ - Uses iterative approach (not recursive)
78
+ - Is properly typed
79
+
80
+ Respond with ONLY the function code, no explanations.`,
81
+ validate: (response: string) => {
82
+ const code = extractCode(response);
83
+ const checks = [
84
+ { test: /function\s+fibonacci/.test(code), detail: "Function named 'fibonacci'" },
85
+ { test: /:\s*number/.test(code), detail: "TypeScript type annotation" },
86
+ { test: /return/.test(code), detail: "Has return statement" },
87
+ { test: /for|while/.test(code), detail: "Uses iteration (not recursion)" },
88
+ { test: /(<=\s*0|===?\s*0|<\s*1)/.test(code), detail: "Handles edge case n=0" },
89
+ { test: /(===?\s*1|<=\s*1)/.test(code), detail: "Handles edge case n=1" },
90
+ ];
91
+ const passed = checks.filter(c => c.test).length;
92
+ const total = checks.length;
93
+ return {
94
+ passed: passed >= 5,
95
+ score: Math.round((passed / total) * 100),
96
+ details: checks.map(c => `${c.test ? "✅" : "❌"} ${c.detail}`).join("\n"),
97
+ };
98
+ },
99
+ },
53
100
 
54
- /**
55
- * Ensure benchmark directory exists
56
- */
57
- async function ensureBenchmarkDirectory() {
58
- try {
59
- await mkdir(benchmarkDir, { recursive: true });
60
- } catch (error) {
61
- console.warn("Failed to create benchmark directory:", error);
62
- }
63
- }
101
+ // 2. DEBUGGING
102
+ {
103
+ category: "debug",
104
+ name: "Find the Bug",
105
+ weight: 2,
106
+ prompt: `Find and fix the bug in this TypeScript code:
107
+
108
+ \`\`\`typescript
109
+ function mergeArrays<T>(arr1: T[], arr2: T[]): T[] {
110
+ const result = arr1;
111
+ for (let i = 0; i < arr2.length; i++) {
112
+ result.push(arr2[i]);
113
+ }
114
+ return result;
115
+ }
64
116
 
65
- /**
66
- * Load existing benchmark results
67
- */
68
- async function loadResults(): Promise<BenchmarkSummary> {
69
- try {
70
- await access(resultsPath);
71
- const content = await readFile(resultsPath, 'utf-8');
72
- return JSON.parse(content);
73
- } catch {
74
- return { testRuns: [], lastUpdated: new Date().toISOString() };
75
- }
76
- }
117
+ // Bug: calling mergeArrays modifies the original arr1
118
+ const a = [1, 2, 3];
119
+ const b = [4, 5, 6];
120
+ const merged = mergeArrays(a, b);
121
+ console.log(a); // Expected [1,2,3] but got [1,2,3,4,5,6]
122
+ \`\`\`
123
+
124
+ Explain the bug and provide the fixed code.`,
125
+ validate: (response: string) => {
126
+ const lower = response.toLowerCase();
127
+ const checks = [
128
+ { test: /reference|shallow|copy|spread|\[\.\.\./.test(lower), detail: "Identifies reference/copy issue" },
129
+ { test: /\[\.\.\.arr1\]|\[\.\.\.arr1,|Array\.from|\.slice\(\)|structuredClone|concat/.test(response), detail: "Uses spread/copy/concat fix" },
130
+ { test: /mutate|modify|original|side.?effect/.test(lower), detail: "Explains the mutation problem" },
131
+ { test: /const result\s*=\s*\[/.test(response) || /\.slice\(/.test(response) || /\.concat\(/.test(response) || /Array\.from/.test(response), detail: "Creates new array in fix" },
132
+ ];
133
+ const passed = checks.filter(c => c.test).length;
134
+ return {
135
+ passed: passed >= 3,
136
+ score: Math.round((passed / checks.length) * 100),
137
+ details: checks.map(c => `${c.test ? "✅" : "❌"} ${c.detail}`).join("\n"),
138
+ };
139
+ },
140
+ },
77
141
 
78
- /**
79
- * Save benchmark results
80
- */
81
- async function saveResults(summary: BenchmarkSummary) {
82
- await ensureBenchmarkDirectory();
83
- summary.lastUpdated = new Date().toISOString();
84
- await writeFile(resultsPath, JSON.stringify(summary, null, 2), 'utf-8');
85
- }
142
+ // 3. PLANNING
143
+ {
144
+ category: "planning",
145
+ name: "Implementation Plan",
146
+ weight: 2,
147
+ prompt: `Create a detailed implementation plan for adding JWT authentication to an existing Express.js REST API.
86
148
 
87
- /**
88
- * Fibonacci test - Generate and test a Fibonacci function
89
- */
90
- function createFibonacciTest(): { prompt: string; expectedBehavior: string; tests: Array<{ input: number; expected: number }> } {
91
- return {
92
- prompt: `Write a TypeScript function called 'fibonacci' that calculates the nth Fibonacci number.
149
+ The API currently has:
150
+ - User model with email/password
151
+ - CRUD endpoints for /users and /posts
152
+ - PostgreSQL database with Prisma ORM
93
153
 
94
154
  Requirements:
95
- - Function should be named exactly 'fibonacci'
96
- - Take one parameter 'n' of type number
97
- - Return type should be number
98
- - Handle edge cases (n <= 0 should return 0, n = 1 should return 1)
99
- - Use an efficient iterative approach (not recursive)
100
-
101
- Provide only the function code, no explanations or additional text.`,
102
-
103
- expectedBehavior: "Efficient iterative Fibonacci calculation",
104
-
105
- tests: [
106
- { input: 0, expected: 0 },
107
- { input: 1, expected: 1 },
108
- { input: 2, expected: 1 },
109
- { input: 3, expected: 2 },
110
- { input: 5, expected: 5 },
111
- { input: 8, expected: 21 },
112
- { input: 10, expected: 55 }
113
- ]
114
- };
115
- }
155
+ - Login endpoint returns access + refresh tokens
156
+ - Protected routes require valid access token
157
+ - Refresh token rotation
158
+ - Token blacklisting on logout
159
+
160
+ Provide a structured plan with specific files to create/modify, dependencies to add, and implementation steps.`,
161
+ validate: (response: string) => {
162
+ const lower = response.toLowerCase();
163
+ const checks = [
164
+ { test: /jsonwebtoken|jwt|jose/.test(lower), detail: "Mentions JWT library" },
165
+ { test: /access.?token|refresh.?token/.test(lower), detail: "Covers both token types" },
166
+ { test: /middleware/.test(lower), detail: "Mentions auth middleware" },
167
+ { test: /bcrypt|argon|hash/.test(lower), detail: "Addresses password hashing" },
168
+ { test: /blacklist|revoke|invalidat/.test(lower), detail: "Addresses token revocation" },
169
+ { test: /prisma|schema|model|migration/.test(lower), detail: "Covers database changes" },
170
+ { test: /env|secret|config/.test(lower), detail: "Addresses secret management" },
171
+ { test: /step|phase|\d\.|create|modify|add/.test(lower), detail: "Provides structured steps" },
172
+ ];
173
+ const passed = checks.filter(c => c.test).length;
174
+ return {
175
+ passed: passed >= 6,
176
+ score: Math.round((passed / checks.length) * 100),
177
+ details: checks.map(c => `${c.test ? "✅" : "❌"} ${c.detail}`).join("\n"),
178
+ };
179
+ },
180
+ },
116
181
 
117
- /**
118
- * Extract TypeScript code from response
119
- */
120
- function extractTypeScriptCode(response: string): string {
121
- // Try to find code blocks first
122
- const codeBlockMatch = response.match(/```(?:typescript|ts)?\s*([\s\S]*?)```/);
123
- if (codeBlockMatch) {
124
- return codeBlockMatch[1].trim();
125
- }
182
+ // 4. TOOL CALLING (structured output)
183
+ {
184
+ category: "tool-calling",
185
+ name: "Structured JSON Output",
186
+ weight: 1,
187
+ prompt: `Parse this natural language description and output ONLY a valid JSON object (no markdown, no explanation):
188
+
189
+ "Create a new user named Alice Smith, email alice@example.com, she's a software engineer at TechCorp, based in San Francisco, age 28, prefers dark mode and email notifications"
190
+
191
+ Required JSON schema:
192
+ {
193
+ "name": { "first": string, "last": string },
194
+ "email": string,
195
+ "profile": {
196
+ "occupation": string,
197
+ "company": string,
198
+ "location": string,
199
+ "age": number
200
+ },
201
+ "preferences": {
202
+ "theme": "light" | "dark",
203
+ "notifications": { "email": boolean, "push": boolean }
204
+ }
205
+ }
126
206
 
127
- // Look for function definition
128
- const functionMatch = response.match(/function\s+fibonacci[\s\S]*?}\s*$/m);
129
- if (functionMatch) {
130
- return functionMatch[0].trim();
131
- }
207
+ Output ONLY the JSON.`,
208
+ validate: (response: string) => {
209
+ const checks: Array<{ test: boolean; detail: string }> = [];
210
+
211
+ // Try to extract JSON from response
212
+ let jsonStr = response.trim();
213
+ const jsonMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/) || response.match(/(\{[\s\S]*\})/);
214
+ if (jsonMatch) jsonStr = jsonMatch[1].trim();
215
+
216
+ try {
217
+ const obj = JSON.parse(jsonStr);
218
+ checks.push({ test: true, detail: "Valid JSON" });
219
+ checks.push({ test: obj?.name?.first === "Alice", detail: 'name.first = "Alice"' });
220
+ checks.push({ test: obj?.name?.last === "Smith", detail: 'name.last = "Smith"' });
221
+ checks.push({ test: obj?.email === "alice@example.com", detail: "Correct email" });
222
+ checks.push({ test: typeof obj?.profile?.age === "number" && obj.profile.age === 28, detail: "Age is number 28" });
223
+ checks.push({ test: obj?.preferences?.theme === "dark", detail: 'theme = "dark"' });
224
+ checks.push({ test: obj?.preferences?.notifications?.email === true, detail: "email notifications = true" });
225
+ } catch {
226
+ checks.push({ test: false, detail: "Valid JSON (parse failed)" });
227
+ checks.push({ test: false, detail: "name.first" });
228
+ checks.push({ test: false, detail: "name.last" });
229
+ checks.push({ test: false, detail: "email" });
230
+ checks.push({ test: false, detail: "age" });
231
+ checks.push({ test: false, detail: "theme" });
232
+ checks.push({ test: false, detail: "notifications" });
233
+ }
132
234
 
133
- // Look for arrow function
134
- const arrowMatch = response.match(/const\s+fibonacci[\s\S]*?;?\s*$/m);
135
- if (arrowMatch) {
136
- return arrowMatch[0].trim();
137
- }
235
+ const passed = checks.filter(c => c.test).length;
236
+ return {
237
+ passed: passed >= 5,
238
+ score: Math.round((passed / checks.length) * 100),
239
+ details: checks.map(c => `${c.test ? "✅" : "❌"} ${c.detail}`).join("\n"),
240
+ };
241
+ },
242
+ },
138
243
 
139
- // Return the whole response if no specific pattern found
140
- return response.trim();
141
- }
244
+ // 5. SPEED (simple task, measures latency)
245
+ {
246
+ category: "speed",
247
+ name: "Quick Response",
248
+ weight: 1,
249
+ prompt: `Reply with exactly this text and nothing else: "Hello, World!"`,
250
+ validate: (response: string) => {
251
+ const trimmed = response.trim().replace(/^["']|["']$/g, "").replace(/```\w*\n?/g, "").trim();
252
+ const exact = trimmed === "Hello, World!";
253
+ const close = trimmed.toLowerCase().includes("hello, world");
254
+ return {
255
+ passed: close,
256
+ score: exact ? 100 : close ? 75 : 0,
257
+ details: exact ? "✅ Exact match" : close ? "⚠️ Close match" : `❌ Got: "${trimmed.substring(0, 50)}"`,
258
+ };
259
+ },
260
+ },
142
261
 
143
- /**
144
- * Test extracted code against test cases
145
- */
146
- async function testFibonacciCode(code: string, tests: Array<{ input: number; expected: number }>): Promise<{
147
- compilable: boolean;
148
- testsPassed: number;
149
- totalTests: number;
150
- errors: string[];
151
- }> {
152
- const errors: string[] = [];
153
- let testsPassed = 0;
262
+ // 6. ORCHESTRATION (multi-step reasoning)
263
+ {
264
+ category: "orchestration",
265
+ name: "Multi-Step Analysis",
266
+ weight: 2,
267
+ prompt: `Analyze this scenario step by step:
268
+
269
+ A Node.js microservice has these symptoms:
270
+ 1. Response times gradually increase from 50ms to 3000ms over 24 hours
271
+ 2. Memory usage grows steadily from 200MB to 1.5GB
272
+ 3. The service handles file uploads (multipart/form-data)
273
+ 4. After restart, everything returns to normal
274
+ 5. No errors in logs
275
+ 6. Database queries remain fast (<10ms)
276
+
277
+ Tasks:
278
+ A) Identify the most likely root cause
279
+ B) List 3 specific things to check in the code
280
+ C) Propose a fix with code example
281
+ D) Suggest monitoring to prevent recurrence
282
+
283
+ Be specific and technical.`,
284
+ validate: (response: string) => {
285
+ const lower = response.toLowerCase();
286
+ const checks = [
287
+ { test: /memory.?leak|leak/.test(lower), detail: "Identifies memory leak" },
288
+ { test: /stream|buffer|file|upload|temp|cleanup/.test(lower), detail: "Links to file upload handling" },
289
+ { test: /close|destroy|cleanup|dispose|gc|garbage/.test(lower), detail: "Suggests resource cleanup" },
290
+ { test: /event.?listener|handler|remove|off/.test(lower) || /stream|pipe/.test(lower), detail: "Checks for handler/stream leaks" },
291
+ { test: /heapdump|heap.?snapshot|inspect|profile|--max-old-space/.test(lower) || /process\.memoryUsage/.test(lower), detail: "Suggests debugging tools" },
292
+ { test: /monitor|alert|metric|prometheus|grafana|threshold/.test(lower), detail: "Suggests monitoring" },
293
+ ];
294
+ const passed = checks.filter(c => c.test).length;
295
+ return {
296
+ passed: passed >= 4,
297
+ score: Math.round((passed / checks.length) * 100),
298
+ details: checks.map(c => `${c.test ? "✅" : "❌"} ${c.detail}`).join("\n"),
299
+ };
300
+ },
301
+ },
302
+ ];
303
+ }
154
304
 
155
- try {
156
- // Create a test environment with the code
157
- const testCode = `
158
- ${code}
159
-
160
- // Test runner
161
- function runTests() {
162
- const results = [];
163
- const tests = ${JSON.stringify(tests)};
164
-
165
- for (const test of tests) {
166
- try {
167
- const result = fibonacci(test.input);
168
- const passed = result === test.expected;
169
- results.push({
170
- input: test.input,
171
- expected: test.expected,
172
- actual: result,
173
- passed
174
- });
175
- } catch (error) {
176
- results.push({
177
- input: test.input,
178
- expected: test.expected,
179
- actual: 'ERROR: ' + error.message,
180
- passed: false
305
+ // ─── Helpers ─────────────────────────────────────────────────────────────
306
+
307
+ function extractCode(response: string): string {
308
+ const match = response.match(/```(?:typescript|ts|javascript|js)?\s*([\s\S]*?)```/);
309
+ return match ? match[1].trim() : response.trim();
310
+ }
311
+
312
+ interface ProviderConfig {
313
+ name: string;
314
+ envVar: string;
315
+ baseUrl: string;
316
+ models: string[];
317
+ }
318
+
319
+ function getProviderConfigs(): ProviderConfig[] {
320
+ return [
321
+ {
322
+ name: "alibaba-codingplan",
323
+ envVar: "ALIBABA_CODING_PLAN_KEY",
324
+ baseUrl: "https://coding-intl.dashscope.aliyuncs.com/v1",
325
+ models: ["qwen3.5-plus", "qwen3-max-2026-01-23", "qwen3-coder-plus", "qwen3-coder-next", "kimi-k2.5", "glm-5", "glm-4.7", "MiniMax-M2.5"],
326
+ },
327
+ {
328
+ name: "openai",
329
+ envVar: "OPENAI_API_KEY",
330
+ baseUrl: "https://api.openai.com/v1",
331
+ models: ["gpt-4o", "gpt-4o-mini"],
332
+ },
333
+ {
334
+ name: "anthropic-openai",
335
+ envVar: "ANTHROPIC_API_KEY",
336
+ baseUrl: "https://api.anthropic.com/v1",
337
+ models: [],
338
+ },
339
+ {
340
+ name: "openrouter",
341
+ envVar: "OPENROUTER_API_KEY",
342
+ baseUrl: "https://openrouter.ai/api/v1",
343
+ models: [],
344
+ },
345
+ {
346
+ name: "groq",
347
+ envVar: "GROQ_API_KEY",
348
+ baseUrl: "https://api.groq.com/openai/v1",
349
+ models: [],
350
+ },
351
+ ];
352
+ }
353
+
354
+ function getAvailableModels(): Array<{ id: string; provider: string; baseUrl: string; apiKey: string }> {
355
+ const models: Array<{ id: string; provider: string; baseUrl: string; apiKey: string }> = [];
356
+
357
+ for (const provider of getProviderConfigs()) {
358
+ const apiKey = process.env[provider.envVar];
359
+ if (!apiKey) continue;
360
+
361
+ for (const modelId of provider.models) {
362
+ models.push({
363
+ id: modelId,
364
+ provider: provider.name,
365
+ baseUrl: provider.baseUrl,
366
+ apiKey,
181
367
  });
182
368
  }
183
369
  }
184
-
185
- return results;
370
+
371
+ return models;
186
372
  }
187
373
 
188
- runTests();
189
- `;
374
+ async function callModel(
375
+ baseUrl: string,
376
+ apiKey: string,
377
+ model: string,
378
+ prompt: string,
379
+ timeoutMs: number = 60000,
380
+ ): Promise<{ response: string; timeMs: number }> {
381
+ const startTime = Date.now();
382
+
383
+ const controller = new AbortController();
384
+ const timeout = setTimeout(() => controller.abort(), timeoutMs);
385
+
386
+ try {
387
+ const res = await fetch(`${baseUrl}/chat/completions`, {
388
+ method: "POST",
389
+ headers: {
390
+ "Content-Type": "application/json",
391
+ Authorization: `Bearer ${apiKey}`,
392
+ },
393
+ body: JSON.stringify({
394
+ model,
395
+ messages: [{ role: "user", content: prompt }],
396
+ max_tokens: 4096,
397
+ temperature: 0.1,
398
+ }),
399
+ signal: controller.signal,
400
+ });
190
401
 
191
- // Use eval in a controlled way (this is for testing, not production)
192
- // In a real implementation, you'd want to use a proper sandbox
193
- const testResults = eval(testCode);
194
-
195
- testsPassed = testResults.filter((r: any) => r.passed).length;
402
+ if (!res.ok) {
403
+ const errorBody = await res.text().catch(() => "");
404
+ throw new Error(`API error ${res.status}: ${errorBody.substring(0, 200)}`);
405
+ }
196
406
 
197
- // Add failed test details to errors
198
- testResults.filter((r: any) => !r.passed).forEach((r: any) => {
199
- errors.push(`fibonacci(${r.input}) = ${r.actual}, expected ${r.expected}`);
200
- });
407
+ const data = (await res.json()) as any;
408
+ const response = data?.choices?.[0]?.message?.content || "";
409
+ const timeMs = Date.now() - startTime;
201
410
 
202
- return {
203
- compilable: true,
204
- testsPassed,
205
- totalTests: tests.length,
206
- errors
207
- };
208
-
209
- } catch (error) {
210
- errors.push(`Compilation/Runtime error: ${error}`);
211
- return {
212
- compilable: false,
213
- testsPassed: 0,
214
- totalTests: tests.length,
215
- errors
216
- };
217
- }
411
+ return { response, timeMs };
412
+ } finally {
413
+ clearTimeout(timeout);
218
414
  }
415
+ }
219
416
 
220
- /**
221
- * Calculate quality score based on test results
222
- */
223
- function calculateScore(result: { compilable: boolean; testsPassed: number; totalTests: number; errors: string[] }): {
224
- quality: "pass" | "fail" | "partial";
225
- score: number;
226
- } {
227
- if (!result.compilable) {
228
- return { quality: "fail", score: 0 };
229
- }
417
+ // ─── Extension ───────────────────────────────────────────────────────────
418
+
419
+ export default function benchmarkExtension(pi: ExtensionAPI) {
420
+ const benchmarkDir = join(homedir(), ".phi", "benchmark");
421
+ const resultsPath = join(benchmarkDir, "results.json");
230
422
 
231
- const passRate = result.testsPassed / result.totalTests;
232
- const score = Math.round(passRate * 100);
423
+ async function ensureDir() {
424
+ await mkdir(benchmarkDir, { recursive: true });
425
+ }
233
426
 
234
- if (score === 100) {
235
- return { quality: "pass", score };
236
- } else if (score > 0) {
237
- return { quality: "partial", score };
238
- } else {
239
- return { quality: "fail", score };
427
+ async function loadStore(): Promise<BenchmarkStore> {
428
+ try {
429
+ await access(resultsPath);
430
+ const content = await readFile(resultsPath, "utf-8");
431
+ const store = JSON.parse(content);
432
+ if (store.version === 2) return store;
433
+ return { version: 2, results: [], lastUpdated: new Date().toISOString() };
434
+ } catch {
435
+ return { version: 2, results: [], lastUpdated: new Date().toISOString() };
240
436
  }
241
437
  }
242
438
 
439
+ async function saveStore(store: BenchmarkStore) {
440
+ await ensureDir();
441
+ store.lastUpdated = new Date().toISOString();
442
+ await writeFile(resultsPath, JSON.stringify(store, null, 2), "utf-8");
443
+ }
444
+
243
445
  /**
244
- * Run benchmark test on a specific model
446
+ * Run full benchmark on a single model
245
447
  */
246
- async function runBenchmarkTest(modelName: string): Promise<BenchmarkResult> {
247
- const test = createFibonacciTest();
248
- const startTime = Date.now();
448
+ async function benchmarkModel(
449
+ modelId: string,
450
+ provider: string,
451
+ baseUrl: string,
452
+ apiKey: string,
453
+ ctx: ExtensionContext,
454
+ ): Promise<ModelBenchmark> {
455
+ const tests = createTestSuite();
456
+ const categories: ModelBenchmark["categories"] = {};
457
+ let totalTime = 0;
458
+
459
+ for (const test of tests) {
460
+ ctx.ui.notify(` ⏳ ${test.category}: ${test.name}...`, "info");
249
461
 
250
- try {
251
- // This is a simplified version - in a real implementation,
252
- // you would need to interface with the actual model registry
253
- // For now, we simulate a response
254
- console.log(`Running benchmark on ${modelName}...`);
255
-
256
- // Simulate model response (in real implementation, call the actual model)
257
- let response: string;
258
- let tokensUsed: number = 50; // Simulated
259
-
260
- // Mock different model responses for demonstration
261
- if (modelName.includes('claude')) {
262
- response = `function fibonacci(n: number): number {
263
- if (n <= 0) return 0;
264
- if (n === 1) return 1;
265
-
266
- let a = 0, b = 1;
267
- for (let i = 2; i <= n; i++) {
268
- const temp = a + b;
269
- a = b;
270
- b = temp;
271
- }
272
- return b;
273
- }`;
274
- } else if (modelName.includes('gpt')) {
275
- response = `function fibonacci(n: number): number {
276
- if (n <= 0) return 0;
277
- if (n === 1) return 1;
278
-
279
- let prev = 0, curr = 1;
280
- for (let i = 2; i <= n; i++) {
281
- let next = prev + curr;
282
- prev = curr;
283
- curr = next;
284
- }
285
- return curr;
286
- }`;
287
- } else {
288
- // Generic/fallback response that might have issues
289
- response = `function fibonacci(n) {
290
- if (n <= 1) return n;
291
- return fibonacci(n-1) + fibonacci(n-2);
292
- }`;
462
+ try {
463
+ const { response, timeMs } = await callModel(baseUrl, apiKey, modelId, test.prompt, 90000);
464
+ totalTime += timeMs;
465
+
466
+ const result = test.validate(response);
467
+
468
+ categories[test.category] = {
469
+ score: result.score,
470
+ timeMs,
471
+ details: result.details,
472
+ };
473
+
474
+ const emoji = result.score >= 80 ? "✅" : result.score >= 50 ? "⚠️" : "❌";
475
+ ctx.ui.notify(` ${emoji} ${test.category}: ${result.score}/100 (${timeMs}ms)`, "info");
476
+ } catch (error) {
477
+ totalTime += 60000;
478
+ categories[test.category] = {
479
+ score: 0,
480
+ timeMs: 60000,
481
+ details: `Error: ${error}`,
482
+ };
483
+ ctx.ui.notify(` ❌ ${test.category}: Error — ${String(error).substring(0, 100)}`, "error");
293
484
  }
485
+ }
294
486
 
295
- const endTime = Date.now();
296
- const timeMs = endTime - startTime;
297
-
298
- // Extract and test the code
299
- const code = extractTypeScriptCode(response);
300
- const testResult = await testFibonacciCode(code, test.tests);
301
- const { quality, score } = calculateScore(testResult);
302
-
303
- return {
304
- modelName,
305
- testType: "fibonacci",
306
- timestamp: new Date().toISOString(),
307
- timeMs,
308
- tokensUsed,
309
- quality,
310
- score,
311
- details: {
312
- prompt: test.prompt,
313
- response,
314
- compilable: testResult.compilable,
315
- testsPassed: testResult.testsPassed,
316
- totalTests: testResult.totalTests,
317
- errors: testResult.errors
318
- }
319
- };
320
-
321
- } catch (error) {
322
- return {
323
- modelName,
324
- testType: "fibonacci",
325
- timestamp: new Date().toISOString(),
326
- timeMs: Date.now() - startTime,
327
- quality: "fail",
328
- score: 0,
329
- details: {
330
- prompt: test.prompt,
331
- response: `Error: ${error}`,
332
- compilable: false,
333
- testsPassed: 0,
334
- totalTests: test.tests.length,
335
- errors: [String(error)]
336
- }
337
- };
487
+ // Calculate weighted total
488
+ const weights: Record<string, number> = {};
489
+ for (const test of tests) {
490
+ weights[test.category] = test.weight;
491
+ }
492
+
493
+ let weightedSum = 0;
494
+ let totalWeight = 0;
495
+ for (const [cat, data] of Object.entries(categories)) {
496
+ const w = weights[cat] || 1;
497
+ weightedSum += data.score * w;
498
+ totalWeight += w;
338
499
  }
500
+
501
+ const totalScore = Math.round(weightedSum / totalWeight);
502
+
503
+ return {
504
+ modelId,
505
+ modelName: modelId,
506
+ provider,
507
+ timestamp: new Date().toISOString(),
508
+ categories,
509
+ totalScore,
510
+ totalTimeMs: totalTime,
511
+ avgTimeMs: Math.round(totalTime / tests.length),
512
+ };
339
513
  }
340
514
 
341
515
  /**
342
- * Generate benchmark report
516
+ * Generate formatted comparison report
343
517
  */
344
- function generateReport(results: BenchmarkResult[]): string {
345
- if (results.length === 0) {
346
- return "No benchmark results available.";
347
- }
518
+ function generateReport(results: ModelBenchmark[]): string {
519
+ if (results.length === 0) return "No benchmark results yet. Run `/benchmark` to start.";
348
520
 
349
- // Group by model and get latest results
350
- const modelResults = new Map<string, BenchmarkResult>();
351
-
352
- for (const result of results) {
353
- const existing = modelResults.get(result.modelName);
354
- if (!existing || new Date(result.timestamp) > new Date(existing.timestamp)) {
355
- modelResults.set(result.modelName, result);
356
- }
357
- }
521
+ // Sort by totalScore desc
522
+ const sorted = [...results].sort((a, b) => b.totalScore - a.totalScore);
523
+ const categories = ["code-gen", "debug", "planning", "tool-calling", "speed", "orchestration"];
524
+
525
+ let report = "🏆 **Phi Code Benchmark Results**\n\n";
358
526
 
359
- // Sort by score (highest first)
360
- const sortedResults = Array.from(modelResults.values())
361
- .sort((a, b) => b.score - a.score);
362
-
363
- let report = `🏆 **Fibonacci Benchmark Results**\n\n`;
364
-
365
- sortedResults.forEach((result, index) => {
366
- const medal = index === 0 ? "🥇" : index === 1 ? "🥈" : index === 2 ? "🥉" : " ";
367
- const statusEmoji = result.quality === "pass" ? "✅" : result.quality === "partial" ? "⚠️" : "❌";
368
-
369
- report += `${medal} **${result.modelName}** ${statusEmoji}\n`;
370
- report += ` Score: ${result.score}/100\n`;
371
- report += ` Tests: ${result.details.testsPassed}/${result.details.totalTests} passed\n`;
372
- report += ` Time: ${result.timeMs}ms\n`;
373
- if (result.tokensUsed) report += ` Tokens: ${result.tokensUsed}\n`;
374
- report += `\n`;
527
+ // Leaderboard
528
+ report += "**Leaderboard:**\n";
529
+ sorted.forEach((r, i) => {
530
+ const medal = i === 0 ? "🥇" : i === 1 ? "🥈" : i === 2 ? "🥉" : `${i + 1}.`;
531
+ const tier = r.totalScore >= 80 ? "S" : r.totalScore >= 65 ? "A" : r.totalScore >= 50 ? "B" : r.totalScore >= 35 ? "C" : "D";
532
+ report += `${medal} **${r.modelId}** — ${r.totalScore}/100 [${tier}] (avg ${r.avgTimeMs}ms)\n`;
375
533
  });
376
534
 
377
- const totalRuns = results.length;
378
- const avgScore = Math.round(results.reduce((sum, r) => sum + r.score, 0) / totalRuns);
379
-
380
- report += `**Summary:**\n`;
381
- report += `- Models tested: ${modelResults.size}\n`;
382
- report += `- Total test runs: ${totalRuns}\n`;
383
- report += `- Average score: ${avgScore}/100\n`;
535
+ // Category breakdown
536
+ report += "\n**Category Breakdown:**\n```\n";
537
+ const header = "Model".padEnd(25) + categories.map(c => c.substring(0, 8).padEnd(10)).join("") + "TOTAL\n";
538
+ report += header;
539
+ report += "-".repeat(header.length) + "\n";
540
+
541
+ for (const r of sorted) {
542
+ let line = r.modelId.substring(0, 24).padEnd(25);
543
+ for (const cat of categories) {
544
+ const score = r.categories[cat]?.score ?? "-";
545
+ line += String(score).padEnd(10);
546
+ }
547
+ line += String(r.totalScore);
548
+ report += line + "\n";
549
+ }
550
+ report += "```\n";
551
+
552
+ // Best model per category
553
+ report += "\n**Best per Category:**\n";
554
+ for (const cat of categories) {
555
+ let best = { model: "none", score: -1 };
556
+ for (const r of sorted) {
557
+ const s = r.categories[cat]?.score ?? 0;
558
+ if (s > best.score) {
559
+ best = { model: r.modelId, score: s };
560
+ }
561
+ }
562
+ report += `- ${cat}: **${best.model}** (${best.score}/100)\n`;
563
+ }
384
564
 
565
+ report += `\n_Last updated: ${sorted[0]?.timestamp ?? "N/A"}_`;
385
566
  return report;
386
567
  }
387
568
 
388
- /**
389
- * /benchmark command
390
- */
569
+ // ─── Command ─────────────────────────────────────────────────────────
570
+
391
571
  pi.registerCommand("benchmark", {
392
- description: "Run AI model benchmarks",
572
+ description: "Run AI model benchmarks (6 categories: code-gen, debug, planning, tool-calling, speed, orchestration)",
393
573
  handler: async (args, ctx) => {
394
574
  const arg = args.trim().toLowerCase();
395
575
 
396
- try {
397
- if (arg === "results" || arg === "report") {
398
- // Show existing results
399
- const summary = await loadResults();
400
- const report = generateReport(summary.testRuns);
401
- ctx.ui.notify(report, "info");
402
- return;
403
- }
576
+ // Show results
577
+ if (arg === "results" || arg === "report") {
578
+ const store = await loadStore();
579
+ ctx.ui.notify(generateReport(store.results), "info");
580
+ return;
581
+ }
404
582
 
405
- if (arg === "clear") {
406
- // Clear results
407
- const summary: BenchmarkSummary = { testRuns: [], lastUpdated: new Date().toISOString() };
408
- await saveResults(summary);
409
- ctx.ui.notify("Benchmark results cleared.", "info");
583
+ // Compare (same as results but emphasized)
584
+ if (arg === "compare") {
585
+ const store = await loadStore();
586
+ if (store.results.length < 2) {
587
+ ctx.ui.notify("Need at least 2 model results to compare. Run `/benchmark all` first.", "info");
410
588
  return;
411
589
  }
590
+ ctx.ui.notify(generateReport(store.results), "info");
591
+ return;
592
+ }
412
593
 
413
- // For now, use mock models since we can't easily access the model registry
414
- const availableModels = [
415
- "anthropic/claude-sonnet-3.5",
416
- "anthropic/claude-opus",
417
- "anthropic/claude-haiku",
418
- "openai/gpt-4",
419
- "openai/gpt-3.5-turbo"
420
- ];
421
-
422
- if (!arg) {
423
- ctx.ui.notify(`Available commands:
424
- /benchmark - Start interactive benchmark
425
- /benchmark results - Show benchmark report
426
- /benchmark clear - Clear all results
427
-
428
- Available models for testing:
429
- ${availableModels.map(m => `- ${m}`).join('\n')}
430
-
431
- Use /benchmark <model-name> to test a specific model.`, "info");
432
- return;
433
- }
594
+ // Clear
595
+ if (arg === "clear") {
596
+ await saveStore({ version: 2, results: [], lastUpdated: new Date().toISOString() });
597
+ ctx.ui.notify("🗑️ All benchmark results cleared.", "info");
598
+ return;
599
+ }
434
600
 
435
- // Test specific model
436
- const modelToTest = availableModels.find(m =>
437
- m.toLowerCase().includes(arg) ||
438
- m.toLowerCase() === arg
439
- );
601
+ // Help
602
+ if (arg === "help" || arg === "?") {
603
+ ctx.ui.notify(`**Phi Code Benchmark** — 6 categories, real API calls
604
+
605
+ Commands:
606
+ /benchmark Run on current model
607
+ /benchmark all Run on ALL available models
608
+ /benchmark <model-id> Run on a specific model
609
+ /benchmark results Show saved results
610
+ /benchmark compare Side-by-side comparison
611
+ /benchmark clear Clear all results
612
+
613
+ Categories tested (weighted):
614
+ ⚡ code-gen (×2) — Generate a TypeScript function
615
+ 🐛 debug (×2) — Find and fix a bug
616
+ 📋 planning (×2) — Create implementation plan
617
+ 🔧 tool-calling (×1) — Structured JSON output
618
+ ⏱️ speed (×1) — Response latency
619
+ 🧩 orchestration (×2) — Multi-step analysis
620
+
621
+ Scoring: S (80+), A (65+), B (50+), C (35+), D (<35)`, "info");
622
+ return;
623
+ }
440
624
 
441
- if (!modelToTest) {
442
- ctx.ui.notify(`Model "${arg}" not found. Available models:\n${availableModels.map(m => `- ${m}`).join('\n')}`, "warning");
443
- return;
444
- }
625
+ // Get available models
626
+ const available = getAvailableModels();
627
+ if (available.length === 0) {
628
+ ctx.ui.notify("❌ No API keys detected. Set ALIBABA_CODING_PLAN_KEY, OPENAI_API_KEY, or another provider key.", "warning");
629
+ return;
630
+ }
445
631
 
446
- ctx.ui.notify(`🧪 Starting benchmark test for ${modelToTest}...`, "info");
632
+ const store = await loadStore();
447
633
 
448
- // Run the benchmark
449
- const result = await runBenchmarkTest(modelToTest);
634
+ if (arg === "all") {
635
+ // Benchmark ALL available models
636
+ ctx.ui.notify(`🚀 Starting benchmark on ${available.length} models (6 tests each)...\n`, "info");
450
637
 
451
- // Save result
452
- const summary = await loadResults();
453
- summary.testRuns.push(result);
454
- await saveResults(summary);
638
+ for (const model of available) {
639
+ ctx.ui.notify(`\n🧪 **${model.id}** (${model.provider})`, "info");
640
+ const result = await benchmarkModel(model.id, model.provider, model.baseUrl, model.apiKey, ctx);
455
641
 
456
- // Show result
457
- const statusEmoji = result.quality === "pass" ? "✅" : result.quality === "partial" ? "⚠️" : "❌";
458
- const message = `${statusEmoji} **Benchmark Complete: ${modelToTest}**
642
+ // Replace existing result for this model
643
+ store.results = store.results.filter(r => r.modelId !== model.id);
644
+ store.results.push(result);
645
+ await saveStore(store);
646
+ }
459
647
 
460
- **Score:** ${result.score}/100
461
- **Quality:** ${result.quality}
462
- **Time:** ${result.timeMs}ms
463
- **Tests Passed:** ${result.details.testsPassed}/${result.details.totalTests}
648
+ ctx.ui.notify(`\n✅ Benchmark complete! ${available.length} models tested.\n`, "info");
649
+ ctx.ui.notify(generateReport(store.results), "info");
650
+ return;
651
+ }
464
652
 
465
- ${result.details.errors.length > 0 ? `**Issues:**\n${result.details.errors.map(e => `- ${e}`).join('\n')}` : "All tests passed! 🎉"}
653
+ if (arg) {
654
+ // Benchmark specific model
655
+ const model = available.find(m => m.id.toLowerCase() === arg || m.id.toLowerCase().includes(arg));
656
+ if (!model) {
657
+ ctx.ui.notify(`Model "${arg}" not found or no API key. Available:\n${available.map(m => ` - ${m.id} (${m.provider})`).join("\n")}`, "warning");
658
+ return;
659
+ }
466
660
 
467
- Use \`/benchmark results\` to see all benchmark results.`;
661
+ ctx.ui.notify(`🧪 Benchmarking **${model.id}** (6 categories)...\n`, "info");
662
+ const result = await benchmarkModel(model.id, model.provider, model.baseUrl, model.apiKey, ctx);
663
+ store.results = store.results.filter(r => r.modelId !== model.id);
664
+ store.results.push(result);
665
+ await saveStore(store);
468
666
 
469
- ctx.ui.notify(message, "info");
667
+ ctx.ui.notify(`\n✅ **${model.id}** — Total: ${result.totalScore}/100 (avg ${result.avgTimeMs}ms)`, "info");
668
+ return;
669
+ }
470
670
 
471
- } catch (error) {
472
- ctx.ui.notify(`Benchmark failed: ${error}`, "error");
671
+ // Default: benchmark current model
672
+ // Try to find current model in available list
673
+ const currentModel = ctx.model;
674
+ if (currentModel) {
675
+ const modelConfig = available.find(m => m.id === currentModel.id);
676
+ if (modelConfig) {
677
+ ctx.ui.notify(`🧪 Benchmarking current model **${currentModel.id}** (6 categories)...\n`, "info");
678
+ const result = await benchmarkModel(modelConfig.id, modelConfig.provider, modelConfig.baseUrl, modelConfig.apiKey, ctx);
679
+ store.results = store.results.filter(r => r.modelId !== modelConfig.id);
680
+ store.results.push(result);
681
+ await saveStore(store);
682
+ ctx.ui.notify(`\n✅ **${currentModel.id}** — Total: ${result.totalScore}/100`, "info");
683
+ return;
684
+ }
473
685
  }
686
+
687
+ // Fallback: show available models
688
+ ctx.ui.notify(`Available models for benchmark:\n${available.map(m => ` - ${m.id} (${m.provider})`).join("\n")}\n\nUsage: /benchmark <model-id> or /benchmark all`, "info");
474
689
  },
475
690
  });
476
691
 
477
- /**
478
- * Show benchmark info on session start
479
- */
692
+ // Session start notification
480
693
  pi.on("session_start", async (_event, ctx) => {
481
694
  try {
482
- const summary = await loadResults();
483
- if (summary.testRuns.length > 0) {
484
- ctx.ui.notify(`🧪 Benchmark data available (${summary.testRuns.length} test runs). Use /benchmark results to view.`, "info");
695
+ const store = await loadStore();
696
+ if (store.results.length > 0) {
697
+ ctx.ui.notify(`🧪 ${store.results.length} benchmark results available. /benchmark results to view.`, "info");
485
698
  }
486
699
  } catch {
487
- // No results file yet, ignore
700
+ // ignore
488
701
  }
489
702
  });
490
- }
703
+ }