@phi-code-admin/phi-code 0.56.6 → 0.57.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/extensions/phi/README.md +34 -214
- package/extensions/phi/agents.ts +202 -0
- package/extensions/phi/benchmark.ts +611 -398
- package/extensions/phi/init.ts +447 -312
- package/extensions/phi/memory.ts +36 -18
- package/package.json +1 -1
|
@@ -1,490 +1,703 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Benchmark Extension -
|
|
2
|
+
* Benchmark Extension - Production-grade model performance testing
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
* - Code generation testing (Fibonacci function)
|
|
12
|
-
* - Performance metrics (time, quality, tokens)
|
|
13
|
-
* - Results persistence in ~/.phi/benchmark/results.json
|
|
14
|
-
* - Ranking and comparison display
|
|
4
|
+
* Tests AI models across 6 categories:
|
|
5
|
+
* 1. Code Generation — Write a function from a spec
|
|
6
|
+
* 2. Debugging — Find and fix a bug
|
|
7
|
+
* 3. Planning — Create an implementation plan
|
|
8
|
+
* 4. Tool Calling — Generate structured JSON output
|
|
9
|
+
* 5. Speed — Response latency measurement
|
|
10
|
+
* 6. Orchestration — Multi-step reasoning task
|
|
15
11
|
*
|
|
16
12
|
* Usage:
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
13
|
+
* - /benchmark — Run benchmark on current model
|
|
14
|
+
* - /benchmark all — Run on all available models
|
|
15
|
+
* - /benchmark results — Show saved results
|
|
16
|
+
* - /benchmark compare — Side-by-side model comparison
|
|
17
|
+
* - /benchmark clear — Clear all results
|
|
20
18
|
*/
|
|
21
19
|
|
|
22
|
-
import type { ExtensionAPI } from "phi-code";
|
|
20
|
+
import type { ExtensionAPI, ExtensionContext } from "phi-code";
|
|
23
21
|
import { writeFile, mkdir, readFile, access } from "node:fs/promises";
|
|
24
22
|
import { join } from "node:path";
|
|
25
23
|
import { homedir } from "node:os";
|
|
26
24
|
|
|
27
|
-
|
|
25
|
+
// ─── Types ───────────────────────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
/**
 * One benchmark task: the prompt sent to a model and the scorer applied to
 * its reply.
 */
interface TestCase {
  /** Category key under which this test's result is recorded. */
  category: "code-gen" | "debug" | "planning" | "tool-calling" | "speed" | "orchestration";
  /** Human-readable test name shown in progress notifications. */
  name: string;
  /** Full prompt text sent to the model as a single user message. */
  prompt: string;
  /** Scores the raw model reply. */
  validate: (response: string) => TestResult;
  /** Score weight (1-3) used when computing the weighted total. */
  weight: number;
}
|
|
34
|
+
|
|
35
|
+
/** Outcome of validating a single model reply. */
interface TestResult {
  /** Whether the reply met the test's pass threshold. */
  passed: boolean;
  /** Score from 0 to 100. */
  score: number;
  /** Human-readable breakdown of the individual checks. */
  details: string;
}
|
|
40
|
+
|
|
41
|
+
/** Aggregated benchmark result for one model. */
interface ModelBenchmark {
  /** Model identifier as sent to the provider API. */
  modelId: string;
  /** Display name of the model. */
  modelName: string;
  /** Name of the provider the model was called through. */
  provider: string;
  /** ISO-8601 timestamp of when the run finished. */
  timestamp: string;
  /** Per-category outcome, keyed by category name. */
  categories: Record<
    string,
    {
      score: number;
      timeMs: number;
      details: string;
    }
  >;
  /** Weighted overall score. */
  totalScore: number;
  /** Sum of the per-test times, in milliseconds. */
  totalTimeMs: number;
  /** Mean time per test, in milliseconds. */
  avgTimeMs: number;
}
|
|
44
57
|
|
|
45
|
-
interface
|
|
46
|
-
|
|
58
|
+
/** On-disk shape of the persisted benchmark results file. */
interface BenchmarkStore {
  /** Schema version; only version 2 files are accepted when loading. */
  version: 2;
  /** Saved benchmark results. */
  results: ModelBenchmark[];
  /** ISO-8601 timestamp of the last save. */
  lastUpdated: string;
}
|
|
49
63
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
64
|
+
// ─── Test Suite ──────────────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
/**
 * Build the benchmark suite: one test per category, each pairing a prompt
 * with a heuristic validator that scores the model's reply from 0 to 100.
 *
 * The validators are keyword/regex heuristics over the reply text; they do
 * not compile or execute anything the model produced.
 *
 * @returns A fresh array with the six test cases, in execution order.
 */
function createTestSuite(): TestCase[] {
  type Check = { test: boolean; detail: string };

  // Fold a list of boolean checks into a TestResult. The test passes when at
  // least `minPassed` checks succeed; the score is the percentage that did.
  const grade = (checks: Check[], minPassed: number): TestResult => {
    const passed = checks.filter(c => c.test).length;
    return {
      passed: passed >= minPassed,
      score: Math.round((passed / checks.length) * 100),
      details: checks.map(c => `${c.test ? "✅" : "❌"} ${c.detail}`).join("\n"),
    };
  };

  return [
    // 1. CODE GENERATION
    {
      category: "code-gen",
      name: "Fibonacci Function",
      weight: 2,
      prompt: `Write a TypeScript function called 'fibonacci' that:
- Takes a number n as parameter
- Returns the nth Fibonacci number
- Handles edge cases (n <= 0 returns 0, n = 1 returns 1)
- Uses iterative approach (not recursive)
- Is properly typed

Respond with ONLY the function code, no explanations.`,
      validate: (response: string) => {
        const code = extractCode(response);
        return grade(
          [
            {
              // Accept both a declaration and a function bound to a variable.
              test: /\bfunction\s+fibonacci\b|\b(?:const|let|var)\s+fibonacci\b/.test(code),
              detail: "Function named 'fibonacci'",
            },
            { test: /:\s*number/.test(code), detail: "TypeScript type annotation" },
            { test: /return/.test(code), detail: "Has return statement" },
            {
              // Require an actual loop header so words such as "before" do not count.
              test: /\b(?:for|while)\s*\(/.test(code),
              detail: "Uses iteration (not recursion)",
            },
            { test: /(<=\s*0|===?\s*0|<\s*1)/.test(code), detail: "Handles edge case n=0" },
            { test: /(===?\s*1|<=\s*1)/.test(code), detail: "Handles edge case n=1" },
          ],
          5,
        );
      },
    },

    // 2. DEBUGGING
    {
      category: "debug",
      name: "Find the Bug",
      weight: 2,
      prompt: `Find and fix the bug in this TypeScript code:

\`\`\`typescript
function mergeArrays<T>(arr1: T[], arr2: T[]): T[] {
const result = arr1;
for (let i = 0; i < arr2.length; i++) {
result.push(arr2[i]);
}
return result;
}

// Bug: calling mergeArrays modifies the original arr1
const a = [1, 2, 3];
const b = [4, 5, 6];
const merged = mergeArrays(a, b);
console.log(a); // Expected [1,2,3] but got [1,2,3,4,5,6]
\`\`\`

Explain the bug and provide the fixed code.`,
      validate: (response: string) => {
        const lower = response.toLowerCase();
        return grade(
          [
            { test: /reference|shallow|copy|spread|\[\.\.\./.test(lower), detail: "Identifies reference/copy issue" },
            {
              test: /\[\.\.\.arr1\]|\[\.\.\.arr1,|Array\.from|\.slice\(\)|structuredClone|concat/.test(response),
              detail: "Uses spread/copy/concat fix",
            },
            { test: /mutate|modify|original|side.?effect/.test(lower), detail: "Explains the mutation problem" },
            {
              test:
                /const result\s*=\s*\[/.test(response) ||
                /\.slice\(/.test(response) ||
                /\.concat\(/.test(response) ||
                /Array\.from/.test(response),
              detail: "Creates new array in fix",
            },
          ],
          3,
        );
      },
    },

    // 3. PLANNING
    {
      category: "planning",
      name: "Implementation Plan",
      weight: 2,
      prompt: `Create a detailed implementation plan for adding JWT authentication to an existing Express.js REST API.

The API currently has:
- User model with email/password
- CRUD endpoints for /users and /posts
- PostgreSQL database with Prisma ORM

Requirements:
- Login endpoint returns access + refresh tokens
- Protected routes require valid access token
- Refresh token rotation
- Token blacklisting on logout

Provide a structured plan with specific files to create/modify, dependencies to add, and implementation steps.`,
      validate: (response: string) => {
        const lower = response.toLowerCase();
        return grade(
          [
            { test: /jsonwebtoken|jwt|jose/.test(lower), detail: "Mentions JWT library" },
            {
              // "Both" means both: each token type must be mentioned.
              test: /access.?token/.test(lower) && /refresh.?token/.test(lower),
              detail: "Covers both token types",
            },
            { test: /middleware/.test(lower), detail: "Mentions auth middleware" },
            { test: /bcrypt|argon|hash/.test(lower), detail: "Addresses password hashing" },
            { test: /blacklist|revoke|invalidat/.test(lower), detail: "Addresses token revocation" },
            { test: /prisma|schema|model|migration/.test(lower), detail: "Covers database changes" },
            { test: /env|secret|config/.test(lower), detail: "Addresses secret management" },
            { test: /step|phase|\d\.|create|modify|add/.test(lower), detail: "Provides structured steps" },
          ],
          6,
        );
      },
    },

    // 4. TOOL CALLING (structured output)
    {
      category: "tool-calling",
      name: "Structured JSON Output",
      weight: 1,
      prompt: `Parse this natural language description and output ONLY a valid JSON object (no markdown, no explanation):

"Create a new user named Alice Smith, email alice@example.com, she's a software engineer at TechCorp, based in San Francisco, age 28, prefers dark mode and email notifications"

Required JSON schema:
{
"name": { "first": string, "last": string },
"email": string,
"profile": {
"occupation": string,
"company": string,
"location": string,
"age": number
},
"preferences": {
"theme": "light" | "dark",
"notifications": { "email": boolean, "push": boolean }
}
}

Output ONLY the JSON.`,
      validate: (response: string) => {
        const checks: Check[] = [];

        // Prefer a fenced block; otherwise take the outermost {...} span.
        let jsonStr = response.trim();
        const jsonMatch = response.match(/```(?:json)?\s*([\s\S]*?)```/) || response.match(/(\{[\s\S]*\})/);
        if (jsonMatch) jsonStr = jsonMatch[1].trim();

        try {
          const obj = JSON.parse(jsonStr);
          checks.push(
            { test: true, detail: "Valid JSON" },
            { test: obj?.name?.first === "Alice", detail: 'name.first = "Alice"' },
            { test: obj?.name?.last === "Smith", detail: 'name.last = "Smith"' },
            { test: obj?.email === "alice@example.com", detail: "Correct email" },
            { test: typeof obj?.profile?.age === "number" && obj.profile.age === 28, detail: "Age is number 28" },
            { test: obj?.preferences?.theme === "dark", detail: 'theme = "dark"' },
            { test: obj?.preferences?.notifications?.email === true, detail: "email notifications = true" },
          );
        } catch {
          // Unparseable reply: every check is reported as failed so the score
          // denominator matches the success path.
          checks.push(
            { test: false, detail: "Valid JSON (parse failed)" },
            { test: false, detail: "name.first" },
            { test: false, detail: "name.last" },
            { test: false, detail: "email" },
            { test: false, detail: "age" },
            { test: false, detail: "theme" },
            { test: false, detail: "notifications" },
          );
        }

        return grade(checks, 5);
      },
    },

    // 5. SPEED (simple task, measures latency)
    {
      category: "speed",
      name: "Quick Response",
      weight: 1,
      prompt: `Reply with exactly this text and nothing else: "Hello, World!"`,
      validate: (response: string) => {
        // Strip code fences first, then one layer of surrounding quotes, so a
        // reply that is both fenced and quoted still counts as an exact match.
        const trimmed = response
          .replace(/```\w*\n?/g, "")
          .trim()
          .replace(/^["']|["']$/g, "")
          .trim();
        const exact = trimmed === "Hello, World!";
        const close = trimmed.toLowerCase().includes("hello, world");
        return {
          passed: close,
          score: exact ? 100 : close ? 75 : 0,
          details: exact ? "✅ Exact match" : close ? "⚠️ Close match" : `❌ Got: "${trimmed.substring(0, 50)}"`,
        };
      },
    },

    // 6. ORCHESTRATION (multi-step reasoning)
    {
      category: "orchestration",
      name: "Multi-Step Analysis",
      weight: 2,
      prompt: `Analyze this scenario step by step:

A Node.js microservice has these symptoms:
1. Response times gradually increase from 50ms to 3000ms over 24 hours
2. Memory usage grows steadily from 200MB to 1.5GB
3. The service handles file uploads (multipart/form-data)
4. After restart, everything returns to normal
5. No errors in logs
6. Database queries remain fast (<10ms)

Tasks:
A) Identify the most likely root cause
B) List 3 specific things to check in the code
C) Propose a fix with code example
D) Suggest monitoring to prevent recurrence

Be specific and technical.`,
      validate: (response: string) => {
        const lower = response.toLowerCase();
        return grade(
          [
            { test: /memory.?leak|leak/.test(lower), detail: "Identifies memory leak" },
            { test: /stream|buffer|file|upload|temp|cleanup/.test(lower), detail: "Links to file upload handling" },
            { test: /close|destroy|cleanup|dispose|gc|garbage/.test(lower), detail: "Suggests resource cleanup" },
            {
              test: /event.?listener|handler|remove|off/.test(lower) || /stream|pipe/.test(lower),
              detail: "Checks for handler/stream leaks",
            },
            {
              test:
                /heapdump|heap.?snapshot|inspect|profile|--max-old-space/.test(lower) ||
                /process\.memoryUsage/.test(lower),
              detail: "Suggests debugging tools",
            },
            { test: /monitor|alert|metric|prometheus|grafana|threshold/.test(lower), detail: "Suggests monitoring" },
          ],
          4,
        );
      },
    },
  ];
}
|
|
154
304
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
305
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────
|
|
306
|
+
|
|
307
|
+
/**
 * Pull the code out of a model reply.
 *
 * If the reply contains a triple-backtick fence, the content of the first
 * fenced block is returned without its language tag; otherwise the whole
 * reply is returned. The result is trimmed in both cases.
 *
 * A tag is recognised when it is one of typescript/ts/javascript/js, or when
 * it is any word that runs to the end of the opening fence line (for example
 * `tsx`, `json`, `python`). Without the second rule a tag such as `tsx` was
 * only partly consumed and its remainder leaked into the extracted code.
 *
 * @param response Raw text returned by the model.
 * @returns The extracted code, trimmed.
 */
function extractCode(response: string): string {
  const fence = /```(?:(?:typescript|ts|javascript|js)\b|[\w+#.-]*(?=\r?\n))?\s*([\s\S]*?)```/;
  const match = fence.exec(response);
  return (match?.[1] ?? response).trim();
}
|
|
311
|
+
|
|
312
|
+
/** Static description of one model provider reachable over a chat-completions HTTP API. */
interface ProviderConfig {
  /** Provider name recorded alongside benchmark results. */
  name: string;
  /** Name of the environment variable that holds the provider's API key. */
  envVar: string;
  /** Base URL to which `/chat/completions` is appended. */
  baseUrl: string;
  /** Model ids to benchmark for this provider; may be empty. */
  models: string[];
}
|
|
318
|
+
|
|
319
|
+
/**
 * List the built-in provider definitions.
 *
 * Providers whose `models` list is empty contribute no models to a run.
 *
 * @returns A freshly built array on every call.
 */
function getProviderConfigs(): ProviderConfig[] {
  // Rows are [name, envVar, baseUrl, models].
  const table: Array<[string, string, string, string[]]> = [
    [
      "alibaba-codingplan",
      "ALIBABA_CODING_PLAN_KEY",
      "https://coding-intl.dashscope.aliyuncs.com/v1",
      [
        "qwen3.5-plus",
        "qwen3-max-2026-01-23",
        "qwen3-coder-plus",
        "qwen3-coder-next",
        "kimi-k2.5",
        "glm-5",
        "glm-4.7",
        "MiniMax-M2.5",
      ],
    ],
    ["openai", "OPENAI_API_KEY", "https://api.openai.com/v1", ["gpt-4o", "gpt-4o-mini"]],
    ["anthropic-openai", "ANTHROPIC_API_KEY", "https://api.anthropic.com/v1", []],
    ["openrouter", "OPENROUTER_API_KEY", "https://openrouter.ai/api/v1", []],
    ["groq", "GROQ_API_KEY", "https://api.groq.com/openai/v1", []],
  ];

  return table.map(([name, envVar, baseUrl, models]) => ({ name, envVar, baseUrl, models }));
}
|
|
353
|
+
|
|
354
|
+
/**
 * Expand the provider table into one entry per model that can be called now.
 *
 * A provider is included only when its API-key environment variable is set
 * to a non-empty value; each of its models then yields one entry carrying
 * that key.
 *
 * @returns Model entries in provider order, then model order.
 */
function getAvailableModels(): Array<{ id: string; provider: string; baseUrl: string; apiKey: string }> {
  return getProviderConfigs().flatMap(({ name, envVar, baseUrl, models }) => {
    const apiKey = process.env[envVar];
    if (!apiKey) return [];
    return models.map(id => ({ id, provider: name, baseUrl, apiKey }));
  });
}
|
|
187
373
|
|
|
188
|
-
|
|
189
|
-
|
|
374
|
+
/**
 * Send one prompt to a chat-completions endpoint and time the round trip.
 *
 * The request is a single user message with `max_tokens: 4096` and
 * `temperature: 0.1`. It is aborted once `timeoutMs` has elapsed; the timer
 * is always cleared before returning or throwing.
 *
 * @param baseUrl Provider base URL; `/chat/completions` is appended (trailing slashes are ignored).
 * @param apiKey Bearer token sent in the `Authorization` header.
 * @param model Model id placed in the request body.
 * @param prompt Content of the user message.
 * @param timeoutMs Abort deadline in milliseconds (default 60000).
 * @returns The reply text (empty string when the reply has no text) and the elapsed wall-clock time.
 * @throws Error `API error <status>: <body excerpt>` on a non-2xx response.
 * @throws Error `Request timed out after <timeoutMs>ms` when the deadline fires.
 */
async function callModel(
  baseUrl: string,
  apiKey: string,
  model: string,
  prompt: string,
  timeoutMs: number = 60000,
): Promise<{ response: string; timeMs: number }> {
  const startTime = Date.now();

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    // Drop trailing slashes so a base URL ending in "/" does not yield "//chat/completions".
    const endpoint = `${baseUrl.replace(/\/+$/, "")}/chat/completions`;
    const res = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${apiKey}`,
      },
      body: JSON.stringify({
        model,
        messages: [{ role: "user", content: prompt }],
        max_tokens: 4096,
        temperature: 0.1,
      }),
      signal: controller.signal,
    });

    if (!res.ok) {
      const errorBody = await res.text().catch(() => "");
      throw new Error(`API error ${res.status}: ${errorBody.substring(0, 200)}`);
    }

    const data: unknown = await res.json();
    const content = (data as { choices?: Array<{ message?: { content?: unknown } }> } | null)?.choices?.[0]
      ?.message?.content;

    // The declared return type is string and validators call string methods
    // on it, so anything that is not text must not leak through.
    let response = "";
    if (typeof content === "string") {
      response = content;
    } else if (Array.isArray(content)) {
      // NOTE(review): assumes content "parts" look like { text: string } — confirm against the providers in use.
      response = content
        .map(part => (typeof part === "string" ? part : typeof part?.text === "string" ? part.text : ""))
        .join("");
    }

    return { response, timeMs: Date.now() - startTime };
  } catch (error: unknown) {
    // Report our own deadline explicitly instead of a generic abort error.
    if (controller.signal.aborted) {
      throw new Error(`Request timed out after ${timeoutMs}ms`);
    }
    throw error;
  } finally {
    clearTimeout(timeout);
  }
}
|
|
219
416
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
score: number;
|
|
226
|
-
} {
|
|
227
|
-
if (!result.compilable) {
|
|
228
|
-
return { quality: "fail", score: 0 };
|
|
229
|
-
}
|
|
417
|
+
// ─── Extension ───────────────────────────────────────────────────────────
|
|
418
|
+
|
|
419
|
+
export default function benchmarkExtension(pi: ExtensionAPI) {
|
|
420
|
+
const benchmarkDir = join(homedir(), ".phi", "benchmark");
|
|
421
|
+
const resultsPath = join(benchmarkDir, "results.json");
|
|
230
422
|
|
|
231
|
-
|
|
232
|
-
|
|
423
|
+
/** Create the benchmark directory, including any missing parent directories. */
async function ensureDir(): Promise<void> {
  const options = { recursive: true };
  await mkdir(benchmarkDir, options);
}
|
|
233
426
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
427
|
+
/**
 * Load the persisted benchmark store.
 *
 * Best effort: a missing, unreadable or unparseable file, a file with a
 * different schema version, or a version-2 file whose `results` is not an
 * array all yield a fresh empty store instead of an error. The last case
 * matters because callers read `results.length` and sort `results`.
 *
 * @returns The stored data when it is a usable version-2 store, otherwise an empty store.
 */
async function loadStore(): Promise<BenchmarkStore> {
  const emptyStore = (): BenchmarkStore => ({
    version: 2,
    results: [],
    lastUpdated: new Date().toISOString(),
  });

  try {
    await access(resultsPath);
    const content = await readFile(resultsPath, "utf-8");
    const store = JSON.parse(content) as Partial<BenchmarkStore> | null;
    if (store?.version === 2 && Array.isArray(store.results)) {
      return store as BenchmarkStore;
    }
    return emptyStore();
  } catch {
    // Deliberate best effort: any I/O or parse failure starts from a clean slate.
    return emptyStore();
  }
}
|
|
242
438
|
|
|
439
|
+
/**
 * Persist the store to the results file as pretty-printed JSON.
 *
 * The benchmark directory is created first if needed. `store.lastUpdated`
 * is overwritten in place with the current time before serialising, so the
 * caller's object is modified.
 *
 * @param store Store to write.
 */
async function saveStore(store: BenchmarkStore): Promise<void> {
  await ensureDir();
  store.lastUpdated = new Date().toISOString();
  const serialized = JSON.stringify(store, null, 2);
  await writeFile(resultsPath, serialized, "utf-8");
}
|
|
444
|
+
|
|
243
445
|
/**
 * Run the full test suite against a single model and aggregate the results.
 *
 * Tests run sequentially. Progress and per-test outcomes are reported
 * through `ctx.ui.notify`. A test whose request or validation throws is
 * recorded with score 0 and is charged the full per-test timeout, so that a
 * failing model never looks faster than one that answered.
 *
 * @param modelId Model id sent to the provider; also used as the display name.
 * @param provider Provider name stored with the result.
 * @param baseUrl Provider base URL passed to `callModel`.
 * @param apiKey API key passed to `callModel`.
 * @param ctx Extension context used for UI notifications.
 * @returns Per-category outcomes, the weighted total score and timing totals.
 */
async function benchmarkModel(
  modelId: string,
  provider: string,
  baseUrl: string,
  apiKey: string,
  ctx: ExtensionContext,
): Promise<ModelBenchmark> {
  // One budget for both the request deadline and the time charged to a failed
  // test; previously these were 90000 and 60000 and disagreed.
  const testTimeoutMs = 90000;

  const tests = createTestSuite();
  const categories: ModelBenchmark["categories"] = {};
  let totalTime = 0;

  for (const test of tests) {
    ctx.ui.notify(` ⏳ ${test.category}: ${test.name}...`, "info");

    let entry: { score: number; timeMs: number; details: string };
    try {
      const { response, timeMs } = await callModel(baseUrl, apiKey, modelId, test.prompt, testTimeoutMs);
      const result = test.validate(response);
      entry = { score: result.score, timeMs, details: result.details };

      const emoji = result.score >= 80 ? "✅" : result.score >= 50 ? "⚠️" : "❌";
      ctx.ui.notify(` ${emoji} ${test.category}: ${result.score}/100 (${timeMs}ms)`, "info");
    } catch (error: unknown) {
      // Use the bare message so the text does not read "Error: Error: ...".
      const reason = error instanceof Error ? error.message : String(error);
      entry = { score: 0, timeMs: testTimeoutMs, details: `Error: ${reason}` };
      ctx.ui.notify(` ❌ ${test.category}: Error — ${reason.substring(0, 100)}`, "error");
    }

    // Time is added exactly once per test, whichever branch produced the entry.
    categories[test.category] = entry;
    totalTime += entry.timeMs;
  }

  // Calculate weighted total
  const weights = new Map<string, number>(tests.map((t): [string, number] => [t.category, t.weight]));

  let weightedSum = 0;
  let totalWeight = 0;
  for (const [cat, data] of Object.entries(categories)) {
    const w = weights.get(cat) ?? 1;
    weightedSum += data.score * w;
    totalWeight += w;
  }

  // Guard the divisions so an empty suite yields 0 rather than NaN.
  const totalScore = totalWeight > 0 ? Math.round(weightedSum / totalWeight) : 0;
  const avgTimeMs = tests.length > 0 ? Math.round(totalTime / tests.length) : 0;

  return {
    modelId,
    modelName: modelId,
    provider,
    timestamp: new Date().toISOString(),
    categories,
    totalScore,
    totalTimeMs: totalTime,
    avgTimeMs,
  };
}
|
|
340
514
|
|
|
341
515
|
/**
 * Generate the formatted comparison report.
 *
 * The report has a leaderboard sorted by total score (with an S/A/B/C/D
 * tier), a fixed-width per-category score table, the best model per
 * category, and a footer with the most recent result timestamp.
 *
 * @param results Saved benchmark results; the array is not modified.
 * @returns Markdown-style report text, or a hint to run the benchmark when there are no results.
 */
function generateReport(results: ModelBenchmark[]): string {
  if (results.length === 0) return "No benchmark results yet. Run `/benchmark` to start.";

  // Sort by totalScore desc
  const sorted = [...results].sort((a, b) => b.totalScore - a.totalScore);
  const categories = ["code-gen", "debug", "planning", "tool-calling", "speed", "orchestration"];

  let report = "🏆 **Phi Code Benchmark Results**\n\n";

  // Leaderboard
  report += "**Leaderboard:**\n";
  sorted.forEach((r, i) => {
    const medal = i === 0 ? "🥇" : i === 1 ? "🥈" : i === 2 ? "🥉" : `${i + 1}.`;
    const tier =
      r.totalScore >= 80 ? "S" : r.totalScore >= 65 ? "A" : r.totalScore >= 50 ? "B" : r.totalScore >= 35 ? "C" : "D";
    report += `${medal} **${r.modelId}** — ${r.totalScore}/100 [${tier}] (avg ${r.avgTimeMs}ms)\n`;
  });

  // Category breakdown
  report += "\n**Category Breakdown:**\n```\n";
  // Keep the header free of its newline so the rule below has the same width.
  const header = "Model".padEnd(25) + categories.map(c => c.substring(0, 8).padEnd(10)).join("") + "TOTAL";
  report += header + "\n";
  report += "-".repeat(header.length) + "\n";

  for (const r of sorted) {
    let line = r.modelId.substring(0, 24).padEnd(25);
    for (const cat of categories) {
      const score = r.categories[cat]?.score ?? "-";
      line += String(score).padEnd(10);
    }
    line += String(r.totalScore);
    report += line + "\n";
  }
  report += "```\n";

  // Best model per category
  report += "\n**Best per Category:**\n";
  for (const cat of categories) {
    let best = { model: "none", score: -1 };
    for (const r of sorted) {
      const s = r.categories[cat]?.score ?? 0;
      if (s > best.score) {
        best = { model: r.modelId, score: s };
      }
    }
    report += `- ${cat}: **${best.model}** (${best.score}/100)\n`;
  }

  // Most recent run, not the top scorer's run. The timestamps written by this
  // file come from Date#toISOString, so string order is chronological order.
  const latest =
    results
      .map(r => r.timestamp)
      .filter(t => typeof t === "string")
      .sort()
      .pop() ?? "N/A";
  report += `\n_Last updated: ${latest}_`;
  return report;
}
|
|
387
568
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
*/
|
|
569
|
+
// ─── Command ─────────────────────────────────────────────────────────
|
|
570
|
+
|
|
391
571
|
// `/benchmark` command. Sub-commands, matched case-insensitively:
//   results | report  — show the saved report
//   compare           — same report, but only once at least two models have results
//   clear             — overwrite the store with an empty result list
//   help | ?          — usage text
//   all               — benchmark every model returned by getAvailableModels()
//   <model-id>        — benchmark one model (exact id first, then substring match)
//   (no argument)     — benchmark ctx.model when it is in the available list,
//                       otherwise list the available models
pi.registerCommand("benchmark", {
	description: "Run AI model benchmarks (6 categories: code-gen, debug, planning, tool-calling, speed, orchestration)",
	handler: async (args, ctx) => {
		const arg = args.trim().toLowerCase();

		// Show results
		if (arg === "results" || arg === "report") {
			const saved = await loadStore();
			ctx.ui.notify(generateReport(saved.results), "info");
			return;
		}

		// Compare: prints the same report as `results`, but refuses to run
		// until there are at least two models to put side by side.
		if (arg === "compare") {
			const saved = await loadStore();
			if (saved.results.length < 2) {
				ctx.ui.notify("Need at least 2 model results to compare. Run `/benchmark all` first.", "info");
				return;
			}
			ctx.ui.notify(generateReport(saved.results), "info");
			return;
		}

		// Clear
		if (arg === "clear") {
			await saveStore({ version: 2, results: [], lastUpdated: new Date().toISOString() });
			ctx.ui.notify("🗑️ All benchmark results cleared.", "info");
			return;
		}

		// Help
		if (arg === "help" || arg === "?") {
			ctx.ui.notify(`**Phi Code Benchmark** — 6 categories, real API calls

Commands:
/benchmark Run on current model
/benchmark all Run on ALL available models
/benchmark <model-id> Run on a specific model
/benchmark results Show saved results
/benchmark compare Side-by-side comparison
/benchmark clear Clear all results

Categories tested (weighted):
⚡ code-gen (×2) — Generate a TypeScript function
🐛 debug (×2) — Find and fix a bug
📋 planning (×2) — Create implementation plan
🔧 tool-calling (×1) — Structured JSON output
⏱️ speed (×1) — Response latency
🧩 orchestration (×2) — Multi-step analysis

Scoring: S (80+), A (65+), B (50+), C (35+), D (<35)`, "info");
			return;
		}

		// Everything below runs real benchmarks, so models must be available.
		const available = getAvailableModels();
		if (available.length === 0) {
			ctx.ui.notify("❌ No API keys detected. Set ALIBABA_CODING_PLAN_KEY, OPENAI_API_KEY, or another provider key.", "warning");
			return;
		}

		const store = await loadStore();

		// Bullet list of benchmarkable models, shared by the "not found" and fallback messages.
		const describeAvailable = (): string =>
			available.map(m => ` - ${m.id} (${m.provider})`).join("\n");

		// Benchmark one model, replace any earlier result for the same model id,
		// and persist the store immediately so finished runs survive a later failure.
		const runAndStore = async (model: (typeof available)[number]) => {
			const result = await benchmarkModel(model.id, model.provider, model.baseUrl, model.apiKey, ctx);
			store.results = store.results.filter(r => r.modelId !== model.id);
			store.results.push(result);
			await saveStore(store);
			return result;
		};

		if (arg === "all") {
			// Benchmark ALL available models, one after another.
			ctx.ui.notify(`🚀 Starting benchmark on ${available.length} models (6 tests each)...\n`, "info");

			for (const model of available) {
				ctx.ui.notify(`\n🧪 **${model.id}** (${model.provider})`, "info");
				await runAndStore(model);
			}

			ctx.ui.notify(`\n✅ Benchmark complete! ${available.length} models tested.\n`, "info");
			ctx.ui.notify(generateReport(store.results), "info");
			return;
		}

		if (arg) {
			// Benchmark a specific model. An exact id match must win over a
			// substring match: with a single combined predicate, a partial hit
			// earlier in the list (e.g. "gpt-4-turbo" for the query "gpt-4")
			// would shadow the exact model further down.
			const model =
				available.find(m => m.id.toLowerCase() === arg) ??
				available.find(m => m.id.toLowerCase().includes(arg));
			if (!model) {
				ctx.ui.notify(`Model "${arg}" not found or no API key. Available:\n${describeAvailable()}`, "warning");
				return;
			}

			ctx.ui.notify(`🧪 Benchmarking **${model.id}** (6 categories)...\n`, "info");
			const result = await runAndStore(model);

			ctx.ui.notify(`\n✅ **${model.id}** — Total: ${result.totalScore}/100 (avg ${result.avgTimeMs}ms)`, "info");
			return;
		}

		// Default: benchmark the session's current model, provided it is one of
		// the models we can actually call.
		const currentModel = ctx.model;
		if (currentModel) {
			const modelConfig = available.find(m => m.id === currentModel.id);
			if (modelConfig) {
				ctx.ui.notify(`🧪 Benchmarking current model **${currentModel.id}** (6 categories)...\n`, "info");
				const result = await runAndStore(modelConfig);
				ctx.ui.notify(`\n✅ **${currentModel.id}** — Total: ${result.totalScore}/100`, "info");
				return;
			}
		}

		// Fallback: no usable current model, so show what can be benchmarked.
		ctx.ui.notify(`Available models for benchmark:\n${describeAvailable()}\n\nUsage: /benchmark <model-id> or /benchmark all`, "info");
	},
});
|
|
476
691
|
|
|
477
|
-
|
|
478
|
-
* Show benchmark info on session start
|
|
479
|
-
*/
|
|
692
|
+
// Session start notification
|
|
480
693
|
pi.on("session_start", async (_event, ctx) => {
	// Best-effort hint: tell the user when saved benchmark results exist.
	try {
		const { results } = await loadStore();
		if (results.length === 0) {
			return;
		}
		ctx.ui.notify(`🧪 ${results.length} benchmark results available. /benchmark results to view.`, "info");
	} catch {
		// Deliberately swallowed: a failure here must not interrupt session start.
	}
});
|
|
490
|
-
}
|
|
703
|
+
}
|