@kernel.chat/kbot 3.51.0 → 3.52.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -9
- package/dist/agent-protocol.test.d.ts +2 -0
- package/dist/agent-protocol.test.d.ts.map +1 -0
- package/dist/agent-protocol.test.js +730 -0
- package/dist/agent-protocol.test.js.map +1 -0
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +34 -10
- package/dist/agent.js.map +1 -1
- package/dist/auth.js +3 -3
- package/dist/auth.js.map +1 -1
- package/dist/bench.d.ts +64 -0
- package/dist/bench.d.ts.map +1 -0
- package/dist/bench.js +973 -0
- package/dist/bench.js.map +1 -0
- package/dist/cli.js +144 -29
- package/dist/cli.js.map +1 -1
- package/dist/cloud-agent.d.ts +77 -0
- package/dist/cloud-agent.d.ts.map +1 -0
- package/dist/cloud-agent.js +743 -0
- package/dist/cloud-agent.js.map +1 -0
- package/dist/context.test.d.ts +2 -0
- package/dist/context.test.d.ts.map +1 -0
- package/dist/context.test.js +561 -0
- package/dist/context.test.js.map +1 -0
- package/dist/evolution.d.ts.map +1 -1
- package/dist/evolution.js +4 -1
- package/dist/evolution.js.map +1 -1
- package/dist/github-release.d.ts +61 -0
- package/dist/github-release.d.ts.map +1 -0
- package/dist/github-release.js +451 -0
- package/dist/github-release.js.map +1 -0
- package/dist/graph-memory.test.d.ts +2 -0
- package/dist/graph-memory.test.d.ts.map +1 -0
- package/dist/graph-memory.test.js +946 -0
- package/dist/graph-memory.test.js.map +1 -0
- package/dist/init-science.d.ts +43 -0
- package/dist/init-science.d.ts.map +1 -0
- package/dist/init-science.js +477 -0
- package/dist/init-science.js.map +1 -0
- package/dist/lab.d.ts +45 -0
- package/dist/lab.d.ts.map +1 -0
- package/dist/lab.js +1020 -0
- package/dist/lab.js.map +1 -0
- package/dist/lsp-deep.d.ts +101 -0
- package/dist/lsp-deep.d.ts.map +1 -0
- package/dist/lsp-deep.js +689 -0
- package/dist/lsp-deep.js.map +1 -0
- package/dist/memory.test.d.ts +2 -0
- package/dist/memory.test.d.ts.map +1 -0
- package/dist/memory.test.js +369 -0
- package/dist/memory.test.js.map +1 -0
- package/dist/multi-session.d.ts +164 -0
- package/dist/multi-session.d.ts.map +1 -0
- package/dist/multi-session.js +885 -0
- package/dist/multi-session.js.map +1 -0
- package/dist/self-eval.d.ts.map +1 -1
- package/dist/self-eval.js +5 -2
- package/dist/self-eval.js.map +1 -1
- package/dist/streaming.d.ts.map +1 -1
- package/dist/streaming.js +0 -1
- package/dist/streaming.js.map +1 -1
- package/dist/teach.d.ts +136 -0
- package/dist/teach.d.ts.map +1 -0
- package/dist/teach.js +915 -0
- package/dist/teach.js.map +1 -0
- package/dist/telemetry.d.ts +1 -1
- package/dist/telemetry.d.ts.map +1 -1
- package/dist/telemetry.js.map +1 -1
- package/dist/tools/browser-agent.js +2 -2
- package/dist/tools/browser-agent.js.map +1 -1
- package/dist/tools/forge.d.ts.map +1 -1
- package/dist/tools/forge.js +15 -26
- package/dist/tools/forge.js.map +1 -1
- package/dist/tools/git.d.ts.map +1 -1
- package/dist/tools/git.js +10 -7
- package/dist/tools/git.js.map +1 -1
- package/dist/voice-realtime.d.ts +54 -0
- package/dist/voice-realtime.d.ts.map +1 -0
- package/dist/voice-realtime.js +805 -0
- package/dist/voice-realtime.js.map +1 -0
- package/package.json +10 -3
package/dist/bench.js
ADDED
|
@@ -0,0 +1,973 @@
|
|
|
1
|
+
// kbot bench — Self-evaluation and benchmarking system
|
|
2
|
+
// Runs standardized coding tasks and scores kbot's performance
|
|
3
|
+
//
|
|
4
|
+
// Usage:
|
|
5
|
+
// kbot bench # Run all benchmarks
|
|
6
|
+
// kbot bench --category codegen # Run only code generation tasks
|
|
7
|
+
// kbot bench --difficulty hard # Run only hard tasks
|
|
8
|
+
// kbot bench --compare # Compare last two runs
|
|
9
|
+
// kbot bench --history # Show all past runs
|
|
10
|
+
//
|
|
11
|
+
// Scoring (per task):
|
|
12
|
+
// Pattern match (50%) — how many expected regex patterns the response matched
|
|
13
|
+
// Tool usage (30%) — whether correct tools were invoked
|
|
14
|
+
// Speed (20%) — response time relative to timeout budget
|
|
15
|
+
//
|
|
16
|
+
// Results saved to ~/.kbot/bench/ as timestamped JSON files.
|
|
17
|
+
import { existsSync, mkdirSync, writeFileSync, readFileSync, readdirSync } from 'node:fs';
|
|
18
|
+
import { join } from 'node:path';
|
|
19
|
+
import { homedir } from 'node:os';
|
|
20
|
+
import chalk from 'chalk';
|
|
21
|
+
import { runAgent } from './agent.js';
|
|
22
|
+
import { getByokProvider, getProviderModel } from './auth.js';
|
|
23
|
+
import { createRequire } from 'node:module';
|
|
24
|
+
const __require = createRequire(import.meta.url);
|
|
25
|
+
const VERSION = __require('../package.json').version;
|
|
26
|
+
// ── Paths ──
|
|
27
|
+
/** Directory under the user's home where benchmark runs are stored as JSON files. */
const BENCH_DIR = join(homedir(), '.kbot', 'bench');
/**
 * Make sure the benchmark results directory exists.
 *
 * `mkdirSync` with `recursive: true` creates any missing parent directories
 * and does nothing when the directory is already present, so no separate
 * existence check is needed (such a check would also be racy: the directory
 * could appear or vanish between the check and the call).
 *
 * @throws {Error} if the directory cannot be created (e.g. the path exists
 *   but is not a directory, or the location is not writable).
 */
function ensureBenchDir() {
    mkdirSync(BENCH_DIR, { recursive: true });
}
|
|
32
|
+
// ── Benchmark Tasks (20 tasks across 6 categories) ──
|
|
33
|
+
/**
 * Built-in benchmark tasks.
 *
 * Each task has:
 *   - id:               unique task identifier
 *   - category:         'codegen' | 'bugfix' | 'refactor' | 'explain' | 'research' | 'science'
 *   - difficulty:       'easy' | 'medium' | 'hard'
 *   - prompt:           text sent to the agent
 *   - expectedPatterns: regular-expression sources; scorePatterns compiles each
 *                       with the `i` and `s` flags and tests it against the response
 *   - expectedTools:    (optional) tool names the agent is expected to invoke
 *   - maxTokens:        token budget for the task (NOTE(review): not read by the
 *                       scoring code in this file — confirm where it is consumed)
 *   - timeoutMs:        time budget in milliseconds; also the basis of the speed score
 */
const BENCH_TASKS = [
    // ── Code Generation (5 tasks) ──
    {
        id: 'codegen-fibonacci',
        category: 'codegen',
        difficulty: 'easy',
        prompt: 'Write a TypeScript function called `fibonacci(n: number): number` that returns the nth Fibonacci number. Use memoization for O(n) performance. Include the function signature and implementation only.',
        expectedPatterns: [
            'function\\s+fibonacci',
            'number\\s*\\)',
            '(memo|cache|Map|Record|\\{\\})',
            'return',
        ],
        maxTokens: 500,
        timeoutMs: 15000,
    },
    {
        id: 'codegen-debounce',
        category: 'codegen',
        difficulty: 'medium',
        prompt: 'Write a TypeScript debounce function with this signature: `function debounce<T extends (...args: any[]) => void>(fn: T, delayMs: number): T`. It should cancel pending invocations when called again within the delay window. Return the implementation.',
        expectedPatterns: [
            'function\\s+debounce',
            'setTimeout',
            'clearTimeout',
            'delayMs|delay|ms|wait',
        ],
        maxTokens: 600,
        timeoutMs: 20000,
    },
    {
        id: 'codegen-lru-cache',
        category: 'codegen',
        difficulty: 'hard',
        prompt: 'Implement an LRU (Least Recently Used) cache in TypeScript with O(1) get and put operations. The class should be `LRUCache<K, V>` with constructor `(capacity: number)`, `get(key: K): V | undefined`, and `put(key: K, value: V): void`. Use a Map for O(1) lookups.',
        expectedPatterns: [
            'class\\s+LRUCache',
            'capacity',
            'get\\s*\\(',
            'put\\s*\\(',
            'Map|map',
            'delete|remove',
        ],
        maxTokens: 1000,
        timeoutMs: 30000,
    },
    {
        id: 'codegen-promise-all',
        category: 'codegen',
        difficulty: 'medium',
        prompt: 'Implement a custom `promiseAll` function in TypeScript that behaves like `Promise.all`. Signature: `function promiseAll<T>(promises: Promise<T>[]): Promise<T[]>`. It should reject immediately if any promise rejects, and resolve with all results in order when all succeed.',
        expectedPatterns: [
            'function\\s+promiseAll',
            'Promise',
            'resolve',
            'reject',
            'length|count',
        ],
        maxTokens: 600,
        timeoutMs: 20000,
    },
    {
        id: 'codegen-event-emitter',
        category: 'codegen',
        difficulty: 'hard',
        prompt: 'Write a type-safe EventEmitter class in TypeScript. It should support `on(event, listener)`, `off(event, listener)`, `emit(event, ...args)`, and `once(event, listener)`. Use generics so that event names and payload types are checked at compile time.',
        expectedPatterns: [
            'class\\s+EventEmitter',
            'on\\s*[(<]',
            'off\\s*[(<]',
            'emit\\s*[(<]',
            'once\\s*[(<]',
            '(Map|Record|listeners|handlers)',
        ],
        maxTokens: 1000,
        timeoutMs: 30000,
    },
    // ── Bug Fixing (4 tasks) ──
    {
        id: 'bugfix-off-by-one',
        category: 'bugfix',
        difficulty: 'easy',
        prompt: `Find and fix the bug in this function:

\`\`\`typescript
function binarySearch(arr: number[], target: number): number {
let left = 0
let right = arr.length
while (left < right) {
const mid = Math.floor((left + right) / 2)
if (arr[mid] === target) return mid
if (arr[mid] < target) left = mid
else right = mid
}
return -1
}
\`\`\`

The function enters an infinite loop for some inputs. Explain the bug and provide the corrected version.`,
        expectedPatterns: [
            '(left\\s*=\\s*mid\\s*\\+\\s*1|left\\s*\\+\\s*1|mid\\s*\\+\\s*1)',
            '(infinite|loop|off.by.one|never.advance|stuck)',
            'function\\s+binarySearch',
        ],
        maxTokens: 800,
        timeoutMs: 20000,
    },
    {
        id: 'bugfix-closure-loop',
        category: 'bugfix',
        difficulty: 'medium',
        prompt: `Find and fix the bug in this code:

\`\`\`javascript
function createCounters(n) {
const counters = []
for (var i = 0; i < n; i++) {
counters.push(function() { return i })
}
return counters
}
// createCounters(3) returns [f, f, f] — all return 3 instead of 0, 1, 2
\`\`\`

Explain the closure/scoping bug and provide the fix.`,
        expectedPatterns: [
            '(let|const|IIFE|closure|scope|block.scop)',
            '(var\\s+i|shared|captured|reference)',
            '(let\\s+i|\\(function|=>)',
        ],
        maxTokens: 800,
        timeoutMs: 20000,
    },
    {
        id: 'bugfix-async-race',
        category: 'bugfix',
        difficulty: 'hard',
        prompt: `Find and fix the race condition in this React hook:

\`\`\`typescript
function useSearch(query: string) {
const [results, setResults] = useState([])

useEffect(() => {
async function search() {
const data = await fetchResults(query)
setResults(data)
}
search()
}, [query])

return results
}
\`\`\`

If the user types quickly, stale results from earlier requests can overwrite newer ones. Explain the problem and provide the fix with cleanup.`,
        expectedPatterns: [
            '(race.condition|stale|out.of.order|cancelled|abort|cleanup)',
            '(AbortController|cancelled|ignore|flag|stale)',
            'return\\s*(\\(\\)|function|\\(\\)\\s*=>|\\{)',
        ],
        maxTokens: 1000,
        timeoutMs: 25000,
    },
    {
        id: 'bugfix-memory-leak',
        category: 'bugfix',
        difficulty: 'medium',
        prompt: `Find and fix the memory leak in this Node.js code:

\`\`\`typescript
const cache = new Map<string, Buffer>()

app.get('/image/:id', async (req, res) => {
const id = req.params.id
if (!cache.has(id)) {
const buffer = await fs.readFile(\`/images/\${id}.png\`)
cache.set(id, buffer)
}
res.send(cache.get(id))
})
\`\`\`

The server eventually runs out of memory. Explain why and provide a fix.`,
        expectedPatterns: [
            '(memory.leak|unbounded|grows.forever|no.eviction|no.limit)',
            '(LRU|max.size|TTL|expir|evict|limit|WeakRef|delete)',
            '(cache\\.size|Map\\.size|bound|cap)',
        ],
        maxTokens: 800,
        timeoutMs: 20000,
    },
    // ── Refactoring (3 tasks) ──
    {
        id: 'refactor-extract-function',
        category: 'refactor',
        difficulty: 'easy',
        prompt: `Refactor this code by extracting repeated logic into helper functions:

\`\`\`typescript
function processOrders(orders: Order[]) {
const results = []
for (const order of orders) {
const subtotal = order.items.reduce((sum, item) => sum + item.price * item.quantity, 0)
const tax = subtotal * 0.08
const shipping = subtotal > 100 ? 0 : 9.99
const total = subtotal + tax + shipping
results.push({ orderId: order.id, subtotal, tax, shipping, total })
}

const totals = results.map(r => r.total)
const average = totals.reduce((sum, t) => sum + t, 0) / totals.length
const max = Math.max(...totals)
const min = Math.min(...totals)

return { results, stats: { average, max, min } }
}
\`\`\`

Extract at least two helper functions. Show the refactored code.`,
        expectedPatterns: [
            'function\\s+calc(ulate)?(Subtotal|Total|Tax|Shipping|Order)',
            'function\\s+(calc|compute|get)',
            '(reduce|map)',
        ],
        maxTokens: 1000,
        timeoutMs: 20000,
    },
    {
        id: 'refactor-strategy-pattern',
        category: 'refactor',
        difficulty: 'hard',
        prompt: `Refactor this switch-based code to use the strategy pattern:

\`\`\`typescript
function formatOutput(data: any, format: string): string {
switch (format) {
case 'json':
return JSON.stringify(data, null, 2)
case 'csv':
const headers = Object.keys(data[0]).join(',')
const rows = data.map((row: any) => Object.values(row).join(','))
return [headers, ...rows].join('\\n')
case 'xml':
return data.map((item: any) =>
'<item>' + Object.entries(item).map(([k, v]) => \`<\${k}>\${v}</\${k}>\`).join('') + '</item>'
).join('\\n')
case 'yaml':
return data.map((item: any) =>
Object.entries(item).map(([k, v]) => \`\${k}: \${v}\`).join('\\n')
).join('\\n---\\n')
default:
throw new Error(\`Unknown format: \${format}\`)
}
}
\`\`\`

Replace the switch statement with a strategy map/object. Show the refactored code.`,
        expectedPatterns: [
            '(Record|Map|strategies|formatters|handlers)',
            '(interface|type)\\s+(Formatter|Strategy|Format)',
            '(\\[format\\]|get\\(format\\))',
        ],
        maxTokens: 1200,
        timeoutMs: 25000,
    },
    {
        id: 'refactor-promise-chain',
        category: 'refactor',
        difficulty: 'medium',
        prompt: `Refactor this nested promise chain into clean async/await:

\`\`\`typescript
function fetchUserData(userId: string) {
return fetch(\`/api/users/\${userId}\`)
.then(res => res.json())
.then(user => {
return fetch(\`/api/teams/\${user.teamId}\`)
.then(res => res.json())
.then(team => {
return fetch(\`/api/orgs/\${team.orgId}\`)
.then(res => res.json())
.then(org => {
return { user, team, org }
})
})
})
.catch(err => {
console.error('Failed:', err)
throw err
})
}
\`\`\`

Convert to async/await. Add proper error handling with try/catch. Show the refactored code.`,
        expectedPatterns: [
            'async\\s+function',
            'await\\s+fetch',
            'try\\s*\\{',
            'catch\\s*\\(',
        ],
        maxTokens: 800,
        timeoutMs: 20000,
    },
    // ── Explanation (3 tasks) ──
    {
        id: 'explain-event-loop',
        category: 'explain',
        difficulty: 'medium',
        prompt: `Explain what this code outputs and why:

\`\`\`javascript
console.log('1')

setTimeout(() => console.log('2'), 0)

Promise.resolve().then(() => console.log('3'))

queueMicrotask(() => console.log('4'))

console.log('5')
\`\`\`

Explain the order of execution in terms of the JavaScript event loop, call stack, microtask queue, and macrotask queue.`,
        expectedPatterns: [
            '1.*5.*3.*4.*2|1,\\s*5,\\s*3,\\s*4,\\s*2',
            '(microtask|micro.task)',
            '(macrotask|macro.task|task.queue|callback.queue|timer)',
            '(event.loop|call.stack)',
        ],
        maxTokens: 1000,
        timeoutMs: 20000,
    },
    {
        id: 'explain-typescript-infer',
        category: 'explain',
        difficulty: 'hard',
        prompt: `Explain what this TypeScript type does, step by step:

\`\`\`typescript
type DeepReadonly<T> = T extends (infer U)[]
? ReadonlyArray<DeepReadonly<U>>
: T extends object
? { readonly [K in keyof T]: DeepReadonly<T[K]> }
: T
\`\`\`

Walk through each conditional branch and explain how it recursively makes a type deeply immutable.`,
        expectedPatterns: [
            '(recursive|recursion|recursively)',
            '(array|infer\\s+U)',
            '(object|keyof)',
            '(readonly|immutable|read.only)',
            '(conditional.type|extends)',
        ],
        maxTokens: 1000,
        timeoutMs: 25000,
    },
    {
        id: 'explain-git-rebase',
        category: 'explain',
        difficulty: 'easy',
        prompt: 'Explain the difference between `git merge` and `git rebase`. When should you use each? What are the risks of rebasing a shared branch? Give concrete examples.',
        expectedPatterns: [
            '(merge|commit.history|merge.commit)',
            '(rebase|replay|re.apply|linear)',
            '(shared.branch|public|force.push|rewrite.history)',
            '(fast.forward|conflict)',
        ],
        maxTokens: 800,
        timeoutMs: 15000,
    },
    // ── Research (3 tasks) ──
    {
        id: 'research-transformer-architecture',
        category: 'research',
        difficulty: 'hard',
        prompt: 'Explain the Transformer architecture from "Attention Is All You Need" (Vaswani et al., 2017). Cover: self-attention mechanism, multi-head attention, positional encoding, encoder-decoder structure, and why it replaced RNNs for sequence tasks. Be technically precise.',
        expectedPatterns: [
            '(self.attention|scaled.dot.product)',
            '(multi.head|multiple.heads)',
            '(positional.encoding|position)',
            '(encoder|decoder)',
            '(softmax|Q.*K.*V|query.*key.*value)',
        ],
        expectedTools: ['web_search'],
        maxTokens: 2000,
        timeoutMs: 45000,
    },
    {
        id: 'research-rust-ownership',
        category: 'research',
        difficulty: 'medium',
        prompt: 'Explain Rust\'s ownership model. Cover: ownership rules, borrowing (shared vs mutable references), lifetimes, and how the borrow checker prevents data races at compile time. Give code examples for each concept.',
        expectedPatterns: [
            '(ownership|owner)',
            '(borrow|&|reference)',
            '(lifetime|\'a)',
            '(borrow.checker|compile.time|data.race)',
            '(mut|mutable)',
        ],
        maxTokens: 2000,
        timeoutMs: 30000,
    },
    {
        id: 'research-crdts',
        category: 'research',
        difficulty: 'hard',
        prompt: 'What are CRDTs (Conflict-free Replicated Data Types)? Explain the difference between state-based (CvRDT) and operation-based (CmRDT) approaches. Give examples of common CRDT types (G-Counter, PN-Counter, LWW-Register, OR-Set) and their merge semantics. Where are CRDTs used in production systems?',
        expectedPatterns: [
            '(CRDT|conflict.free)',
            '(state.based|CvRDT|convergent)',
            '(operation.based|CmRDT|commutative)',
            '(G.Counter|PN.Counter|LWW|OR.Set)',
            '(merge|join|lattice|semilattice)',
        ],
        expectedTools: ['web_search'],
        maxTokens: 2000,
        timeoutMs: 45000,
    },
    // ── Science (2 tasks) ──
    {
        id: 'science-big-o',
        category: 'science',
        difficulty: 'medium',
        prompt: 'Analyze the time complexity of this algorithm and prove your answer:\n\n```typescript\nfunction mystery(arr: number[]): number {\n const n = arr.length\n let count = 0\n for (let i = 0; i < n; i++) {\n for (let j = i; j < n; j++) {\n for (let k = i; k <= j; k++) {\n count++\n }\n }\n }\n return count\n}\n```\n\nWhat is the time complexity in Big-O notation? Show your derivation.',
        expectedPatterns: [
            // Accepts O(n^3), O(n^{3}), O(n³) and O(n3). A single-character class
            // after `n` would reject the common "O(n^3)" spelling.
            'O\\(\\s*n\\s*(\\^\\s*\\{?3\\}?|³|3)\\s*\\)|O\\(n\\s*\\*\\s*n\\s*\\*\\s*n\\)|cubic',
            '(triple|three|3).*(loop|nested)',
            // No \b around Σ: it is not a \w character, so a word boundary next to
            // it only exists when it is glued to letters/digits on both sides.
            '(sum|summation|Σ|sigma)',
        ],
        maxTokens: 1000,
        timeoutMs: 25000,
    },
    {
        id: 'science-entropy',
        category: 'science',
        difficulty: 'hard',
        prompt: 'Calculate the Shannon entropy of a fair 6-sided die. Then calculate the entropy of a loaded die where P(1)=0.5, P(2)=P(3)=P(4)=P(5)=P(6)=0.1. Show your work, explain the formula H = -Σ p(x) log2 p(x), and explain why the loaded die has lower entropy than the fair die.',
        expectedPatterns: [
            '(Shannon|entropy|information)',
            'H\\s*=|log2|log_2|\\blog\\b',
            '2\\.58|2\\.585',
            '(fair|uniform|maximum)',
            '(lower|less|decrease|uncertainty)',
        ],
        maxTokens: 1200,
        timeoutMs: 25000,
    },
];
|
|
483
|
+
// ── Scoring ──
|
|
484
|
+
// Relative contribution of each component to a task's overall score.
// The three weights add up to 1, so the overall score stays in the 0-1 range.
/** Share of the score earned by matching the expected regex patterns. */
const WEIGHT_PATTERN = 0.5;
/** Share of the score earned by invoking the expected tools. */
const WEIGHT_TOOLS = 0.3;
/** Share of the score earned by answering quickly relative to the time budget. */
const WEIGHT_SPEED = 0.2;
|
|
487
|
+
/**
 * Score how many of the expected patterns a response matches.
 *
 * Every pattern is compiled with the `i` (case-insensitive) and `s` (dot
 * matches newlines) flags and tested against the whole response.
 *
 * @param {string} response - Response text; anything that is not a string is
 *   treated as an empty response.
 * @param {string[]} [patterns] - Regular-expression sources to look for.
 * @returns {{ score: number, matched: string[], missed: string[] }} `score` is
 *   the matched fraction in the 0-1 range (1 when there is nothing to match).
 */
function scorePatterns(response, patterns) {
    if (!patterns || patterns.length === 0)
        return { score: 1, matched: [], missed: [] };
    // RegExp#test stringifies its argument, so null/undefined would otherwise
    // be tested as the literal text "null"/"undefined".
    const text = typeof response === 'string' ? response : '';
    const matched = [];
    const missed = [];
    for (const pattern of patterns) {
        let isMatch = false;
        try {
            isMatch = new RegExp(pattern, 'is').test(text);
        }
        catch {
            // Invalid regex source — count it as missed instead of aborting the run
        }
        (isMatch ? matched : missed).push(pattern);
    }
    return {
        score: matched.length / patterns.length,
        matched,
        missed,
    };
}
|
|
514
|
+
/**
 * Score tool usage.
 *
 * An expected tool counts as used when its name occurs, case-insensitively,
 * as a substring of any called tool name.
 *
 * @param {string[]} [toolsCalled] - Names of the tools the agent invoked; a
 *   missing value is treated as "no tools called".
 * @param {string[]} [expectedTools] - Names of the tools the task expects.
 * @returns {number} Fraction of expected tools that were used (0-1); 1 when
 *   the task has no tool requirement.
 */
function scoreTools(toolsCalled, expectedTools) {
    if (!expectedTools || expectedTools.length === 0)
        return 1; // No tool requirement
    const called = (toolsCalled ?? []).map(name => String(name).toLowerCase());
    if (called.length === 0)
        return 0;
    const used = expectedTools.filter(expected => {
        const needle = expected.toLowerCase();
        return called.some(name => name.includes(needle));
    });
    return used.length / expectedTools.length;
}
|
|
528
|
+
/**
 * Score speed against the task's time budget.
 *
 * Finishing within the first half of the budget earns full credit. Between
 * 50% and 100% of the budget the score falls linearly from 1 to 0, and using
 * the whole budget (or more) earns nothing.
 *
 * @param {number} durationMs - How long the task took, in milliseconds.
 * @param {number} timeoutMs - The task's time budget, in milliseconds.
 * @returns {number} Speed score in the 0-1 range.
 */
function scoreSpeed(durationMs, timeoutMs) {
    if (durationMs <= 0) {
        return 1;
    }
    if (durationMs >= timeoutMs) {
        return 0;
    }
    const budgetUsed = durationMs / timeoutMs;
    return budgetUsed <= 0.5
        ? 1.0
        : Math.max(0, 1.0 - (budgetUsed - 0.5) * 2);
}
|
|
542
|
+
/**
 * Combine the three component scores into a task's overall score.
 *
 * Each component is multiplied by its module-level weight
 * (WEIGHT_PATTERN, WEIGHT_TOOLS, WEIGHT_SPEED) and the products are summed.
 *
 * @param {number} patternScore - Pattern-match score (0-1).
 * @param {number} toolScore - Tool-usage score (0-1).
 * @param {number} speedScore - Speed score (0-1).
 * @returns {number} Weighted overall score.
 */
function computeTaskScore(patternScore, toolScore, speedScore) {
    const patternPart = patternScore * WEIGHT_PATTERN;
    const toolPart = toolScore * WEIGHT_TOOLS;
    const speedPart = speedScore * WEIGHT_SPEED;
    return patternPart + toolPart + speedPart;
}
|
|
546
|
+
/**
 * Run the benchmark suite.
 *
 * Selects tasks from BENCH_TASKS using the given filters, runs them through
 * the agent one at a time (each bounded by its own timeout), scores every
 * response, then saves the aggregated result to disk and prints a summary.
 * Progress is written to stderr.
 *
 * A task whose agent call throws or times out is recorded with its error
 * message, scores 0 on tools, speed and overall, and never counts as passed
 * (a fast failure would otherwise collect full tool and speed credit and
 * reach the 0.5 pass threshold without producing any answer).
 *
 * @param {object} [opts]
 * @param {string[]} [opts.categories] - Only run tasks in these categories (case-insensitive).
 * @param {string} [opts.difficulty] - Only run tasks of this difficulty (case-insensitive).
 * @param {number} [opts.limit] - Run at most this many of the selected tasks.
 * @param {string} [opts.provider] - Provider name recorded in the result.
 * @param {string} [opts.model] - Model passed to the agent and recorded in the result.
 * @param {boolean} [opts.verbose] - Print a detailed block per task instead of one line.
 * @returns {Promise<object>} The benchmark result: timestamp, provider, model,
 *   kbotVersion, totalScore and categoryScores (0-100), per-task results,
 *   total duration in ms and token usage.
 * @throws {Error} If no benchmark task matches the given filters.
 */
export async function runBenchmark(opts = {}) {
    ensureBenchDir();
    // Filter tasks
    let tasks = [...BENCH_TASKS];
    if (opts.categories && opts.categories.length > 0) {
        const cats = new Set(opts.categories.map(c => c.toLowerCase()));
        tasks = tasks.filter(t => cats.has(t.category));
    }
    if (opts.difficulty) {
        const diff = opts.difficulty.toLowerCase();
        tasks = tasks.filter(t => t.difficulty === diff);
    }
    if (opts.limit && opts.limit > 0) {
        tasks = tasks.slice(0, opts.limit);
    }
    if (tasks.length === 0) {
        throw new Error('No benchmark tasks match the given filters.');
    }
    // Resolve provider/model
    const provider = opts.provider ?? getByokProvider() ?? 'anthropic';
    const model = opts.model ?? getProviderModel(provider, 'default') ?? 'unknown';
    const startTime = Date.now();
    const results = [];
    let totalInput = 0;
    let totalOutput = 0;
    // Print header
    console.error();
    console.error(chalk.bold(' kbot bench') + chalk.dim(` — v${VERSION}`));
    console.error(chalk.dim(` Provider: ${provider} | Model: ${model}`));
    console.error(chalk.dim(` Running ${tasks.length} task${tasks.length !== 1 ? 's' : ''}...`));
    console.error(chalk.dim(' ' + '─'.repeat(50)));
    console.error();
    for (let i = 0; i < tasks.length; i++) {
        const task = tasks[i];
        const taskLabel = `[${i + 1}/${tasks.length}]`;
        const timeout = task.timeoutMs ?? 30000;
        if (opts.verbose) {
            console.error(chalk.dim(` ${taskLabel} ${task.id} (${task.category}/${task.difficulty})`));
        }
        else {
            process.stderr.write(chalk.dim(` ${taskLabel} ${task.id}...`));
        }
        const taskStart = Date.now();
        let response = null;
        let error;
        let timer;
        try {
            // Run agent with timeout
            const agentOpts = {
                model: opts.model,
                stream: false,
                skipPlanner: true, // Direct agent call — skip planner for benchmarking
            };
            response = await Promise.race([
                runAgent(task.prompt, agentOpts),
                new Promise((_, reject) => {
                    timer = setTimeout(() => reject(new Error('Benchmark task timed out')), timeout);
                }),
            ]);
        }
        catch (err) {
            error = err instanceof Error ? err.message : String(err);
        }
        finally {
            // Without this the pending timer keeps the event loop alive for the
            // rest of the timeout window after the task has already settled.
            // NOTE: on timeout the agent call itself is not cancelled; only its
            // result is ignored.
            clearTimeout(timer);
        }
        const durationMs = Date.now() - taskStart;
        const failed = error !== undefined;
        const responseText = response?.content ?? '';
        const toolsCalled = extractToolNames(response);
        // Score — a failed task produced no answer, so it earns no tool/speed credit
        const patternResult = scorePatterns(responseText, task.expectedPatterns);
        const toolScore = failed ? 0 : scoreTools(toolsCalled, task.expectedTools);
        const speedScore = failed ? 0 : scoreSpeed(durationMs, timeout);
        const overallScore = failed ? 0 : computeTaskScore(patternResult.score, toolScore, speedScore);
        // Track token usage (a missing counter must not turn the totals into NaN)
        if (response?.usage) {
            totalInput += response.usage.input_tokens ?? 0;
            totalOutput += response.usage.output_tokens ?? 0;
        }
        const taskResult = {
            taskId: task.id,
            category: task.category,
            difficulty: task.difficulty,
            passed: !failed && overallScore >= 0.5,
            patternScore: round(patternResult.score),
            toolScore: round(toolScore),
            speedScore: round(speedScore),
            overallScore: round(overallScore),
            durationMs,
            responseLength: responseText.length,
            matchedPatterns: patternResult.matched,
            missedPatterns: patternResult.missed,
            toolsCalled,
            error,
        };
        results.push(taskResult);
        // Print inline result
        if (!opts.verbose) {
            const icon = taskResult.passed ? chalk.green(' ✓') : chalk.red(' ✗');
            const scoreStr = chalk.dim(` ${Math.round(overallScore * 100)}%`);
            const timeStr = chalk.dim(` ${(durationMs / 1000).toFixed(1)}s`);
            process.stderr.write(`${icon}${scoreStr}${timeStr}\n`);
        }
        else {
            printVerboseResult(taskResult);
        }
    }
    const totalDuration = Date.now() - startTime;
    // Compute aggregate scores (percentages)
    const totalScore = results.length > 0
        ? round(results.reduce((s, r) => s + r.overallScore, 0) / results.length * 100)
        : 0;
    const categoryScores = {};
    const categoryGroups = groupBy(results, r => r.category);
    for (const [cat, catResults] of Object.entries(categoryGroups)) {
        categoryScores[cat] = round(catResults.reduce((s, r) => s + r.overallScore, 0) / catResults.length * 100);
    }
    const benchResult = {
        timestamp: new Date().toISOString(),
        provider,
        model,
        kbotVersion: VERSION,
        totalScore,
        categoryScores,
        tasks: results,
        duration: totalDuration,
        tokenUsage: { input: totalInput, output: totalOutput },
    };
    // Save
    saveBenchResult(benchResult);
    // Print summary
    formatBenchResult(benchResult);
    return benchResult;
}
|
|
675
|
+
// ── Tool name extraction ──
|
|
676
|
+
/**
 * Extract tool names from an agent response (best effort, from the response text).
 *
 * Two sources are combined, without duplicates:
 *   1. Explicit markers in the text: "▸ tool_name", "→ tool_name" or
 *      "Tool: tool_name" (the space after the marker is optional).
 *   2. Content heuristics: a fenced code block counts as 'code_generation';
 *      a URL together with one of the words search/found/results counts as
 *      'web_search'.
 *
 * @param {{ content?: string } | null | undefined} response - Agent response.
 * @returns {string[]} Unique tool names: marker matches in order of first
 *   appearance, followed by any heuristic names not already present.
 */
function extractToolNames(response) {
    if (!response)
        return [];
    // The agent response tracks toolCalls count but not individual names.
    // We parse the response text for tool call patterns used by kbot's UI output.
    const text = typeof response.content === 'string' ? response.content : '';
    const tools = new Set();
    // The arrow markers may be followed by spaces or tabs before the name; the
    // whitespace used to be allowed only after "Tool:", so "▸ name" never matched.
    const toolCallPattern = /(?:(?:▸|→)[ \t]*|Tool:\s*)(\w+)/g;
    for (const match of text.matchAll(toolCallPattern)) {
        tools.add(match[1]);
    }
    // Also detect common tool usage from content
    if (/```[\s\S]*?```/.test(text))
        tools.add('code_generation');
    if (/https?:\/\//.test(text) && /search|found|results/i.test(text))
        tools.add('web_search');
    return [...tools];
}
|
|
699
|
+
// ── History & Comparison ──
|
|
700
|
+
/**
 * Persist one benchmark run as pretty-printed JSON inside BENCH_DIR.
 *
 * The file is named `bench-<timestamp>.json`, where every ':' and '.' of
 * `result.timestamp` is replaced by '-'.
 *
 * @param {object} result - Benchmark result; `result.timestamp` must be a string.
 */
function saveBenchResult(result) {
    ensureBenchDir();
    const stamp = result.timestamp.replace(/[:.]/g, '-');
    const target = join(BENCH_DIR, `bench-${stamp}.json`);
    const payload = JSON.stringify(result, null, 2);
    writeFileSync(target, payload, 'utf-8');
}
|
|
707
|
+
/**
 * Load every saved benchmark result from BENCH_DIR.
 *
 * Only files named `bench-*.json` are considered. They are ordered by
 * descending file name, which puts the newest run first because the name
 * embeds the run's ISO timestamp. Files that cannot be read or parsed are
 * skipped silently.
 *
 * @returns {object[]} Parsed benchmark results, newest first.
 */
export function getBenchHistory() {
    ensureBenchDir();
    const isBenchFile = (name) => name.startsWith('bench-') && name.endsWith('.json');
    const newestFirst = readdirSync(BENCH_DIR).filter(isBenchFile).sort().reverse();
    return newestFirst.flatMap((name) => {
        try {
            const raw = readFileSync(join(BENCH_DIR, name), 'utf-8');
            // Wrapped in an array so flatMap keeps the parsed value as one entry.
            return [JSON.parse(raw)];
        }
        catch {
            // Skip corrupted files
            return [];
        }
    });
}
|
|
726
|
+
/**
 * Build a side-by-side comparison of two benchmark runs as terminal text.
 *
 * The report has an aggregate section (overall score, per-category scores,
 * duration, token usage) followed by a per-task breakdown. A category or task
 * present in only one run is scored 0 for the other. Deltas are `b - a`.
 *
 * @param {object} a - Baseline benchmark result (column A).
 * @param {object} b - Benchmark result compared against the baseline (column B).
 * @returns {string} Newline-joined, chalk-colored report.
 */
export function compareBenchmarks(a, b) {
    const SEP = chalk.dim('─'.repeat(62));
    const describeRun = (run) => `${run.provider}/${run.model} (${formatTimestamp(run.timestamp)})`;
    const indexById = (tasks) => new Map(tasks.map((task) => [task.taskId, task]));
    const asPercent = (task) => (task ? Math.round(task.overallScore * 100) : 0);
    const out = [];
    const emit = (...rows) => {
        out.push(...rows);
    };
    // Title
    emit('', chalk.bold(' Benchmark Comparison'), SEP, '');
    // Header row
    const labelA = describeRun(a);
    const labelB = describeRun(b);
    emit(` ${chalk.dim('Metric'.padEnd(20))} ${chalk.cyan(labelA.padEnd(20))} ${chalk.hex('#FB923C')(labelB.padEnd(20))}`, SEP);
    // Overall score
    emit(formatComparisonRow('Overall Score', a.totalScore, b.totalScore, '%'));
    // Category scores: union of both runs, first-seen order
    const categories = new Set(Object.keys(a.categoryScores).concat(Object.keys(b.categoryScores)));
    categories.forEach((cat) => {
        emit(formatComparisonRow(` ${cat}`, a.categoryScores[cat] ?? 0, b.categoryScores[cat] ?? 0, '%'));
    });
    emit(SEP);
    // Duration and token usage: lower is better
    emit(formatComparisonRow('Duration', round(a.duration / 1000), round(b.duration / 1000), 's', true));
    emit(formatComparisonRow('Input Tokens', a.tokenUsage.input, b.tokenUsage.input, '', true));
    emit(formatComparisonRow('Output Tokens', a.tokenUsage.output, b.tokenUsage.output, '', true));
    // Task-by-task comparison
    emit('', chalk.bold(' Task Breakdown'), SEP);
    emit(` ${chalk.dim('Task'.padEnd(28))} ${chalk.dim('A'.padStart(6))} ${chalk.dim('B'.padStart(6))} ${chalk.dim('Δ'.padStart(8))}`);
    emit(SEP);
    const tasksA = indexById(a.tasks);
    const tasksB = indexById(b.tasks);
    const taskIds = new Set([...tasksA.keys(), ...tasksB.keys()]);
    taskIds.forEach((id) => {
        const pctA = asPercent(tasksA.get(id));
        const pctB = asPercent(tasksB.get(id));
        const delta = pctB - pctA;
        let paint = chalk.dim;
        if (delta > 0) {
            paint = chalk.green;
        }
        else if (delta < 0) {
            paint = chalk.red;
        }
        const deltaStr = delta > 0 ? `+${delta}%` : `${delta}%`;
        emit(` ${id.padEnd(28)} ${`${pctA}%`.padStart(6)} ${`${pctB}%`.padStart(6)} ${paint(deltaStr.padStart(8))}`);
    });
    emit(SEP, '');
    return out.join('\n');
}
|
|
777
|
+
// ── Display ──
|
|
778
|
+
/**
 * Print a benchmark result to stderr as a set of terminal tables.
 *
 * Sections, in order: run summary, per-category bar chart (fixed category
 * order; categories without a score are omitted), per-task score table
 * (with an error line under any task that has `error` set) and a
 * passed/failed tally.
 *
 * @param {object} result - Benchmark result to display.
 * @returns {void}
 */
export function formatBenchResult(result) {
    const SEP = chalk.dim(` ${'─'.repeat(58)}`);
    const print = (...parts) => console.error(...parts);
    // Pick a color from a score and the threshold below which it stops being "ok".
    const colorFor = (score, warnAt) => {
        if (score >= 80) {
            return chalk.green;
        }
        return score >= warnAt ? chalk.yellow : chalk.red;
    };
    const asPercent = (fraction) => `${Math.round(fraction * 100)}%`;
    print();
    print(chalk.bold(' Benchmark Results'));
    print(SEP);
    print();
    // Summary row
    const overallColor = colorFor(result.totalScore, 60);
    print(` ${chalk.bold('Overall Score:')} ${overallColor(result.totalScore + '%')}`);
    print(` ${chalk.dim('Provider:')} ${result.provider}`);
    print(` ${chalk.dim('Model:')} ${result.model}`);
    print(` ${chalk.dim('kbot Version:')} ${result.kbotVersion}`);
    print(` ${chalk.dim('Duration:')} ${(result.duration / 1000).toFixed(1)}s`);
    print(` ${chalk.dim('Tokens:')} ${result.tokenUsage.input.toLocaleString()} in / ${result.tokenUsage.output.toLocaleString()} out`);
    print();
    // Category scores
    print(chalk.bold(' Category Scores'));
    print(SEP);
    ['codegen', 'bugfix', 'refactor', 'explain', 'research', 'science'].forEach((cat) => {
        const score = result.categoryScores[cat];
        if (score !== undefined) {
            const bar = renderBar(score);
            print(` ${cat.padEnd(12)} ${bar} ${String(score + '%').padStart(4)}`);
        }
    });
    print();
    // Task details
    print(chalk.bold(' Task Results'));
    print(SEP);
    print(` ${chalk.dim('Task'.padEnd(28))} ${chalk.dim('Score'.padStart(6))} ${chalk.dim('Pat'.padStart(5))} ${chalk.dim('Tool'.padStart(5))} ${chalk.dim('Spd'.padStart(5))} ${chalk.dim('Time'.padStart(7))}`);
    print(SEP);
    result.tasks.forEach((task) => {
        const icon = task.passed ? chalk.green('✓') : chalk.red('✗');
        const score = Math.round(task.overallScore * 100);
        const paintScore = colorFor(score, 50);
        const columns = [
            paintScore(`${score}%`.padStart(5)),
            chalk.dim(asPercent(task.patternScore).padStart(5)),
            chalk.dim(asPercent(task.toolScore).padStart(5)),
            chalk.dim(asPercent(task.speedScore).padStart(5)),
            chalk.dim(`${(task.durationMs / 1000).toFixed(1)}s`.padStart(7)),
        ];
        print(` ${icon} ${task.taskId.padEnd(26)} ${columns.join(' ')}`);
        if (task.error) {
            print(chalk.red(` Error: ${task.error}`));
        }
    });
    print(SEP);
    // Passed/failed summary
    const passed = result.tasks.reduce((count, task) => (task.passed ? count + 1 : count), 0);
    const failed = result.tasks.length - passed;
    const failedLabel = failed > 0 ? chalk.red(failed + ' failed') : chalk.dim('0 failed');
    print();
    print(` ${chalk.green(passed + ' passed')} ${failedLabel}`);
    print();
}
|
|
835
|
+
/**
 * Render past benchmark runs as a compact, numbered table.
 *
 * Rows are numbered from 1 in the order given and show the run's date,
 * provider, model, overall score and task count.
 *
 * @param {object[]} results - Benchmark results in display order.
 * @returns {string} The table as newline-joined text, or a dimmed notice
 *   when `results` is empty.
 */
export function formatBenchHistory(results) {
    if (results.length === 0) {
        return chalk.dim(' No benchmark history found.');
    }
    const SEP = chalk.dim(` ${'─'.repeat(62)}`);
    const header = ` ${chalk.dim('#'.padStart(3))} ${chalk.dim('Date'.padEnd(12))} ${chalk.dim('Provider'.padEnd(12))} ${chalk.dim('Model'.padEnd(16))} ${chalk.dim('Score'.padStart(6))} ${chalk.dim('Tasks'.padStart(6))}`;
    const rows = results.map((run, position) => {
        const idx = String(position + 1).padStart(3);
        const date = formatTimestamp(run.timestamp);
        const score = run.totalScore;
        let paint = chalk.red;
        if (score >= 80) {
            paint = chalk.green;
        }
        else if (score >= 60) {
            paint = chalk.yellow;
        }
        const taskCount = run.tasks.length;
        return ` ${chalk.dim(idx)} ${date.padEnd(12)} ${run.provider.padEnd(12)} ${run.model.padEnd(16)} ${paint(String(score + '%').padStart(5))} ${chalk.dim(String(taskCount).padStart(6))}`;
    });
    return ['', chalk.bold(' Benchmark History'), SEP, header, SEP, ...rows, SEP, ''].join('\n');
}
|
|
859
|
+
// ── Verbose Output ──
|
|
860
|
+
/**
 * Print the detailed, multi-line report for one finished task to stderr.
 *
 * Always prints the headline (pass/fail icon, task id, overall score,
 * duration) and the pattern/tool/speed score line. The matched-pattern
 * count, missed patterns, called tools and error are each printed only
 * when present. Ends with a blank line.
 *
 * @param {object} task - Result of a single benchmark task.
 * @returns {void}
 */
function printVerboseResult(task) {
    const asPercent = (fraction) => Math.round(fraction * 100);
    const icon = task.passed ? chalk.green(' ✓') : chalk.red(' ✗');
    const score = asPercent(task.overallScore);
    let paintScore = chalk.red;
    if (score >= 80) {
        paintScore = chalk.green;
    }
    else if (score >= 50) {
        paintScore = chalk.yellow;
    }
    const elapsed = chalk.dim(`(${(task.durationMs / 1000).toFixed(1)}s)`);
    console.error(`${icon} ${chalk.bold(task.taskId)} ${paintScore(score + '%')} ${elapsed}`);
    console.error(chalk.dim(` Pattern: ${asPercent(task.patternScore)}% | Tool: ${asPercent(task.toolScore)}% | Speed: ${asPercent(task.speedScore)}%`));
    const { matchedPatterns, missedPatterns, toolsCalled } = task;
    if (matchedPatterns.length > 0) {
        const total = matchedPatterns.length + missedPatterns.length;
        console.error(chalk.green(` Matched: ${matchedPatterns.length}/${total} patterns`));
    }
    if (missedPatterns.length > 0) {
        const missedList = missedPatterns.map((pattern) => `/${pattern}/`).join(', ');
        console.error(chalk.red(` Missed: ${missedList}`));
    }
    if (toolsCalled.length > 0) {
        console.error(chalk.dim(` Tools: ${toolsCalled.join(', ')}`));
    }
    if (task.error) {
        console.error(chalk.red(` Error: ${task.error}`));
    }
    console.error();
}
|
|
880
|
+
// ── Helpers ──
|
|
881
|
+
/**
 * Render a colored bar chart segment (20 chars wide).
 *
 * The filled part is colored green at 80 and above, yellow at 60 and above,
 * red otherwise; the remainder is drawn dimmed.
 *
 * @param {number} percentage - Score, nominally in the range 0–100.
 * @returns {string} Bar made of exactly 20 block characters plus color codes.
 */
function renderBar(percentage) {
    const width = 20;
    const raw = Math.round((percentage / 100) * width);
    // Clamp to [0, width]: String.prototype.repeat throws a RangeError on a
    // negative count, which an out-of-range score would otherwise produce.
    // A non-numeric score is drawn as an empty bar.
    const filled = Number.isNaN(raw) ? 0 : Math.min(width, Math.max(0, raw));
    const empty = width - filled;
    const color = percentage >= 80 ? chalk.green : percentage >= 60 ? chalk.yellow : chalk.red;
    return color('█'.repeat(filled)) + chalk.dim('░'.repeat(empty));
}
|
|
889
|
+
/**
 * Format an ISO timestamp to a short `YYYY-MM-DD` date string in local time.
 *
 * `new Date()` does not throw on an unparseable string; it yields an Invalid
 * Date whose getters return NaN. That case is detected explicitly so the
 * fallback (the first 10 characters of the input) is actually used instead
 * of rendering "NaN-NaN-NaN".
 *
 * @param {string} iso - Timestamp, expected to be an ISO 8601 string.
 * @returns {string} `YYYY-MM-DD`, or the first 10 characters of the input
 *   when it cannot be parsed as a date.
 */
function formatTimestamp(iso) {
    const d = new Date(iso);
    if (Number.isNaN(d.getTime())) {
        return String(iso).slice(0, 10);
    }
    const twoDigits = (value) => String(value).padStart(2, '0');
    return `${d.getFullYear()}-${twoDigits(d.getMonth() + 1)}-${twoDigits(d.getDate())}`;
}
|
|
899
|
+
/**
 * Format a comparison row with delta coloring.
 *
 * The delta is `valueB - valueA`. It is dimmed when zero, green when it is
 * an improvement and red otherwise; `lowerIsBetter` flips which direction
 * counts as an improvement.
 *
 * @param {string} label - Row label, left-aligned in a 20-character column.
 * @param {number} valueA - Baseline value.
 * @param {number} valueB - Compared value.
 * @param {string} unit - Suffix appended to both values and the delta.
 * @param {boolean} [lowerIsBetter=false] - Treat a decrease as an improvement.
 * @returns {string} The formatted row.
 */
function formatComparisonRow(label, valueA, valueB, unit, lowerIsBetter = false) {
    const delta = valueB - valueA;
    const isImprovement = lowerIsBetter ? delta < 0 : delta > 0;
    const deltaColor = delta === 0 ? chalk.dim
        : isImprovement ? chalk.green
            : chalk.red;
    const deltaSign = delta > 0 ? '+' : '';
    // Pad BEFORE coloring: padStart counts ANSI escape characters, so padding
    // an already-colored string leaves the column unaligned in a color terminal.
    const deltaText = `${deltaSign}${round(delta)}${unit}`.padStart(10);
    return ` ${label.padEnd(20)} ${String(valueA + unit).padStart(10)} ${String(valueB + unit).padStart(10)} ${deltaColor(deltaText)}`;
}
|
|
910
|
+
/**
 * Round a number to two decimal places.
 *
 * @param {number} n - Value to round.
 * @returns {number} `n` scaled by 100, passed through Math.round, scaled back.
 */
function round(n) {
    const SCALE = 100;
    const scaled = Math.round(n * SCALE);
    return scaled / SCALE;
}
|
|
914
|
+
/**
 * Group an array by a key function.
 *
 * The result is a prototype-less dictionary, so keys that collide with
 * Object.prototype members ("constructor", "toString", "__proto__", ...)
 * are stored as ordinary entries instead of hitting inherited properties.
 * Iterate it with Object.keys / Object.entries.
 *
 * @param {Iterable<*>} items - Items to group.
 * @param {function(*): string} keyFn - Returns the group key for an item.
 * @returns {Object<string, Array>} Map of key to the items sharing that key,
 *   keys in first-seen order, items in input order.
 */
function groupBy(items, keyFn) {
    const groups = Object.create(null);
    for (const item of items) {
        const key = keyFn(item);
        if (!groups[key])
            groups[key] = [];
        groups[key].push(item);
    }
    return groups;
}
|
|
925
|
+
// ── CLI entry point ──
|
|
926
|
+
/**
 * Register the `bench` subcommand with Commander.
 *
 * The command has three modes:
 *  - `--history`: print all saved runs and return.
 *  - `--compare`: print a comparison of the two most recent runs; exits with
 *    status 1 when fewer than two runs are saved.
 *  - otherwise: run the benchmark with the given filters; exits with status 1
 *    when `--limit` is invalid or the run throws.
 *
 * @param {object} program - Commander program (or command) to attach to.
 * @returns {void}
 */
export function registerBenchCommand(program) {
    // Commander calls option parsers as (value, previous). Passing parseInt
    // directly would use the previous value as the radix when the flag is
    // repeated, so wrap it and pin the radix to 10.
    const parseLimit = (value) => Number.parseInt(value, 10);
    program
        .command('bench')
        .description('Run benchmarks — score kbot against standardized tasks')
        .option('-c, --category <categories...>', 'Filter by category (codegen, bugfix, refactor, explain, research, science)')
        .option('-d, --difficulty <level>', 'Filter by difficulty (easy, medium, hard)')
        .option('-p, --provider <provider>', 'Override provider')
        .option('-m, --model <model>', 'Override model')
        .option('-l, --limit <n>', 'Max tasks to run', parseLimit)
        .option('-v, --verbose', 'Show detailed per-task output')
        .option('--compare', 'Compare last two benchmark runs')
        .option('--history', 'Show all past benchmark runs')
        .action(async (opts) => {
        // History mode
        if (opts.history) {
            const history = getBenchHistory();
            console.error(formatBenchHistory(history));
            return;
        }
        // Compare mode
        if (opts.compare) {
            const history = getBenchHistory();
            if (history.length < 2) {
                console.error(chalk.red(' Need at least 2 benchmark runs to compare. Run `kbot bench` first.'));
                process.exit(1);
            }
            console.error(compareBenchmarks(history[1], history[0])); // older first, newer second
            return;
        }
        // Reject an unparseable or negative limit instead of silently
        // forwarding NaN / a negative count to the runner.
        if (opts.limit !== undefined && (Number.isNaN(opts.limit) || opts.limit < 0)) {
            console.error(chalk.red(' Invalid --limit: expected a non-negative integer.'));
            process.exit(1);
            return;
        }
        // Run benchmarks
        try {
            await runBenchmark({
                categories: opts.category,
                difficulty: opts.difficulty,
                provider: opts.provider,
                model: opts.model,
                limit: opts.limit,
                verbose: opts.verbose,
            });
        }
        catch (err) {
            console.error(chalk.red(` Benchmark failed: ${err instanceof Error ? err.message : err}`));
            process.exit(1);
        }
    });
}
|
|
973
|
+
//# sourceMappingURL=bench.js.map
|