offgrid-ai 0.7.4 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/benchmark.mjs +763 -12
package/package.json
CHANGED
package/src/benchmark.mjs
CHANGED
|
@@ -3,10 +3,12 @@ import { existsSync } from "node:fs";
|
|
|
3
3
|
import { createHash } from "node:crypto";
|
|
4
4
|
import { join, resolve } from "node:path";
|
|
5
5
|
import { homedir } from "node:os";
|
|
6
|
-
import { execFile } from "node:child_process";
|
|
6
|
+
import { spawn, execFile } from "node:child_process";
|
|
7
7
|
import { promisify } from "node:util";
|
|
8
8
|
import { ensureDirs, loadConfig, saveConfig } from "./config.mjs";
|
|
9
9
|
import { backendFor } from "./backends.mjs";
|
|
10
|
+
import { hasPi, hasPiModel, syncPiConfig } from "./harness-pi.mjs";
|
|
11
|
+
import { serverReady, startServer, waitForReady, stopProfile } from "./process.mjs";
|
|
10
12
|
import { pc, createPrompt, renderRows, renderSection } from "./ui.mjs";
|
|
11
13
|
|
|
12
14
|
const execFileAsync = promisify(execFile);
|
|
@@ -27,7 +29,7 @@ export function createRunId(date = new Date()) {
|
|
|
27
29
|
return date.toISOString().replace(/:/gu, "-").replace(/\./gu, "-");
|
|
28
30
|
}
|
|
29
31
|
|
|
30
|
-
export function buildToolPrompt(benchmark
|
|
32
|
+
/**
 * Return the prompt text that is handed to the tool/runner for a benchmark.
 *
 * @param {{ prompt: string }} benchmark - Benchmark definition; only `prompt` is read.
 * @returns {string} The benchmark's `prompt` value, unchanged.
 */
export function buildToolPrompt(benchmark) {
  const { prompt } = benchmark;
  return prompt;
}
|
|
33
35
|
|
|
@@ -151,8 +153,8 @@ function printBenchmarkNextSteps({ repoPath, runDirectory, profile, modelId, run
|
|
|
151
153
|
console.log(` 3. ${pc.cyan(runnerCommand)}, then copy this run's prompt from the gallery and paste it into ${runnerLabel}`);
|
|
152
154
|
}
|
|
153
155
|
|
|
154
|
-
async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSource, backendLabel, profile }) {
|
|
155
|
-
const toolPrompt = buildToolPrompt(benchmark
|
|
156
|
+
export async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps = true }) {
|
|
157
|
+
const toolPrompt = buildToolPrompt(benchmark);
|
|
156
158
|
const now = new Date();
|
|
157
159
|
const runId = createRunId(now);
|
|
158
160
|
const modelSlug = slugModelId(modelId);
|
|
@@ -165,6 +167,13 @@ async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSo
|
|
|
165
167
|
await mkdir(runDirectory, { recursive: true });
|
|
166
168
|
|
|
167
169
|
const isDs = kind === "data-science";
|
|
170
|
+
const baseAssets = {
|
|
171
|
+
metadata: "metadata.json",
|
|
172
|
+
prompt: "prompt.md",
|
|
173
|
+
rawResponse: "response.raw.txt",
|
|
174
|
+
stream: "stream.ndjson",
|
|
175
|
+
stderr: "stderr.log",
|
|
176
|
+
};
|
|
168
177
|
const metadata = {
|
|
169
178
|
schemaVersion: 1,
|
|
170
179
|
kind,
|
|
@@ -177,8 +186,8 @@ async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSo
|
|
|
177
186
|
preparedAt: now.toISOString(),
|
|
178
187
|
runDirectory,
|
|
179
188
|
assets: isDs
|
|
180
|
-
? {
|
|
181
|
-
: {
|
|
189
|
+
? { ...baseAssets, ds: { notebook: "analysis.ipynb", summary: "summary.json", chartDistribution: "chart-distribution.png", chartTreatmentEffect: "chart-treatment-effect.png", chartCompletionRates: "chart-completion-rates.png" } }
|
|
190
|
+
: { ...baseAssets, html: "index.html", preview: "preview.png", video: "preview.webm" },
|
|
182
191
|
runner: {
|
|
183
192
|
mode: modelSource === "cloud" ? "manual" : "external",
|
|
184
193
|
intendedRunner: profile ? runnerLabel : undefined,
|
|
@@ -188,7 +197,30 @@ async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSo
|
|
|
188
197
|
...(profile?.baseUrl ? { baseUrl: profile.baseUrl } : {}),
|
|
189
198
|
model: modelId,
|
|
190
199
|
retries: 0,
|
|
191
|
-
tokenMetrics: {
|
|
200
|
+
tokenMetrics: {
|
|
201
|
+
reported: false,
|
|
202
|
+
promptTokens: 0,
|
|
203
|
+
completionTokens: 0,
|
|
204
|
+
totalTokens: 0,
|
|
205
|
+
},
|
|
206
|
+
speedMetrics: {
|
|
207
|
+
prefillTokensPerSecond: null,
|
|
208
|
+
generationTokensPerSecond: null,
|
|
209
|
+
ttftMs: null,
|
|
210
|
+
modelLoadMs: null,
|
|
211
|
+
speculativeDecodeAcceptance: null,
|
|
212
|
+
kvCacheTokens: null,
|
|
213
|
+
},
|
|
214
|
+
metricSource: null,
|
|
215
|
+
},
|
|
216
|
+
results: {
|
|
217
|
+
wallClockMs: null,
|
|
218
|
+
agentTurns: 0,
|
|
219
|
+
toolCalls: 0,
|
|
220
|
+
toolResults: 0,
|
|
221
|
+
success: false,
|
|
222
|
+
outputFiles: [],
|
|
223
|
+
perTurn: [],
|
|
192
224
|
},
|
|
193
225
|
};
|
|
194
226
|
|
|
@@ -205,11 +237,712 @@ async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSo
|
|
|
205
237
|
["Source", backendLabel || modelSource],
|
|
206
238
|
])));
|
|
207
239
|
|
|
208
|
-
|
|
240
|
+
if (showNextSteps) {
|
|
241
|
+
printBenchmarkNextSteps({ repoPath, runDirectory, profile, modelId, runnerLabel });
|
|
242
|
+
}
|
|
209
243
|
|
|
210
244
|
return runDirectory;
|
|
211
245
|
}
|
|
212
246
|
|
|
247
|
+
// ── Run benchmark in Pi (non-interactive JSON mode) ───────────────────────
|
|
248
|
+
|
|
249
|
+
// Colour function used for each kind of output when echoing Pi's event stream.
// Each entry maps an output role to the `pc` colour helper of the given name.
const BENCH_COLORS = Object.fromEntries(
  [
    ["thinking", "magenta"],
    ["text", "green"],
    ["tool", "yellow"],
    ["toolOutput", "dim"],
    ["error", "red"],
    ["info", "cyan"],
    ["dim", "dim"],
  ].map(([role, colorName]) => [role, pc[colorName]]),
);
|
|
258
|
+
|
|
259
|
+
/**
 * Build the one-line label shown (and logged) for a tool call.
 *
 * The label is `[toolCall] <name>`, followed by ` → <target>` when the call's
 * arguments carry a non-empty `path`, `file_path` or `filename` (first one wins).
 *
 * @param {{ name: string, arguments?: object }} toolCall - Tool call item from an assistant message.
 * @returns {string} The formatted label.
 */
function formatToolCall(toolCall) {
  const callArgs = toolCall.arguments;
  const target = callArgs?.path || callArgs?.file_path || callArgs?.filename || "";
  if (!target) {
    return `[toolCall] ${toolCall.name}`;
  }
  return `[toolCall] ${toolCall.name} → ${target}`;
}
|
|
264
|
+
|
|
265
|
+
/**
 * Echo one parsed Pi stream event to the terminal.
 *
 * Structural events are printed as bracketed, coloured lines via `console.log`;
 * streaming deltas (thinking, text, tool-call arguments, tool output) are written
 * to stdout without a trailing newline. Unknown event types are ignored.
 *
 * @param {object} parsed - One decoded event from Pi's JSON stream.
 * @param {{ turn: number }} state - Render state; `turn` is incremented on each `turn_start`.
 * @returns {void}
 */
function renderStreamEvent(parsed, state) {
  const eventType = parsed.type;

  if (eventType === "session") {
    console.log(BENCH_COLORS.dim(`[session] ${parsed.id}`));
    return;
  }

  if (eventType === "agent_start") {
    console.log(BENCH_COLORS.dim("[agent_start]"));
    return;
  }

  if (eventType === "turn_start") {
    state.turn += 1;
    console.log(BENCH_COLORS.info(`\n[turn ${state.turn}]`));
    return;
  }

  if (eventType === "message_start") {
    const message = parsed.message;
    // Only assistant messages that name both provider and model get a header line.
    if (message?.role === "assistant" && message.provider && message.model) {
      console.log(BENCH_COLORS.info(`[assistant] ${message.provider}/${message.model}`));
    }
    return;
  }

  if (eventType === "message_update") {
    const update = parsed.assistantMessageEvent;
    if (!update) return;
    // Underscores are stripped so "text_delta" and "textdelta" are treated alike.
    const phase = String(update.type ?? "").replace(/_/gu, "");
    switch (phase) {
      case "thinkingstart":
      case "thinkingdelta":
        process.stdout.write(BENCH_COLORS.thinking(update.delta || ""));
        break;
      case "textstart":
      case "textdelta":
        process.stdout.write(BENCH_COLORS.text(update.delta || ""));
        break;
      case "toolcallstart":
        console.log(BENCH_COLORS.tool("\n[tool_call_start]"));
        break;
      case "toolcalldelta":
        process.stdout.write(BENCH_COLORS.tool(update.delta || ""));
        break;
      case "toolcallend":
        console.log(BENCH_COLORS.tool("[tool_call_end]"));
        break;
      default:
        break;
    }
    return;
  }

  if (eventType === "message_end") {
    const message = parsed.message;
    if (message?.role !== "assistant" || !Array.isArray(message.content)) return;
    for (const part of message.content) {
      if (part.type === "toolCall") {
        console.log(BENCH_COLORS.tool(`\n${formatToolCall(part)}`));
      }
    }
    return;
  }

  if (eventType === "tool_execution_start") {
    console.log(BENCH_COLORS.tool(`\n[exec] ${parsed.toolName}`));
    return;
  }

  if (eventType === "tool_execution_update") {
    if (parsed.content) {
      process.stdout.write(BENCH_COLORS.toolOutput(parsed.content));
    }
    return;
  }

  if (eventType === "tool_execution_end") {
    console.log(BENCH_COLORS.tool(`[exec done] ${parsed.toolName}`));
    return;
  }

  if (eventType === "toolResult") {
    const errorSuffix = parsed.isError ? BENCH_COLORS.error(" error") : "";
    console.log(BENCH_COLORS.tool(`\n[result] ${parsed.toolName}${errorSuffix}`));
    return;
  }

  if (eventType === "agent_end") {
    console.log(BENCH_COLORS.dim("\n[agent_end]"));
  }
}
|
|
338
|
+
|
|
339
|
+
/**
 * Resolve the model identifier passed to Pi's `--model` flag for a profile.
 *
 * @param {object} profile - Profile; reads `harnesses.pi.model`, `providerId` and `modelAlias`.
 * @returns {string} `profile.harnesses.pi.model` when set (not null/undefined),
 *   otherwise `"<providerId>/<modelAlias>"`.
 */
export function piModelString(profile) {
  const configured = profile.harnesses?.pi?.model;
  if (configured !== undefined && configured !== null) {
    return configured;
  }
  const { providerId, modelAlias } = profile;
  return `${providerId}/${modelAlias}`;
}
|
|
342
|
+
|
|
343
|
+
/**
 * Run the prepared benchmark prompt through the `pi` CLI in non-interactive JSON
 * mode, echoing its event stream to the terminal while collecting metrics.
 *
 * Files written into `runDirectory`:
 *   - `stream.ndjson`    every non-blank stdout line from Pi, verbatim
 *   - `stderr.log`       Pi's stderr
 *   - `response.raw.txt` concatenated thinking/text deltas plus tool call/result markers
 *
 * Once the log files are open the returned promise always resolves (never
 * rejects); failures are reported through `runResult.error`.
 *
 * @param {object} profile - Profile used to derive the Pi model string.
 * @param {string} runDirectory - Prepared run directory; used as Pi's cwd (must contain `prompt.md`).
 * @param {{ signal?: AbortSignal }} [options] - Optional abort signal; aborting sends SIGTERM to Pi.
 * @returns {Promise<object>} Run result: token totals, turn/tool counters, per-turn records,
 *   `wallClockMs` (span between first and last timestamped event), `exitCode`
 *   (`null` when Pi did not exit normally) and `error` (`null` on success).
 * @throws If the log files cannot be opened (before Pi is started).
 */
export async function runBenchmarkInPi(profile, runDirectory, { signal } = {}) {
  const model = piModelString(profile);
  const args = ["--model", model, "--mode", "json", "-p", "@prompt.md"];

  const runResult = {
    model,
    exitCode: null,
    wallClockMs: null,
    agentTurns: 0,
    promptTokens: 0,
    completionTokens: 0,
    totalTokens: 0,
    cacheRead: 0,
    cacheWrite: 0,
    toolCalls: 0,
    toolResults: 0,
    perTurn: [],
    rawResponseLines: [],
    error: null,
  };

  let streamBuffer = "";
  let responseBuffer = "";
  let currentTurnStartMs = null;
  let lastTurnEndMs = null;
  let runStartMs = null;
  let firstEventMs = null;
  let lastEventMs = null;
  let cancelled = false;
  let settled = false;
  // Tool calls are announced (message_end) while a turn is still open, but the
  // per-turn record is only created at turn_end, so they are buffered here.
  let turnOpen = false;
  let openTurnToolCalls = 0;

  // FileHandle.write must not be called again before the previous write settles,
  // so writes are chained; the first failure is remembered and surfaced on close.
  function createSerialLog(handle) {
    let tail = Promise.resolve();
    let failure = null;
    return {
      write(text) {
        tail = tail
          .then(() => handle.write(text))
          .catch((err) => {
            failure ??= err;
          });
      },
      async close() {
        await tail;
        await handle.close();
        if (failure) throw failure;
      },
    };
  }

  const responsePath = join(runDirectory, "response.raw.txt");

  // Open the logs BEFORE spawning: nothing is awaited between spawn() and
  // attaching the listeners, so no event (notably a spawn 'error') can be missed,
  // and a failure to open a log cannot leave an orphaned child behind.
  const streamHandle = await openFileHandle(join(runDirectory, "stream.ndjson"), "w");
  let stderrHandle;
  try {
    stderrHandle = await openFileHandle(join(runDirectory, "stderr.log"), "w");
  } catch (err) {
    // Best-effort cleanup; the open failure is the error worth reporting.
    await streamHandle.close().catch(() => {});
    throw err;
  }
  const streamLog = createSerialLog(streamHandle);
  const stderrLog = createSerialLog(stderrHandle);

  const renderState = { turn: 0 };

  function appendResponse(text) {
    responseBuffer += text;
  }

  function flushResponse() {
    if (responseBuffer) {
      runResult.rawResponseLines.push(responseBuffer);
      responseBuffer = "";
    }
  }

  function updateTimeBounds(timestamp) {
    if (!timestamp) return;
    if (firstEventMs === null) firstEventMs = timestamp;
    lastEventMs = timestamp;
  }

  function beginTurn() {
    runResult.agentTurns += 1;
    currentTurnStartMs = lastTurnEndMs ?? runStartMs ?? null;
    turnOpen = true;
    openTurnToolCalls = 0;
  }

  function endTurn(usage, timestamp) {
    const turnEndMs = timestamp ?? null;
    const wallClockMs = currentTurnStartMs && turnEndMs ? turnEndMs - currentTurnStartMs : null;
    runResult.perTurn.push({
      turn: runResult.agentTurns,
      inputTokens: usage?.input ?? 0,
      outputTokens: usage?.output ?? 0,
      cacheRead: usage?.cacheRead ?? 0,
      cacheWrite: usage?.cacheWrite ?? 0,
      wallClockMs,
      toolCalls: openTurnToolCalls,
    });
    if (turnEndMs) lastTurnEndMs = turnEndMs;
    currentTurnStartMs = null;
    turnOpen = false;
    openTurnToolCalls = 0;
  }

  function processLine(line) {
    if (!line.trim()) return;
    streamLog.write(line + "\n");
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (err) {
      console.log(BENCH_COLORS.error(`[parse error] ${err.message}`));
      return;
    }
    // Valid JSON that is not an object (null, number, string) carries no event.
    if (parsed === null || typeof parsed !== "object") return;

    const timestamp = extractTimestamp(parsed);
    updateTimeBounds(timestamp);

    renderStreamEvent(parsed, renderState);

    if (parsed.type === "session" || parsed.type === "agent_start") {
      if (timestamp && runStartMs === null) runStartMs = timestamp;
    }

    if (parsed.type === "turn_start") {
      beginTurn();
    }

    if (parsed.type === "turn_end") {
      const usage = parsed.message?.usage;
      if (usage) {
        runResult.promptTokens += usage.input ?? 0;
        runResult.completionTokens += usage.output ?? 0;
        runResult.totalTokens += usage.totalTokens ?? 0;
        runResult.cacheRead += usage.cacheRead ?? 0;
        runResult.cacheWrite += usage.cacheWrite ?? 0;
      }
      // Always close the turn so perTurn stays aligned with agentTurns even when
      // no usage was reported (endTurn records zeros in that case).
      endTurn(usage, timestamp);
    }

    if (parsed.type === "message_update" && parsed.assistantMessageEvent) {
      const evt = parsed.assistantMessageEvent;
      const subtype = String(evt.type ?? "").replace(/_/gu, "");
      if (subtype === "thinkingdelta" || subtype === "textdelta") {
        appendResponse(evt.delta || "");
      }
    }

    if (parsed.type === "message_end" && parsed.message?.role === "assistant") {
      flushResponse();
      const content = parsed.message.content ?? [];
      for (const item of content) {
        if (item.type === "toolCall") {
          runResult.toolCalls += 1;
          appendResponse(`\n${formatToolCall(item)}\n`);
          if (turnOpen) {
            openTurnToolCalls += 1;
          } else {
            // No turn is open: credit the most recently completed turn, if any.
            const lastTurn = runResult.perTurn.at(-1);
            if (lastTurn) lastTurn.toolCalls += 1;
          }
        }
      }
    }

    if (parsed.type === "toolResult") {
      runResult.toolResults += 1;
      const status = parsed.isError ? "error" : "ok";
      appendResponse(`\n[toolResult] ${parsed.toolName} (${status})\n`);
    }

    if (parsed.type === "agent_end") {
      flushResponse();
    }
  }

  // A malformed-but-parseable event must not take the whole CLI down.
  function handleLine(line) {
    try {
      processLine(line);
    } catch (err) {
      console.log(BENCH_COLORS.error(`[event error] ${err.message}`));
    }
  }

  return new Promise((resolveRun) => {
    let child = null;

    const abortListener = () => {
      if (cancelled || settled) return;
      cancelled = true;
      console.log(BENCH_COLORS.error("\n\n[Cancelled by user]"));
      child?.kill("SIGTERM");
    };

    // Single exit path: flush buffers, close logs, classify the outcome, resolve.
    // Guarded so that 'error' followed by 'close' finalizes only once.
    const finish = async ({ failure = null, code = null, killSignal = null } = {}) => {
      if (settled) return;
      settled = true;
      signal?.removeEventListener("abort", abortListener);

      if (streamBuffer.trim()) {
        handleLine(streamBuffer);
      }
      flushResponse();

      let logFailure = null;
      for (const log of [streamLog, stderrLog]) {
        try {
          await log.close();
        } catch (err) {
          logFailure ??= err;
        }
      }
      try {
        await writeFile(responsePath, runResult.rawResponseLines.join(""), "utf8");
      } catch (err) {
        logFailure ??= err;
      }

      // `code` is null when Pi never started or was killed by a signal; keep it
      // null instead of pretending the process exited with 0.
      runResult.exitCode = code;
      if (firstEventMs !== null && lastEventMs !== null) {
        runResult.wallClockMs = lastEventMs - firstEventMs;
      }

      if (cancelled) {
        runResult.error = { message: "Cancelled by user" };
      } else if (failure) {
        runResult.error = { message: failure.message };
      } else if (killSignal) {
        runResult.error = { message: `Pi was terminated by signal ${killSignal}` };
      } else if (code !== 0) {
        runResult.error = { message: `Pi exited with code ${code}` };
      } else if (logFailure) {
        runResult.error = { message: `Failed to write benchmark logs: ${logFailure.message}` };
      }

      resolveRun(runResult);
    };

    // An abort that happened before we subscribed would never fire the event.
    if (signal?.aborted) {
      abortListener();
      void finish();
      return;
    }
    signal?.addEventListener("abort", abortListener);

    try {
      child = spawn("pi", args, {
        cwd: runDirectory,
        stdio: ["ignore", "pipe", "pipe"],
      });
    } catch (err) {
      void finish({ failure: err });
      return;
    }

    child.on("error", (err) => {
      void finish({ failure: err });
    });
    // 'close' (unlike 'exit') fires only after stdout/stderr have been fully read.
    child.on("close", (code, killSignal) => {
      void finish({ code, killSignal });
    });

    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      if (settled) return;
      streamBuffer += chunk;
      const lines = streamBuffer.split("\n");
      // The last element is an incomplete line (or "") — keep it for the next chunk.
      streamBuffer = lines.pop();
      for (const line of lines) {
        handleLine(line);
      }
    });

    child.stderr?.setEncoding("utf8");
    child.stderr?.on("data", (chunk) => {
      if (!settled) stderrLog.write(chunk);
    });
  });
}
|
|
557
|
+
|
|
558
|
+
/**
 * Pull a millisecond timestamp out of a stream event, if it carries one.
 *
 * Lookup order: `message.timestamp`, `timestamp`, `assistantMessageEvent.partial.timestamp`
 * (number returned as-is, string parsed with `Date.parse`), then the string fields
 * `message.createdAt`, `createdAt`, `created_at`.
 *
 * @param {object|null|undefined} event - Decoded stream event.
 * @returns {number|null} The timestamp, or `null` when none is present or parseable.
 */
function extractTimestamp(event) {
  const parseOrNull = (text) => {
    const ms = Date.parse(text);
    return Number.isFinite(ms) ? ms : null;
  };

  const stamp = event?.message?.timestamp ?? event?.timestamp ?? event?.assistantMessageEvent?.partial?.timestamp;
  if (typeof stamp === "number") {
    return stamp;
  }
  if (typeof stamp === "string") {
    const ms = parseOrNull(stamp);
    if (ms !== null) return ms;
  }

  const created = event?.message?.createdAt ?? event?.createdAt ?? event?.created_at;
  return typeof created === "string" ? parseOrNull(created) : null;
}
|
|
572
|
+
|
|
573
|
+
/**
 * Open a file through `node:fs/promises`, loading the module on demand.
 *
 * @param {string} path - File to open.
 * @param {string} flags - File system flags (e.g. "w").
 * @returns {Promise<import("node:fs/promises").FileHandle>} The opened handle; the caller must close it.
 */
async function openFileHandle(path, flags) {
  const fsPromises = await import("node:fs/promises");
  return fsPromises.open(path, flags);
}
|
|
577
|
+
|
|
578
|
+
// ── Backend-aware server speed metrics ───────────────────────────────────
|
|
579
|
+
|
|
580
|
+
// Short fixed prompt sent to the model server when sampling speed metrics.
const BENCH_SPEED_PROMPT = "Write a one-sentence summary of machine learning.";

/**
 * Query the profile's model server for speed metrics, using the backend-specific probe.
 *
 * @param {object} profile - Profile whose `backend` selects the probe.
 * @returns {Promise<object>} Speed metrics as returned by the selected backend probe.
 * @throws {Error} When the backend has no speed-metrics probe, or the probe itself fails.
 */
export async function queryServerMetrics(profile) {
  const backend = backendFor(profile.backend);

  switch (backend.id) {
    case "llama-cpp":
    case "llama-cpp-mtp":
      return await queryLlamaCppMetrics(profile);
    case "omlx":
      return await queryOmlxMetrics(profile);
    case "ollama":
      return await queryOllamaMetrics(profile);
    default:
      throw new Error(`Unsupported backend for benchmark speed metrics: ${backend.id}`);
  }
}
|
|
597
|
+
|
|
598
|
+
/**
 * Sample speed metrics from a llama.cpp server by sending one short, non-streaming
 * chat completion and reading the `timings` object of the response.
 *
 * @param {object} profile - Reads `modelAlias` and `baseUrl` (a single trailing slash is tolerated).
 * @returns {Promise<object>} Prefill/generation tokens per second, `ttftMs` (taken from
 *   `timings.prompt_ms`), draft acceptance ratio (when draft counters are present),
 *   cached token count and the metric source label.
 * @throws {Error} On a non-OK HTTP status, or when `timings` lacks numeric
 *   `prompt_per_second` / `predicted_per_second`.
 */
async function queryLlamaCppMetrics(profile) {
  const payload = JSON.stringify({
    model: profile.modelAlias,
    messages: [{ role: "user", content: BENCH_SPEED_PROMPT }],
    stream: false,
  });
  const endpoint = profile.baseUrl.replace(/\/$/u, "") + "/chat/completions";

  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(60000),
  });
  if (!response.ok) {
    throw new Error(`llama.cpp speed query failed: ${response.status} ${response.statusText}`);
  }

  const { timings } = await response.json();
  const hasRates = Boolean(timings)
    && typeof timings.prompt_per_second === "number"
    && typeof timings.predicted_per_second === "number";
  if (!hasRates) {
    throw new Error("llama.cpp response did not include usable timings object");
  }

  const { draft_n: drafted, draft_n_accepted: accepted } = timings;
  let acceptance = null;
  if (Number.isFinite(drafted) && drafted > 0 && Number.isFinite(accepted)) {
    acceptance = accepted / drafted;
  }

  return {
    prefillTokensPerSecond: timings.prompt_per_second,
    generationTokensPerSecond: timings.predicted_per_second,
    ttftMs: timings.prompt_ms ?? null,
    modelLoadMs: null,
    speculativeDecodeAcceptance: acceptance,
    kvCacheTokens: timings.cache_n ?? null,
    metricSource: "llama.cpp /v1/chat/completions timings",
  };
}
|
|
636
|
+
|
|
637
|
+
/**
 * Sample speed metrics from an oMLX server by sending one short streaming chat
 * completion with `include_usage` and reading the last SSE chunk that carries `usage`.
 *
 * @param {object} profile - Reads `modelAlias` and `baseUrl` (a single trailing slash is tolerated).
 * @returns {Promise<object>} Prefill/generation tokens per second, `ttftMs`
 *   (`usage.time_to_first_token` multiplied by 1000), cached prompt tokens and the metric source label.
 * @throws {Error} On a non-OK HTTP status, or when no chunk in the stream contains `usage`.
 */
async function queryOmlxMetrics(profile) {
  const payload = JSON.stringify({
    model: profile.modelAlias,
    messages: [{ role: "user", content: BENCH_SPEED_PROMPT }],
    stream: true,
    stream_options: { include_usage: true },
  });
  const endpoint = profile.baseUrl.replace(/\/$/u, "") + "/chat/completions";

  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
    signal: AbortSignal.timeout(60000),
  });
  if (!response.ok) {
    throw new Error(`oMLX speed query failed: ${response.status} ${response.statusText}`);
  }

  // Walk the SSE lines from the end and stop at the first chunk with `usage`.
  const sseLines = (await response.text()).split("\n");
  let usage = null;
  for (let index = sseLines.length - 1; index >= 0 && usage === null; index -= 1) {
    const entry = sseLines[index].trim();
    if (!entry.startsWith("data:")) continue;
    const payloadText = entry.slice(5).trim();
    if (payloadText === "[DONE]") continue;
    try {
      const chunk = JSON.parse(payloadText);
      if (chunk.usage) {
        usage = chunk.usage;
      }
    } catch {
      // Ignore malformed SSE chunks.
    }
  }

  if (!usage) {
    throw new Error("oMLX speed query did not return usage in streaming response");
  }

  const firstTokenSeconds = usage.time_to_first_token;
  return {
    prefillTokensPerSecond: usage.prompt_tokens_per_second ?? null,
    generationTokensPerSecond: usage.generation_tokens_per_second ?? null,
    ttftMs: firstTokenSeconds != null ? firstTokenSeconds * 1000 : null,
    modelLoadMs: null,
    speculativeDecodeAcceptance: null,
    kvCacheTokens: usage.prompt_tokens_details?.cached_tokens ?? null,
    metricSource: "oMLX /v1/chat/completions streaming include_usage",
  };
}
|
|
688
|
+
|
|
689
|
+
/**
 * Sample speed metrics from an Ollama server by sending one short, non-streaming
 * request to `/api/generate` and deriving rates from the reported counts and
 * durations (the code treats durations as nanoseconds).
 *
 * A metric whose source field is missing from the response is reported as `null`
 * ("unknown") rather than 0, so a missing `prompt_eval_duration` or
 * `load_duration` is not displayed as an instantaneous "0 ms".
 *
 * @param {object} profile - Reads `modelAlias`, `baseUrl` (a trailing `/v1` is stripped)
 *   and, when `baseUrl` is not set, `backend` to look up the backend's `apiBaseUrl`.
 * @returns {Promise<object>} Prefill/generation tokens per second, `ttftMs` (prompt
 *   evaluation time), `modelLoadMs` and the metric source label; unknown values are `null`.
 * @throws {Error} On a non-OK HTTP status.
 */
async function queryOllamaMetrics(profile) {
  const body = {
    model: profile.modelAlias,
    prompt: BENCH_SPEED_PROMPT,
    stream: false,
  };

  const apiBaseUrl = (profile.baseUrl
    ? profile.baseUrl.replace(/\/v1\/?$/u, "")
    : backendFor(profile.backend).apiBaseUrl).replace(/\/$/u, "");

  const response = await fetch(`${apiBaseUrl}/api/generate`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(60000),
  });

  if (!response.ok) {
    throw new Error(`Ollama speed query failed: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();

  // Distinguish "not reported" (null) from a genuine zero.
  const finiteOrNull = (value) => (typeof value === "number" && Number.isFinite(value) ? value : null);
  const nanosToMs = (nanos) => (nanos === null ? null : nanos / 1e6);
  const tokensPerSecond = (count, nanos) => (count !== null && nanos !== null && nanos > 0 ? count / (nanos / 1e9) : null);

  const promptEvalNs = finiteOrNull(data.prompt_eval_duration);
  const evalNs = finiteOrNull(data.eval_duration);
  const loadNs = finiteOrNull(data.load_duration);
  const promptEvalCount = finiteOrNull(data.prompt_eval_count);
  const evalCount = finiteOrNull(data.eval_count);

  return {
    prefillTokensPerSecond: tokensPerSecond(promptEvalCount, promptEvalNs),
    generationTokensPerSecond: tokensPerSecond(evalCount, evalNs),
    ttftMs: nanosToMs(promptEvalNs),
    modelLoadMs: nanosToMs(loadNs),
    speculativeDecodeAcceptance: null,
    kvCacheTokens: null,
    metricSource: "Ollama /api/generate",
  };
}
|
|
729
|
+
|
|
730
|
+
// ── Finalize benchmark run metadata ──────────────────────────────────────
|
|
731
|
+
|
|
732
|
+
/**
 * Merge the outcome of a run into the run directory's `metadata.json` and write it back.
 *
 * Sets `status` ("failed" when `runResult.error` is truthy, otherwise "completed"),
 * the matching timestamps, `runner.tokenMetrics`, `runner.speedMetrics`,
 * `runner.metricSource`, `results` and, on failure, `error`.
 * `results.success` is true only when the required output (`analysis.ipynb` for
 * "data-science" runs, `index.html` otherwise) exists and is not blank.
 *
 * @param {string} runDirectory - Prepared run directory containing `metadata.json`.
 * @param {object} runResult - Counters and optional `error` (string or object with `message`/`stack`).
 * @param {object|null} speedMetrics - Speed metrics to record, or `null`.
 * @returns {Promise<object>} The updated metadata object that was written to disk.
 */
export async function finalizeBenchmarkRun(runDirectory, runResult, speedMetrics) {
  const metadataPath = join(runDirectory, "metadata.json");
  const metadata = JSON.parse(await readFile(metadataPath, "utf8"));
  const timestamp = new Date().toISOString();

  const isDataScience = (metadata.kind ?? "visual") === "data-science";
  const requiredFile = isDataScience ? "analysis.ipynb" : "index.html";
  const candidates = isDataScience
    ? [requiredFile, "summary.json", "chart-distribution.png", "preview.mp4"]
    : [requiredFile, "preview.png", "preview.webm", "preview.mp4"];
  const outputFiles = candidates.filter((name) => existsSync(join(runDirectory, name)));

  const requiredPath = join(runDirectory, requiredFile);
  let success = false;
  if (existsSync(requiredPath)) {
    const contents = await readFile(requiredPath, "utf8");
    success = contents.trim().length > 0;
  }

  const failure = runResult.error;
  metadata.status = failure ? "failed" : "completed";
  metadata.updatedAt = timestamp;
  metadata[failure ? "failedAt" : "completedAt"] = timestamp;

  const { promptTokens, completionTokens, totalTokens } = runResult;
  metadata.runner.tokenMetrics = { reported: true, promptTokens, completionTokens, totalTokens };
  metadata.runner.speedMetrics = speedMetrics;
  metadata.runner.metricSource = speedMetrics?.metricSource ?? null;

  const { wallClockMs, agentTurns, toolCalls, toolResults, perTurn } = runResult;
  metadata.results = { wallClockMs, agentTurns, toolCalls, toolResults, success, outputFiles, perTurn };

  if (failure) {
    if (typeof failure === "string") {
      metadata.error = { message: failure };
    } else {
      const details = { message: failure.message ?? "Unknown error" };
      if (failure.stack) details.stack = failure.stack;
      metadata.error = details;
    }
  }

  await writeFile(metadataPath, JSON.stringify(metadata, null, 2) + "\n", "utf8");
  return metadata;
}
|
|
789
|
+
|
|
790
|
+
/**
 * Make sure the profile's model server is reachable before a benchmark run.
 *
 * If the server already answers at `profile.baseUrl` nothing is started. A backend
 * of type "managed-server" is never started from here; for any other type the
 * server is started and awaited until ready.
 *
 * @param {object} profile - Profile describing backend, label and base URL.
 * @returns {Promise<{ started: boolean, state?: object }>} `started` tells the caller
 *   whether this call launched the server; `state` is the value returned by `startServer`.
 * @throws {Error} When a "managed-server" backend is not running, or start/readiness fails.
 */
async function ensureServerForBenchmark(profile) {
  const backend = backendFor(profile.backend);

  const alreadyUp = await serverReady(profile.baseUrl);
  if (alreadyUp) {
    console.log(pc.green(`[ready] ${backend.label} at ${profile.baseUrl}`));
    return { started: false };
  }

  const isManaged = backend.type === "managed-server";
  if (isManaged) {
    throw new Error(`${backend.label} is not running at ${profile.baseUrl}. Start it and try again.`);
  }

  console.log(pc.dim(`Starting ${backend.label} for ${profile.label}...`));
  const state = await startServer(profile);
  await waitForReady(profile, state?.pid, state?.rawLogPath);
  console.log(pc.green(`[ready] ${profile.baseUrl}/models`));

  return { started: true, state };
}
|
|
807
|
+
|
|
808
|
+
/**
 * Execute an already prepared benchmark run end to end: ensure the model server is
 * up, make sure Pi knows the model, run Pi, sample speed metrics, finalize
 * `metadata.json` and print the summary.
 *
 * While this runs, SIGINT (Ctrl-C) is turned into a cancellation request instead
 * of terminating the process. Cancellation — from SIGINT or `options.signal` —
 * is honoured at every stage, including while the server is still starting; a
 * cancelled run is finalized as failed with the message "Cancelled by user".
 *
 * @param {object} profile - Profile to benchmark.
 * @param {string} runDirectory - Prepared run directory.
 * @param {{ signal?: AbortSignal, keepServer?: boolean }} [options] - `signal` cancels the
 *   run; `keepServer` leaves a server started by this call running afterwards.
 * @returns {Promise<object|null>} The finalized metadata, or `null` when Pi is not installed.
 */
export async function runPreparedBenchmark(profile, runDirectory, options = {}) {
  const controller = new AbortController();
  const forwardAbort = () => controller.abort();
  if (options.signal) {
    if (options.signal.aborted) {
      // The 'abort' event already fired; subscribing now would never trigger.
      controller.abort();
    } else {
      options.signal.addEventListener("abort", forwardAbort, { once: true });
    }
  }
  let serverStarted = false;
  let metadata = null;

  const onSigint = () => {
    controller.abort();
  };
  process.on("SIGINT", onSigint);

  // Installing a SIGINT handler suppresses Node's default exit, and an abort that
  // happens before Pi is launched fires its event before anyone listens for it.
  // Checking between stages keeps the run cancellable the whole time.
  const throwIfCancelled = () => {
    if (controller.signal.aborted) {
      throw new Error("Cancelled by user");
    }
  };

  try {
    if (!(await hasPi())) {
      console.log(pc.yellow("\nPi is not installed. Run prepared for manual execution."));
      return metadata;
    }
    throwIfCancelled();

    const serverState = await ensureServerForBenchmark(profile);
    serverStarted = serverState.started;
    throwIfCancelled();

    if (!(await hasPiModel(profile))) {
      await syncPiConfig(profile);
    }
    throwIfCancelled();

    const runResult = await runBenchmarkInPi(profile, runDirectory, { signal: controller.signal });

    let speedMetrics = null;
    if (!runResult.error) {
      try {
        speedMetrics = await queryServerMetrics(profile);
      } catch (err) {
        runResult.error = { message: `Speed metrics query failed: ${err.message}` };
      }
    }

    metadata = await finalizeBenchmarkRun(runDirectory, runResult, speedMetrics);
    renderBenchmarkSummary(metadata);
  } catch (err) {
    // Anything that failed before/around the Pi run is recorded as a failed run
    // with empty counters so the run directory still gets a final status.
    const failedResult = {
      error: { message: err.message },
      wallClockMs: null,
      agentTurns: 0,
      promptTokens: 0,
      completionTokens: 0,
      totalTokens: 0,
      cacheRead: 0,
      cacheWrite: 0,
      toolCalls: 0,
      toolResults: 0,
      perTurn: [],
    };
    metadata = await finalizeBenchmarkRun(runDirectory, failedResult, null);
    renderBenchmarkSummary(metadata);
  } finally {
    process.removeListener("SIGINT", onSigint);
    // Do not leave a listener behind on a caller-owned, possibly long-lived signal.
    options.signal?.removeEventListener("abort", forwardAbort);
    if (serverStarted && !options.keepServer) {
      const backend = backendFor(profile.backend);
      if (backend.type !== "managed-server") {
        const result = await stopProfile(profile);
        console.log(result.stopped ? pc.green(`[stop] ${result.message}`) : pc.dim(`[stop] ${result.message}`));
      }
    }
  }

  return metadata;
}
|
|
876
|
+
|
|
877
|
+
/**
 * Format a numeric metric, or show a dimmed dash when there is no usable number.
 *
 * @param {*} value - Metric value; anything that is not a finite number counts as missing.
 * @param {(n: number) => string} formatter - Applied to finite numbers only.
 * @returns {string} `formatter(value)`, or a dimmed "—".
 */
function formatMetric(value, formatter) {
  // Number.isFinite is false for null, undefined, NaN, ±Infinity and non-numbers alike.
  return Number.isFinite(value) ? formatter(value) : pc.dim("—");
}
|
|
881
|
+
|
|
882
|
+
/**
 * Format a duration given in milliseconds: whole milliseconds below one second,
 * seconds with one decimal from one second up.
 *
 * @param {number|null|undefined} ms - Duration in milliseconds.
 * @returns {string} e.g. "250 ms" or "1.5 s"; missing values are delegated to `formatMetric`.
 */
function formatMs(ms) {
  const describe = (duration) => {
    if (duration < 1000) {
      return `${Math.round(duration)} ms`;
    }
    return `${(duration / 1000).toFixed(1)} s`;
  };
  return formatMetric(ms, describe);
}
|
|
885
|
+
|
|
886
|
+
/**
 * Format a count using the runtime's default locale.
 *
 * @param {number|null|undefined} n - Value to format.
 * @returns {string} `n.toLocaleString()`; missing values are delegated to `formatMetric`.
 */
function formatNumber(n) {
  const localize = (count) => count.toLocaleString();
  return formatMetric(n, localize);
}
|
|
889
|
+
|
|
890
|
+
/**
 * Format a throughput value as tokens per second with one decimal.
 *
 * @param {number|null|undefined} n - Tokens per second.
 * @returns {string} e.g. "12.3 tok/s"; missing values are delegated to `formatMetric`.
 */
function formatTokPerSec(n) {
  const describe = (rate) => {
    const rounded = rate.toFixed(1);
    return `${rounded} tok/s`;
  };
  return formatMetric(n, describe);
}
|
|
893
|
+
|
|
894
|
+
/**
 * Format a ratio (0–1) as a whole-number percentage.
 *
 * @param {number|null|undefined} n - Ratio to format.
 * @returns {string} e.g. "50 %"; missing values are delegated to `formatMetric`.
 */
function formatPercent(n) {
  const describe = (ratio) => {
    const percent = ratio * 100;
    return `${percent.toFixed(0)} %`;
  };
  return formatMetric(n, describe);
}
|
|
897
|
+
|
|
898
|
+
/**
 * Print the result of a finalized benchmark run.
 *
 * Always prints a "Benchmark Result" section. A completed run that has speed
 * metrics additionally gets a "Speed Metrics" section; otherwise, if the
 * metadata carries an error, an "Error" section with its message is printed.
 *
 * @param {object} metadata - Finalized run metadata (`status`, `results`, `runner`, `error`).
 * @returns {void}
 */
export function renderBenchmarkSummary(metadata) {
  const { status, results, runner, error } = metadata;
  const completed = status === "completed";
  const tokens = runner?.tokenMetrics;
  const hasOutputFiles = (results?.outputFiles?.length ?? 0) > 0;

  const agentRows = [
    ["Status", completed ? pc.green("completed") : pc.red(status ?? "failed")],
    ["Duration", formatMs(results?.wallClockMs)],
    ["Agent turns", formatNumber(results?.agentTurns)],
    ["Input tokens", formatNumber(tokens?.promptTokens)],
    ["Output tokens", formatNumber(tokens?.completionTokens)],
    ["Total tokens", formatNumber(tokens?.totalTokens)],
    ["Tool calls", formatNumber(results?.toolCalls)],
    ["Tool results", formatNumber(results?.toolResults)],
    ["Output files", hasOutputFiles ? results.outputFiles.join(", ") : pc.dim("—")],
  ];

  console.log("");
  console.log(renderSection("Benchmark Result", renderRows(agentRows)));

  const speed = runner?.speedMetrics;
  if (completed && speed) {
    const speedRows = [
      ["Prefill tok/s", formatTokPerSec(speed.prefillTokensPerSecond)],
      ["Generation tok/s", formatTokPerSec(speed.generationTokensPerSecond)],
      ["TTFT", formatMs(speed.ttftMs)],
      ["Speculative decode", formatPercent(speed.speculativeDecodeAcceptance)],
      ["KV cache tokens", formatNumber(speed.kvCacheTokens)],
      ["Model load time", formatMs(speed.modelLoadMs)],
      ["Metric source", speed.metricSource ?? pc.dim("—")],
    ];
    console.log(renderSection("Speed Metrics", renderRows(speedRows)));
    return;
  }

  if (error) {
    console.log(renderSection("Error", pc.red(error.message ?? "Unknown error")));
  }
}
|
|
932
|
+
|
|
933
|
+
/**
 * Derive the model-source label recorded for a benchmark run.
 *
 * @param {object|null|undefined} profile - Selected profile, if any; reads
 *   `providerId` and `backend`.
 * @returns {string} "cloud" when there is no profile; "llama-cpp-mtp" when the
 *   provider id says so (checked before the backend); "ollama" or "omlx" for
 *   those backends; "llama-cpp" for everything else.
 */
function benchmarkModelSource(profile) {
  if (!profile) return "cloud";
  if (profile.providerId === "llama-cpp-mtp") return "llama-cpp-mtp";

  switch (profile.backend) {
    case "ollama":
      return "ollama";
    case "omlx":
      return "omlx";
    default:
      return "llama-cpp";
  }
}
|
|
937
|
+
|
|
938
|
+
/**
 * Ask which benchmark action to take.
 *
 * When `canRun` is truthy both "run" and "prepare" are offered and "run" is
 * the default; otherwise only "prepare" is offered and it is the default.
 *
 * @param {object} prompt - Prompt object; its `choice(label, choices, default)`
 *   method is awaited.
 * @param {boolean} canRun - Whether the automated "run" action may be offered.
 * @returns {Promise<*>} Whatever `prompt.choice` resolves to.
 */
async function chooseBenchmarkAction(prompt, canRun) {
  const manualOption = { value: "prepare", label: "Prepare Benchmark (manual)", hint: "Copy prompt and run yourself" };
  if (!canRun) {
    return await prompt.choice("Action", [manualOption], "prepare");
  }

  const automatedOption = { value: "run", label: "Run Benchmark", hint: "Automated with Pi" };
  return await prompt.choice("Action", [automatedOption, manualOption], "run");
}
|
|
945
|
+
|
|
213
946
|
// ── Benchmark from a selected profile (from model picker) ────────────────
|
|
214
947
|
|
|
215
948
|
export async function benchmarkForProfile(profile) {
|
|
@@ -237,10 +970,19 @@ export async function benchmarkForProfile(profile) {
|
|
|
237
970
|
if (!selectedBenchmark) return;
|
|
238
971
|
|
|
239
972
|
const modelId = profile.modelAlias;
|
|
240
|
-
const modelSource = profile
|
|
973
|
+
const modelSource = benchmarkModelSource(profile);
|
|
241
974
|
const backendLabel = backendFor(profile.backend).label;
|
|
242
975
|
|
|
243
|
-
|
|
976
|
+
const canRun = (await hasPi()) && modelSource !== "cloud";
|
|
977
|
+
const action = await chooseBenchmarkAction(prompt, canRun);
|
|
978
|
+
|
|
979
|
+
const runDirectory = await prepareBenchmarkRun({ repoPath, benchmark: selectedBenchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps: action === "prepare" });
|
|
980
|
+
|
|
981
|
+
if (action === "run") {
|
|
982
|
+
return await runPreparedBenchmark(profile, runDirectory);
|
|
983
|
+
}
|
|
984
|
+
|
|
985
|
+
return runDirectory;
|
|
244
986
|
} finally {
|
|
245
987
|
prompt.close();
|
|
246
988
|
}
|
|
@@ -294,7 +1036,7 @@ export async function benchmarkFlow() {
|
|
|
294
1036
|
profile = profiles.find((p) => p.id === profileId);
|
|
295
1037
|
if (!profile) return;
|
|
296
1038
|
modelId = profile.modelAlias;
|
|
297
|
-
modelSource = profile
|
|
1039
|
+
modelSource = benchmarkModelSource(profile);
|
|
298
1040
|
backendLabel = backendFor(profile.backend).label;
|
|
299
1041
|
} else {
|
|
300
1042
|
backendLabel = await prompt.text("Backend label", "cloud");
|
|
@@ -303,7 +1045,16 @@ export async function benchmarkFlow() {
|
|
|
303
1045
|
modelSource = "cloud";
|
|
304
1046
|
}
|
|
305
1047
|
|
|
306
|
-
|
|
1048
|
+
const canRun = (await hasPi()) && modelSource !== "cloud" && profile != null;
|
|
1049
|
+
const action = await chooseBenchmarkAction(prompt, canRun);
|
|
1050
|
+
|
|
1051
|
+
const runDirectory = await prepareBenchmarkRun({ repoPath, benchmark: selectedBenchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps: action === "prepare" });
|
|
1052
|
+
|
|
1053
|
+
if (action === "run" && profile) {
|
|
1054
|
+
return await runPreparedBenchmark(profile, runDirectory);
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
return runDirectory;
|
|
307
1058
|
} finally {
|
|
308
1059
|
prompt.close();
|
|
309
1060
|
}
|