offgrid-ai 0.13.0 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -1
- package/src/benchmark/flow.mjs +32 -47
- package/src/benchmark/prepare.mjs +2 -5
- package/src/benchmark/sdk-runner.mjs +363 -0
- package/src/benchmark/shared.mjs +0 -8
- package/src/benchmark.mjs +2 -2
- package/src/commands/run.mjs +0 -12
- package/src/harness-pi.mjs +3 -3
- package/src/model-presenters.mjs +0 -15
- package/src/process.mjs +5 -1
- package/src/profile-setup.mjs +8 -73
- package/src/profiles.mjs +4 -21
- package/src/benchmark/pi-runner.mjs +0 -257
- package/src/benchmark/stream-renderer.mjs +0 -302
- package/src/command.mjs +0 -21
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "offgrid-ai",
|
|
3
|
-
"version": "0.13.0",
|
|
3
|
+
"version": "0.14.1",
|
|
4
4
|
"description": "Privacy-first CLI for running local LLMs — discover, configure, run, benchmark",
|
|
5
5
|
"author": "Eeshan Srivastava (https://eeshans.com)",
|
|
6
6
|
"type": "module",
|
|
@@ -43,6 +43,9 @@
|
|
|
43
43
|
},
|
|
44
44
|
"dependencies": {
|
|
45
45
|
"@clack/prompts": "^1.4.0",
|
|
46
|
+
"@earendil-works/pi-agent-core": "^0.80.3",
|
|
47
|
+
"@earendil-works/pi-ai": "^0.80.3",
|
|
48
|
+
"@earendil-works/pi-coding-agent": "^0.80.3",
|
|
46
49
|
"picocolors": "^1.1.0"
|
|
47
50
|
},
|
|
48
51
|
"keywords": [
|
package/src/benchmark/flow.mjs
CHANGED
|
@@ -3,14 +3,13 @@
|
|
|
3
3
|
import { join } from "node:path";
|
|
4
4
|
import { ensureDirs } from "../config.mjs";
|
|
5
5
|
import { backendFor } from "../backends.mjs";
|
|
6
|
-
import { hasPi, hasPiModel, syncPiConfig } from "../harness-pi.mjs";
|
|
7
6
|
import { serverReady, startServer, waitForReady, stopProfile, modelAvailableOnServer, unloadModelFromServer } from "../process.mjs";
|
|
8
7
|
import { loadProfiles } from "../profiles.mjs";
|
|
9
8
|
import { pc, createPrompt } from "../ui.mjs";
|
|
10
9
|
import { linkBenchmarkRepo } from "./repo.mjs";
|
|
11
10
|
import { loadBenchmarks } from "./shared.mjs";
|
|
12
11
|
import { prepareBenchmarkRun } from "./prepare.mjs";
|
|
13
|
-
import { runBenchmarkInPi } from "./pi-runner.mjs";
|
|
12
|
+
import { runBenchmarkInPi } from "./sdk-runner.mjs";
|
|
14
13
|
import { queryServerMetrics } from "./metrics.mjs";
|
|
15
14
|
import { finalizeBenchmarkRun, renderBenchmarkSummary } from "./finalize.mjs";
|
|
16
15
|
|
|
@@ -63,7 +62,7 @@ export async function runPreparedBenchmark(profile, runDirectory, options = {})
|
|
|
63
62
|
}
|
|
64
63
|
let serverStarted = false;
|
|
65
64
|
let benchmarkStarted = false;
|
|
66
|
-
let metadata
|
|
65
|
+
let metadata;
|
|
67
66
|
|
|
68
67
|
const onSigint = () => {
|
|
69
68
|
controller.abort();
|
|
@@ -71,18 +70,9 @@ export async function runPreparedBenchmark(profile, runDirectory, options = {})
|
|
|
71
70
|
process.on("SIGINT", onSigint);
|
|
72
71
|
|
|
73
72
|
try {
|
|
74
|
-
if (!(await hasPi())) {
|
|
75
|
-
console.log(pc.yellow("\nPi is not installed. Run prepared for manual execution."));
|
|
76
|
-
return metadata;
|
|
77
|
-
}
|
|
78
|
-
|
|
79
73
|
const serverState = await ensureServerForBenchmark(profile);
|
|
80
74
|
serverStarted = serverState.started;
|
|
81
75
|
|
|
82
|
-
if (!(await hasPiModel(profile))) {
|
|
83
|
-
await syncPiConfig(profile);
|
|
84
|
-
}
|
|
85
|
-
|
|
86
76
|
benchmarkStarted = true;
|
|
87
77
|
const runResult = await runBenchmarkInPi(profile, runDirectory, { signal: controller.signal });
|
|
88
78
|
|
|
@@ -138,6 +128,28 @@ export async function runPreparedBenchmark(profile, runDirectory, options = {})
|
|
|
138
128
|
return metadata;
|
|
139
129
|
}
|
|
140
130
|
|
|
131
|
+
// ── Shared benchmark selection ────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
/**
 * Interactively pick a benchmark category and then one benchmark of that category.
 *
 * @param {object} prompt - Prompt helper exposing `choice(label, options, defaultValue)`.
 * @param {string} repoPath - Root of the linked benchmark repository; benchmarks are
 *   loaded from its `benchmarks` sub-directory.
 * @returns {Promise<{kind: string, benchmark: object} | null>} The chosen category and
 *   benchmark, or `null` when the category is empty or the chosen id matches nothing.
 */
async function selectBenchmark(prompt, repoPath) {
  const categoryOptions = [
    { value: "visual", label: "Visual Benchmark", hint: "HTML/CSS/JS animation benchmarks" },
    { value: "data-science", label: "Data Science", hint: "Analysis and charting benchmarks" },
  ];
  const kind = await prompt.choice("Benchmark category", categoryOptions, "visual");

  const benchDir = join(repoPath, "benchmarks");
  const available = await loadBenchmarks(benchDir);
  const benchmarks = available.filter((entry) => entry.kind === kind);
  if (benchmarks.length === 0) {
    console.log(pc.yellow(`No ${kind} benchmarks found in ${benchDir}`));
    return null;
  }

  // The first benchmark of the category is the pre-selected default.
  const promptOptions = benchmarks.map(({ id, title, description }) => ({
    value: id,
    label: title,
    hint: description || id,
  }));
  const benchmarkId = await prompt.choice("Prompt", promptOptions, benchmarks[0].id);

  const benchmark = benchmarks.find(({ id }) => id === benchmarkId);
  return benchmark ? { kind, benchmark } : null;
}
|
|
152
|
+
|
|
141
153
|
// ── Benchmark from a selected profile (from model picker) ────────────────
|
|
142
154
|
|
|
143
155
|
export async function benchmarkForProfile(profile) {
|
|
@@ -147,28 +159,15 @@ export async function benchmarkForProfile(profile) {
|
|
|
147
159
|
const repoPath = await linkBenchmarkRepo(prompt);
|
|
148
160
|
if (!repoPath) return;
|
|
149
161
|
|
|
150
|
-
const
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
], "visual");
|
|
154
|
-
|
|
155
|
-
const benchDir = join(repoPath, "benchmarks");
|
|
156
|
-
const benchmarks = (await loadBenchmarks(benchDir)).filter((b) => b.kind === kind);
|
|
157
|
-
if (benchmarks.length === 0) {
|
|
158
|
-
console.log(pc.yellow(`No ${kind} benchmarks found in ${benchDir}`));
|
|
159
|
-
return;
|
|
160
|
-
}
|
|
161
|
-
const benchmarkId = await prompt.choice("Prompt", benchmarks.map((b) => ({
|
|
162
|
-
value: b.id, label: b.title, hint: b.description || b.id,
|
|
163
|
-
})), benchmarks[0].id);
|
|
164
|
-
const selectedBenchmark = benchmarks.find((b) => b.id === benchmarkId);
|
|
165
|
-
if (!selectedBenchmark) return;
|
|
162
|
+
const selected = await selectBenchmark(prompt, repoPath);
|
|
163
|
+
if (!selected) return;
|
|
164
|
+
const { kind, benchmark: selectedBenchmark } = selected;
|
|
166
165
|
|
|
167
166
|
const modelId = profile.modelAlias;
|
|
168
167
|
const modelSource = benchmarkModelSource(profile);
|
|
169
168
|
const backendLabel = backendFor(profile.backend).label;
|
|
170
169
|
|
|
171
|
-
const canRun =
|
|
170
|
+
const canRun = modelSource !== "cloud";
|
|
172
171
|
const action = await chooseBenchmarkAction(prompt, canRun);
|
|
173
172
|
|
|
174
173
|
const runDirectory = await prepareBenchmarkRun({ repoPath, benchmark: selectedBenchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps: action === "prepare" });
|
|
@@ -187,28 +186,14 @@ export async function benchmarkForProfile(profile) {
|
|
|
187
186
|
|
|
188
187
|
export async function benchmarkFlow() {
|
|
189
188
|
await ensureDirs();
|
|
190
|
-
|
|
191
189
|
const prompt = createPrompt();
|
|
192
190
|
try {
|
|
193
191
|
const repoPath = await linkBenchmarkRepo(prompt);
|
|
194
192
|
if (!repoPath) return;
|
|
195
193
|
|
|
196
|
-
const
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
], "visual");
|
|
200
|
-
|
|
201
|
-
const benchDir = join(repoPath, "benchmarks");
|
|
202
|
-
const benchmarks = (await loadBenchmarks(benchDir)).filter((b) => b.kind === kind);
|
|
203
|
-
if (benchmarks.length === 0) {
|
|
204
|
-
console.log(pc.yellow(`No ${kind} benchmarks found in ${benchDir}`));
|
|
205
|
-
return;
|
|
206
|
-
}
|
|
207
|
-
const benchmarkId = await prompt.choice("Prompt", benchmarks.map((b) => ({
|
|
208
|
-
value: b.id, label: b.title, hint: b.description || b.id,
|
|
209
|
-
})), benchmarks[0].id);
|
|
210
|
-
const selectedBenchmark = benchmarks.find((b) => b.id === benchmarkId);
|
|
211
|
-
if (!selectedBenchmark) return;
|
|
194
|
+
const selected = await selectBenchmark(prompt, repoPath);
|
|
195
|
+
if (!selected) return;
|
|
196
|
+
const { kind, benchmark: selectedBenchmark } = selected;
|
|
212
197
|
|
|
213
198
|
const profiles = await loadProfiles();
|
|
214
199
|
const source = await prompt.choice("Model source", [
|
|
@@ -238,7 +223,7 @@ export async function benchmarkFlow() {
|
|
|
238
223
|
modelSource = "cloud";
|
|
239
224
|
}
|
|
240
225
|
|
|
241
|
-
const canRun =
|
|
226
|
+
const canRun = modelSource !== "cloud" && profile != null;
|
|
242
227
|
const action = await chooseBenchmarkAction(prompt, canRun);
|
|
243
228
|
|
|
244
229
|
const runDirectory = await prepareBenchmarkRun({ repoPath, benchmark: selectedBenchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps: action === "prepare" });
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import { mkdir, writeFile } from "node:fs/promises";
|
|
4
4
|
import { join } from "node:path";
|
|
5
5
|
import { pc, renderRows, renderSection } from "../ui.mjs";
|
|
6
|
-
import { slugModelId, createRunId
|
|
6
|
+
import { slugModelId, createRunId } from "./shared.mjs";
|
|
7
7
|
import { parseModelName } from "../model-name.mjs";
|
|
8
8
|
|
|
9
9
|
function harnessDisplayName(id) {
|
|
@@ -30,7 +30,7 @@ function printBenchmarkNextSteps({ repoPath, runDirectory, profile, modelId, run
|
|
|
30
30
|
}
|
|
31
31
|
|
|
32
32
|
export async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps = true }) {
|
|
33
|
-
const toolPrompt =
|
|
33
|
+
const toolPrompt = benchmark.prompt;
|
|
34
34
|
const now = new Date();
|
|
35
35
|
const runId = createRunId(now);
|
|
36
36
|
const modelSlug = slugModelId(modelId);
|
|
@@ -46,9 +46,6 @@ export async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId,
|
|
|
46
46
|
const baseAssets = {
|
|
47
47
|
metadata: "metadata.json",
|
|
48
48
|
prompt: "prompt.md",
|
|
49
|
-
rawResponse: "response.raw.txt",
|
|
50
|
-
stream: "stream.ndjson",
|
|
51
|
-
stderr: "stderr.log",
|
|
52
49
|
};
|
|
53
50
|
const metadata = {
|
|
54
51
|
schemaVersion: 1,
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
// ── Run benchmark via Pi SDK (no subprocess, no NDJSON parsing) ────────────────
|
|
2
|
+
|
|
3
|
+
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { join, relative, basename } from "node:path";
|
|
5
|
+
import { Agent } from "@earendil-works/pi-agent-core";
|
|
6
|
+
import { streamSimple } from "@earendil-works/pi-ai/compat";
|
|
7
|
+
import { createCodingTools } from "@earendil-works/pi-coding-agent";
|
|
8
|
+
import { pc, formatBytes } from "../ui.mjs";
|
|
9
|
+
import { piApiModelId, modelReasoning, modelCompat } from "../harness-pi.mjs";
|
|
10
|
+
|
|
11
|
+
// Colour roles used by the benchmark renderer; each entry is a picocolors formatter.
const C = {
  // Streamed model output.
  thinking: pc.magenta,
  text: pc.green,
  // Tool activity lines.
  tool: pc.yellow,
  // Outcome markers.
  success: pc.green,
  warning: pc.yellow,
  error: pc.red,
  // Headings and low-emphasis status text.
  info: pc.cyan,
  dim: pc.dim,
};
|
|
21
|
+
|
|
22
|
+
/**
 * Run a prepared benchmark in-process through the Pi agent SDK, rendering progress to
 * the terminal and collecting per-run and per-turn metrics.
 *
 * The prompt is read from `prompt.md` inside `runDirectory`; the coding tools and the
 * system prompt are both scoped to that directory.
 *
 * @param {object} profile - Model profile; `providerId`, `reasoning` and the fields read
 *   by `buildModel` / `piApiModelId` are used.
 * @param {string} runDirectory - Prepared run directory containing `prompt.md`.
 * @param {{signal?: AbortSignal}} [options] - Optional abort signal; aborting cancels the
 *   agent and the result carries a "Cancelled by user" error.
 * @returns {Promise<object>} Run result with token totals, tool counters, `perTurn`
 *   entries, `rawResponseLines`, `wallClockMs` and `error` (`null` on success).
 */
export async function runBenchmarkInPi(profile, runDirectory, { signal } = {}) {
  const model = buildModel(profile);
  const tools = createCodingTools(runDirectory);
  const systemPrompt = buildSystemPrompt(runDirectory);
  const promptText = await readFile(join(runDirectory, "prompt.md"), "utf8");

  const runResult = {
    model: `${profile.providerId}/${piApiModelId(profile)}`,
    exitCode: 0,
    wallClockMs: null,
    agentTurns: 0,
    promptTokens: 0,
    completionTokens: 0,
    totalTokens: 0,
    cacheRead: 0,
    cacheWrite: 0,
    toolCalls: 0,
    toolResults: 0,
    perTurn: [],
    rawResponseLines: [],
    error: null,
  };

  const runStartMs = Date.now();
  let currentTurnStartMs = null;
  let lastTurnEndMs = null;
  let turnToolCalls = 0;
  let responseBuffer = "";
  // Any non-empty OFFGRID_BENCHMARK_VERBOSE value echoes thinking deltas to stdout.
  const verbose = Boolean(process.env.OFFGRID_BENCHMARK_VERBOSE);
  // Arguments are announced on tool start and needed again for the end-of-tool summary.
  const toolArgsByCallId = new Map();

  // ── Status line state ────────────────────────────────────────────────────
  let statusBytes = 0;
  let streamedText = false;
  let execTimer = null;
  let execStartedAt = null;

  // Erase the transient status line (carriage return + clear-to-end-of-line); TTY only.
  function clearStatusLine() {
    if (process.stdout.isTTY) process.stdout.write("\r\x1b[K");
  }

  // Overwrite the transient status line in place; TTY only.
  function printStatusLine(text) {
    if (process.stdout.isTTY) process.stdout.write(`\r\x1b[K${text}`);
  }

  // Stop the "running <tool>… Ns" ticker, if any, and erase its line.
  function stopExecTimer() {
    if (execTimer) {
      clearInterval(execTimer);
      execTimer = null;
    }
    clearStatusLine();
  }

  // Start a once-per-second elapsed-time ticker for the tool that is executing.
  function startExecTimer(toolName) {
    stopExecTimer();
    execStartedAt = Date.now();
    if (!process.stdout.isTTY) return;
    const update = () => {
      const elapsed = Math.floor((Date.now() - execStartedAt) / 1000);
      printStatusLine(C.dim(`running ${toolName}… ${elapsed}s`));
    };
    update();
    execTimer = setInterval(update, 1000);
  }

  const agent = new Agent({
    initialState: {
      systemPrompt,
      model,
      thinkingLevel: profile.reasoning ? "low" : "off",
      tools,
    },
    // The stream call is always made with the placeholder API key "none".
    streamFn: async (mdl, ctx, opts) =>
      streamSimple(mdl, ctx, { ...opts, apiKey: "none" }),
  });

  // ── Event handler: render + collect metrics ──────────────────────────────

  // A rendering failure must not abort the run, so it is reported and swallowed here.
  agent.subscribe((event) => {
    try {
      handleEvent(event);
    } catch (err) {
      console.error(C.error(`[renderer error] ${err.message}`));
    }
  });

  function handleEvent(event) {
    switch (event.type) {
      case "turn_start": {
        stopExecTimer();
        runResult.agentTurns += 1;
        // A turn is timed from the end of the previous one (or from run start).
        currentTurnStartMs = lastTurnEndMs ?? runStartMs;
        turnToolCalls = 0;
        console.log("");
        console.log(C.info(`Turn ${runResult.agentTurns}`));
        break;
      }

      case "message_update": {
        const evt = event.assistantMessageEvent;
        if (!evt) break;
        // Underscores are stripped so "thinking_delta" and "thinkingdelta" both match.
        const sub = String(evt.type ?? "").replace(/_/gu, "");
        if (sub === "thinkingstart") {
          statusBytes = 0;
        } else if (sub === "thinkingdelta") {
          statusBytes += Buffer.byteLength(evt.delta || "", "utf8");
          // Rough estimate of four bytes per token, never shown as less than one.
          const tokens = Math.max(1, Math.ceil(statusBytes / 4));
          printStatusLine(C.dim(`thinking… ${formatBytes(statusBytes)} (~${formatTokens(tokens)} tokens)`));
          if (verbose) process.stdout.write(C.thinking(evt.delta || ""));
        } else if (sub === "textstart") {
          clearStatusLine();
          statusBytes = 0;
        } else if (sub === "textdelta") {
          process.stdout.write(evt.delta || "");
          responseBuffer += evt.delta || "";
          streamedText = true;
        } else if (sub === "toolcallstart") {
          clearStatusLine();
        }
        break;
      }

      case "message_end": {
        if (streamedText) {
          // Terminate the streamed text line before anything else is printed.
          console.log("");
          streamedText = false;
        }
        if (event.message?.role === "assistant") {
          for (const item of event.message.content ?? []) {
            if (item.type === "toolCall") {
              runResult.toolCalls += 1;
              turnToolCalls += 1;
              responseBuffer += `\n[toolCall] ${item.name}\n`;
            }
          }
          if (responseBuffer) {
            runResult.rawResponseLines.push(responseBuffer);
            responseBuffer = "";
          }
        }
        break;
      }

      case "tool_execution_start": {
        clearStatusLine();
        toolArgsByCallId.set(event.toolCallId, event.args);
        console.log(C.tool(formatToolStart(event.toolName, event.args, runDirectory)));
        startExecTimer(event.toolName);
        break;
      }

      case "tool_execution_end": {
        stopExecTimer();
        const { toolName, result, isError, toolCallId } = event;
        const args = toolArgsByCallId.get(toolCallId) ?? {};
        // The arguments are only needed for this summary; drop them so the map
        // does not grow for the whole run.
        toolArgsByCallId.delete(toolCallId);
        const marker = isError ? C.error("✗") : C.success("✓");
        console.log(`${marker} ${toolSummary(toolName, result, isError, args, runDirectory)}`);
        runResult.toolResults += 1;
        break;
      }

      case "turn_end": {
        stopExecTimer();
        clearStatusLine();
        const msg = event.message;
        // Usage from errored or aborted assistant messages is not counted.
        const isFailure = msg?.role === "assistant" && (msg.stopReason === "error" || msg.stopReason === "aborted");
        const usage = !isFailure ? msg?.usage : null;
        if (usage) {
          runResult.promptTokens += usage.input ?? 0;
          runResult.completionTokens += usage.output ?? 0;
          runResult.cacheRead += usage.cacheRead ?? 0;
          runResult.cacheWrite += usage.cacheWrite ?? 0;
        }
        const turnEndMs = Date.now();
        const wallClockMs = currentTurnStartMs ? turnEndMs - currentTurnStartMs : null;
        runResult.perTurn.push({
          turn: runResult.agentTurns,
          inputTokens: usage?.input ?? 0,
          outputTokens: usage?.output ?? 0,
          cacheRead: usage?.cacheRead ?? 0,
          cacheWrite: usage?.cacheWrite ?? 0,
          wallClockMs,
          toolCalls: turnToolCalls,
        });
        lastTurnEndMs = turnEndMs;
        const tokStr = usage ? ` · ${formatTokens(usage.output ?? 0)} tokens` : "";
        console.log(C.success(`✓ turn ${runResult.agentTurns}${tokStr}`));
        break;
      }

      case "agent_end": {
        // Flush any text that was streamed but never closed by a message_end.
        if (responseBuffer) {
          runResult.rawResponseLines.push(responseBuffer);
          responseBuffer = "";
        }
        break;
      }
    }
  }

  // ── Wire abort signal ────────────────────────────────────────────────────

  // A signal aborted before this point never fires "abort" again, so its state is
  // checked up front instead of relying on the listener alone.
  let cancelled = Boolean(signal?.aborted);
  const abortListener = () => {
    cancelled = true;
    agent.abort();
  };
  if (signal && !cancelled) signal.addEventListener("abort", abortListener, { once: true });

  // ── Run ───────────────────────────────────────────────────────────────────

  try {
    if (!cancelled) {
      console.log(C.info("Pi benchmark started"));
      console.log(C.dim(`  Model ${model.provider}/${model.id}`));
      await agent.prompt(promptText);
    }
  } catch (err) {
    if (!cancelled) {
      runResult.error = { message: err?.message ?? String(err) };
    }
  } finally {
    if (signal) signal.removeEventListener("abort", abortListener);
    // If the run ended mid-tool (throw or abort) the ticker would otherwise keep
    // repainting the status line and keep the event loop alive.
    stopExecTimer();
  }

  if (cancelled) {
    runResult.error = { message: "Cancelled by user" };
  }

  // The agent can finish without throwing yet still record an error in its state.
  if (!runResult.error && agent.state.errorMessage) {
    runResult.error = { message: agent.state.errorMessage };
  }

  runResult.wallClockMs = Date.now() - runStartMs;
  runResult.totalTokens = runResult.promptTokens + runResult.completionTokens;

  console.log(C.info("Pi benchmark finished"));
  return runResult;
}
|
|
256
|
+
|
|
257
|
+
// ── Model construction ──────────────────────────────────────────────────────
|
|
258
|
+
|
|
259
|
+
/**
 * Build the model descriptor handed to the Pi agent for a profile.
 *
 * @param {object} profile - Model profile (label, providerId, baseUrl, flags, capabilities…).
 * @returns {object} Model descriptor using the "openai-completions" API with zero cost
 *   and profile-specific compat overrides layered over the defaults below.
 */
function buildModel(profile) {
  // An undecided reasoning capability is treated as "no reasoning".
  const reasoning = modelReasoning(profile) ?? false;
  const compatOverrides = modelCompat(profile) ?? {};

  // Image input is advertised when a projector file or the vision capability is set.
  const acceptsImages = profile.mmprojPath || profile.capabilities?.vision;
  const input = acceptsImages ? ["text", "image"] : ["text"];

  const compat = {
    supportsDeveloperRole: false,
    supportsReasoningEffort: false,
    maxTokensField: "max_tokens",
    ...compatOverrides,
  };

  return {
    id: piApiModelId(profile),
    name: profile.label,
    api: "openai-completions",
    provider: profile.providerId,
    baseUrl: profile.baseUrl,
    reasoning,
    input,
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: profile.flags?.ctxSize ?? 32768,
    maxTokens: 16384,
    compat,
  };
}
|
|
282
|
+
|
|
283
|
+
// ── System prompt ───────────────────────────────────────────────────────────
|
|
284
|
+
|
|
285
|
+
/**
 * Compose the system prompt for the benchmark agent.
 *
 * @param {string} cwd - Working directory shown to the model; backslashes are
 *   rewritten to forward slashes.
 * @returns {string} The prompt text, ending with the local date (YYYY-MM-DD) and `cwd`.
 */
function buildSystemPrompt(cwd) {
  const today = new Date();
  const twoDigits = (value) => String(value).padStart(2, "0");
  const date = [today.getFullYear(), twoDigits(today.getMonth() + 1), twoDigits(today.getDate())].join("-");

  const lines = [
    "You are an expert coding assistant. You help users by reading files, executing commands, editing code, and writing new files.",
    "",
    "Available tools:",
    "- read: Read file contents (supports text and images)",
    "- bash: Execute shell commands",
    "- edit: Apply targeted text replacements to files",
    "- write: Write content to files (creates or overwrites)",
    "",
    "Guidelines:",
    "- Be concise in your responses",
    "- Show file paths clearly when working with files",
    "- Use the write tool to create files — do not return file contents as chat text",
    "- Use bash to run commands and verify your work",
    "",
    `Current date: ${date}`,
    `Current working directory: ${cwd.replace(/\\/gu, "/")}`,
  ];
  return lines.join("\n");
}
|
|
305
|
+
|
|
306
|
+
// ── Rendering helpers ───────────────────────────────────────────────────────
|
|
307
|
+
|
|
308
|
+
/**
 * Format the one-line announcement printed when a tool starts executing.
 *
 * @param {string} toolName - Tool identifier ("read", "write", "edit", "bash", or other).
 * @param {object | null | undefined} args - Tool arguments; a missing value is treated
 *   as an empty object so a tool event without arguments still renders.
 * @param {string} cwd - Run directory used to shorten file paths.
 * @returns {string} Announcement beginning with "→".
 */
function formatToolStart(toolName, args, cwd) {
  const { path, content, edits, command } = args ?? {};
  switch (toolName) {
    case "read":
      return `→ read ${relPath(path, cwd)}`;
    case "write": {
      // The payload size is shown only when there is non-empty content.
      const size = content ? ` · ${formatBytes(Buffer.byteLength(String(content), "utf8"))}` : "";
      return `→ write ${relPath(path, cwd)}${size}`;
    }
    case "edit": {
      const count = Array.isArray(edits) ? edits.length : 0;
      const suffix = count > 0 ? ` · ${count} replacement${count === 1 ? "" : "s"}` : "";
      return `→ edit ${relPath(path, cwd)}${suffix}`;
    }
    case "bash":
      return `→ run ${truncateOneLine(command ?? "")}`;
    default:
      return `→ ${toolName}`;
  }
}
|
|
321
|
+
|
|
322
|
+
/**
 * Format the one-line summary printed after a tool finishes.
 *
 * @param {string} toolName - Tool identifier.
 * @param {object} result - Tool result; its text is extracted with `toolResultText`.
 * @param {boolean} isError - Whether the tool reported a failure.
 * @param {object | null | undefined} args - Arguments the tool was called with.
 * @param {string} cwd - Run directory used to shorten file paths.
 * @returns {string} Human-readable summary (without the leading ✓/✗ marker).
 */
function toolSummary(toolName, result, isError, args, cwd) {
  const text = String(toolResultText(result));
  if (isError) return `${toolName} failed · ${firstLine(text)}`;

  const path = args?.path;
  switch (toolName) {
    case "write": {
      // Pick the byte count out of the tool's "Successfully wrote N bytes" message.
      const written = text.match(/Successfully wrote\s+([0-9,]+)\s+bytes/iu);
      const size = written ? ` · ${formatBytes(Number(written[1].replace(/,/gu, "")))}` : "";
      return `wrote ${relPath(path, cwd)}${size}`;
    }
    case "read": {
      const size = text ? ` · ${formatBytes(Buffer.byteLength(text, "utf8"))}` : "";
      return `read ${relPath(path, cwd)}${size}`;
    }
    case "edit":
      return `edited ${relPath(path, cwd)}`;
    case "bash":
      // firstLine() substitutes "no details" for blank text, so blank output has to
      // be detected here for the "command completed" fallback to ever be used.
      return text.trim() ? firstLine(text) : "command completed";
    default:
      return `${toolName}${text ? ` · ${firstLine(text)}` : ""}`;
  }
}
|
|
335
|
+
|
|
336
|
+
/**
 * Extract the plain text carried by a tool result.
 *
 * @param {object | null | undefined} result - Tool result whose `content` is either a
 *   string or an array of parts with an optional `text` field.
 * @returns {string} The string content as-is, the non-empty part texts joined with
 *   newlines, or "" when `content` has any other shape.
 */
function toolResultText(result) {
  const payload = result?.content;
  if (typeof payload === "string") return payload;
  if (!Array.isArray(payload)) return "";

  const pieces = [];
  for (const part of payload) {
    const chunk = part?.text ?? "";
    if (chunk) pieces.push(chunk);
  }
  return pieces.join("\n");
}
|
|
342
|
+
|
|
343
|
+
/**
 * Return the first non-blank line of `text`, trimmed.
 *
 * @param {unknown} text - Value to scan; `null`/`undefined` count as empty.
 * @returns {string} The first line with visible content, or "no details" if none.
 */
function firstLine(text) {
  for (const rawLine of String(text ?? "").split(/\r?\n/u)) {
    const line = rawLine.trim();
    if (line) return line;
  }
  return "no details";
}
|
|
346
|
+
|
|
347
|
+
/**
 * Shorten a path for display relative to the run directory.
 *
 * @param {unknown} path - Path reported by a tool; falsy values yield "unknown".
 * @param {string} cwd - Directory the path is made relative to.
 * @returns {string} The path relative to `cwd` when it lies inside it, otherwise the
 *   base name (or the raw value when there is no base name).
 */
function relPath(path, cwd) {
  if (!path) return "unknown";
  const target = String(path);
  const rel = relative(cwd, target);
  // Only a leading ".." path *segment* leaves cwd; an entry whose name merely starts
  // with two dots (e.g. "..cache/x.txt") is still inside it.
  const leavesCwd = /^\.\.(?:[\\/]|$)/u.test(rel);
  if (rel && rel !== "." && !leavesCwd) return rel;
  return basename(target) || target;
}
|
|
353
|
+
|
|
354
|
+
/**
 * Collapse a value to a single line and cap its length for display.
 *
 * Length is counted in Unicode code points, so truncation never cuts a surrogate pair
 * (for example an emoji) in half.
 *
 * @param {unknown} value - Value to render; `null`/`undefined` count as empty.
 * @param {number} [max=80] - Maximum number of characters before truncation.
 * @returns {string} Whitespace-normalised text, ending in "…" when it was shortened.
 */
function truncateOneLine(value, max = 80) {
  const text = String(value ?? "").replace(/\s+/gu, " ").trim();
  const chars = Array.from(text);
  if (chars.length <= max) return text;
  // At least one character is always kept in front of the ellipsis.
  return `${chars.slice(0, Math.max(1, max - 1)).join("")}…`;
}
|
|
358
|
+
|
|
359
|
+
/**
 * Format a token count compactly: plain below 1,000, "Nk" for thousands and "N.NM"
 * for millions.
 *
 * @param {number} n - Token count.
 * @returns {string} Compact label such as "999", "12k" or "1.5M".
 */
function formatTokens(n) {
  const count = Math.round(n);
  // From 999,500 upwards the thousands figure would round to "1000k", so those
  // values are promoted to the millions form.
  if (count >= 999_500) return `${(n / 1_000_000).toFixed(1)}M`;
  if (count >= 1_000) return `${Math.round(n / 1_000)}k`;
  return String(count);
}
|
package/src/benchmark/shared.mjs
CHANGED
|
@@ -18,10 +18,6 @@ export function createRunId(date = new Date()) {
|
|
|
18
18
|
return date.toISOString().replace(/:/gu, "-").replace(/\./gu, "-");
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
-
export function buildToolPrompt(benchmark) {
|
|
22
|
-
return benchmark.prompt;
|
|
23
|
-
}
|
|
24
|
-
|
|
25
21
|
export async function loadBenchmarks(benchDir) {
|
|
26
22
|
const entries = await readdir(benchDir);
|
|
27
23
|
const markdownFiles = entries.filter((f) => f.endsWith(".md")).sort();
|
|
@@ -47,8 +43,4 @@ export async function loadBenchmarks(benchDir) {
|
|
|
47
43
|
benchmarks.push({ id, title, description, prompt: content, kind });
|
|
48
44
|
}
|
|
49
45
|
return benchmarks;
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
export function piModelString(profile) {
|
|
53
|
-
return profile.harnesses?.pi?.model ?? `${profile.providerId}/${profile.modelAlias}`;
|
|
54
46
|
}
|
package/src/benchmark.mjs
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
// ── Benchmark module (thin facade) ──────────────────────────────────────────
|
|
2
2
|
// Submodules handle the actual logic. This file re-exports for backward compatibility.
|
|
3
3
|
|
|
4
|
-
export { slugModelId, createRunId,
|
|
4
|
+
export { slugModelId, createRunId, loadBenchmarks } from "./benchmark/shared.mjs";
|
|
5
5
|
export { findBenchmarkRepo, linkBenchmarkRepo } from "./benchmark/repo.mjs";
|
|
6
6
|
export { prepareBenchmarkRun } from "./benchmark/prepare.mjs";
|
|
7
|
-
export { runBenchmarkInPi } from "./benchmark/pi-runner.mjs";
|
|
7
|
+
export { runBenchmarkInPi } from "./benchmark/sdk-runner.mjs";
|
|
8
8
|
export { queryServerMetrics } from "./benchmark/metrics.mjs";
|
|
9
9
|
// unloadModelFromServer now lives in src/process.mjs (managed-server counterpart to stopProfile).
|
|
10
10
|
export { unloadModelFromServer } from "./process.mjs";
|
package/src/commands/run.mjs
CHANGED
|
@@ -149,18 +149,6 @@ function textOnlyProfile(profile) {
|
|
|
149
149
|
mmprojPath: null,
|
|
150
150
|
disabledMmprojPath: profile.disabledMmprojPath ?? profile.mmprojPath,
|
|
151
151
|
capabilities: { ...(profile.capabilities ?? {}), vision: false, visionDisabledReason: "unsupported-mmproj" },
|
|
152
|
-
commandArgv: removeCommandOption(profile.commandArgv ?? [], "--mmproj"),
|
|
153
152
|
});
|
|
154
153
|
}
|
|
155
154
|
|
|
156
|
-
function removeCommandOption(argv, flag) {
|
|
157
|
-
const next = [];
|
|
158
|
-
for (let i = 0; i < argv.length; i++) {
|
|
159
|
-
if (argv[i] === flag) {
|
|
160
|
-
if (argv[i + 1] && !argv[i + 1].startsWith("--")) i += 1;
|
|
161
|
-
continue;
|
|
162
|
-
}
|
|
163
|
-
next.push(argv[i]);
|
|
164
|
-
}
|
|
165
|
-
return next;
|
|
166
|
-
}
|
package/src/harness-pi.mjs
CHANGED
|
@@ -118,7 +118,7 @@ function modelInput(profile) {
|
|
|
118
118
|
return ["text"];
|
|
119
119
|
}
|
|
120
120
|
|
|
121
|
-
function modelCompat(profile) {
|
|
121
|
+
export function modelCompat(profile) {
|
|
122
122
|
if (profile.compat) return profile.compat;
|
|
123
123
|
const family = modelFamily(profile);
|
|
124
124
|
if (family.includes("qwen") || family.includes("gemma-4") || family.includes("gemma 4")) {
|
|
@@ -127,14 +127,14 @@ function modelCompat(profile) {
|
|
|
127
127
|
return null;
|
|
128
128
|
}
|
|
129
129
|
|
|
130
|
-
function modelReasoning(profile) {
|
|
130
|
+
/**
 * Decide whether a profile's model is treated as a reasoning model.
 *
 * @param {object} profile - Model profile.
 * @returns {boolean | undefined} The explicit `profile.reasoning` setting coerced to a
 *   boolean; otherwise `true` for Qwen and Gemma 4 families, `undefined` when unknown.
 */
export function modelReasoning(profile) {
  const { reasoning } = profile;
  // Any explicit setting (including null or false) overrides family detection.
  if (reasoning !== undefined) return Boolean(reasoning);

  const family = modelFamily(profile);
  const reasoningFamilies = ["qwen", "gemma-4", "gemma 4"];
  return reasoningFamilies.some((needle) => family.includes(needle)) ? true : undefined;
}
|
|
136
136
|
|
|
137
|
-
function modelFamily(profile) {
|
|
137
|
+
/**
 * Build a lower-cased search string from a profile's identifying fields, used for
 * substring-based model family detection.
 *
 * @param {object} profile - Model profile.
 * @returns {string} The truthy values of id, label, modelAlias, modelPath and
 *   omlxModel joined with single spaces, in lower case.
 */
export function modelFamily(profile) {
  const { id, label, modelAlias, modelPath, omlxModel } = profile;
  const present = [];
  for (const field of [id, label, modelAlias, modelPath, omlxModel]) {
    if (field) present.push(field);
  }
  return present.join(" ").toLowerCase();
}
|
|
140
140
|
|
package/src/model-presenters.mjs
CHANGED
|
@@ -159,21 +159,6 @@ export function modelSelectOption(item, { runningProfilesNow, modelMissingIds, n
|
|
|
159
159
|
...(hint ? { hint: pc.red(hint) } : {}),
|
|
160
160
|
};
|
|
161
161
|
}
|
|
162
|
-
if (item.type === "new") {
|
|
163
|
-
return {
|
|
164
|
-
value: itemKey(item),
|
|
165
|
-
label: optionLabel({
|
|
166
|
-
status: optionStatusTag("setup"),
|
|
167
|
-
backend: optionBackendTag(backendId),
|
|
168
|
-
source: optionSourceTag(sourceId),
|
|
169
|
-
name: item.label,
|
|
170
|
-
nameWidth,
|
|
171
|
-
quant: optionQuantLabel(item),
|
|
172
|
-
ctx: optionCtxLabel(item),
|
|
173
|
-
size: optionSizeLabel(item),
|
|
174
|
-
}),
|
|
175
|
-
};
|
|
176
|
-
}
|
|
177
162
|
return {
|
|
178
163
|
value: itemKey(item),
|
|
179
164
|
label: optionLabel({
|
package/src/process.mjs
CHANGED
|
@@ -3,7 +3,6 @@ import { promisify } from "node:util";
|
|
|
3
3
|
import { closeSync, openSync } from "node:fs";
|
|
4
4
|
import { readFile, writeFile, chmod } from "node:fs/promises";
|
|
5
5
|
import { basename, join } from "node:path";
|
|
6
|
-
import { quoteShell } from "./command.mjs";
|
|
7
6
|
import { LOG_DIR } from "./config.mjs";
|
|
8
7
|
import { writeState, readState, profileDir } from "./profiles.mjs";
|
|
9
8
|
import { backendFor, backendBinaryFor } from "./backends.mjs";
|
|
@@ -501,4 +500,9 @@ function timestampForFile() {
|
|
|
501
500
|
|
|
502
501
|
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
504
|
+
|
|
505
|
+
/**
 * Quote a value for display or use as a single POSIX shell word.
 *
 * @param {unknown} value - Value to quote; it is converted with `String()`.
 * @returns {string} The text unchanged when it contains only shell-safe characters,
 *   otherwise the text wrapped in single quotes.
 */
function quoteShell(value) {
  const text = String(value);
  if (/^[A-Za-z0-9_/@%+=:,.-]+$/u.test(text)) return text;
  // A single quote cannot appear inside a single-quoted word, so each one closes the
  // quote, emits a double-quoted "'", and reopens it. The replacement must add nothing
  // else: any extra character would become part of the argument.
  return `'${text.replace(/'/gu, `'"'"'`)}'`;
}
|