offgrid-ai 0.16.3 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/package.json +3 -11
- package/resources/recommendations.json +8 -8
- package/src/cli.mjs +1 -4
- package/src/commands/main.mjs +20 -1
- package/src/commands/models.mjs +296 -39
- package/src/commands/onboard.mjs +6 -106
- package/src/commands/run.mjs +2 -4
- package/src/commands/status.mjs +1 -0
- package/src/commands/stop.mjs +1 -0
- package/src/config.mjs +16 -1
- package/src/discovery-shared.mjs +2 -3
- package/src/download.mjs +221 -0
- package/src/harness-pi.mjs +2 -3
- package/src/huggingface.mjs +72 -72
- package/src/managed.mjs +1 -6
- package/src/model-name.mjs +2 -2
- package/src/model-presenters.mjs +5 -36
- package/src/model-summary.mjs +2 -2
- package/src/omlx-runtime.mjs +29 -4
- package/src/process.mjs +3 -5
- package/src/profile-setup.mjs +206 -49
- package/src/profiles.mjs +1 -1
- package/src/runtime.mjs +2 -2
- package/src/ui.mjs +10 -8
- package/resources/hf-download.py +0 -79
- package/src/backend-installers.mjs +0 -42
- package/src/benchmark/finalize.mjs +0 -169
- package/src/benchmark/flow.mjs +0 -240
- package/src/benchmark/metrics.mjs +0 -107
- package/src/benchmark/prepare.mjs +0 -118
- package/src/benchmark/repo.mjs +0 -77
- package/src/benchmark/sdk-runner.mjs +0 -363
- package/src/benchmark/shared.mjs +0 -46
- package/src/benchmark.mjs +0 -12
- package/src/commands/benchmark.mjs +0 -4
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
// ── Create a benchmark run directory ────────────────────────────────────────
|
|
2
|
-
|
|
3
|
-
import { mkdir, writeFile } from "node:fs/promises";
|
|
4
|
-
import { join } from "node:path";
|
|
5
|
-
import { pc, renderRows, renderSection } from "../ui.mjs";
|
|
6
|
-
import { slugModelId, createRunId } from "./shared.mjs";
|
|
7
|
-
import { parseModelName } from "../model-name.mjs";
|
|
8
|
-
|
|
9
|
-
/**
 * Turn a harness identifier into a human-readable label.
 *
 * The "pi" harness is special-cased to "Pi". Any other id has each run of
 * dashes/underscores replaced by a single space and the first character of
 * every word upper-cased.
 *
 * @param {string} id - Harness identifier, e.g. "pi" or "claude-code".
 * @returns {string} Display label, e.g. "Pi" or "Claude Code".
 */
function harnessDisplayName(id) {
  if (id === "pi") {
    return "Pi";
  }
  const spaced = String(id).replace(/[-_]+/gu, " ");
  return spaced.replace(/\b\w/gu, (initial) => initial.toUpperCase());
}
|
|
13
|
-
|
|
14
|
-
/**
 * Pick the display name of the harness a run is expected to be driven by.
 *
 * Harnesses whose config has `enabled === false` are ignored. Among the
 * rest, "pi" wins when present; otherwise the first listed harness is used,
 * and "pi" is the fallback when none remain.
 *
 * @param {object|null|undefined} profile - Profile with an optional `harnesses` map.
 * @returns {string} Harness display name, or "your tool" when no profile is given.
 */
function intendedRunnerForProfile(profile) {
  if (!profile) {
    return "your tool";
  }
  const enabled = Object.entries(profile.harnesses ?? {}).filter(([, config]) => config?.enabled !== false);
  const preferred = enabled.find(([key]) => key === "pi");
  const [id] = preferred ?? enabled[0] ?? ["pi"];
  return harnessDisplayName(id);
}
|
|
20
|
-
|
|
21
|
-
/**
 * Print the manual follow-up steps for a freshly prepared run slot.
 *
 * @param {object} options
 * @param {string} options.repoPath - Benchmark gallery checkout (used in the `npm run dev` hint).
 * @param {string} options.runDirectory - Directory of the prepared run.
 * @param {object|null|undefined} options.profile - When set, its `id` is used to build the `offgrid-ai run` command.
 * @param {string} options.modelId - Model id shown when there is no profile.
 * @param {string} options.runnerLabel - Display name of the tool the prompt is pasted into.
 * @returns {void}
 */
function printBenchmarkNextSteps({ repoPath, runDirectory, profile, modelId, runnerLabel }) {
  // With a profile there is a concrete command to run; without one, a prose hint is shown.
  const runnerCommand = profile ? `offgrid-ai run ${profile.id}` : `Open ${runnerLabel} for ${modelId}`;

  const lines = [
    "",
    pc.bold("Next steps"),
    ` 1. Open the gallery. If it is not running: ${pc.cyan(`cd ${repoPath} && npm run dev`)}`,
    ` 2. ${pc.cyan(`cd ${runDirectory}`)}`,
    ` 3. ${pc.cyan(runnerCommand)}, then copy this run's prompt from the gallery and paste it into ${runnerLabel}`,
  ];
  for (const line of lines) {
    console.log(line);
  }
}
|
|
31
|
-
|
|
32
|
-
/**
 * Create a run slot for a benchmark: make the run directory, write
 * `metadata.json` and `prompt.md` into it, and print a summary.
 *
 * The directory is `<repoPath>/runs/<benchmark.id>/<model slug>/<run id>`.
 *
 * @param {object} options
 * @param {string} options.repoPath - Benchmark gallery checkout.
 * @param {{id: string, title: string, description: string, prompt: string}} options.benchmark - Benchmark definition.
 * @param {string} options.kind - Benchmark kind; "data-science" selects the notebook/chart asset set.
 * @param {string} options.modelId - Model identifier recorded in the metadata.
 * @param {string} [options.modelSource] - Model source; "cloud" marks the runner mode as "manual".
 * @param {string} [options.backendLabel] - Backend label recorded in the metadata and shown as "Source".
 * @param {object} [options.profile] - Profile used for runner/base-URL metadata.
 * @param {boolean} [options.showNextSteps=true] - Whether to print the follow-up steps.
 * @returns {Promise<string>} The created run directory.
 */
export async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps = true }) {
  const now = new Date();
  const timestamp = now.toISOString();
  const runId = createRunId(now);
  const modelSlug = slugModelId(modelId);
  const runnerLabel = intendedRunnerForProfile(profile);
  const runDirectory = join(repoPath, "runs", benchmark.id, modelSlug, runId);

  await mkdir(runDirectory, { recursive: true });

  // Asset file names relative to the run directory; the set depends on the benchmark kind.
  const assets = { metadata: "metadata.json", prompt: "prompt.md" };
  if (kind === "data-science") {
    assets.ds = {
      notebook: "analysis.ipynb",
      summary: "summary.json",
      chartDistribution: "chart-distribution.png",
      chartTreatmentEffect: "chart-treatment-effect.png",
      chartCompletionRates: "chart-completion-rates.png",
    };
  } else {
    assets.html = "index.html";
    assets.preview = "preview.png";
    assets.video = "preview.webm";
  }

  // Optional runner fields are added only when they carry a value, in a fixed key order.
  const runner = {
    mode: modelSource === "cloud" ? "manual" : "external",
    intendedRunner: profile ? runnerLabel : undefined,
  };
  if (profile?.harnesses?.pi || runnerLabel === "Pi") {
    runner.tool = "pi";
  }
  if (modelSource) {
    runner.modelSource = modelSource;
  }
  if (backendLabel) {
    runner.backendLabel = backendLabel;
  }
  if (profile?.baseUrl) {
    runner.baseUrl = profile.baseUrl;
  }
  Object.assign(runner, {
    model: modelId,
    retries: 0,
    tokenMetrics: {
      reported: false,
      promptTokens: 0,
      completionTokens: 0,
      totalTokens: 0,
    },
    speedMetrics: {
      prefillTokensPerSecond: null,
      generationTokensPerSecond: null,
      ttftMs: null,
      modelLoadMs: null,
      speculativeDecodeAcceptance: null,
      kvCacheTokens: null,
    },
    metricSource: null,
  });

  const parserSource = modelSource === "omlx" ? "omlx" : "local-gguf";
  const metadata = {
    schemaVersion: 1,
    kind,
    runId,
    benchmark: { id: benchmark.id, title: benchmark.title, description: benchmark.description, prompt: benchmark.prompt },
    model: { id: modelId, slug: modelSlug, displayName: parseModelName(modelId, parserSource).display },
    status: "prepared",
    createdAt: timestamp,
    updatedAt: timestamp,
    preparedAt: timestamp,
    runDirectory,
    assets,
    runner,
    results: {
      wallClockMs: null,
      agentTurns: 0,
      toolCalls: 0,
      toolResults: 0,
      success: false,
      outputFiles: [],
      perTurn: [],
    },
  };

  await writeFile(join(runDirectory, "metadata.json"), JSON.stringify(metadata, null, 2) + "\n", "utf8");
  await writeFile(join(runDirectory, "prompt.md"), benchmark.prompt + "\n", "utf8");

  const summaryRows = [
    ["Directory", pc.cyan(runDirectory)],
    ["Benchmark", benchmark.title],
    ["Kind", kind],
    ["Model", pc.bold(modelId)],
    ["Source", backendLabel || modelSource],
  ];
  console.log("");
  console.log(pc.green("✓ Run slot prepared"));
  console.log(renderSection("Run", renderRows(summaryRows)));

  if (showNextSteps) {
    printBenchmarkNextSteps({ repoPath, runDirectory, profile, modelId, runnerLabel });
  }

  return runDirectory;
}
|
package/src/benchmark/repo.mjs
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
// ── Benchmark repo linking ────────────────────────────────────────────────────
|
|
2
|
-
|
|
3
|
-
import { existsSync } from "node:fs";
|
|
4
|
-
import { join, resolve } from "node:path";
|
|
5
|
-
import { homedir } from "node:os";
|
|
6
|
-
import { execFile } from "node:child_process";
|
|
7
|
-
import { promisify } from "node:util";
|
|
8
|
-
import { loadConfig, saveConfig } from "../config.mjs";
|
|
9
|
-
import { pc } from "../ui.mjs";
|
|
10
|
-
|
|
11
|
-
// Promise-returning wrapper around child_process.execFile, used for `git clone`.
const execFileAsync = promisify(execFile);

// Git URL of the benchmark gallery repository that gets cloned on request.
const BENCHMARK_REPO = "https://github.com/eeshansrivastava89/local-llm-visual-benchmark.git";
|
|
14
|
-
|
|
15
|
-
/**
 * Return the benchmark repo path stored in the config, provided it still
 * contains a `benchmarks` directory.
 *
 * @returns {Promise<string|null>} The configured path, or null when it is unset or no longer valid.
 */
export async function findBenchmarkRepo() {
  const { benchmarkRepoPath } = await loadConfig();
  const isLinked = Boolean(benchmarkRepoPath) && existsSync(join(benchmarkRepoPath, "benchmarks"));
  return isLinked ? benchmarkRepoPath : null;
}
|
|
22
|
-
|
|
23
|
-
/**
 * Resolve the local-llm-visual-benchmark checkout and remember it in the config.
 *
 * Resolution order:
 *   1. the path already stored in the config (via findBenchmarkRepo);
 *   2. well-known locations under the home directory (`~/dev`, `~/projects`, `~`);
 *   3. an interactive choice between cloning from GitHub into `~/dev` and
 *      entering a path manually.
 *
 * A directory only counts as the repo when it contains a `benchmarks` folder.
 *
 * @param {{choice: Function, text: Function}} prompt - Interactive prompt helper;
 *   `choice(label, options, defaultValue)` and `text(label, defaultValue)` are awaited.
 * @returns {Promise<string|null>} Absolute repo path, or null when cloning failed,
 *   nothing was entered, or the entered path has no `benchmarks` directory.
 */
export async function linkBenchmarkRepo(prompt) {
  const existing = await findBenchmarkRepo();
  if (existing) return existing;

  // Persist the chosen checkout so later calls are answered by findBenchmarkRepo().
  const rememberRepoPath = async (repoPath) => {
    const config = await loadConfig();
    config.benchmarkRepoPath = repoPath;
    await saveConfig(config);
    return repoPath;
  };

  const home = homedir();
  const repoDirName = "local-llm-visual-benchmark";
  const candidates = [
    join(home, "dev", repoDirName),
    join(home, "projects", repoDirName),
    join(home, repoDirName),
  ];
  const discovered = candidates.find((candidate) => existsSync(join(candidate, "benchmarks")));
  if (discovered) return rememberRepoPath(discovered);

  console.log(pc.dim("\nThe benchmark gallery needs to be linked to offgrid-ai."));
  console.log(pc.dim("This is the local-llm-visual-benchmark repo that stores prompts and run results.\n"));

  const choice = await prompt.choice("Link benchmark gallery", [
    { value: "clone", label: "Clone from GitHub", hint: "git clone into ~/dev" },
    { value: "manual", label: "Enter path manually", hint: "If you already have it cloned" },
  ], "clone");

  if (choice === "clone") {
    const targetDir = join(home, "dev", repoDirName);
    console.log(pc.dim(`\nCloning ${BENCHMARK_REPO}...`));
    try {
      // execFile buffers the child's output itself and has no `stdio` option, so none is passed.
      await execFileAsync("git", ["clone", BENCHMARK_REPO, targetDir]);
      await rememberRepoPath(targetDir);
      console.log(pc.green(`✓ Cloned to ${targetDir}`));
      return targetDir;
    } catch (err) {
      console.log(pc.red(`Clone failed: ${err.message}`));
      return null;
    }
  }

  const answer = await prompt.text("Path to local-llm-visual-benchmark", "");
  // Trim so stray whitespace from typing or pasting does not become part of the path.
  const path = String(answer ?? "").trim();
  if (!path) return null;

  // Expand only a bare leading "~" ("~" or "~/..."); "~user/..." is left untouched.
  // The function form keeps "$" sequences in the home directory from being
  // interpreted as replacement patterns.
  const expanded = path.replace(/^~(?=$|[\\/])/u, () => home);
  const resolved = resolve(expanded);
  if (!existsSync(join(resolved, "benchmarks"))) {
    console.log(pc.red(`No benchmarks/ directory found at ${resolved}`));
    return null;
  }

  await rememberRepoPath(resolved);
  console.log(pc.green(`✓ Linked to ${resolved}`));
  return resolved;
}
|
|
@@ -1,363 +0,0 @@
|
|
|
1
|
-
// ── Run benchmark via Pi SDK (no subprocess, no NDJSON parsing) ────────────────
|
|
2
|
-
|
|
3
|
-
import { readFile } from "node:fs/promises";
|
|
4
|
-
import { basename, join, relative, resolve } from "node:path";
|
|
5
|
-
import { Agent } from "@earendil-works/pi-agent-core";
|
|
6
|
-
import { streamSimple } from "@earendil-works/pi-ai/compat";
|
|
7
|
-
import { createCodingTools } from "@earendil-works/pi-coding-agent";
|
|
8
|
-
import { pc, formatBytes } from "../ui.mjs";
|
|
9
|
-
import { piApiModelId, modelReasoning, modelCompat } from "../harness-pi.mjs";
|
|
10
|
-
|
|
11
|
-
// Color roles used by the benchmark renderer, each mapped to a `pc` styler from ui.mjs.
const C = {
  // Streamed model output
  thinking: pc.magenta,
  text: pc.green,
  tool: pc.yellow,

  // Outcome markers
  success: pc.green,
  warning: pc.yellow,
  error: pc.red,

  // Informational and de-emphasized text
  info: pc.cyan,
  dim: pc.dim,
};
|
|
21
|
-
|
|
22
|
-
/**
 * Run a prepared benchmark through the Pi agent SDK in-process, rendering
 * progress to the terminal and collecting run metrics.
 *
 * The prompt is read from `<runDirectory>/prompt.md`. Agent events are turned
 * into console output (turn headers, streamed text, tool start/end lines, a
 * TTY-only status line) and aggregated into the returned result object.
 *
 * Failures do not reject: an exception from `agent.prompt`, a user
 * cancellation, or an agent-reported error message is recorded in
 * `result.error`. Only reading `prompt.md` and the setup helpers can throw.
 *
 * @param {object} profile - Model profile; `providerId` and `reasoning` are read here,
 *   the rest is consumed by buildModel / piApiModelId.
 * @param {string} runDirectory - Prepared run directory; also the working directory handed to the coding tools.
 * @param {object} [options]
 * @param {AbortSignal} [options.signal] - Aborting it cancels the agent and marks the run "Cancelled by user".
 * @returns {Promise<object>} Run result with token totals, per-turn metrics, tool counts,
 *   `rawResponseLines`, `wallClockMs`, and `error` (null on success).
 */
export async function runBenchmarkInPi(profile, runDirectory, { signal } = {}) {
  const model = buildModel(profile);
  const tools = createCodingTools(runDirectory);
  const systemPrompt = buildSystemPrompt(runDirectory);
  const promptText = await readFile(join(runDirectory, "prompt.md"), "utf8");

  const runResult = {
    model: `${profile.providerId}/${piApiModelId(profile)}`,
    exitCode: 0,
    wallClockMs: null,
    agentTurns: 0,
    promptTokens: 0,
    completionTokens: 0,
    totalTokens: 0,
    cacheRead: 0,
    cacheWrite: 0,
    toolCalls: 0,
    toolResults: 0,
    perTurn: [],
    rawResponseLines: [],
    error: null,
  };

  const runStartMs = Date.now();
  let currentTurnStartMs = null;
  let lastTurnEndMs = null;
  let turnToolCalls = 0;
  let responseBuffer = "";
  const verbose = Boolean(process.env.OFFGRID_BENCHMARK_VERBOSE);
  // Tool arguments arrive on the start event only; keep them until the matching end event.
  const toolArgsByCallId = new Map();

  // ── Status line state ────────────────────────────────────────────────────
  let statusBytes = 0;
  let streamedText = false;
  let execTimer = null;
  let execStartedAt = null;

  // Erase the transient status line (carriage return + clear-to-end-of-line); no-op when not a TTY.
  function clearStatusLine() {
    if (process.stdout.isTTY) process.stdout.write("\r\x1b[K");
  }

  // Overwrite the transient status line in place; no-op when not a TTY.
  function printStatusLine(text) {
    if (process.stdout.isTTY) process.stdout.write(`\r\x1b[K${text}`);
  }

  // Stop the "running <tool>… Ns" ticker, if any, and erase the status line.
  function stopExecTimer() {
    if (execTimer) { clearInterval(execTimer); execTimer = null; }
    clearStatusLine();
  }

  // Start a once-per-second elapsed-time ticker for a running tool (TTY only).
  function startExecTimer(toolName) {
    stopExecTimer();
    execStartedAt = Date.now();
    if (!process.stdout.isTTY) return;
    const update = () => {
      const elapsed = Math.floor((Date.now() - execStartedAt) / 1000);
      printStatusLine(C.dim(`running ${toolName}… ${elapsed}s`));
    };
    update();
    execTimer = setInterval(update, 1000);
  }

  const agent = new Agent({
    initialState: {
      systemPrompt,
      model,
      thinkingLevel: profile.reasoning ? "low" : "off",
      tools,
    },
    // NOTE(review): apiKey "none" and the maximum 32-bit timeout presumably target a
    // local server with slow generation — confirm against the streamSimple contract.
    streamFn: async (mdl, ctx, opts) =>
      streamSimple(mdl, ctx, { ...opts, apiKey: "none", timeoutMs: 2147483647 }),
  });

  // ── Event handler: render + collect metrics ──────────────────────────────

  agent.subscribe((event) => {
    try {
      handleEvent(event);
    } catch (err) {
      // A rendering problem must not abort the benchmark itself.
      console.error(C.error(`[renderer error] ${err.message}`));
    }
  });

  function handleEvent(event) {
    switch (event.type) {
      case "turn_start": {
        stopExecTimer();
        runResult.agentTurns += 1;
        // A turn's clock starts where the previous one ended, so gaps between turns are attributed.
        currentTurnStartMs = lastTurnEndMs ?? runStartMs;
        turnToolCalls = 0;
        console.log("");
        console.log(C.info(`Turn ${runResult.agentTurns}`));
        break;
      }

      case "message_update": {
        const evt = event.assistantMessageEvent;
        if (!evt) break;
        // Normalize sub-event names so "thinking_delta" and "thinkingdelta" compare equal.
        const sub = String(evt.type ?? "").replace(/_/gu, "");
        if (sub === "thinkingstart") {
          statusBytes = 0;
        } else if (sub === "thinkingdelta") {
          statusBytes += Buffer.byteLength(evt.delta || "", "utf8");
          // Rough token estimate: four bytes per token.
          const tokens = Math.max(1, Math.ceil(statusBytes / 4));
          printStatusLine(C.dim(`thinking… ${formatBytes(statusBytes)} (~${formatTokens(tokens)} tokens)`));
          if (verbose) process.stdout.write(C.thinking(evt.delta || ""));
        } else if (sub === "textstart") {
          clearStatusLine();
          statusBytes = 0;
        } else if (sub === "textdelta") {
          process.stdout.write(evt.delta || "");
          responseBuffer += evt.delta || "";
          streamedText = true;
        } else if (sub === "toolcallstart") {
          clearStatusLine();
        }
        break;
      }

      case "message_end": {
        if (streamedText) {
          // Terminate the streamed line before any further output.
          console.log("");
          streamedText = false;
        }
        if (event.message?.role === "assistant") {
          for (const item of event.message.content ?? []) {
            if (item.type === "toolCall") {
              runResult.toolCalls += 1;
              turnToolCalls += 1;
              responseBuffer += `\n[toolCall] ${item.name}\n`;
            }
          }
          if (responseBuffer) {
            runResult.rawResponseLines.push(responseBuffer);
            responseBuffer = "";
          }
        }
        break;
      }

      case "tool_execution_start": {
        clearStatusLine();
        toolArgsByCallId.set(event.toolCallId, event.args);
        console.log(C.tool(formatToolStart(event.toolName, event.args, runDirectory)));
        startExecTimer(event.toolName);
        break;
      }

      case "tool_execution_end": {
        stopExecTimer();
        const { toolName, result, isError, toolCallId } = event;
        const args = toolArgsByCallId.get(toolCallId) ?? {};
        // The arguments are only needed for this summary line; release them afterwards.
        toolArgsByCallId.delete(toolCallId);
        const marker = isError ? C.error("✗") : C.success("✓");
        console.log(`${marker} ${toolSummary(toolName, result, isError, args, runDirectory)}`);
        runResult.toolResults += 1;
        break;
      }

      case "turn_end": {
        stopExecTimer();
        clearStatusLine();
        const msg = event.message;
        const isFailure = msg?.role === "assistant" && (msg.stopReason === "error" || msg.stopReason === "aborted");
        // Usage from failed or aborted turns is not counted.
        const usage = !isFailure ? msg?.usage : null;
        if (usage) {
          runResult.promptTokens += usage.input ?? 0;
          runResult.completionTokens += usage.output ?? 0;
          runResult.cacheRead += usage.cacheRead ?? 0;
          runResult.cacheWrite += usage.cacheWrite ?? 0;
        }
        const turnEndMs = Date.now();
        const wallClockMs = currentTurnStartMs ? turnEndMs - currentTurnStartMs : null;
        runResult.perTurn.push({
          turn: runResult.agentTurns,
          inputTokens: usage?.input ?? 0,
          outputTokens: usage?.output ?? 0,
          cacheRead: usage?.cacheRead ?? 0,
          cacheWrite: usage?.cacheWrite ?? 0,
          wallClockMs,
          toolCalls: turnToolCalls,
        });
        lastTurnEndMs = turnEndMs;
        const tokStr = usage ? ` · ${formatTokens(usage.output ?? 0)} tokens` : "";
        console.log(C.success(`✓ turn ${runResult.agentTurns}${tokStr}`));
        break;
      }

      case "agent_end": {
        // Flush any text that never reached a message_end.
        if (responseBuffer) {
          runResult.rawResponseLines.push(responseBuffer);
          responseBuffer = "";
        }
        break;
      }
    }
  }

  // ── Wire abort signal ────────────────────────────────────────────────────

  let cancelled = false;
  const abortListener = () => {
    cancelled = true;
    agent.abort();
  };
  if (signal) signal.addEventListener("abort", abortListener, { once: true });

  // ── Run ───────────────────────────────────────────────────────────────────

  try {
    console.log(C.info("Pi benchmark started"));
    console.log(C.dim(`  Model ${model.provider}/${model.id}`));
    await agent.prompt(promptText);
  } catch (err) {
    if (!cancelled) {
      runResult.error = { message: err.message };
    }
  } finally {
    // If the run ended while a tool was executing (exception or abort), no end event
    // stops the ticker; clear it here so the interval does not outlive the run.
    stopExecTimer();
    if (signal) signal.removeEventListener("abort", abortListener);
  }

  if (cancelled) {
    runResult.error = { message: "Cancelled by user" };
  }

  if (!runResult.error && agent.state.errorMessage) {
    runResult.error = { message: agent.state.errorMessage };
  }

  runResult.wallClockMs = Date.now() - runStartMs;
  runResult.totalTokens = runResult.promptTokens + runResult.completionTokens;

  console.log(C.info("Pi benchmark finished"));
  return runResult;
}
|
|
256
|
-
|
|
257
|
-
// ── Model construction ──────────────────────────────────────────────────────
|
|
258
|
-
|
|
259
|
-
/**
 * Build the model descriptor handed to the Pi agent for a profile.
 *
 * The model is declared as an "openai-completions" API at `profile.baseUrl`
 * with zero cost. Image input is advertised when the profile has an
 * `mmprojPath` or `capabilities.vision`. Compat defaults can be overridden by
 * whatever modelCompat(profile) returns.
 *
 * @param {object} profile - Model profile.
 * @returns {object} Model descriptor (id, name, api, provider, baseUrl, reasoning,
 *   input, cost, contextWindow, maxTokens, compat).
 */
function buildModel(profile) {
  const reasoning = modelReasoning(profile) ?? false;
  const compatOverrides = modelCompat(profile) ?? {};

  const input = ["text"];
  if (profile.mmprojPath || profile.capabilities?.vision) {
    input.push("image");
  }

  const compat = {
    supportsDeveloperRole: false,
    supportsReasoningEffort: false,
    maxTokensField: "max_tokens",
    ...compatOverrides,
  };

  return {
    id: piApiModelId(profile),
    name: profile.label,
    api: "openai-completions",
    provider: profile.providerId,
    baseUrl: profile.baseUrl,
    reasoning,
    input,
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    // Falls back to a 32768-token context when the profile sets no ctxSize flag.
    contextWindow: profile.flags?.ctxSize ?? 32768,
    maxTokens: 16384,
    compat,
  };
}
|
|
282
|
-
|
|
283
|
-
// ── System prompt ───────────────────────────────────────────────────────────
|
|
284
|
-
|
|
285
|
-
/**
 * Build the system prompt for the benchmark agent.
 *
 * The prompt lists the four coding tools and a few guidelines, then ends
 * with today's local date (YYYY-MM-DD) and the working directory with
 * backslashes converted to forward slashes.
 *
 * @param {string} cwd - Working directory shown to the model.
 * @returns {string} The complete system prompt.
 */
function buildSystemPrompt(cwd) {
  const today = new Date();
  const twoDigits = (value) => String(value).padStart(2, "0");
  const date = [today.getFullYear(), twoDigits(today.getMonth() + 1), twoDigits(today.getDate())].join("-");
  const workingDirectory = cwd.replace(/\\/gu, "/");

  const lines = [
    "You are an expert coding assistant. You help users by reading files, executing commands, editing code, and writing new files.",
    "",
    "Available tools:",
    "- read: Read file contents (supports text and images)",
    "- bash: Execute shell commands",
    "- edit: Apply targeted text replacements to files",
    "- write: Write content to files (creates or overwrites)",
    "",
    "Guidelines:",
    "- Be concise in your responses",
    "- Show file paths clearly when working with files",
    "- Use the write tool to create files — do not return file contents as chat text",
    "- Use bash to run commands and verify your work",
    "",
    `Current date: ${date}`,
    `Current working directory: ${workingDirectory}`,
  ];
  return lines.join("\n");
}
|
|
305
|
-
|
|
306
|
-
// ── Rendering helpers ───────────────────────────────────────────────────────
|
|
307
|
-
|
|
308
|
-
/**
 * Format the "→ ..." line printed when a tool call starts.
 *
 * @param {string} toolName - Tool being invoked ("read", "write", "edit", "bash", or other).
 * @param {object} args - Tool arguments; `path`, `content`, `edits` and `command` are read depending on the tool.
 * @param {string} cwd - Run directory used to shorten file paths.
 * @returns {string} One-line description of the call.
 */
function formatToolStart(toolName, args, cwd) {
  switch (toolName) {
    case "read":
      return `→ read ${relPath(args.path, cwd)}`;
    case "write": {
      // Show the payload size only when there is content to write.
      const size = args.content ? ` · ${formatBytes(Buffer.byteLength(String(args.content), "utf8"))}` : "";
      return `→ write ${relPath(args.path, cwd)}${size}`;
    }
    case "edit": {
      const count = Array.isArray(args.edits) ? args.edits.length : 0;
      const detail = count > 0 ? ` · ${count} replacement${count === 1 ? "" : "s"}` : "";
      return `→ edit ${relPath(args.path, cwd)}${detail}`;
    }
    case "bash":
      return `→ run ${truncateOneLine(args.command ?? "")}`;
    default:
      return `→ ${toolName}`;
  }
}
|
|
321
|
-
|
|
322
|
-
/**
 * Format the one-line summary printed after a tool call finishes.
 *
 * @param {string} toolName - Tool that ran.
 * @param {object} result - Tool result; its text is extracted with toolResultText.
 * @param {boolean} isError - Whether the tool reported a failure.
 * @param {object} args - Arguments the tool was started with (`path` is read for file tools).
 * @param {string} cwd - Run directory used to shorten file paths.
 * @returns {string} Summary text (without the ✓/✗ marker).
 */
function toolSummary(toolName, result, isError, args, cwd) {
  const text = toolResultText(result);
  if (isError) return `${toolName} failed · ${firstLine(text)}`;
  if (toolName === "write") {
    // Reuse the byte count from the tool's own "Successfully wrote N bytes" message when present.
    const m = String(text).match(/Successfully wrote\s+([0-9,]+)\s+bytes/iu);
    const size = m ? ` · ${formatBytes(Number(m[1].replace(/,/gu, "")))}` : "";
    return `wrote ${relPath(args.path, cwd)}${size}`;
  }
  if (toolName === "read") return `read ${relPath(args.path, cwd)}${text ? ` · ${formatBytes(Buffer.byteLength(text, "utf8"))}` : ""}`;
  if (toolName === "edit") return `edited ${relPath(args.path, cwd)}`;
  if (toolName === "bash") {
    // firstLine() substitutes "no details" for blank output, so it can never be
    // falsy; check for real output first so silent commands read "command completed".
    return text.trim() ? firstLine(text) : "command completed";
  }
  return `${toolName}${text ? ` · ${firstLine(text)}` : ""}`;
}
|
|
335
|
-
|
|
336
|
-
/**
 * Extract plain text from a tool result.
 *
 * @param {{content?: string|Array<{text?: string}>}|null|undefined} result - Tool result.
 * @returns {string} `content` itself when it is a string; the non-empty `text`
 *   fields joined with newlines when it is an array; otherwise "".
 */
function toolResultText(result) {
  const content = result?.content;
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }
  const pieces = [];
  for (const part of content) {
    const piece = part?.text ?? "";
    if (piece) {
      pieces.push(piece);
    }
  }
  return pieces.join("\n");
}
|
|
342
|
-
|
|
343
|
-
/**
 * Return the first non-blank line of a text, trimmed.
 *
 * @param {*} text - Value to inspect; null/undefined count as empty, anything else is stringified.
 * @returns {string} The first line with visible content, or "no details" when there is none.
 */
function firstLine(text) {
  for (const rawLine of String(text ?? "").split(/\r?\n/u)) {
    const trimmed = rawLine.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return "no details";
}
|
|
346
|
-
|
|
347
|
-
/**
 * Shorten a tool path for display.
 *
 * Paths inside `cwd` are shown relative to it; anything else is reduced to
 * its base name. Relative inputs are interpreted against `cwd` rather than
 * the process working directory, so "sub/a.txt" keeps its sub-directory.
 *
 * @param {string|null|undefined} path - Path reported by the tool.
 * @param {string} cwd - Directory that paths are displayed relative to.
 * @returns {string} Display path, or "unknown" when no path was given.
 */
function relPath(path, cwd) {
  if (!path) return "unknown";
  const target = String(path);
  // resolve() leaves absolute paths untouched and anchors relative ones at cwd.
  const r = relative(cwd, resolve(cwd, target));
  // Only a real ".." segment leaves cwd; a name such as "..drafts" does not.
  const escapesCwd = /^\.\.(?:[\\/]|$)/u.test(r);
  if (r && r !== "." && !escapesCwd) return r;
  return basename(target) || target;
}
|
|
353
|
-
|
|
354
|
-
/**
 * Collapse a value to a single line and cap its length.
 *
 * Whitespace runs (including newlines) become one space and the ends are
 * trimmed. Text longer than `max` is cut to `max - 1` characters (at least
 * one) and suffixed with "…".
 *
 * @param {*} value - Value to render; null/undefined count as empty.
 * @param {number} [max=80] - Maximum length before truncation.
 * @returns {string} The single-line, possibly truncated text.
 */
function truncateOneLine(value, max = 80) {
  const flattened = String(value ?? "").replace(/\s+/gu, " ").trim();
  if (flattened.length <= max) {
    return flattened;
  }
  const keep = Math.max(1, max - 1);
  return `${flattened.slice(0, keep)}…`;
}
|
|
358
|
-
|
|
359
|
-
/**
 * Format a token count compactly: plain integer below 1,000, whole
 * thousands with a "k" suffix below one million, and millions with one
 * decimal and an "M" suffix from there on.
 *
 * @param {number} n - Token count.
 * @returns {string} Compact label such as "512", "12k" or "1.5M".
 */
function formatTokens(n) {
  // Round first so the unit is chosen from the value that is actually shown.
  const rounded = Math.round(n);
  const millions = () => `${(rounded / 1_000_000).toFixed(1)}M`;
  if (rounded >= 1_000_000) return millions();
  if (rounded >= 1_000) {
    const thousands = Math.round(rounded / 1_000);
    // 999,500–999,999 would round to "1000k"; promote those to "1.0M".
    return thousands >= 1_000 ? millions() : `${thousands}k`;
  }
  return String(rounded);
}
|
package/src/benchmark/shared.mjs
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
// ── Shared utilities (matches local-llm-visual-benchmark) ──────────────────
|
|
2
|
-
|
|
3
|
-
import { createHash } from "node:crypto";
|
|
4
|
-
import { readdir, readFile } from "node:fs/promises";
|
|
5
|
-
import { join } from "node:path";
|
|
6
|
-
|
|
7
|
-
export function slugModelId(modelId, maxLength = 80) {
|
|
8
|
-
const hash = createHash("sha256").update(modelId).digest("hex").slice(0, 10);
|
|
9
|
-
const normalized = modelId.normalize("NFKD").replace(/[\u0300-\u036f]/gu, "").toLowerCase();
|
|
10
|
-
const slug = normalized.replace(/[^a-z0-9]+/gu, "-").replace(/^-+|-+$/gu, "").replace(/-{2,}/gu, "-");
|
|
11
|
-
if (slug.length > 0 && slug.length <= maxLength && slug === normalized) return slug;
|
|
12
|
-
const baseMaxLength = Math.max(1, maxLength - 11);
|
|
13
|
-
const base = slug.slice(0, baseMaxLength).replace(/^-+|-+$/gu, "") || "model";
|
|
14
|
-
return `${base}-${hash}`;
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
/**
 * Build a run id from a timestamp: the ISO-8601 string with every ":" and
 * "." replaced by "-".
 *
 * @param {Date} [date=new Date()] - Timestamp to encode.
 * @returns {string} Run id such as "2024-01-02T03-04-05-678Z".
 */
export function createRunId(date = new Date()) {
  const iso = date.toISOString();
  const withoutColons = iso.replace(/:/gu, "-");
  return withoutColons.replace(/\./gu, "-");
}
|
|
20
|
-
|
|
21
|
-
/**
 * Parse one benchmark markdown file into a benchmark record.
 *
 * @param {string} filename - File name; its stem is the default id and title.
 * @param {string} raw - File contents.
 * @returns {{id: string, title: string, description: string, prompt: string, kind: string}}
 */
function parseBenchmarkMarkdown(filename, raw) {
  // Frontmatter is a leading block fenced by "---" lines. LF and CRLF endings,
  // an optional BOM, and a closing fence at end of file are all accepted.
  const match = raw.match(/^\uFEFF?---\r?\n([\s\S]*?)\r?\n---(?:\r?\n([\s\S]*))?$/u);
  const frontmatter = match ? match[1] : "";
  const content = (match ? match[2] ?? "" : raw).trim();

  let id = filename.replace(/\.md$/u, "");
  let title = id;
  let description = "";
  for (const line of frontmatter.split(/\r?\n/u)) {
    const kv = line.match(/^(\w+):\s*(.+)$/);
    if (!kv) continue;
    const [, key, val] = kv;
    if (key === "id") id = val.trim();
    if (key === "title") title = val.trim();
    if (key === "description") description = val.trim();
  }

  // Only the "ab-test-analysis" benchmark is a data-science benchmark; all others are visual.
  const kind = id === "ab-test-analysis" ? "data-science" : "visual";
  return { id, title, description, prompt: content, kind };
}

/**
 * Load every `*.md` benchmark definition from a directory.
 *
 * Each file may start with a `---` fenced frontmatter block providing `id`,
 * `title` and `description`; the remainder is the prompt. Without
 * frontmatter the whole file is the prompt and the file stem is the id/title.
 *
 * @param {string} benchDir - Directory containing the benchmark markdown files.
 * @returns {Promise<Array<{id: string, title: string, description: string, prompt: string, kind: string}>>}
 *   Benchmarks ordered by file name.
 * @throws When the directory or one of its markdown files cannot be read.
 */
export async function loadBenchmarks(benchDir) {
  const entries = await readdir(benchDir);
  const markdownFiles = entries.filter((f) => f.endsWith(".md")).sort();
  // The reads are independent; Promise.all keeps the sorted file order in its result.
  return Promise.all(
    markdownFiles.map(async (filename) => {
      const raw = await readFile(join(benchDir, filename), "utf8");
      return parseBenchmarkMarkdown(filename, raw);
    }),
  );
}
|
package/src/benchmark.mjs
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
// ── Benchmark module (thin facade) ──────────────────────────────────────────
|
|
2
|
-
// Submodules handle the actual logic. This file re-exports for backward compatibility.
|
|
3
|
-
|
|
4
|
-
export { slugModelId, createRunId, loadBenchmarks } from "./benchmark/shared.mjs";
|
|
5
|
-
export { findBenchmarkRepo, linkBenchmarkRepo } from "./benchmark/repo.mjs";
|
|
6
|
-
export { prepareBenchmarkRun } from "./benchmark/prepare.mjs";
|
|
7
|
-
export { runBenchmarkInPi } from "./benchmark/sdk-runner.mjs";
|
|
8
|
-
export { queryServerMetrics } from "./benchmark/metrics.mjs";
|
|
9
|
-
// unloadModelFromServer now lives in src/process.mjs (managed-server counterpart to stopProfile).
|
|
10
|
-
export { unloadModelFromServer } from "./process.mjs";
|
|
11
|
-
export { finalizeBenchmarkRun, renderBenchmarkSummary } from "./benchmark/finalize.mjs";
|
|
12
|
-
export { benchmarkForProfile, benchmarkFlow } from "./benchmark/flow.mjs";
|