offgrid-ai 0.8.15 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
1
+ // ── Backend-aware server speed metrics ───────────────────────────────────────
2
+
3
+ import { backendFor } from "../backends.mjs";
4
+ import { apiRootUrl } from "../process.mjs";
5
+
6
// Prompt text sent by the backend-specific speed queries in this module.
const BENCH_SPEED_PROMPT = "Write a one-sentence summary of machine learning.";

/**
 * Collects speed metrics from the server behind `profile`, picking the query
 * that matches the backend resolved by `backendFor(profile.backend)`.
 *
 * @param {object} profile - Profile whose `backend` selects the query to run.
 * @returns {Promise<object>} The metrics object produced by the selected query.
 * @throws {Error} When the resolved backend id has no speed query.
 */
export async function queryServerMetrics(profile) {
  const { id: backendId } = backendFor(profile.backend);

  switch (backendId) {
    case "llama-cpp":
    case "llama-cpp-mtp":
      return await queryLlamaCppMetrics(profile);
    case "omlx":
      return await queryOmlxMetrics(profile);
    case "ollama":
      return await queryOllamaMetrics(profile);
    default:
      throw new Error(`Unsupported backend for benchmark speed metrics: ${backendId}`);
  }
}
23
+
24
/**
 * Sends one non-streaming chat completion to `<baseUrl>/chat/completions` and
 * maps the `timings` object of the response onto the shared metrics shape.
 *
 * @param {object} profile - Provides `baseUrl` and `modelAlias`.
 * @returns {Promise<object>} Speed metrics; `modelLoadMs` is always null here.
 * @throws {Error} On a non-2xx response, or when `timings` lacks numeric
 *   `prompt_per_second` / `predicted_per_second` values.
 */
async function queryLlamaCppMetrics(profile) {
  // A single trailing slash on the base URL is dropped before appending the path.
  const endpoint = `${profile.baseUrl.replace(/\/$/u, "")}/chat/completions`;
  const requestBody = JSON.stringify({
    model: profile.modelAlias,
    messages: [{ role: "user", content: BENCH_SPEED_PROMPT }],
    stream: false,
  });

  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: requestBody,
    signal: AbortSignal.timeout(60000),
  });
  if (!response.ok) {
    throw new Error(`llama.cpp speed query failed: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();
  const timings = data.timings;
  const hasRates = Boolean(timings)
    && typeof timings.prompt_per_second === "number"
    && typeof timings.predicted_per_second === "number";
  if (!hasRates) {
    throw new Error("llama.cpp response did not include usable timings object");
  }

  // Acceptance ratio is only reported when both draft counters are finite and drafts were made.
  const drafted = timings.draft_n;
  const accepted = timings.draft_n_accepted;
  let acceptance = null;
  if (Number.isFinite(drafted) && drafted > 0 && Number.isFinite(accepted)) {
    acceptance = accepted / drafted;
  }

  return {
    prefillTokensPerSecond: timings.prompt_per_second,
    generationTokensPerSecond: timings.predicted_per_second,
    ttftMs: timings.prompt_ms ?? null,
    modelLoadMs: null,
    speculativeDecodeAcceptance: acceptance,
    kvCacheTokens: timings.cache_n ?? null,
    metricSource: "llama.cpp /v1/chat/completions timings",
  };
}
62
+
63
/**
 * Sends one streaming chat completion (with `include_usage`) to
 * `<baseUrl>/chat/completions`, reads the whole SSE body, and maps the last
 * chunk that carries a `usage` object onto the shared metrics shape.
 *
 * @param {object} profile - Provides `baseUrl` and `modelAlias`.
 * @returns {Promise<object>} Speed metrics; `modelLoadMs` and
 *   `speculativeDecodeAcceptance` are always null here.
 * @throws {Error} On a non-2xx response, or when no chunk contains `usage`.
 */
async function queryOmlxMetrics(profile) {
  const endpoint = `${profile.baseUrl.replace(/\/$/u, "")}/chat/completions`;
  const requestBody = JSON.stringify({
    model: profile.modelAlias,
    messages: [{ role: "user", content: BENCH_SPEED_PROMPT }],
    stream: true,
    stream_options: { include_usage: true },
  });

  const response = await fetch(endpoint, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: requestBody,
    signal: AbortSignal.timeout(60000),
  });
  if (!response.ok) {
    throw new Error(`oMLX speed query failed: ${response.status} ${response.statusText}`);
  }

  // Walk the SSE lines from the end so the last usage-bearing chunk wins.
  const sseLines = (await response.text()).split("\n");
  let usage = null;
  for (let index = sseLines.length - 1; index >= 0 && usage === null; index -= 1) {
    const entry = sseLines[index].trim();
    if (!entry.startsWith("data:")) continue;
    const payload = entry.slice(5).trim();
    if (payload === "[DONE]") continue;
    try {
      const chunk = JSON.parse(payload);
      if (chunk.usage) usage = chunk.usage;
    } catch {
      // Ignore malformed SSE chunks.
    }
  }

  if (!usage) {
    throw new Error("oMLX speed query did not return usage in streaming response");
  }

  // `time_to_first_token` is multiplied by 1000 to produce the millisecond field.
  let ttftMs = null;
  if (usage.time_to_first_token != null) {
    ttftMs = usage.time_to_first_token * 1000;
  }

  return {
    prefillTokensPerSecond: usage.prompt_tokens_per_second ?? null,
    generationTokensPerSecond: usage.generation_tokens_per_second ?? null,
    ttftMs,
    modelLoadMs: null,
    speculativeDecodeAcceptance: null,
    kvCacheTokens: usage.prompt_tokens_details?.cached_tokens ?? null,
    metricSource: "oMLX /v1/chat/completions streaming include_usage",
  };
}
114
+
115
/**
 * Sends one non-streaming request to Ollama's `/api/generate` and derives speed
 * metrics from the duration and count fields of the response.
 *
 * Durations are converted from nanoseconds (divided by 1e9 for rates and 1e6
 * for milliseconds). Any metric whose source field is missing or not a finite
 * number is reported as `null` instead of `0`, so "not reported" cannot be
 * mistaken for a measured zero.
 *
 * @param {object} profile - Provides `modelAlias`, and `baseUrl` or `backend`
 *   (used to look up a fallback `apiBaseUrl`).
 * @returns {Promise<object>} Speed metrics; `speculativeDecodeAcceptance` and
 *   `kvCacheTokens` are always null here.
 * @throws {Error} On a non-2xx response.
 */
async function queryOllamaMetrics(profile) {
  const body = {
    model: profile.modelAlias,
    prompt: BENCH_SPEED_PROMPT,
    stream: false,
  };

  const apiBaseUrl = apiRootUrl(profile.baseUrl || backendFor(profile.backend).apiBaseUrl || "");

  const response = await fetch(`${apiBaseUrl}/api/generate`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(60000),
  });

  if (!response.ok) {
    throw new Error(`Ollama speed query failed: ${response.status} ${response.statusText}`);
  }

  const data = await response.json();

  // Normalise every field to "finite number or null" before doing arithmetic.
  const finiteOrNull = (value) => (Number.isFinite(value) ? value : null);
  const promptEvalNs = finiteOrNull(data.prompt_eval_duration);
  const evalNs = finiteOrNull(data.eval_duration);
  const loadNs = finiteOrNull(data.load_duration);
  const promptEvalCount = finiteOrNull(data.prompt_eval_count);
  const evalCount = finiteOrNull(data.eval_count);

  // A rate needs both a token count and a positive duration.
  const tokensPerSecond = (count, durationNs) => (
    count !== null && durationNs !== null && durationNs > 0 ? count / (durationNs / 1e9) : null
  );
  const toMs = (durationNs) => (durationNs === null ? null : durationNs / 1e6);

  return {
    prefillTokensPerSecond: tokensPerSecond(promptEvalCount, promptEvalNs),
    generationTokensPerSecond: tokensPerSecond(evalCount, evalNs),
    ttftMs: toMs(promptEvalNs),
    modelLoadMs: toMs(loadNs),
    speculativeDecodeAcceptance: null,
    kvCacheTokens: null,
    metricSource: "Ollama /api/generate",
  };
}
@@ -0,0 +1,252 @@
1
+ // ── Run benchmark in Pi (non-interactive JSON mode) ───────────────────────────
2
+
3
+ import { writeFile } from "node:fs/promises";
4
+ import { join } from "node:path";
5
+ import { spawn } from "node:child_process";
6
+ import {
7
+ BENCH_COLORS, renderStreamEvent,
8
+ formatToolCall,
9
+ } from "./stream-renderer.mjs";
10
+ import { piModelString } from "./shared.mjs";
11
+
12
/**
 * Runs the prepared prompt through the `pi` CLI in JSON mode and collects run statistics.
 *
 * Spawns `pi --model <model> --mode json -p @prompt.md` with `runDirectory` as
 * working directory, copies every stdout line to `stream.ndjson` and stderr to
 * `stderr.log`, renders each parsed event, and writes the accumulated assistant
 * output to `response.raw.txt` once the process has finished.
 *
 * @param {object} profile - Model profile; passed to `piModelString` to build the `--model` value.
 * @param {string} runDirectory - Working directory for `pi`; the log files are written here.
 * @param {{ signal?: AbortSignal }} [options] - Aborting the signal sends SIGTERM to `pi`.
 * @returns {Promise<object>} Run summary (token counts, turns, tool calls, timing).
 *   Once `pi` has been spawned the promise always resolves; failures are
 *   reported through `error: { message }`. `exitCode` is `null` when the
 *   process did not exit normally (spawn failure or killed by a signal).
 */
export async function runBenchmarkInPi(profile, runDirectory, { signal } = {}) {
  const model = piModelString(profile);
  const args = ["--model", model, "--mode", "json", "-p", "@prompt.md"];

  const runResult = {
    model,
    exitCode: null,
    wallClockMs: null,
    agentTurns: 0,
    promptTokens: 0,
    completionTokens: 0,
    totalTokens: 0,
    cacheRead: 0,
    cacheWrite: 0,
    toolCalls: 0,
    toolResults: 0,
    perTurn: [],
    rawResponseLines: [],
    error: null,
  };

  let streamBuffer = "";
  let responseBuffer = "";
  let currentTurnStartMs = null;
  let lastTurnEndMs = null;
  let runStartMs = null;
  let firstEventMs = null;
  let lastEventMs = null;
  let cancelled = false;
  // Tool calls seen while a turn is open; copied onto the turn's entry in endTurn().
  let turnOpen = false;
  let pendingTurnToolCalls = 0;

  const streamPath = join(runDirectory, "stream.ndjson");
  const stderrPath = join(runDirectory, "stderr.log");
  const responsePath = join(runDirectory, "response.raw.txt");

  // FileHandle.write() must not be called again before the previous call has
  // settled, so writes are chained and close() waits for the chain to drain.
  function createLogWriter(handle) {
    let tail = Promise.resolve();
    let failure = null;
    return {
      write(text) {
        tail = tail.then(() => handle.write(text)).catch((err) => {
          failure ??= err;
        });
      },
      async close() {
        await tail;
        await handle.close();
        if (failure) throw failure;
      },
    };
  }

  // Open the logs BEFORE spawning: nothing may be awaited between spawn() and
  // attaching the child's listeners, otherwise an early "error" event (for
  // example `pi` not being installed) is emitted with no listener and crashes
  // the process.
  const streamHandle = await openFileHandle(streamPath, "w");
  let stderrHandle;
  try {
    stderrHandle = await openFileHandle(stderrPath, "w");
  } catch (err) {
    await streamHandle.close();
    throw err;
  }
  const streamLog = createLogWriter(streamHandle);
  const stderrLog = createLogWriter(stderrHandle);

  const verbose = Boolean(process.env.OFFGRID_BENCHMARK_VERBOSE);
  const renderState = {
    cwd: runDirectory,
    turn: 0,
    turnHadToolError: false,
    modelPrinted: false,
    activeTool: null,
    status: { mode: "idle", toolName: null, bytes: 0, tokens: 0 },
  };

  function appendResponse(text) {
    responseBuffer += text;
  }

  function flushResponse() {
    if (responseBuffer) {
      runResult.rawResponseLines.push(responseBuffer);
      responseBuffer = "";
    }
  }

  function updateTimeBounds(timestamp) {
    if (!timestamp) return;
    if (firstEventMs === null) firstEventMs = timestamp;
    lastEventMs = timestamp;
  }

  function beginTurn() {
    runResult.agentTurns += 1;
    currentTurnStartMs = lastTurnEndMs ?? runStartMs ?? null;
    turnOpen = true;
    pendingTurnToolCalls = 0;
  }

  function endTurn(usage, timestamp) {
    const turnEndMs = timestamp ?? null;
    const wallClockMs = currentTurnStartMs && turnEndMs ? turnEndMs - currentTurnStartMs : null;
    runResult.perTurn.push({
      turn: runResult.agentTurns,
      inputTokens: usage?.input ?? 0,
      outputTokens: usage?.output ?? 0,
      cacheRead: usage?.cacheRead ?? 0,
      cacheWrite: usage?.cacheWrite ?? 0,
      wallClockMs,
      toolCalls: pendingTurnToolCalls,
    });
    if (turnEndMs) lastTurnEndMs = turnEndMs;
    currentTurnStartMs = null;
    turnOpen = false;
    pendingTurnToolCalls = 0;
  }

  function processLine(line) {
    if (!line.trim()) return;
    streamLog.write(line + "\n");
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (err) {
      console.log(BENCH_COLORS.error(`[parse error] ${err.message}`));
      return;
    }

    const timestamp = extractTimestamp(parsed);
    updateTimeBounds(timestamp);

    renderStreamEvent(parsed, renderState, { verbose });

    if (parsed.type === "session" || parsed.type === "agent_start") {
      if (timestamp && runStartMs === null) runStartMs = timestamp;
    }

    if (parsed.type === "turn_start") {
      beginTurn();
    }

    if (parsed.type === "turn_end" && parsed.message?.usage) {
      const usage = parsed.message.usage;
      runResult.promptTokens += usage.input ?? 0;
      runResult.completionTokens += usage.output ?? 0;
      runResult.cacheRead += usage.cacheRead ?? 0;
      runResult.cacheWrite += usage.cacheWrite ?? 0;
      endTurn(usage, timestamp);
    }

    if (parsed.type === "message_update" && parsed.assistantMessageEvent) {
      const evt = parsed.assistantMessageEvent;
      const subtype = String(evt.type ?? "").replace(/_/gu, "");
      if (subtype === "thinkingdelta" || subtype === "textdelta") {
        appendResponse(evt.delta || "");
      }
    }

    if (parsed.type === "message_end" && parsed.message?.role === "assistant") {
      flushResponse();
      const content = parsed.message.content ?? [];
      for (const item of content) {
        if (item.type === "toolCall") {
          runResult.toolCalls += 1;
          appendResponse(`\n${formatToolCall(item)}\n`);
          if (turnOpen) {
            // The turn's perTurn entry does not exist until turn_end, so count
            // here and let endTurn() record it against the right turn.
            pendingTurnToolCalls += 1;
          } else {
            // No open turn: fall back to the most recently recorded turn.
            const lastTurn = runResult.perTurn[runResult.perTurn.length - 1];
            if (lastTurn) lastTurn.toolCalls += 1;
          }
        }
      }
    }

    if (parsed.type === "toolResult") {
      runResult.toolResults += 1;
      const status = parsed.isError ? "error" : "ok";
      appendResponse(`\n[toolResult] ${parsed.toolName} (${status})\n`);
    }

    if (parsed.type === "agent_end") {
      flushResponse();
    }
  }

  let child;
  try {
    child = spawn("pi", args, {
      cwd: runDirectory,
      stdio: ["ignore", "pipe", "pipe"],
    });
  } catch (err) {
    // Synchronous spawn failure (invalid arguments): release the log files first.
    await Promise.allSettled([streamLog.close(), stderrLog.close()]);
    throw err;
  }

  child.stdout.setEncoding("utf8");
  child.stdout.on("data", (chunk) => {
    streamBuffer += chunk;
    const lines = streamBuffer.split("\n");
    streamBuffer = lines.pop();
    for (const line of lines) {
      processLine(line);
    }
  });

  child.stderr.setEncoding("utf8");
  child.stderr.on("data", (chunk) => {
    stderrLog.write(chunk);
  });

  const abortListener = () => {
    if (cancelled) return;
    cancelled = true;
    console.log(BENCH_COLORS.error("\n\n[Cancelled by user]"));
    child.kill("SIGTERM");
  };

  if (signal) {
    if (signal.aborted) {
      // An already-aborted signal never fires "abort" again.
      abortListener();
    } else {
      signal.addEventListener("abort", abortListener, { once: true });
    }
  }

  return new Promise((resolve) => {
    let settled = false;

    // Shared teardown for both outcomes; runs at most once and always resolves.
    const finish = async (applyOutcome) => {
      if (settled) return;
      settled = true;
      try {
        if (signal) signal.removeEventListener("abort", abortListener);
        if (streamBuffer.trim()) {
          processLine(streamBuffer);
        }
        streamBuffer = "";
        flushResponse();

        const ioResults = await Promise.allSettled([
          streamLog.close(),
          stderrLog.close(),
          writeFile(responsePath, runResult.rawResponseLines.join(""), "utf8"),
        ]);

        runResult.totalTokens = runResult.promptTokens + runResult.completionTokens;
        if (firstEventMs !== null && lastEventMs !== null) {
          runResult.wallClockMs = lastEventMs - firstEventMs;
        }

        applyOutcome();

        const ioFailure = ioResults.find((entry) => entry.status === "rejected");
        if (ioFailure && !runResult.error) {
          const reason = ioFailure.reason?.message ?? String(ioFailure.reason);
          runResult.error = { message: `Failed to write run logs: ${reason}` };
        }
      } catch (err) {
        runResult.error ??= { message: err?.message ?? String(err) };
      } finally {
        resolve(runResult);
      }
    };

    // "close" (unlike "exit") fires only after stdout/stderr have ended, so no
    // trailing output is lost.
    child.once("close", (code, killSignal) => {
      void finish(() => {
        // `code` is null when the process was terminated by a signal.
        runResult.exitCode = code;
        if (cancelled) {
          runResult.error = { message: "Cancelled by user" };
        } else if (code === null) {
          runResult.error = { message: `Pi was terminated by signal ${killSignal}` };
        } else if (code !== 0) {
          runResult.error = { message: `Pi exited with code ${code}` };
        }
      });
    });

    child.once("error", (err) => {
      void finish(() => {
        runResult.error = { message: err.message };
      });
    });
  });
}
233
+
234
/**
 * Pulls an epoch-millisecond timestamp out of a stream event.
 *
 * The first non-nullish of `message.timestamp`, `timestamp` and
 * `assistantMessageEvent.partial.timestamp` is used: numbers are returned
 * as-is, strings are run through `Date.parse`. If that yields nothing, the
 * first non-nullish of `message.createdAt`, `createdAt` and `created_at` is
 * parsed the same way.
 *
 * @param {object|null|undefined} event - Parsed stream event.
 * @returns {number|null} Timestamp in milliseconds, or null when none is usable.
 */
function extractTimestamp(event) {
  const parseDateText = (text) => {
    const ms = Date.parse(text);
    return Number.isFinite(ms) ? ms : null;
  };

  const primary = event?.message?.timestamp
    ?? event?.timestamp
    ?? event?.assistantMessageEvent?.partial?.timestamp;
  if (typeof primary === "number") return primary;
  if (typeof primary === "string") {
    const ms = parseDateText(primary);
    if (ms !== null) return ms;
  }

  const fallback = event?.message?.createdAt ?? event?.createdAt ?? event?.created_at;
  return typeof fallback === "string" ? parseDateText(fallback) : null;
}
248
+
249
/**
 * Opens `path` with the given `flags` through a lazily imported
 * `node:fs/promises` and returns the resulting file handle.
 *
 * @param {string} path - File to open.
 * @param {string} flags - File system flags passed to `open`, e.g. "w".
 * @returns {Promise<import("node:fs/promises").FileHandle>} The opened handle.
 */
async function openFileHandle(path, flags) {
  const fsPromises = await import("node:fs/promises");
  return fsPromises.open(path, flags);
}
@@ -0,0 +1,120 @@
1
+ // ── Create a benchmark run directory ────────────────────────────────────────
2
+
3
+ import { mkdir, writeFile } from "node:fs/promises";
4
+ import { join } from "node:path";
5
+ import { pc, renderRows, renderSection } from "../ui.mjs";
6
+ import { slugModelId, createRunId, buildToolPrompt } from "./shared.mjs";
7
+
8
/**
 * Turns a harness id into a display label: "pi" becomes "Pi"; any other id has
 * runs of `-`/`_` replaced by a space and the first letter of each word upper-cased.
 *
 * @param {string} id - Harness identifier.
 * @returns {string} Human-readable label.
 */
function harnessDisplayName(id) {
  if (id === "pi") {
    return "Pi";
  }
  const spaced = String(id).replace(/[-_]+/gu, " ");
  return spaced.replace(/\b\w/gu, (letter) => letter.toUpperCase());
}
12
+
13
/**
 * Picks the harness label to show for a profile.
 *
 * Harnesses whose config has `enabled === false` are ignored. "pi" wins when it
 * is among the remaining ones, otherwise the first remaining harness is used,
 * and "pi" is the fallback when none remain.
 *
 * @param {object|null|undefined} profile - Profile with an optional `harnesses` map.
 * @returns {string} Display label, or "your tool" when no profile is given.
 */
function intendedRunnerForProfile(profile) {
  if (!profile) return "your tool";

  const enabledIds = Object.entries(profile.harnesses ?? {})
    .filter(([, config]) => config?.enabled !== false)
    .map(([key]) => key);

  const chosenId = enabledIds.includes("pi") ? "pi" : (enabledIds[0] ?? "pi");
  return harnessDisplayName(chosenId);
}
19
+
20
/**
 * Prints the three manual follow-up steps for a prepared run: start the
 * gallery, change into the run directory, and launch the runner.
 *
 * With a profile the third step shows `offgrid-ai run <profile.id>`; without
 * one it shows a generic "Open <runner> for <model>" instruction.
 *
 * @param {object} details - `repoPath`, `runDirectory`, `profile`, `modelId`, `runnerLabel`.
 * @returns {void}
 */
function printBenchmarkNextSteps({ repoPath, runDirectory, profile, modelId, runnerLabel }) {
  const runnerCommand = profile
    ? `offgrid-ai run ${profile.id}`
    : `Open ${runnerLabel} for ${modelId}`;

  const output = [
    "",
    pc.bold("Next steps"),
    ` 1. Open the gallery. If it is not running: ${pc.cyan(`cd ${repoPath} && npm run dev`)}`,
    ` 2. ${pc.cyan(`cd ${runDirectory}`)}`,
    ` 3. ${pc.cyan(runnerCommand)}, then copy this run's prompt from the gallery and paste it into ${runnerLabel}`,
  ];
  for (const text of output) {
    console.log(text);
  }
}
30
+
31
/**
 * Creates the directory for a new benchmark run and seeds it with
 * `metadata.json` (status "prepared", zeroed metrics) and `prompt.md`.
 *
 * The run directory is `<repoPath>/runs/<benchmark.id>/<model slug>/<run id>`.
 * A short summary is printed, followed by the manual next steps unless
 * `showNextSteps` is false.
 *
 * @param {object} options
 * @param {string} options.repoPath - Root of the benchmark repository.
 * @param {object} options.benchmark - Provides `id`, `title`, `description`, `prompt`.
 * @param {string} options.kind - Benchmark kind; "data-science" selects the notebook asset set.
 * @param {string} options.modelId - Model identifier recorded in the metadata.
 * @param {string} [options.modelSource] - "cloud" makes the runner mode "manual".
 * @param {string} [options.backendLabel] - Recorded and shown as the source when set.
 * @param {object} [options.profile] - Profile used for runner details.
 * @param {boolean} [options.showNextSteps=true] - Whether to print the next steps.
 * @returns {Promise<string>} Path of the created run directory.
 */
export async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId, modelSource, backendLabel, profile, showNextSteps = true }) {
  const toolPrompt = buildToolPrompt(benchmark);
  const now = new Date();
  const stamp = now.toISOString();
  const runId = createRunId(now);
  const modelSlug = slugModelId(modelId);
  const runnerLabel = intendedRunnerForProfile(profile);
  const runDirectory = join(repoPath, "runs", benchmark.id, modelSlug, runId);

  await mkdir(runDirectory, { recursive: true });

  // Assets every run has, followed by the kind-specific ones.
  const assets = {
    metadata: "metadata.json",
    prompt: "prompt.md",
    rawResponse: "response.raw.txt",
    stream: "stream.ndjson",
    stderr: "stderr.log",
  };
  if (kind === "data-science") {
    assets.ds = {
      notebook: "analysis.ipynb",
      summary: "summary.json",
      chartDistribution: "chart-distribution.png",
      chartTreatmentEffect: "chart-treatment-effect.png",
      chartCompletionRates: "chart-completion-rates.png",
    };
  } else {
    assets.html = "index.html";
    assets.preview = "preview.png";
    assets.video = "preview.webm";
  }

  // Built step by step so optional keys keep their position in the serialized JSON.
  const runner = {
    mode: modelSource === "cloud" ? "manual" : "external",
    intendedRunner: profile ? runnerLabel : undefined,
  };
  if (profile?.harnesses?.pi || runnerLabel === "Pi") runner.tool = "pi";
  if (modelSource) runner.modelSource = modelSource;
  if (backendLabel) runner.backendLabel = backendLabel;
  if (profile?.baseUrl) runner.baseUrl = profile.baseUrl;
  runner.model = modelId;
  runner.retries = 0;
  runner.tokenMetrics = {
    reported: false,
    promptTokens: 0,
    completionTokens: 0,
    totalTokens: 0,
  };
  runner.speedMetrics = {
    prefillTokensPerSecond: null,
    generationTokensPerSecond: null,
    ttftMs: null,
    modelLoadMs: null,
    speculativeDecodeAcceptance: null,
    kvCacheTokens: null,
  };
  runner.metricSource = null;

  const metadata = {
    schemaVersion: 1,
    kind,
    runId,
    benchmark: {
      id: benchmark.id,
      title: benchmark.title,
      description: benchmark.description,
      prompt: benchmark.prompt,
    },
    model: { id: modelId, slug: modelSlug },
    status: "prepared",
    createdAt: stamp,
    updatedAt: stamp,
    preparedAt: stamp,
    runDirectory,
    assets,
    runner,
    results: {
      wallClockMs: null,
      agentTurns: 0,
      toolCalls: 0,
      toolResults: 0,
      success: false,
      outputFiles: [],
      perTurn: [],
    },
  };

  await writeFile(join(runDirectory, "metadata.json"), `${JSON.stringify(metadata, null, 2)}\n`, "utf8");
  await writeFile(join(runDirectory, "prompt.md"), `${toolPrompt}\n`, "utf8");

  const summaryRows = [
    ["Directory", pc.cyan(runDirectory)],
    ["Benchmark", benchmark.title],
    ["Kind", kind],
    ["Model", pc.bold(modelId)],
    ["Source", backendLabel || modelSource],
  ];
  console.log("");
  console.log(pc.green("✓ Run slot prepared"));
  console.log(renderSection("Run", renderRows(summaryRows)));

  if (showNextSteps) {
    printBenchmarkNextSteps({ repoPath, runDirectory, profile, modelId, runnerLabel });
  }

  return runDirectory;
}
@@ -0,0 +1,77 @@
1
+ // ── Benchmark repo linking ────────────────────────────────────────────────────
2
+
3
+ import { existsSync } from "node:fs";
4
+ import { join, resolve } from "node:path";
5
+ import { homedir } from "node:os";
6
+ import { execFile } from "node:child_process";
7
+ import { promisify } from "node:util";
8
+ import { loadConfig, saveConfig } from "../config.mjs";
9
+ import { pc } from "../ui.mjs";
10
+
11
// Promise-returning wrapper around child_process.execFile.
const execFileAsync = promisify(execFile);

// Git URL of the benchmark gallery repository.
const BENCHMARK_REPO = "https://github.com/eeshansrivastava89/local-llm-visual-benchmark.git";

/**
 * Returns the benchmark repository path stored in the config, provided it
 * still contains a `benchmarks` entry on disk.
 *
 * @returns {Promise<string|null>} The configured path, or null when it is unset or stale.
 */
export async function findBenchmarkRepo() {
  const config = await loadConfig();
  const configuredPath = config.benchmarkRepoPath;
  if (!configuredPath) return null;
  return existsSync(join(configuredPath, "benchmarks")) ? configuredPath : null;
}
22
+
23
/**
 * Stores `repoPath` as `benchmarkRepoPath` in the config and returns it.
 *
 * @param {string} repoPath - Directory of the benchmark repository.
 * @returns {Promise<string>} The same path, for convenient chaining.
 */
async function rememberBenchmarkRepoPath(repoPath) {
  const config = await loadConfig();
  config.benchmarkRepoPath = repoPath;
  await saveConfig(config);
  return repoPath;
}

/**
 * Resolves the benchmark gallery repository and records it in the config.
 *
 * Resolution order: the already configured path, then three conventional
 * locations under the home directory, then an interactive choice between
 * cloning from GitHub into `~/dev` and entering a path manually. A directory
 * counts as the repository when it contains a `benchmarks` entry.
 *
 * @param {object} prompt - Interactive prompt exposing `choice(...)` and `text(...)`.
 * @returns {Promise<string|null>} The linked repository path, or null when the
 *   clone failed, nothing was entered, or the entered path has no `benchmarks/`.
 */
export async function linkBenchmarkRepo(prompt) {
  const existing = await findBenchmarkRepo();
  if (existing) return existing;

  const home = homedir();
  const candidates = [
    join(home, "dev", "local-llm-visual-benchmark"),
    join(home, "projects", "local-llm-visual-benchmark"),
    join(home, "local-llm-visual-benchmark"),
  ];
  const discovered = candidates.find((candidate) => existsSync(join(candidate, "benchmarks")));
  if (discovered) {
    return rememberBenchmarkRepoPath(discovered);
  }

  console.log(pc.dim("\nThe benchmark gallery needs to be linked to offgrid-ai."));
  console.log(pc.dim("This is the local-llm-visual-benchmark repo that stores prompts and run results.\n"));

  const choice = await prompt.choice("Link benchmark gallery", [
    { value: "clone", label: "Clone from GitHub", hint: "git clone into ~/dev" },
    { value: "manual", label: "Enter path manually", hint: "If you already have it cloned" },
  ], "clone");

  if (choice === "clone") {
    const targetDir = join(home, "dev", "local-llm-visual-benchmark");
    console.log(pc.dim(`\nCloning ${BENCHMARK_REPO}...`));
    try {
      // execFile already captures stdout/stderr; it has no `stdio` option.
      await execFileAsync("git", ["clone", BENCHMARK_REPO, targetDir]);
    } catch (err) {
      console.log(pc.red(`Clone failed: ${err.message}`));
      return null;
    }
    // Saved outside the try block so a config error is not reported as a clone failure.
    await rememberBenchmarkRepoPath(targetDir);
    console.log(pc.green(`✓ Cloned to ${targetDir}`));
    return targetDir;
  }

  const entered = await prompt.text("Path to local-llm-visual-benchmark", "");
  const trimmed = String(entered ?? "").trim();
  if (!trimmed) return null;
  // Expand only a bare "~" or "~/..." prefix; "~name" is not the current user's
  // home. A replacer function keeps "$" sequences in the home path literal.
  const resolved = resolve(trimmed.replace(/^~(?=$|[\\/])/u, () => home));
  if (!existsSync(join(resolved, "benchmarks"))) {
    console.log(pc.red(`No benchmarks/ directory found at ${resolved}`));
    return null;
  }
  await rememberBenchmarkRepoPath(resolved);
  console.log(pc.green(`✓ Linked to ${resolved}`));
  return resolved;
}
@@ -0,0 +1,54 @@
1
+ // ── Shared utilities (matches local-llm-visual-benchmark) ──────────────────
2
+
3
+ import { createHash } from "node:crypto";
4
+ import { readdir, readFile } from "node:fs/promises";
5
+ import { join } from "node:path";
6
+
7
+ export function slugModelId(modelId, maxLength = 80) {
8
+ const hash = createHash("sha256").update(modelId).digest("hex").slice(0, 10);
9
+ const normalized = modelId.normalize("NFKD").replace(/[\u0300-\u036f]/gu, "").toLowerCase();
10
+ const slug = normalized.replace(/[^a-z0-9]+/gu, "-").replace(/^-+|-+$/gu, "").replace(/-{2,}/gu, "-");
11
+ if (slug.length > 0 && slug.length <= maxLength && slug === normalized) return slug;
12
+ const baseMaxLength = Math.max(1, maxLength - 11);
13
+ const base = slug.slice(0, baseMaxLength).replace(/^-+|-+$/gu, "") || "model";
14
+ return `${base}-${hash}`;
15
+ }
16
+
17
/**
 * Derives a run id from a date: its ISO-8601 string with every ":" and "."
 * replaced by "-".
 *
 * @param {Date} [date=new Date()] - Moment the run was created.
 * @returns {string} For example "2024-01-02T03-04-05-678Z".
 */
export function createRunId(date = new Date()) {
  const iso = date.toISOString();
  return iso.replace(/[:.]/gu, "-");
}
20
+
21
/**
 * Returns the prompt text handed to the tool for a benchmark; this is the
 * benchmark's `prompt` property, unchanged.
 *
 * @param {object} benchmark - Benchmark definition with a `prompt` property.
 * @returns {*} The value of `benchmark.prompt`.
 */
export function buildToolPrompt(benchmark) {
  const { prompt } = benchmark;
  return prompt;
}
24
+
25
/**
 * Loads every `*.md` file in `benchDir` (sorted by file name) as a benchmark.
 *
 * A file may start with a `---` delimited frontmatter block of `key: value`
 * lines; `id`, `title` and `description` are read from it. Without a
 * frontmatter block the whole file is the prompt. `id` and `title` default to
 * the file name without `.md`, `description` to "". Both LF and CRLF line
 * endings are accepted, and a leading byte-order mark is ignored.
 *
 * @param {string} benchDir - Directory containing the benchmark markdown files.
 * @returns {Promise<Array<{id: string, title: string, description: string, prompt: string, kind: string}>>}
 *   Parsed benchmarks; `kind` is "data-science" for id "ab-test-analysis" and "visual" otherwise.
 */
export async function loadBenchmarks(benchDir) {
  const entries = await readdir(benchDir);
  const markdownFiles = entries.filter((f) => f.endsWith(".md")).sort();
  const benchmarks = [];
  for (const filename of markdownFiles) {
    const text = await readFile(join(benchDir, filename), "utf8");
    // Strip a UTF-8 BOM, which would otherwise keep "---" from matching at the start.
    const raw = text.replace(/^\uFEFF/u, "");
    // `\r?\n` so files saved with Windows line endings keep their frontmatter.
    const match = raw.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/u);
    const frontmatter = match ? match[1] : "";
    const content = match ? match[2].trim() : raw.trim();
    let id = filename.replace(/\.md$/u, "");
    let title = id;
    let description = "";
    for (const line of frontmatter.split(/\r?\n/u)) {
      const kv = line.match(/^(\w+):\s*(.+)$/u);
      if (!kv) continue;
      const [, key, val] = kv;
      if (key === "id") id = val.trim();
      if (key === "title") title = val.trim();
      if (key === "description") description = val.trim();
    }
    const kind = id === "ab-test-analysis" ? "data-science" : "visual";
    benchmarks.push({ id, title, description, prompt: content, kind });
  }
  return benchmarks;
}
51
+
52
/**
 * Returns the model string for a profile: the profile's `harnesses.pi.model`
 * when it is not null/undefined, otherwise `<providerId>/<modelAlias>`.
 *
 * @param {object} profile - Profile with `providerId`, `modelAlias` and optional `harnesses.pi.model`.
 * @returns {string} Model string.
 */
export function piModelString(profile) {
  const configuredModel = profile.harnesses?.pi?.model;
  if (configuredModel !== undefined && configuredModel !== null) {
    return configuredModel;
  }
  return `${profile.providerId}/${profile.modelAlias}`;
}