offgrid-ai 0.16.3 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,12 +14,28 @@ import { detectOmlxMtpCapability, findOmlxModelDir } from "./mlx-discovery.mjs";
14
14
  const execFileAsync = promisify(execFile);
15
15
 
16
16
  const CACHE_CHOICES = [
17
- { value: "bf16", label: "bf16", hint: "default: stable, good quality" },
18
- { value: "f16", label: "f16", hint: "stable fallback, similar memory to bf16" },
19
- { value: "q8_0", label: "q8_0", hint: "lower memory, usually safe" },
20
- { value: "q4_0", label: "q4_0", hint: "lowest memory, quality/speed tradeoff" },
17
+ { value: "bf16", label: "bf16", hint: "16-bit · best quality · 2 bytes/elem" },
18
+ { value: "f16", label: "f16", hint: "16-bit · stable fallback · 2 bytes/elem" },
19
+ { value: "q8_0", label: "q8_0", hint: "8-bit · half memory · usually safe · 1 byte/elem" },
20
+ { value: "q4_0", label: "q4_0", hint: "4-bit · quarter memory · quality tradeoff · 0.5 bytes/elem" },
21
21
  ];
22
22
 
23
+ function quickKvBytes(modelPath, flags) {
24
+ try {
25
+ return estimateMemory(modelPath, null, null, flags).kvBytes;
26
+ } catch {
27
+ return 0;
28
+ }
29
+ }
30
+
31
+ function cacheMemoryRows(modelPath, flags, ctxSize) {
32
+ const combos = [["bf16", "bf16"], ["f16", "f16"], ["q8_0", "q8_0"], ["q4_0", "q4_0"]];
33
+ return combos.map(([k, v]) => {
34
+ const bytes = quickKvBytes(modelPath, { ...flags, ctxSize, cacheTypeK: k, cacheTypeV: v });
35
+ return [`${k}/${v} KV cache`, bytes ? `~${formatBytes(bytes)}` : "unknown"];
36
+ });
37
+ }
38
+
23
39
  const GENERAL_DEFAULTS = {
24
40
  topK: 20,
25
41
  presencePenalty: 1.5,
@@ -40,7 +56,6 @@ export async function configureLocalProfile(prompt, profile) {
40
56
  const freshCaps = detectCapabilities(profile.modelPath, profile.mmprojPath);
41
57
  let drafterPath = profile.drafterPath ?? null;
42
58
  if (drafterPath && !existsSync(drafterPath)) {
43
- // Stored drafter is no longer on disk — drop it and re-scan for a fresh one.
44
59
  drafterPath = null;
45
60
  }
46
61
  if (!drafterPath) {
@@ -50,86 +65,229 @@ export async function configureLocalProfile(prompt, profile) {
50
65
  }
51
66
  const hasMtp = freshCaps.mtp || Boolean(drafterPath);
52
67
  const caps = { ...freshCaps, mtp: hasMtp };
53
- // MTP is a capability, not a separate backend. Just update the profile's
54
- // capabilities and drafter path — flag computation handles the rest.
55
68
  configured = { ...configured, drafterPath: drafterPath ?? null, capabilities: { ...configured.capabilities, mtp: hasMtp } };
56
- // If vision was previously disabled but mmproj is back, re-enable
57
69
  if (configured.disabledMmprojPath && configured.mmprojPath === null && freshCaps.vision) {
58
70
  configured = { ...configured, mmprojPath: configured.disabledMmprojPath, disabledMmprojPath: undefined, capabilities: { ...configured.capabilities, vision: true, visionDisabledReason: undefined } };
59
71
  }
60
72
 
73
+ // ── Model overview ──────────────────────────────────────────────────────
61
74
  console.log("");
62
- console.log(renderSection("Model setup", renderRows([
75
+ console.log(renderSection("Model overview", renderRows([
63
76
  ["Model", pc.bold(profile.label)],
64
77
  ["Detected", capabilitySummary(caps)],
65
- ["Context", `${profile.flags.ctxSize.toLocaleString()} tokens`],
66
- ["KV cache", `${profile.flags.cacheTypeK}/${profile.flags.cacheTypeV}`],
67
- ["Sampling", samplingSummary(profile.flags)],
78
+ ["Backend", "llama.cpp (local server)"],
79
+ ["Model file", profile.modelPath],
68
80
  ])));
69
- console.log(pc.dim("Larger context windows use more memory. KV cache precision controls memory used by attention history."));
70
- console.log(pc.dim("Sampling defaults are shown for transparency; you can edit the profile later if needed.\n"));
71
81
 
82
+ // ── MTP ────────────────────────────────────────────────────────────────
72
83
  if (caps.mtp) {
73
- const drafterInfo = configured.drafterPath ? `\n Drafter: ${configured.drafterPath}` : "";
74
- console.log(renderSection("MTP detected", renderRows([
75
- ["Feature", "Multi-Token Prediction (speculative decoding)"],
76
- ["Flags", `--spec-type draft-mtp --spec-draft-n-max 4${configured.drafterPath ? " --spec-draft-model <drafter>" : ""}`],
77
- ])));
78
- if (drafterInfo) console.log(pc.dim(drafterInfo));
79
- const useMtp = await prompt.yesNo("Use MTP speculative decoding?", true);
80
- configured = useMtp ? applyMtpDefaults(configured) : removeMtpDefaults(configured);
81
- }
82
-
83
- if (caps.qat) {
84
84
  console.log("");
85
- console.log(renderSection("QAT detected", renderRows([
86
- ["Meaning", "quantization-aware trained"],
87
- ["Runtime flags", "none QAT-specific"],
85
+ console.log(renderSection("MTP (Multi-Token Prediction)", renderRows([
86
+ ["What it does", "Speculative decoding — predicts multiple tokens at once, verifies them"],
87
+ ["Speed", "1.5–3x faster generation"],
88
+ ["Quality", "No loss — rejected predictions fall back to normal"],
89
+ ["Memory", "Slightly more for draft model weights"],
90
+ ["Flags", "--spec-type draft-mtp --spec-draft-n-max 4"],
91
+ ...(configured.drafterPath ? [["Drafter", configured.drafterPath]] : []),
88
92
  ])));
93
+ const useMtp = await prompt.yesNo("Enable MTP speculative decoding?", true);
94
+ configured = useMtp ? applyMtpDefaults(configured) : removeMtpDefaults(configured);
89
95
  }
90
96
 
97
+ // ── Vision ─────────────────────────────────────────────────────────────
91
98
  if (caps.vision && profile.mmprojPath) {
92
99
  console.log("");
93
100
  const gemma4Unified = isGemma4UnifiedProjector(caps.mmprojProjectorType);
94
101
  const supported = !gemma4Unified || await runtimeSupportsGemma4Unified();
95
- console.log(renderSection("Vision projector detected", renderRows([
96
- ["Projector", caps.mmprojProjectorType ?? "unknown"],
102
+ console.log(renderSection("Vision projector", renderRows([
103
+ ["What it does", "Enables image understanding — model can see and reason about images"],
104
+ ["Projector type", caps.mmprojProjectorType ?? "unknown"],
97
105
  ["Flag", `--mmproj ${profile.mmprojPath}`],
106
+ ["Memory", "~200 MB for projector weights (varies by model)"],
98
107
  ...(gemma4Unified && !supported ? [["Note", pc.yellow("Gemma 4 unified projectors need llama.cpp b9549+.")]] : []),
99
108
  ])));
100
- const useVision = await prompt.yesNo("Enable vision with --mmproj?", supported);
109
+ const useVision = await prompt.yesNo("Enable vision?", supported);
101
110
  configured = useVision ? applyVisionDefaults(configured) : removeVisionDefaults(configured, gemma4Unified && !supported ? "gemma4-unified-unsupported" : "user-disabled");
102
111
  }
103
112
 
113
+ // ── Thinking ───────────────────────────────────────────────────────────
104
114
  if (caps.thinking) {
105
115
  console.log("");
106
- console.log(renderSection("Thinking model detected", renderRows([
107
- ["Defaults", "thinking / loop-safe"],
108
- ["Flags", "--top-k 64 --presence-penalty 0 --repeat-penalty 1.1"],
116
+ console.log(renderSection("Thinking mode", renderRows([
117
+ ["What it does", "Model reasons step-by-step before answering"],
118
+ ["Benefit", "Better results for math, code, logic — but slower (more output tokens)"],
119
+ ["Sampling changes", "top-k → 64, presence penalty → 0, repeat penalty → 1.1 (adjustable below)"],
109
120
  ["Template", "--chat-template-kwargs { enable_thinking: true }"],
110
121
  ])));
111
- const useThinking = await prompt.yesNo("Use these thinking/loop-safe defaults?", true);
122
+ const useThinking = await prompt.yesNo("Use thinking/loop-safe defaults?", true);
112
123
  configured = useThinking ? applyThinkingDefaults(configured) : removeThinkingDefaults(configured);
113
124
  }
114
125
 
115
- const ctxSize = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, 1048576);
126
+ // ── Context window ─────────────────────────────────────────────────────
127
+ const maxCtx = caps.metaCtx ?? 1048576;
128
+ console.log("");
129
+ console.log(renderSection("Context window", renderRows([
130
+ ["What it does", "Maximum tokens the model can process at once (prompt + response + history)"],
131
+ ["Range", `1,024 – ${maxCtx.toLocaleString()} tokens`],
132
+ ["Memory", "KV cache grows linearly — larger context = more RAM"],
133
+ ["Guidance", "8k–32k: chat · 32k–80k: coding/long convos · 128k+: long documents"],
134
+ ["Model max", `${maxCtx.toLocaleString()} tokens`],
135
+ ["Default", `${configured.flags.ctxSize.toLocaleString()} tokens`],
136
+ ])));
137
+ const ctxSize = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, maxCtx);
138
+ configured = applyRuntimeFlagOverrides(configured, { ctxSize });
139
+
140
+ // ── K cache precision ──────────────────────────────────────────────────
141
+ console.log("");
142
+ console.log(renderSection("K cache precision", renderRows([
143
+ ["What it is", "KV cache stores attention 'keys' — previous token states used for prediction"],
144
+ ["Tradeoff", "Lower precision = less memory, potential quality loss"],
145
+ ...cacheMemoryRows(profile.modelPath, configured.flags, ctxSize),
146
+ ])));
116
147
  const cacheTypeK = await prompt.choice("K cache precision", CACHE_CHOICES, configured.flags.cacheTypeK);
148
+ configured = applyRuntimeFlagOverrides(configured, { cacheTypeK });
149
+
150
+ // ── V cache precision ──────────────────────────────────────────────────
151
+ console.log("");
152
+ console.log(renderSection("V cache precision", renderRows([
153
+ ["What it is", "KV cache stores attention 'values' — token representations from previous layers"],
154
+ ["Tradeoff", "Same as K cache. Some models are more sensitive to V precision than K"],
155
+ ...cacheMemoryRows(profile.modelPath, configured.flags, ctxSize),
156
+ ])));
117
157
  const cacheTypeV = await prompt.choice("V cache precision", CACHE_CHOICES, configured.flags.cacheTypeV);
118
- configured = applyRuntimeFlagOverrides(configured, { ctxSize, cacheTypeK, cacheTypeV });
158
+ configured = applyRuntimeFlagOverrides(configured, { cacheTypeV });
159
+
160
+ // ── Memory estimate (with chosen context + cache) ──────────────────────
161
+ console.log("");
162
+ console.log(renderMemoryEstimate(configured));
163
+
164
+ // ── Temperature ────────────────────────────────────────────────────────
165
+ console.log("");
166
+ console.log(renderSection("Temperature", renderRows([
167
+ ["What it does", "Controls randomness in token selection"],
168
+ ["Range", "0.0 – 2.0"],
169
+ ["0.0", "Deterministic — always picks most likely token"],
170
+ ["0.6", "Balanced — default for most models"],
171
+ ["1.0+", "Creative — more random, may hallucinate"],
172
+ ["Guidance", "0–0.3: coding/factual · 0.4–0.8: chat · 0.9+: creative writing"],
173
+ ])));
174
+ const temperature = await prompt.number("Temperature", configured.flags.temperature, 0, 2, { float: true });
175
+ configured = applyRuntimeFlagOverrides(configured, { temperature });
176
+
177
+ // ── Top-p ──────────────────────────────────────────────────────────────
178
+ console.log("");
179
+ console.log(renderSection("Top-p (nucleus sampling)", renderRows([
180
+ ["What it does", "Only considers tokens in the top p fraction of probability mass"],
181
+ ["Range", "0.0 – 1.0 (1.0 = consider all tokens)"],
182
+ ["Guidance", "0.9–0.95: good default · Lower = more focused · Higher = more diverse"],
183
+ ])));
184
+ const topP = await prompt.number("Top-p", configured.flags.topP, 0, 1, { float: true });
185
+ configured = applyRuntimeFlagOverrides(configured, { topP });
186
+
187
+ // ── Top-k ──────────────────────────────────────────────────────────────
188
+ console.log("");
189
+ console.log(renderSection("Top-k", renderRows([
190
+ ["What it does", "Limits token selection to top K most likely tokens at each step"],
191
+ ["Range", "0 – 1000 (0 = disabled, uses top-p instead)"],
192
+ ["Guidance", "20: general chat · 40–64: thinking/reasoning · 0: rely on top-p"],
193
+ ])));
194
+ const topK = await prompt.number("Top-k", configured.flags.topK, 0, 1000);
195
+ configured = applyRuntimeFlagOverrides(configured, { topK });
196
+
197
+ // ── Min-p ──────────────────────────────────────────────────────────────
198
+ console.log("");
199
+ console.log(renderSection("Min-p", renderRows([
200
+ ["What it does", "Excludes tokens with probability below this threshold"],
201
+ ["Range", "0.0 – 1.0 (0 = disabled)"],
202
+ ["Guidance", "0: off (default) · 0.05–0.1: reduces hallucination while keeping creativity"],
203
+ ])));
204
+ const minP = await prompt.number("Min-p", configured.flags.minP, 0, 1, { float: true });
205
+ configured = applyRuntimeFlagOverrides(configured, { minP });
206
+
207
+ // ── Presence penalty ───────────────────────────────────────────────────
208
+ console.log("");
209
+ console.log(renderSection("Presence penalty", renderRows([
210
+ ["What it does", "Discourages repeated tokens — each used token gets a fixed logit penalty"],
211
+ ["Range", "0.0 – 2.0 (0 = off)"],
212
+ ["Guidance", "0: thinking/reasoning · 1.0–1.5: general chat · Too high = incoherent"],
213
+ ])));
214
+ const presencePenalty = await prompt.number("Presence penalty", configured.flags.presencePenalty, 0, 2, { float: true });
215
+ configured = applyRuntimeFlagOverrides(configured, { presencePenalty });
216
+
217
+ // ── Repeat penalty ─────────────────────────────────────────────────────
218
+ console.log("");
219
+ console.log(renderSection("Repeat penalty", renderRows([
220
+ ["What it does", "Multiplies probability of repeated tokens (multiplicative, vs presence's additive)"],
221
+ ["Range", "0.0 – 2.0 (1.0 = no effect)"],
222
+ ["Guidance", "1.0–1.1: most use cases · Higher = aggressive anti-repeat · Can break code patterns"],
223
+ ])));
224
+ const repeatPenalty = await prompt.number("Repeat penalty", configured.flags.repeatPenalty, 0, 2, { float: true });
225
+ configured = applyRuntimeFlagOverrides(configured, { repeatPenalty });
226
+
227
+ // ── Batch size ─────────────────────────────────────────────────────────
228
+ console.log("");
229
+ console.log(renderSection("Batch size", renderRows([
230
+ ["What it does", "Tokens processed in parallel during prompt processing (before generation)"],
231
+ ["Range", "1 – 4096"],
232
+ ["Guidance", "512: good default · 2048+: faster for long prompts · Lower if memory tight"],
233
+ ])));
234
+ const batchSize = await prompt.number("Batch size", configured.flags.batchSize, 1, 4096);
235
+ configured = applyRuntimeFlagOverrides(configured, { batchSize });
236
+
237
+ // ── Parallel slots ─────────────────────────────────────────────────────
238
+ console.log("");
239
+ console.log(renderSection("Parallel slots", renderRows([
240
+ ["What it does", "Number of concurrent request slots (each handles an independent conversation)"],
241
+ ["Range", "1 – 10"],
242
+ ["Memory", "KV cache is multiplied by this number"],
243
+ ["Guidance", "1: single-user local · 2+: multiple clients connecting simultaneously"],
244
+ ])));
245
+ const parallel = await prompt.number("Parallel slots", configured.flags.parallel, 1, 10);
246
+ configured = applyRuntimeFlagOverrides(configured, { parallel });
247
+
248
+ // ── Flash attention ────────────────────────────────────────────────────
249
+ console.log("");
250
+ console.log(renderSection("Flash attention", renderRows([
251
+ ["What it does", "Memory-efficient attention algorithm — faster and less RAM than standard"],
252
+ ["Guidance", "Always on for modern hardware · Turn off only for old GPU driver compat issues"],
253
+ ])));
254
+ const flashAttn = await prompt.yesNo("Enable flash attention?", true);
255
+ configured = applyRuntimeFlagOverrides(configured, { flashAttention: flashAttn ? "on" : "off" });
256
+
257
+ // ── Jinja templates ────────────────────────────────────────────────────
258
+ console.log("");
259
+ console.log(renderSection("Jinja chat templates", renderRows([
260
+ ["What it does", "Enables Jinja2 template rendering for proper chat formatting"],
261
+ ["Guidance", "Always on for modern models · Turn off only for very old models without chat templates"],
262
+ ])));
263
+ const jinja = await prompt.yesNo("Enable Jinja templates?", true);
264
+ configured = applyRuntimeFlagOverrides(configured, { jinja });
119
265
 
266
+ // ── Final summary ──────────────────────────────────────────────────────
120
267
  console.log("");
121
- console.log(renderSection("Defaults", renderRows([
268
+ console.log(renderSection("Configuration summary", renderRows([
269
+ ["Model", pc.bold(configured.label)],
122
270
  ["Backend", configured.backend],
123
271
  ["Endpoint", configured.baseUrl],
124
- ["Temperature", configured.flags.temperature],
125
- ["Top-p", configured.flags.topP],
126
- ["Top-k", configured.flags.topK],
127
- ["Min-p", configured.flags.minP],
128
- ["Presence penalty", configured.flags.presencePenalty],
129
- ["Repeat penalty", configured.flags.repeatPenalty],
272
+ ["Context", `${configured.flags.ctxSize.toLocaleString()} tokens`],
273
+ ["KV cache", `${configured.flags.cacheTypeK}/${configured.flags.cacheTypeV}`],
274
+ ["Temperature", String(configured.flags.temperature)],
275
+ ["Top-p", String(configured.flags.topP)],
276
+ ["Top-k", String(configured.flags.topK)],
277
+ ["Min-p", String(configured.flags.minP)],
278
+ ["Presence penalty", String(configured.flags.presencePenalty)],
279
+ ["Repeat penalty", String(configured.flags.repeatPenalty)],
280
+ ["Batch size", String(configured.flags.batchSize)],
281
+ ["Parallel", String(configured.flags.parallel)],
282
+ ["Flash attention", configured.flags.flashAttention],
283
+ ["Jinja", configured.flags.jinja ? "on" : "off"],
284
+ ...(configured.capabilities?.mtp ? [["MTP", "enabled"]] : []),
285
+ ...(configured.capabilities?.vision ? [["Vision", "enabled"]] : []),
286
+ ...(configured.capabilities?.thinking ? [["Thinking", "enabled"]] : []),
130
287
  ])));
131
288
 
132
- console.log("\n" + renderMemoryEstimate(configured));
289
+ console.log("");
290
+ console.log(renderMemoryEstimate(configured));
133
291
  if (!(await prompt.yesNo("Save profile with these settings?", true))) return null;
134
292
  return configured;
135
293
  }
@@ -240,10 +398,12 @@ function applyProfileFlags(profile, flags) {
240
398
 
241
399
  function renderMemoryEstimate(profile) {
242
400
  try {
243
- const est = estimateMemory(profile.modelPath, profile.mmprojPath, null, profile.flags);
401
+ const est = estimateMemory(profile.modelPath, profile.mmprojPath, profile.drafterPath, profile.flags);
244
402
  return renderSection("Memory estimate", renderRows([
245
403
  ["Estimated total", pc.bold(`~${formatBytes(est.totalBytes)}`)],
246
404
  ["Model", formatBytes(est.modelBytes)],
405
+ ...(est.mmprojBytes ? [["Vision projector", formatBytes(est.mmprojBytes)]] : []),
406
+ ...(est.draftBytes ? [["Drafter (MTP)", formatBytes(est.draftBytes)]] : []),
247
407
  ["KV cache", est.kvBytes ? `~${formatBytes(est.kvBytes)} (${profile.flags.ctxSize.toLocaleString()} ctx, ${profile.flags.cacheTypeK}/${profile.flags.cacheTypeV})` : "unknown"],
248
408
  ...(est.note ? [["Note", pc.yellow(est.note)]] : []),
249
409
  ]));
@@ -269,8 +429,5 @@ async function runtimeSupportsGemma4Unified() {
269
429
  }
270
430
  }
271
431
 
272
- function samplingSummary(flags) {
273
- return `temp ${flags.temperature}, top-p ${flags.topP}, top-k ${flags.topK}`;
274
- }
275
432
 
276
433
 
package/src/ui.mjs CHANGED
@@ -95,12 +95,12 @@ export function renderCard(title, body, options = {}) {
95
95
  }
96
96
 
97
97
  const topTitle = title ? `╭${pc.reset(titleStr)}` : "╭";
98
- const topFill = "─".repeat(Math.max(0, width + 2 - visibleLen(titleStr)));
98
+ const topFill = "─".repeat(Math.max(0, width - visibleLen(titleStr)));
99
99
  const top = `${topTitle}${topFill}╮`;
100
100
 
101
101
  const middle = lines.map((line) => `│ ${padVisible(line, contentWidth)} │`);
102
102
 
103
- const bottom = `╰${"─".repeat(width + 2)}╯`;
103
+ const bottom = `╰${"─".repeat(width)}╯`;
104
104
 
105
105
  return [top, ...middle, bottom].map((l) => borderColor(l)).join("\n");
106
106
  }
@@ -180,9 +180,8 @@ async function runPrompt(fn, config) {
180
180
 
181
181
  // ── Interactive prompt factory ──────────────────────────────────────────────
182
182
 
183
- export function startInteractive(title = "offgrid-ai") {
183
+ export function startInteractive() {
184
184
  if (process.stdin.isTTY) console.clear();
185
- console.log(pc.magenta(`◆ ${title}`));
186
185
  }
187
186
 
188
187
  export function createPrompt() {
@@ -195,10 +194,13 @@ export function createPrompt() {
195
194
  return value?.trim() || String(defaultValue ?? "");
196
195
  },
197
196
 
198
- async number(label, defaultValue, min, max) {
197
+ async number(label, defaultValue, min, max, { float = false } = {}) {
199
198
  const value = await runPrompt(number, {
200
199
  message: label,
201
200
  default: defaultValue,
201
+ min,
202
+ max,
203
+ step: float ? 'any' : 1,
202
204
  validate(input) {
203
205
  if (!Number.isFinite(input) || input < min || input > max) {
204
206
  return `Enter a number from ${min} to ${max}.`;
@@ -235,12 +237,10 @@ export function createPrompt() {
235
237
 
236
238
  export async function modelSelect(label, groups, { defaultKey, pageSize = 20 } = {}) {
237
239
  const choices = [];
238
- // Separator below the prompt message
239
- choices.push(new Separator(pc.dim(" ────────────────────────────────────────────────────────────")));
240
240
  for (let i = 0; i < groups.length; i++) {
241
241
  const group = groups[i];
242
242
  // Add blank line before each group (except the first)
243
- if (i > 0) choices.push(new Separator(""));
243
+ if (i > 0) choices.push(new Separator(" "));
244
244
  if (group.separator) {
245
245
  choices.push(new Separator(group.separator));
246
246
  }
@@ -1,169 +0,0 @@
1
- // ── Benchmark finalization (metadata + summary rendering) ───────────────────
2
- // unloadModelFromServer has been moved to src/process.mjs (it's the managed-server
3
- // counterpart to stopProfile, used by both the benchmark flow and the Pi chat flow).
4
-
5
- import { existsSync } from "node:fs";
6
- import { readFile, writeFile } from "node:fs/promises";
7
- import { join } from "node:path";
8
- import { pc, renderRows, renderSection } from "../ui.mjs";
9
-
10
- export async function finalizeBenchmarkRun(runDirectory, runResult, speedMetrics, speedMetricsError = null) {
11
- const metadataPath = join(runDirectory, "metadata.json");
12
- const metadata = JSON.parse(await readFile(metadataPath, "utf8"));
13
- const now = new Date();
14
- const timestamp = now.toISOString();
15
-
16
- const kind = metadata.kind ?? "visual";
17
- const isDs = kind === "data-science";
18
- const requiredFile = isDs ? "analysis.ipynb" : "index.html";
19
- const requiredPath = join(runDirectory, requiredFile);
20
-
21
- const outputFiles = [];
22
- for (const candidate of [requiredFile, isDs ? "summary.json" : "preview.png", isDs ? "chart-distribution.png" : "preview.webm", "preview.mp4"]) {
23
- if (existsSync(join(runDirectory, candidate))) {
24
- outputFiles.push(candidate);
25
- }
26
- }
27
-
28
- const success = existsSync(requiredPath) && (await readFile(requiredPath, "utf8")).trim().length > 0;
29
- const hasTurns = runResult.agentTurns > 0;
30
-
31
- let failureReason = null;
32
- if (runResult.error) {
33
- failureReason = typeof runResult.error === "string" ? runResult.error : (runResult.error.message ?? "Unknown error");
34
- } else if (!hasTurns) {
35
- failureReason = "The model did not produce any response turns.";
36
- } else if (!success) {
37
- if (runResult.toolCalls === 0) {
38
- failureReason = `The model finished without writing the required output file (${requiredFile}). It may have returned the response as chat text instead of using the write tool.`;
39
- } else {
40
- failureReason = `The required output file (${requiredFile}) was missing or empty after the run.`;
41
- }
42
- }
43
-
44
- const failed = failureReason !== null;
45
-
46
- metadata.status = failed ? "failed" : "completed";
47
- metadata.updatedAt = timestamp;
48
- if (failed) {
49
- metadata.failedAt = timestamp;
50
- } else {
51
- metadata.completedAt = timestamp;
52
- }
53
-
54
- const totalTokens = runResult.promptTokens + runResult.completionTokens;
55
-
56
- metadata.runner.tokenMetrics = {
57
- reported: hasTurns,
58
- promptTokens: runResult.promptTokens,
59
- completionTokens: runResult.completionTokens,
60
- totalTokens,
61
- };
62
-
63
- metadata.runner.speedMetrics = speedMetrics;
64
- metadata.runner.metricSource = speedMetrics?.metricSource ?? null;
65
- metadata.runner.speedMetricsError = speedMetricsError ?? null;
66
-
67
- metadata.results = {
68
- wallClockMs: runResult.wallClockMs,
69
- agentTurns: runResult.agentTurns,
70
- toolCalls: runResult.toolCalls,
71
- toolResults: runResult.toolResults,
72
- success,
73
- outputFiles,
74
- perTurn: runResult.perTurn,
75
- };
76
-
77
- if (failureReason) {
78
- metadata.error = { message: failureReason, ...(typeof runResult.error === "object" && runResult.error?.stack ? { stack: runResult.error.stack } : {}) };
79
- } else if (runResult.error) {
80
- metadata.error = typeof runResult.error === "string"
81
- ? { message: runResult.error }
82
- : { message: runResult.error.message ?? "Unknown error", ...(runResult.error.stack ? { stack: runResult.error.stack } : {}) };
83
- }
84
-
85
- await writeFile(metadataPath, JSON.stringify(metadata, null, 2) + "\n", "utf8");
86
- return metadata;
87
- }
88
-
89
- function formatMetric(value, formatter) {
90
- if (value === null || value === undefined || !Number.isFinite(value)) return pc.dim("—");
91
- return formatter(value);
92
- }
93
-
94
- function formatMs(ms) {
95
- return formatMetric(ms, (n) => (n < 1000 ? `${Math.round(n)} ms` : `${(n / 1000).toFixed(1)} s`));
96
- }
97
-
98
- function formatNumber(n) {
99
- return formatMetric(n, (v) => v.toLocaleString());
100
- }
101
-
102
- function formatTokPerSec(n) {
103
- return formatMetric(n, (v) => `${v.toFixed(1)} tok/s`);
104
- }
105
-
106
- function formatPercent(n) {
107
- return formatMetric(n, (v) => `${(v * 100).toFixed(0)} %`);
108
- }
109
-
110
- export function renderBenchmarkSummary(metadata) {
111
- const { status, results, runner, error } = metadata;
112
-
113
- const agentRows = [
114
- ["Status", status === "completed" ? pc.green("completed") : pc.red(status ?? "failed")],
115
- ["Duration", formatMs(results?.wallClockMs)],
116
- ["Agent turns", formatNumber(results?.agentTurns)],
117
- ["Input tokens", formatNumber(runner?.tokenMetrics?.promptTokens)],
118
- ["Output tokens", formatNumber(runner?.tokenMetrics?.completionTokens)],
119
- ["Total tokens", formatNumber(runner?.tokenMetrics?.totalTokens)],
120
- ["Tool calls", formatNumber(results?.toolCalls)],
121
- ["Tool results", formatNumber(results?.toolResults)],
122
- ["Output files", (results?.outputFiles?.length ?? 0) > 0 ? results.outputFiles.join(", ") : pc.dim("—")],
123
- ];
124
-
125
- console.log("");
126
- console.log(renderSection("Benchmark Result", renderRows(agentRows)));
127
-
128
- if (status === "completed" && runner?.speedMetrics) {
129
- const speed = runner.speedMetrics;
130
- const speedRows = [
131
- ["Prefill tok/s", formatTokPerSec(speed.prefillTokensPerSecond)],
132
- ["Generation tok/s", formatTokPerSec(speed.generationTokensPerSecond)],
133
- ["TTFT", formatMs(speed.ttftMs)],
134
- ["Speculative decode", formatPercent(speed.speculativeDecodeAcceptance)],
135
- ["KV cache tokens", formatNumber(speed.kvCacheTokens)],
136
- ["Model load time", formatMs(speed.modelLoadMs)],
137
- ["Metric source", speed.metricSource ?? pc.dim("—")],
138
- ];
139
- console.log(renderSection("Speed Metrics", renderRows(speedRows)));
140
- } else if (error) {
141
- const wrappedError = wrapText(error.message ?? "Unknown error");
142
- console.log(renderSection("Error", pc.red(wrappedError)));
143
- if (error.message?.includes("write tool") || error.message?.includes("required output file")) {
144
- const tip = wrapText("Tip: This usually means the model returned the answer as chat text instead of writing the file. Try a model with stronger tool-use support, or run the prompt manually.", 64);
145
- console.log(pc.dim("\n" + tip));
146
- }
147
- }
148
-
149
- if (status === "completed" && !runner?.speedMetrics && runner?.speedMetricsError) {
150
- console.log(pc.dim(`\nSpeed metrics unavailable: ${runner.speedMetricsError}`));
151
- }
152
- }
153
-
154
- function wrapText(text, width = 64) {
155
- if (!text) return "";
156
- const words = text.split(/\s+/);
157
- const lines = [];
158
- let current = "";
159
- for (const word of words) {
160
- if ((current + " " + word).trim().length > width) {
161
- if (current) lines.push(current.trim());
162
- current = word;
163
- } else {
164
- current = current ? `${current} ${word}` : word;
165
- }
166
- }
167
- if (current) lines.push(current.trim());
168
- return lines.join("\n");
169
- }