offgrid-ai 0.16.3 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/package.json +3 -11
- package/resources/recommendations.json +8 -8
- package/src/cli.mjs +1 -4
- package/src/commands/main.mjs +20 -1
- package/src/commands/models.mjs +296 -39
- package/src/commands/onboard.mjs +6 -106
- package/src/commands/run.mjs +2 -4
- package/src/commands/status.mjs +1 -0
- package/src/commands/stop.mjs +1 -0
- package/src/config.mjs +16 -1
- package/src/discovery-shared.mjs +2 -3
- package/src/download.mjs +221 -0
- package/src/harness-pi.mjs +2 -3
- package/src/huggingface.mjs +72 -72
- package/src/managed.mjs +1 -6
- package/src/model-name.mjs +2 -2
- package/src/model-presenters.mjs +5 -36
- package/src/model-summary.mjs +2 -2
- package/src/omlx-runtime.mjs +29 -4
- package/src/process.mjs +3 -5
- package/src/profile-setup.mjs +206 -49
- package/src/profiles.mjs +1 -1
- package/src/runtime.mjs +2 -2
- package/src/ui.mjs +10 -8
- package/resources/hf-download.py +0 -79
- package/src/backend-installers.mjs +0 -42
- package/src/benchmark/finalize.mjs +0 -169
- package/src/benchmark/flow.mjs +0 -240
- package/src/benchmark/metrics.mjs +0 -107
- package/src/benchmark/prepare.mjs +0 -118
- package/src/benchmark/repo.mjs +0 -77
- package/src/benchmark/sdk-runner.mjs +0 -363
- package/src/benchmark/shared.mjs +0 -46
- package/src/benchmark.mjs +0 -12
- package/src/commands/benchmark.mjs +0 -4
package/src/model-presenters.mjs
CHANGED
|
@@ -1,23 +1,20 @@
|
|
|
1
1
|
import { existsSync, statSync } from "node:fs";
|
|
2
2
|
import { basename, dirname, join } from "node:path";
|
|
3
|
+
import { stripVTControlCharacters } from "node:util";
|
|
3
4
|
import { backendFor } from "./backends.mjs";
|
|
4
5
|
import { computeServerCommand, buildStartScript, isProfileRunning } from "./process.mjs";
|
|
5
6
|
import { profileDir } from "./profiles.mjs";
|
|
6
7
|
import { pc, formatBytes, renderSectionRows } from "./ui.mjs";
|
|
7
8
|
import { capabilitySummary, ggufDetailParts, isProfileFileMissing, profileDetailParts } from "./model-summary.mjs";
|
|
8
9
|
import { itemKey } from "./model-catalog.mjs";
|
|
9
|
-
import { DATA_DIR } from "./config.mjs";
|
|
10
|
-
import { findBenchmarkRepo } from "./benchmark.mjs";
|
|
11
10
|
|
|
12
|
-
const OPTION_SEPARATOR =
|
|
11
|
+
const OPTION_SEPARATOR = " ";
|
|
13
12
|
const OPTION_STATUS_WIDTH = 12;
|
|
14
13
|
const OPTION_BACKEND_WIDTH = 14;
|
|
15
14
|
const OPTION_SOURCE_WIDTH = 14;
|
|
16
15
|
const OPTION_QUANT_WIDTH = 10;
|
|
17
16
|
const OPTION_CTX_WIDTH = 5;
|
|
18
17
|
|
|
19
|
-
const { stripVTControlCharacters } = await import("node:util");
|
|
20
|
-
|
|
21
18
|
function optionPad(text, color, width) {
|
|
22
19
|
const visible = stripVTControlCharacters(String(text)).length;
|
|
23
20
|
const padding = Math.max(1, width - visible);
|
|
@@ -198,35 +195,6 @@ export function inferBackendId(item) {
|
|
|
198
195
|
return "llama-cpp";
|
|
199
196
|
}
|
|
200
197
|
|
|
201
|
-
export function printWorkspaceHeader(normalized, runningProfilesNow, modelMissingIds = new Set()) {
|
|
202
|
-
const profiles = normalized.profiles;
|
|
203
|
-
const isRunning = (p) => runningProfilesNow.some((r) => r.id === p.id);
|
|
204
|
-
const isMissing = (p) => isProfileFileMissing(p) || modelMissingIds.has(p.id);
|
|
205
|
-
const readyCount = profiles.filter((p) => !isMissing(p) && !isRunning(p)).length;
|
|
206
|
-
const runningCount = runningProfilesNow.length;
|
|
207
|
-
const missingCount = profiles.filter(isMissing).length;
|
|
208
|
-
const setupCount = normalized.newModels.length + normalized.managedItems.length;
|
|
209
|
-
|
|
210
|
-
const countParts = [];
|
|
211
|
-
if (runningCount > 0) countParts.push(pc.green(`${runningCount} running`));
|
|
212
|
-
if (readyCount > 0) countParts.push(pc.blue(`${readyCount} model${readyCount === 1 ? "" : "s"} ready`));
|
|
213
|
-
if (missingCount > 0) countParts.push(pc.red(`${missingCount} model${missingCount === 1 ? "" : "s"} missing`));
|
|
214
|
-
if (setupCount > 0) countParts.push(pc.yellow(`${setupCount} model${setupCount === 1 ? "" : "s"} need${setupCount === 1 ? "s" : ""} setup`));
|
|
215
|
-
|
|
216
|
-
console.log(` ${countParts.join(pc.dim(" · "))}`);
|
|
217
|
-
console.log(pc.dim(` Profiles: ${DATA_DIR}`));
|
|
218
|
-
console.log(pc.dim(" ─────────────────────────────────────────────────────────"));
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
export async function printBenchmarkLine() {
|
|
222
|
-
const repoPath = await findBenchmarkRepo();
|
|
223
|
-
if (repoPath) {
|
|
224
|
-
console.log(pc.green(" ✓") + " local-llm-visual-benchmark linked");
|
|
225
|
-
} else {
|
|
226
|
-
console.log(pc.yellow(" ○") + " to run benchmarks, pair with " + pc.cyan("local-llm-visual-benchmark"));
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
|
|
230
198
|
export async function printProfileDetails(profile) {
|
|
231
199
|
const backend = backendFor(profile.backend);
|
|
232
200
|
const isManaged = backend.type === "managed-server";
|
|
@@ -266,8 +234,9 @@ export async function printProfileDetails(profile) {
|
|
|
266
234
|
const scriptPath = join(profileDir(profile.id), "start.sh");
|
|
267
235
|
console.log("\n" + renderSectionRows("Server command", [
|
|
268
236
|
["Run manually", pc.cyan(`bash ${scriptPath}`)],
|
|
269
|
-
|
|
270
|
-
|
|
237
|
+
]));
|
|
238
|
+
console.log("");
|
|
239
|
+
console.log(pc.dim(script));
|
|
271
240
|
}
|
|
272
241
|
}
|
|
273
242
|
}
|
package/src/model-summary.mjs
CHANGED
|
@@ -23,7 +23,7 @@ export function capabilitySummary(caps) {
|
|
|
23
23
|
return parts.length > 0 ? parts.join(" · ") : "standard GGUF";
|
|
24
24
|
}
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
function profileMtpLabel(profile, drafters, { detailed = false } = {}) {
|
|
27
27
|
if (profile.drafterPath) {
|
|
28
28
|
return detailed ? pc.green(`MTP enabled (drafter: ${basename(profile.drafterPath)})`) : pc.green("MTP enabled");
|
|
29
29
|
}
|
|
@@ -34,7 +34,7 @@ export function profileMtpLabel(profile, drafters, { detailed = false } = {}) {
|
|
|
34
34
|
return null;
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
function ggufMtpLabel(model, drafter) {
|
|
38
38
|
const caps = detectCapabilities(model.path, model.mmprojPath);
|
|
39
39
|
if (caps.mtp || Boolean(drafter)) return pc.green("MTP ✓");
|
|
40
40
|
if (caps.architecture === "gemma4") return pc.yellow("MTP: needs drafter");
|
package/src/omlx-runtime.mjs
CHANGED
|
@@ -14,7 +14,7 @@ import { join } from "node:path";
|
|
|
14
14
|
import { promisify } from "node:util";
|
|
15
15
|
import { compareVersions } from "./updates.mjs";
|
|
16
16
|
import { hasHomebrew, ensureHomebrewFor } from "./config.mjs";
|
|
17
|
-
import { commandExists } from "./exec.mjs";
|
|
17
|
+
import { commandExists, runCommand } from "./exec.mjs";
|
|
18
18
|
import { pc, renderCard, renderRows } from "./ui.mjs";
|
|
19
19
|
|
|
20
20
|
const execFileAsync = promisify(execFile);
|
|
@@ -138,7 +138,6 @@ export async function offerManagedOmlxUpdate(prompt, { fetchImpl = globalThis.fe
|
|
|
138
138
|
if (!shouldUpdate) return false;
|
|
139
139
|
|
|
140
140
|
try {
|
|
141
|
-
const { runCommand } = await import("./exec.mjs");
|
|
142
141
|
console.log(pc.dim("Updating oMLX via Homebrew..."));
|
|
143
142
|
await runCommand("brew", ["update"], { label: "brew update" });
|
|
144
143
|
await runCommand("brew", ["upgrade", "omlx"], { label: "brew upgrade omlx" });
|
|
@@ -153,6 +152,34 @@ export async function offerManagedOmlxUpdate(prompt, { fetchImpl = globalThis.fe
|
|
|
153
152
|
|
|
154
153
|
// ── Installation ───────────────────────────────────────────────────────────
|
|
155
154
|
|
|
155
|
+
/**
|
|
156
|
+
* Offer to restart oMLX so it picks up new or deleted models.
|
|
157
|
+
* @param {object} prompt - UI prompt interface (yesNo)
|
|
158
|
+
* @param {string} [reason] - why we're restarting (e.g. "to load the new model")
|
|
159
|
+
* @returns {Promise<boolean>} true if oMLX was restarted
|
|
160
|
+
*/
|
|
161
|
+
export async function offerOmlxRestart(prompt, reason = "to update its model list") {
|
|
162
|
+
const bin = await findOmlx();
|
|
163
|
+
if (!bin) {
|
|
164
|
+
console.log(pc.dim("Restart oMLX manually: omlx restart"));
|
|
165
|
+
return false;
|
|
166
|
+
}
|
|
167
|
+
const shouldRestart = await prompt.yesNo(`Restart oMLX ${reason}?`, true);
|
|
168
|
+
if (!shouldRestart) {
|
|
169
|
+
console.log(pc.dim("Restart manually later: omlx restart"));
|
|
170
|
+
return false;
|
|
171
|
+
}
|
|
172
|
+
try {
|
|
173
|
+
await execFileAsync(bin, ["restart"], { timeout: 15000 });
|
|
174
|
+
console.log(pc.green("✓ oMLX restarted"));
|
|
175
|
+
return true;
|
|
176
|
+
} catch (err) {
|
|
177
|
+
console.log(pc.red(`✗ Restart failed: ${err.message}`));
|
|
178
|
+
console.log(pc.dim("Restart manually: omlx restart"));
|
|
179
|
+
return false;
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
|
|
156
183
|
/**
|
|
157
184
|
* Install oMLX. Uses Homebrew if available (automating tap + install).
|
|
158
185
|
* If Homebrew is not available, prompts to download the DMG from GitHub
|
|
@@ -167,7 +194,6 @@ export async function installOmlx(prompt, run) {
|
|
|
167
194
|
|
|
168
195
|
if (!hasBrew) {
|
|
169
196
|
if (!(await ensureHomebrewFor(prompt, run || (async (cmd, args, label) => {
|
|
170
|
-
const { runCommand } = await import("./exec.mjs");
|
|
171
197
|
return runCommand(cmd, args, { label });
|
|
172
198
|
}), "oMLX"))) {
|
|
173
199
|
console.log(pc.dim("Install oMLX manually:"));
|
|
@@ -179,7 +205,6 @@ export async function installOmlx(prompt, run) {
|
|
|
179
205
|
|
|
180
206
|
// Install oMLX via Homebrew
|
|
181
207
|
const runner = run || (async (cmd, args, label) => {
|
|
182
|
-
const { runCommand } = await import("./exec.mjs");
|
|
183
208
|
return runCommand(cmd, args, { label });
|
|
184
209
|
});
|
|
185
210
|
|
package/src/process.mjs
CHANGED
|
@@ -6,6 +6,8 @@ import { basename, join } from "node:path";
|
|
|
6
6
|
import { LOG_DIR } from "./config.mjs";
|
|
7
7
|
import { writeState, readState, profileDir } from "./profiles.mjs";
|
|
8
8
|
import { backendFor, backendBinaryFor } from "./backends.mjs";
|
|
9
|
+
import { computeFlags } from "./autodetect.mjs";
|
|
10
|
+
import { findOmlx } from "./omlx-runtime.mjs";
|
|
9
11
|
import { pc } from "./ui.mjs";
|
|
10
12
|
|
|
11
13
|
const execFileAsync = promisify(execFile);
|
|
@@ -23,7 +25,6 @@ export async function computeServerCommand(profile) {
|
|
|
23
25
|
if (!binary) throw new Error("Server binary not found. Run offgrid-ai interactively to install.");
|
|
24
26
|
|
|
25
27
|
// llama-cpp
|
|
26
|
-
const { computeFlags } = await import("./autodetect.mjs");
|
|
27
28
|
const result = computeFlags(
|
|
28
29
|
profile.capabilities ?? {},
|
|
29
30
|
profile.modelPath,
|
|
@@ -129,14 +130,11 @@ async function startManagedServer(profile, backend) {
|
|
|
129
130
|
// Try to start the managed server via CLI
|
|
130
131
|
if (backend.id === "omlx") {
|
|
131
132
|
try {
|
|
132
|
-
const { execFile } = await import("node:child_process");
|
|
133
|
-
const { promisify } = await import("node:util");
|
|
134
|
-
const { findOmlx } = await import("./omlx-runtime.mjs");
|
|
135
133
|
const omlxBin = await findOmlx();
|
|
136
134
|
if (!omlxBin) {
|
|
137
135
|
throw new Error(`${backend.label} is not installed. Run offgrid-ai to install it, or install manually: brew tap jundot/omlx && brew install omlx`);
|
|
138
136
|
}
|
|
139
|
-
await
|
|
137
|
+
await execFileAsync(omlxBin, ["start"], { timeout: 10000 });
|
|
140
138
|
} catch (err) {
|
|
141
139
|
if (err.message.includes("not installed")) throw err;
|
|
142
140
|
throw new Error(`${backend.label} could not be auto-started: ${err.message}. Run \`omlx start\` manually.`, { cause: err });
|
package/src/profile-setup.mjs
CHANGED
|
@@ -14,12 +14,28 @@ import { detectOmlxMtpCapability, findOmlxModelDir } from "./mlx-discovery.mjs";
|
|
|
14
14
|
const execFileAsync = promisify(execFile);
|
|
15
15
|
|
|
16
16
|
const CACHE_CHOICES = [
|
|
17
|
-
{ value: "bf16", label: "bf16", hint: "
|
|
18
|
-
{ value: "f16", label: "f16", hint: "stable fallback
|
|
19
|
-
{ value: "q8_0", label: "q8_0", hint: "
|
|
20
|
-
{ value: "q4_0", label: "q4_0", hint: "
|
|
17
|
+
{ value: "bf16", label: "bf16", hint: "16-bit · best quality · 2 bytes/elem" },
|
|
18
|
+
{ value: "f16", label: "f16", hint: "16-bit · stable fallback · 2 bytes/elem" },
|
|
19
|
+
{ value: "q8_0", label: "q8_0", hint: "8-bit · half memory · usually safe · 1 byte/elem" },
|
|
20
|
+
{ value: "q4_0", label: "q4_0", hint: "4-bit · quarter memory · quality tradeoff · 0.5 bytes/elem" },
|
|
21
21
|
];
|
|
22
22
|
|
|
23
|
+
function quickKvBytes(modelPath, flags) {
|
|
24
|
+
try {
|
|
25
|
+
return estimateMemory(modelPath, null, null, flags).kvBytes;
|
|
26
|
+
} catch {
|
|
27
|
+
return 0;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
function cacheMemoryRows(modelPath, flags, ctxSize) {
|
|
32
|
+
const combos = [["bf16", "bf16"], ["f16", "f16"], ["q8_0", "q8_0"], ["q4_0", "q4_0"]];
|
|
33
|
+
return combos.map(([k, v]) => {
|
|
34
|
+
const bytes = quickKvBytes(modelPath, { ...flags, ctxSize, cacheTypeK: k, cacheTypeV: v });
|
|
35
|
+
return [`${k}/${v} KV cache`, bytes ? `~${formatBytes(bytes)}` : "unknown"];
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
|
|
23
39
|
const GENERAL_DEFAULTS = {
|
|
24
40
|
topK: 20,
|
|
25
41
|
presencePenalty: 1.5,
|
|
@@ -40,7 +56,6 @@ export async function configureLocalProfile(prompt, profile) {
|
|
|
40
56
|
const freshCaps = detectCapabilities(profile.modelPath, profile.mmprojPath);
|
|
41
57
|
let drafterPath = profile.drafterPath ?? null;
|
|
42
58
|
if (drafterPath && !existsSync(drafterPath)) {
|
|
43
|
-
// Stored drafter is no longer on disk — drop it and re-scan for a fresh one.
|
|
44
59
|
drafterPath = null;
|
|
45
60
|
}
|
|
46
61
|
if (!drafterPath) {
|
|
@@ -50,86 +65,229 @@ export async function configureLocalProfile(prompt, profile) {
|
|
|
50
65
|
}
|
|
51
66
|
const hasMtp = freshCaps.mtp || Boolean(drafterPath);
|
|
52
67
|
const caps = { ...freshCaps, mtp: hasMtp };
|
|
53
|
-
// MTP is a capability, not a separate backend. Just update the profile's
|
|
54
|
-
// capabilities and drafter path — flag computation handles the rest.
|
|
55
68
|
configured = { ...configured, drafterPath: drafterPath ?? null, capabilities: { ...configured.capabilities, mtp: hasMtp } };
|
|
56
|
-
// If vision was previously disabled but mmproj is back, re-enable
|
|
57
69
|
if (configured.disabledMmprojPath && configured.mmprojPath === null && freshCaps.vision) {
|
|
58
70
|
configured = { ...configured, mmprojPath: configured.disabledMmprojPath, disabledMmprojPath: undefined, capabilities: { ...configured.capabilities, vision: true, visionDisabledReason: undefined } };
|
|
59
71
|
}
|
|
60
72
|
|
|
73
|
+
// ── Model overview ──────────────────────────────────────────────────────
|
|
61
74
|
console.log("");
|
|
62
|
-
console.log(renderSection("Model
|
|
75
|
+
console.log(renderSection("Model overview", renderRows([
|
|
63
76
|
["Model", pc.bold(profile.label)],
|
|
64
77
|
["Detected", capabilitySummary(caps)],
|
|
65
|
-
["
|
|
66
|
-
["
|
|
67
|
-
["Sampling", samplingSummary(profile.flags)],
|
|
78
|
+
["Backend", "llama.cpp (local server)"],
|
|
79
|
+
["Model file", profile.modelPath],
|
|
68
80
|
])));
|
|
69
|
-
console.log(pc.dim("Larger context windows use more memory. KV cache precision controls memory used by attention history."));
|
|
70
|
-
console.log(pc.dim("Sampling defaults are shown for transparency; you can edit the profile later if needed.\n"));
|
|
71
81
|
|
|
82
|
+
// ── MTP ────────────────────────────────────────────────────────────────
|
|
72
83
|
if (caps.mtp) {
|
|
73
|
-
const drafterInfo = configured.drafterPath ? `\n Drafter: ${configured.drafterPath}` : "";
|
|
74
|
-
console.log(renderSection("MTP detected", renderRows([
|
|
75
|
-
["Feature", "Multi-Token Prediction (speculative decoding)"],
|
|
76
|
-
["Flags", `--spec-type draft-mtp --spec-draft-n-max 4${configured.drafterPath ? " --spec-draft-model <drafter>" : ""}`],
|
|
77
|
-
])));
|
|
78
|
-
if (drafterInfo) console.log(pc.dim(drafterInfo));
|
|
79
|
-
const useMtp = await prompt.yesNo("Use MTP speculative decoding?", true);
|
|
80
|
-
configured = useMtp ? applyMtpDefaults(configured) : removeMtpDefaults(configured);
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
if (caps.qat) {
|
|
84
84
|
console.log("");
|
|
85
|
-
console.log(renderSection("
|
|
86
|
-
["
|
|
87
|
-
["
|
|
85
|
+
console.log(renderSection("MTP (Multi-Token Prediction)", renderRows([
|
|
86
|
+
["What it does", "Speculative decoding — predicts multiple tokens at once, verifies them"],
|
|
87
|
+
["Speed", "1.5–3x faster generation"],
|
|
88
|
+
["Quality", "No loss — rejected predictions fall back to normal"],
|
|
89
|
+
["Memory", "Slightly more for draft model weights"],
|
|
90
|
+
["Flags", "--spec-type draft-mtp --spec-draft-n-max 4"],
|
|
91
|
+
...(configured.drafterPath ? [["Drafter", configured.drafterPath]] : []),
|
|
88
92
|
])));
|
|
93
|
+
const useMtp = await prompt.yesNo("Enable MTP speculative decoding?", true);
|
|
94
|
+
configured = useMtp ? applyMtpDefaults(configured) : removeMtpDefaults(configured);
|
|
89
95
|
}
|
|
90
96
|
|
|
97
|
+
// ── Vision ─────────────────────────────────────────────────────────────
|
|
91
98
|
if (caps.vision && profile.mmprojPath) {
|
|
92
99
|
console.log("");
|
|
93
100
|
const gemma4Unified = isGemma4UnifiedProjector(caps.mmprojProjectorType);
|
|
94
101
|
const supported = !gemma4Unified || await runtimeSupportsGemma4Unified();
|
|
95
|
-
console.log(renderSection("Vision projector
|
|
96
|
-
["
|
|
102
|
+
console.log(renderSection("Vision projector", renderRows([
|
|
103
|
+
["What it does", "Enables image understanding — model can see and reason about images"],
|
|
104
|
+
["Projector type", caps.mmprojProjectorType ?? "unknown"],
|
|
97
105
|
["Flag", `--mmproj ${profile.mmprojPath}`],
|
|
106
|
+
["Memory", "~200 MB for projector weights (varies by model)"],
|
|
98
107
|
...(gemma4Unified && !supported ? [["Note", pc.yellow("Gemma 4 unified projectors need llama.cpp b9549+.")]] : []),
|
|
99
108
|
])));
|
|
100
|
-
const useVision = await prompt.yesNo("Enable vision
|
|
109
|
+
const useVision = await prompt.yesNo("Enable vision?", supported);
|
|
101
110
|
configured = useVision ? applyVisionDefaults(configured) : removeVisionDefaults(configured, gemma4Unified && !supported ? "gemma4-unified-unsupported" : "user-disabled");
|
|
102
111
|
}
|
|
103
112
|
|
|
113
|
+
// ── Thinking ───────────────────────────────────────────────────────────
|
|
104
114
|
if (caps.thinking) {
|
|
105
115
|
console.log("");
|
|
106
|
-
console.log(renderSection("Thinking
|
|
107
|
-
["
|
|
108
|
-
["
|
|
116
|
+
console.log(renderSection("Thinking mode", renderRows([
|
|
117
|
+
["What it does", "Model reasons step-by-step before answering"],
|
|
118
|
+
["Benefit", "Better results for math, code, logic — but slower (more output tokens)"],
|
|
119
|
+
["Sampling changes", "top-k → 64, presence penalty → 0, repeat penalty → 1.1 (adjustable below)"],
|
|
109
120
|
["Template", "--chat-template-kwargs { enable_thinking: true }"],
|
|
110
121
|
])));
|
|
111
|
-
const useThinking = await prompt.yesNo("Use
|
|
122
|
+
const useThinking = await prompt.yesNo("Use thinking/loop-safe defaults?", true);
|
|
112
123
|
configured = useThinking ? applyThinkingDefaults(configured) : removeThinkingDefaults(configured);
|
|
113
124
|
}
|
|
114
125
|
|
|
115
|
-
|
|
126
|
+
// ── Context window ─────────────────────────────────────────────────────
|
|
127
|
+
const maxCtx = caps.metaCtx ?? 1048576;
|
|
128
|
+
console.log("");
|
|
129
|
+
console.log(renderSection("Context window", renderRows([
|
|
130
|
+
["What it does", "Maximum tokens the model can process at once (prompt + response + history)"],
|
|
131
|
+
["Range", `1,024 – ${maxCtx.toLocaleString()} tokens`],
|
|
132
|
+
["Memory", "KV cache grows linearly — larger context = more RAM"],
|
|
133
|
+
["Guidance", "8k–32k: chat · 32k–80k: coding/long convos · 128k+: long documents"],
|
|
134
|
+
["Model max", `${maxCtx.toLocaleString()} tokens`],
|
|
135
|
+
["Default", `${configured.flags.ctxSize.toLocaleString()} tokens`],
|
|
136
|
+
])));
|
|
137
|
+
const ctxSize = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, maxCtx);
|
|
138
|
+
configured = applyRuntimeFlagOverrides(configured, { ctxSize });
|
|
139
|
+
|
|
140
|
+
// ── K cache precision ──────────────────────────────────────────────────
|
|
141
|
+
console.log("");
|
|
142
|
+
console.log(renderSection("K cache precision", renderRows([
|
|
143
|
+
["What it is", "KV cache stores attention 'keys' — previous token states used for prediction"],
|
|
144
|
+
["Tradeoff", "Lower precision = less memory, potential quality loss"],
|
|
145
|
+
...cacheMemoryRows(profile.modelPath, configured.flags, ctxSize),
|
|
146
|
+
])));
|
|
116
147
|
const cacheTypeK = await prompt.choice("K cache precision", CACHE_CHOICES, configured.flags.cacheTypeK);
|
|
148
|
+
configured = applyRuntimeFlagOverrides(configured, { cacheTypeK });
|
|
149
|
+
|
|
150
|
+
// ── V cache precision ──────────────────────────────────────────────────
|
|
151
|
+
console.log("");
|
|
152
|
+
console.log(renderSection("V cache precision", renderRows([
|
|
153
|
+
["What it is", "KV cache stores attention 'values' — token representations from previous layers"],
|
|
154
|
+
["Tradeoff", "Same as K cache. Some models are more sensitive to V precision than K"],
|
|
155
|
+
...cacheMemoryRows(profile.modelPath, configured.flags, ctxSize),
|
|
156
|
+
])));
|
|
117
157
|
const cacheTypeV = await prompt.choice("V cache precision", CACHE_CHOICES, configured.flags.cacheTypeV);
|
|
118
|
-
configured = applyRuntimeFlagOverrides(configured, {
|
|
158
|
+
configured = applyRuntimeFlagOverrides(configured, { cacheTypeV });
|
|
159
|
+
|
|
160
|
+
// ── Memory estimate (with chosen context + cache) ──────────────────────
|
|
161
|
+
console.log("");
|
|
162
|
+
console.log(renderMemoryEstimate(configured));
|
|
163
|
+
|
|
164
|
+
// ── Temperature ────────────────────────────────────────────────────────
|
|
165
|
+
console.log("");
|
|
166
|
+
console.log(renderSection("Temperature", renderRows([
|
|
167
|
+
["What it does", "Controls randomness in token selection"],
|
|
168
|
+
["Range", "0.0 – 2.0"],
|
|
169
|
+
["0.0", "Deterministic — always picks most likely token"],
|
|
170
|
+
["0.6", "Balanced — default for most models"],
|
|
171
|
+
["1.0+", "Creative — more random, may hallucinate"],
|
|
172
|
+
["Guidance", "0–0.3: coding/factual · 0.4–0.8: chat · 0.9+: creative writing"],
|
|
173
|
+
])));
|
|
174
|
+
const temperature = await prompt.number("Temperature", configured.flags.temperature, 0, 2, { float: true });
|
|
175
|
+
configured = applyRuntimeFlagOverrides(configured, { temperature });
|
|
176
|
+
|
|
177
|
+
// ── Top-p ──────────────────────────────────────────────────────────────
|
|
178
|
+
console.log("");
|
|
179
|
+
console.log(renderSection("Top-p (nucleus sampling)", renderRows([
|
|
180
|
+
["What it does", "Only considers tokens in the top p fraction of probability mass"],
|
|
181
|
+
["Range", "0.0 – 1.0 (1.0 = consider all tokens)"],
|
|
182
|
+
["Guidance", "0.9–0.95: good default · Lower = more focused · Higher = more diverse"],
|
|
183
|
+
])));
|
|
184
|
+
const topP = await prompt.number("Top-p", configured.flags.topP, 0, 1, { float: true });
|
|
185
|
+
configured = applyRuntimeFlagOverrides(configured, { topP });
|
|
186
|
+
|
|
187
|
+
// ── Top-k ──────────────────────────────────────────────────────────────
|
|
188
|
+
console.log("");
|
|
189
|
+
console.log(renderSection("Top-k", renderRows([
|
|
190
|
+
["What it does", "Limits token selection to top K most likely tokens at each step"],
|
|
191
|
+
["Range", "0 – 1000 (0 = disabled, uses top-p instead)"],
|
|
192
|
+
["Guidance", "20: general chat · 40–64: thinking/reasoning · 0: rely on top-p"],
|
|
193
|
+
])));
|
|
194
|
+
const topK = await prompt.number("Top-k", configured.flags.topK, 0, 1000);
|
|
195
|
+
configured = applyRuntimeFlagOverrides(configured, { topK });
|
|
196
|
+
|
|
197
|
+
// ── Min-p ──────────────────────────────────────────────────────────────
|
|
198
|
+
console.log("");
|
|
199
|
+
console.log(renderSection("Min-p", renderRows([
|
|
200
|
+
["What it does", "Excludes tokens with probability below this threshold"],
|
|
201
|
+
["Range", "0.0 – 1.0 (0 = disabled)"],
|
|
202
|
+
["Guidance", "0: off (default) · 0.05–0.1: reduces hallucination while keeping creativity"],
|
|
203
|
+
])));
|
|
204
|
+
const minP = await prompt.number("Min-p", configured.flags.minP, 0, 1, { float: true });
|
|
205
|
+
configured = applyRuntimeFlagOverrides(configured, { minP });
|
|
206
|
+
|
|
207
|
+
// ── Presence penalty ───────────────────────────────────────────────────
|
|
208
|
+
console.log("");
|
|
209
|
+
console.log(renderSection("Presence penalty", renderRows([
|
|
210
|
+
["What it does", "Discourages repeated tokens — each used token gets a fixed logit penalty"],
|
|
211
|
+
["Range", "0.0 – 2.0 (0 = off)"],
|
|
212
|
+
["Guidance", "0: thinking/reasoning · 1.0–1.5: general chat · Too high = incoherent"],
|
|
213
|
+
])));
|
|
214
|
+
const presencePenalty = await prompt.number("Presence penalty", configured.flags.presencePenalty, 0, 2, { float: true });
|
|
215
|
+
configured = applyRuntimeFlagOverrides(configured, { presencePenalty });
|
|
216
|
+
|
|
217
|
+
// ── Repeat penalty ─────────────────────────────────────────────────────
|
|
218
|
+
console.log("");
|
|
219
|
+
console.log(renderSection("Repeat penalty", renderRows([
|
|
220
|
+
["What it does", "Multiplies probability of repeated tokens (multiplicative, vs presence's additive)"],
|
|
221
|
+
["Range", "0.0 – 2.0 (1.0 = no effect)"],
|
|
222
|
+
["Guidance", "1.0–1.1: most use cases · Higher = aggressive anti-repeat · Can break code patterns"],
|
|
223
|
+
])));
|
|
224
|
+
const repeatPenalty = await prompt.number("Repeat penalty", configured.flags.repeatPenalty, 0, 2, { float: true });
|
|
225
|
+
configured = applyRuntimeFlagOverrides(configured, { repeatPenalty });
|
|
226
|
+
|
|
227
|
+
// ── Batch size ─────────────────────────────────────────────────────────
|
|
228
|
+
console.log("");
|
|
229
|
+
console.log(renderSection("Batch size", renderRows([
|
|
230
|
+
["What it does", "Tokens processed in parallel during prompt processing (before generation)"],
|
|
231
|
+
["Range", "1 – 4096"],
|
|
232
|
+
["Guidance", "512: good default · 2048+: faster for long prompts · Lower if memory tight"],
|
|
233
|
+
])));
|
|
234
|
+
const batchSize = await prompt.number("Batch size", configured.flags.batchSize, 1, 4096);
|
|
235
|
+
configured = applyRuntimeFlagOverrides(configured, { batchSize });
|
|
236
|
+
|
|
237
|
+
// ── Parallel slots ─────────────────────────────────────────────────────
|
|
238
|
+
console.log("");
|
|
239
|
+
console.log(renderSection("Parallel slots", renderRows([
|
|
240
|
+
["What it does", "Number of concurrent request slots (each handles an independent conversation)"],
|
|
241
|
+
["Range", "1 – 10"],
|
|
242
|
+
["Memory", "KV cache is multiplied by this number"],
|
|
243
|
+
["Guidance", "1: single-user local · 2+: multiple clients connecting simultaneously"],
|
|
244
|
+
])));
|
|
245
|
+
const parallel = await prompt.number("Parallel slots", configured.flags.parallel, 1, 10);
|
|
246
|
+
configured = applyRuntimeFlagOverrides(configured, { parallel });
|
|
247
|
+
|
|
248
|
+
// ── Flash attention ────────────────────────────────────────────────────
|
|
249
|
+
console.log("");
|
|
250
|
+
console.log(renderSection("Flash attention", renderRows([
|
|
251
|
+
["What it does", "Memory-efficient attention algorithm — faster and less RAM than standard"],
|
|
252
|
+
["Guidance", "Always on for modern hardware · Turn off only for old GPU driver compat issues"],
|
|
253
|
+
])));
|
|
254
|
+
const flashAttn = await prompt.yesNo("Enable flash attention?", true);
|
|
255
|
+
configured = applyRuntimeFlagOverrides(configured, { flashAttention: flashAttn ? "on" : "off" });
|
|
256
|
+
|
|
257
|
+
// ── Jinja templates ────────────────────────────────────────────────────
|
|
258
|
+
console.log("");
|
|
259
|
+
console.log(renderSection("Jinja chat templates", renderRows([
|
|
260
|
+
["What it does", "Enables Jinja2 template rendering for proper chat formatting"],
|
|
261
|
+
["Guidance", "Always on for modern models · Turn off only for very old models without chat templates"],
|
|
262
|
+
])));
|
|
263
|
+
const jinja = await prompt.yesNo("Enable Jinja templates?", true);
|
|
264
|
+
configured = applyRuntimeFlagOverrides(configured, { jinja });
|
|
119
265
|
|
|
266
|
+
// ── Final summary ──────────────────────────────────────────────────────
|
|
120
267
|
console.log("");
|
|
121
|
-
console.log(renderSection("
|
|
268
|
+
console.log(renderSection("Configuration summary", renderRows([
|
|
269
|
+
["Model", pc.bold(configured.label)],
|
|
122
270
|
["Backend", configured.backend],
|
|
123
271
|
["Endpoint", configured.baseUrl],
|
|
124
|
-
["
|
|
125
|
-
["
|
|
126
|
-
["
|
|
127
|
-
["
|
|
128
|
-
["
|
|
129
|
-
["
|
|
272
|
+
["Context", `${configured.flags.ctxSize.toLocaleString()} tokens`],
|
|
273
|
+
["KV cache", `${configured.flags.cacheTypeK}/${configured.flags.cacheTypeV}`],
|
|
274
|
+
["Temperature", String(configured.flags.temperature)],
|
|
275
|
+
["Top-p", String(configured.flags.topP)],
|
|
276
|
+
["Top-k", String(configured.flags.topK)],
|
|
277
|
+
["Min-p", String(configured.flags.minP)],
|
|
278
|
+
["Presence penalty", String(configured.flags.presencePenalty)],
|
|
279
|
+
["Repeat penalty", String(configured.flags.repeatPenalty)],
|
|
280
|
+
["Batch size", String(configured.flags.batchSize)],
|
|
281
|
+
["Parallel", String(configured.flags.parallel)],
|
|
282
|
+
["Flash attention", configured.flags.flashAttention],
|
|
283
|
+
["Jinja", configured.flags.jinja ? "on" : "off"],
|
|
284
|
+
...(configured.capabilities?.mtp ? [["MTP", "enabled"]] : []),
|
|
285
|
+
...(configured.capabilities?.vision ? [["Vision", "enabled"]] : []),
|
|
286
|
+
...(configured.capabilities?.thinking ? [["Thinking", "enabled"]] : []),
|
|
130
287
|
])));
|
|
131
288
|
|
|
132
|
-
console.log("
|
|
289
|
+
console.log("");
|
|
290
|
+
console.log(renderMemoryEstimate(configured));
|
|
133
291
|
if (!(await prompt.yesNo("Save profile with these settings?", true))) return null;
|
|
134
292
|
return configured;
|
|
135
293
|
}
|
|
@@ -240,10 +398,12 @@ function applyProfileFlags(profile, flags) {
|
|
|
240
398
|
|
|
241
399
|
function renderMemoryEstimate(profile) {
|
|
242
400
|
try {
|
|
243
|
-
const est = estimateMemory(profile.modelPath, profile.mmprojPath,
|
|
401
|
+
const est = estimateMemory(profile.modelPath, profile.mmprojPath, profile.drafterPath, profile.flags);
|
|
244
402
|
return renderSection("Memory estimate", renderRows([
|
|
245
403
|
["Estimated total", pc.bold(`~${formatBytes(est.totalBytes)}`)],
|
|
246
404
|
["Model", formatBytes(est.modelBytes)],
|
|
405
|
+
...(est.mmprojBytes ? [["Vision projector", formatBytes(est.mmprojBytes)]] : []),
|
|
406
|
+
...(est.draftBytes ? [["Drafter (MTP)", formatBytes(est.draftBytes)]] : []),
|
|
247
407
|
["KV cache", est.kvBytes ? `~${formatBytes(est.kvBytes)} (${profile.flags.ctxSize.toLocaleString()} ctx, ${profile.flags.cacheTypeK}/${profile.flags.cacheTypeV})` : "unknown"],
|
|
248
408
|
...(est.note ? [["Note", pc.yellow(est.note)]] : []),
|
|
249
409
|
]));
|
|
@@ -269,8 +429,5 @@ async function runtimeSupportsGemma4Unified() {
|
|
|
269
429
|
}
|
|
270
430
|
}
|
|
271
431
|
|
|
272
|
-
function samplingSummary(flags) {
|
|
273
|
-
return `temp ${flags.temperature}, top-p ${flags.topP}, top-k ${flags.topK}`;
|
|
274
|
-
}
|
|
275
432
|
|
|
276
433
|
|
package/src/profiles.mjs
CHANGED
|
@@ -3,6 +3,7 @@ import { mkdir, readdir, rm, unlink, writeFile, readFile } from "node:fs/promise
|
|
|
3
3
|
import { join } from "node:path";
|
|
4
4
|
import { PROFILE_DIR, RUN_DIR, LOG_DIR } from "./config.mjs";
|
|
5
5
|
import { backendFor, baseUrlForFlags, defaultFlagsForBackend, BACKENDS } from "./backends.mjs";
|
|
6
|
+
import { detectCapabilities } from "./autodetect.mjs";
|
|
6
7
|
import { computeFlags } from "./autodetect.mjs";
|
|
7
8
|
import { readJson, writeJson } from "./json.mjs";
|
|
8
9
|
|
|
@@ -130,7 +131,6 @@ export function normalizeProfile(profile) {
|
|
|
130
131
|
// ── Auto-create profile from a discovered model ────────────────────────────
|
|
131
132
|
|
|
132
133
|
export async function createProfileFromModel(model, backendId, drafterPath) {
|
|
133
|
-
const { detectCapabilities } = await import("./autodetect.mjs");
|
|
134
134
|
const caps = detectCapabilities(model.path, model.mmprojPath);
|
|
135
135
|
// If a drafter is provided, this model supports MTP regardless of filename
|
|
136
136
|
const hasMtp = caps.mtp || Boolean(drafterPath);
|
package/src/runtime.mjs
CHANGED
|
@@ -3,7 +3,7 @@ import { execFile } from "node:child_process";
|
|
|
3
3
|
import { existsSync } from "node:fs";
|
|
4
4
|
import { chmod, mkdir, mkdtemp, readFile, rm, symlink, unlink, writeFile } from "node:fs/promises";
|
|
5
5
|
import { tmpdir } from "node:os";
|
|
6
|
-
import {
|
|
6
|
+
import { join } from "node:path";
|
|
7
7
|
import { promisify } from "node:util";
|
|
8
8
|
import { MANAGED_LLAMA_SERVER, RUNTIME_DIR } from "./config.mjs";
|
|
9
9
|
import { compareVersions } from "./updates.mjs";
|
|
@@ -137,5 +137,5 @@ function verifyDigest(bytes, digest) {
|
|
|
137
137
|
if (!digest?.startsWith("sha256:")) return;
|
|
138
138
|
const expected = digest.slice("sha256:".length);
|
|
139
139
|
const actual = createHash("sha256").update(bytes).digest("hex");
|
|
140
|
-
if (actual !== expected) throw new Error(
|
|
140
|
+
if (actual !== expected) throw new Error("llama.cpp: checksum mismatch");
|
|
141
141
|
}
|
package/src/ui.mjs
CHANGED
|
@@ -95,12 +95,12 @@ export function renderCard(title, body, options = {}) {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
const topTitle = title ? `╭${pc.reset(titleStr)}` : "╭";
|
|
98
|
-
const topFill = "─".repeat(Math.max(0, width
|
|
98
|
+
const topFill = "─".repeat(Math.max(0, width - visibleLen(titleStr)));
|
|
99
99
|
const top = `${topTitle}${topFill}╮`;
|
|
100
100
|
|
|
101
101
|
const middle = lines.map((line) => `│ ${padVisible(line, contentWidth)} │`);
|
|
102
102
|
|
|
103
|
-
const bottom = `╰${"─".repeat(width
|
|
103
|
+
const bottom = `╰${"─".repeat(width)}╯`;
|
|
104
104
|
|
|
105
105
|
return [top, ...middle, bottom].map((l) => borderColor(l)).join("\n");
|
|
106
106
|
}
|
|
@@ -180,9 +180,8 @@ async function runPrompt(fn, config) {
|
|
|
180
180
|
|
|
181
181
|
// ── Interactive prompt factory ──────────────────────────────────────────────
|
|
182
182
|
|
|
183
|
-
export function startInteractive(
|
|
183
|
+
export function startInteractive() {
|
|
184
184
|
if (process.stdin.isTTY) console.clear();
|
|
185
|
-
console.log(pc.magenta(`◆ ${title}`));
|
|
186
185
|
}
|
|
187
186
|
|
|
188
187
|
export function createPrompt() {
|
|
@@ -195,10 +194,13 @@ export function createPrompt() {
|
|
|
195
194
|
return value?.trim() || String(defaultValue ?? "");
|
|
196
195
|
},
|
|
197
196
|
|
|
198
|
-
async number(label, defaultValue, min, max) {
|
|
197
|
+
async number(label, defaultValue, min, max, { float = false } = {}) {
|
|
199
198
|
const value = await runPrompt(number, {
|
|
200
199
|
message: label,
|
|
201
200
|
default: defaultValue,
|
|
201
|
+
min,
|
|
202
|
+
max,
|
|
203
|
+
step: float ? 'any' : 1,
|
|
202
204
|
validate(input) {
|
|
203
205
|
if (!Number.isFinite(input) || input < min || input > max) {
|
|
204
206
|
return `Enter a number from ${min} to ${max}.`;
|
|
@@ -235,12 +237,12 @@ export function createPrompt() {
|
|
|
235
237
|
|
|
236
238
|
export async function modelSelect(label, groups, { defaultKey, pageSize = 20 } = {}) {
|
|
237
239
|
const choices = [];
|
|
238
|
-
//
|
|
239
|
-
choices.push(new Separator(
|
|
240
|
+
// Blank line after the prompt message for visual separation
|
|
241
|
+
choices.push(new Separator(" "));
|
|
240
242
|
for (let i = 0; i < groups.length; i++) {
|
|
241
243
|
const group = groups[i];
|
|
242
244
|
// Add blank line before each group (except the first)
|
|
243
|
-
if (i > 0) choices.push(new Separator(""));
|
|
245
|
+
if (i > 0) choices.push(new Separator(" "));
|
|
244
246
|
if (group.separator) {
|
|
245
247
|
choices.push(new Separator(group.separator));
|
|
246
248
|
}
|