offgrid-ai 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -6
- package/package.json +3 -11
- package/src/autodetect.mjs +1 -1
- package/src/backends.mjs +0 -12
- package/src/cli.mjs +1 -4
- package/src/commands/main.mjs +27 -3
- package/src/commands/models.mjs +115 -41
- package/src/commands/onboard.mjs +3 -31
- package/src/commands/run.mjs +2 -5
- package/src/config.mjs +62 -1
- package/src/harness-pi.mjs +3 -5
- package/src/managed.mjs +3 -3
- package/src/mlx-discovery.mjs +94 -1
- package/src/model-name.mjs +2 -2
- package/src/model-presenters.mjs +4 -14
- package/src/omlx-runtime.mjs +232 -0
- package/src/process.mjs +55 -5
- package/src/profile-setup.mjs +253 -70
- package/src/profiles.mjs +11 -3
- package/src/ui.mjs +10 -27
- package/src/benchmark/finalize.mjs +0 -169
- package/src/benchmark/flow.mjs +0 -239
- package/src/benchmark/metrics.mjs +0 -113
- package/src/benchmark/prepare.mjs +0 -118
- package/src/benchmark/repo.mjs +0 -77
- package/src/benchmark/sdk-runner.mjs +0 -363
- package/src/benchmark/shared.mjs +0 -46
- package/src/benchmark.mjs +0 -12
- package/src/commands/benchmark.mjs +0 -4
package/src/profile-setup.mjs
CHANGED
|
@@ -3,22 +3,39 @@ import { execFile } from "node:child_process";
|
|
|
3
3
|
import { promisify } from "node:util";
|
|
4
4
|
import { estimateMemory } from "./estimate.mjs";
|
|
5
5
|
import { findLlamaServer } from "./config.mjs";
|
|
6
|
-
import { baseUrlForFlags
|
|
6
|
+
import { baseUrlForFlags } from "./backends.mjs";
|
|
7
7
|
import { pc, formatBytes, renderRows, renderSection } from "./ui.mjs";
|
|
8
8
|
import { detectCapabilities } from "./autodetect.mjs";
|
|
9
9
|
import { matchDrafter } from "./scan.mjs";
|
|
10
10
|
import { scanGgufModels } from "./scan.mjs";
|
|
11
11
|
import { capabilitySummary } from "./model-summary.mjs";
|
|
12
|
+
import { detectOmlxMtpCapability, findOmlxModelDir } from "./mlx-discovery.mjs";
|
|
12
13
|
|
|
13
14
|
const execFileAsync = promisify(execFile);
|
|
14
15
|
|
|
15
16
|
const CACHE_CHOICES = [
|
|
16
|
-
{ value: "bf16", label: "bf16", hint: "
|
|
17
|
-
{ value: "f16", label: "f16", hint: "stable fallback
|
|
18
|
-
{ value: "q8_0", label: "q8_0", hint: "
|
|
19
|
-
{ value: "q4_0", label: "q4_0", hint: "
|
|
17
|
+
{ value: "bf16", label: "bf16", hint: "16-bit · best quality · 2 bytes/elem" },
|
|
18
|
+
{ value: "f16", label: "f16", hint: "16-bit · stable fallback · 2 bytes/elem" },
|
|
19
|
+
{ value: "q8_0", label: "q8_0", hint: "8-bit · half memory · usually safe · 1 byte/elem" },
|
|
20
|
+
{ value: "q4_0", label: "q4_0", hint: "4-bit · quarter memory · quality tradeoff · 0.5 bytes/elem" },
|
|
20
21
|
];
|
|
21
22
|
|
|
23
|
+
function quickKvBytes(modelPath, flags) {
|
|
24
|
+
try {
|
|
25
|
+
return estimateMemory(modelPath, null, null, flags).kvBytes;
|
|
26
|
+
} catch {
|
|
27
|
+
return 0;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
function cacheMemoryRows(modelPath, flags, ctxSize) {
|
|
32
|
+
const combos = [["bf16", "bf16"], ["f16", "f16"], ["q8_0", "q8_0"], ["q4_0", "q4_0"]];
|
|
33
|
+
return combos.map(([k, v]) => {
|
|
34
|
+
const bytes = quickKvBytes(modelPath, { ...flags, ctxSize, cacheTypeK: k, cacheTypeV: v });
|
|
35
|
+
return [`${k}/${v} KV cache`, bytes ? `~${formatBytes(bytes)}` : "unknown"];
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
|
|
22
39
|
const GENERAL_DEFAULTS = {
|
|
23
40
|
topK: 20,
|
|
24
41
|
presencePenalty: 1.5,
|
|
@@ -39,7 +56,6 @@ export async function configureLocalProfile(prompt, profile) {
|
|
|
39
56
|
const freshCaps = detectCapabilities(profile.modelPath, profile.mmprojPath);
|
|
40
57
|
let drafterPath = profile.drafterPath ?? null;
|
|
41
58
|
if (drafterPath && !existsSync(drafterPath)) {
|
|
42
|
-
// Stored drafter is no longer on disk — drop it and re-scan for a fresh one.
|
|
43
59
|
drafterPath = null;
|
|
44
60
|
}
|
|
45
61
|
if (!drafterPath) {
|
|
@@ -49,97 +65,229 @@ export async function configureLocalProfile(prompt, profile) {
|
|
|
49
65
|
}
|
|
50
66
|
const hasMtp = freshCaps.mtp || Boolean(drafterPath);
|
|
51
67
|
const caps = { ...freshCaps, mtp: hasMtp };
|
|
52
|
-
|
|
53
|
-
if (hasMtp && configured.backend !== "llama-cpp-mtp") {
|
|
54
|
-
configured = { ...configured, backend: "llama-cpp-mtp", providerId: "llama-cpp-mtp", drafterPath, capabilities: { ...configured.capabilities, mtp: true } };
|
|
55
|
-
}
|
|
56
|
-
// If the profile was MTP but the drafter is now gone (and the model isn't
|
|
57
|
-
// natively MTP), switch back to plain llama.cpp so the server can start.
|
|
58
|
-
if (!hasMtp && configured.backend === "llama-cpp-mtp") {
|
|
59
|
-
console.log(pc.yellow("MTP drafter no longer found — switching to llama.cpp without speculative decoding."));
|
|
60
|
-
configured = removeMtpDefaults(configured);
|
|
61
|
-
}
|
|
62
|
-
if (drafterPath && !configured.drafterPath) {
|
|
63
|
-
configured = { ...configured, drafterPath };
|
|
64
|
-
}
|
|
65
|
-
// If vision was previously disabled but mmproj is back, re-enable
|
|
68
|
+
configured = { ...configured, drafterPath: drafterPath ?? null, capabilities: { ...configured.capabilities, mtp: hasMtp } };
|
|
66
69
|
if (configured.disabledMmprojPath && configured.mmprojPath === null && freshCaps.vision) {
|
|
67
70
|
configured = { ...configured, mmprojPath: configured.disabledMmprojPath, disabledMmprojPath: undefined, capabilities: { ...configured.capabilities, vision: true, visionDisabledReason: undefined } };
|
|
68
71
|
}
|
|
69
72
|
|
|
73
|
+
// ── Model overview ──────────────────────────────────────────────────────
|
|
70
74
|
console.log("");
|
|
71
|
-
console.log(renderSection("Model
|
|
75
|
+
console.log(renderSection("Model overview", renderRows([
|
|
72
76
|
["Model", pc.bold(profile.label)],
|
|
73
77
|
["Detected", capabilitySummary(caps)],
|
|
74
|
-
["
|
|
75
|
-
["
|
|
76
|
-
["Sampling", samplingSummary(profile.flags)],
|
|
78
|
+
["Backend", "llama.cpp (local server)"],
|
|
79
|
+
["Model file", profile.modelPath],
|
|
77
80
|
])));
|
|
78
|
-
console.log(pc.dim("Larger context windows use more memory. KV cache precision controls memory used by attention history."));
|
|
79
|
-
console.log(pc.dim("Sampling defaults are shown for transparency; you can edit the profile later if needed.\n"));
|
|
80
81
|
|
|
82
|
+
// ── MTP ────────────────────────────────────────────────────────────────
|
|
81
83
|
if (caps.mtp) {
|
|
82
|
-
const drafterInfo = configured.drafterPath ? `\n Drafter: ${configured.drafterPath}` : "";
|
|
83
|
-
console.log(renderSection("MTP detected", renderRows([
|
|
84
|
-
["Backend", "llama.cpp MTP"],
|
|
85
|
-
["Port", String(LLAMA_CPP_MTP_PORT)],
|
|
86
|
-
["Flags", `--spec-type draft-mtp --spec-draft-n-max 4${configured.drafterPath ? " --spec-draft-model <drafter>" : ""}`],
|
|
87
|
-
])));
|
|
88
|
-
if (drafterInfo) console.log(pc.dim(drafterInfo));
|
|
89
|
-
const useMtp = await prompt.yesNo("Use MTP speculative decoding?", true);
|
|
90
|
-
configured = useMtp ? applyMtpDefaults(configured) : removeMtpDefaults(configured);
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
if (caps.qat) {
|
|
94
84
|
console.log("");
|
|
95
|
-
console.log(renderSection("
|
|
96
|
-
["
|
|
97
|
-
["
|
|
85
|
+
console.log(renderSection("MTP (Multi-Token Prediction)", renderRows([
|
|
86
|
+
["What it does", "Speculative decoding — predicts multiple tokens at once, verifies them"],
|
|
87
|
+
["Speed", "1.5–3x faster generation"],
|
|
88
|
+
["Quality", "No loss — rejected predictions fall back to normal"],
|
|
89
|
+
["Memory", "Slightly more for draft model weights"],
|
|
90
|
+
["Flags", "--spec-type draft-mtp --spec-draft-n-max 4"],
|
|
91
|
+
...(configured.drafterPath ? [["Drafter", configured.drafterPath]] : []),
|
|
98
92
|
])));
|
|
93
|
+
const useMtp = await prompt.yesNo("Enable MTP speculative decoding?", true);
|
|
94
|
+
configured = useMtp ? applyMtpDefaults(configured) : removeMtpDefaults(configured);
|
|
99
95
|
}
|
|
100
96
|
|
|
97
|
+
// ── Vision ─────────────────────────────────────────────────────────────
|
|
101
98
|
if (caps.vision && profile.mmprojPath) {
|
|
102
99
|
console.log("");
|
|
103
100
|
const gemma4Unified = isGemma4UnifiedProjector(caps.mmprojProjectorType);
|
|
104
101
|
const supported = !gemma4Unified || await runtimeSupportsGemma4Unified();
|
|
105
|
-
console.log(renderSection("Vision projector
|
|
106
|
-
["
|
|
102
|
+
console.log(renderSection("Vision projector", renderRows([
|
|
103
|
+
["What it does", "Enables image understanding — model can see and reason about images"],
|
|
104
|
+
["Projector type", caps.mmprojProjectorType ?? "unknown"],
|
|
107
105
|
["Flag", `--mmproj ${profile.mmprojPath}`],
|
|
106
|
+
["Memory", "~200 MB for projector weights (varies by model)"],
|
|
108
107
|
...(gemma4Unified && !supported ? [["Note", pc.yellow("Gemma 4 unified projectors need llama.cpp b9549+.")]] : []),
|
|
109
108
|
])));
|
|
110
|
-
const useVision = await prompt.yesNo("Enable vision
|
|
109
|
+
const useVision = await prompt.yesNo("Enable vision?", supported);
|
|
111
110
|
configured = useVision ? applyVisionDefaults(configured) : removeVisionDefaults(configured, gemma4Unified && !supported ? "gemma4-unified-unsupported" : "user-disabled");
|
|
112
111
|
}
|
|
113
112
|
|
|
113
|
+
// ── Thinking ───────────────────────────────────────────────────────────
|
|
114
114
|
if (caps.thinking) {
|
|
115
115
|
console.log("");
|
|
116
|
-
console.log(renderSection("Thinking
|
|
117
|
-
["
|
|
118
|
-
["
|
|
116
|
+
console.log(renderSection("Thinking mode", renderRows([
|
|
117
|
+
["What it does", "Model reasons step-by-step before answering"],
|
|
118
|
+
["Benefit", "Better results for math, code, logic — but slower (more output tokens)"],
|
|
119
|
+
["Sampling changes", "top-k → 64, presence penalty → 0, repeat penalty → 1.1 (adjustable below)"],
|
|
119
120
|
["Template", "--chat-template-kwargs { enable_thinking: true }"],
|
|
120
121
|
])));
|
|
121
|
-
const useThinking = await prompt.yesNo("Use
|
|
122
|
+
const useThinking = await prompt.yesNo("Use thinking/loop-safe defaults?", true);
|
|
122
123
|
configured = useThinking ? applyThinkingDefaults(configured) : removeThinkingDefaults(configured);
|
|
123
124
|
}
|
|
124
125
|
|
|
125
|
-
|
|
126
|
+
// ── Context window ─────────────────────────────────────────────────────
|
|
127
|
+
const maxCtx = caps.metaCtx ?? 1048576;
|
|
128
|
+
console.log("");
|
|
129
|
+
console.log(renderSection("Context window", renderRows([
|
|
130
|
+
["What it does", "Maximum tokens the model can process at once (prompt + response + history)"],
|
|
131
|
+
["Range", `1,024 – ${maxCtx.toLocaleString()} tokens`],
|
|
132
|
+
["Memory", "KV cache grows linearly — larger context = more RAM"],
|
|
133
|
+
["Guidance", "8k–32k: chat · 32k–80k: coding/long convos · 128k+: long documents"],
|
|
134
|
+
["Model max", `${maxCtx.toLocaleString()} tokens`],
|
|
135
|
+
["Default", `${configured.flags.ctxSize.toLocaleString()} tokens`],
|
|
136
|
+
])));
|
|
137
|
+
const ctxSize = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, maxCtx);
|
|
138
|
+
configured = applyRuntimeFlagOverrides(configured, { ctxSize });
|
|
139
|
+
|
|
140
|
+
// ── K cache precision ──────────────────────────────────────────────────
|
|
141
|
+
console.log("");
|
|
142
|
+
console.log(renderSection("K cache precision", renderRows([
|
|
143
|
+
["What it is", "KV cache stores attention 'keys' — previous token states used for prediction"],
|
|
144
|
+
["Tradeoff", "Lower precision = less memory, potential quality loss"],
|
|
145
|
+
...cacheMemoryRows(profile.modelPath, configured.flags, ctxSize),
|
|
146
|
+
])));
|
|
126
147
|
const cacheTypeK = await prompt.choice("K cache precision", CACHE_CHOICES, configured.flags.cacheTypeK);
|
|
148
|
+
configured = applyRuntimeFlagOverrides(configured, { cacheTypeK });
|
|
149
|
+
|
|
150
|
+
// ── V cache precision ──────────────────────────────────────────────────
|
|
151
|
+
console.log("");
|
|
152
|
+
console.log(renderSection("V cache precision", renderRows([
|
|
153
|
+
["What it is", "KV cache stores attention 'values' — token representations from previous layers"],
|
|
154
|
+
["Tradeoff", "Same as K cache. Some models are more sensitive to V precision than K"],
|
|
155
|
+
...cacheMemoryRows(profile.modelPath, configured.flags, ctxSize),
|
|
156
|
+
])));
|
|
127
157
|
const cacheTypeV = await prompt.choice("V cache precision", CACHE_CHOICES, configured.flags.cacheTypeV);
|
|
128
|
-
configured = applyRuntimeFlagOverrides(configured, {
|
|
158
|
+
configured = applyRuntimeFlagOverrides(configured, { cacheTypeV });
|
|
159
|
+
|
|
160
|
+
// ── Memory estimate (with chosen context + cache) ──────────────────────
|
|
161
|
+
console.log("");
|
|
162
|
+
console.log(renderMemoryEstimate(configured));
|
|
163
|
+
|
|
164
|
+
// ── Temperature ────────────────────────────────────────────────────────
|
|
165
|
+
console.log("");
|
|
166
|
+
console.log(renderSection("Temperature", renderRows([
|
|
167
|
+
["What it does", "Controls randomness in token selection"],
|
|
168
|
+
["Range", "0.0 – 2.0"],
|
|
169
|
+
["0.0", "Deterministic — always picks most likely token"],
|
|
170
|
+
["0.6", "Balanced — default for most models"],
|
|
171
|
+
["1.0+", "Creative — more random, may hallucinate"],
|
|
172
|
+
["Guidance", "0–0.3: coding/factual · 0.4–0.8: chat · 0.9+: creative writing"],
|
|
173
|
+
])));
|
|
174
|
+
const temperature = await prompt.number("Temperature", configured.flags.temperature, 0, 2, { float: true });
|
|
175
|
+
configured = applyRuntimeFlagOverrides(configured, { temperature });
|
|
176
|
+
|
|
177
|
+
// ── Top-p ──────────────────────────────────────────────────────────────
|
|
178
|
+
console.log("");
|
|
179
|
+
console.log(renderSection("Top-p (nucleus sampling)", renderRows([
|
|
180
|
+
["What it does", "Only considers tokens in the top p fraction of probability mass"],
|
|
181
|
+
["Range", "0.0 – 1.0 (1.0 = consider all tokens)"],
|
|
182
|
+
["Guidance", "0.9–0.95: good default · Lower = more focused · Higher = more diverse"],
|
|
183
|
+
])));
|
|
184
|
+
const topP = await prompt.number("Top-p", configured.flags.topP, 0, 1, { float: true });
|
|
185
|
+
configured = applyRuntimeFlagOverrides(configured, { topP });
|
|
186
|
+
|
|
187
|
+
// ── Top-k ──────────────────────────────────────────────────────────────
|
|
188
|
+
console.log("");
|
|
189
|
+
console.log(renderSection("Top-k", renderRows([
|
|
190
|
+
["What it does", "Limits token selection to top K most likely tokens at each step"],
|
|
191
|
+
["Range", "0 – 1000 (0 = disabled, uses top-p instead)"],
|
|
192
|
+
["Guidance", "20: general chat · 40–64: thinking/reasoning · 0: rely on top-p"],
|
|
193
|
+
])));
|
|
194
|
+
const topK = await prompt.number("Top-k", configured.flags.topK, 0, 1000);
|
|
195
|
+
configured = applyRuntimeFlagOverrides(configured, { topK });
|
|
196
|
+
|
|
197
|
+
// ── Min-p ──────────────────────────────────────────────────────────────
|
|
198
|
+
console.log("");
|
|
199
|
+
console.log(renderSection("Min-p", renderRows([
|
|
200
|
+
["What it does", "Excludes tokens with probability below this threshold"],
|
|
201
|
+
["Range", "0.0 – 1.0 (0 = disabled)"],
|
|
202
|
+
["Guidance", "0: off (default) · 0.05–0.1: reduces hallucination while keeping creativity"],
|
|
203
|
+
])));
|
|
204
|
+
const minP = await prompt.number("Min-p", configured.flags.minP, 0, 1, { float: true });
|
|
205
|
+
configured = applyRuntimeFlagOverrides(configured, { minP });
|
|
206
|
+
|
|
207
|
+
// ── Presence penalty ───────────────────────────────────────────────────
|
|
208
|
+
console.log("");
|
|
209
|
+
console.log(renderSection("Presence penalty", renderRows([
|
|
210
|
+
["What it does", "Discourages repeated tokens — each used token gets a fixed logit penalty"],
|
|
211
|
+
["Range", "0.0 – 2.0 (0 = off)"],
|
|
212
|
+
["Guidance", "0: thinking/reasoning · 1.0–1.5: general chat · Too high = incoherent"],
|
|
213
|
+
])));
|
|
214
|
+
const presencePenalty = await prompt.number("Presence penalty", configured.flags.presencePenalty, 0, 2, { float: true });
|
|
215
|
+
configured = applyRuntimeFlagOverrides(configured, { presencePenalty });
|
|
216
|
+
|
|
217
|
+
// ── Repeat penalty ─────────────────────────────────────────────────────
|
|
218
|
+
console.log("");
|
|
219
|
+
console.log(renderSection("Repeat penalty", renderRows([
|
|
220
|
+
["What it does", "Multiplies probability of repeated tokens (multiplicative, vs presence's additive)"],
|
|
221
|
+
["Range", "0.0 – 2.0 (1.0 = no effect)"],
|
|
222
|
+
["Guidance", "1.0–1.1: most use cases · Higher = aggressive anti-repeat · Can break code patterns"],
|
|
223
|
+
])));
|
|
224
|
+
const repeatPenalty = await prompt.number("Repeat penalty", configured.flags.repeatPenalty, 0, 2, { float: true });
|
|
225
|
+
configured = applyRuntimeFlagOverrides(configured, { repeatPenalty });
|
|
226
|
+
|
|
227
|
+
// ── Batch size ─────────────────────────────────────────────────────────
|
|
228
|
+
console.log("");
|
|
229
|
+
console.log(renderSection("Batch size", renderRows([
|
|
230
|
+
["What it does", "Tokens processed in parallel during prompt processing (before generation)"],
|
|
231
|
+
["Range", "1 – 4096"],
|
|
232
|
+
["Guidance", "512: good default · 2048+: faster for long prompts · Lower if memory tight"],
|
|
233
|
+
])));
|
|
234
|
+
const batchSize = await prompt.number("Batch size", configured.flags.batchSize, 1, 4096);
|
|
235
|
+
configured = applyRuntimeFlagOverrides(configured, { batchSize });
|
|
236
|
+
|
|
237
|
+
// ── Parallel slots ─────────────────────────────────────────────────────
|
|
238
|
+
console.log("");
|
|
239
|
+
console.log(renderSection("Parallel slots", renderRows([
|
|
240
|
+
["What it does", "Number of concurrent request slots (each handles an independent conversation)"],
|
|
241
|
+
["Range", "1 – 10"],
|
|
242
|
+
["Memory", "KV cache is multiplied by this number"],
|
|
243
|
+
["Guidance", "1: single-user local · 2+: multiple clients connecting simultaneously"],
|
|
244
|
+
])));
|
|
245
|
+
const parallel = await prompt.number("Parallel slots", configured.flags.parallel, 1, 10);
|
|
246
|
+
configured = applyRuntimeFlagOverrides(configured, { parallel });
|
|
247
|
+
|
|
248
|
+
// ── Flash attention ────────────────────────────────────────────────────
|
|
249
|
+
console.log("");
|
|
250
|
+
console.log(renderSection("Flash attention", renderRows([
|
|
251
|
+
["What it does", "Memory-efficient attention algorithm — faster and less RAM than standard"],
|
|
252
|
+
["Guidance", "Always on for modern hardware · Turn off only for old GPU driver compat issues"],
|
|
253
|
+
])));
|
|
254
|
+
const flashAttn = await prompt.yesNo("Enable flash attention?", true);
|
|
255
|
+
configured = applyRuntimeFlagOverrides(configured, { flashAttention: flashAttn ? "on" : "off" });
|
|
256
|
+
|
|
257
|
+
// ── Jinja templates ────────────────────────────────────────────────────
|
|
258
|
+
console.log("");
|
|
259
|
+
console.log(renderSection("Jinja chat templates", renderRows([
|
|
260
|
+
["What it does", "Enables Jinja2 template rendering for proper chat formatting"],
|
|
261
|
+
["Guidance", "Always on for modern models · Turn off only for very old models without chat templates"],
|
|
262
|
+
])));
|
|
263
|
+
const jinja = await prompt.yesNo("Enable Jinja templates?", true);
|
|
264
|
+
configured = applyRuntimeFlagOverrides(configured, { jinja });
|
|
129
265
|
|
|
266
|
+
// ── Final summary ──────────────────────────────────────────────────────
|
|
130
267
|
console.log("");
|
|
131
|
-
console.log(renderSection("
|
|
268
|
+
console.log(renderSection("Configuration summary", renderRows([
|
|
269
|
+
["Model", pc.bold(configured.label)],
|
|
132
270
|
["Backend", configured.backend],
|
|
133
271
|
["Endpoint", configured.baseUrl],
|
|
134
|
-
["
|
|
135
|
-
["
|
|
136
|
-
["
|
|
137
|
-
["
|
|
138
|
-
["
|
|
139
|
-
["
|
|
272
|
+
["Context", `${configured.flags.ctxSize.toLocaleString()} tokens`],
|
|
273
|
+
["KV cache", `${configured.flags.cacheTypeK}/${configured.flags.cacheTypeV}`],
|
|
274
|
+
["Temperature", String(configured.flags.temperature)],
|
|
275
|
+
["Top-p", String(configured.flags.topP)],
|
|
276
|
+
["Top-k", String(configured.flags.topK)],
|
|
277
|
+
["Min-p", String(configured.flags.minP)],
|
|
278
|
+
["Presence penalty", String(configured.flags.presencePenalty)],
|
|
279
|
+
["Repeat penalty", String(configured.flags.repeatPenalty)],
|
|
280
|
+
["Batch size", String(configured.flags.batchSize)],
|
|
281
|
+
["Parallel", String(configured.flags.parallel)],
|
|
282
|
+
["Flash attention", configured.flags.flashAttention],
|
|
283
|
+
["Jinja", configured.flags.jinja ? "on" : "off"],
|
|
284
|
+
...(configured.capabilities?.mtp ? [["MTP", "enabled"]] : []),
|
|
285
|
+
...(configured.capabilities?.vision ? [["Vision", "enabled"]] : []),
|
|
286
|
+
...(configured.capabilities?.thinking ? [["Thinking", "enabled"]] : []),
|
|
140
287
|
])));
|
|
141
288
|
|
|
142
|
-
console.log("
|
|
289
|
+
console.log("");
|
|
290
|
+
console.log(renderMemoryEstimate(configured));
|
|
143
291
|
if (!(await prompt.yesNo("Save profile with these settings?", true))) return null;
|
|
144
292
|
return configured;
|
|
145
293
|
}
|
|
@@ -150,24 +298,60 @@ export function applyRuntimeFlagOverrides(profile, overrides) {
|
|
|
150
298
|
}
|
|
151
299
|
|
|
152
300
|
export function applyMtpDefaults(profile) {
|
|
153
|
-
const flags = { ...profile.flags, port: LLAMA_CPP_MTP_PORT };
|
|
154
301
|
return applyProfileFlags({
|
|
155
302
|
...profile,
|
|
156
|
-
backend: "llama-cpp-mtp",
|
|
157
|
-
providerId: "llama-cpp-mtp",
|
|
158
303
|
capabilities: { ...(profile.capabilities ?? {}), mtp: true },
|
|
159
|
-
}, flags);
|
|
304
|
+
}, profile.flags);
|
|
160
305
|
}
|
|
161
306
|
|
|
162
307
|
export function removeMtpDefaults(profile) {
|
|
163
|
-
const flags = { ...profile.flags, port: LLAMA_CPP_PORT };
|
|
164
308
|
return applyProfileFlags({
|
|
165
309
|
...profile,
|
|
166
|
-
backend: "llama-cpp",
|
|
167
|
-
providerId: "llama-cpp",
|
|
168
310
|
drafterPath: null,
|
|
169
311
|
capabilities: { ...(profile.capabilities ?? {}), mtp: false },
|
|
170
|
-
}, flags);
|
|
312
|
+
}, profile.flags);
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
// ── oMLX (managed server) profile configuration ───────────────────────────
|
|
316
|
+
|
|
317
|
+
export async function configureManagedProfile(prompt, profile) {
|
|
318
|
+
let configured = profile;
|
|
319
|
+
const modelId = profile.omlxModel ?? profile.modelAlias ?? profile.id;
|
|
320
|
+
|
|
321
|
+
// Detect MTP capability from the model's config.json
|
|
322
|
+
const modelDir = await findOmlxModelDir(modelId);
|
|
323
|
+
if (modelDir) {
|
|
324
|
+
const mtpResult = await detectOmlxMtpCapability(modelDir);
|
|
325
|
+
if (mtpResult.compatible) {
|
|
326
|
+
console.log("");
|
|
327
|
+
console.log(renderSection("MTP detected", renderRows([
|
|
328
|
+
["Feature", "Multi-Token Prediction (speculative decoding)"],
|
|
329
|
+
["Mechanism", "oMLX native MTP (enabled via admin API at load time)"],
|
|
330
|
+
])));
|
|
331
|
+
const useMtp = await prompt.yesNo("Use MTP speculative decoding?", true);
|
|
332
|
+
configured = { ...configured, capabilities: { ...(configured.capabilities ?? {}), mtp: useMtp } };
|
|
333
|
+
} else if (mtpResult.reason !== "model has no MTP heads in config") {
|
|
334
|
+
// Model declares MTP but can't use it — surface the reason
|
|
335
|
+
console.log("");
|
|
336
|
+
console.log(renderSection("MTP not available", renderRows([
|
|
337
|
+
["Feature", "Multi-Token Prediction (speculative decoding)"],
|
|
338
|
+
["Reason", pc.yellow(mtpResult.reason)],
|
|
339
|
+
])));
|
|
340
|
+
}
|
|
341
|
+
// If reason is "no MTP heads in config", don't surface anything —
|
|
342
|
+
// most models don't have MTP, and showing a card for every non-MTP
|
|
343
|
+
// model would be noise.
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
console.log("");
|
|
347
|
+
console.log(renderSection("Model setup", renderRows([
|
|
348
|
+
["Model", pc.bold(profile.label)],
|
|
349
|
+
["Backend", "oMLX"],
|
|
350
|
+
...(configured.capabilities?.mtp ? [["MTP", "enabled"]] : []),
|
|
351
|
+
])));
|
|
352
|
+
|
|
353
|
+
if (!(await prompt.yesNo("Save profile with these settings?", true))) return null;
|
|
354
|
+
return configured;
|
|
171
355
|
}
|
|
172
356
|
|
|
173
357
|
function applyVisionDefaults(profile) {
|
|
@@ -214,10 +398,12 @@ function applyProfileFlags(profile, flags) {
|
|
|
214
398
|
|
|
215
399
|
function renderMemoryEstimate(profile) {
|
|
216
400
|
try {
|
|
217
|
-
const est = estimateMemory(profile.modelPath, profile.mmprojPath,
|
|
401
|
+
const est = estimateMemory(profile.modelPath, profile.mmprojPath, profile.drafterPath, profile.flags);
|
|
218
402
|
return renderSection("Memory estimate", renderRows([
|
|
219
403
|
["Estimated total", pc.bold(`~${formatBytes(est.totalBytes)}`)],
|
|
220
404
|
["Model", formatBytes(est.modelBytes)],
|
|
405
|
+
...(est.mmprojBytes ? [["Vision projector", formatBytes(est.mmprojBytes)]] : []),
|
|
406
|
+
...(est.draftBytes ? [["Drafter (MTP)", formatBytes(est.draftBytes)]] : []),
|
|
221
407
|
["KV cache", est.kvBytes ? `~${formatBytes(est.kvBytes)} (${profile.flags.ctxSize.toLocaleString()} ctx, ${profile.flags.cacheTypeK}/${profile.flags.cacheTypeV})` : "unknown"],
|
|
222
408
|
...(est.note ? [["Note", pc.yellow(est.note)]] : []),
|
|
223
409
|
]));
|
|
@@ -243,8 +429,5 @@ async function runtimeSupportsGemma4Unified() {
|
|
|
243
429
|
}
|
|
244
430
|
}
|
|
245
431
|
|
|
246
|
-
function samplingSummary(flags) {
|
|
247
|
-
return `temp ${flags.temperature}, top-p ${flags.topP}, top-k ${flags.topK}`;
|
|
248
|
-
}
|
|
249
432
|
|
|
250
433
|
|
package/src/profiles.mjs
CHANGED
|
@@ -40,7 +40,15 @@ export async function loadProfiles() {
|
|
|
40
40
|
.filter((e) => e.isDirectory() && existsSync(profileJsonPath(e.name)))
|
|
41
41
|
.map((e) => e.name)
|
|
42
42
|
.sort();
|
|
43
|
-
return (await Promise.all(ids.map((id) => readProfile(id))))
|
|
43
|
+
return (await Promise.all(ids.map((id) => readProfile(id))))
|
|
44
|
+
.map((p) => {
|
|
45
|
+
// Migrate legacy llama-cpp-mtp backend → llama-cpp with mtp capability
|
|
46
|
+
if (p.backend === "llama-cpp-mtp") {
|
|
47
|
+
return { ...p, backend: "llama-cpp", providerId: "llama-cpp", capabilities: { ...(p.capabilities ?? {}), mtp: true } };
|
|
48
|
+
}
|
|
49
|
+
return p;
|
|
50
|
+
})
|
|
51
|
+
.filter((p) => BACKENDS[p.backend]);
|
|
44
52
|
}
|
|
45
53
|
|
|
46
54
|
export async function readProfile(id) {
|
|
@@ -49,7 +57,7 @@ export async function readProfile(id) {
|
|
|
49
57
|
return JSON.parse(await readFile(path, "utf8"));
|
|
50
58
|
}
|
|
51
59
|
|
|
52
|
-
export async function saveProfile(profile
|
|
60
|
+
export async function saveProfile(profile) {
|
|
53
61
|
const id = sanitizeProfileId(profile.id);
|
|
54
62
|
const dir = profileDir(id);
|
|
55
63
|
await mkdir(dir, { recursive: true });
|
|
@@ -126,7 +134,7 @@ export async function createProfileFromModel(model, backendId, drafterPath) {
|
|
|
126
134
|
const caps = detectCapabilities(model.path, model.mmprojPath);
|
|
127
135
|
// If a drafter is provided, this model supports MTP regardless of filename
|
|
128
136
|
const hasMtp = caps.mtp || Boolean(drafterPath);
|
|
129
|
-
const backend = backendId ??
|
|
137
|
+
const backend = backendId ?? "llama-cpp";
|
|
130
138
|
const { flags } = computeFlags(
|
|
131
139
|
{ ...caps, mtp: hasMtp },
|
|
132
140
|
model.path,
|
package/src/ui.mjs
CHANGED
|
@@ -95,36 +95,18 @@ export function renderCard(title, body, options = {}) {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
const topTitle = title ? `╭${pc.reset(titleStr)}` : "╭";
|
|
98
|
-
const topFill = "─".repeat(Math.max(0, width
|
|
98
|
+
const topFill = "─".repeat(Math.max(0, width - visibleLen(titleStr)));
|
|
99
99
|
const top = `${topTitle}${topFill}╮`;
|
|
100
100
|
|
|
101
101
|
const middle = lines.map((line) => `│ ${padVisible(line, contentWidth)} │`);
|
|
102
102
|
|
|
103
|
-
const bottom = `╰${"─".repeat(width
|
|
103
|
+
const bottom = `╰${"─".repeat(width)}╯`;
|
|
104
104
|
|
|
105
105
|
return [top, ...middle, bottom].map((l) => borderColor(l)).join("\n");
|
|
106
106
|
}
|
|
107
107
|
|
|
108
108
|
function wrapVisible(text, width) {
|
|
109
|
-
|
|
110
|
-
const lines = [];
|
|
111
|
-
let current = "";
|
|
112
|
-
for (let word of words) {
|
|
113
|
-
// If a single word exceeds the width, hard-break it
|
|
114
|
-
while (visibleLen(word) > width) {
|
|
115
|
-
if (current.trim()) { lines.push(current.trimEnd()); current = ""; }
|
|
116
|
-
lines.push(word.slice(0, width));
|
|
117
|
-
word = word.slice(width);
|
|
118
|
-
}
|
|
119
|
-
if (visibleLen(current + word) > width && current.trim()) {
|
|
120
|
-
lines.push(current.trimEnd());
|
|
121
|
-
current = word.trimStart();
|
|
122
|
-
} else {
|
|
123
|
-
current += word;
|
|
124
|
-
}
|
|
125
|
-
}
|
|
126
|
-
if (current.trim()) lines.push(current.trimEnd());
|
|
127
|
-
return lines.length > 0 ? lines : [text];
|
|
109
|
+
return wrapText(text, width);
|
|
128
110
|
}
|
|
129
111
|
|
|
130
112
|
export function renderSectionRows(title, rows, options = {}) {
|
|
@@ -198,9 +180,8 @@ async function runPrompt(fn, config) {
|
|
|
198
180
|
|
|
199
181
|
// ── Interactive prompt factory ──────────────────────────────────────────────
|
|
200
182
|
|
|
201
|
-
export function startInteractive(
|
|
183
|
+
export function startInteractive() {
|
|
202
184
|
if (process.stdin.isTTY) console.clear();
|
|
203
|
-
console.log(pc.magenta(`◆ ${title}`));
|
|
204
185
|
}
|
|
205
186
|
|
|
206
187
|
export function createPrompt() {
|
|
@@ -213,14 +194,18 @@ export function createPrompt() {
|
|
|
213
194
|
return value?.trim() || String(defaultValue ?? "");
|
|
214
195
|
},
|
|
215
196
|
|
|
216
|
-
async number(label, defaultValue, min, max) {
|
|
197
|
+
async number(label, defaultValue, min, max, { float = false } = {}) {
|
|
217
198
|
const value = await runPrompt(number, {
|
|
218
199
|
message: label,
|
|
219
200
|
default: defaultValue,
|
|
201
|
+
min,
|
|
202
|
+
max,
|
|
203
|
+
step: float ? 'any' : 1,
|
|
220
204
|
validate(input) {
|
|
221
205
|
if (!Number.isFinite(input) || input < min || input > max) {
|
|
222
206
|
return `Enter a number from ${min} to ${max}.`;
|
|
223
207
|
}
|
|
208
|
+
return true;
|
|
224
209
|
},
|
|
225
210
|
});
|
|
226
211
|
return Number(value);
|
|
@@ -252,12 +237,10 @@ export function createPrompt() {
|
|
|
252
237
|
|
|
253
238
|
export async function modelSelect(label, groups, { defaultKey, pageSize = 20 } = {}) {
|
|
254
239
|
const choices = [];
|
|
255
|
-
// Separator below the prompt message
|
|
256
|
-
choices.push(new Separator(pc.dim(" ────────────────────────────────────────────────────────────")));
|
|
257
240
|
for (let i = 0; i < groups.length; i++) {
|
|
258
241
|
const group = groups[i];
|
|
259
242
|
// Add blank line before each group (except the first)
|
|
260
|
-
if (i > 0) choices.push(new Separator(""));
|
|
243
|
+
if (i > 0) choices.push(new Separator(" "));
|
|
261
244
|
if (group.separator) {
|
|
262
245
|
choices.push(new Separator(group.separator));
|
|
263
246
|
}
|