offgrid-ai 0.9.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/package.json +4 -3
- package/resources/hf-download.py +79 -0
- package/resources/mlxvlm-server-wrapper.py +112 -0
- package/resources/recommendations.json +60 -0
- package/src/backend-installers.mjs +1 -16
- package/src/backends.mjs +17 -45
- package/src/benchmark/finalize.mjs +9 -91
- package/src/benchmark/flow.mjs +8 -6
- package/src/benchmark/metrics.mjs +6 -45
- package/src/benchmark/pi-runner.mjs +5 -2
- package/src/benchmark/prepare.mjs +1 -1
- package/src/benchmark/stream-renderer.mjs +31 -2
- package/src/benchmark.mjs +3 -1
- package/src/commands/main.mjs +3 -5
- package/src/commands/models.mjs +27 -19
- package/src/commands/onboard.mjs +67 -9
- package/src/commands/run.mjs +20 -5
- package/src/commands/status.mjs +1 -1
- package/src/config.mjs +11 -2
- package/src/discovery-shared.mjs +44 -0
- package/src/hardware.mjs +49 -0
- package/src/harness-pi.mjs +25 -11
- package/src/huggingface.mjs +209 -0
- package/src/managed.mjs +1 -5
- package/src/mlx-discovery.mjs +290 -0
- package/src/mlx-flags.mjs +93 -0
- package/src/model-catalog.mjs +12 -6
- package/src/model-name.mjs +7 -25
- package/src/model-presenters.mjs +138 -28
- package/src/process.mjs +129 -32
- package/src/profile-setup.mjs +116 -0
- package/src/profiles.mjs +30 -0
- package/src/recommendations.mjs +56 -14
- package/src/scan.mjs +39 -8
package/src/process.mjs
CHANGED
|
@@ -35,11 +35,13 @@ async function startLocalServer(profile) {
|
|
|
35
35
|
|
|
36
36
|
// Build argv: binary + command.json args
|
|
37
37
|
const argv = [...commandArgv];
|
|
38
|
+
// mlx-vlm requires APC_ENABLED=1 (86x TTFT improvement; fixes Metal cache clearing).
|
|
39
|
+
const env = profile.backend === "mlx-vlm" ? { ...process.env, APC_ENABLED: "1" } : process.env;
|
|
38
40
|
|
|
39
41
|
const rawFd = openSync(rawLogPath, "a");
|
|
40
42
|
let child;
|
|
41
43
|
try {
|
|
42
|
-
child = spawn(binary, argv, { detached: true, stdio: ["ignore", rawFd, rawFd] });
|
|
44
|
+
child = spawn(binary, argv, { detached: true, stdio: ["ignore", rawFd, rawFd], env });
|
|
43
45
|
} finally {
|
|
44
46
|
closeSync(rawFd);
|
|
45
47
|
}
|
|
@@ -96,16 +98,134 @@ export async function stopProfile(profile) {
|
|
|
96
98
|
await writeState(profile.id, { ...state, pid: null, stoppedAt: new Date().toISOString(), stopReason: "pid-not-running" });
|
|
97
99
|
return { stopped: false, message: `${profile.id} pid ${state.pid} is no longer running.` };
|
|
98
100
|
}
|
|
101
|
+
const pid = state.pid;
|
|
99
102
|
try {
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
}
|
|
103
|
-
process.kill(state.pid, "SIGTERM");
|
|
104
|
-
}
|
|
105
|
-
await writeState(profile.id, { ...state, pid: null, stoppedAt: new Date().toISOString(), stopSignal: "SIGTERM" });
|
|
106
|
-
return { stopped: true, message: `Stopped ${profile.id} pid ${state.pid}` };
|
|
103
|
+
const signal = await terminateProcess(pid);
|
|
104
|
+
await writeState(profile.id, { ...state, pid: null, stoppedAt: new Date().toISOString(), stopSignal: signal });
|
|
105
|
+
return { stopped: true, message: `Stopped ${profile.id} pid ${pid}` };
|
|
107
106
|
} catch (error) {
|
|
108
|
-
return { stopped: false, message: `Could not stop pid ${
|
|
107
|
+
return { stopped: false, message: `Could not stop pid ${pid}: ${error.message}` };
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/**
 * Reliably terminate a detached local-server process (and its process group).
 *
 * Sends SIGTERM and polls for up to 5s so the server can shut down gracefully
 * (lets mlx-vlm/llama-server release the model), then escalates to SIGKILL and
 * polls for up to 3s more. llama-server exits on SIGTERM; mlx-vlm/uvicorn often
 * does not, hence the SIGKILL fallback.
 *
 * @param {number} pid - PID of the server process to stop.
 * @returns {Promise<"SIGTERM"|"SIGKILL">} The last signal sent before the
 *   process was observed gone.
 * @throws {Error} If `pid` is not an integer greater than 1, if signalling
 *   fails for a reason other than the process already being gone, or if the
 *   process is still alive after SIGKILL.
 */
async function terminateProcess(pid) {
  // process.kill(-pid) addresses a process group: -0 is our own group and -1 is
  // every process we are allowed to signal. Refuse anything that cannot be a
  // real server PID before sending a single signal.
  const target = Number(pid);
  if (!Number.isInteger(target) || target <= 1) {
    throw new Error(`refusing to signal invalid pid ${pid}`);
  }

  const signalGroup = (sig) => {
    try {
      process.kill(-target, sig);
      return;
    } catch {
      // Not a group leader (or the group is already gone) — signal the proc itself.
    }
    try {
      process.kill(target, sig);
    } catch (error) {
      // ESRCH: the process exited between our last liveness poll and this
      // signal. That is the outcome we want, so let the poll below confirm it.
      if (error?.code !== "ESRCH") throw error;
    }
  };

  // Poll every 100ms, up to `attempts` times, until the process is gone.
  const waitForExit = async (attempts) => {
    for (let i = 0; i < attempts; i++) {
      if (await processGone(target)) return true;
      await sleep(100);
    }
    return false;
  };

  signalGroup("SIGTERM");
  if (await waitForExit(50)) return "SIGTERM"; // 5s grace for graceful shutdown

  signalGroup("SIGKILL");
  if (await waitForExit(30)) return "SIGKILL"; // 3s for SIGKILL to take effect

  throw new Error(`pid ${pid} did not exit after SIGKILL`);
}
|
|
133
|
+
|
|
134
|
+
/**
 * Report whether a process is dead (or a zombie about to be reaped).
 *
 * @param {number} pid - PID to probe.
 * @returns {Promise<boolean>} `true` when the PID cannot be signalled because
 *   it no longer exists, or when `ps` reports its state as zombie (`Z…`);
 *   `false` when it is alive or its state cannot be determined.
 */
async function processGone(pid) {
  try {
    process.kill(pid, 0); // signal 0 only probes; nothing is delivered
  } catch (error) {
    // EPERM means the process exists but we may not signal it, so it is not
    // gone. Every other failure is treated as "no such process".
    if (error?.code !== "EPERM") return true;
  }
  // Alive to signal(0) — but a detached setsid child can briefly appear as a
  // zombie before launchd reaps it. Treat zombie as gone.
  try {
    const { stdout } = await execFileAsync("ps", ["-o", "stat=", "-p", String(pid)]);
    return /^Z/.test(stdout.trim());
  } catch {
    // `ps` unavailable or failed: we cannot prove the process is gone.
    return false;
  }
}
|
|
147
|
+
|
|
148
|
+
// ── Unload model from a managed server (oMLX) ─────────────────────────────
// Counterpart to stopProfile for local-server backends: stopProfile kills the
// server process (which unloads the model); unloadModelFromServer asks a
// managed server to release the model from memory via its HTTP API, leaving the
// server itself running. Together they give a consistent UX: quitting Pi
// unloads the model regardless of backend type.

/**
 * Ask the profile's backend to release its model without stopping the server.
 *
 * @param {object} profile - Profile whose `backend` selects the strategy.
 * @returns {Promise<object>} Result object with `unloaded` and `backend`, plus
 *   a `reason` for backends that cannot unload over HTTP.
 */
export async function unloadModelFromServer(profile) {
  const { id } = backendFor(profile.backend);

  switch (id) {
    case "omlx":
      return await unloadOmlxModel(profile);

    // Local-server backends have no HTTP unload API: the model is released
    // when the server process exits, which stopProfile takes care of.
    case "llama-cpp":
    case "llama-cpp-mtp":
    case "mlx-vlm":
      return { unloaded: false, backend: id, reason: "stop server to unload" };

    default:
      return { unloaded: false, backend: id, reason: "unsupported backend" };
  }
}
|
|
176
|
+
|
|
177
|
+
/**
 * Unload a model from an oMLX server through its admin API.
 *
 * The configured model ID is matched case-insensitively against the IDs the
 * server lists, so the request uses the server's own spelling when one exists.
 *
 * @param {object} profile - Profile providing `baseUrl` and a model identifier
 *   (`modelAlias`, `omlxModel` or `id`, in that order of preference).
 * @returns {Promise<object>} `{ unloaded, backend: "omlx", modelId }` plus
 *   `reason` (model was already unloaded) or `error` (request failed).
 */
async function unloadOmlxModel(profile) {
  const modelId = profile.modelAlias || profile.omlxModel || profile.id;
  // The admin API lives next to, not under, the OpenAI-compatible /v1 prefix.
  const serverRoot = profile.baseUrl?.replace(/\/v1\/?$/u, "") || "";
  const adminUrl = `${serverRoot}/admin/api/models`;
  const failure = (id, error) => ({ unloaded: false, backend: "omlx", modelId: id, error });

  try {
    const listedIds = await serverModelIds(profile.baseUrl);
    const targetId = listedIds.find((id) => id.toLowerCase() === modelId.toLowerCase()) ?? modelId;

    const requestInit = {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      signal: AbortSignal.timeout(30000),
    };
    const response = await fetch(`${adminUrl}/${encodeURIComponent(targetId)}/unload`, requestInit);
    if (response.ok) {
      return { unloaded: true, backend: "omlx", modelId: targetId };
    }

    const detail = await responseErrorDetail(response);
    const { status } = response;

    // A model that is not loaded is already in the state we asked for.
    if (status === 400 && /not loaded/i.test(detail)) {
      return { unloaded: true, backend: "omlx", modelId: targetId, reason: "model was not loaded" };
    }
    if (status === 401 || status === 403) {
      return failure(
        targetId,
        "oMLX admin authentication required. Enable skip_api_key_verification in oMLX settings, or unload manually from the admin panel.",
      );
    }
    return failure(targetId, `HTTP ${status}: ${detail}`);
  } catch (err) {
    const timedOut = err?.name === "AbortError" || err?.name === "TimeoutError";
    return failure(
      modelId,
      timedOut ? "Unload request timed out. The model may still be unloading in the background." : err.message,
    );
  }
}
|
|
220
|
+
|
|
221
|
+
/**
 * Extract a printable error description from a failed HTTP response.
 *
 * Prefers the `detail` field of a JSON body, then `message`, then the raw body
 * text. Structured values (for example an array of validation errors) are
 * serialised to JSON so callers can safely interpolate or regex-test the result.
 *
 * @param {Response} response - Response whose body has not been consumed yet.
 * @returns {Promise<string>} Always a string; empty when the body is empty or
 *   cannot be read.
 */
async function responseErrorDetail(response) {
  // Reading the body is best-effort: a broken stream degrades to "no detail".
  const text = await response.text().catch(() => "");
  if (!text) return "";

  let body;
  try {
    body = JSON.parse(text);
  } catch {
    return text; // not JSON — the raw text is the best description available
  }

  const detail = body?.detail ?? body?.message;
  if (detail == null) return text;
  return typeof detail === "string" ? detail : JSON.stringify(detail);
}
|
|
111
231
|
|
|
@@ -126,7 +246,6 @@ export async function isProfileServerUp(profile) {
|
|
|
126
246
|
|
|
127
247
|
export async function modelLoadedOnServer(profile) {
|
|
128
248
|
const backend = backendFor(profile.backend);
|
|
129
|
-
if (backend.id === "ollama") return modelIdsMatch(await ollamaLoadedModelIds(profile), expectedModelIds(profile));
|
|
130
249
|
if (backend.id === "omlx") return modelIdsMatch(await omlxLoadedModelIds(profile), expectedModelIds(profile));
|
|
131
250
|
const { matches } = await serverMatchesProfile(profile);
|
|
132
251
|
return matches;
|
|
@@ -134,9 +253,6 @@ export async function modelLoadedOnServer(profile) {
|
|
|
134
253
|
|
|
135
254
|
export async function modelAvailableOnServer(profile) {
|
|
136
255
|
const backend = backendFor(profile.backend);
|
|
137
|
-
if (backend.id === "ollama") {
|
|
138
|
-
return modelIdsMatch(await ollamaAvailableModelIds(profile), expectedModelIds(profile));
|
|
139
|
-
}
|
|
140
256
|
if (backend.id === "omlx") {
|
|
141
257
|
// /v1/models lists discovered models; an ID must exist there to be usable.
|
|
142
258
|
return modelIdsMatch(await serverModelIds(profile.baseUrl), expectedModelIds(profile));
|
|
@@ -217,24 +333,6 @@ export async function serverModelIds(baseUrl) {
|
|
|
217
333
|
.filter(Boolean);
|
|
218
334
|
}
|
|
219
335
|
|
|
220
|
-
async function ollamaLoadedModelIds(profile) {
|
|
221
|
-
const result = await fetchJson(`${apiRootUrl(profile.baseUrl)}/api/ps`);
|
|
222
|
-
if (!result.ok) return [];
|
|
223
|
-
return (Array.isArray(result.data?.models) ? result.data.models : [])
|
|
224
|
-
.flatMap((model) => [model?.name, model?.model])
|
|
225
|
-
.map((id) => String(id ?? "").trim())
|
|
226
|
-
.filter(Boolean);
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
async function ollamaAvailableModelIds(profile) {
|
|
230
|
-
const result = await fetchJson(`${apiRootUrl(profile.baseUrl)}/api/tags`);
|
|
231
|
-
if (!result.ok) return [];
|
|
232
|
-
return (Array.isArray(result.data?.models) ? result.data.models : [])
|
|
233
|
-
.flatMap((model) => [model?.name, model?.model])
|
|
234
|
-
.map((id) => String(id ?? "").trim())
|
|
235
|
-
.filter(Boolean);
|
|
236
|
-
}
|
|
237
|
-
|
|
238
336
|
async function omlxLoadedModelIds(profile) {
|
|
239
337
|
const statusResult = await fetchJson(`${profile.baseUrl.replace(/\/+$/u, "")}/models/status`);
|
|
240
338
|
const fromStatus = statusResult.ok
|
|
@@ -305,7 +403,6 @@ function expectedModelIds(profile) {
|
|
|
305
403
|
return [
|
|
306
404
|
profile.modelAlias,
|
|
307
405
|
profile.label,
|
|
308
|
-
profile.ollamaModel,
|
|
309
406
|
profile.omlxModel,
|
|
310
407
|
profile.modelPath,
|
|
311
408
|
fileName,
|
package/src/profile-setup.mjs
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
1
2
|
import { execFile } from "node:child_process";
|
|
2
3
|
import { promisify } from "node:util";
|
|
3
4
|
import { estimateMemory } from "./estimate.mjs";
|
|
@@ -7,6 +8,7 @@ import { pc, formatBytes, renderRows, renderSection } from "./ui.mjs";
|
|
|
7
8
|
import { detectCapabilities } from "./autodetect.mjs";
|
|
8
9
|
import { matchDrafter } from "./scan.mjs";
|
|
9
10
|
import { scanGgufModels } from "./scan.mjs";
|
|
11
|
+
import { estimateMemoryMb } from "./mlx-flags.mjs";
|
|
10
12
|
|
|
11
13
|
const execFileAsync = promisify(execFile);
|
|
12
14
|
|
|
@@ -36,6 +38,10 @@ export async function configureLocalProfile(prompt, profile) {
|
|
|
36
38
|
// so that re-setup can pick up MTP availability, vision changes, etc.
|
|
37
39
|
const freshCaps = detectCapabilities(profile.modelPath, profile.mmprojPath);
|
|
38
40
|
let drafterPath = profile.drafterPath ?? null;
|
|
41
|
+
if (drafterPath && !existsSync(drafterPath)) {
|
|
42
|
+
// Stored drafter is no longer on disk — drop it and re-scan for a fresh one.
|
|
43
|
+
drafterPath = null;
|
|
44
|
+
}
|
|
39
45
|
if (!drafterPath) {
|
|
40
46
|
const { drafters } = await scanGgufModels();
|
|
41
47
|
const drafter = matchDrafter(profile.modelPath, drafters);
|
|
@@ -47,6 +53,12 @@ export async function configureLocalProfile(prompt, profile) {
|
|
|
47
53
|
if (hasMtp && configured.backend !== "llama-cpp-mtp") {
|
|
48
54
|
configured = { ...configured, backend: "llama-cpp-mtp", providerId: "llama-cpp-mtp", drafterPath, capabilities: { ...configured.capabilities, mtp: true } };
|
|
49
55
|
}
|
|
56
|
+
// If the profile was MTP but the drafter is now gone (and the model isn't
|
|
57
|
+
// natively MTP), switch back to plain llama.cpp so the server can start.
|
|
58
|
+
if (!hasMtp && configured.backend === "llama-cpp-mtp") {
|
|
59
|
+
console.log(pc.yellow("MTP drafter no longer found — switching to llama.cpp without speculative decoding."));
|
|
60
|
+
configured = removeMtpDefaults(configured);
|
|
61
|
+
}
|
|
50
62
|
if (drafterPath && !configured.drafterPath) {
|
|
51
63
|
configured = { ...configured, drafterPath };
|
|
52
64
|
}
|
|
@@ -286,3 +298,107 @@ function detectionSummary(caps) {
|
|
|
286
298
|
/**
 * Format the sampling flags as a one-line summary.
 *
 * @param {object} flags - Object carrying `temperature`, `topP` and `topK`.
 * @returns {string} e.g. "temp 0.7, top-p 0.9, top-k 40".
 */
function samplingSummary(flags) {
  const { temperature, topP, topK } = flags;
  return `temp ${temperature}, top-p ${topP}, top-k ${topK}`;
}
|
|
301
|
+
|
|
302
|
+
// ── MLX profile configuration ─────────────────────────────────────────────
|
|
303
|
+
|
|
304
|
+
/**
 * Interactive configuration for an mlx-vlm profile.
 *
 * Prints the detected model summary, asks about thinking mode when the model
 * has that capability, asks for the context window size, then shows the memory
 * estimate and resulting defaults before asking for confirmation.
 *
 * @param {object} prompt - Prompt helper; `yesNo(question, defaultValue)` and
 *   `number(question, defaultValue, min, max)` are awaited.
 * @param {object} profile - The mlx-vlm profile to configure.
 * @returns {Promise<object|null>} The configured profile, or `null` when the
 *   user declines to save.
 */
export async function configureMlxProfile(prompt, profile) {
  const printSection = (title, rows) => console.log(renderSection(title, renderRows(rows)));
  const contextRow = (current) => ["Context", String(current.flags.ctxSize) + " tokens"];
  let configured = profile;

  console.log("");
  printSection("Model setup", [
    ["Model", pc.bold(profile.label)],
    ["Detected", mlxDetectionSummary(configured.capabilities)],
    contextRow(configured),
  ]);
  console.log(pc.dim("Larger context windows use more memory. You can edit the profile later if needed.\n"));

  if (configured.capabilities.vision) {
    printSection("Vision detected", [
      ["Capability", "image / multimodal input"],
      ["Note", "mlx-vlm loads vision from the model directory automatically."],
    ]);
  }

  if (configured.capabilities.thinking) {
    console.log("");
    printSection("Thinking mode", [
      ["Flag", "--enable-thinking"],
      ["Default", "on for Qwen 3 / Gemma 4 / DeepSeek-R class models"],
    ]);
    const wantsThinking = await prompt.yesNo("Enable thinking mode?", true);
    configured = await applyMlxThinkingToggle(configured, wantsThinking);
  }

  const chosenCtx = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, 1048576);
  configured = applyMlxContextSize(configured, chosenCtx);

  console.log("\n" + renderMlxMemoryEstimate(configured));

  console.log("");
  // Thinking counts as "on" only when the launch arguments actually carry the flag.
  const thinkingOn =
    Boolean(configured.capabilities.thinking) && configured.commandArgv.includes("--enable-thinking");
  printSection("Defaults", [
    ["Backend", configured.backend],
    ["Endpoint", configured.baseUrl],
    contextRow(configured),
    ["Thinking", thinkingOn ? "on" : "off"],
    ["Vision", configured.capabilities.vision ? "yes" : "no"],
  ]);

  const confirmed = await prompt.yesNo("Save profile with these settings?", true);
  return confirmed ? configured : null;
}
|
|
352
|
+
|
|
353
|
+
/**
 * Rebuild an mlx-vlm profile's launch arguments for the chosen thinking mode.
 *
 * @param {object} profile - Profile with `modelPath`, `flags` and `capabilities`.
 * @param {boolean} enabled - Whether thinking mode should be switched on.
 * @returns {Promise<object>} The same profile object when the model has no
 *   thinking capability; otherwise a copy with regenerated `commandArgv` and
 *   `capabilities.thinkingEnabled` set to `enabled`.
 */
async function applyMlxThinkingToggle(profile, enabled) {
  const { capabilities } = profile;
  if (!capabilities.thinking) return profile;

  const { computeMlxVlmFlags } = await import("./mlx-flags.mjs");
  const { port, ctxSize } = profile.flags;
  const { args: commandArgv } = computeMlxVlmFlags(profile.modelPath, {
    port,
    ctxSize,
    thinkingEnabled: enabled,
  });

  return {
    ...profile,
    commandArgv,
    capabilities: { ...capabilities, thinkingEnabled: enabled },
  };
}
|
|
367
|
+
|
|
368
|
+
/**
 * Return a copy of an mlx-vlm profile with a new context window size.
 *
 * Updates `flags.ctxSize`, recomputes `baseUrl` from the new flags and sets the
 * value of `--max-kv-size` in the launch arguments (appending the flag when it
 * is absent and `ctxSize` is positive).
 *
 * The input profile is never modified: `commandArgv` is copied before editing,
 * so a caller that discards the result (for example when the user declines to
 * save) keeps its original launch arguments.
 *
 * @param {object} profile - Profile with `flags` and `commandArgv`.
 * @param {number} ctxSize - Context window size in tokens.
 * @returns {object} New profile object.
 */
function applyMlxContextSize(profile, ctxSize) {
  const flags = { ...profile.flags, ctxSize };
  // A spread of `profile` would share this array with the caller, so copy it.
  const commandArgv = [...(profile.commandArgv ?? [])];

  const idx = commandArgv.indexOf("--max-kv-size");
  if (idx !== -1 && commandArgv[idx + 1] != null) {
    commandArgv[idx + 1] = String(ctxSize);
  } else if (ctxSize && ctxSize > 0) {
    commandArgv.push("--max-kv-size", String(ctxSize));
  }

  return {
    ...profile,
    flags,
    baseUrl: baseUrlForFlags(flags),
    commandArgv,
  };
}
|
|
383
|
+
|
|
384
|
+
/**
 * Render the "Memory estimate" section for an mlx-vlm profile.
 *
 * @param {object} profile - Profile; `modelSizeBytes` drives the estimate.
 * @returns {string} Rendered section. When the model size is missing or zero
 *   the section only says that the size is unknown.
 */
function renderMlxMemoryEstimate(profile) {
  const weightBytes = profile.modelSizeBytes || 0;

  if (weightBytes) {
    const totalBytes = estimateMemoryMb(weightBytes) * 1024 * 1024;
    // Whatever the estimate adds on top of the weights; never shown as negative.
    const extraBytes = Math.max(0, totalBytes - weightBytes);
    const rows = [
      ["Estimated total", pc.bold(`~${formatBytes(totalBytes)}`)],
      ["Model", formatBytes(weightBytes)],
      ["Overhead", `~${formatBytes(extraBytes)} (KV cache, APC, runtime)`],
    ];
    return renderSection("Memory estimate", renderRows(rows));
  }

  return renderSection("Memory estimate", pc.dim("Model size unknown — save the profile to estimate."));
}
|
|
397
|
+
|
|
398
|
+
/**
 * Summarise detected MLX capabilities as a " · "-separated label.
 *
 * @param {object} caps - Capabilities with optional `architecture`, `thinking`
 *   and `vision`.
 * @returns {string} The joined labels, or "standard MLX" when nothing was detected.
 */
function mlxDetectionSummary(caps) {
  const labels = [
    caps.architecture || null,
    caps.thinking ? "thinking" : null,
    caps.vision ? "vision" : null,
  ].filter((label) => label !== null);

  return labels.length === 0 ? "standard MLX" : labels.join(" · ");
}
|
package/src/profiles.mjs
CHANGED
|
@@ -4,6 +4,8 @@ import { join } from "node:path";
|
|
|
4
4
|
import { PROFILE_DIR, RUN_DIR, LOG_DIR } from "./config.mjs";
|
|
5
5
|
import { backendFor, baseUrlForFlags, defaultFlagsForBackend } from "./backends.mjs";
|
|
6
6
|
import { computeFlags } from "./autodetect.mjs";
|
|
7
|
+
import { detectMlxCapabilities, defaultMlxContextLength } from "./mlx-discovery.mjs";
|
|
8
|
+
import { detectHardware } from "./hardware.mjs";
|
|
7
9
|
import { readJson, writeJson } from "./json.mjs";
|
|
8
10
|
|
|
9
11
|
// ── Path helpers ───────────────────────────────────────────────────────────
|
|
@@ -161,6 +163,34 @@ export async function createProfileFromModel(model, backendId, drafterPath) {
|
|
|
161
163
|
});
|
|
162
164
|
}
|
|
163
165
|
|
|
166
|
+
// ── Auto-create profile from a discovered MLX model ────────────────────────
|
|
167
|
+
|
|
168
|
+
/**
 * Build a normalized mlx-vlm profile for a discovered MLX model.
 *
 * Detects the model's capabilities, derives a default context size from the
 * model's own context length and the machine's total RAM (in GiB), and
 * generates the server launch arguments on the default port.
 *
 * @param {object} model - Discovered model with `filePath`, `label`, `source`
 *   and `sizeBytes`.
 * @returns {Promise<object>} The profile as returned by `normalizeProfile`.
 */
export async function createProfileFromMlxModel(model) {
  const { computeMlxVlmFlags, DEFAULT_PORT } = await import("./mlx-flags.mjs");
  const { filePath, label, source, sizeBytes } = model;

  const caps = await detectMlxCapabilities(filePath);
  const modelContextLength = caps.contextLength;
  const totalRamGb = detectHardware().totalRamBytes / (1024 ** 3);
  const ctxSize = defaultMlxContextLength(modelContextLength, totalRamGb);

  const launchOptions = { port: DEFAULT_PORT, ctxSize, thinkingEnabled: caps.thinking };
  const { args: commandArgv } = computeMlxVlmFlags(filePath, launchOptions);

  return normalizeProfile({
    id: slugFromLabel(label),
    label,
    backend: "mlx-vlm",
    providerId: "mlx-vlm",
    modelAlias: label,
    source,
    modelPath: filePath,
    // Not applicable to mlx-vlm profiles; stored explicitly as null.
    mmprojPath: null,
    drafterPath: null,
    modelSizeBytes: sizeBytes,
    capabilities: caps,
    flags: { host: "127.0.0.1", port: DEFAULT_PORT, ctxSize },
    commandArgv,
  });
}
|
|
193
|
+
|
|
164
194
|
function summarizeCapabilities(caps) {
|
|
165
195
|
return {
|
|
166
196
|
architecture: caps.architecture,
|
package/src/recommendations.mjs
CHANGED
|
@@ -1,17 +1,59 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import { dirname, join } from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
import { detectHardware } from "./hardware.mjs";
|
|
5
|
+
|
|
6
|
+
const GB = 1024 ** 3;
|
|
7
|
+
|
|
8
|
+
const RECOMMENDATIONS_PATH = join(dirname(fileURLToPath(import.meta.url)), "..", "resources", "recommendations.json");
|
|
9
|
+
|
|
10
|
+
/**
 * Read the curated model list from the bundled recommendations file.
 *
 * Loading is best-effort: a missing, unreadable or malformed file yields an
 * empty list instead of an error. A file that parses but whose `models` field
 * is not an array is also treated as empty, so callers can always use array
 * methods on the result.
 *
 * @returns {object[]} The entries under `models`, or `[]`.
 */
function loadRecommendations() {
  try {
    const parsed = JSON.parse(readFileSync(RECOMMENDATIONS_PATH, "utf8"));
    return Array.isArray(parsed?.models) ? parsed.models : [];
  } catch {
    // Deliberate: recommendations are optional, so a bad file means "none".
    return [];
  }
}
|
|
18
|
+
|
|
19
|
+
/**
 * Every curated model entry, exactly as loaded from the recommendations file.
 *
 * @returns {object[]} The curated entries.
 */
export function getModelEntries() {
  const entries = loadRecommendations();
  return entries;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Recommend the models from the highest RAM tier that still fits this machine.
 *
 * @param {object} hardware - Detected hardware; `totalRamBytes` is compared
 *   with each entry's `minRamGb`.
 * @returns {object[]} Entries of the top fitting tier sorted by label, or `[]`
 *   when nothing fits.
 */
export function recommendModels(hardware) {
  const fitsInRam = (entry) => entry.minRamGb * GB <= hardware.totalRamBytes;
  const byLabel = (a, b) => a.label.localeCompare(b.label);

  const fitting = loadRecommendations().filter(fitsInRam);
  if (fitting.length === 0) return [];

  const topTier = fitting.reduce((highest, entry) => Math.max(highest, entry.minRamGb), -Infinity);

  // All models at the top fitting tier are genuine alternatives; ordering them
  // by label keeps the pick deterministic regardless of JSON order.
  const best = [];
  for (const entry of fitting) {
    if (entry.minRamGb === topTier) best.push(entry);
  }
  return best.sort(byLabel);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Pick the model format to use on this platform.
 *
 * Apple Silicon (darwin + arm64) prefers MLX and falls back to GGUF; every
 * other platform only considers GGUF.
 *
 * @param {object} entry - Curated entry with optional `mlx` / `gguf` fields.
 * @param {object} hardware - Detected hardware (`platform`, `arch`).
 * @returns {"mlx"|"gguf"|null} The chosen format, or `null` when the entry
 *   offers none that is usable here.
 */
export function selectFormat(entry, hardware) {
  const isAppleSilicon = hardware.platform === "darwin" && hardware.arch === "arm64";
  if (isAppleSilicon && entry.mlx) return "mlx";
  return entry.gguf ? "gguf" : null;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Primary recommendation for this machine.
 *
 * @param {object} [hardware] - Hardware description; detected when omitted.
 * @returns {object|null} The first recommended entry, or `null` when there is none.
 */
export function recommendedModel(hardware) {
  const [best = null] = recommendModels(hardware ?? detectHardware());
  return best;
}
|
|
14
53
|
|
|
15
|
-
|
|
16
|
-
|
|
54
|
+
/**
 * All models that fit this machine, best first.
 *
 * @param {object} hardware - Detected hardware; `totalRamBytes` is compared
 *   with each entry's `minRamGb`.
 * @returns {object[]} Fitting entries ordered by RAM tier (descending), then label.
 */
export function allFittingModels(hardware) {
  const byTierThenLabel = (a, b) => {
    const tierDelta = b.minRamGb - a.minRamGb;
    if (tierDelta) return tierDelta;
    return a.label.localeCompare(b.label);
  };

  return loadRecommendations()
    .filter((entry) => entry.minRamGb * GB <= hardware.totalRamBytes)
    .sort(byTierThenLabel);
}
|
package/src/scan.mjs
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import { statSync } from "node:fs";
|
|
2
|
-
import { readdir } from "node:fs/promises";
|
|
2
|
+
import { readdir, stat } from "node:fs/promises";
|
|
3
3
|
import { basename, dirname, join } from "node:path";
|
|
4
4
|
import { getModelScanDirs } from "./config.mjs";
|
|
5
5
|
import { readGgufMetadata } from "./gguf.mjs";
|
|
6
6
|
import { parseModelName } from "./model-name.mjs";
|
|
7
|
+
import { inferSourceLabel, MIN_MODEL_SIZE_BYTES, EMBEDDING_MODEL_TYPES } from "./discovery-shared.mjs";
|
|
7
8
|
|
|
8
9
|
// ── Scan for GGUF models and MTP drafters ────────────────────────────────
|
|
9
10
|
|
|
@@ -13,7 +14,8 @@ export async function scanGgufModels(dirs) {
|
|
|
13
14
|
const allDrafters = [];
|
|
14
15
|
|
|
15
16
|
for (const root of scanDirs) {
|
|
16
|
-
const
|
|
17
|
+
const sourceLabel = inferSourceLabel(root);
|
|
18
|
+
const { models, drafters } = await scanOneDir(root, sourceLabel);
|
|
17
19
|
allModels.push(...models);
|
|
18
20
|
allDrafters.push(...drafters);
|
|
19
21
|
}
|
|
@@ -36,7 +38,7 @@ export async function scanGgufModels(dirs) {
|
|
|
36
38
|
return { models, drafters };
|
|
37
39
|
}
|
|
38
40
|
|
|
39
|
-
async function scanOneDir(root) {
|
|
41
|
+
async function scanOneDir(root, sourceLabel = "local-gguf") {
|
|
40
42
|
const files = await findFiles(root, (path) => path.toLowerCase().endsWith(".gguf"));
|
|
41
43
|
const mmprojs = files.filter((path) => basename(path).toLowerCase().includes("mmproj"));
|
|
42
44
|
const candidates = files.filter((path) => !basename(path).toLowerCase().includes("mmproj"));
|
|
@@ -49,12 +51,15 @@ async function scanOneDir(root) {
|
|
|
49
51
|
const mmprojPath = mmprojs.find((candidate) => dirname(candidate) === dir) ?? null;
|
|
50
52
|
const name = basename(path).replace(/\.gguf$/i, "");
|
|
51
53
|
const sizeBytes = statSync(path).size;
|
|
54
|
+
if (sizeBytes < MIN_MODEL_SIZE_BYTES) continue;
|
|
52
55
|
const parsed = parseModelName(name, "local-gguf");
|
|
53
56
|
|
|
54
|
-
// Read GGUF metadata to detect drafter architecture
|
|
57
|
+
// Read GGUF metadata to detect drafter architecture and embeddings
|
|
55
58
|
const meta = safeReadGgufMetadata(path);
|
|
56
59
|
const architecture = typeof meta["general.architecture"] === "string" ? meta["general.architecture"] : null;
|
|
57
60
|
|
|
61
|
+
if (isEmbeddingArchitecture(architecture, name)) continue;
|
|
62
|
+
|
|
58
63
|
if (architecture === "gemma4-assistant" || architecture === "gemma4_assistant") {
|
|
59
64
|
// This is an MTP drafter model, not a main model
|
|
60
65
|
drafters.push({
|
|
@@ -66,7 +71,7 @@ async function scanOneDir(root) {
|
|
|
66
71
|
architecture,
|
|
67
72
|
targetHint: drafterTargetHint(name),
|
|
68
73
|
backend: "llama-cpp",
|
|
69
|
-
source:
|
|
74
|
+
source: sourceLabel,
|
|
70
75
|
});
|
|
71
76
|
} else {
|
|
72
77
|
models.push({
|
|
@@ -77,7 +82,7 @@ async function scanOneDir(root) {
|
|
|
77
82
|
quant: parsed.quant,
|
|
78
83
|
sizeBytes,
|
|
79
84
|
backend: "llama-cpp",
|
|
80
|
-
source:
|
|
85
|
+
source: sourceLabel,
|
|
81
86
|
});
|
|
82
87
|
}
|
|
83
88
|
}
|
|
@@ -85,6 +90,26 @@ async function scanOneDir(root) {
|
|
|
85
90
|
return { models, drafters };
|
|
86
91
|
}
|
|
87
92
|
|
|
93
|
+
// ── Embedding model filtering ─────────────────────────────────────────────
|
|
94
|
+
|
|
95
|
+
// Filename fragments that mark a model as embedding/reranking. Each fragment
// must start the name or follow a "-" / "_" separator.
const EMBEDDING_FILENAME_PATTERNS = [
  /(?:^|[-_])bge[-_]/i,
  /(?:^|[-_])jina[-_]/i,
  /(?:^|[-_])e5[-_]/i,
  /(?:^|[-_])gte[-_]/i,
  /(?:^|[-_])all[-_]minilm/i,
  /(?:^|[-_])mpnet/i,
  /(?:^|[-_])nomic[-_]embed/i,
  /(?:^|[-_])embed/i,
  /(?:^|[-_])rerank/i,
];

/**
 * Decide whether a model is an embedding/reranking model.
 *
 * @param {string|null} architecture - Architecture name from the model
 *   metadata, if any; compared case-insensitively with EMBEDDING_MODEL_TYPES.
 * @param {string} [filename=""] - Model file name checked against the
 *   embedding filename patterns.
 * @returns {boolean} `true` when either the architecture or the name matches.
 */
export function isEmbeddingArchitecture(architecture, filename = "") {
  const knownEmbeddingType =
    Boolean(architecture) && EMBEDDING_MODEL_TYPES.has(architecture.toLowerCase());
  if (knownEmbeddingType) return true;

  const candidate = filename.toLowerCase();
  for (const pattern of EMBEDDING_FILENAME_PATTERNS) {
    if (pattern.test(candidate)) return true;
  }
  return false;
}
|
|
112
|
+
|
|
88
113
|
// ── Match drafters to target models ────────────────────────────────────
|
|
89
114
|
|
|
90
115
|
// Map a drafter filename to a regex that matches its target model filenames.
|
|
@@ -137,8 +162,14 @@ async function findFiles(root, predicate) {
|
|
|
137
162
|
}
|
|
138
163
|
for (const entry of entries) {
|
|
139
164
|
const path = join(dir, entry.name);
|
|
140
|
-
if (entry.isDirectory()
|
|
141
|
-
|
|
165
|
+
if (entry.isDirectory() || entry.isSymbolicLink()) {
|
|
166
|
+
// Follow symlinks (HF cache uses them) and avoid recursion loops.
|
|
167
|
+
const stats = await stat(path).catch(() => null);
|
|
168
|
+
if (stats?.isDirectory()) await walk(path);
|
|
169
|
+
else if (stats?.isFile() && predicate(path)) result.push(path);
|
|
170
|
+
} else if (entry.isFile() && predicate(path)) {
|
|
171
|
+
result.push(path);
|
|
172
|
+
}
|
|
142
173
|
}
|
|
143
174
|
}
|
|
144
175
|
await walk(root);
|