offgrid-ai 0.15.8 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "offgrid-ai",
3
- "version": "0.15.8",
3
+ "version": "0.16.0",
4
4
  "description": "Privacy-first CLI for running local LLMs — discover, configure, run, benchmark",
5
5
  "author": "Eeshan Srivastava (https://eeshans.com)",
6
6
  "type": "module",
@@ -61,5 +61,9 @@
61
61
  "@eslint/js": "^10.0.1",
62
62
  "eslint": "^10.4.1",
63
63
  "globals": "^17.6.0"
64
+ },
65
+ "allowScripts": {
66
+ "@google/genai": true,
67
+ "protobufjs": true
64
68
  }
65
69
  }
package/src/backends.mjs CHANGED
@@ -1,8 +1,7 @@
1
1
  import { findLlamaServer } from "./config.mjs";
2
2
  import { scanGgufModels } from "./scan.mjs";
3
3
  import { parseModelName } from "./model-name.mjs";
4
- import { scanMlxModels, scanOmlxModelSizes, lookupOmlxModelInfo } from "./mlx-discovery.mjs";
5
- import { DEFAULT_PORT as MLX_VLM_PORT } from "./mlx-flags.mjs";
4
+ import { scanOmlxModelSizes, lookupOmlxModelInfo } from "./mlx-discovery.mjs";
6
5
 
7
6
  // ── Backend definitions ────────────────────────────────────────────────────
8
7
 
@@ -54,17 +53,6 @@ export const BACKENDS = {
54
53
  needsCommandFile: false,
55
54
  scanModels: () => scanOmlxModels(),
56
55
  },
57
- "mlx-vlm": {
58
- id: "mlx-vlm",
59
- label: "mlx-vlm",
60
- type: "local-server",
61
- providerId: "mlx-vlm",
62
- defaultHost: LOCAL_HOST,
63
- defaultPort: MLX_VLM_PORT,
64
- defaultBaseUrl: baseUrlFor({ port: MLX_VLM_PORT }),
65
- needsCommandFile: true,
66
- scanModels: async () => scanMlxModels(),
67
- },
68
56
  };
69
57
 
70
58
  export function backendFor(backendId) {
@@ -75,10 +63,8 @@ export function backendFor(backendId) {
75
63
 
76
64
  export async function backendBinaryFor(backendId) {
77
65
  const backend = BACKENDS[backendId ?? "llama-cpp"];
78
- if (backend.id === "mlx-vlm") return "python3"; // mlx-vlm spawns via python3 + the strict=False wrapper
79
66
  if (backend.type === "managed-server") return null;
80
- const discovered = await findLlamaServer();
81
- return discovered; // null means "not found — trigger onboarding"
67
+ return await findLlamaServer();
82
68
  }
83
69
 
84
70
  export function defaultFlagsForBackend(backendId) {
@@ -96,21 +82,15 @@ async function scanOmlxModels() {
96
82
  const body = await response.json();
97
83
  if (!Array.isArray(body?.data)) return [];
98
84
 
99
- // The oMLX API doesn't return model sizes or publishers — look them up from disk.
100
85
  const infoMap = await scanOmlxModelSizes();
101
86
 
102
- // The oMLX API can return the same model multiple times with different
103
- // ID formats (e.g. "Qwen3.6-35B-A3B-OptiQ-4bit" and
104
- // "mlx-community--Qwen3.6-35B-A3B-OptiQ-4bit"). Deduplicate by the
105
- // normalized full name (publisher/model with / separator), keeping
106
- // the first entry (which has the most complete metadata).
87
+ // Deduplicate by normalized full name (publisher/model with / separator)
107
88
  const seen = new Set();
108
89
  const deduped = [];
109
90
  for (const model of body.data.filter(isChatOmlxModel)) {
110
91
  const info = lookupOmlxModelInfo(model.id, infoMap);
111
92
  const hasPublisher = model.id.includes("/") || model.id.includes("--");
112
93
  const fullName = (!hasPublisher && info?.publisher) ? `${info.publisher}/${model.id}` : model.id;
113
- // Normalize: convert -- separator to / for dedup comparison
114
94
  const normalized = fullName.replace(/--/g, "/");
115
95
  if (seen.has(normalized)) continue;
116
96
  seen.add(normalized);
@@ -137,15 +117,10 @@ async function scanOmlxModels() {
137
117
  }).sort((a, b) => a.label.localeCompare(b.label));
138
118
  }
139
119
 
140
- // ── Labels ──────────────────────────────────────────────────────────────
141
-
142
120
  function isChatOmlxModel(model) {
143
121
  if (typeof model?.id !== "string" || !model.id.trim()) return false;
144
122
  const type = String(model.type ?? model.model_type ?? "").toLowerCase();
145
123
  if (["embedding", "embeddings", "reranker", "tool", "converter", "markitdown"].includes(type)) return false;
146
124
  if (Object.hasOwn(model, "max_model_len") && model.max_model_len === null) return false;
147
125
  return true;
148
- }
149
-
150
- // (ollamaLabel and omlxLabel removed — parseModelName in model-name.mjs is the single path)
151
- // (Ollama backend removed — offgrid-ai now uses llama-server + mlx-vlm + oMLX)
126
+ }
@@ -1,7 +1,6 @@
1
1
  import { findLlamaServer, ensureDirs } from "../config.mjs";
2
2
  import { backendFor } from "../backends.mjs";
3
3
  import { scanGgufModels } from "../scan.mjs";
4
- import { scanMlxModels } from "../mlx-discovery.mjs";
5
4
  import { loadProfiles } from "../profiles.mjs";
6
5
  import { hasPi } from "../harness-pi.mjs";
7
6
  import { offerManagedLlamaRuntimeUpdate } from "../runtime.mjs";
@@ -27,10 +26,9 @@ export async function mainFlow() {
27
26
  const llamaBinary = await findLlamaServer();
28
27
  const { models: ggufModels, drafters } = await scanGgufModels();
29
28
  const managedModels = await scanManagedModels();
30
- const mlxModels = await scanMlxModels();
31
29
  const profiles = await loadProfiles();
32
30
  const hasAnyBackend = llamaBinary || managedModels.some((item) => item.status === "ok" && item.models.length > 0);
33
- const hasAnyModels = ggufModels.length > 0 || mlxModels.length > 0 || managedModels.some((item) => item.status === "ok" && item.models.length > 0);
31
+ const hasAnyModels = ggufModels.length > 0 || managedModels.some((item) => item.status === "ok" && item.models.length > 0);
34
32
 
35
33
  const piInstalled = await hasPi();
36
34
  const needsLlama = ggufModels.length > 0 || profiles.some((profile) => backendFor(profile.backend).type === "local-server");
@@ -58,7 +56,7 @@ export async function mainFlow() {
58
56
  if (!process.stdin.isTTY) return await statusCommand();
59
57
 
60
58
  startInteractive("offgrid-ai");
61
- return await modelCommandCenter({ profiles, ggufModels, managedModels, mlxModels, drafters });
59
+ return await modelCommandCenter({ profiles, ggufModels, managedModels, drafters });
62
60
  }
63
61
 
64
62
  async function printNoModelsHelp(llamaBinary) {
@@ -86,4 +84,4 @@ async function printNoModelsHelp(llamaBinary) {
86
84
  console.log(pc.dim(` Recommended: ${model.label}`));
87
85
  }
88
86
  if (omlxInstalled) console.log(pc.bold(" omlx start"));
89
- }
87
+ }
@@ -6,7 +6,7 @@ import { syncPiConfig, removeFromPiConfig } from "../harness-pi.mjs";
6
6
  import { configureLocalProfile } from "../profile-setup.mjs";
7
7
  import { pc, startInteractive, createPrompt, modelSelect } from "../ui.mjs";
8
8
  import { buildCatalogItems, createManagedProfile, itemKey, loadModelCatalog, normalizeCatalog } from "../model-catalog.mjs";
9
- import { modelSelectOption, modelNameWidth, inferBackendId, formatSourceLabel, discoverySourceForItem, printGgufModelDetails, printMlxModelDetails, printManagedModelDetails, printWorkspaceHeader, printBenchmarkLine, printProfileDetails } from "../model-presenters.mjs";
9
+ import { modelSelectOption, modelNameWidth, inferBackendId, formatSourceLabel, discoverySourceForItem, printGgufModelDetails, printManagedModelDetails, printWorkspaceHeader, printBenchmarkLine, printProfileDetails } from "../model-presenters.mjs";
10
10
  import { runProfile } from "./run.mjs";
11
11
 
12
12
  const { stripVTControlCharacters } = await import("node:util");
@@ -83,7 +83,6 @@ export async function modelCommandCenter(initialCatalog) {
83
83
 
84
84
  const groups = [];
85
85
  const backendColors = {
86
- "mlx-vlm": pc.yellow,
87
86
  "llama-cpp": pc.cyan,
88
87
  "llama-cpp-mtp": pc.blue,
89
88
  omlx: pc.magenta,
@@ -185,7 +184,6 @@ async function performAction(prompt, action, item) {
185
184
  if (action === "inspect") {
186
185
  if (item.type === "profile") return await printProfileDetails(await readProfile(item.profile.id));
187
186
  if (item.type === "managed") return printManagedModelDetails(item.model, BACKENDS[item.backendId]);
188
- if (item.model?.format === "mlx") return await printMlxModelDetails(item.model);
189
187
  return printGgufModelDetails(item.model, item.drafter);
190
188
  }
191
189
  if (action === "benchmark") {
@@ -225,18 +223,6 @@ async function setupItem(prompt, item, action) {
225
223
  printProfileSaved(profile.id);
226
224
  return;
227
225
  }
228
- // MLX models: build a mlx-vlm profile and run interactive config.
229
- if (item.model.format === "mlx") {
230
- const { createProfileFromMlxModel } = await import("../profiles.mjs");
231
- const { configureMlxProfile } = await import("../profile-setup.mjs");
232
- const profile = await createProfileFromMlxModel(item.model);
233
- const configured = await configureMlxProfile(prompt, profile);
234
- if (!configured) return;
235
- await saveProfile(configured, { writeCommand: true });
236
- await syncPiConfig(configured);
237
- printProfileSaved(configured.id);
238
- return;
239
- }
240
226
  const profile = await createProfileFromModel(item.model, null, item.drafter?.path);
241
227
  const configured = await configureLocalProfile(prompt, profile);
242
228
  if (!configured) return;
@@ -2,7 +2,6 @@ import { existsSync } from "node:fs";
2
2
  import { ensureDirs, findLlamaServer, hasHomebrew, HF_HUB_DIR } from "../config.mjs";
3
3
  import { BACKENDS } from "../backends.mjs";
4
4
  import { scanGgufModels } from "../scan.mjs";
5
- import { scanMlxModels } from "../mlx-discovery.mjs";
6
5
  import { hasPi } from "../harness-pi.mjs";
7
6
  import { offerManagedLlamaRuntimeUpdate } from "../runtime.mjs";
8
7
  import { scanManagedModels } from "../managed.mjs";
@@ -27,16 +26,15 @@ export async function onboardFlow() {
27
26
  const llamaBinary = await ensureLlamaRuntime(prompt);
28
27
  if (!(await ensurePi(prompt, run))) return;
29
28
 
30
- const [{ models: ggufModels }, managedModels, mlxModels] = await Promise.all([
29
+ const [{ models: ggufModels }, managedModels] = await Promise.all([
31
30
  scanGgufModels(),
32
31
  scanManagedModels(),
33
- scanMlxModels(),
34
32
  ]);
35
33
  const totalManaged = managedModels.reduce((sum, item) => sum + item.models.length, 0);
36
- const hasModels = ggufModels.length > 0 || totalManaged > 0 || mlxModels.length > 0;
34
+ const hasModels = ggufModels.length > 0 || totalManaged > 0;
37
35
 
38
36
  if (hasModels) {
39
- printFoundModels(ggufModels, managedModels, mlxModels, llamaBinary);
37
+ printFoundModels(ggufModels, managedModels, llamaBinary);
40
38
  } else {
41
39
  const canDownload = await hasHuggingfaceHub();
42
40
  if (canDownload) {
@@ -96,14 +94,11 @@ async function ensurePi(prompt, run) {
96
94
  return true;
97
95
  }
98
96
 
99
- function printFoundModels(ggufModels, managedModels, mlxModels, llamaBinary) {
97
+ function printFoundModels(ggufModels, managedModels, llamaBinary) {
100
98
  if (ggufModels.length > 0) {
101
99
  console.log(pc.green(`✓ Found ${ggufModels.length} GGUF model${ggufModels.length === 1 ? "" : "s"}`));
102
100
  if (!llamaBinary) console.log(pc.yellow("Install the managed llama.cpp runtime to run these GGUF models."));
103
101
  }
104
- if (mlxModels.length > 0) {
105
- console.log(pc.green(`✓ Found ${mlxModels.length} MLX model${mlxModels.length === 1 ? "" : "s"}`));
106
- }
107
102
  for (const { backendId, models, status, reason } of managedModels) {
108
103
  if (status === "unavailable") {
109
104
  console.log(pc.yellow(`${BACKENDS[backendId].label}: unavailable${reason ? ` — ${reason}` : ""}`));
@@ -117,7 +112,7 @@ async function offerModelDownload(prompt) {
117
112
  const hardware = detectHardware();
118
113
  const candidates = allFittingModels(hardware)
119
114
  .map((entry) => ({ entry, format: selectFormat(entry, hardware) }))
120
- .filter((item) => item.format != null);
115
+ .filter((item) => item.format === "gguf");
121
116
  if (candidates.length === 0) {
122
117
  console.log(pc.yellow("No curated models fit your hardware."));
123
118
  return false;
@@ -134,7 +129,7 @@ async function offerModelDownload(prompt) {
134
129
  const shouldDownload = await prompt.yesNo("Download " + primary.entry.label + " (" + primary.format + ")?", true);
135
130
  if (!shouldDownload) return false;
136
131
 
137
- const hfRef = primary.format === "mlx" ? primary.entry.mlx : primary.entry.gguf;
132
+ const hfRef = primary.entry.gguf;
138
133
  try {
139
134
  const plan = await resolveHfDownload(hfRef);
140
135
  console.log(pc.dim("Total size: " + formatBytes(plan.totalSizeBytes)));
@@ -236,4 +231,4 @@ async function installAllBackends(prompt, run, model) {
236
231
 
237
232
  async function runInstallerCommands(run, installer) {
238
233
  for (const [cmd, args, label] of installer.commands) await run(cmd, args, label);
239
- }
234
+ }
@@ -2,6 +2,10 @@ import { ensureDirs } from "../config.mjs";
2
2
  import { backendFor } from "../backends.mjs";
3
3
  import { loadProfiles } from "../profiles.mjs";
4
4
  import { profileRuntimeStatus } from "../process.mjs";
5
+ import { existsSync } from "node:fs";
6
+ import { execFileSync } from "node:child_process";
7
+ import { homedir } from "node:os";
8
+ import { join } from "node:path";
5
9
  import { pc, renderRows, renderCard } from "../ui.mjs";
6
10
 
7
11
  export async function statusCommand() {
@@ -38,6 +42,21 @@ export async function statusCommand() {
38
42
 
39
43
  console.log(renderCard("Status", renderRows(summaryRows), { formatBorder: running.length > 0 ? pc.green : pc.dim }));
40
44
 
45
+ // Show oMLX cache disk usage if cache exists
46
+ const omlxCacheDir = join(homedir(), ".omlx", "cache");
47
+ if (existsSync(omlxCacheDir)) {
48
+ try {
49
+ const duOutput = execFileSync("du", ["-sh", omlxCacheDir], { encoding: "utf8" });
50
+ const cacheSize = duOutput.split(/\s+/)[0];
51
+ console.log("\n" + renderCard("oMLX cache", renderRows([
52
+ ["Location", pc.dim(omlxCacheDir)],
53
+ ["Disk usage", pc.bold(cacheSize)],
54
+ ]), { formatBorder: pc.magenta }));
55
+ } catch {
56
+ // du not available — skip
57
+ }
58
+ }
59
+
41
60
  if (managedUpMissing.length > 0 || managedUpNotLoaded.length > 0) {
42
61
  const detailRows = [];
43
62
  for (const { profile, status } of [...managedUpMissing, ...managedUpNotLoaded]) {
package/src/config.mjs CHANGED
@@ -18,8 +18,8 @@ export const MANAGED_LLAMA_SERVER = join(RUNTIME_DIR, "bin", "llama-server");
18
18
  // HuggingFace hub cache: $HF_HUB_CACHE, else $HF_HOME/hub, else
19
19
  // ~/.cache/huggingface/hub. This is where huggingface_hub stores
20
20
  // models--org--name/... and where offgrid-ai scans + downloads. Pointing at the
21
- // hub (not the HF root) keeps the HF-hub MLX/GGUF scanners and the downloader
22
- // on the same layout.
21
+ // hub (not the HF root) keeps the GGUF scanner and the downloader on the
22
+ // same layout.
23
23
  export const HF_HUB_DIR = process.env.HF_HUB_CACHE
24
24
  || (process.env.HF_HOME ? join(process.env.HF_HOME, "hub") : join(homedir(), ".cache", "huggingface", "hub"));
25
25
 
@@ -14,7 +14,7 @@ import pc from "picocolors";
14
14
  * the repo from HuggingFace. Other backends use the friendly modelAlias.
15
15
  */
16
16
  export function piApiModelId(profile) {
17
- return profile.backend === "mlx-vlm" ? profile.modelPath : profile.modelAlias;
17
+ return profile.modelAlias;
18
18
  }
19
19
 
20
20
  // ── Sync Pi config ─────────────────────────────────────────────────────────
@@ -135,7 +135,7 @@ export function modelReasoning(profile) {
135
135
  }
136
136
 
137
137
  export function modelFamily(profile) {
138
- return [profile.id, profile.label, profile.modelAlias, profile.modelPath, profile.omlxModel].filter(Boolean).join(" ").toLowerCase();
138
+ return [profile.id, profile.label, profile.modelAlias, profile.omlxModel].filter(Boolean).join(" ").toLowerCase();
139
139
  }
140
140
 
141
141
  function piApiKey() {
@@ -1,36 +1,14 @@
1
- // MLX model discovery + metadata — scans configured model directories for MLX
2
- // model directories and parses their config.json.
3
- // Ported from deprecated-offgrid-desktop/src/main/model-discovery.ts +
4
- // mlx-metadata.ts (MLX subset only).
5
- //
6
- // This runs ALONGSIDE offgrid-ai's existing GGUF scan (scan.mjs scanGgufModels)
7
- // — it does not replace it. The picker (main.mjs) will merge GGUF + MLX lists.
8
- //
9
- // An MLX model directory is one containing config.json + one or more
10
- // *.safetensors files. HuggingFace Hub cache layout (models--org--name) is
11
- // detected and scanned specially.
1
+ // oMLX model size lookup — scans ~/.omlx/models/ for MLX model directories
2
+ // to compute sizes and publishers. The oMLX API doesn't return these, so we
3
+ // read them from disk.
12
4
 
13
- import { readdir, stat, readFile } from "node:fs/promises";
5
+ import { readdir, stat } from "node:fs/promises";
14
6
  import { existsSync } from "node:fs";
15
- import { join, basename } from "node:path";
7
+ import { join } from "node:path";
16
8
  import { homedir } from "node:os";
17
- import { getModelScanDirs } from "./config.mjs";
18
- import { inferSourceLabel, MIN_MODEL_SIZE_BYTES, EMBEDDING_MODEL_TYPES } from "./discovery-shared.mjs";
19
- import { parseModelName } from "./model-name.mjs";
20
9
 
21
- // ── Folder → backend mapping ──────────────────────────────────────────────
22
- // The oMLX folder is oMLX-exclusive: models there are served by the oMLX
23
- // managed backend, NOT by mlx-vlm. Every OTHER scan dir is format-based
24
- // (GGUF → llama.cpp, MLX → mlx-vlm). So mlx-vlm scans all configured dirs
25
- // EXCEPT the oMLX folder.
26
10
  const OMLX_MODELS_DIR = join(homedir(), ".omlx", "models");
27
- function isOmlxFolder(p) {
28
- return p === OMLX_MODELS_DIR || p.startsWith(OMLX_MODELS_DIR + "/");
29
- }
30
-
31
- // ── MLX directory detection ───────────────────────────────────────────────
32
11
 
33
- /** True if dir contains config.json + at least one .safetensors file. */
34
12
  async function isMlxModelDir(dir) {
35
13
  if (!existsSync(join(dir, "config.json"))) return false;
36
14
  try {
@@ -41,7 +19,6 @@ async function isMlxModelDir(dir) {
41
19
  }
42
20
  }
43
21
 
44
- /** Sum the size of all .safetensors files in an MLX model dir (bytes). */
45
22
  async function getMlxDirSizeBytes(dir) {
46
23
  try {
47
24
  const entries = await readdir(dir);
@@ -57,259 +34,9 @@ async function getMlxDirSizeBytes(dir) {
57
34
  }
58
35
  }
59
36
 
60
- // ── Recursive MLX scanner ─────────────────────────────────────────────────
61
-
62
- /**
63
- * Recursively scan a directory for MLX model directories.
64
- * Searches up to maxDepth levels deep. Does NOT collect GGUF (that's scan.mjs).
65
- */
66
- async function scanDirRecursiveForMlx(rootDir, sourceLabel, maxDepth = 3) {
67
- if (!existsSync(rootDir)) return [];
68
- const models = [];
69
-
70
- async function walk(dir, depth) {
71
- if (depth > maxDepth) return;
72
- let entries;
73
- try {
74
- entries = await readdir(dir, { withFileTypes: true });
75
- } catch {
76
- return;
77
- }
78
-
79
- // Is this directory itself an MLX model dir? (don't recurse into it)
80
- if (depth > 0 && await isMlxModelDir(dir)) {
81
- const sizeBytes = await getMlxDirSizeBytes(dir);
82
- if (sizeBytes < MIN_MODEL_SIZE_BYTES) return;
83
- if (await isEmbeddingMlxModel(join(dir, "config.json"))) return;
84
- const caps = await detectMlxCapabilities(dir);
85
- const { display, quant } = parseModelName(basename(dir), sourceLabel);
86
- models.push(makeMlxModel(dir, display, sizeBytes, sourceLabel, rootDir, caps.contextLength, quant));
87
- return;
88
- }
89
-
90
- for (const entry of entries) {
91
- if (entry.name.startsWith(".") || entry.name === "README.md" || entry.name === ".gitattributes") continue;
92
- const fullPath = join(dir, entry.name);
93
- if (entry.isDirectory()) {
94
- if (await isMlxModelDir(fullPath)) {
95
- const sizeBytes = await getMlxDirSizeBytes(fullPath);
96
- if (sizeBytes < MIN_MODEL_SIZE_BYTES) continue;
97
- if (await isEmbeddingMlxModel(join(fullPath, "config.json"))) continue;
98
- const caps = await detectMlxCapabilities(fullPath);
99
- // Extract publisher from parent dir (LM Studio: publisher/model-dir)
100
- const relParts = fullPath.slice(rootDir.length + 1).split("/");
101
- const publisher = (sourceLabel === "lmstudio" && relParts.length >= 2) ? relParts[0] : null;
102
- const rawLabel = publisher ? `${publisher}/${entry.name}` : entry.name;
103
- const { display, quant } = parseModelName(rawLabel, sourceLabel);
104
- models.push(makeMlxModel(fullPath, display, sizeBytes, sourceLabel, rootDir, caps.contextLength, quant));
105
- } else {
106
- await walk(fullPath, depth + 1);
107
- }
108
- }
109
- }
110
- }
111
-
112
- await walk(rootDir, 0);
113
- return models;
114
- }
115
-
116
- // ── HuggingFace Hub layout ────────────────────────────────────────────────
117
-
118
- /** True if dir looks like an HF Hub cache (has models--* subdirs). */
119
- async function looksLikeHfHub(dir) {
120
- if (!existsSync(dir)) return false;
121
- try {
122
- const entries = await readdir(dir, { withFileTypes: true });
123
- return entries.some((e) => e.isDirectory() && e.name.startsWith("models--"));
124
- } catch {
125
- return false;
126
- }
127
- }
128
-
129
- /**
130
- * Scan an HF Hub cache dir for MLX model dirs.
131
- * HF layout: models--org--name/snapshots/hash/files
132
- */
133
- async function scanHfHubForMlx(dir, sourceLabel) {
134
- if (!existsSync(dir)) return [];
135
- const models = [];
136
- try {
137
- const entries = await readdir(dir, { withFileTypes: true });
138
- for (const entry of entries) {
139
- if (!entry.isDirectory() || !entry.name.startsWith("models--")) continue;
140
- const parts = entry.name.slice("models--".length).split("--");
141
- const label = parts.join("/");
142
- const snapshotsDir = join(dir, entry.name, "snapshots");
143
- if (!existsSync(snapshotsDir)) continue;
144
- const snapshots = await readdir(snapshotsDir, { withFileTypes: true });
145
- // Follow symlinks (HF hub uses them; test imports use them too). A model
146
- // dir can have several snapshots — some incomplete/empty. Check EACH
147
- // snapshot and use the first that is a valid MLX model dir, rather than
148
- // giving up on the whole model if the first snapshot happens to be empty.
149
- const candidates = snapshots.filter((s) => s.isDirectory() || s.isSymbolicLink());
150
- let snapshotPath = null;
151
- for (const snap of candidates) {
152
- const sp = join(snapshotsDir, snap.name);
153
- const st = await stat(sp).catch(() => null);
154
- if (st?.isDirectory() && await isMlxModelDir(sp)) { snapshotPath = sp; break; }
155
- }
156
-
157
- if (!snapshotPath) continue;
158
- const sizeBytes = await getMlxDirSizeBytes(snapshotPath);
159
- if (sizeBytes < MIN_MODEL_SIZE_BYTES) continue;
160
- if (await isEmbeddingMlxModel(join(snapshotPath, "config.json"))) continue;
161
- const caps = await detectMlxCapabilities(snapshotPath);
162
- const { display, quant } = parseModelName(label, sourceLabel);
163
- models.push({
164
- id: `${sourceLabel}:${entry.name}`,
165
- label: display,
166
- path: snapshotPath,
167
- filePath: snapshotPath,
168
- sizeBytes,
169
- contextLength: caps.contextLength,
170
- quant,
171
- backend: "mlx-vlm",
172
- format: "mlx",
173
- source: sourceLabel,
174
- });
175
- }
176
- } catch {
177
- // Can't read — return what we have.
178
- }
179
- return models;
180
- }
181
-
182
- // ── Embedding model filtering for MLX ─────────────────────────────────────
183
-
184
- async function isEmbeddingMlxModel(configPath) {
185
- if (!existsSync(configPath)) return false;
186
- try {
187
- const config = JSON.parse(await readFile(configPath, "utf-8"));
188
- const textConfig = config.text_config ?? config;
189
- const modelType = String(textConfig.model_type ?? "").toLowerCase();
190
- if (EMBEDDING_MODEL_TYPES.has(modelType)) return true;
191
- const arch = Array.isArray(config.architectures) ? config.architectures[0] : "";
192
- const lowerArch = String(arch).toLowerCase();
193
- return EMBEDDING_MODEL_TYPES.has(lowerArch) || lowerArch.includes("bert");
194
- } catch {
195
- return false;
196
- }
197
- }
198
-
199
- // ── MLX model entry builder ───────────────────────────────────────────────
200
-
201
- function makeMlxModel(dir, label, sizeBytes, sourceLabel, rootDir, contextLength = null, quant = null) {
202
- return {
203
- id: `${sourceLabel}:${dir.replace(rootDir + "/", "")}`,
204
- label,
205
- path: dir,
206
- filePath: dir,
207
- sizeBytes,
208
- contextLength,
209
- quant,
210
- backend: "mlx-vlm",
211
- format: "mlx",
212
- source: sourceLabel,
213
- };
214
- }
215
-
216
- // ── Public API ─────────────────────────────────────────────────────────────
217
-
218
- /**
219
- * Discover all MLX models across the configured scan directories.
220
- * Reads scan dirs from config.mjs getModelScanDirs() — same paths GGUF uses
221
- * (LM Studio, HF hub, user-added). Returns a flat, deduplicated list.
222
- */
223
- export async function scanMlxModels(dirs) {
224
- // mlx-vlm scans every configured dir EXCEPT the oMLX folder (oMLX-exclusive).
225
- const scanDirs = (dirs ?? await getModelScanDirs()).filter((d) => !isOmlxFolder(d));
226
- const results = await Promise.all(
227
- scanDirs.map(async (dir) => {
228
- const label = inferSourceLabel(dir);
229
- if (await looksLikeHfHub(dir)) return scanHfHubForMlx(dir, label);
230
- return scanDirRecursiveForMlx(dir, label);
231
- }),
232
- );
233
- const all = results.flat();
234
- // Deduplicate by filePath (same model may appear in multiple paths).
235
- const seen = new Set();
236
- return all.filter((m) => {
237
- if (seen.has(m.filePath)) return false;
238
- seen.add(m.filePath);
239
- return true;
240
- });
241
- }
242
-
243
- // ── MLX capability detection ─────────────────────────────────────────────
244
-
245
- /**
246
- * Detect MLX model capabilities from its config.json.
247
- * Returns { architecture, thinking, vision, contextLength }.
248
- */
249
- export async function detectMlxCapabilities(modelDir) {
250
- const configPath = join(modelDir, "config.json");
251
- if (!existsSync(configPath)) return { thinking: false, vision: false, contextLength: null, architecture: null };
252
- try {
253
- const config = JSON.parse(await readFile(configPath, "utf-8"));
254
- return detectMlxCapabilitiesFromConfig(config, modelDir);
255
- } catch {
256
- return { thinking: false, vision: false, contextLength: null, architecture: null };
257
- }
258
- }
259
-
260
- export function detectMlxCapabilitiesFromConfig(config, modelDir) {
261
- const textConfig = config.text_config ?? config;
262
- const rawName = config._name_or_path ?? basename(modelDir ?? "");
263
- const name = String(rawName).toLowerCase();
264
- const label = String(rawName);
265
-
266
- const modelType = String(config.model_type ?? "").toLowerCase();
267
- const textModelType = String(textConfig.model_type ?? "").toLowerCase();
268
-
269
- const vision = Boolean(
270
- config.vision_config ||
271
- config.image_token_id != null ||
272
- config.video_token_id != null ||
273
- config.vision_start_token_id != null ||
274
- modelType.includes("vl") ||
275
- modelType.includes("vision") ||
276
- textModelType.includes("vl") ||
277
- textModelType.includes("vision")
278
- );
279
-
280
- const thinking = /qwen3|gemma-4|gemma4|deepseek-r[12]/i.test(name + " " + label);
281
-
282
- const architectures = Array.isArray(config.architectures) ? config.architectures : [];
283
- const architecture = architectures[0] ?? null;
284
-
285
- const candidates = [
286
- textConfig.max_position_embeddings,
287
- textConfig.sliding_window,
288
- config.max_position_embeddings,
289
- config.sliding_window,
290
- ].filter((v) => typeof v === "number" && v > 0);
291
- const contextLength = candidates.length > 0 ? Math.max(...candidates) : null;
292
-
293
- return { thinking, vision, contextLength, architecture };
294
- }
295
-
296
- /**
297
- * Pick a sensible default context length for an MLX model, capping by RAM.
298
- */
299
- export function defaultMlxContextLength(trainedCtx, ramGb) {
300
- if (!trainedCtx || trainedCtx <= 0) return 8192;
301
- if (ramGb < 12) return Math.min(trainedCtx, 4096);
302
- if (ramGb < 16) return Math.min(trainedCtx, 8192);
303
- if (ramGb < 32) return Math.min(trainedCtx, 16384);
304
- return trainedCtx;
305
- }
306
-
307
- // ── oMLX model size lookup (from disk) ────────────────────────────────────
308
-
309
37
  /**
310
- * Scan the oMLX models directory (~/.omlx/models/) for MLX model directories
311
- * and return a Map of basename → { sizeBytes, publisher }. The oMLX API
312
- * doesn't return model sizes or publishers, so we compute them from disk.
38
+ * Scan ~/.omlx/models/ for MLX model directories and return a Map of
39
+ * basename → { sizeBytes, publisher }.
313
40
  */
314
41
  export async function scanOmlxModelSizes() {
315
42
  if (!existsSync(OMLX_MODELS_DIR)) return new Map();
@@ -329,7 +56,6 @@ export async function scanOmlxModelSizes() {
329
56
  const sizeBytes = await getMlxDirSizeBytes(fullPath);
330
57
  if (sizeBytes > 0) infoByBasename.set(entry.name, { sizeBytes, publisher });
331
58
  } else {
332
- // First-level directories under ~/.omlx/models/ are publishers
333
59
  await walk(fullPath, publisher ?? entry.name);
334
60
  }
335
61
  }
@@ -340,7 +66,7 @@ export async function scanOmlxModelSizes() {
340
66
  }
341
67
 
342
68
  /**
343
- * Look up a model's info by its oMLX API id. Tries exact match, then the
69
+ * Look up a model's info by its oMLX API id. Tries exact match, then the
344
70
  * segment after `--` (oMLX org--name format), then after `/` (HF format).
345
71
  */
346
72
  export function lookupOmlxModelInfo(modelId, infoMap) {
@@ -1,28 +1,23 @@
1
1
  import { scanGgufModels, matchDrafter } from "./scan.mjs";
2
2
  import { loadProfiles, normalizeProfile, sanitizeProfileId } from "./profiles.mjs";
3
3
  import { scanManagedModels } from "./managed.mjs";
4
- import { scanMlxModels } from "./mlx-discovery.mjs";
5
4
  import { isProfileFileMissing } from "./model-summary.mjs";
6
5
  import { backendFor } from "./backends.mjs";
7
6
 
8
7
  export async function loadModelCatalog() {
9
- const [profiles, { models: ggufModels, drafters }, managedModels, mlxModels] = await Promise.all([
8
+ const [profiles, { models: ggufModels, drafters }, managedModels] = await Promise.all([
10
9
  loadProfiles(),
11
10
  scanGgufModels(),
12
11
  scanManagedModels(),
13
- scanMlxModels(),
14
12
  ]);
15
- return normalizeCatalog({ profiles, ggufModels, drafters, managedModels, mlxModels });
13
+ return normalizeCatalog({ profiles, ggufModels, drafters, managedModels });
16
14
  }
17
15
 
18
16
  export function normalizeCatalog(catalog) {
19
17
  if (catalog.newModels && catalog.managedItems) return catalog;
20
- const { profiles, ggufModels, drafters, managedModels, mlxModels = [] } = catalog;
18
+ const { profiles, ggufModels, drafters, managedModels } = catalog;
21
19
  const profiledPaths = new Set(profiles.map((profile) => profile.modelPath).filter(Boolean));
22
- const newModels = [
23
- ...ggufModels.filter((model) => !profiledPaths.has(model.path)),
24
- ...mlxModels.filter((model) => !profiledPaths.has(model.path)),
25
- ];
20
+ const newModels = ggufModels.filter((model) => !profiledPaths.has(model.path));
26
21
  const managedItems = [];
27
22
  for (const { backendId, models, status } of managedModels) {
28
23
  if (status === "unavailable") continue;
@@ -35,9 +30,10 @@ export function normalizeCatalog(catalog) {
35
30
  if (!profiledAliases.has(`${backendId}:${model.id}`)) managedItems.push({ model, backendId });
36
31
  }
37
32
  }
38
- return { profiles, ggufModels, drafters, managedModels, mlxModels, newModels, managedItems };
33
+ return { profiles, ggufModels, drafters, managedModels, newModels, managedItems };
39
34
  }
40
35
 
36
+
41
37
  export function itemKey(item) {
42
38
  if (item.type === "profile") return `profile:${item.profile.id}`;
43
39
  if (item.type === "new") return `new:${item.model.path}`;
@@ -57,12 +53,11 @@ function compareRecency(a, b) {
57
53
  }
58
54
 
59
55
  export function buildCatalogItems(normalized) {
60
- const { profiles, newModels, managedItems, drafters, ggufModels = [], mlxModels = [], managedModels = [] } = normalized;
56
+ const { profiles, newModels, managedItems, drafters, ggufModels = [], managedModels = [] } = normalized;
61
57
 
62
58
  // Lookup maps for enriching profile items with scan data (size + context).
63
59
  const scanByPath = new Map();
64
60
  for (const m of ggufModels) scanByPath.set(m.path, m);
65
- for (const m of mlxModels) scanByPath.set(m.filePath ?? m.path, m);
66
61
 
67
62
  const managedByKey = new Map();
68
63
  for (const { backendId, models } of managedModels) {
@@ -77,7 +72,7 @@ export function buildCatalogItems(normalized) {
77
72
  if (profile.modelPath) {
78
73
  const scanModel = scanByPath.get(profile.modelPath);
79
74
  if (scanModel) {
80
- item.label = scanModel.label; // re-parsed label (publisher/model-name)
75
+ item.label = scanModel.label;
81
76
  if (scanModel.quant) quant = scanModel.quant;
82
77
  }
83
78
  }
@@ -160,4 +155,4 @@ export function createManagedProfile(model, backendId) {
160
155
  modelSizeBytes: model.sizeBytes || 0,
161
156
  ...(backendId === "omlx" ? { omlxModel: model.id } : {}),
162
157
  });
163
- }
158
+ }
@@ -44,8 +44,6 @@ function optionSourceTag(sourceId) {
44
44
  omlx: pc.magenta,
45
45
  "llama.cpp": pc.cyan,
46
46
  gguf: pc.cyan,
47
- mlx: pc.yellow,
48
- "mlx-vlm": pc.yellow,
49
47
  };
50
48
  return optionPad(label, colors[sourceId] ?? pc.dim, OPTION_SOURCE_WIDTH);
51
49
  }
@@ -57,7 +55,6 @@ function optionBackendTag(backendId) {
57
55
  "llama-cpp": pc.cyan,
58
56
  "llama-cpp-mtp": pc.blue,
59
57
  omlx: pc.magenta,
60
- "mlx-vlm": pc.yellow,
61
58
  };
62
59
  return optionPad(label, colors[backendId] ?? pc.dim, OPTION_BACKEND_WIDTH);
63
60
  }
@@ -70,8 +67,6 @@ export function formatSourceLabel(sourceId) {
70
67
  omlx: "oMLX",
71
68
  "llama.cpp": "llama.cpp",
72
69
  gguf: "GGUF file",
73
- mlx: "MLX",
74
- "mlx-vlm": "MLX",
75
70
  };
76
71
  return map[sourceId] ?? String(sourceId);
77
72
  }
@@ -200,7 +195,6 @@ export function inferBackendId(item) {
200
195
  if (item.type === "profile") return item.profile.backend;
201
196
  if (item.type === "managed") return item.backendId;
202
197
  // new model: derive from format
203
- if (item.model?.format === "mlx") return "mlx-vlm";
204
198
  if (item.model?.backend) return item.model.backend;
205
199
  return "llama-cpp";
206
200
  }
@@ -297,29 +291,6 @@ export function printGgufModelDetails(model, drafter) {
297
291
  console.log("\n" + renderSectionRows("Model details", detailRows, { columns: Math.min(process.stdout.columns ?? 110, 140) }));
298
292
  }
299
293
 
300
- export async function printMlxModelDetails(model) {
301
- const { detectMlxCapabilities } = await import("./mlx-discovery.mjs");
302
- const caps = await detectMlxCapabilities(model.filePath ?? model.path);
303
- const parts = [];
304
- if (caps.architecture) parts.push(caps.architecture);
305
- if (caps.thinking) parts.push("thinking");
306
- if (caps.vision) parts.push("vision");
307
- const summary = parts.length > 0 ? parts.join(pc.dim(" · ")) : "standard MLX";
308
- console.log("\n" + renderSectionRows("Downloaded model", [
309
- ["Name", pc.bold(model.label)],
310
- ["Status", pc.yellow("Needs one-time setup")],
311
- ["Details", summary],
312
- ]));
313
- console.log("\n" + renderSectionRows("Model details", [
314
- ["Model dir", model.path],
315
- ["Backend", "mlx-vlm"],
316
- ["Source", formatSourceLabel(model.source)],
317
- ["Detected", summary],
318
- ["Size", formatBytes(model.sizeBytes)],
319
- ["Context", caps.contextLength ? `${caps.contextLength.toLocaleString()} trained` : "unknown"],
320
- ], { columns: Math.min(process.stdout.columns ?? 110, 140) }));
321
- }
322
-
323
294
  export function printManagedModelDetails(model, backend) {
324
295
  console.log("\n" + renderSectionRows(`${backend.label} model`, [
325
296
  ["Name", pc.bold(model.label)],
package/src/process.mjs CHANGED
@@ -21,32 +21,17 @@ export async function computeServerCommand(profile) {
21
21
  const binary = await backendBinaryFor(profile.backend);
22
22
  if (!binary) throw new Error("Server binary not found. Run offgrid-ai interactively to install.");
23
23
 
24
- let argv, extraEnv;
25
-
26
- if (profile.backend === "mlx-vlm") {
27
- const { computeMlxVlmFlags } = await import("./mlx-flags.mjs");
28
- const result = computeMlxVlmFlags(profile.modelPath, {
29
- port: profile.flags?.port,
30
- ctxSize: profile.flags?.ctxSize,
31
- thinkingEnabled: profile.capabilities?.thinking ?? true,
32
- });
33
- argv = result.args;
34
- extraEnv = { APC_ENABLED: "1", MLX_VLM_MAX_TOKENS: "16384" };
35
- } else {
36
- // llama-cpp / llama-cpp-mtp
37
- const { computeFlags } = await import("./autodetect.mjs");
38
- const result = computeFlags(
39
- profile.capabilities ?? {},
40
- profile.modelPath,
41
- profile.mmprojPath,
42
- profile.drafterPath,
43
- profile.flags ?? {},
44
- );
45
- argv = result.argv;
46
- extraEnv = {};
47
- }
24
+ // llama-cpp / llama-cpp-mtp
25
+ const { computeFlags } = await import("./autodetect.mjs");
26
+ const result = computeFlags(
27
+ profile.capabilities ?? {},
28
+ profile.modelPath,
29
+ profile.mmprojPath,
30
+ profile.drafterPath,
31
+ profile.flags ?? {},
32
+ );
48
33
 
49
- return { binary, argv, extraEnv, backend };
34
+ return { binary, argv: result.argv, extraEnv: {}, backend };
50
35
  }
51
36
 
52
37
  /** Build a runnable start.sh script for the profile. */
@@ -132,19 +117,34 @@ async function startLocalServer(profile) {
132
117
  }
133
118
 
134
119
  async function startManagedServer(profile, backend) {
135
- const ready = await serverReady(profile.baseUrl);
136
- if (ready) {
137
- // Already running
138
- } else {
139
- for (let i = 0; i < 60; i++) {
140
- await sleep(2000);
141
- if (await serverReady(profile.baseUrl)) break;
142
- process.stdout.write(".");
143
- }
144
- if (!(await serverReady(profile.baseUrl))) {
145
- throw new Error(`${backend.label} is not responding at ${profile.baseUrl}. Start it and try again.`);
120
+ if (await serverReady(profile.baseUrl)) {
121
+ return writeManagedState(profile, backend);
122
+ }
123
+
124
+ // Try to start the managed server via CLI
125
+ if (backend.id === "omlx") {
126
+ try {
127
+ const { execFile } = await import("node:child_process");
128
+ const { promisify } = await import("node:util");
129
+ await promisify(execFile)("omlx", ["start"], { timeout: 10000 });
130
+ } catch {
131
+ throw new Error(`${backend.label} is not running and could not be auto-started. Install oMLX or run \`omlx start\` manually.`);
146
132
  }
147
133
  }
134
+
135
+ // Wait for it to come up
136
+ for (let i = 0; i < 60; i++) {
137
+ await sleep(2000);
138
+ if (await serverReady(profile.baseUrl)) break;
139
+ process.stdout.write(".");
140
+ }
141
+ if (!(await serverReady(profile.baseUrl))) {
142
+ throw new Error(`${backend.label} is not responding at ${profile.baseUrl}. Start it and try again.`);
143
+ }
144
+ return writeManagedState(profile, backend);
145
+ }
146
+
147
+ async function writeManagedState(profile, backend) {
148
148
  const state = {
149
149
  pid: null,
150
150
  profileId: profile.id,
@@ -180,10 +180,7 @@ export async function stopProfile(profile) {
180
180
  }
181
181
 
182
182
  // Reliably terminate a detached local-server process group: SIGTERM with a
183
- // grace period for graceful shutdown (lets mlx-vlm/llama-server release the
184
- // model), then SIGKILL if still alive. Guarantees the model is unloaded when a
185
- // profile stops — consistent across backends (llama-server exits on SIGTERM;
186
- // mlx-vlm/uvicorn often does not, hence the SIGKILL fallback).
183
+ // grace period for graceful shutdown, then SIGKILL if still alive.
187
184
  async function terminateProcess(pid) {
188
185
  const signalGroup = (sig) => {
189
186
  try { process.kill(-pid, sig); }
@@ -227,8 +224,6 @@ export async function unloadModelFromServer(profile) {
227
224
  const backend = backendFor(profile.backend);
228
225
 
229
226
  if (backend.id === "llama-cpp" || backend.id === "llama-cpp-mtp") {
230
- // llama.cpp unloads when the server process exits; no HTTP unload API exists.
231
- // If offgrid-ai started the server, stopProfile already handled it.
232
227
  return { unloaded: false, backend: backend.id, reason: "stop server to unload" };
233
228
  }
234
229
 
@@ -236,12 +231,6 @@ export async function unloadModelFromServer(profile) {
236
231
  return await unloadOmlxModel(profile);
237
232
  }
238
233
 
239
- if (backend.id === "mlx-vlm") {
240
- // mlx-vlm is a local-server backend — stopProfile handles unload by killing
241
- // the process. No HTTP unload API.
242
- return { unloaded: false, backend: backend.id, reason: "stop server to unload" };
243
- }
244
-
245
234
  return { unloaded: false, backend: backend.id, reason: "unsupported backend" };
246
235
  }
247
236
 
@@ -8,7 +8,6 @@ import { pc, formatBytes, renderRows, renderSection } from "./ui.mjs";
8
8
  import { detectCapabilities } from "./autodetect.mjs";
9
9
  import { matchDrafter } from "./scan.mjs";
10
10
  import { scanGgufModels } from "./scan.mjs";
11
- import { estimateMemoryMb } from "./mlx-flags.mjs";
12
11
  import { capabilitySummary } from "./model-summary.mjs";
13
12
 
14
13
  const execFileAsync = promisify(execFile);
@@ -248,92 +247,4 @@ function samplingSummary(flags) {
248
247
  return `temp ${flags.temperature}, top-p ${flags.topP}, top-k ${flags.topK}`;
249
248
  }
250
249
 
251
- // ── MLX profile configuration ─────────────────────────────────────────────
252
250
 
253
- /**
254
- * Interactive configuration for an mlx-vlm profile.
255
- */
256
- export async function configureMlxProfile(prompt, profile) {
257
- let configured = profile;
258
-
259
- console.log("");
260
- console.log(renderSection("Model setup", renderRows([
261
- ["Model", pc.bold(profile.label)],
262
- ["Detected", mlxDetectionSummary(configured.capabilities)],
263
- ["Context", String(configured.flags.ctxSize) + " tokens"],
264
- ])));
265
- console.log(pc.dim("Larger context windows use more memory. You can edit the profile later if needed.\n"));
266
-
267
- if (configured.capabilities.vision) {
268
- console.log(renderSection("Vision detected", renderRows([
269
- ["Capability", "image / multimodal input"],
270
- ["Note", "mlx-vlm loads vision from the model directory automatically."],
271
- ])));
272
- }
273
-
274
- if (configured.capabilities.thinking) {
275
- console.log("");
276
- console.log(renderSection("Thinking mode", renderRows([
277
- ["Flag", "--enable-thinking"],
278
- ["Default", "on for Qwen 3 / Gemma 4 / DeepSeek-R class models"],
279
- ])));
280
- const useThinking = await prompt.yesNo("Enable thinking mode?", true);
281
- configured = await applyMlxThinkingToggle(configured, useThinking);
282
- }
283
-
284
- const ctxSize = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, 1048576);
285
- configured = applyMlxContextSize(configured, ctxSize);
286
-
287
- console.log("\n" + renderMlxMemoryEstimate(configured));
288
-
289
- console.log("");
290
- console.log(renderSection("Defaults", renderRows([
291
- ["Backend", configured.backend],
292
- ["Endpoint", configured.baseUrl],
293
- ["Context", String(configured.flags.ctxSize) + " tokens"],
294
- ["Thinking", configured.capabilities?.thinking ? "on" : "off"],
295
- ["Vision", configured.capabilities.vision ? "yes" : "no"],
296
- ])));
297
-
298
- if (!(await prompt.yesNo("Save profile with these settings?", true))) return null;
299
- return configured;
300
- }
301
-
302
- async function applyMlxThinkingToggle(profile, enabled) {
303
- if (!profile.capabilities.thinking) return profile;
304
- return {
305
- ...profile,
306
- capabilities: { ...profile.capabilities, thinkingEnabled: enabled },
307
- };
308
- }
309
-
310
- function applyMlxContextSize(profile, ctxSize) {
311
- const flags = { ...profile.flags, ctxSize };
312
- return {
313
- ...profile,
314
- flags,
315
- baseUrl: baseUrlForFlags(flags),
316
- };
317
- }
318
-
319
- function renderMlxMemoryEstimate(profile) {
320
- const modelBytes = profile.modelSizeBytes || 0;
321
- if (!modelBytes) {
322
- return renderSection("Memory estimate", pc.dim("Model size unknown — save the profile to estimate."));
323
- }
324
- const totalMb = estimateMemoryMb(modelBytes);
325
- const overheadBytes = Math.max(0, totalMb * 1024 * 1024 - modelBytes);
326
- return renderSection("Memory estimate", renderRows([
327
- ["Estimated total", pc.bold(`~${formatBytes(totalMb * 1024 * 1024)}`)],
328
- ["Model", formatBytes(modelBytes)],
329
- ["Overhead", `~${formatBytes(overheadBytes)} (KV cache, APC, runtime)`],
330
- ]));
331
- }
332
-
333
- function mlxDetectionSummary(caps) {
334
- const parts = [];
335
- if (caps.architecture) parts.push(caps.architecture);
336
- if (caps.thinking) parts.push("thinking");
337
- if (caps.vision) parts.push("vision");
338
- return parts.length > 0 ? parts.join(" · ") : "standard MLX";
339
- }
package/src/profiles.mjs CHANGED
@@ -2,10 +2,8 @@ import { existsSync } from "node:fs";
2
2
  import { mkdir, readdir, rm, unlink, writeFile, readFile } from "node:fs/promises";
3
3
  import { join } from "node:path";
4
4
  import { PROFILE_DIR, RUN_DIR, LOG_DIR } from "./config.mjs";
5
- import { backendFor, baseUrlForFlags, defaultFlagsForBackend } from "./backends.mjs";
5
+ import { backendFor, baseUrlForFlags, defaultFlagsForBackend, BACKENDS } from "./backends.mjs";
6
6
  import { computeFlags } from "./autodetect.mjs";
7
- import { detectMlxCapabilities, defaultMlxContextLength } from "./mlx-discovery.mjs";
8
- import { detectHardware } from "./hardware.mjs";
9
7
  import { readJson, writeJson } from "./json.mjs";
10
8
 
11
9
  // ── Path helpers ───────────────────────────────────────────────────────────
@@ -42,7 +40,7 @@ export async function loadProfiles() {
42
40
  .filter((e) => e.isDirectory() && existsSync(profileJsonPath(e.name)))
43
41
  .map((e) => e.name)
44
42
  .sort();
45
- return Promise.all(ids.map((id) => readProfile(id)));
43
+ return (await Promise.all(ids.map((id) => readProfile(id)))).filter((p) => BACKENDS[p.backend]);
46
44
  }
47
45
 
48
46
  export async function readProfile(id) {
@@ -152,28 +150,6 @@ export async function createProfileFromModel(model, backendId, drafterPath) {
152
150
  });
153
151
  }
154
152
 
155
- // ── Auto-create profile from a discovered MLX model ────────────────────────
156
-
157
- export async function createProfileFromMlxModel(model) {
158
- const { DEFAULT_PORT } = await import("./mlx-flags.mjs");
159
- const caps = await detectMlxCapabilities(model.filePath);
160
- const ctxSize = defaultMlxContextLength(caps.contextLength, detectHardware().totalRamBytes / (1024 ** 3));
161
- return normalizeProfile({
162
- id: slugFromLabel(model.label),
163
- label: model.label,
164
- backend: "mlx-vlm",
165
- providerId: "mlx-vlm",
166
- modelAlias: model.label,
167
- source: model.source,
168
- modelPath: model.filePath,
169
- mmprojPath: null,
170
- drafterPath: null,
171
- modelSizeBytes: model.sizeBytes,
172
- capabilities: caps,
173
- flags: { host: "127.0.0.1", port: DEFAULT_PORT, ctxSize },
174
- });
175
- }
176
-
177
153
  function summarizeCapabilities(caps) {
178
154
  return {
179
155
  architecture: caps.architecture,
@@ -1,112 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- mlx-vlm server wrapper with strict=False model loading + APC merge fix.
4
-
5
- Two monkey-patches are applied before the server starts:
6
-
7
- 1. strict=False model loading — needed for architectures with shared-KV weight
8
- schemes (e.g. Gemma 4). Most models (Qwen, Llama, Mistral, Phi) load fine
9
- with strict=True — strict=False is a no-op for them.
10
-
11
- 2. BatchRotatingKVCache.merge() shape-mismatch fix — upstream mlx-lm bug
12
- (ml-explore/mlx-lm PR #1116, Blaizzy/mlx-vlm Issue #923). The merge() method
13
- crashes with `ValueError: [broadcast_shapes] Shapes (1,1,28,256) and
14
- (1,1,512,256) cannot be broadcast` when APC merges exact-cache entries with
15
- different fill levels. This affects all sliding-window attention models
16
- (Gemma 4, Mistral, Mixtral). The fix uses explicit slicing instead of
17
- negative indexing to guarantee exactly `l` elements are extracted.
18
-
19
- This patch can be removed once mlx-lm fixes merge() upstream (not fixed in
20
- 0.31.2 or 0.31.3 — the merge() method is identical in both).
21
-
22
- Benchmark finding: mlx-vlm clears Metal cache after every request (GitHub Issue
23
- #999) unless APC_ENABLED=1 is set. The env var is set by the Electron app at
24
- spawn time, not in this wrapper.
25
-
26
- Usage:
27
- python3 mlxvlm-server-wrapper.py --model <path> --host 127.0.0.1 --port <port>
28
- """
29
- import sys
30
-
31
- # ── Patch 1: strict=False model loading ──────────────────────────────────────
32
-
33
- import mlx_vlm.utils as _utils
34
- _orig_load_model = _utils.load_model
35
-
36
- def _patched_load_model(model_path, lazy=False, strict=True, **kwargs):
37
- return _orig_load_model(model_path, lazy=lazy, strict=False, **kwargs)
38
-
39
- _utils.load_model = _patched_load_model
40
-
41
- # ── Patch 2: BatchRotatingKVCache.merge() shape-mismatch fix ──────────────────
42
- #
43
- # Upstream bug: _temporal_order() can return a buffer whose seq dimension differs
44
- # from c.size(). The negative slice [..., -l:, :] then produces a mismatched shape,
45
- # crashing with ValueError: [broadcast_shapes].
46
- #
47
- # Fix: use explicit slicing to extract exactly `l` elements, right-aligning within
48
- # the target slice when the buffer is shorter than `l` (left-padded by zeros from
49
- # the pre-allocated target tensor).
50
-
51
- import mlx.core as mx
52
- from mlx_lm.models import cache as _lm_cache
53
-
54
- _orig_merge = _lm_cache.BatchRotatingKVCache.merge
55
-
56
- @classmethod
57
- def _patched_merge(cls, caches):
58
- if not all(c.max_size == caches[0].max_size for c in caches):
59
- raise ValueError(
60
- "BatchRotatingKVCache can only merge caches with the same maximum size"
61
- )
62
-
63
- offsets = [c.offset for c in caches]
64
- lengths = [c.size() for c in caches]
65
- max_length = max(lengths)
66
-
67
- if max_length == 0:
68
- return cls(caches[0].max_size, [0] * len(caches))
69
-
70
- padding = [max_length - l for l in lengths]
71
- B = len(caches)
72
- H = max(c.keys.shape[1] for c in caches if c.keys is not None)
73
- Dk = max(c.keys.shape[3] for c in caches if c.keys is not None)
74
- Dv = max(c.values.shape[3] for c in caches if c.values is not None)
75
- dt = next(iter(c.keys.dtype for c in caches if c.keys is not None))
76
-
77
- keys = mx.zeros((B, H, max_length, Dk), dtype=dt)
78
- values = mx.zeros((B, H, max_length, Dv), dtype=dt)
79
- for i, (p, l, c) in enumerate(zip(padding, lengths, caches)):
80
- if c.keys is None:
81
- continue
82
- ordered_k = c._temporal_order(c.keys)
83
- ordered_v = c._temporal_order(c.values)
84
- seq_len = ordered_k.shape[2]
85
- if seq_len >= l:
86
- # Normal case: extract the last `l` tokens.
87
- start = seq_len - l
88
- keys[i : i + 1, :, p : p + l] = ordered_k[..., start : start + l, :]
89
- values[i : i + 1, :, p : p + l] = ordered_v[..., start : start + l, :]
90
- else:
91
- # Buffer shorter than l: right-align within the slice (left-padded
92
- # by zeros from the pre-allocated target tensor).
93
- gap = l - seq_len
94
- keys[i : i + 1, :, p + gap : p + l] = ordered_k
95
- values[i : i + 1, :, p + gap : p + l] = ordered_v
96
-
97
- cache = cls(caches[0].max_size, padding)
98
- cache.keys = keys
99
- cache.values = values
100
- cache.offset = mx.array(offsets)
101
- cache._idx = keys.shape[2]
102
- cache._offset = keys.shape[2]
103
-
104
- return cache
105
-
106
- _lm_cache.BatchRotatingKVCache.merge = _patched_merge
107
-
108
- # ── Run the server ────────────────────────────────────────────────────────────
109
- # main() parses sys.argv for --model, --host, --port, etc.
110
- from mlx_vlm.server import main
111
- main()
112
-
package/src/mlx-flags.mjs DELETED
@@ -1,100 +0,0 @@
1
- // mlx-vlm server flag computation — pure functions, no side effects.
2
- // Ported from deprecated-offgrid-desktop/src/main/server-flags.ts (MLX subset).
3
- //
4
- // Benchmark-informed decisions (see sidequests/mlx-backend-benchmark/RESULTS.md):
5
- // - mlx-vlm requires APC_ENABLED=1 env var (86x TTFT improvement) — set at spawn
6
- // time in process.mjs, NOT here (this module only computes args).
7
- // - mlx-vlm uses a strict=False wrapper script for shared-KV architectures
8
- // (Gemma 4-class). Safe for all models — strict=False is a no-op for models
9
- // that load fine with strict=True.
10
- // - mlx-vlm uses --enable-thinking for thinking-mode control.
11
- // - mlx-vlm uses --max-kv-size for the KV cache / context window.
12
- //
13
- // Only the mlx-vlm-relevant logic is ported here. offgrid-ai's existing GGUF
14
- // flag logic (autodetect.mjs / profile-setup.mjs / estimate.mjs) is unchanged.
15
-
16
- import { fileURLToPath } from "node:url";
17
- import { dirname, join } from "node:path";
18
-
19
- const MB = 1024 ** 2;
20
-
21
- /** Default port for the local model server. Matches the desktop's DEFAULT_PORT. */
22
- export const DEFAULT_PORT = 18080;
23
-
24
- /** Resolved path to the bundled strict=False wrapper script (sibling of src/). */
25
- export const MLX_VLM_WRAPPER = join(dirname(fileURLToPath(import.meta.url)), "..", "resources", "mlxvlm-server-wrapper.py");
26
-
27
- /** Overhead multiplier for mlx-vlm: weights × 1.5 (covers KV cache, activations, APC cache; benchmark-validated). */
28
- const MLX_VLM_OVERHEAD_MULTIPLIER = 1.5;
29
-
30
- /** Server process overhead in MB. */
31
- const PROCESS_OVERHEAD_MB = 200;
32
-
33
- /**
34
- * Estimate mlx-vlm memory usage (MB): model weights × 1.5 + process overhead.
35
- *
36
- * The 1.5 multiplier covers KV cache, activations, and APC cache overhead
37
- * (benchmark-validated; see sidequests/mlx-backend-benchmark/RESULTS.md).
38
- * GGUF/llama-server estimation uses the detailed path in estimate.mjs.
39
- *
40
- * @param {number} fileSizeBytes - model size on disk (sum of MLX safetensors).
41
- * @returns {number} estimated memory in MB.
42
- */
43
- export function estimateMemoryMb(fileSizeBytes) {
44
- return Math.round((fileSizeBytes / MB) * MLX_VLM_OVERHEAD_MULTIPLIER + PROCESS_OVERHEAD_MB);
45
- }
46
-
47
- /**
48
- * Compute mlx-vlm server arguments.
49
- *
50
- * mlx-vlm is the MLX-native server (benchmark-validated best throughput + memory
51
- * efficiency on Apple Silicon). Invoked via the strict=False wrapper script for
52
- * compatibility with shared-KV architectures (Gemma 4-class).
53
- *
54
- * The APC_ENABLED=1 env var is MANDATORY but is set at spawn time in
55
- * process.mjs, not in args.
56
- *
57
- * The wrapper script (resources/mlxvlm-server-wrapper.py) applies strict=False
58
- * model loading + the BatchRotatingKVCache.merge() fix, both required for
59
- * shared-KV architectures (Gemma 4-class). It is resolved to a real path via
60
- * MLX_VLM_WRAPPER; there is intentionally no raw-mlx_vlm.server path.
61
- *
62
- * @param {string} modelPath - path to the MLX model directory.
63
- * @param {object} [options]
64
- * @param {number} [options.port] - port (default DEFAULT_PORT).
65
- * @param {number} [options.ctxSize] - context window (passed as --max-kv-size).
66
- * @param {boolean} [options.thinkingEnabled=true] - whether to enable thinking.
67
- * @returns {{ args: string[], port: number }}
68
- */
69
- export function computeMlxVlmFlags(modelPath, options = {}) {
70
- const port = options.port ?? DEFAULT_PORT;
71
- const ctxSize = options.ctxSize;
72
- const thinkingEnabled = options.thinkingEnabled ?? true;
73
-
74
- // The binary is "python3" (resolved by backendBinaryFor in backends.mjs); the
75
- // wrapper path is the first arg.
76
- const args = [
77
- MLX_VLM_WRAPPER,
78
- "--model", modelPath,
79
- "--host", "127.0.0.1",
80
- "--port", String(port),
81
- ];
82
-
83
- if (thinkingEnabled) {
84
- args.push("--enable-thinking");
85
- }
86
-
87
- // Context size: mlx-vlm uses --max-kv-size for the KV cache / context window.
88
- if (ctxSize && ctxSize > 0) {
89
- args.push("--max-kv-size", String(ctxSize));
90
- }
91
-
92
- // Default max output tokens — used when the client doesn't specify max_tokens
93
- // in the request. Pi's OpenAI completions provider never sends max_tokens
94
- // (it doesn't fall back to model.maxTokens like the Anthropic provider does).
95
- // llama-server defaults high; mlx-vlm defaults to 2048 which is too low for
96
- // coding tasks. Set a generous server-side default.
97
- args.push("--max-tokens", "16384");
98
-
99
- return { args, port };
100
- }