npm - offgrid-ai - Versions diffs - 0.15.8 → 0.16.0 - Mend

offgrid-ai 0.15.8 → 0.16.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (16)

package/package.json +5 -1
package/src/backends.mjs +4 -29
package/src/commands/main.mjs +3 -5
package/src/commands/models.mjs +1 -15
package/src/commands/onboard.mjs +7 -12
package/src/commands/status.mjs +19 -0
package/src/config.mjs +2 -2
package/src/harness-pi.mjs +2 -2
package/src/mlx-discovery.mjs +8 -282
package/src/model-catalog.mjs +9 -14
package/src/model-presenters.mjs +0 -29
package/src/process.mjs +37 -48
package/src/profile-setup.mjs +0 -89
package/src/profiles.mjs +2 -26
package/resources/mlxvlm-server-wrapper.py +0 -112
package/src/mlx-flags.mjs +0 -100

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "offgrid-ai",
-  "version": "0.15.8",
+  "version": "0.16.0",
   "description": "Privacy-first CLI for running local LLMs — discover, configure, run, benchmark",
   "author": "Eeshan Srivastava (https://eeshans.com)",
   "type": "module",
@@ -61,5 +61,9 @@
     "@eslint/js": "^10.0.1",
     "eslint": "^10.4.1",
     "globals": "^17.6.0"
+  },
+  "allowScripts": {
+    "@google/genai": true,
+    "protobufjs": true
   }
 }

package/src/backends.mjs CHANGED Viewed

@@ -1,8 +1,7 @@
 import { findLlamaServer } from "./config.mjs";
 import { scanGgufModels } from "./scan.mjs";
 import { parseModelName } from "./model-name.mjs";
-import { scanMlxModels, scanOmlxModelSizes, lookupOmlxModelInfo } from "./mlx-discovery.mjs";
-import { DEFAULT_PORT as MLX_VLM_PORT } from "./mlx-flags.mjs";
+import { scanOmlxModelSizes, lookupOmlxModelInfo } from "./mlx-discovery.mjs";
 // ── Backend definitions ────────────────────────────────────────────────────
@@ -54,17 +53,6 @@ export const BACKENDS = {
     needsCommandFile: false,
     scanModels: () => scanOmlxModels(),
   },
-  "mlx-vlm": {
-    id: "mlx-vlm",
-    label: "mlx-vlm",
-    type: "local-server",
-    providerId: "mlx-vlm",
-    defaultHost: LOCAL_HOST,
-    defaultPort: MLX_VLM_PORT,
-    defaultBaseUrl: baseUrlFor({ port: MLX_VLM_PORT }),
-    needsCommandFile: true,
-    scanModels: async () => scanMlxModels(),
-  },
 };
 export function backendFor(backendId) {
@@ -75,10 +63,8 @@ export function backendFor(backendId) {
 export async function backendBinaryFor(backendId) {
   const backend = BACKENDS[backendId ?? "llama-cpp"];
-  if (backend.id === "mlx-vlm") return "python3"; // mlx-vlm spawns via python3 + the strict=False wrapper
   if (backend.type === "managed-server") return null;
-  const discovered = await findLlamaServer();
-  return discovered; // null means "not found — trigger onboarding"
+  return await findLlamaServer();
 }
 export function defaultFlagsForBackend(backendId) {
@@ -96,21 +82,15 @@ async function scanOmlxModels() {
   const body = await response.json();
   if (!Array.isArray(body?.data)) return [];
-  // The oMLX API doesn't return model sizes or publishers — look them up from disk.
   const infoMap = await scanOmlxModelSizes();
-  // The oMLX API can return the same model multiple times with different
-  // ID formats (e.g. "Qwen3.6-35B-A3B-OptiQ-4bit" and
-  // "mlx-community--Qwen3.6-35B-A3B-OptiQ-4bit"). Deduplicate by the
-  // normalized full name (publisher/model with / separator), keeping
-  // the first entry (which has the most complete metadata).
+  // Deduplicate by normalized full name (publisher/model with / separator)
   const seen = new Set();
   const deduped = [];
   for (const model of body.data.filter(isChatOmlxModel)) {
     const info = lookupOmlxModelInfo(model.id, infoMap);
     const hasPublisher = model.id.includes("/") || model.id.includes("--");
     const fullName = (!hasPublisher && info?.publisher) ? `${info.publisher}/${model.id}` : model.id;
-    // Normalize: convert -- separator to / for dedup comparison
     const normalized = fullName.replace(/--/g, "/");
     if (seen.has(normalized)) continue;
     seen.add(normalized);
@@ -137,15 +117,10 @@ async function scanOmlxModels() {
     }).sort((a, b) => a.label.localeCompare(b.label));
 }
-// ── Labels ──────────────────────────────────────────────────────────────
 function isChatOmlxModel(model) {
   if (typeof model?.id !== "string" || !model.id.trim()) return false;
   const type = String(model.type ?? model.model_type ?? "").toLowerCase();
   if (["embedding", "embeddings", "reranker", "tool", "converter", "markitdown"].includes(type)) return false;
   if (Object.hasOwn(model, "max_model_len") && model.max_model_len === null) return false;
   return true;
-}
-// (ollamaLabel and omlxLabel removed — parseModelName in model-name.mjs is the single path)
-// (Ollama backend removed — offgrid-ai now uses llama-server + mlx-vlm + oMLX)
+}

package/src/commands/main.mjs CHANGED Viewed

@@ -1,7 +1,6 @@
 import { findLlamaServer, ensureDirs } from "../config.mjs";
 import { backendFor } from "../backends.mjs";
 import { scanGgufModels } from "../scan.mjs";
-import { scanMlxModels } from "../mlx-discovery.mjs";
 import { loadProfiles } from "../profiles.mjs";
 import { hasPi } from "../harness-pi.mjs";
 import { offerManagedLlamaRuntimeUpdate } from "../runtime.mjs";
@@ -27,10 +26,9 @@ export async function mainFlow() {
   const llamaBinary = await findLlamaServer();
   const { models: ggufModels, drafters } = await scanGgufModels();
   const managedModels = await scanManagedModels();
-  const mlxModels = await scanMlxModels();
   const profiles = await loadProfiles();
   const hasAnyBackend = llamaBinary || managedModels.some((item) => item.status === "ok" && item.models.length > 0);
-  const hasAnyModels = ggufModels.length > 0 || mlxModels.length > 0 || managedModels.some((item) => item.status === "ok" && item.models.length > 0);
+  const hasAnyModels = ggufModels.length > 0 || managedModels.some((item) => item.status === "ok" && item.models.length > 0);
   const piInstalled = await hasPi();
   const needsLlama = ggufModels.length > 0 || profiles.some((profile) => backendFor(profile.backend).type === "local-server");
@@ -58,7 +56,7 @@ export async function mainFlow() {
   if (!process.stdin.isTTY) return await statusCommand();
   startInteractive("offgrid-ai");
-  return await modelCommandCenter({ profiles, ggufModels, managedModels, mlxModels, drafters });
+  return await modelCommandCenter({ profiles, ggufModels, managedModels, drafters });
 }
 async function printNoModelsHelp(llamaBinary) {
@@ -86,4 +84,4 @@ async function printNoModelsHelp(llamaBinary) {
     console.log(pc.dim(`  Recommended: ${model.label}`));
   }
   if (omlxInstalled) console.log(pc.bold("  omlx start"));
-}
+}

package/src/commands/models.mjs CHANGED Viewed

@@ -6,7 +6,7 @@ import { syncPiConfig, removeFromPiConfig } from "../harness-pi.mjs";
 import { configureLocalProfile } from "../profile-setup.mjs";
 import { pc, startInteractive, createPrompt, modelSelect } from "../ui.mjs";
 import { buildCatalogItems, createManagedProfile, itemKey, loadModelCatalog, normalizeCatalog } from "../model-catalog.mjs";
-import { modelSelectOption, modelNameWidth, inferBackendId, formatSourceLabel, discoverySourceForItem, printGgufModelDetails, printMlxModelDetails, printManagedModelDetails, printWorkspaceHeader, printBenchmarkLine, printProfileDetails } from "../model-presenters.mjs";
+import { modelSelectOption, modelNameWidth, inferBackendId, formatSourceLabel, discoverySourceForItem, printGgufModelDetails, printManagedModelDetails, printWorkspaceHeader, printBenchmarkLine, printProfileDetails } from "../model-presenters.mjs";
 import { runProfile } from "./run.mjs";
 const { stripVTControlCharacters } = await import("node:util");
@@ -83,7 +83,6 @@ export async function modelCommandCenter(initialCatalog) {
   const groups = [];
   const backendColors = {
-    "mlx-vlm": pc.yellow,
     "llama-cpp": pc.cyan,
     "llama-cpp-mtp": pc.blue,
     omlx: pc.magenta,
@@ -185,7 +184,6 @@ async function performAction(prompt, action, item) {
   if (action === "inspect") {
     if (item.type === "profile") return await printProfileDetails(await readProfile(item.profile.id));
     if (item.type === "managed") return printManagedModelDetails(item.model, BACKENDS[item.backendId]);
-    if (item.model?.format === "mlx") return await printMlxModelDetails(item.model);
     return printGgufModelDetails(item.model, item.drafter);
   }
   if (action === "benchmark") {
@@ -225,18 +223,6 @@ async function setupItem(prompt, item, action) {
     printProfileSaved(profile.id);
     return;
   }
-  // MLX models: build a mlx-vlm profile and run interactive config.
-  if (item.model.format === "mlx") {
-    const { createProfileFromMlxModel } = await import("../profiles.mjs");
-    const { configureMlxProfile } = await import("../profile-setup.mjs");
-    const profile = await createProfileFromMlxModel(item.model);
-    const configured = await configureMlxProfile(prompt, profile);
-    if (!configured) return;
-    await saveProfile(configured, { writeCommand: true });
-    await syncPiConfig(configured);
-    printProfileSaved(configured.id);
-    return;
-  }
   const profile = await createProfileFromModel(item.model, null, item.drafter?.path);
   const configured = await configureLocalProfile(prompt, profile);
   if (!configured) return;

package/src/commands/onboard.mjs CHANGED Viewed

@@ -2,7 +2,6 @@ import { existsSync } from "node:fs";
 import { ensureDirs, findLlamaServer, hasHomebrew, HF_HUB_DIR } from "../config.mjs";
 import { BACKENDS } from "../backends.mjs";
 import { scanGgufModels } from "../scan.mjs";
-import { scanMlxModels } from "../mlx-discovery.mjs";
 import { hasPi } from "../harness-pi.mjs";
 import { offerManagedLlamaRuntimeUpdate } from "../runtime.mjs";
 import { scanManagedModels } from "../managed.mjs";
@@ -27,16 +26,15 @@ export async function onboardFlow() {
     const llamaBinary = await ensureLlamaRuntime(prompt);
     if (!(await ensurePi(prompt, run))) return;
-    const [{ models: ggufModels }, managedModels, mlxModels] = await Promise.all([
+    const [{ models: ggufModels }, managedModels] = await Promise.all([
       scanGgufModels(),
       scanManagedModels(),
-      scanMlxModels(),
     ]);
     const totalManaged = managedModels.reduce((sum, item) => sum + item.models.length, 0);
-    const hasModels = ggufModels.length > 0 || totalManaged > 0 || mlxModels.length > 0;
+    const hasModels = ggufModels.length > 0 || totalManaged > 0;
     if (hasModels) {
-      printFoundModels(ggufModels, managedModels, mlxModels, llamaBinary);
+      printFoundModels(ggufModels, managedModels, llamaBinary);
     } else {
       const canDownload = await hasHuggingfaceHub();
       if (canDownload) {
@@ -96,14 +94,11 @@ async function ensurePi(prompt, run) {
   return true;
 }
-function printFoundModels(ggufModels, managedModels, mlxModels, llamaBinary) {
+function printFoundModels(ggufModels, managedModels, llamaBinary) {
   if (ggufModels.length > 0) {
     console.log(pc.green(`✓ Found ${ggufModels.length} GGUF model${ggufModels.length === 1 ? "" : "s"}`));
     if (!llamaBinary) console.log(pc.yellow("Install the managed llama.cpp runtime to run these GGUF models."));
   }
-  if (mlxModels.length > 0) {
-    console.log(pc.green(`✓ Found ${mlxModels.length} MLX model${mlxModels.length === 1 ? "" : "s"}`));
-  }
   for (const { backendId, models, status, reason } of managedModels) {
     if (status === "unavailable") {
       console.log(pc.yellow(`${BACKENDS[backendId].label}: unavailable${reason ? ` — ${reason}` : ""}`));
@@ -117,7 +112,7 @@ async function offerModelDownload(prompt) {
   const hardware = detectHardware();
   const candidates = allFittingModels(hardware)
     .map((entry) => ({ entry, format: selectFormat(entry, hardware) }))
-    .filter((item) => item.format != null);
+    .filter((item) => item.format === "gguf");
   if (candidates.length === 0) {
     console.log(pc.yellow("No curated models fit your hardware."));
     return false;
@@ -134,7 +129,7 @@ async function offerModelDownload(prompt) {
   const shouldDownload = await prompt.yesNo("Download " + primary.entry.label + " (" + primary.format + ")?", true);
   if (!shouldDownload) return false;
-  const hfRef = primary.format === "mlx" ? primary.entry.mlx : primary.entry.gguf;
+  const hfRef = primary.entry.gguf;
   try {
     const plan = await resolveHfDownload(hfRef);
     console.log(pc.dim("Total size: " + formatBytes(plan.totalSizeBytes)));
@@ -236,4 +231,4 @@ async function installAllBackends(prompt, run, model) {
 async function runInstallerCommands(run, installer) {
   for (const [cmd, args, label] of installer.commands) await run(cmd, args, label);
-}
+}

package/src/commands/status.mjs CHANGED Viewed

@@ -2,6 +2,10 @@ import { ensureDirs } from "../config.mjs";
 import { backendFor } from "../backends.mjs";
 import { loadProfiles } from "../profiles.mjs";
 import { profileRuntimeStatus } from "../process.mjs";
+import { existsSync } from "node:fs";
+import { execFileSync } from "node:child_process";
+import { homedir } from "node:os";
+import { join } from "node:path";
 import { pc, renderRows, renderCard } from "../ui.mjs";
 export async function statusCommand() {
@@ -38,6 +42,21 @@ export async function statusCommand() {
   console.log(renderCard("Status", renderRows(summaryRows), { formatBorder: running.length > 0 ? pc.green : pc.dim }));
+  // Show oMLX cache disk usage if cache exists
+  const omlxCacheDir = join(homedir(), ".omlx", "cache");
+  if (existsSync(omlxCacheDir)) {
+    try {
+      const duOutput = execFileSync("du", ["-sh", omlxCacheDir], { encoding: "utf8" });
+      const cacheSize = duOutput.split(/\s+/)[0];
+      console.log("\n" + renderCard("oMLX cache", renderRows([
+        ["Location", pc.dim(omlxCacheDir)],
+        ["Disk usage", pc.bold(cacheSize)],
+      ]), { formatBorder: pc.magenta }));
+    } catch {
+      // du not available — skip
+    }
+  }
   if (managedUpMissing.length > 0 || managedUpNotLoaded.length > 0) {
     const detailRows = [];
     for (const { profile, status } of [...managedUpMissing, ...managedUpNotLoaded]) {

package/src/config.mjs CHANGED Viewed

@@ -18,8 +18,8 @@ export const MANAGED_LLAMA_SERVER = join(RUNTIME_DIR, "bin", "llama-server");
 // HuggingFace hub cache: $HF_HUB_CACHE, else $HF_HOME/hub, else
 // ~/.cache/huggingface/hub. This is where huggingface_hub stores
 // models--org--name/... and where offgrid-ai scans + downloads. Pointing at the
-// hub (not the HF root) keeps the HF-hub MLX/GGUF scanners and the downloader
-// on the same layout.
+// hub (not the HF root) keeps the GGUF scanner and the downloader on the
+// same layout.
 export const HF_HUB_DIR = process.env.HF_HUB_CACHE
   || (process.env.HF_HOME ? join(process.env.HF_HOME, "hub") : join(homedir(), ".cache", "huggingface", "hub"));

package/src/harness-pi.mjs CHANGED Viewed

@@ -14,7 +14,7 @@ import pc from "picocolors";
  * the repo from HuggingFace. Other backends use the friendly modelAlias.
  */
 export function piApiModelId(profile) {
-  return profile.backend === "mlx-vlm" ? profile.modelPath : profile.modelAlias;
+  return profile.modelAlias;
 }
 // ── Sync Pi config ─────────────────────────────────────────────────────────
@@ -135,7 +135,7 @@ export function modelReasoning(profile) {
 }
 export function modelFamily(profile) {
-  return [profile.id, profile.label, profile.modelAlias, profile.modelPath, profile.omlxModel].filter(Boolean).join(" ").toLowerCase();
+  return [profile.id, profile.label, profile.modelAlias, profile.omlxModel].filter(Boolean).join(" ").toLowerCase();
 }
 function piApiKey() {

package/src/mlx-discovery.mjs CHANGED Viewed

@@ -1,36 +1,14 @@
-// MLX model discovery + metadata — scans configured model directories for MLX
-// model directories and parses their config.json.
-// Ported from deprecated-offgrid-desktop/src/main/model-discovery.ts +
-// mlx-metadata.ts (MLX subset only).
-//
-// This runs ALONGSIDE offgrid-ai's existing GGUF scan (scan.mjs scanGgufModels)
-// — it does not replace it. The picker (main.mjs) will merge GGUF + MLX lists.
-//
-// An MLX model directory is one containing config.json + one or more
-// *.safetensors files. HuggingFace Hub cache layout (models--org--name) is
-// detected and scanned specially.
+// oMLX model size lookup — scans ~/.omlx/models/ for MLX model directories
+// to compute sizes and publishers. The oMLX API doesn't return these, so we
+// read them from disk.
-import { readdir, stat, readFile } from "node:fs/promises";
+import { readdir, stat } from "node:fs/promises";
 import { existsSync } from "node:fs";
-import { join, basename } from "node:path";
+import { join } from "node:path";
 import { homedir } from "node:os";
-import { getModelScanDirs } from "./config.mjs";
-import { inferSourceLabel, MIN_MODEL_SIZE_BYTES, EMBEDDING_MODEL_TYPES } from "./discovery-shared.mjs";
-import { parseModelName } from "./model-name.mjs";
-// ── Folder → backend mapping ──────────────────────────────────────────────
-// The oMLX folder is oMLX-exclusive: models there are served by the oMLX
-// managed backend, NOT by mlx-vlm. Every OTHER scan dir is format-based
-// (GGUF → llama.cpp, MLX → mlx-vlm). So mlx-vlm scans all configured dirs
-// EXCEPT the oMLX folder.
 const OMLX_MODELS_DIR = join(homedir(), ".omlx", "models");
-function isOmlxFolder(p) {
-  return p === OMLX_MODELS_DIR || p.startsWith(OMLX_MODELS_DIR + "/");
-}
-// ── MLX directory detection ───────────────────────────────────────────────
-/** True if dir contains config.json + at least one .safetensors file. */
 async function isMlxModelDir(dir) {
   if (!existsSync(join(dir, "config.json"))) return false;
   try {
@@ -41,7 +19,6 @@ async function isMlxModelDir(dir) {
   }
 }
-/** Sum the size of all .safetensors files in an MLX model dir (bytes). */
 async function getMlxDirSizeBytes(dir) {
   try {
     const entries = await readdir(dir);
@@ -57,259 +34,9 @@ async function getMlxDirSizeBytes(dir) {
   }
 }
-// ── Recursive MLX scanner ─────────────────────────────────────────────────
-/**
- * Recursively scan a directory for MLX model directories.
- * Searches up to maxDepth levels deep. Does NOT collect GGUF (that's scan.mjs).
- */
-async function scanDirRecursiveForMlx(rootDir, sourceLabel, maxDepth = 3) {
-  if (!existsSync(rootDir)) return [];
-  const models = [];
-  async function walk(dir, depth) {
-    if (depth > maxDepth) return;
-    let entries;
-    try {
-      entries = await readdir(dir, { withFileTypes: true });
-    } catch {
-      return;
-    }
-    // Is this directory itself an MLX model dir? (don't recurse into it)
-    if (depth > 0 && await isMlxModelDir(dir)) {
-      const sizeBytes = await getMlxDirSizeBytes(dir);
-      if (sizeBytes < MIN_MODEL_SIZE_BYTES) return;
-      if (await isEmbeddingMlxModel(join(dir, "config.json"))) return;
-      const caps = await detectMlxCapabilities(dir);
-      const { display, quant } = parseModelName(basename(dir), sourceLabel);
-      models.push(makeMlxModel(dir, display, sizeBytes, sourceLabel, rootDir, caps.contextLength, quant));
-      return;
-    }
-    for (const entry of entries) {
-      if (entry.name.startsWith(".") || entry.name === "README.md" || entry.name === ".gitattributes") continue;
-      const fullPath = join(dir, entry.name);
-      if (entry.isDirectory()) {
-        if (await isMlxModelDir(fullPath)) {
-          const sizeBytes = await getMlxDirSizeBytes(fullPath);
-          if (sizeBytes < MIN_MODEL_SIZE_BYTES) continue;
-          if (await isEmbeddingMlxModel(join(fullPath, "config.json"))) continue;
-          const caps = await detectMlxCapabilities(fullPath);
-          // Extract publisher from parent dir (LM Studio: publisher/model-dir)
-          const relParts = fullPath.slice(rootDir.length + 1).split("/");
-          const publisher = (sourceLabel === "lmstudio" && relParts.length >= 2) ? relParts[0] : null;
-          const rawLabel = publisher ? `${publisher}/${entry.name}` : entry.name;
-          const { display, quant } = parseModelName(rawLabel, sourceLabel);
-          models.push(makeMlxModel(fullPath, display, sizeBytes, sourceLabel, rootDir, caps.contextLength, quant));
-        } else {
-          await walk(fullPath, depth + 1);
-        }
-      }
-    }
-  }
-  await walk(rootDir, 0);
-  return models;
-}
-// ── HuggingFace Hub layout ────────────────────────────────────────────────
-/** True if dir looks like an HF Hub cache (has models--* subdirs). */
-async function looksLikeHfHub(dir) {
-  if (!existsSync(dir)) return false;
-  try {
-    const entries = await readdir(dir, { withFileTypes: true });
-    return entries.some((e) => e.isDirectory() && e.name.startsWith("models--"));
-  } catch {
-    return false;
-  }
-}
-/**
- * Scan an HF Hub cache dir for MLX model dirs.
- * HF layout: models--org--name/snapshots/hash/files
- */
-async function scanHfHubForMlx(dir, sourceLabel) {
-  if (!existsSync(dir)) return [];
-  const models = [];
-  try {
-    const entries = await readdir(dir, { withFileTypes: true });
-    for (const entry of entries) {
-      if (!entry.isDirectory() || !entry.name.startsWith("models--")) continue;
-      const parts = entry.name.slice("models--".length).split("--");
-      const label = parts.join("/");
-      const snapshotsDir = join(dir, entry.name, "snapshots");
-      if (!existsSync(snapshotsDir)) continue;
-      const snapshots = await readdir(snapshotsDir, { withFileTypes: true });
-      // Follow symlinks (HF hub uses them; test imports use them too). A model
-      // dir can have several snapshots — some incomplete/empty. Check EACH
-      // snapshot and use the first that is a valid MLX model dir, rather than
-      // giving up on the whole model if the first snapshot happens to be empty.
-      const candidates = snapshots.filter((s) => s.isDirectory() || s.isSymbolicLink());
-      let snapshotPath = null;
-      for (const snap of candidates) {
-        const sp = join(snapshotsDir, snap.name);
-        const st = await stat(sp).catch(() => null);
-        if (st?.isDirectory() && await isMlxModelDir(sp)) { snapshotPath = sp; break; }
-      }
-      if (!snapshotPath) continue;
-      const sizeBytes = await getMlxDirSizeBytes(snapshotPath);
-      if (sizeBytes < MIN_MODEL_SIZE_BYTES) continue;
-      if (await isEmbeddingMlxModel(join(snapshotPath, "config.json"))) continue;
-      const caps = await detectMlxCapabilities(snapshotPath);
-      const { display, quant } = parseModelName(label, sourceLabel);
-      models.push({
-        id: `${sourceLabel}:${entry.name}`,
-        label: display,
-        path: snapshotPath,
-        filePath: snapshotPath,
-        sizeBytes,
-        contextLength: caps.contextLength,
-        quant,
-        backend: "mlx-vlm",
-        format: "mlx",
-        source: sourceLabel,
-      });
-    }
-  } catch {
-    // Can't read — return what we have.
-  }
-  return models;
-}
-// ── Embedding model filtering for MLX ─────────────────────────────────────
-async function isEmbeddingMlxModel(configPath) {
-  if (!existsSync(configPath)) return false;
-  try {
-    const config = JSON.parse(await readFile(configPath, "utf-8"));
-    const textConfig = config.text_config ?? config;
-    const modelType = String(textConfig.model_type ?? "").toLowerCase();
-    if (EMBEDDING_MODEL_TYPES.has(modelType)) return true;
-    const arch = Array.isArray(config.architectures) ? config.architectures[0] : "";
-    const lowerArch = String(arch).toLowerCase();
-    return EMBEDDING_MODEL_TYPES.has(lowerArch) || lowerArch.includes("bert");
-  } catch {
-    return false;
-  }
-}
-// ── MLX model entry builder ───────────────────────────────────────────────
-function makeMlxModel(dir, label, sizeBytes, sourceLabel, rootDir, contextLength = null, quant = null) {
-  return {
-    id: `${sourceLabel}:${dir.replace(rootDir + "/", "")}`,
-    label,
-    path: dir,
-    filePath: dir,
-    sizeBytes,
-    contextLength,
-    quant,
-    backend: "mlx-vlm",
-    format: "mlx",
-    source: sourceLabel,
-  };
-}
-// ── Public API ─────────────────────────────────────────────────────────────
-/**
- * Discover all MLX models across the configured scan directories.
- * Reads scan dirs from config.mjs getModelScanDirs() — same paths GGUF uses
- * (LM Studio, HF hub, user-added). Returns a flat, deduplicated list.
- */
-export async function scanMlxModels(dirs) {
-  // mlx-vlm scans every configured dir EXCEPT the oMLX folder (oMLX-exclusive).
-  const scanDirs = (dirs ?? await getModelScanDirs()).filter((d) => !isOmlxFolder(d));
-  const results = await Promise.all(
-    scanDirs.map(async (dir) => {
-      const label = inferSourceLabel(dir);
-      if (await looksLikeHfHub(dir)) return scanHfHubForMlx(dir, label);
-      return scanDirRecursiveForMlx(dir, label);
-    }),
-  );
-  const all = results.flat();
-  // Deduplicate by filePath (same model may appear in multiple paths).
-  const seen = new Set();
-  return all.filter((m) => {
-    if (seen.has(m.filePath)) return false;
-    seen.add(m.filePath);
-    return true;
-  });
-}
-// ── MLX capability detection ─────────────────────────────────────────────
-/**
- * Detect MLX model capabilities from its config.json.
- * Returns { architecture, thinking, vision, contextLength }.
- */
-export async function detectMlxCapabilities(modelDir) {
-  const configPath = join(modelDir, "config.json");
-  if (!existsSync(configPath)) return { thinking: false, vision: false, contextLength: null, architecture: null };
-  try {
-    const config = JSON.parse(await readFile(configPath, "utf-8"));
-    return detectMlxCapabilitiesFromConfig(config, modelDir);
-  } catch {
-    return { thinking: false, vision: false, contextLength: null, architecture: null };
-  }
-}
-export function detectMlxCapabilitiesFromConfig(config, modelDir) {
-  const textConfig = config.text_config ?? config;
-  const rawName = config._name_or_path ?? basename(modelDir ?? "");
-  const name = String(rawName).toLowerCase();
-  const label = String(rawName);
-  const modelType = String(config.model_type ?? "").toLowerCase();
-  const textModelType = String(textConfig.model_type ?? "").toLowerCase();
-  const vision = Boolean(
-    config.vision_config ||
-    config.image_token_id != null ||
-    config.video_token_id != null ||
-    config.vision_start_token_id != null ||
-    modelType.includes("vl") ||
-    modelType.includes("vision") ||
-    textModelType.includes("vl") ||
-    textModelType.includes("vision")
-  );
-  const thinking = /qwen3|gemma-4|gemma4|deepseek-r[12]/i.test(name + " " + label);
-  const architectures = Array.isArray(config.architectures) ? config.architectures : [];
-  const architecture = architectures[0] ?? null;
-  const candidates = [
-    textConfig.max_position_embeddings,
-    textConfig.sliding_window,
-    config.max_position_embeddings,
-    config.sliding_window,
-  ].filter((v) => typeof v === "number" && v > 0);
-  const contextLength = candidates.length > 0 ? Math.max(...candidates) : null;
-  return { thinking, vision, contextLength, architecture };
-}
-/**
- * Pick a sensible default context length for an MLX model, capping by RAM.
- */
-export function defaultMlxContextLength(trainedCtx, ramGb) {
-  if (!trainedCtx || trainedCtx <= 0) return 8192;
-  if (ramGb < 12) return Math.min(trainedCtx, 4096);
-  if (ramGb < 16) return Math.min(trainedCtx, 8192);
-  if (ramGb < 32) return Math.min(trainedCtx, 16384);
-  return trainedCtx;
-}
-// ── oMLX model size lookup (from disk) ────────────────────────────────────
 /**
- * Scan the oMLX models directory (~/.omlx/models/) for MLX model directories
- * and return a Map of basename → { sizeBytes, publisher }.  The oMLX API
- * doesn't return model sizes or publishers, so we compute them from disk.
+ * Scan ~/.omlx/models/ for MLX model directories and return a Map of
+ * basename → { sizeBytes, publisher }.
  */
 export async function scanOmlxModelSizes() {
   if (!existsSync(OMLX_MODELS_DIR)) return new Map();
@@ -329,7 +56,6 @@ export async function scanOmlxModelSizes() {
         const sizeBytes = await getMlxDirSizeBytes(fullPath);
         if (sizeBytes > 0) infoByBasename.set(entry.name, { sizeBytes, publisher });
       } else {
-        // First-level directories under ~/.omlx/models/ are publishers
         await walk(fullPath, publisher ?? entry.name);
       }
     }
@@ -340,7 +66,7 @@ export async function scanOmlxModelSizes() {
 }
 /**
- * Look up a model's info by its oMLX API id.  Tries exact match, then the
+ * Look up a model's info by its oMLX API id. Tries exact match, then the
  * segment after `--` (oMLX org--name format), then after `/` (HF format).
  */
 export function lookupOmlxModelInfo(modelId, infoMap) {

package/src/model-catalog.mjs CHANGED Viewed

@@ -1,28 +1,23 @@
 import { scanGgufModels, matchDrafter } from "./scan.mjs";
 import { loadProfiles, normalizeProfile, sanitizeProfileId } from "./profiles.mjs";
 import { scanManagedModels } from "./managed.mjs";
-import { scanMlxModels } from "./mlx-discovery.mjs";
 import { isProfileFileMissing } from "./model-summary.mjs";
 import { backendFor } from "./backends.mjs";
 export async function loadModelCatalog() {
-  const [profiles, { models: ggufModels, drafters }, managedModels, mlxModels] = await Promise.all([
+  const [profiles, { models: ggufModels, drafters }, managedModels] = await Promise.all([
     loadProfiles(),
     scanGgufModels(),
     scanManagedModels(),
-    scanMlxModels(),
   ]);
-  return normalizeCatalog({ profiles, ggufModels, drafters, managedModels, mlxModels });
+  return normalizeCatalog({ profiles, ggufModels, drafters, managedModels });
 }
 export function normalizeCatalog(catalog) {
   if (catalog.newModels && catalog.managedItems) return catalog;
-  const { profiles, ggufModels, drafters, managedModels, mlxModels = [] } = catalog;
+  const { profiles, ggufModels, drafters, managedModels } = catalog;
   const profiledPaths = new Set(profiles.map((profile) => profile.modelPath).filter(Boolean));
-  const newModels = [
-    ...ggufModels.filter((model) => !profiledPaths.has(model.path)),
-    ...mlxModels.filter((model) => !profiledPaths.has(model.path)),
-  ];
+  const newModels = ggufModels.filter((model) => !profiledPaths.has(model.path));
   const managedItems = [];
   for (const { backendId, models, status } of managedModels) {
     if (status === "unavailable") continue;
@@ -35,9 +30,10 @@ export function normalizeCatalog(catalog) {
       if (!profiledAliases.has(`${backendId}:${model.id}`)) managedItems.push({ model, backendId });
     }
   }
-  return { profiles, ggufModels, drafters, managedModels, mlxModels, newModels, managedItems };
+  return { profiles, ggufModels, drafters, managedModels, newModels, managedItems };
 }
 export function itemKey(item) {
   if (item.type === "profile") return `profile:${item.profile.id}`;
   if (item.type === "new") return `new:${item.model.path}`;
@@ -57,12 +53,11 @@ function compareRecency(a, b) {
 }
 export function buildCatalogItems(normalized) {
-  const { profiles, newModels, managedItems, drafters, ggufModels = [], mlxModels = [], managedModels = [] } = normalized;
+  const { profiles, newModels, managedItems, drafters, ggufModels = [], managedModels = [] } = normalized;
   // Lookup maps for enriching profile items with scan data (size + context).
   const scanByPath = new Map();
   for (const m of ggufModels) scanByPath.set(m.path, m);
-  for (const m of mlxModels) scanByPath.set(m.filePath ?? m.path, m);
   const managedByKey = new Map();
   for (const { backendId, models } of managedModels) {
@@ -77,7 +72,7 @@ export function buildCatalogItems(normalized) {
     if (profile.modelPath) {
       const scanModel = scanByPath.get(profile.modelPath);
       if (scanModel) {
-        item.label = scanModel.label;  // re-parsed label (publisher/model-name)
+        item.label = scanModel.label;
         if (scanModel.quant) quant = scanModel.quant;
       }
     }
@@ -160,4 +155,4 @@ export function createManagedProfile(model, backendId) {
     modelSizeBytes: model.sizeBytes || 0,
     ...(backendId === "omlx" ? { omlxModel: model.id } : {}),
   });
-}
+}

package/src/model-presenters.mjs CHANGED Viewed

@@ -44,8 +44,6 @@ function optionSourceTag(sourceId) {
     omlx: pc.magenta,
     "llama.cpp": pc.cyan,
     gguf: pc.cyan,
-    mlx: pc.yellow,
-    "mlx-vlm": pc.yellow,
   };
   return optionPad(label, colors[sourceId] ?? pc.dim, OPTION_SOURCE_WIDTH);
 }
@@ -57,7 +55,6 @@ function optionBackendTag(backendId) {
     "llama-cpp": pc.cyan,
     "llama-cpp-mtp": pc.blue,
     omlx: pc.magenta,
-    "mlx-vlm": pc.yellow,
   };
   return optionPad(label, colors[backendId] ?? pc.dim, OPTION_BACKEND_WIDTH);
 }
@@ -70,8 +67,6 @@ export function formatSourceLabel(sourceId) {
     omlx: "oMLX",
     "llama.cpp": "llama.cpp",
     gguf: "GGUF file",
-    mlx: "MLX",
-    "mlx-vlm": "MLX",
   };
   return map[sourceId] ?? String(sourceId);
 }
@@ -200,7 +195,6 @@ export function inferBackendId(item) {
   if (item.type === "profile") return item.profile.backend;
   if (item.type === "managed") return item.backendId;
   // new model: derive from format
-  if (item.model?.format === "mlx") return "mlx-vlm";
   if (item.model?.backend) return item.model.backend;
   return "llama-cpp";
 }
@@ -297,29 +291,6 @@ export function printGgufModelDetails(model, drafter) {
   console.log("\n" + renderSectionRows("Model details", detailRows, { columns: Math.min(process.stdout.columns ?? 110, 140) }));
 }
-export async function printMlxModelDetails(model) {
-  const { detectMlxCapabilities } = await import("./mlx-discovery.mjs");
-  const caps = await detectMlxCapabilities(model.filePath ?? model.path);
-  const parts = [];
-  if (caps.architecture) parts.push(caps.architecture);
-  if (caps.thinking) parts.push("thinking");
-  if (caps.vision) parts.push("vision");
-  const summary = parts.length > 0 ? parts.join(pc.dim(" · ")) : "standard MLX";
-  console.log("\n" + renderSectionRows("Downloaded model", [
-    ["Name", pc.bold(model.label)],
-    ["Status", pc.yellow("Needs one-time setup")],
-    ["Details", summary],
-  ]));
-  console.log("\n" + renderSectionRows("Model details", [
-    ["Model dir", model.path],
-    ["Backend", "mlx-vlm"],
-    ["Source", formatSourceLabel(model.source)],
-    ["Detected", summary],
-    ["Size", formatBytes(model.sizeBytes)],
-    ["Context", caps.contextLength ? `${caps.contextLength.toLocaleString()} trained` : "unknown"],
-  ], { columns: Math.min(process.stdout.columns ?? 110, 140) }));
-}
 export function printManagedModelDetails(model, backend) {
   console.log("\n" + renderSectionRows(`${backend.label} model`, [
     ["Name", pc.bold(model.label)],

package/src/process.mjs CHANGED Viewed

@@ -21,32 +21,17 @@ export async function computeServerCommand(profile) {
   const binary = await backendBinaryFor(profile.backend);
   if (!binary) throw new Error("Server binary not found. Run offgrid-ai interactively to install.");
-  let argv, extraEnv;
-  if (profile.backend === "mlx-vlm") {
-    const { computeMlxVlmFlags } = await import("./mlx-flags.mjs");
-    const result = computeMlxVlmFlags(profile.modelPath, {
-      port: profile.flags?.port,
-      ctxSize: profile.flags?.ctxSize,
-      thinkingEnabled: profile.capabilities?.thinking ?? true,
-    });
-    argv = result.args;
-    extraEnv = { APC_ENABLED: "1", MLX_VLM_MAX_TOKENS: "16384" };
-  } else {
-    // llama-cpp / llama-cpp-mtp
-    const { computeFlags } = await import("./autodetect.mjs");
-    const result = computeFlags(
-      profile.capabilities ?? {},
-      profile.modelPath,
-      profile.mmprojPath,
-      profile.drafterPath,
-      profile.flags ?? {},
-    );
-    argv = result.argv;
-    extraEnv = {};
-  }
+  // llama-cpp / llama-cpp-mtp
+  const { computeFlags } = await import("./autodetect.mjs");
+  const result = computeFlags(
+    profile.capabilities ?? {},
+    profile.modelPath,
+    profile.mmprojPath,
+    profile.drafterPath,
+    profile.flags ?? {},
+  );
-  return { binary, argv, extraEnv, backend };
+  return { binary, argv: result.argv, extraEnv: {}, backend };
 }
 /** Build a runnable start.sh script for the profile. */
@@ -132,19 +117,34 @@ async function startLocalServer(profile) {
 }
 async function startManagedServer(profile, backend) {
-  const ready = await serverReady(profile.baseUrl);
-  if (ready) {
-    // Already running
-  } else {
-    for (let i = 0; i < 60; i++) {
-      await sleep(2000);
-      if (await serverReady(profile.baseUrl)) break;
-      process.stdout.write(".");
-    }
-    if (!(await serverReady(profile.baseUrl))) {
-      throw new Error(`${backend.label} is not responding at ${profile.baseUrl}. Start it and try again.`);
+  if (await serverReady(profile.baseUrl)) {
+    return writeManagedState(profile, backend);
+  }
+  // Try to start the managed server via CLI
+  if (backend.id === "omlx") {
+    try {
+      const { execFile } = await import("node:child_process");
+      const { promisify } = await import("node:util");
+      await promisify(execFile)("omlx", ["start"], { timeout: 10000 });
+    } catch {
+      throw new Error(`${backend.label} is not running and could not be auto-started. Install oMLX or run \`omlx start\` manually.`);
     }
   }
+  // Wait for it to come up
+  for (let i = 0; i < 60; i++) {
+    await sleep(2000);
+    if (await serverReady(profile.baseUrl)) break;
+    process.stdout.write(".");
+  }
+  if (!(await serverReady(profile.baseUrl))) {
+    throw new Error(`${backend.label} is not responding at ${profile.baseUrl}. Start it and try again.`);
+  }
+  return writeManagedState(profile, backend);
+}
+async function writeManagedState(profile, backend) {
   const state = {
     pid: null,
     profileId: profile.id,
@@ -180,10 +180,7 @@ export async function stopProfile(profile) {
 }
 // Reliably terminate a detached local-server process group: SIGTERM with a
-// grace period for graceful shutdown (lets mlx-vlm/llama-server release the
-// model), then SIGKILL if still alive. Guarantees the model is unloaded when a
-// profile stops — consistent across backends (llama-server exits on SIGTERM;
-// mlx-vlm/uvicorn often does not, hence the SIGKILL fallback).
+// grace period for graceful shutdown, then SIGKILL if still alive.
 async function terminateProcess(pid) {
   const signalGroup = (sig) => {
     try { process.kill(-pid, sig); }
@@ -227,8 +224,6 @@ export async function unloadModelFromServer(profile) {
   const backend = backendFor(profile.backend);
   if (backend.id === "llama-cpp" || backend.id === "llama-cpp-mtp") {
-    // llama.cpp unloads when the server process exits; no HTTP unload API exists.
-    // If offgrid-ai started the server, stopProfile already handled it.
     return { unloaded: false, backend: backend.id, reason: "stop server to unload" };
   }
@@ -236,12 +231,6 @@ export async function unloadModelFromServer(profile) {
     return await unloadOmlxModel(profile);
   }
-  if (backend.id === "mlx-vlm") {
-    // mlx-vlm is a local-server backend — stopProfile handles unload by killing
-    // the process. No HTTP unload API.
-    return { unloaded: false, backend: backend.id, reason: "stop server to unload" };
-  }
   return { unloaded: false, backend: backend.id, reason: "unsupported backend" };
 }

package/src/profile-setup.mjs CHANGED Viewed

@@ -8,7 +8,6 @@ import { pc, formatBytes, renderRows, renderSection } from "./ui.mjs";
 import { detectCapabilities } from "./autodetect.mjs";
 import { matchDrafter } from "./scan.mjs";
 import { scanGgufModels } from "./scan.mjs";
-import { estimateMemoryMb } from "./mlx-flags.mjs";
 import { capabilitySummary } from "./model-summary.mjs";
 const execFileAsync = promisify(execFile);
@@ -248,92 +247,4 @@ function samplingSummary(flags) {
   return `temp ${flags.temperature}, top-p ${flags.topP}, top-k ${flags.topK}`;
 }
-// ── MLX profile configuration ─────────────────────────────────────────────
-/**
- * Interactive configuration for an mlx-vlm profile.
- */
-export async function configureMlxProfile(prompt, profile) {
-  let configured = profile;
-  console.log("");
-  console.log(renderSection("Model setup", renderRows([
-    ["Model", pc.bold(profile.label)],
-    ["Detected", mlxDetectionSummary(configured.capabilities)],
-    ["Context", String(configured.flags.ctxSize) + " tokens"],
-  ])));
-  console.log(pc.dim("Larger context windows use more memory. You can edit the profile later if needed.\n"));
-  if (configured.capabilities.vision) {
-    console.log(renderSection("Vision detected", renderRows([
-      ["Capability", "image / multimodal input"],
-      ["Note", "mlx-vlm loads vision from the model directory automatically."],
-    ])));
-  }
-  if (configured.capabilities.thinking) {
-    console.log("");
-    console.log(renderSection("Thinking mode", renderRows([
-      ["Flag", "--enable-thinking"],
-      ["Default", "on for Qwen 3 / Gemma 4 / DeepSeek-R class models"],
-    ])));
-    const useThinking = await prompt.yesNo("Enable thinking mode?", true);
-    configured = await applyMlxThinkingToggle(configured, useThinking);
-  }
-  const ctxSize = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, 1048576);
-  configured = applyMlxContextSize(configured, ctxSize);
-  console.log("\n" + renderMlxMemoryEstimate(configured));
-  console.log("");
-  console.log(renderSection("Defaults", renderRows([
-    ["Backend", configured.backend],
-    ["Endpoint", configured.baseUrl],
-    ["Context", String(configured.flags.ctxSize) + " tokens"],
-    ["Thinking", configured.capabilities?.thinking ? "on" : "off"],
-    ["Vision", configured.capabilities.vision ? "yes" : "no"],
-  ])));
-  if (!(await prompt.yesNo("Save profile with these settings?", true))) return null;
-  return configured;
-}
-async function applyMlxThinkingToggle(profile, enabled) {
-  if (!profile.capabilities.thinking) return profile;
-  return {
-    ...profile,
-    capabilities: { ...profile.capabilities, thinkingEnabled: enabled },
-  };
-}
-function applyMlxContextSize(profile, ctxSize) {
-  const flags = { ...profile.flags, ctxSize };
-  return {
-    ...profile,
-    flags,
-    baseUrl: baseUrlForFlags(flags),
-  };
-}
-function renderMlxMemoryEstimate(profile) {
-  const modelBytes = profile.modelSizeBytes || 0;
-  if (!modelBytes) {
-    return renderSection("Memory estimate", pc.dim("Model size unknown — save the profile to estimate."));
-  }
-  const totalMb = estimateMemoryMb(modelBytes);
-  const overheadBytes = Math.max(0, totalMb * 1024 * 1024 - modelBytes);
-  return renderSection("Memory estimate", renderRows([
-    ["Estimated total", pc.bold(`~${formatBytes(totalMb * 1024 * 1024)}`)],
-    ["Model", formatBytes(modelBytes)],
-    ["Overhead", `~${formatBytes(overheadBytes)} (KV cache, APC, runtime)`],
-  ]));
-}
-function mlxDetectionSummary(caps) {
-  const parts = [];
-  if (caps.architecture) parts.push(caps.architecture);
-  if (caps.thinking) parts.push("thinking");
-  if (caps.vision) parts.push("vision");
-  return parts.length > 0 ? parts.join(" · ") : "standard MLX";
-}

package/src/profiles.mjs CHANGED Viewed

@@ -2,10 +2,8 @@ import { existsSync } from "node:fs";
 import { mkdir, readdir, rm, unlink, writeFile, readFile } from "node:fs/promises";
 import { join } from "node:path";
 import { PROFILE_DIR, RUN_DIR, LOG_DIR } from "./config.mjs";
-import { backendFor, baseUrlForFlags, defaultFlagsForBackend } from "./backends.mjs";
+import { backendFor, baseUrlForFlags, defaultFlagsForBackend, BACKENDS } from "./backends.mjs";
 import { computeFlags } from "./autodetect.mjs";
-import { detectMlxCapabilities, defaultMlxContextLength } from "./mlx-discovery.mjs";
-import { detectHardware } from "./hardware.mjs";
 import { readJson, writeJson } from "./json.mjs";
 // ── Path helpers ───────────────────────────────────────────────────────────
@@ -42,7 +40,7 @@ export async function loadProfiles() {
     .filter((e) => e.isDirectory() && existsSync(profileJsonPath(e.name)))
     .map((e) => e.name)
     .sort();
-  return Promise.all(ids.map((id) => readProfile(id)));
+  return (await Promise.all(ids.map((id) => readProfile(id)))).filter((p) => BACKENDS[p.backend]);
 }
 export async function readProfile(id) {
@@ -152,28 +150,6 @@ export async function createProfileFromModel(model, backendId, drafterPath) {
   });
 }
-// ── Auto-create profile from a discovered MLX model ────────────────────────
-export async function createProfileFromMlxModel(model) {
-  const { DEFAULT_PORT } = await import("./mlx-flags.mjs");
-  const caps = await detectMlxCapabilities(model.filePath);
-  const ctxSize = defaultMlxContextLength(caps.contextLength, detectHardware().totalRamBytes / (1024 ** 3));
-  return normalizeProfile({
-    id: slugFromLabel(model.label),
-    label: model.label,
-    backend: "mlx-vlm",
-    providerId: "mlx-vlm",
-    modelAlias: model.label,
-    source: model.source,
-    modelPath: model.filePath,
-    mmprojPath: null,
-    drafterPath: null,
-    modelSizeBytes: model.sizeBytes,
-    capabilities: caps,
-    flags: { host: "127.0.0.1", port: DEFAULT_PORT, ctxSize },
-  });
-}
 function summarizeCapabilities(caps) {
   return {
     architecture: caps.architecture,

package/resources/mlxvlm-server-wrapper.py DELETED Viewed

@@ -1,112 +0,0 @@
-#!/usr/bin/env python3
-"""
-mlx-vlm server wrapper with strict=False model loading + APC merge fix.
-Two monkey-patches are applied before the server starts:
-1. strict=False model loading — needed for architectures with shared-KV weight
-   schemes (e.g. Gemma 4). Most models (Qwen, Llama, Mistral, Phi) load fine
-   with strict=True — strict=False is a no-op for them.
-2. BatchRotatingKVCache.merge() shape-mismatch fix — upstream mlx-lm bug
-   (ml-explore/mlx-lm PR #1116, Blaizzy/mlx-vlm Issue #923). The merge() method
-   crashes with `ValueError: [broadcast_shapes] Shapes (1,1,28,256) and
-   (1,1,512,256) cannot be broadcast` when APC merges exact-cache entries with
-   different fill levels. This affects all sliding-window attention models
-   (Gemma 4, Mistral, Mixtral). The fix uses explicit slicing instead of
-   negative indexing to guarantee exactly `l` elements are extracted.
-   This patch can be removed once mlx-lm fixes merge() upstream (not fixed in
-   0.31.2 or 0.31.3 — the merge() method is identical in both).
-Benchmark finding: mlx-vlm clears Metal cache after every request (GitHub Issue
-#999) unless APC_ENABLED=1 is set. The env var is set by the Electron app at
-spawn time, not in this wrapper.
-Usage:
-  python3 mlxvlm-server-wrapper.py --model <path> --host 127.0.0.1 --port <port>
-"""
-import sys
-# ── Patch 1: strict=False model loading ──────────────────────────────────────
-import mlx_vlm.utils as _utils
-_orig_load_model = _utils.load_model
-def _patched_load_model(model_path, lazy=False, strict=True, **kwargs):
-    return _orig_load_model(model_path, lazy=lazy, strict=False, **kwargs)
-_utils.load_model = _patched_load_model
-# ── Patch 2: BatchRotatingKVCache.merge() shape-mismatch fix ──────────────────
-#
-# Upstream bug: _temporal_order() can return a buffer whose seq dimension differs
-# from c.size(). The negative slice [..., -l:, :] then produces a mismatched shape,
-# crashing with ValueError: [broadcast_shapes].
-#
-# Fix: use explicit slicing to extract exactly `l` elements, right-aligning within
-# the target slice when the buffer is shorter than `l` (left-padded by zeros from
-# the pre-allocated target tensor).
-import mlx.core as mx
-from mlx_lm.models import cache as _lm_cache
-_orig_merge = _lm_cache.BatchRotatingKVCache.merge
-@classmethod
-def _patched_merge(cls, caches):
-    if not all(c.max_size == caches[0].max_size for c in caches):
-        raise ValueError(
-            "BatchRotatingKVCache can only merge caches with the same maximum size"
-        )
-    offsets = [c.offset for c in caches]
-    lengths = [c.size() for c in caches]
-    max_length = max(lengths)
-    if max_length == 0:
-        return cls(caches[0].max_size, [0] * len(caches))
-    padding = [max_length - l for l in lengths]
-    B = len(caches)
-    H = max(c.keys.shape[1] for c in caches if c.keys is not None)
-    Dk = max(c.keys.shape[3] for c in caches if c.keys is not None)
-    Dv = max(c.values.shape[3] for c in caches if c.values is not None)
-    dt = next(iter(c.keys.dtype for c in caches if c.keys is not None))
-    keys = mx.zeros((B, H, max_length, Dk), dtype=dt)
-    values = mx.zeros((B, H, max_length, Dv), dtype=dt)
-    for i, (p, l, c) in enumerate(zip(padding, lengths, caches)):
-        if c.keys is None:
-            continue
-        ordered_k = c._temporal_order(c.keys)
-        ordered_v = c._temporal_order(c.values)
-        seq_len = ordered_k.shape[2]
-        if seq_len >= l:
-            # Normal case: extract the last `l` tokens.
-            start = seq_len - l
-            keys[i : i + 1, :, p : p + l] = ordered_k[..., start : start + l, :]
-            values[i : i + 1, :, p : p + l] = ordered_v[..., start : start + l, :]
-        else:
-            # Buffer shorter than l: right-align within the slice (left-padded
-            # by zeros from the pre-allocated target tensor).
-            gap = l - seq_len
-            keys[i : i + 1, :, p + gap : p + l] = ordered_k
-            values[i : i + 1, :, p + gap : p + l] = ordered_v
-    cache = cls(caches[0].max_size, padding)
-    cache.keys = keys
-    cache.values = values
-    cache.offset = mx.array(offsets)
-    cache._idx = keys.shape[2]
-    cache._offset = keys.shape[2]
-    return cache
-_lm_cache.BatchRotatingKVCache.merge = _patched_merge
-# ── Run the server ────────────────────────────────────────────────────────────
-# main() parses sys.argv for --model, --host, --port, etc.
-from mlx_vlm.server import main
-main()

package/src/mlx-flags.mjs DELETED Viewed

@@ -1,100 +0,0 @@
-// mlx-vlm server flag computation — pure functions, no side effects.
-// Ported from deprecated-offgrid-desktop/src/main/server-flags.ts (MLX subset).
-//
-// Benchmark-informed decisions (see sidequests/mlx-backend-benchmark/RESULTS.md):
-// - mlx-vlm requires APC_ENABLED=1 env var (86x TTFT improvement) — set at spawn
-//   time in process.mjs, NOT here (this module only computes args).
-// - mlx-vlm uses a strict=False wrapper script for shared-KV architectures
-//   (Gemma 4-class). Safe for all models — strict=False is a no-op for models
-//   that load fine with strict=True.
-// - mlx-vlm uses --enable-thinking for thinking-mode control.
-// - mlx-vlm uses --max-kv-size for the KV cache / context window.
-//
-// Only the mlx-vlm-relevant logic is ported here. offgrid-ai's existing GGUF
-// flag logic (autodetect.mjs / profile-setup.mjs / estimate.mjs) is unchanged.
-import { fileURLToPath } from "node:url";
-import { dirname, join } from "node:path";
-const MB = 1024 ** 2;
-/** Default port for the local model server. Matches the desktop's DEFAULT_PORT. */
-export const DEFAULT_PORT = 18080;
-/** Resolved path to the bundled strict=False wrapper script (sibling of src/). */
-export const MLX_VLM_WRAPPER = join(dirname(fileURLToPath(import.meta.url)), "..", "resources", "mlxvlm-server-wrapper.py");
-/** Overhead multiplier for mlx-vlm: weights × 1.5 (covers KV cache, activations, APC cache; benchmark-validated). */
-const MLX_VLM_OVERHEAD_MULTIPLIER = 1.5;
-/** Server process overhead in MB. */
-const PROCESS_OVERHEAD_MB = 200;
-/**
- * Estimate mlx-vlm memory usage (MB): model weights × 1.5 + process overhead.
- *
- * The 1.5 multiplier covers KV cache, activations, and APC cache overhead
- * (benchmark-validated; see sidequests/mlx-backend-benchmark/RESULTS.md).
- * GGUF/llama-server estimation uses the detailed path in estimate.mjs.
- *
- * @param {number} fileSizeBytes - model size on disk (sum of MLX safetensors).
- * @returns {number} estimated memory in MB.
- */
-export function estimateMemoryMb(fileSizeBytes) {
-  return Math.round((fileSizeBytes / MB) * MLX_VLM_OVERHEAD_MULTIPLIER + PROCESS_OVERHEAD_MB);
-}
-/**
- * Compute mlx-vlm server arguments.
- *
- * mlx-vlm is the MLX-native server (benchmark-validated best throughput + memory
- * efficiency on Apple Silicon). Invoked via the strict=False wrapper script for
- * compatibility with shared-KV architectures (Gemma 4-class).
- *
- * The APC_ENABLED=1 env var is MANDATORY but is set at spawn time in
- * process.mjs, not in args.
- *
- * The wrapper script (resources/mlxvlm-server-wrapper.py) applies strict=False
- * model loading + the BatchRotatingKVCache.merge() fix, both required for
- * shared-KV architectures (Gemma 4-class). It is resolved to a real path via
- * MLX_VLM_WRAPPER; there is intentionally no raw-mlx_vlm.server path.
- *
- * @param {string} modelPath - path to the MLX model directory.
- * @param {object} [options]
- * @param {number} [options.port] - port (default DEFAULT_PORT).
- * @param {number} [options.ctxSize] - context window (passed as --max-kv-size).
- * @param {boolean} [options.thinkingEnabled=true] - whether to enable thinking.
- * @returns {{ args: string[], port: number }}
- */
-export function computeMlxVlmFlags(modelPath, options = {}) {
-  const port = options.port ?? DEFAULT_PORT;
-  const ctxSize = options.ctxSize;
-  const thinkingEnabled = options.thinkingEnabled ?? true;
-  // The binary is "python3" (resolved by backendBinaryFor in backends.mjs); the
-  // wrapper path is the first arg.
-  const args = [
-    MLX_VLM_WRAPPER,
-    "--model", modelPath,
-    "--host", "127.0.0.1",
-    "--port", String(port),
-  ];
-  if (thinkingEnabled) {
-    args.push("--enable-thinking");
-  }
-  // Context size: mlx-vlm uses --max-kv-size for the KV cache / context window.
-  if (ctxSize && ctxSize > 0) {
-    args.push("--max-kv-size", String(ctxSize));
-  }
-  // Default max output tokens — used when the client doesn't specify max_tokens
-  // in the request. Pi's OpenAI completions provider never sends max_tokens
-  // (it doesn't fall back to model.maxTokens like the Anthropic provider does).
-  // llama-server defaults high; mlx-vlm defaults to 2048 which is too low for
-  // coding tasks. Set a generous server-side default.
-  args.push("--max-tokens", "16384");
-  return { args, port };
-}