npm - offgrid-ai - Versions diffs - 0.3.16 → 0.3.17 - Mend

offgrid-ai 0.3.16 → 0.3.17

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (5)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "offgrid-ai",
-  "version": "0.3.16",
+  "version": "0.3.17",
   "description": "Privacy-first CLI for running local LLMs — discover, configure, run, benchmark",
   "author": "Eeshan Srivastava (https://eeshans.com)",
   "type": "module",

package/src/autodetect.mjs CHANGED Viewed

@@ -7,20 +7,26 @@ import { readGgufMetadata } from "./gguf.mjs";
 export function detectCapabilities(modelPath, mmprojPath) {
   const meta = safeReadGgufMetadata(modelPath);
   const name = basename(modelPath).toLowerCase();
+  const pathHints = String(modelPath).toLowerCase();
   // Architecture
   const architecture = meta["general.architecture"] ?? null;
   // Thinking / reasoning mode
   const hasThinkingKwargs = meta["chat_template_kwargs"] !== undefined;
-  const nameHintsThinking = /qwen3|gemma-4|gemma4|deepseek-r[12]/i.test(name);
+  const nameHintsThinking = /qwen3|qwen3\.\d|gemma-4|gemma4|deepseek-r[12]/i.test(pathHints);
   const thinking = hasThinkingKwargs || nameHintsThinking;
+  // Quantization-aware / imatrix quantization hints. These mostly affect
+  // display and defaults transparency; llama-server does not need a QAT flag.
+  const qat = /qat|imatrix|i-?matrix/i.test(pathHints) || Object.keys(meta).some((key) => key.startsWith("quantize.imatrix."));
   // Vision — mmproj present
   const vision = Boolean(mmprojPath && existsSync(mmprojPath));
-  // MTP (multi-token prediction) — detect speculative decoding
-  const mtp = /mtp/i.test(name) || architecture === "qwen3";
+  // MTP (multi-token prediction) — detect speculative decoding.
+  // Do not treat all Qwen models as MTP; require an explicit filename or metadata hint.
+  const mtp = /\bmtp\b|draft-mtp|multi-token/i.test(pathHints) || Object.keys(meta).some((key) => /mtp|draft|speculative/i.test(key));
   // Quantization
   const quant = name.match(/(Q\d_K_[A-Z]+|UD-[A-Z0-9_]+)/i)?.[1] ?? null;
@@ -31,7 +37,7 @@ export function detectCapabilities(modelPath, mmprojPath) {
     : undefined;
   const ctxSize = metaCtx ?? (thinking ? 80000 : 32768);
-  return { architecture, thinking, vision, mtp, quant, metaCtx, ctxSize, meta };
+  return { architecture, thinking, vision, mtp, qat, quant, metaCtx, ctxSize, meta };
 }
 // ── Compute llama-server flags from capabilities ───────────────────────────
@@ -42,7 +48,7 @@ export function computeFlags(capabilities, modelPath, mmprojPath, draftModelPath
   const flags = {
     host: "127.0.0.1",
-    port: 8080,
+    port: mtp ? 8081 : 8080,
     ctxSize: capabilities.ctxSize,
     flashAttention: "on",
     cacheTypeK: isLowMem ? "f16" : "bf16",

package/src/cli.mjs CHANGED Viewed

@@ -14,6 +14,7 @@ import { checkForUpdate, currentPackageVersion, detectInvocation, updateCommand,
 import { removeInstallerPathEntries } from "./shell-path.mjs";
 import { configureLocalProfile } from "./profile-setup.mjs";
 import { buildPrettyCommand } from "./command.mjs";
+import { detectCapabilities } from "./autodetect.mjs";
 // ── Entry point ────────────────────────────────────────────────────────────
@@ -176,10 +177,10 @@ async function modelsCommand(argv) {
 async function modelCommandCenter(catalog) {
   const normalized = normalizeCatalog(catalog);
-  await printModelCatalog(normalized);
+  const items = modelCatalogItems(normalized);
+  await printModelCatalog(normalized, items);
   if (!process.stdin.isTTY) return;
-  const items = modelCatalogItems(normalized);
   if (items.length === 0) return;
   const prompt = createPrompt();
@@ -234,36 +235,54 @@ function normalizeCatalog(catalog) {
   return { profiles, ggufModels, managedModels, newModels, managedItems };
 }
-async function printModelCatalog({ profiles, newModels, managedModels }) {
-  if (profiles.length > 0) {
-    console.log(pc.bold("\nSaved profiles"));
+async function printModelCatalog({ profiles, newModels, managedItems }, items = modelCatalogItems({ profiles, newModels, managedItems })) {
+  const itemNumber = (predicate) => {
+    const index = items.findIndex(predicate);
+    return index === -1 ? "  " : String(index + 1).padStart(2, " ");
+  };
+  console.log(pc.bold("\nSaved profiles"));
+  if (profiles.length === 0) {
+    console.log(pc.dim("  None yet."));
+  } else {
     for (const profile of profiles) {
       const backend = backendFor(profile.backend);
       const colorMap = { "llama-cpp": pc.yellow, "llama-cpp-mtp": pc.blue, "ollama": pc.magenta, "omlx": pc.cyan };
       const running = await isProfileRunning(profile);
       const piConfigured = await hasPiModel(profile);
       const c = colorMap[profile.backend] ?? pc.magenta;
-      console.log(`  ${running ? pc.green("●") : pc.dim("○")} ${pc.bold(profile.label)} ${c(`[${backend.label}]`)} · ${pc.cyan(profile.modelAlias)} ${piConfigured ? pc.green("· Pi synced") : pc.yellow("· Pi not synced")}`);
+      const num = itemNumber((item) => item.type === "profile" && item.profile.id === profile.id);
+      console.log(`${num}. ${running ? pc.green("●") : pc.dim("○")} ${pc.bold(profile.label)} ${c(`[${backend.label}]`)} · ${pc.cyan(profile.modelAlias)} ${piConfigured ? pc.green("· Pi synced") : pc.yellow("· Pi not synced")}`);
     }
-  } else {
-    console.log(pc.bold("\nSaved profiles"));
-    console.log(pc.dim("  None yet."));
   }
-  if (newModels.length > 0) {
-    console.log(pc.bold("\nNew GGUF models"));
+  console.log("");
+  console.log(pc.bold("Downloaded models not set up yet"));
+  if (newModels.length === 0) {
+    console.log(pc.dim("  None. Every downloaded GGUF has a profile."));
+  } else {
     for (const model of newModels.slice(0, 20)) {
-      console.log(`  ${pc.cyan(model.label)} ${pc.dim(model.quant ?? "")} · ${pc.dim(formatBytes(model.sizeBytes))}`);
+      const caps = detectCapabilities(model.path, model.mmprojPath);
+      const num = itemNumber((item) => item.type === "new" && item.model.path === model.path);
+      console.log(`${num}. ${pc.cyan(model.label)} ${capabilityBadges(caps)} ${pc.dim(model.quant ?? "")}`);
+      console.log(`    alias:  ${pc.cyan(model.aliasSuggestion)}`);
+      console.log(`    size:   ${formatBytes(model.sizeBytes)}`);
     }
     if (newModels.length > 20) console.log(pc.dim(`  ... and ${newModels.length - 20} more`));
   }
-  for (const { backendId, models } of managedModels) {
-    if (models.length === 0) continue;
+  for (const backendId of ["ollama", "omlx"]) {
+    const backendItems = managedItems.filter((item) => item.backendId === backendId);
+    if (backendItems.length === 0) continue;
     const be = BACKENDS[backendId];
-    console.log(pc.bold(`\n${be.label} models`));
-    for (const model of models.slice(0, 10)) console.log(`  ${pc.cyan(model.label)}`);
-    if (models.length > 10) console.log(pc.dim(`  ... and ${models.length - 10} more`));
+    console.log("");
+    console.log(pc.bold(`${be.label} models`));
+    for (const { model } of backendItems.slice(0, 10)) {
+      const num = itemNumber((item) => item.type === "managed" && item.backendId === backendId && item.model.id === model.id);
+      console.log(`${num}. ${pc.cyan(model.label)} ${pc.dim(model.quant ?? "")}`);
+      console.log(`    id: ${pc.cyan(model.id)}`);
+    }
+    if (backendItems.length > 10) console.log(pc.dim(`  ... and ${backendItems.length - 10} more`));
   }
 }
@@ -276,17 +295,25 @@ function modelCatalogItems({ profiles, newModels, managedItems }) {
 }
 async function chooseCatalogItem(prompt, items, action) {
-  const allowed = action === "remove" ? items.filter((item) => item.type === "profile") : items;
-  if (allowed.length === 0) {
-    console.log(pc.yellow(action === "remove" ? "No saved profiles to remove." : "No models available."));
+  if (action === "remove" && !items.some((item) => item.type === "profile")) {
+    console.log(pc.yellow("No saved profiles to remove."));
     return null;
   }
-  const selected = await prompt.choice("Select", allowed.map((item, index) => ({
-    value: String(index),
-    label: item.label,
-    hint: item.hint,
-  })), "0");
-  return allowed[Number(selected)];
+  const input = await prompt.text("Select a number", "");
+  if (!input) return null;
+  const index = Number(input) - 1;
+  if (!Number.isInteger(index) || index < 0 || index >= items.length) {
+    console.log(pc.yellow(`No item ${input}.`));
+    return null;
+  }
+  const item = items[index];
+  if (action === "remove" && item.type !== "profile") {
+    console.log(pc.yellow("Only saved profiles can be removed."));
+    return null;
+  }
+  return item;
 }
 async function handleCatalogAction(prompt, action, item) {
@@ -337,6 +364,7 @@ async function printProfileDetails(profile) {
     ["ID", pc.cyan(profile.id)],
     ["Label", pc.bold(profile.label)],
     ["Backend", backend.label],
+    ...(profile.capabilities ? [["Detected", capabilitySummary(profile.capabilities)]] : []),
     ["Endpoint", pc.green(profile.baseUrl)],
     ...(!isManaged ? [
       ["Model", profile.modelPath ?? "unknown"],
@@ -354,8 +382,10 @@ async function printProfileDetails(profile) {
 }
 function printGgufModelDetails(model) {
+  const caps = detectCapabilities(model.path, model.mmprojPath);
   console.log("\n" + renderSection("GGUF model", renderRows([
     ["Label", pc.bold(model.label)],
+    ["Detected", capabilitySummary(caps)],
     ["Model", model.path],
     ["MMProj", model.mmprojPath ?? "none"],
     ["Quant", model.quant ?? "unknown"],
@@ -372,6 +402,26 @@ function printManagedModelDetails(model, backend) {
   ])));
 }
+function capabilitySummary(caps) {
+  const parts = [];
+  if (caps.architecture) parts.push(caps.architecture);
+  if (caps.quant) parts.push(caps.quant);
+  if (caps.mtp) parts.push("MTP");
+  if (caps.qat) parts.push("QAT/imatrix");
+  if (caps.thinking) parts.push("thinking");
+  if (caps.vision) parts.push("vision");
+  return parts.length > 0 ? parts.join(" · ") : "standard GGUF";
+}
+function capabilityBadges(caps) {
+  const badges = [];
+  if (caps.mtp) badges.push(pc.blue("[MTP]"));
+  if (caps.qat) badges.push(pc.green("[QAT]"));
+  if (caps.thinking) badges.push(pc.magenta("[thinking]"));
+  if (caps.vision) badges.push(pc.cyan("[vision]"));
+  return badges.join(" ");
+}
 function createManagedProfile(model, backendId) {
   return normalizeProfile({
     id: model.id.replace(/[^a-z0-9._-]+/gi, "-").toLowerCase(),

package/src/profile-setup.mjs CHANGED Viewed

@@ -8,10 +8,27 @@ const CACHE_CHOICES = [
   { value: "q4_0", label: "q4_0", hint: "lowest memory, quality/speed tradeoff" },
 ];
+const GENERAL_DEFAULTS = {
+  topK: 20,
+  presencePenalty: 1.5,
+  repeatPenalty: 1.0,
+};
+const THINKING_DEFAULTS = {
+  topK: 64,
+  presencePenalty: 0,
+  repeatPenalty: 1.1,
+  chatTemplateKwargs: { enable_thinking: true },
+};
 export async function configureLocalProfile(prompt, profile) {
+  let configured = profile;
+  const caps = profile.capabilities ?? {};
   console.log("");
   console.log(renderSection("Model setup", renderRows([
     ["Model", pc.bold(profile.label)],
+    ["Detected", detectionSummary(caps)],
     ["Context", `${profile.flags.ctxSize.toLocaleString()} tokens`],
     ["KV cache", `${profile.flags.cacheTypeK}/${profile.flags.cacheTypeV}`],
     ["Sampling", samplingSummary(profile.flags)],
@@ -19,13 +36,36 @@ export async function configureLocalProfile(prompt, profile) {
   console.log(pc.dim("Larger context windows use more memory. KV cache precision controls memory used by attention history."));
   console.log(pc.dim("Sampling defaults are shown for transparency; you can edit command.json later if needed.\n"));
-  const ctxSize = await prompt.number("Context window tokens", profile.flags.ctxSize, 1024, 1048576);
-  const cacheTypeK = await prompt.choice("K cache precision", CACHE_CHOICES, profile.flags.cacheTypeK);
-  const cacheTypeV = await prompt.choice("V cache precision", CACHE_CHOICES, profile.flags.cacheTypeV);
-  const configured = applyRuntimeFlagOverrides(profile, { ctxSize, cacheTypeK, cacheTypeV });
+  if (caps.mtp) {
+    console.log(renderSection("Detected MTP", renderRows([
+      ["Backend", "llama.cpp MTP"],
+      ["Port", "8081"],
+      ["Flags", "--spec-type draft-mtp --spec-draft-n-max 2"],
+    ])));
+    const useMtp = await prompt.yesNo("Use MTP speculative decoding flags?", true);
+    configured = useMtp ? applyMtpDefaults(configured) : removeMtpDefaults(configured);
+  }
+  if (caps.thinking || caps.qat) {
+    console.log("");
+    console.log(renderSection(caps.qat ? "Detected QAT / imatrix-style model" : "Detected thinking model", renderRows([
+      ["Defaults", "thinking / loop-safe"],
+      ["Flags", "--top-k 64 --presence-penalty 0 --repeat-penalty 1.1"],
+      ["Template", "--chat-template-kwargs { enable_thinking: true }"],
+    ])));
+    const useThinking = await prompt.yesNo("Use these thinking/QAT-safe defaults?", true);
+    configured = useThinking ? applyThinkingDefaults(configured) : removeThinkingDefaults(configured);
+  }
+  const ctxSize = await prompt.number("Context window tokens", configured.flags.ctxSize, 1024, 1048576);
+  const cacheTypeK = await prompt.choice("K cache precision", CACHE_CHOICES, configured.flags.cacheTypeK);
+  const cacheTypeV = await prompt.choice("V cache precision", CACHE_CHOICES, configured.flags.cacheTypeV);
+  configured = applyRuntimeFlagOverrides(configured, { ctxSize, cacheTypeK, cacheTypeV });
   console.log("");
   console.log(renderSection("Defaults", renderRows([
+    ["Backend", configured.backend],
+    ["Endpoint", configured.baseUrl],
     ["Temperature", configured.flags.temperature],
     ["Top-p", configured.flags.topP],
     ["Top-k", configured.flags.topK],
@@ -41,21 +81,63 @@ export async function configureLocalProfile(prompt, profile) {
 export function applyRuntimeFlagOverrides(profile, overrides) {
   const flags = { ...profile.flags, ...overrides };
-  return {
+  return applyProfileFlags(profile, flags);
+}
+function applyMtpDefaults(profile) {
+  const flags = { ...profile.flags, port: 8081 };
+  return applyProfileFlags({ ...profile, backend: "llama-cpp-mtp", providerId: "llama-cpp-mtp" }, flags, {
+    values: { "--spec-type": "draft-mtp", "--spec-draft-n-max": 2 },
+  });
+}
+function removeMtpDefaults(profile) {
+  const flags = { ...profile.flags, port: 8080 };
+  return applyProfileFlags({ ...profile, backend: "llama-cpp", providerId: "llama-cpp" }, flags, {
+    remove: ["--spec-type", "--spec-draft-n-max"],
+  });
+}
+function applyThinkingDefaults(profile) {
+  const flags = { ...profile.flags, ...THINKING_DEFAULTS };
+  return applyProfileFlags(profile, flags);
+}
+function removeThinkingDefaults(profile) {
+  const flags = { ...profile.flags, ...GENERAL_DEFAULTS };
+  delete flags.chatTemplateKwargs;
+  return applyProfileFlags(profile, flags, { remove: ["--chat-template-kwargs"] });
+}
+function applyProfileFlags(profile, flags, edits = {}) {
+  const next = {
     ...profile,
     flags,
     baseUrl: `http://${flags.host}:${flags.port}/v1`,
-    commandArgv: updateArgv(profile.commandArgv ?? [], {
-      "--ctx-size": flags.ctxSize,
-      "--cache-type-k": flags.cacheTypeK,
-      "--cache-type-v": flags.cacheTypeV,
-    }),
+    harnesses: {
+      ...(profile.harnesses ?? {}),
+      pi: { ...(profile.harnesses?.pi ?? {}), enabled: true, model: `${profile.providerId ?? profile.backend}/${profile.modelAlias ?? profile.id}` },
+    },
   };
+  next.commandArgv = updateArgv(profile.commandArgv ?? [], {
+    "--host": flags.host,
+    "--port": flags.port,
+    "--ctx-size": flags.ctxSize,
+    "--cache-type-k": flags.cacheTypeK,
+    "--cache-type-v": flags.cacheTypeV,
+    "--top-k": flags.topK,
+    "--presence-penalty": flags.presencePenalty,
+    "--repeat-penalty": flags.repeatPenalty,
+    ...(flags.chatTemplateKwargs ? { "--chat-template-kwargs": JSON.stringify(flags.chatTemplateKwargs) } : {}),
+  }, edits);
+  return next;
 }
-function updateArgv(argv, values) {
-  const next = [...argv];
-  for (const [flag, value] of Object.entries(values)) {
+function updateArgv(argv, values, edits = {}) {
+  let next = [...argv];
+  for (const flag of edits.remove ?? []) next = removeOption(next, flag);
+  for (const [flag, value] of Object.entries({ ...values, ...(edits.values ?? {}) })) {
+    if (value === undefined) continue;
     const index = next.indexOf(flag);
     if (index === -1) next.push(flag, String(value));
     else next[index + 1] = String(value);
@@ -63,6 +145,18 @@ function updateArgv(argv, values) {
   return next;
 }
+function removeOption(argv, flag) {
+  const next = [];
+  for (let i = 0; i < argv.length; i++) {
+    if (argv[i] === flag) {
+      if (argv[i + 1] && !argv[i + 1].startsWith("--")) i += 1;
+      continue;
+    }
+    next.push(argv[i]);
+  }
+  return next;
+}
 function renderMemoryEstimate(profile) {
   try {
     const est = estimateMemory(profile.modelPath, profile.mmprojPath, null, profile.flags);
@@ -77,6 +171,17 @@ function renderMemoryEstimate(profile) {
   }
 }
+function detectionSummary(caps) {
+  const parts = [];
+  if (caps.architecture) parts.push(caps.architecture);
+  if (caps.quant) parts.push(caps.quant);
+  if (caps.mtp) parts.push("MTP");
+  if (caps.qat) parts.push("QAT/imatrix");
+  if (caps.thinking) parts.push("thinking");
+  if (caps.vision) parts.push("vision");
+  return parts.length > 0 ? parts.join(" · ") : "standard GGUF";
+}
 function samplingSummary(flags) {
   return `temp ${flags.temperature}, top-p ${flags.topP}, top-k ${flags.topK}`;
 }

package/src/profiles.mjs CHANGED Viewed

@@ -136,25 +136,41 @@ export function normalizeProfile(profile) {
 // ── Auto-create profile from a discovered model ────────────────────────────
-export async function createProfileFromModel(model, backendId = "llama-cpp") {
+export async function createProfileFromModel(model, backendId) {
   const { detectCapabilities } = await import("./autodetect.mjs");
   const caps = detectCapabilities(model.path, model.mmprojPath);
+  const backend = backendId ?? (caps.mtp ? "llama-cpp-mtp" : "llama-cpp");
   const id = slugFromLabel(model.label);
   const { flags, argv } = computeFlags(caps, model.path, model.mmprojPath, null);
   return normalizeProfile({
     id,
     label: model.label,
-    backend: backendId,
+    backend,
+    providerId: backend,
     modelAlias: model.aliasSuggestion,
     modelPath: model.path,
     mmprojPath: model.mmprojPath,
+    capabilities: summarizeCapabilities(caps),
     preset: null, // no presets — auto-detected
     flags,
     commandArgv: argv,
   });
 }
+function summarizeCapabilities(caps) {
+  return {
+    architecture: caps.architecture,
+    thinking: caps.thinking,
+    vision: caps.vision,
+    mtp: caps.mtp,
+    qat: caps.qat,
+    quant: caps.quant,
+    metaCtx: caps.metaCtx,
+    ctxSize: caps.ctxSize,
+  };
+}
 // ── State files (for running servers) ──────────────────────────────────────
 export async function readState(id) {