npm - @tobilu/qmd - Versions diffs - 2.1.0 → 2.5.2 - Mend

@tobilu/qmd 2.1.0 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

package/CHANGELOG.md +88 -0
package/README.md +3 -0
package/bin/qmd +111 -32
package/dist/ast.d.ts +1 -0
package/dist/ast.js +18 -8
package/dist/bench/bench.d.ts +2 -0
package/dist/bench/bench.js +108 -13
package/dist/bench/score.d.ts +11 -4
package/dist/bench/score.js +34 -13
package/dist/bench/types.d.ts +13 -0
package/dist/cli/qmd.d.ts +26 -0
package/dist/cli/qmd.js +1172 -121
package/dist/collections.d.ts +9 -0
package/dist/collections.js +32 -7
package/dist/db.d.ts +6 -3
package/dist/db.js +1 -1
package/dist/index.d.ts +4 -0
package/dist/index.js +5 -2
package/dist/llm.d.ts +65 -3
package/dist/llm.js +376 -63
package/dist/mcp/server.d.ts +6 -3
package/dist/mcp/server.js +41 -26
package/dist/paths.d.ts +1 -0
package/dist/paths.js +4 -0
package/dist/store.d.ts +92 -17
package/dist/store.js +676 -176
package/package.json +23 -12
package/scripts/build.mjs +29 -0
package/scripts/check-package-grammars.mjs +29 -0
package/scripts/package-smoke.mjs +65 -0
package/scripts/test-all.mjs +27 -0
package/skills/qmd/SKILL.md +203 -0
package/skills/qmd/references/mcp-setup.md +102 -0
package/skills/release/SKILL.md +139 -0
package/skills/release/scripts/install-hooks.sh +38 -0
package/dist/embedded-skills.d.ts +0 -6
package/dist/embedded-skills.js +0 -14

package/dist/llm.js CHANGED Viewed

@@ -3,10 +3,49 @@
  *
  * Provides embeddings, text generation, and reranking using local GGUF models.
  */
-import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
+let nodeLlamaCppImport = null; // cached promise for the lazily imported "node-llama-cpp" module; null until first use
+async function loadNodeLlamaCpp() { // Returns the cached module promise, starting the dynamic import on the first call.
+    nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr(() => import("node-llama-cpp")); // stdout writes are redirected to stderr while the import is pending
+    return nodeLlamaCppImport;
+}
+export function setNodeLlamaCppModuleForTest(module) { // Test hook: installs `module` as the cached node-llama-cpp import, or clears the cache when `module` is falsy.
+    nodeLlamaCppImport = module ? Promise.resolve(module) : null;
+    failedGpuInitModes.clear(); // also reset the module-level GPU-failure cache and the one-shot warning flags
+    noGpuAccelerationWarningShown = false;
+    cpuForcedPrebuiltFallbackWarningShown = false;
+}
+let nativeStdoutRedirectDepth = 0; // count of withNativeStdoutRedirectedToStderr calls currently in flight
+let originalStdoutWrite = null; // bound process.stdout.write, held only while the redirect is installed
+/**
+ * Some node-llama-cpp native build/probe paths write library noise to stdout.
+ * JSON APIs must reserve stdout for machine-readable payloads, so route that
+ * noise to stderr while native llama initialization is in progress.
+ *
+ * Calls may nest or overlap: only the call that finds the depth at zero installs
+ * the redirect, and only the call that brings the depth back to zero restores
+ * the saved writer.
+ * NOTE(review): the redirect replaces process.stdout.write globally, so every
+ * process.stdout.write call made while fn() is pending goes to stderr, not just
+ * native library output.
+ *
+ * @param fn Callback that is invoked and awaited while the redirect is active.
+ * @returns Whatever fn() resolves to; a rejection propagates after the restore step.
+ */
+export async function withNativeStdoutRedirectedToStderr(fn) {
+    if (nativeStdoutRedirectDepth === 0) {
+        originalStdoutWrite = process.stdout.write.bind(process.stdout);
+        process.stdout.write = ((chunk, encodingOrCallback, callback) => {
+            if (typeof encodingOrCallback === "function") { // write(chunk, callback) call form
+                return process.stderr.write(chunk, encodingOrCallback);
+            }
+            return process.stderr.write(chunk, encodingOrCallback, callback);
+        });
+    }
+    nativeStdoutRedirectDepth++;
+    try {
+        return await fn();
+    }
+    finally {
+        nativeStdoutRedirectDepth--;
+        if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) {
+            process.stdout.write = originalStdoutWrite;
+            originalStdoutWrite = null;
+        }
+    }
+}
 import { homedir } from "os";
 import { join } from "path";
-import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
+import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
 // =============================================================================
 // Embedding Formatting Functions
 // =============================================================================
@@ -23,7 +62,7 @@ export function isQwen3EmbeddingModel(modelUri) {
  * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
  */
 export function formatQueryForEmbedding(query, modelUri) {
-    const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+    const uri = modelUri ?? resolveEmbedModel();
     if (isQwen3EmbeddingModel(uri)) {
         return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
     }
@@ -35,7 +74,7 @@ export function formatQueryForEmbedding(query, modelUri) {
  * Qwen3-Embedding encodes documents as raw text without special prefixes.
  */
 export function formatDocForEmbedding(text, title, modelUri) {
-    const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+    const uri = modelUri ?? resolveEmbedModel();
     if (isQwen3EmbeddingModel(uri)) {
         // Qwen3-Embedding: documents are raw text, no task prefix
         return title ? `${title}\n${text}` : text;
@@ -60,6 +99,22 @@ export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5
 export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
 export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
 export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
+export function resolveEmbedModel(config) { // Embedding model URI: config.embed, then QMD_EMBED_MODEL, then DEFAULT_EMBED_MODEL.
+    return config?.embed || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL; // `||` means empty strings fall through to the next source
+}
+export function resolveGenerateModel(config) { // Generation model URI: config.generate, then QMD_GENERATE_MODEL, then DEFAULT_GENERATE_MODEL.
+    return config?.generate || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL; // `||` means empty strings fall through to the next source
+}
+export function resolveRerankModel(config) { // Rerank model URI: config.rerank, then QMD_RERANK_MODEL, then DEFAULT_RERANK_MODEL.
+    return config?.rerank || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL; // `||` means empty strings fall through to the next source
+}
+export function resolveModels(config) { // Resolves all three model URIs at once; returns { embed, generate, rerank }.
+    return {
+        embed: resolveEmbedModel(config),
+        generate: resolveGenerateModel(config),
+        rerank: resolveRerankModel(config),
+    };
+}
 // Local model cache directory
 const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
     ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
@@ -89,6 +144,106 @@ async function getRemoteEtag(ref) {
         return null;
     }
 }
+const GGUF_MAGIC = Buffer.from("GGUF"); // the four bytes inspectGgufFile expects at offset 0 of a model file
+function formatModelFileSize(sizeBytes) { // Renders a byte count as a rounded whole number of KB (bytes / 1024), e.g. "12 KB".
+    return `${(sizeBytes / 1024).toFixed(0)} KB`;
+}
+function printableMagic(header) { // Human-readable form of the header bytes for error messages.
+    const text = header.toString("utf-8");
+    return /^[\x20-\x7e]{1,4}$/.test(text) ? text : `0x${header.toString("hex")}`; // 1-4 printable ASCII chars are shown as text, anything else as 0x-prefixed hex
+}
+/**
+ * Inspect a potential GGUF model file without mutating it.
+ * Used by doctor for early diagnostics and by runtime validation before load.
+ *
+ * Reads at most the first 512 bytes: the first four are compared with the GGUF
+ * magic, and the whole sniff buffer is searched case-insensitively for
+ * "<!doctype" or "<html" to recognise HTML responses.
+ *
+ * @param filePath Path of the file to inspect.
+ * @returns An object with `exists`, `valid`, `kind` ("missing" | "gguf" | "html" |
+ *   "invalid") and `details`. `sizeBytes` is included for every existing file
+ *   (0 if the size could not be read) and `magic` whenever the header was read.
+ *   Errors from stat/open/read are caught and reported as kind "invalid" with
+ *   `exists: true` instead of being thrown.
+ */
+export function inspectGgufFile(filePath) {
+    if (!existsSync(filePath)) {
+        return { exists: false, valid: false, kind: "missing", details: "file does not exist" };
+    }
+    let sizeBytes = 0;
+    try {
+        sizeBytes = statSync(filePath).size;
+        const fd = openSync(filePath, "r");
+        const sniff = Buffer.alloc(512); // zero-filled, so bytes past a short file's end stay 0
+        try {
+            readSync(fd, sniff, 0, 512, 0); // the byte count returned by readSync is not used
+        }
+        finally {
+            closeSync(fd);
+        }
+        const header = sniff.subarray(0, 4);
+        if (header.equals(GGUF_MAGIC)) {
+            return {
+                exists: true,
+                valid: true,
+                kind: "gguf",
+                sizeBytes,
+                magic: "GGUF",
+                details: `valid GGUF (${formatModelFileSize(sizeBytes)})`,
+            };
+        }
+        const magic = printableMagic(header);
+        const text = sniff.toString("utf-8").toLowerCase();
+        const isHtml = text.includes("<!doctype") || text.includes("<html");
+        if (isHtml) {
+            return {
+                exists: true,
+                valid: false,
+                kind: "html",
+                sizeBytes,
+                magic,
+                details: `HTML page, not a GGUF model (${formatModelFileSize(sizeBytes)}); likely proxy/firewall/captive portal response`,
+            };
+        }
+        return {
+            exists: true,
+            valid: false,
+            kind: "invalid",
+            sizeBytes,
+            magic,
+            details: `not valid GGUF (expected magic "GGUF", got "${magic}", ${formatModelFileSize(sizeBytes)})`,
+        };
+    }
+    catch (error) {
+        return {
+            exists: true,
+            valid: false,
+            kind: "invalid",
+            sizeBytes,
+            details: `cannot read model file: ${error instanceof Error ? error.message : String(error)}`,
+        };
+    }
+}
+/**
+ * Validate that a file is actually a GGUF model, not an HTML error page
+ * from a proxy, firewall, or failed download.
+ * Throws a descriptive error if the file is not valid GGUF.
+ *
+ * Returns without doing anything when the file is missing or already valid.
+ * Otherwise the file is unlinked (best effort) before the error is thrown, so a
+ * later run can fetch it again.
+ * NOTE(review): inspectGgufFile reports read failures as kind "invalid" too, so a
+ * file that merely could not be read is also unlinked here, and the error then
+ * shows magic "unknown" — confirm this is intended.
+ * NOTE(review): if filePath can be a user-supplied local file (the HTML error
+ * text suggests QMD_EMBED_MODEL=/path/to/model.gguf), that file would be deleted
+ * as well — confirm against the callers.
+ *
+ * @param filePath Path of the model file to check.
+ * @param modelUri Model URI; used only in the error text.
+ * @throws {Error} When the file exists but is not a GGUF model.
+ */
+function validateGgufFile(filePath, modelUri) {
+    const inspection = inspectGgufFile(filePath);
+    if (!inspection.exists || inspection.valid)
+        return; // let downstream handle missing files
+    // Remove the bad file so the next attempt re-downloads
+    try {
+        unlinkSync(filePath);
+    }
+    catch { /* best effort */ }
+    if (inspection.kind === "html") {
+        throw new Error(`Downloaded model file is an HTML page, not a GGUF model (${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
+            `Something is intercepting the download from huggingface.co (a proxy, firewall, or captive portal).\n\n` +
+            `Model: ${modelUri}\n` +
+            `Path:  ${filePath}\n\n` +
+            `To fix this, either:\n` +
+            `  1. Try a HuggingFace mirror:  HF_ENDPOINT=https://hf-mirror.com qmd embed\n` +
+            `  2. Download the model manually and set the env var, e.g.:\n` +
+            `       QMD_EMBED_MODEL=/path/to/model.gguf qmd embed\n\n` +
+            `Note: 'qmd search' works without any model downloads.`);
+    }
+    throw new Error(`Model file is not valid GGUF (expected magic "GGUF", got "${inspection.magic ?? "unknown"}", file is ${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
+        `Model: ${modelUri}\n` +
+        `Path:  ${filePath}\n\n` +
+        `The file has been removed. Run the command again to re-download.`);
+}
 export async function pullModels(models, options = {}) {
     const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
     if (!existsSync(cacheDir)) {
@@ -129,7 +284,9 @@ export async function pullModels(models, options = {}) {
                 refreshed = true;
             }
         }
+        const { resolveModelFile } = await loadNodeLlamaCpp();
         const path = await resolveModelFile(model, cacheDir);
+        validateGgufFile(path, model);
         const sizeBytes = existsSync(path) ? statSync(path).size : 0;
         if (hfRef && filename) {
             const remoteEtag = await getRemoteEtag(hfRef);
@@ -148,6 +305,58 @@ export async function pullModels(models, options = {}) {
 // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
 const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
+export function resolveParallelismOverride(envValue = process.env.QMD_EMBED_PARALLELISM) { // Parses the parallelism override; undefined means "no usable override".
+    const normalized = envValue?.trim() ?? "";
+    if (!normalized)
+        return undefined; // unset or blank: no override, no warning
+    const parsed = Number(normalized);
+    if (!Number.isInteger(parsed) || parsed < 1) {
+        process.stderr.write(`QMD Warning: invalid QMD_EMBED_PARALLELISM="${envValue}", using automatic parallelism.\n`);
+        return undefined;
+    }
+    return Math.min(8, parsed); // valid overrides are capped at 8
+}
+export function resolveSafeParallelism(options) { // Reads options.envValue, options.platform, options.gpu and options.computed; returns the context count to use.
+    const override = resolveParallelismOverride(options.envValue);
+    if (override !== undefined)
+        return override; // an explicit override wins, including over the Windows CUDA rule below
+    // node-llama-cpp/llama.cpp CUDA on Windows is unstable with multiple
+    // simultaneous contexts (ggml-cuda.cu:98 in #519). Vulkan and CPU do not
+    // show the same failure mode, so only serialize Windows CUDA by default.
+    if ((options.platform ?? process.platform) === "win32" && options.gpu === "cuda") {
+        return 1;
+    }
+    return Math.max(1, options.computed); // never below one context
+}
+export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU, forceCpuValue = process.env.QMD_FORCE_CPU) { // Returns false (CPU), "auto", or one of "metal" | "vulkan" | "cuda".
+    const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
+    if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) {
+        return false; // any non-empty QMD_FORCE_CPU value outside the "off" list forces CPU and takes precedence
+    }
+    const normalized = envValue?.trim().toLowerCase() ?? "";
+    if (!normalized)
+        return "auto";
+    if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized))
+        return false;
+    if (normalized === "metal" || normalized === "vulkan" || normalized === "cuda")
+        return normalized;
+    process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
+    return "auto"; // unrecognised values warn and fall back to auto
+}
+async function disposeWithTimeout(resourceName, dispose, timeoutMs = 1000) { // Waits for dispose() for at most timeoutMs; timeouts and errors become stderr warnings, nothing is thrown.
+    const timeoutPromise = new Promise((resolve) => {
+        setTimeout(() => resolve("timeout"), timeoutMs).unref(); // unref: the pending timer alone does not keep the process alive; NOTE(review): it is never cleared when dispose() wins
+    });
+    try {
+        const result = await Promise.race([dispose(), timeoutPromise]); // on timeout the dispose() promise is left pending, not cancelled
+        if (result === "timeout") {
+            process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
+        }
+    }
+    catch (error) {
+        process.stderr.write(`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`);
+    }
+}
 function resolveExpandContextSize(configValue) {
     if (configValue !== undefined) {
         if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -165,6 +374,12 @@ function resolveExpandContextSize(configValue) {
     }
     return parsed;
 }
+const failedGpuInitModes = new Set(); // GPU modes whose initialization already failed in this process (filled by ensureLlama)
+let noGpuAccelerationWarningShown = false; // one-shot flag for the "no GPU acceleration" warning
+let cpuForcedPrebuiltFallbackWarningShown = false; // one-shot flag for the "CPU-only prebuilt not available" warning
+function isCpuModeRequested() { // True when the environment (QMD_FORCE_CPU / QMD_LLAMA_GPU) currently resolves to CPU mode.
+    return resolveLlamaGpuMode() === false; // re-read on every call, not cached
+}
 export class LlamaCpp {
     _ciMode = !!process.env.CI;
     llama = null;
@@ -189,9 +404,9 @@ export class LlamaCpp {
     // Track disposal state to prevent double-dispose
     disposed = false;
     constructor(config = {}) {
-        this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
-        this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
-        this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
+        this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
+        this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
+        this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
         this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
         this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
         this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
@@ -200,6 +415,12 @@ export class LlamaCpp {
     get embedModelName() {
         return this.embedModelUri;
     }
+    get generateModelName() { // URI of the generation model chosen in the constructor
+        return this.generateModelUri;
+    }
+    get rerankModelName() { // URI of the rerank model chosen in the constructor
+        return this.rerankModelUri;
+    }
     /**
      * Reset the inactivity timer. Called after each model operation.
      * When timer fires, models are unloaded to free memory (if no active sessions).
@@ -292,45 +513,113 @@ export class LlamaCpp {
     /**
      * Initialize the llama instance (lazy)
+     *
+     * Backend choice follows resolveLlamaGpuMode(): a CPU request, or a mode that
+     * already failed in this process, goes straight to the CPU-compatible loader;
+     * otherwise the requested mode is tried and a failure is cached in
+     * failedGpuInitModes before falling back to the CPU-compatible loader.
+     *
+     * NOTE(review): there is no in-flight guard here; concurrent first calls would
+     * each run the initialization — confirm callers serialize.
+     * NOTE(review): the CPU-compatible loader may end up on the "auto" backend and
+     * warns "GPU offloading disabled", but modelLoadOptions only sets gpuLayers: 0
+     * when CPU mode was requested via env — confirm for the GPU-failure path.
+     *
+     * @param allowBuild When false, the first load uses build "never" and
+     *   skipDownload true (getDeviceInfo passes options.allowBuild here). The
+     *   CPU-compatible loader always loads without building.
+     * @returns The cached llama instance, created on the first call.
      */
-    async ensureLlama() {
+    async ensureLlama(allowBuild = true) {
         if (!this.llama) {
-            // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
-            const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
-            const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
-            const loadLlama = async (gpu) => await getLlama({
-                build: "autoAttempt",
+            const gpuMode = resolveLlamaGpuMode();
+            const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
+            const loadLlama = async (gpu, sourceBuildAllowed = allowBuild, buildOverride) => await withNativeStdoutRedirectedToStderr(() => getLlama({
+                // Prefer packaged prebuilt bindings before compiling llama.cpp locally.
+                // node-llama-cpp documents gpu:"auto" as the best default: Metal on
+                // Apple Silicon, CUDA when fully available, Vulkan where available,
+                // then CPU. Use build:"auto" for normal loads and build:"never" for
+                // diagnostic/probe paths that must not compile llama.cpp.
+                build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
                 logLevel: LlamaLogLevel.error,
                 gpu,
-            });
+                progressLogs: false,
+                skipDownload: !sourceBuildAllowed,
+            }));
+            const loadCpuCompatibleLlama = async () => {
+                try {
+                    return await loadLlama(false, false);
+                }
+                catch (err) {
+                    // Some platforms, notably Apple Silicon, ship a Metal prebuilt but no
+                    // CPU-only prebuilt. Do a fast no-build lookup for an actual CPU
+                    // binding first; if it does not exist, use the packaged auto/Metal
+                    // binding and disable model offloading via gpuLayers: 0.
+                    if (!cpuForcedPrebuiltFallbackWarningShown) {
+                        cpuForcedPrebuiltFallbackWarningShown = true;
+                        process.stderr.write(`QMD Warning: CPU-only llama.cpp prebuilt not available (${err instanceof Error ? err.message : String(err)}); using packaged backend with GPU offloading disabled.\n`);
+                    }
+                    return await loadLlama("auto", false);
+                }
+            };
             let llama;
-            if (forceCpu) {
-                llama = await loadLlama(false);
+            if (gpuMode === false) {
+                llama = await loadCpuCompatibleLlama();
+            }
+            else if (failedGpuInitModes.has(gpuMode)) {
+                process.stderr.write(`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`);
+                llama = await loadCpuCompatibleLlama();
             }
             else {
                 try {
-                    llama = await loadLlama("auto");
+                    llama = await loadLlama(gpuMode);
+                    // If node-llama-cpp auto-detection chose CPU, do one no-build pass
+                    // over all OS-valid packaged GPU backends. This preserves the
+                    // documented auto mode for Metal/CUDA/Vulkan while recovering on
+                    // systems where a packaged backend can load but detection is too
+                    // conservative. Never compile during these extra probes.
+                    if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
+                        const candidates = (await getLlamaGpuTypes("allValid"))
+                            .filter((candidate) => candidate !== false && candidate !== "auto");
+                        for (const candidate of candidates) {
+                            if (failedGpuInitModes.has(candidate))
+                                continue;
+                            try {
+                                const gpuLlama = await loadLlama(candidate, false, "never"); // probe only: no build, no download
+                                if (gpuLlama.gpu !== false) {
+                                    await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
+                                    llama = gpuLlama;
+                                    break;
+                                }
+                                await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
+                            }
+                            catch {
+                                failedGpuInitModes.add(candidate);
+                            }
+                        }
+                    }
                 }
                 catch (err) {
-                    // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
-                    // Fall back to CPU so qmd still works.
-                    process.stderr.write(`QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
-                    llama = await loadLlama(false);
+                    // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
+                    // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
+                    // expensive native build/probe attempts in this process.
+                    failedGpuInitModes.add(gpuMode);
+                    process.stderr.write(`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
+                    llama = await loadCpuCompatibleLlama();
                 }
             }
-            if (llama.gpu === false) {
-                process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
+            if (llama.gpu === false && !noGpuAccelerationWarningShown) {
+                noGpuAccelerationWarningShown = true;
+                process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n");
             }
             this.llama = llama;
         }
         return this.llama;
     }
+    isCpuOffloadForced() { // True when the environment currently requests CPU mode (see isCpuModeRequested).
+        return isCpuModeRequested();
+    }
+    modelLoadOptions(modelPath) { // Builds the options object passed to llama.loadModel for the given model path.
+        return {
+            modelPath,
+            ...(this.isCpuOffloadForced() ? { gpuLayers: 0 } : {}), // gpuLayers: 0 is added only when CPU mode is forced
+        };
+    }
     /**
-     * Resolve a model URI to a local path, downloading if needed
+     * Resolve a model URI to a local path, downloading if needed.
+     * Validates the downloaded file is actually a GGUF model (not an HTML error page
+     * from a proxy or firewall).
+     *
+     * @param modelUri Model URI handed to node-llama-cpp's resolveModelFile.
+     * @returns The local model path, after it has passed validateGgufFile.
+     * @throws {Error} From validateGgufFile when the resolved file exists but is not GGUF.
      */
     async resolveModel(modelUri) {
         this.ensureModelCacheDir();
         // resolveModelFile handles HF URIs and downloads to the cache dir
-        return await resolveModelFile(modelUri, this.modelCacheDir);
+        const { resolveModelFile } = await loadNodeLlamaCpp(); // node-llama-cpp is imported lazily
+        const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
+        validateGgufFile(modelPath, modelUri);
+        return modelPath;
     }
     /**
      * Load embedding model (lazy)
@@ -345,7 +634,7 @@ export class LlamaCpp {
         this.embedModelLoadPromise = (async () => {
             const llama = await this.ensureLlama();
             const modelPath = await this.resolveModel(this.embedModelUri);
-            const model = await llama.loadModel({ modelPath });
+            const model = await llama.loadModel(this.modelLoadOptions(modelPath));
             this.embedModel = model;
             // Model loading counts as activity - ping to keep alive
             this.touchActivity();
@@ -369,21 +658,23 @@ export class LlamaCpp {
      */
     async computeParallelism(perContextMB) {
         const llama = await this.ensureLlama();
-        if (llama.gpu) {
+        if (!this.isCpuOffloadForced() && llama.gpu) { // the GPU sizing path is skipped when CPU mode is forced via env
             try {
                 const vram = await llama.getVramState();
                 const freeMB = vram.free / (1024 * 1024);
                 const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
-                return Math.max(1, Math.min(8, maxByVram));
+                const computed = Math.max(1, Math.min(8, maxByVram)); // 25% of free VRAM divided by per-context cost, clamped to 1..8
+                return resolveSafeParallelism({ gpu: llama.gpu, computed }); // applies the env override and the Windows CUDA rule
             }
             catch {
-                return 2;
+                return resolveSafeParallelism({ gpu: llama.gpu, computed: 2 }); // VRAM query failed: start from 2
             }
         }
         // CPU: split cores across contexts. At least 4 threads per context.
         const cores = llama.cpuMathCores || 4;
         const maxContexts = Math.floor(cores / 4);
-        return Math.max(1, Math.min(4, maxContexts));
+        const computed = Math.max(1, Math.min(4, maxContexts)); // clamped to 1..4 contexts on CPU
+        return resolveSafeParallelism({ gpu: false, computed });
     }
     /**
      * Get the number of threads each context should use, given N parallel contexts.
@@ -391,7 +682,7 @@ export class LlamaCpp {
      */
     async threadsPerContext(parallelism) {
         const llama = await this.ensureLlama();
-        if (llama.gpu)
+        if (!this.isCpuOffloadForced() && llama.gpu)
             return 0; // GPU: let the library decide
         const cores = llama.cpuMathCores || 4;
         return Math.max(1, Math.floor(cores / parallelism));
@@ -455,7 +746,7 @@ export class LlamaCpp {
             this.generateModelLoadPromise = (async () => {
                 const llama = await this.ensureLlama();
                 const modelPath = await this.resolveModel(this.generateModelUri);
-                const model = await llama.loadModel({ modelPath });
+                const model = await llama.loadModel(this.modelLoadOptions(modelPath));
                 this.generateModel = model;
                 return model;
             })();
@@ -485,7 +776,7 @@ export class LlamaCpp {
         this.rerankModelLoadPromise = (async () => {
             const llama = await this.ensureLlama();
             const modelPath = await this.resolveModel(this.rerankModelUri);
-            const model = await llama.loadModel({ modelPath });
+            const model = await llama.loadModel(this.modelLoadOptions(modelPath));
             this.rerankModel = model;
             // Model loading counts as activity - ping to keep alive
             this.touchActivity();
@@ -532,7 +823,6 @@ export class LlamaCpp {
                 try {
                     this.rerankContexts.push(await model.createRankingContext({
                         contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
-                        flashAttention: true,
                         ...(threads > 0 ? { threads } : {}),
                     }));
                 }
@@ -596,20 +886,27 @@ export class LlamaCpp {
      * detokenizes back to text if truncation is needed.
      * Returns the (possibly truncated) text and whether truncation occurred.
      */
+    resolveEmbedTokenLimit() { // Token cap for embedding input: the model's trainContextSize clamped to 1..EMBED_CONTEXT_SIZE.
+        const trainedContextSize = this.embedModel?.trainContextSize; // NOTE(review): the JSDoc directly above this method describes truncateToContextSize, not this helper
+        if (typeof trainedContextSize === "number" && Number.isFinite(trainedContextSize) && trainedContextSize > 0) {
+            return Math.max(1, Math.min(LlamaCpp.EMBED_CONTEXT_SIZE, trainedContextSize));
+        }
+        return LlamaCpp.EMBED_CONTEXT_SIZE; // no model loaded, or no usable trained size
+    }
     async truncateToContextSize(text) {
         if (!this.embedModel)
-            return { text, truncated: false };
-        const maxTokens = this.embedModel.trainContextSize;
+            return { text, truncated: false, limit: LlamaCpp.EMBED_CONTEXT_SIZE }; // no model loaded: text is returned unchanged
+        const maxTokens = this.resolveEmbedTokenLimit(); // trained context size capped at EMBED_CONTEXT_SIZE
         if (maxTokens <= 0)
-            return { text, truncated: false };
+            return { text, truncated: false, limit: maxTokens }; // NOTE(review): looks unreachable if EMBED_CONTEXT_SIZE is positive — confirm
         const tokens = this.embedModel.tokenize(text);
         if (tokens.length <= maxTokens)
-            return { text, truncated: false };
+            return { text, truncated: false, limit: maxTokens };
         // Leave a small margin (4 tokens) for BOS/EOS overhead
         const safeLimit = Math.max(1, maxTokens - 4);
         const truncatedTokens = tokens.slice(0, safeLimit);
         const truncatedText = this.embedModel.detokenize(truncatedTokens);
-        return { text: truncatedText, truncated: true };
+        return { text: truncatedText, truncated: true, limit: maxTokens }; // limit reports the token cap, not the truncated length (safeLimit)
     }
     async embed(text, options = {}) {
         // Ping activity at start to keep models alive during this operation
@@ -617,9 +914,9 @@ export class LlamaCpp {
         try {
             const context = await this.ensureEmbedContext();
             // Guard: truncate text that exceeds model context window to prevent GGML crash
-            const { text: safeText, truncated } = await this.truncateToContextSize(text);
+            const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
             if (truncated) {
-                console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
+                console.warn(`⚠ Text truncated to fit embedding context (${limit} tokens)`);
             }
             const embedding = await context.getEmbeddingFor(safeText);
             return {
@@ -652,9 +949,9 @@ export class LlamaCpp {
                 const embeddings = [];
                 for (const text of texts) {
                     try {
-                        const { text: safeText, truncated } = await this.truncateToContextSize(text);
+                        const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
                         if (truncated) {
-                            console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
+                            console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
                         }
                         const embedding = await context.getEmbeddingFor(safeText);
                         this.touchActivity();
@@ -675,9 +972,9 @@ export class LlamaCpp {
                 const results = [];
                 for (const text of chunk) {
                     try {
-                        const { text: safeText, truncated } = await this.truncateToContextSize(text);
+                        const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
                         if (truncated) {
-                            console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
+                            console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
                         }
                         const embedding = await ctx.getEmbeddingFor(safeText);
                         this.touchActivity();
@@ -707,6 +1004,7 @@ export class LlamaCpp {
         // Create fresh context -> sequence -> session for each call
         const context = await this.generateModel.createContext();
         const sequence = context.getSequence();
+        const { LlamaChatSession } = await loadNodeLlamaCpp();
         const session = new LlamaChatSession({ contextSequence: sequence });
         const maxTokens = options.maxTokens ?? 150;
         // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
@@ -776,6 +1074,7 @@ export class LlamaCpp {
             contextSize: this.expandContextSize,
         });
         const sequence = genContext.getSequence();
+        const { LlamaChatSession } = await loadNodeLlamaCpp();
         const session = new LlamaChatSession({ contextSequence: sequence });
         try {
             // Qwen3 recommended settings for non-thinking mode:
@@ -916,11 +1215,12 @@ export class LlamaCpp {
      * Get device/GPU info for status display.
      * Initializes llama if not already done.
      */
-    async getDeviceInfo() {
-        const llama = await this.ensureLlama();
-        const gpuDevices = await llama.getGpuDeviceNames();
+    async getDeviceInfo(options = {}) {
+        const llama = await this.ensureLlama(options.allowBuild ?? true);
+        const cpuForced = this.isCpuOffloadForced();
+        const gpuDevices = cpuForced ? [] : await llama.getGpuDeviceNames();
         let vram;
-        if (llama.gpu) {
+        if (!cpuForced && llama.gpu) {
             try {
                 const state = await llama.getVramState();
                 vram = { total: state.total, used: state.used, free: state.free };
@@ -928,8 +1228,8 @@ export class LlamaCpp {
             catch { /* no vram info */ }
         }
         return {
-            gpu: llama.gpu,
-            gpuOffloading: llama.supportsGpuOffloading,
+            gpu: cpuForced ? false : llama.gpu,
+            gpuOffloading: !cpuForced && llama.supportsGpuOffloading,
             gpuDevices,
             vram,
             cpuCores: llama.cpuMathCores,
@@ -946,21 +1246,34 @@ export class LlamaCpp {
             clearTimeout(this.inactivityTimer);
             this.inactivityTimer = null;
         }
-        // Disposing llama cascades to models and contexts automatically
-        // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
-        // Note: llama.dispose() can hang indefinitely, so we use a timeout
-        if (this.llama) {
-            const disposePromise = this.llama.dispose();
-            const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
-            await Promise.race([disposePromise, timeoutPromise]);
+        // Explicitly dispose in dependency order: contexts first, then models, then llama.
+        // Relying only on llama.dispose() leaves Metal resource sets alive until process
+        // finalization on Apple Silicon, where ggml_metal_device_free can abort after
+        // otherwise-successful CLI output (#368).
+        for (const ctx of this.embedContexts) {
+            await disposeWithTimeout("embedding context", () => ctx.dispose());
         }
-        // Clear references
         this.embedContexts = [];
+        for (const ctx of this.rerankContexts) {
+            await disposeWithTimeout("rerank context", () => ctx.dispose());
+        }
         this.rerankContexts = [];
-        this.embedModel = null;
-        this.generateModel = null;
-        this.rerankModel = null;
-        this.llama = null;
+        if (this.embedModel) {
+            await disposeWithTimeout("embedding model", () => this.embedModel.dispose());
+            this.embedModel = null;
+        }
+        if (this.generateModel) {
+            await disposeWithTimeout("generation model", () => this.generateModel.dispose());
+            this.generateModel = null;
+        }
+        if (this.rerankModel) {
+            await disposeWithTimeout("rerank model", () => this.rerankModel.dispose());
+            this.rerankModel = null;
+        }
+        if (this.llama) {
+            await disposeWithTimeout("llama runtime", () => this.llama.dispose());
+            this.llama = null;
+        }
         // Clear any in-flight load/create promises
         this.embedModelLoadPromise = null;
         this.embedContextsCreatePromise = null;