npm - @tobilu/qmd - Versions diffs - 2.0.1 → 2.5.1 - Mend

@tobilu/qmd 2.0.1 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/CHANGELOG.md +177 -0
package/README.md +64 -1
package/bin/qmd +49 -4
package/dist/ast.d.ts +65 -0
package/dist/ast.js +334 -0
package/dist/bench/bench.d.ts +23 -0
package/dist/bench/bench.js +280 -0
package/dist/bench/score.d.ts +33 -0
package/dist/bench/score.js +88 -0
package/dist/bench/types.d.ts +80 -0
package/dist/bench/types.js +8 -0
package/dist/cli/formatter.js +5 -1
package/dist/cli/qmd.d.ts +27 -0
package/dist/cli/qmd.js +1328 -115
package/dist/collections.d.ts +20 -0
package/dist/collections.js +32 -7
package/dist/db.d.ts +14 -3
package/dist/db.js +45 -4
package/dist/index.d.ts +11 -1
package/dist/index.js +18 -5
package/dist/llm.d.ts +77 -6
package/dist/llm.js +445 -62
package/dist/mcp/server.d.ts +6 -3
package/dist/mcp/server.js +68 -29
package/dist/paths.d.ts +1 -0
package/dist/paths.js +4 -0
package/dist/store.d.ts +148 -23
package/dist/store.js +1018 -255
package/package.json +48 -20
package/scripts/build.mjs +29 -0
package/scripts/check-package-grammars.mjs +29 -0
package/scripts/package-smoke.mjs +65 -0
package/scripts/test-all.mjs +27 -0
package/skills/qmd/SKILL.md +203 -0
package/skills/qmd/references/mcp-setup.md +102 -0
package/skills/release/SKILL.md +139 -0
package/skills/release/scripts/install-hooks.sh +38 -0
package/dist/embedded-skills.d.ts +0 -6
package/dist/embedded-skills.js +0 -14

package/dist/llm.js CHANGED Viewed

@@ -3,10 +3,49 @@
  *
  * Provides embeddings, text generation, and reranking using local GGUF models.
  */
-import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
+let nodeLlamaCppImport = null;
+async function loadNodeLlamaCpp() { // Lazily imports "node-llama-cpp" and returns the cached import promise.
+    nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr(() => import("node-llama-cpp")); // only the first call starts the import (with stdout rerouted to stderr); later calls reuse the same promise
+    return nodeLlamaCppImport;
+}
+export function setNodeLlamaCppModuleForTest(module) { // Test hook: inject a stand-in module, or pass a falsy value to clear the cached import.
+    nodeLlamaCppImport = module ? Promise.resolve(module) : null; // null makes the next loadNodeLlamaCpp() call import again
+    failedGpuInitModes.clear(); // also reset the module-level GPU-failure cache and the one-shot warning flags below
+    noGpuAccelerationWarningShown = false;
+    cpuForcedPrebuiltFallbackWarningShown = false;
+}
+let nativeStdoutRedirectDepth = 0;
+let originalStdoutWrite = null;
+/**
+ * Some node-llama-cpp native build/probe paths write library noise to stdout.
+ * JSON APIs must reserve stdout for machine-readable payloads, so route that
+ * noise to stderr while native llama initialization is in progress.
+ *
+ * Calls may nest: a depth counter ensures only the outermost call swaps
+ * process.stdout.write and only the outermost call restores it.
+ *
+ * @param fn - Callback to run (and await) while the redirect is active.
+ * @returns Whatever fn resolves to; a rejection from fn propagates after the
+ *   finally block has restored state.
+ *
+ * NOTE(review): this only reroutes output that goes through
+ * process.stdout.write; anything written straight to file descriptor 1 by
+ * native code would not be affected - confirm that covers the intended noise.
+ */
+export async function withNativeStdoutRedirectedToStderr(fn) {
+    if (nativeStdoutRedirectDepth === 0) { // outermost call installs the redirect
+        originalStdoutWrite = process.stdout.write.bind(process.stdout);
+        process.stdout.write = ((chunk, encodingOrCallback, callback) => {
+            if (typeof encodingOrCallback === "function") { // write(chunk, callback) call shape
+                return process.stderr.write(chunk, encodingOrCallback);
+            }
+            return process.stderr.write(chunk, encodingOrCallback, callback);
+        });
+    }
+    nativeStdoutRedirectDepth++;
+    try {
+        return await fn();
+    }
+    finally {
+        nativeStdoutRedirectDepth--;
+        if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) { // last active call restores the real writer
+            process.stdout.write = originalStdoutWrite;
+            originalStdoutWrite = null;
+        }
+    }
+}
 import { homedir } from "os";
 import { join } from "path";
-import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
+import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
 // =============================================================================
 // Embedding Formatting Functions
 // =============================================================================
@@ -23,7 +62,7 @@ export function isQwen3EmbeddingModel(modelUri) {
  * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
  */
 export function formatQueryForEmbedding(query, modelUri) {
-    const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+    const uri = modelUri ?? resolveEmbedModel();
     if (isQwen3EmbeddingModel(uri)) {
         return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
     }
@@ -35,7 +74,7 @@ export function formatQueryForEmbedding(query, modelUri) {
  * Qwen3-Embedding encodes documents as raw text without special prefixes.
  */
 export function formatDocForEmbedding(text, title, modelUri) {
-    const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+    const uri = modelUri ?? resolveEmbedModel();
     if (isQwen3EmbeddingModel(uri)) {
         // Qwen3-Embedding: documents are raw text, no task prefix
         return title ? `${title}\n${text}` : text;
@@ -48,7 +87,7 @@ export function formatDocForEmbedding(text, title, modelUri) {
 // HuggingFace model URIs for node-llama-cpp
 // Format: hf:<user>/<repo>/<file>
 // Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
-const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@@ -60,8 +99,26 @@ export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5
 export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
 export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
 export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
+export function resolveEmbedModel(config) { // Embed model precedence: config.embed, then QMD_EMBED_MODEL, then DEFAULT_EMBED_MODEL.
+    return config?.embed || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL; // `||` skips empty strings as well as undefined
+}
+export function resolveGenerateModel(config) { // Generate model precedence: config.generate, then QMD_GENERATE_MODEL, then DEFAULT_GENERATE_MODEL.
+    return config?.generate || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL; // `||` skips empty strings as well as undefined
+}
+export function resolveRerankModel(config) { // Rerank model precedence: config.rerank, then QMD_RERANK_MODEL, then DEFAULT_RERANK_MODEL.
+    return config?.rerank || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL; // `||` skips empty strings as well as undefined
+}
+export function resolveModels(config) { // Resolves all three model URIs at once; config may be omitted.
+    return {
+        embed: resolveEmbedModel(config),
+        generate: resolveGenerateModel(config),
+        rerank: resolveRerankModel(config),
+    };
+}
 // Local model cache directory
-const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
+const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
+    ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
+    : join(homedir(), ".cache", "qmd", "models");
 export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
 function parseHfUri(model) {
     if (!model.startsWith("hf:"))
@@ -87,6 +144,106 @@ async function getRemoteEtag(ref) {
         return null;
     }
 }
+const GGUF_MAGIC = Buffer.from("GGUF");
+function formatModelFileSize(sizeBytes) { // Formats a byte count as a whole number of 1024-byte units labelled "KB".
+    return `${(sizeBytes / 1024).toFixed(0)} KB`; // toFixed(0) rounds to zero decimals
+}
+function printableMagic(header) { // Renders magic bytes for error messages: as text when printable, otherwise as hex.
+    const text = header.toString("utf-8");
+    return /^[\x20-\x7e]{1,4}$/.test(text) ? text : `0x${header.toString("hex")}`; // printable means 1-4 characters in the ASCII range 0x20-0x7e
+}
+/**
+ * Inspect a potential GGUF model file without mutating it.
+ * Used by doctor for early diagnostics and by runtime validation before load.
+ *
+ * Reads up to the first 512 bytes, compares the first four against the
+ * "GGUF" magic, and otherwise looks for HTML markers in the sniffed bytes.
+ *
+ * @param filePath - Path of the file to inspect.
+ * @returns An object with `exists`, `valid`, `kind` ("missing", "gguf",
+ *   "html" or "invalid") and `details`. `sizeBytes` is set for every file
+ *   that exists (0 when stat failed) and `magic` when the header was read.
+ *   Stat/open/read errors are reported in the result rather than thrown.
+ */
+export function inspectGgufFile(filePath) {
+    if (!existsSync(filePath)) {
+        return { exists: false, valid: false, kind: "missing", details: "file does not exist" };
+    }
+    let sizeBytes = 0; // declared outside the try so the catch branch can still report it
+    try {
+        sizeBytes = statSync(filePath).size;
+        const fd = openSync(filePath, "r");
+        const sniff = Buffer.alloc(512); // zero-filled, so a file shorter than 512 bytes leaves trailing NUL bytes
+        try {
+            readSync(fd, sniff, 0, 512, 0); // the byte count returned by readSync is not used
+        }
+        finally {
+            closeSync(fd); // always release the descriptor, even if the read throws
+        }
+        const header = sniff.subarray(0, 4);
+        if (header.equals(GGUF_MAGIC)) {
+            return {
+                exists: true,
+                valid: true,
+                kind: "gguf",
+                sizeBytes,
+                magic: "GGUF",
+                details: `valid GGUF (${formatModelFileSize(sizeBytes)})`,
+            };
+        }
+        const magic = printableMagic(header);
+        const text = sniff.toString("utf-8").toLowerCase(); // lower-cased so the HTML markers match case-insensitively
+        const isHtml = text.includes("<!doctype") || text.includes("<html");
+        if (isHtml) {
+            return {
+                exists: true,
+                valid: false,
+                kind: "html",
+                sizeBytes,
+                magic,
+                details: `HTML page, not a GGUF model (${formatModelFileSize(sizeBytes)}); likely proxy/firewall/captive portal response`,
+            };
+        }
+        return {
+            exists: true,
+            valid: false,
+            kind: "invalid",
+            sizeBytes,
+            magic,
+            details: `not valid GGUF (expected magic "GGUF", got "${magic}", ${formatModelFileSize(sizeBytes)})`,
+        };
+    }
+    catch (error) { // stat/open/read failure: report as invalid instead of throwing
+        return {
+            exists: true,
+            valid: false,
+            kind: "invalid",
+            sizeBytes,
+            details: `cannot read model file: ${error instanceof Error ? error.message : String(error)}`,
+        };
+    }
+}
+/**
+ * Validate that a file is actually a GGUF model, not an HTML error page
+ * from a proxy, firewall, or failed download.
+ * Throws a descriptive error if the file is not valid GGUF.
+ *
+ * Missing files and valid GGUF files return silently. Any other file is
+ * unlinked (best effort) before the error is thrown.
+ *
+ * @param filePath - Local path of the resolved model file.
+ * @param modelUri - Model URI, used only in the error text.
+ * @throws Error when the file exists but is HTML or otherwise not GGUF.
+ *
+ * NOTE(review): the unlink applies to whatever is at filePath. If callers can
+ * pass a user-supplied local path (the error text below suggests
+ * QMD_EMBED_MODEL=/path/to/model.gguf), that file would be removed as well -
+ * confirm this is intended.
+ */
+function validateGgufFile(filePath, modelUri) {
+    const inspection = inspectGgufFile(filePath);
+    if (!inspection.exists || inspection.valid)
+        return; // let downstream handle missing files
+    // Remove the bad file so the next attempt re-downloads
+    try {
+        unlinkSync(filePath);
+    }
+    catch { /* best effort */ }
+    if (inspection.kind === "html") { // HTML gets its own message with network workarounds
+        throw new Error(`Downloaded model file is an HTML page, not a GGUF model (${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
+            `Something is intercepting the download from huggingface.co (a proxy, firewall, or captive portal).\n\n` +
+            `Model: ${modelUri}\n` +
+            `Path:  ${filePath}\n\n` +
+            `To fix this, either:\n` +
+            `  1. Try a HuggingFace mirror:  HF_ENDPOINT=https://hf-mirror.com qmd embed\n` +
+            `  2. Download the model manually and set the env var, e.g.:\n` +
+            `       QMD_EMBED_MODEL=/path/to/model.gguf qmd embed\n\n` +
+            `Note: 'qmd search' works without any model downloads.`);
+    }
+    throw new Error(`Model file is not valid GGUF (expected magic "GGUF", got "${inspection.magic ?? "unknown"}", file is ${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
+        `Model: ${modelUri}\n` +
+        `Path:  ${filePath}\n\n` +
+        `The file has been removed. Run the command again to re-download.`);
+}
 export async function pullModels(models, options = {}) {
     const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
     if (!existsSync(cacheDir)) {
@@ -127,7 +284,9 @@ export async function pullModels(models, options = {}) {
                 refreshed = true;
             }
         }
+        const { resolveModelFile } = await loadNodeLlamaCpp();
         const path = await resolveModelFile(model, cacheDir);
+        validateGgufFile(path, model);
         const sizeBytes = existsSync(path) ? statSync(path).size : 0;
         if (hfRef && filename) {
             const remoteEtag = await getRemoteEtag(hfRef);
@@ -146,6 +305,58 @@ export async function pullModels(models, options = {}) {
 // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
 const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
+export function resolveParallelismOverride(envValue = process.env.QMD_EMBED_PARALLELISM) { // Parses the QMD_EMBED_PARALLELISM override; returns undefined when it is unset or invalid.
+    const normalized = envValue?.trim() ?? "";
+    if (!normalized)
+        return undefined; // unset or whitespace-only: no override and no warning
+    const parsed = Number(normalized);
+    if (!Number.isInteger(parsed) || parsed < 1) { // reject non-integers and values below 1, with a warning on stderr
+        process.stderr.write(`QMD Warning: invalid QMD_EMBED_PARALLELISM="${envValue}", using automatic parallelism.\n`);
+        return undefined;
+    }
+    return Math.min(8, parsed); // valid overrides are capped at 8
+}
+export function resolveSafeParallelism(options) { // Picks the context count: env override first, then 1 for Windows CUDA, else options.computed floored at 1.
+    const override = resolveParallelismOverride(options.envValue); // an undefined envValue falls back to process.env.QMD_EMBED_PARALLELISM via the default parameter
+    if (override !== undefined)
+        return override; // an explicit override wins even on Windows CUDA
+    // node-llama-cpp/llama.cpp CUDA on Windows is unstable with multiple
+    // simultaneous contexts (ggml-cuda.cu:98 in #519). Vulkan and CPU do not
+    // show the same failure mode, so only serialize Windows CUDA by default.
+    if ((options.platform ?? process.platform) === "win32" && options.gpu === "cuda") {
+        return 1;
+    }
+    return Math.max(1, options.computed); // no upper cap is applied here
+}
+export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU, forceCpuValue = process.env.QMD_FORCE_CPU) { // Returns false (CPU), "auto", "metal", "vulkan" or "cuda" from the two env settings.
+    const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
+    if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) { // any non-empty QMD_FORCE_CPU outside this list forces CPU and takes precedence over QMD_LLAMA_GPU
+        return false;
+    }
+    const normalized = envValue?.trim().toLowerCase() ?? "";
+    if (!normalized)
+        return "auto"; // unset or blank QMD_LLAMA_GPU
+    if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized))
+        return false; // explicit opt-out of GPU
+    if (normalized === "metal" || normalized === "vulkan" || normalized === "cuda")
+        return normalized;
+    process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`); // unrecognized value: warn, then behave like "auto"
+    return "auto";
+}
+async function disposeWithTimeout(resourceName, dispose, timeoutMs = 1000) {
+    const timeoutPromise = new Promise((resolve) => {
+        setTimeout(() => resolve("timeout"), timeoutMs).unref();
+    });
+    try {
+        const result = await Promise.race([dispose(), timeoutPromise]);
+        if (result === "timeout") {
+            process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
+        }
+    }
+    catch (error) {
+        process.stderr.write(`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`);
+    }
+}
 function resolveExpandContextSize(configValue) {
     if (configValue !== undefined) {
         if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -163,6 +374,12 @@ function resolveExpandContextSize(configValue) {
     }
     return parsed;
 }
+const failedGpuInitModes = new Set();
+let noGpuAccelerationWarningShown = false;
+let cpuForcedPrebuiltFallbackWarningShown = false;
+function isCpuModeRequested() { // True when the environment resolves to CPU-only mode.
+    return resolveLlamaGpuMode() === false; // re-reads QMD_LLAMA_GPU / QMD_FORCE_CPU on every call
+}
 export class LlamaCpp {
     _ciMode = !!process.env.CI;
     llama = null;
@@ -187,14 +404,23 @@ export class LlamaCpp {
     // Track disposal state to prevent double-dispose
     disposed = false;
     constructor(config = {}) {
-        this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
-        this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
-        this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
+        this.embedModelUri = resolveEmbedModel({ embed: config.embedModel }); // config value, then QMD_EMBED_MODEL, then the built-in default
+        this.generateModelUri = resolveGenerateModel({ generate: config.generateModel }); // config value, then QMD_GENERATE_MODEL, then the built-in default
+        this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel }); // config value, then QMD_RERANK_MODEL, then the built-in default
         this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
         this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
         this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
         this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
     }
+    get embedModelName() { // Read-only view of the embed model URI resolved in the constructor.
+        return this.embedModelUri;
+    }
+    get generateModelName() { // Read-only view of the generate model URI resolved in the constructor.
+        return this.generateModelUri;
+    }
+    get rerankModelName() { // Read-only view of the rerank model URI resolved in the constructor.
+        return this.rerankModelUri;
+    }
     /**
      * Reset the inactivity timer. Called after each model operation.
      * When timer fires, models are unloaded to free memory (if no active sessions).
@@ -287,27 +513,113 @@ export class LlamaCpp {
     /**
      * Initialize the llama instance (lazy)
+     *
+     * The backend comes from resolveLlamaGpuMode(). A mode whose init threw is
+     * recorded in failedGpuInitModes and skipped on later calls.
+     *
+     * @param allowBuild - When false, getLlama is called with build "never"
+     *   and skipDownload true.
+     * @returns The cached llama instance, created on the first call.
+     *
+     * NOTE(review): when a GPU init failure falls through to
+     * loadCpuCompatibleLlama() and no CPU-only binding loads, the retry uses
+     * gpu "auto"; modelLoadOptions() only sets gpuLayers: 0 when CPU mode was
+     * requested via the environment, so the "GPU offloading disabled" warning
+     * may not hold on that path - confirm.
      */
-    async ensureLlama() {
+    async ensureLlama(allowBuild = true) {
         if (!this.llama) {
-            const llama = await getLlama({
-                // attempt to build
-                build: "autoAttempt",
-                logLevel: LlamaLogLevel.error
-            });
-            if (llama.gpu === false) {
-                process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
+            const gpuMode = resolveLlamaGpuMode();
+            const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
+            const loadLlama = async (gpu, sourceBuildAllowed = allowBuild, buildOverride) => await withNativeStdoutRedirectedToStderr(() => getLlama({
+                // Prefer packaged prebuilt bindings before compiling llama.cpp locally.
+                // node-llama-cpp documents gpu:"auto" as the best default: Metal on
+                // Apple Silicon, CUDA when fully available, Vulkan where available,
+                // then CPU. Use build:"auto" for normal loads and build:"never" for
+                // diagnostic/probe paths that must not compile llama.cpp.
+                build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
+                logLevel: LlamaLogLevel.error,
+                gpu,
+                progressLogs: false,
+                skipDownload: !sourceBuildAllowed,
+            }));
+            const loadCpuCompatibleLlama = async () => {
+                try {
+                    return await loadLlama(false, false);
+                }
+                catch (err) {
+                    // Some platforms, notably Apple Silicon, ship a Metal prebuilt but no
+                    // CPU-only prebuilt. Do a fast no-build lookup for an actual CPU
+                    // binding first; if it does not exist, use the packaged auto/Metal
+                    // binding and disable model offloading via gpuLayers: 0.
+                    if (!cpuForcedPrebuiltFallbackWarningShown) {
+                        cpuForcedPrebuiltFallbackWarningShown = true;
+                        process.stderr.write(`QMD Warning: CPU-only llama.cpp prebuilt not available (${err instanceof Error ? err.message : String(err)}); using packaged backend with GPU offloading disabled.\n`);
+                    }
+                    return await loadLlama("auto", false);
+                }
+            };
+            let llama;
+            if (gpuMode === false) { // CPU explicitly requested via env
+                llama = await loadCpuCompatibleLlama();
+            }
+            else if (failedGpuInitModes.has(gpuMode)) { // this mode already threw earlier in this process
+                process.stderr.write(`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`);
+                llama = await loadCpuCompatibleLlama();
+            }
+            else {
+                try {
+                    llama = await loadLlama(gpuMode);
+                    // If node-llama-cpp auto-detection chose CPU, do one no-build pass
+                    // over all OS-valid packaged GPU backends. This preserves the
+                    // documented auto mode for Metal/CUDA/Vulkan while recovering on
+                    // systems where a packaged backend can load but detection is too
+                    // conservative. Never compile during these extra probes.
+                    if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
+                        const candidates = (await getLlamaGpuTypes("allValid"))
+                            .filter((candidate) => candidate !== false && candidate !== "auto");
+                        for (const candidate of candidates) {
+                            if (failedGpuInitModes.has(candidate))
+                                continue;
+                            try {
+                                const gpuLlama = await loadLlama(candidate, false, "never");
+                                if (gpuLlama.gpu !== false) { // first candidate that reports a GPU replaces the CPU runtime
+                                    await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
+                                    llama = gpuLlama;
+                                    break;
+                                }
+                                await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
+                            }
+                            catch {
+                                failedGpuInitModes.add(candidate); // remember the failing candidate so it is not probed again
+                            }
+                        }
+                    }
+                }
+                catch (err) {
+                    // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
+                    // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
+                    // expensive native build/probe attempts in this process.
+                    failedGpuInitModes.add(gpuMode);
+                    process.stderr.write(`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
+                    llama = await loadCpuCompatibleLlama();
+                }
+            }
+            if (llama.gpu === false && !noGpuAccelerationWarningShown) { // warn once per process
+                noGpuAccelerationWarningShown = true;
+                process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n");
             }
             this.llama = llama;
         }
         return this.llama;
     }
+    isCpuOffloadForced() { // True when the environment requests CPU-only mode; delegates to isCpuModeRequested().
+        return isCpuModeRequested();
+    }
+    modelLoadOptions(modelPath) { // Builds the options object passed to llama.loadModel for the given path.
+        return {
+            modelPath,
+            ...(this.isCpuOffloadForced() ? { gpuLayers: 0 } : {}), // gpuLayers: 0 is added only when CPU mode is requested
+        };
+    }
     /**
-     * Resolve a model URI to a local path, downloading if needed
+     * Resolve a model URI to a local path, downloading if needed.
+     * Validates the downloaded file is actually a GGUF model (not an HTML error page
+     * from a proxy or firewall).
+     *
+     * @param modelUri - Model URI or path handed to resolveModelFile.
+     * @returns The local path returned by resolveModelFile.
+     * @throws Error from validateGgufFile when the resolved file exists but is not GGUF.
      */
     async resolveModel(modelUri) {
         this.ensureModelCacheDir();
         // resolveModelFile handles HF URIs and downloads to the cache dir
-        return await resolveModelFile(modelUri, this.modelCacheDir);
+        const { resolveModelFile } = await loadNodeLlamaCpp();
+        const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
+        validateGgufFile(modelPath, modelUri);
+        return modelPath;
     }
     /**
      * Load embedding model (lazy)
@@ -322,7 +634,7 @@ export class LlamaCpp {
         this.embedModelLoadPromise = (async () => {
             const llama = await this.ensureLlama();
             const modelPath = await this.resolveModel(this.embedModelUri);
-            const model = await llama.loadModel({ modelPath });
+            const model = await llama.loadModel(this.modelLoadOptions(modelPath));
             this.embedModel = model;
             // Model loading counts as activity - ping to keep alive
             this.touchActivity();
@@ -346,21 +658,23 @@ export class LlamaCpp {
      */
     async computeParallelism(perContextMB) {
         const llama = await this.ensureLlama();
-        if (llama.gpu) {
+        if (!this.isCpuOffloadForced() && llama.gpu) { // take the GPU path only when CPU mode was not requested
             try {
                 const vram = await llama.getVramState();
                 const freeMB = vram.free / (1024 * 1024);
                 const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
-                return Math.max(1, Math.min(8, maxByVram));
+                const computed = Math.max(1, Math.min(8, maxByVram)); // clamp the VRAM-based estimate to 1..8
+                return resolveSafeParallelism({ gpu: llama.gpu, computed });
             }
             catch {
-                return 2;
+                return resolveSafeParallelism({ gpu: llama.gpu, computed: 2 }); // VRAM query failed: start from 2
             }
         }
         // CPU: split cores across contexts. At least 4 threads per context.
         const cores = llama.cpuMathCores || 4;
         const maxContexts = Math.floor(cores / 4);
-        return Math.max(1, Math.min(4, maxContexts));
+        const computed = Math.max(1, Math.min(4, maxContexts)); // clamp the core-based estimate to 1..4
+        return resolveSafeParallelism({ gpu: false, computed });
     }
     /**
      * Get the number of threads each context should use, given N parallel contexts.
@@ -368,7 +682,7 @@ export class LlamaCpp {
      */
     async threadsPerContext(parallelism) {
         const llama = await this.ensureLlama();
-        if (llama.gpu)
+        if (!this.isCpuOffloadForced() && llama.gpu)
             return 0; // GPU: let the library decide
         const cores = llama.cpuMathCores || 4;
         return Math.max(1, Math.floor(cores / parallelism));
@@ -394,6 +708,7 @@ export class LlamaCpp {
             for (let i = 0; i < n; i++) {
                 try {
                     this.embedContexts.push(await model.createEmbeddingContext({
+                        contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
                         ...(threads > 0 ? { threads } : {}),
                     }));
                 }
@@ -431,7 +746,7 @@ export class LlamaCpp {
             this.generateModelLoadPromise = (async () => {
                 const llama = await this.ensureLlama();
                 const modelPath = await this.resolveModel(this.generateModelUri);
-                const model = await llama.loadModel({ modelPath });
+                const model = await llama.loadModel(this.modelLoadOptions(modelPath));
                 this.generateModel = model;
                 return model;
             })();
@@ -461,7 +776,7 @@ export class LlamaCpp {
         this.rerankModelLoadPromise = (async () => {
             const llama = await this.ensureLlama();
             const modelPath = await this.resolveModel(this.rerankModelUri);
-            const model = await llama.loadModel({ modelPath });
+            const model = await llama.loadModel(this.modelLoadOptions(modelPath));
             this.rerankModel = model;
             // Model loading counts as activity - ping to keep alive
             this.touchActivity();
@@ -484,9 +799,20 @@ export class LlamaCpp {
      * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
      */
     // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
-    // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
-    // Use 2048 for safety margin. Still 17× less than auto (40960).
-    static RERANK_CONTEXT_SIZE = 2048;
+    // Default 2048 was too small for longer documents (e.g. session transcripts,
+    // CJK text, or large markdown files) — callers hit "input lengths exceed
+    // context size" errors even after truncation because the overhead estimate
+    // was insufficient.  4096 comfortably fits the largest real-world chunks
+    // while staying well below the 40 960-token auto size.
+    // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
+    static RERANK_CONTEXT_SIZE = (() => { // evaluated once, when the class is defined
+        const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10); // parseInt accepts a numeric prefix, e.g. "4096abc" parses as 4096
+        return Number.isFinite(v) && v > 0 ? v : 4096; // unset, non-numeric or non-positive values fall back to 4096
+    })();
+    static EMBED_CONTEXT_SIZE = (() => { // evaluated once, when the class is defined
+        const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10); // parseInt accepts a numeric prefix, e.g. "2048abc" parses as 2048
+        return Number.isFinite(v) && v > 0 ? v : 2048; // unset, non-numeric or non-positive values fall back to 2048
+    })();
     async ensureRerankContexts() {
         if (this.rerankContexts.length === 0) {
             const model = await this.ensureRerankModel();
@@ -497,7 +823,6 @@ export class LlamaCpp {
                 try {
                     this.rerankContexts.push(await model.createRankingContext({
                         contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
-                        flashAttention: true,
                         ...(threads > 0 ? { threads } : {}),
                     }));
                 }
@@ -555,15 +880,48 @@ export class LlamaCpp {
     // ==========================================================================
     // Core API methods
     // ==========================================================================
+    /**
+     * Embedding-input length guards.
+     *
+     * resolveEmbedTokenLimit() returns the token budget for one embedding
+     * input: EMBED_CONTEXT_SIZE, lowered to the loaded model's
+     * trainContextSize when that is a positive finite number.
+     *
+     * truncateToContextSize(text) truncates text to fit within that budget.
+     * Uses the model's own tokenizer for accurate token counting, then
+     * detokenizes back to text if truncation is needed.
+     * Returns the (possibly truncated) text, whether truncation occurred, and
+     * the limit that was applied. Without a loaded embed model the text is
+     * returned unchanged.
+     *
+     * NOTE(review): inputs of up to `limit` tokens pass through untouched,
+     * while truncated inputs are cut to `limit - 4` to leave BOS/EOS room, so
+     * an input at exactly the limit gets no such margin - confirm intended.
+     */
+    resolveEmbedTokenLimit() { // token budget; see the comment block above
+        const trainedContextSize = this.embedModel?.trainContextSize; // undefined while no embed model is loaded
+        if (typeof trainedContextSize === "number" && Number.isFinite(trainedContextSize) && trainedContextSize > 0) {
+            return Math.max(1, Math.min(LlamaCpp.EMBED_CONTEXT_SIZE, trainedContextSize)); // the smaller of the two sizes, never below 1
+        }
+        return LlamaCpp.EMBED_CONTEXT_SIZE;
+    }
+    async truncateToContextSize(text) { // resolves to { text, truncated, limit }
+        if (!this.embedModel)
+            return { text, truncated: false, limit: LlamaCpp.EMBED_CONTEXT_SIZE }; // no tokenizer available yet
+        const maxTokens = this.resolveEmbedTokenLimit();
+        if (maxTokens <= 0)
+            return { text, truncated: false, limit: maxTokens };
+        const tokens = this.embedModel.tokenize(text);
+        if (tokens.length <= maxTokens)
+            return { text, truncated: false, limit: maxTokens }; // already fits
+        // Leave a small margin (4 tokens) for BOS/EOS overhead
+        const safeLimit = Math.max(1, maxTokens - 4);
+        const truncatedTokens = tokens.slice(0, safeLimit);
+        const truncatedText = this.embedModel.detokenize(truncatedTokens);
+        return { text: truncatedText, truncated: true, limit: maxTokens }; // reports maxTokens as the limit, not safeLimit
+    }
     async embed(text, options = {}) {
         // Ping activity at start to keep models alive during this operation
         this.touchActivity();
         try {
             const context = await this.ensureEmbedContext();
-            const embedding = await context.getEmbeddingFor(text);
+            // Guard: truncate text that exceeds model context window to prevent GGML crash
+            const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
+            if (truncated) {
+                console.warn(`⚠ Text truncated to fit embedding context (${limit} tokens)`);
+            }
+            const embedding = await context.getEmbeddingFor(safeText);
             return {
                 embedding: Array.from(embedding.vector),
-                model: this.embedModelUri,
+                model: options.model ?? this.embedModelUri,
             };
         }
         catch (error) {
@@ -575,7 +933,7 @@ export class LlamaCpp {
      * Batch embed multiple texts efficiently
      * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
      */
-    async embedBatch(texts) {
+    async embedBatch(texts, options = {}) {
         if (this._ciMode)
             throw new Error("LLM operations are disabled in CI (set CI=true)");
         // Ping activity at start to keep models alive during this operation
@@ -591,9 +949,13 @@ export class LlamaCpp {
                 const embeddings = [];
                 for (const text of texts) {
                     try {
-                        const embedding = await context.getEmbeddingFor(text);
+                        const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
+                        if (truncated) {
+                            console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
+                        }
+                        const embedding = await context.getEmbeddingFor(safeText);
                         this.touchActivity();
-                        embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
+                        embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
                     }
                     catch (err) {
                         console.error("Embedding error for text:", err);
@@ -610,9 +972,13 @@ export class LlamaCpp {
                 const results = [];
                 for (const text of chunk) {
                     try {
-                        const embedding = await ctx.getEmbeddingFor(text);
+                        const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
+                        if (truncated) {
+                            console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
+                        }
+                        const embedding = await ctx.getEmbeddingFor(safeText);
                         this.touchActivity();
-                        results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
+                        results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
                     }
                     catch (err) {
                         console.error("Embedding error for text:", err);
@@ -638,6 +1004,7 @@ export class LlamaCpp {
         // Create fresh context -> sequence -> session for each call
         const context = await this.generateModel.createContext();
         const sequence = context.getSequence();
+        const { LlamaChatSession } = await loadNodeLlamaCpp();
         const session = new LlamaChatSession({ contextSequence: sequence });
         const maxTokens = options.maxTokens ?? 150;
         // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
@@ -707,6 +1074,7 @@ export class LlamaCpp {
             contextSize: this.expandContextSize,
         });
         const sequence = genContext.getSequence();
+        const { LlamaChatSession } = await loadNodeLlamaCpp();
         const session = new LlamaChatSession({ contextSequence: sequence });
         try {
             // Qwen3 recommended settings for non-thinking mode:
@@ -767,8 +1135,10 @@ export class LlamaCpp {
             await genContext.dispose();
         }
     }
-    // Qwen3 reranker chat template overhead (system prompt, tags, separators)
-    static RERANK_TEMPLATE_OVERHEAD = 200;
+    // Qwen3 reranker chat template overhead (system prompt, tags, separators).
+    // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
+    // the truncation budget never lets a document slip past the context limit.
+    static RERANK_TEMPLATE_OVERHEAD = 512;
     static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
     async rerank(query, documents, options = {}) {
         if (this._ciMode)
@@ -845,11 +1215,12 @@ export class LlamaCpp {
      * Get device/GPU info for status display.
      * Initializes llama if not already done.
      */
-    async getDeviceInfo() {
-        const llama = await this.ensureLlama();
-        const gpuDevices = await llama.getGpuDeviceNames();
+    async getDeviceInfo(options = {}) {
+        const llama = await this.ensureLlama(options.allowBuild ?? true);
+        const cpuForced = this.isCpuOffloadForced();
+        const gpuDevices = cpuForced ? [] : await llama.getGpuDeviceNames();
         let vram;
-        if (llama.gpu) {
+        if (!cpuForced && llama.gpu) {
             try {
                 const state = await llama.getVramState();
                 vram = { total: state.total, used: state.used, free: state.free };
@@ -857,8 +1228,8 @@ export class LlamaCpp {
             catch { /* no vram info */ }
         }
         return {
-            gpu: llama.gpu,
-            gpuOffloading: llama.supportsGpuOffloading,
+            gpu: cpuForced ? false : llama.gpu,
+            gpuOffloading: !cpuForced && llama.supportsGpuOffloading,
             gpuDevices,
             vram,
             cpuCores: llama.cpuMathCores,
@@ -875,21 +1246,34 @@ export class LlamaCpp {
             clearTimeout(this.inactivityTimer);
             this.inactivityTimer = null;
         }
-        // Disposing llama cascades to models and contexts automatically
-        // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
-        // Note: llama.dispose() can hang indefinitely, so we use a timeout
-        if (this.llama) {
-            const disposePromise = this.llama.dispose();
-            const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
-            await Promise.race([disposePromise, timeoutPromise]);
+        // Explicitly dispose in dependency order: contexts first, then models, then llama.
+        // Relying only on llama.dispose() leaves Metal resource sets alive until process
+        // finalization on Apple Silicon, where ggml_metal_device_free can abort after
+        // otherwise-successful CLI output (#368).
+        for (const ctx of this.embedContexts) {
+            await disposeWithTimeout("embedding context", () => ctx.dispose());
         }
-        // Clear references
         this.embedContexts = [];
+        for (const ctx of this.rerankContexts) {
+            await disposeWithTimeout("rerank context", () => ctx.dispose());
+        }
         this.rerankContexts = [];
-        this.embedModel = null;
-        this.generateModel = null;
-        this.rerankModel = null;
-        this.llama = null;
+        if (this.embedModel) {
+            await disposeWithTimeout("embedding model", () => this.embedModel.dispose());
+            this.embedModel = null;
+        }
+        if (this.generateModel) {
+            await disposeWithTimeout("generation model", () => this.generateModel.dispose());
+            this.generateModel = null;
+        }
+        if (this.rerankModel) {
+            await disposeWithTimeout("rerank model", () => this.rerankModel.dispose());
+            this.rerankModel = null;
+        }
+        if (this.llama) {
+            await disposeWithTimeout("llama runtime", () => this.llama.dispose());
+            this.llama = null;
+        }
         // Clear any in-flight load/create promises
         this.embedModelLoadPromise = null;
         this.embedContextsCreatePromise = null;
@@ -1028,8 +1412,8 @@ class LLMSession {
     async embed(text, options) {
         return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
     }
-    async embedBatch(texts) {
-        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
+    async embedBatch(texts, options) {
+        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
     }
     async expandQuery(query, options) {
         return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
@@ -1106,8 +1490,7 @@ let defaultLlamaCpp = null;
  */
 export function getDefaultLlamaCpp() {
     if (!defaultLlamaCpp) {
-        const embedModel = process.env.QMD_EMBED_MODEL;
-        defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
+        defaultLlamaCpp = new LlamaCpp();
     }
     return defaultLlamaCpp;
 }