@tobilu/qmd 2.0.1 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/llm.js CHANGED
@@ -3,10 +3,49 @@
3
3
  *
4
4
  * Provides embeddings, text generation, and reranking using local GGUF models.
5
5
  */
6
- import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
6
let nodeLlamaCppImport = null;
/**
 * Lazily import node-llama-cpp.
 *
 * The import promise is cached in `nodeLlamaCppImport`, so the dynamic import
 * is started at most once and every caller shares the same promise. The
 * import itself runs with stdout redirected to stderr.
 *
 * @returns {Promise<object>} The node-llama-cpp module namespace (or the
 *   module injected through setNodeLlamaCppModuleForTest).
 */
async function loadNodeLlamaCpp() {
    if (nodeLlamaCppImport == null) {
        nodeLlamaCppImport = withNativeStdoutRedirectedToStderr(() => import("node-llama-cpp"));
    }
    return nodeLlamaCppImport;
}
11
/**
 * Test hook: replace (or reset) the cached node-llama-cpp module.
 *
 * Passing a module makes later loads resolve to it; passing a falsy value
 * drops the cache so the real module is imported again on next use. In both
 * cases the remembered GPU init failures and the one-shot warning flags are
 * reset.
 *
 * @param {object | null | undefined} module Replacement module, or falsy to reset.
 */
export function setNodeLlamaCppModuleForTest(module) {
    if (module) {
        nodeLlamaCppImport = Promise.resolve(module);
    }
    else {
        nodeLlamaCppImport = null;
    }
    cpuForcedPrebuiltFallbackWarningShown = false;
    noGpuAccelerationWarningShown = false;
    failedGpuInitModes.clear();
}
17
let nativeStdoutRedirectDepth = 0;
let originalStdoutWrite = null;
/**
 * Run `fn` while everything written through process.stdout.write is routed
 * to process.stderr instead.
 *
 * Some node-llama-cpp native build/probe paths write library noise to stdout.
 * JSON APIs must reserve stdout for machine-readable payloads, so route that
 * noise to stderr while native llama initialization is in progress.
 *
 * Calls may nest or overlap: a depth counter installs the redirect on the
 * first entry and removes it only when the last active call finishes,
 * whether `fn` resolves or throws.
 *
 * @param {() => unknown} fn Work to run under the redirect; may be async.
 * @returns {Promise<unknown>} Whatever `fn` returns or resolves to.
 */
export async function withNativeStdoutRedirectedToStderr(fn) {
    if (nativeStdoutRedirectDepth === 0) {
        // Keep the untouched function reference rather than a bound copy, so the
        // restore below puts back the exact same function. Restoring a bound
        // copy would stack one more wrapper on every redirect cycle and break
        // identity checks by code that spies on process.stdout.write.
        originalStdoutWrite = process.stdout.write;
        process.stdout.write = (chunk, encodingOrCallback, callback) => {
            if (typeof encodingOrCallback === "function") {
                return process.stderr.write(chunk, encodingOrCallback);
            }
            return process.stderr.write(chunk, encodingOrCallback, callback);
        };
    }
    nativeStdoutRedirectDepth++;
    try {
        return await fn();
    }
    finally {
        nativeStdoutRedirectDepth--;
        if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) {
            process.stdout.write = originalStdoutWrite;
            originalStdoutWrite = null;
        }
    }
}
7
46
  import { homedir } from "os";
8
47
  import { join } from "path";
9
- import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
48
+ import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
10
49
  // =============================================================================
11
50
  // Embedding Formatting Functions
12
51
  // =============================================================================
@@ -23,7 +62,7 @@ export function isQwen3EmbeddingModel(modelUri) {
23
62
  * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
24
63
  */
25
64
  export function formatQueryForEmbedding(query, modelUri) {
26
- const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
65
+ const uri = modelUri ?? resolveEmbedModel();
27
66
  if (isQwen3EmbeddingModel(uri)) {
28
67
  return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
29
68
  }
@@ -35,7 +74,7 @@ export function formatQueryForEmbedding(query, modelUri) {
35
74
  * Qwen3-Embedding encodes documents as raw text without special prefixes.
36
75
  */
37
76
  export function formatDocForEmbedding(text, title, modelUri) {
38
- const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
77
+ const uri = modelUri ?? resolveEmbedModel();
39
78
  if (isQwen3EmbeddingModel(uri)) {
40
79
  // Qwen3-Embedding: documents are raw text, no task prefix
41
80
  return title ? `${title}\n${text}` : text;
@@ -48,7 +87,7 @@ export function formatDocForEmbedding(text, title, modelUri) {
48
87
  // HuggingFace model URIs for node-llama-cpp
49
88
  // Format: hf:<user>/<repo>/<file>
50
89
  // Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
51
- const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
90
+ const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
52
91
  const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
53
92
  // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
54
93
  const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@@ -60,8 +99,26 @@ export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5
60
99
  export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
61
100
  export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
62
101
  export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
102
/**
 * Pick the embedding model URI: explicit config first, then the
 * QMD_EMBED_MODEL environment variable, then the built-in default.
 * Empty strings count as "not set" at every step.
 */
export function resolveEmbedModel(config) {
    const fromConfig = config?.embed;
    if (fromConfig) {
        return fromConfig;
    }
    const fromEnv = process.env.QMD_EMBED_MODEL;
    return fromEnv ? fromEnv : DEFAULT_EMBED_MODEL;
}
105
/**
 * Pick the generation model URI: explicit config first, then the
 * QMD_GENERATE_MODEL environment variable, then the built-in default.
 * Empty strings count as "not set" at every step.
 */
export function resolveGenerateModel(config) {
    const fromConfig = config?.generate;
    if (fromConfig) {
        return fromConfig;
    }
    const fromEnv = process.env.QMD_GENERATE_MODEL;
    return fromEnv ? fromEnv : DEFAULT_GENERATE_MODEL;
}
108
/**
 * Pick the reranker model URI: explicit config first, then the
 * QMD_RERANK_MODEL environment variable, then the built-in default.
 * Empty strings count as "not set" at every step.
 */
export function resolveRerankModel(config) {
    const fromConfig = config?.rerank;
    if (fromConfig) {
        return fromConfig;
    }
    const fromEnv = process.env.QMD_RERANK_MODEL;
    return fromEnv ? fromEnv : DEFAULT_RERANK_MODEL;
}
111
/**
 * Resolve all three model URIs at once.
 *
 * @param {{ embed?: string, generate?: string, rerank?: string }} [config]
 * @returns {{ embed: string, generate: string, rerank: string }}
 */
export function resolveModels(config) {
    const embed = resolveEmbedModel(config);
    const generate = resolveGenerateModel(config);
    const rerank = resolveRerankModel(config);
    return { embed, generate, rerank };
}
63
118
  // Local model cache directory
64
- const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
119
+ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
120
+ ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
121
+ : join(homedir(), ".cache", "qmd", "models");
65
122
  export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
66
123
  function parseHfUri(model) {
67
124
  if (!model.startsWith("hf:"))
@@ -87,6 +144,106 @@ async function getRemoteEtag(ref) {
87
144
  return null;
88
145
  }
89
146
  }
147
const GGUF_MAGIC = Buffer.from("GGUF");
/** Format a byte count as whole kilobytes (1024-byte units) for diagnostics. */
function formatModelFileSize(sizeBytes) {
    return `${(sizeBytes / 1024).toFixed(0)} KB`;
}
/**
 * Render the leading bytes of a file for an error message: printable ASCII is
 * shown as text, anything else as hex, and a header with no bytes as "<empty>".
 */
function printableMagic(header) {
    if (header.length === 0) {
        return "<empty>";
    }
    const text = header.toString("utf-8");
    return /^[\x20-\x7e]{1,4}$/.test(text) ? text : `0x${header.toString("hex")}`;
}
/**
 * Inspect a potential GGUF model file without mutating it.
 * Used by doctor for early diagnostics and by runtime validation before load.
 *
 * Only the first 512 bytes are read. The result always carries `exists`,
 * `valid`, `kind` ("missing" | "gguf" | "html" | "invalid") and a
 * human-readable `details`; `sizeBytes` and `magic` are included when known.
 * Read errors are reported as an "invalid" result rather than thrown.
 *
 * @param {string} filePath Path of the file to inspect.
 * @returns {{ exists: boolean, valid: boolean, kind: string, details: string, sizeBytes?: number, magic?: string }}
 */
export function inspectGgufFile(filePath) {
    if (!existsSync(filePath)) {
        return { exists: false, valid: false, kind: "missing", details: "file does not exist" };
    }
    let sizeBytes = 0;
    try {
        sizeBytes = statSync(filePath).size;
        const sniff = Buffer.alloc(512);
        let bytesRead = 0;
        const fd = openSync(filePath, "r");
        try {
            bytesRead = readSync(fd, sniff, 0, sniff.length, 0);
        }
        finally {
            closeSync(fd);
        }
        // Only look at bytes that really came from the file. The buffer is
        // zero-filled, so using it whole would invent NUL padding for files
        // shorter than the sniff window (e.g. report "0x61620000" for "ab").
        const content = sniff.subarray(0, bytesRead);
        const header = content.subarray(0, GGUF_MAGIC.length);
        if (header.equals(GGUF_MAGIC)) {
            return {
                exists: true,
                valid: true,
                kind: "gguf",
                sizeBytes,
                magic: "GGUF",
                details: `valid GGUF (${formatModelFileSize(sizeBytes)})`,
            };
        }
        const magic = printableMagic(header);
        const text = content.toString("utf-8").toLowerCase();
        const isHtml = text.includes("<!doctype") || text.includes("<html");
        if (isHtml) {
            return {
                exists: true,
                valid: false,
                kind: "html",
                sizeBytes,
                magic,
                details: `HTML page, not a GGUF model (${formatModelFileSize(sizeBytes)}); likely proxy/firewall/captive portal response`,
            };
        }
        return {
            exists: true,
            valid: false,
            kind: "invalid",
            sizeBytes,
            magic,
            details: `not valid GGUF (expected magic "GGUF", got "${magic}", ${formatModelFileSize(sizeBytes)})`,
        };
    }
    catch (error) {
        return {
            exists: true,
            valid: false,
            kind: "invalid",
            sizeBytes,
            details: `cannot read model file: ${error instanceof Error ? error.message : String(error)}`,
        };
    }
}
217
/**
 * Validate that a file is actually a GGUF model, not an HTML error page
 * from a proxy, firewall, or failed download.
 *
 * Missing files and valid GGUF files return silently (missing files are left
 * for downstream code to handle). For anything else the bad file is deleted
 * on a best-effort basis so the next attempt re-downloads, and a descriptive
 * error is thrown. The error states whether the deletion actually succeeded.
 *
 * @param {string} filePath Local path of the resolved model file.
 * @param {string} modelUri Model URI the path was resolved from (for messages).
 * @throws {Error} If the file exists but is not valid GGUF.
 */
function validateGgufFile(filePath, modelUri) {
    const inspection = inspectGgufFile(filePath);
    if (!inspection.exists || inspection.valid)
        return; // let downstream handle missing files
    // Remove the bad file so the next attempt re-downloads. Deletion stays
    // best effort, but remember the outcome so the message below is truthful.
    let removalNote = `The file has been removed. Run the command again to re-download.`;
    try {
        unlinkSync(filePath);
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        removalNote = `The file could not be removed automatically (${reason}). Delete it manually, then run the command again to re-download.`;
    }
    const sizeLabel = formatModelFileSize(inspection.sizeBytes ?? 0);
    if (inspection.kind === "html") {
        throw new Error(`Downloaded model file is an HTML page, not a GGUF model (${sizeLabel}).\n` +
            `Something is intercepting the download from huggingface.co (a proxy, firewall, or captive portal).\n\n` +
            `Model: ${modelUri}\n` +
            `Path: ${filePath}\n\n` +
            `To fix this, either:\n` +
            ` 1. Try a HuggingFace mirror: HF_ENDPOINT=https://hf-mirror.com qmd embed\n` +
            ` 2. Download the model manually and set the env var, e.g.:\n` +
            ` QMD_EMBED_MODEL=/path/to/model.gguf qmd embed\n\n` +
            `Note: 'qmd search' works without any model downloads.`);
    }
    throw new Error(`Model file is not valid GGUF (expected magic "GGUF", got "${inspection.magic ?? "unknown"}", file is ${sizeLabel}).\n` +
        `Model: ${modelUri}\n` +
        `Path: ${filePath}\n\n` +
        removalNote);
}
90
247
  export async function pullModels(models, options = {}) {
91
248
  const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
92
249
  if (!existsSync(cacheDir)) {
@@ -127,7 +284,9 @@ export async function pullModels(models, options = {}) {
127
284
  refreshed = true;
128
285
  }
129
286
  }
287
+ const { resolveModelFile } = await loadNodeLlamaCpp();
130
288
  const path = await resolveModelFile(model, cacheDir);
289
+ validateGgufFile(path, model);
131
290
  const sizeBytes = existsSync(path) ? statSync(path).size : 0;
132
291
  if (hfRef && filename) {
133
292
  const remoteEtag = await getRemoteEtag(hfRef);
@@ -146,6 +305,58 @@ export async function pullModels(models, options = {}) {
146
305
  // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
147
306
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
148
307
  const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
308
// Raw values already warned about, so a bad setting is reported once per
// process instead of on every parallelism computation.
const warnedInvalidParallelismValues = new Set();
/**
 * Parse the embed-parallelism override.
 *
 * @param {string | undefined} [envValue] Raw setting; defaults to the
 *   QMD_EMBED_PARALLELISM environment variable.
 * @returns {number | undefined} An integer from 1 to 8 (larger values are
 *   capped at 8), or undefined when the value is unset, blank, or invalid.
 *   An invalid value writes a warning to stderr the first time it is seen.
 */
export function resolveParallelismOverride(envValue = process.env.QMD_EMBED_PARALLELISM) {
    const normalized = envValue?.trim() ?? "";
    if (!normalized)
        return undefined;
    const parsed = Number(normalized);
    if (!Number.isInteger(parsed) || parsed < 1) {
        if (!warnedInvalidParallelismValues.has(envValue)) {
            warnedInvalidParallelismValues.add(envValue);
            process.stderr.write(`QMD Warning: invalid QMD_EMBED_PARALLELISM="${envValue}", using automatic parallelism.\n`);
        }
        return undefined;
    }
    return Math.min(8, parsed);
}
319
/**
 * Decide how many contexts to run in parallel.
 *
 * A valid explicit override (options.envValue, or the environment default
 * used by resolveParallelismOverride) always wins. Otherwise Windows + CUDA
 * is forced to 1 and every other combination uses the computed value,
 * floored at 1.
 *
 * @param {{ envValue?: string, platform?: string, gpu: unknown, computed: number }} options
 * @returns {number} Parallelism, always at least 1.
 */
export function resolveSafeParallelism(options) {
    const forced = resolveParallelismOverride(options.envValue);
    if (forced !== undefined) {
        return forced;
    }
    // node-llama-cpp/llama.cpp CUDA on Windows is unstable with multiple
    // simultaneous contexts (ggml-cuda.cu:98 in #519). Vulkan and CPU do not
    // show the same failure mode, so only serialize Windows CUDA by default.
    const platform = options.platform ?? process.platform;
    const isWindowsCuda = platform === "win32" && options.gpu === "cuda";
    return isWindowsCuda ? 1 : Math.max(1, options.computed);
}
331
// Spellings that mean "off" for both QMD_LLAMA_GPU and QMD_FORCE_CPU.
const LLAMA_GPU_DISABLED_VALUES = new Set(["false", "off", "none", "disable", "disabled", "0"]);
// Explicit backends accepted in QMD_LLAMA_GPU.
const LLAMA_GPU_BACKENDS = new Set(["metal", "vulkan", "cuda"]);
// Raw values already warned about. This function runs on every CPU-mode
// check, so warn once per bad value instead of on each call.
const warnedInvalidLlamaGpuValues = new Set();
/**
 * Resolve the GPU mode to request from node-llama-cpp.
 *
 * Values are trimmed and compared case-insensitively.
 *
 * @param {string | undefined} [envValue] Raw GPU setting; defaults to the
 *   QMD_LLAMA_GPU environment variable.
 * @param {string | undefined} [forceCpuValue] Raw force-CPU setting; defaults
 *   to the QMD_FORCE_CPU environment variable. Any non-blank value that is not
 *   an "off" spelling forces CPU and takes precedence over `envValue`.
 * @returns {false | "auto" | "metal" | "vulkan" | "cuda"} `false` for CPU,
 *   a backend name when one is requested, otherwise "auto". An unrecognized
 *   GPU value falls back to "auto" and writes a warning to stderr the first
 *   time it is seen.
 */
export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU, forceCpuValue = process.env.QMD_FORCE_CPU) {
    const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
    if (forceCpu && !LLAMA_GPU_DISABLED_VALUES.has(forceCpu)) {
        return false;
    }
    const normalized = envValue?.trim().toLowerCase() ?? "";
    if (!normalized)
        return "auto";
    if (LLAMA_GPU_DISABLED_VALUES.has(normalized))
        return false;
    if (LLAMA_GPU_BACKENDS.has(normalized))
        return normalized;
    if (!warnedInvalidLlamaGpuValues.has(envValue)) {
        warnedInvalidLlamaGpuValues.add(envValue);
        process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
    }
    return "auto";
}
346
/**
 * Run a dispose callback but give up waiting after `timeoutMs`.
 *
 * Never throws: a timeout or a dispose failure is reported as a warning on
 * stderr so shutdown can continue. The timeout timer is unref'd so it cannot
 * keep the process alive, and it is cleared as soon as the race settles.
 *
 * @param {string} resourceName Label used in warning messages.
 * @param {() => unknown} dispose Callback that releases the resource; may be async.
 * @param {number} [timeoutMs=1000] How long to wait before giving up.
 * @returns {Promise<void>}
 */
async function disposeWithTimeout(resourceName, dispose, timeoutMs = 1000) {
    // Unique sentinel, so a dispose() that happens to resolve to the string
    // "timeout" is not mistaken for an actual timeout.
    const timedOut = Symbol("timeout");
    let timer;
    const timeoutPromise = new Promise((resolve) => {
        timer = setTimeout(() => resolve(timedOut), timeoutMs);
        timer.unref();
    });
    try {
        const result = await Promise.race([dispose(), timeoutPromise]);
        if (result === timedOut) {
            process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
        }
    }
    catch (error) {
        process.stderr.write(`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`);
    }
    finally {
        // Do not leave a pending timer behind once dispose has settled.
        clearTimeout(timer);
    }
}
149
360
  function resolveExpandContextSize(configValue) {
150
361
  if (configValue !== undefined) {
151
362
  if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -163,6 +374,12 @@ function resolveExpandContextSize(configValue) {
163
374
  }
164
375
  return parsed;
165
376
  }
377
+ const failedGpuInitModes = new Set();
378
+ let noGpuAccelerationWarningShown = false;
379
+ let cpuForcedPrebuiltFallbackWarningShown = false;
380
/** True when the resolved GPU mode (from the environment defaults) is CPU-only. */
function isCpuModeRequested() {
    const gpuMode = resolveLlamaGpuMode();
    return gpuMode === false;
}
166
383
  export class LlamaCpp {
167
384
  _ciMode = !!process.env.CI;
168
385
  llama = null;
@@ -187,14 +404,23 @@ export class LlamaCpp {
187
404
  // Track disposal state to prevent double-dispose
188
405
  disposed = false;
189
406
  constructor(config = {}) {
190
- this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
191
- this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
192
- this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
407
+ this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
408
+ this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
409
+ this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
193
410
  this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
194
411
  this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
195
412
  this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
196
413
  this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
197
414
  }
415
+ get embedModelName() {
416
+ return this.embedModelUri;
417
+ }
418
+ get generateModelName() {
419
+ return this.generateModelUri;
420
+ }
421
+ get rerankModelName() {
422
+ return this.rerankModelUri;
423
+ }
198
424
  /**
199
425
  * Reset the inactivity timer. Called after each model operation.
200
426
  * When timer fires, models are unloaded to free memory (if no active sessions).
@@ -287,27 +513,113 @@ export class LlamaCpp {
287
513
  /**
288
514
  * Initialize the llama instance (lazy)
289
515
  */
290
- async ensureLlama() {
516
+ async ensureLlama(allowBuild = true) {
291
517
  if (!this.llama) {
292
- const llama = await getLlama({
293
- // attempt to build
294
- build: "autoAttempt",
295
- logLevel: LlamaLogLevel.error
296
- });
297
- if (llama.gpu === false) {
298
- process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
518
+ const gpuMode = resolveLlamaGpuMode();
519
+ const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
520
+ const loadLlama = async (gpu, sourceBuildAllowed = allowBuild, buildOverride) => await withNativeStdoutRedirectedToStderr(() => getLlama({
521
+ // Prefer packaged prebuilt bindings before compiling llama.cpp locally.
522
+ // node-llama-cpp documents gpu:"auto" as the best default: Metal on
523
+ // Apple Silicon, CUDA when fully available, Vulkan where available,
524
+ // then CPU. Use build:"auto" for normal loads and build:"never" for
525
+ // diagnostic/probe paths that must not compile llama.cpp.
526
+ build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
527
+ logLevel: LlamaLogLevel.error,
528
+ gpu,
529
+ progressLogs: false,
530
+ skipDownload: !sourceBuildAllowed,
531
+ }));
532
+ const loadCpuCompatibleLlama = async () => {
533
+ try {
534
+ return await loadLlama(false, false);
535
+ }
536
+ catch (err) {
537
+ // Some platforms, notably Apple Silicon, ship a Metal prebuilt but no
538
+ // CPU-only prebuilt. Do a fast no-build lookup for an actual CPU
539
+ // binding first; if it does not exist, use the packaged auto/Metal
540
+ // binding and disable model offloading via gpuLayers: 0.
541
+ if (!cpuForcedPrebuiltFallbackWarningShown) {
542
+ cpuForcedPrebuiltFallbackWarningShown = true;
543
+ process.stderr.write(`QMD Warning: CPU-only llama.cpp prebuilt not available (${err instanceof Error ? err.message : String(err)}); using packaged backend with GPU offloading disabled.\n`);
544
+ }
545
+ return await loadLlama("auto", false);
546
+ }
547
+ };
548
+ let llama;
549
+ if (gpuMode === false) {
550
+ llama = await loadCpuCompatibleLlama();
551
+ }
552
+ else if (failedGpuInitModes.has(gpuMode)) {
553
+ process.stderr.write(`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`);
554
+ llama = await loadCpuCompatibleLlama();
555
+ }
556
+ else {
557
+ try {
558
+ llama = await loadLlama(gpuMode);
559
+ // If node-llama-cpp auto-detection chose CPU, do one no-build pass
560
+ // over all OS-valid packaged GPU backends. This preserves the
561
+ // documented auto mode for Metal/CUDA/Vulkan while recovering on
562
+ // systems where a packaged backend can load but detection is too
563
+ // conservative. Never compile during these extra probes.
564
+ if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
565
+ const candidates = (await getLlamaGpuTypes("allValid"))
566
+ .filter((candidate) => candidate !== false && candidate !== "auto");
567
+ for (const candidate of candidates) {
568
+ if (failedGpuInitModes.has(candidate))
569
+ continue;
570
+ try {
571
+ const gpuLlama = await loadLlama(candidate, false, "never");
572
+ if (gpuLlama.gpu !== false) {
573
+ await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
574
+ llama = gpuLlama;
575
+ break;
576
+ }
577
+ await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
578
+ }
579
+ catch {
580
+ failedGpuInitModes.add(candidate);
581
+ }
582
+ }
583
+ }
584
+ }
585
+ catch (err) {
586
+ // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
587
+ // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
588
+ // expensive native build/probe attempts in this process.
589
+ failedGpuInitModes.add(gpuMode);
590
+ process.stderr.write(`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
591
+ llama = await loadCpuCompatibleLlama();
592
+ }
593
+ }
594
+ if (llama.gpu === false && !noGpuAccelerationWarningShown) {
595
+ noGpuAccelerationWarningShown = true;
596
+ process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n");
299
597
  }
300
598
  this.llama = llama;
301
599
  }
302
600
  return this.llama;
303
601
  }
602
+ isCpuOffloadForced() {
603
+ return isCpuModeRequested();
604
+ }
605
+ modelLoadOptions(modelPath) {
606
+ return {
607
+ modelPath,
608
+ ...(this.isCpuOffloadForced() ? { gpuLayers: 0 } : {}),
609
+ };
610
+ }
304
611
  /**
305
- * Resolve a model URI to a local path, downloading if needed
612
+ * Resolve a model URI to a local path, downloading if needed.
613
+ * Validates the downloaded file is actually a GGUF model (not an HTML error page
614
+ * from a proxy or firewall).
306
615
  */
307
616
  async resolveModel(modelUri) {
308
617
  this.ensureModelCacheDir();
309
618
  // resolveModelFile handles HF URIs and downloads to the cache dir
310
- return await resolveModelFile(modelUri, this.modelCacheDir);
619
+ const { resolveModelFile } = await loadNodeLlamaCpp();
620
+ const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
621
+ validateGgufFile(modelPath, modelUri);
622
+ return modelPath;
311
623
  }
312
624
  /**
313
625
  * Load embedding model (lazy)
@@ -322,7 +634,7 @@ export class LlamaCpp {
322
634
  this.embedModelLoadPromise = (async () => {
323
635
  const llama = await this.ensureLlama();
324
636
  const modelPath = await this.resolveModel(this.embedModelUri);
325
- const model = await llama.loadModel({ modelPath });
637
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
326
638
  this.embedModel = model;
327
639
  // Model loading counts as activity - ping to keep alive
328
640
  this.touchActivity();
@@ -346,21 +658,23 @@ export class LlamaCpp {
346
658
  */
347
659
  async computeParallelism(perContextMB) {
348
660
  const llama = await this.ensureLlama();
349
- if (llama.gpu) {
661
+ if (!this.isCpuOffloadForced() && llama.gpu) {
350
662
  try {
351
663
  const vram = await llama.getVramState();
352
664
  const freeMB = vram.free / (1024 * 1024);
353
665
  const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
354
- return Math.max(1, Math.min(8, maxByVram));
666
+ const computed = Math.max(1, Math.min(8, maxByVram));
667
+ return resolveSafeParallelism({ gpu: llama.gpu, computed });
355
668
  }
356
669
  catch {
357
- return 2;
670
+ return resolveSafeParallelism({ gpu: llama.gpu, computed: 2 });
358
671
  }
359
672
  }
360
673
  // CPU: split cores across contexts. At least 4 threads per context.
361
674
  const cores = llama.cpuMathCores || 4;
362
675
  const maxContexts = Math.floor(cores / 4);
363
- return Math.max(1, Math.min(4, maxContexts));
676
+ const computed = Math.max(1, Math.min(4, maxContexts));
677
+ return resolveSafeParallelism({ gpu: false, computed });
364
678
  }
365
679
  /**
366
680
  * Get the number of threads each context should use, given N parallel contexts.
@@ -368,7 +682,7 @@ export class LlamaCpp {
368
682
  */
369
683
  async threadsPerContext(parallelism) {
370
684
  const llama = await this.ensureLlama();
371
- if (llama.gpu)
685
+ if (!this.isCpuOffloadForced() && llama.gpu)
372
686
  return 0; // GPU: let the library decide
373
687
  const cores = llama.cpuMathCores || 4;
374
688
  return Math.max(1, Math.floor(cores / parallelism));
@@ -394,6 +708,7 @@ export class LlamaCpp {
394
708
  for (let i = 0; i < n; i++) {
395
709
  try {
396
710
  this.embedContexts.push(await model.createEmbeddingContext({
711
+ contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
397
712
  ...(threads > 0 ? { threads } : {}),
398
713
  }));
399
714
  }
@@ -431,7 +746,7 @@ export class LlamaCpp {
431
746
  this.generateModelLoadPromise = (async () => {
432
747
  const llama = await this.ensureLlama();
433
748
  const modelPath = await this.resolveModel(this.generateModelUri);
434
- const model = await llama.loadModel({ modelPath });
749
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
435
750
  this.generateModel = model;
436
751
  return model;
437
752
  })();
@@ -461,7 +776,7 @@ export class LlamaCpp {
461
776
  this.rerankModelLoadPromise = (async () => {
462
777
  const llama = await this.ensureLlama();
463
778
  const modelPath = await this.resolveModel(this.rerankModelUri);
464
- const model = await llama.loadModel({ modelPath });
779
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
465
780
  this.rerankModel = model;
466
781
  // Model loading counts as activity - ping to keep alive
467
782
  this.touchActivity();
@@ -484,9 +799,20 @@ export class LlamaCpp {
484
799
  * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
485
800
  */
486
801
  // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
487
- // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
488
- // Use 2048 for safety margin. Still 17× less than auto (40960).
489
- static RERANK_CONTEXT_SIZE = 2048;
802
+ // Default 2048 was too small for longer documents (e.g. session transcripts,
803
+ // CJK text, or large markdown files) callers hit "input lengths exceed
804
+ // context size" errors even after truncation because the overhead estimate
805
+ // was insufficient. 4096 comfortably fits the largest real-world chunks
806
+ // while staying well below the 40 960-token auto size.
807
+ // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
808
+ static RERANK_CONTEXT_SIZE = (() => {
809
+ const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
810
+ return Number.isFinite(v) && v > 0 ? v : 4096;
811
+ })();
812
+ static EMBED_CONTEXT_SIZE = (() => {
813
+ const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
814
+ return Number.isFinite(v) && v > 0 ? v : 2048;
815
+ })();
490
816
  async ensureRerankContexts() {
491
817
  if (this.rerankContexts.length === 0) {
492
818
  const model = await this.ensureRerankModel();
@@ -497,7 +823,6 @@ export class LlamaCpp {
497
823
  try {
498
824
  this.rerankContexts.push(await model.createRankingContext({
499
825
  contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
500
- flashAttention: true,
501
826
  ...(threads > 0 ? { threads } : {}),
502
827
  }));
503
828
  }
@@ -555,15 +880,48 @@ export class LlamaCpp {
555
880
  // ==========================================================================
556
881
  // Core API methods
557
882
  // ==========================================================================
883
+ /**
884
+ * Truncate text to fit within the embedding model's context window.
885
+ * Uses the model's own tokenizer for accurate token counting, then
886
+ * detokenizes back to text if truncation is needed.
887
+ * Returns the (possibly truncated) text and whether truncation occurred.
888
+ */
889
+ resolveEmbedTokenLimit() {
890
+ const trainedContextSize = this.embedModel?.trainContextSize;
891
+ if (typeof trainedContextSize === "number" && Number.isFinite(trainedContextSize) && trainedContextSize > 0) {
892
+ return Math.max(1, Math.min(LlamaCpp.EMBED_CONTEXT_SIZE, trainedContextSize));
893
+ }
894
+ return LlamaCpp.EMBED_CONTEXT_SIZE;
895
+ }
896
+ async truncateToContextSize(text) {
897
+ if (!this.embedModel)
898
+ return { text, truncated: false, limit: LlamaCpp.EMBED_CONTEXT_SIZE };
899
+ const maxTokens = this.resolveEmbedTokenLimit();
900
+ if (maxTokens <= 0)
901
+ return { text, truncated: false, limit: maxTokens };
902
+ const tokens = this.embedModel.tokenize(text);
903
+ if (tokens.length <= maxTokens)
904
+ return { text, truncated: false, limit: maxTokens };
905
+ // Leave a small margin (4 tokens) for BOS/EOS overhead
906
+ const safeLimit = Math.max(1, maxTokens - 4);
907
+ const truncatedTokens = tokens.slice(0, safeLimit);
908
+ const truncatedText = this.embedModel.detokenize(truncatedTokens);
909
+ return { text: truncatedText, truncated: true, limit: maxTokens };
910
+ }
558
911
  async embed(text, options = {}) {
559
912
  // Ping activity at start to keep models alive during this operation
560
913
  this.touchActivity();
561
914
  try {
562
915
  const context = await this.ensureEmbedContext();
563
- const embedding = await context.getEmbeddingFor(text);
916
+ // Guard: truncate text that exceeds model context window to prevent GGML crash
917
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
918
+ if (truncated) {
919
+ console.warn(`⚠ Text truncated to fit embedding context (${limit} tokens)`);
920
+ }
921
+ const embedding = await context.getEmbeddingFor(safeText);
564
922
  return {
565
923
  embedding: Array.from(embedding.vector),
566
- model: this.embedModelUri,
924
+ model: options.model ?? this.embedModelUri,
567
925
  };
568
926
  }
569
927
  catch (error) {
@@ -575,7 +933,7 @@ export class LlamaCpp {
575
933
  * Batch embed multiple texts efficiently
576
934
  * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
577
935
  */
578
- async embedBatch(texts) {
936
+ async embedBatch(texts, options = {}) {
579
937
  if (this._ciMode)
580
938
  throw new Error("LLM operations are disabled in CI (set CI=true)");
581
939
  // Ping activity at start to keep models alive during this operation
@@ -591,9 +949,13 @@ export class LlamaCpp {
591
949
  const embeddings = [];
592
950
  for (const text of texts) {
593
951
  try {
594
- const embedding = await context.getEmbeddingFor(text);
952
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
953
+ if (truncated) {
954
+ console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
955
+ }
956
+ const embedding = await context.getEmbeddingFor(safeText);
595
957
  this.touchActivity();
596
- embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
958
+ embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
597
959
  }
598
960
  catch (err) {
599
961
  console.error("Embedding error for text:", err);
@@ -610,9 +972,13 @@ export class LlamaCpp {
610
972
  const results = [];
611
973
  for (const text of chunk) {
612
974
  try {
613
- const embedding = await ctx.getEmbeddingFor(text);
975
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
976
+ if (truncated) {
977
+ console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
978
+ }
979
+ const embedding = await ctx.getEmbeddingFor(safeText);
614
980
  this.touchActivity();
615
- results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
981
+ results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
616
982
  }
617
983
  catch (err) {
618
984
  console.error("Embedding error for text:", err);
@@ -638,6 +1004,7 @@ export class LlamaCpp {
638
1004
  // Create fresh context -> sequence -> session for each call
639
1005
  const context = await this.generateModel.createContext();
640
1006
  const sequence = context.getSequence();
1007
+ const { LlamaChatSession } = await loadNodeLlamaCpp();
641
1008
  const session = new LlamaChatSession({ contextSequence: sequence });
642
1009
  const maxTokens = options.maxTokens ?? 150;
643
1010
  // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
@@ -707,6 +1074,7 @@ export class LlamaCpp {
707
1074
  contextSize: this.expandContextSize,
708
1075
  });
709
1076
  const sequence = genContext.getSequence();
1077
+ const { LlamaChatSession } = await loadNodeLlamaCpp();
710
1078
  const session = new LlamaChatSession({ contextSequence: sequence });
711
1079
  try {
712
1080
  // Qwen3 recommended settings for non-thinking mode:
@@ -767,8 +1135,10 @@ export class LlamaCpp {
767
1135
  await genContext.dispose();
768
1136
  }
769
1137
  }
770
- // Qwen3 reranker chat template overhead (system prompt, tags, separators)
771
- static RERANK_TEMPLATE_OVERHEAD = 200;
1138
+ // Qwen3 reranker chat template overhead (system prompt, tags, separators).
1139
+ // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
1140
+ // the truncation budget never lets a document slip past the context limit.
1141
+ static RERANK_TEMPLATE_OVERHEAD = 512;
772
1142
  static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
773
1143
  async rerank(query, documents, options = {}) {
774
1144
  if (this._ciMode)
@@ -845,11 +1215,12 @@ export class LlamaCpp {
845
1215
  * Get device/GPU info for status display.
846
1216
  * Initializes llama if not already done.
847
1217
  */
848
- async getDeviceInfo() {
849
- const llama = await this.ensureLlama();
850
- const gpuDevices = await llama.getGpuDeviceNames();
1218
+ async getDeviceInfo(options = {}) {
1219
+ const llama = await this.ensureLlama(options.allowBuild ?? true);
1220
+ const cpuForced = this.isCpuOffloadForced();
1221
+ const gpuDevices = cpuForced ? [] : await llama.getGpuDeviceNames();
851
1222
  let vram;
852
- if (llama.gpu) {
1223
+ if (!cpuForced && llama.gpu) {
853
1224
  try {
854
1225
  const state = await llama.getVramState();
855
1226
  vram = { total: state.total, used: state.used, free: state.free };
@@ -857,8 +1228,8 @@ export class LlamaCpp {
857
1228
  catch { /* no vram info */ }
858
1229
  }
859
1230
  return {
860
- gpu: llama.gpu,
861
- gpuOffloading: llama.supportsGpuOffloading,
1231
+ gpu: cpuForced ? false : llama.gpu,
1232
+ gpuOffloading: !cpuForced && llama.supportsGpuOffloading,
862
1233
  gpuDevices,
863
1234
  vram,
864
1235
  cpuCores: llama.cpuMathCores,
@@ -875,21 +1246,34 @@ export class LlamaCpp {
875
1246
  clearTimeout(this.inactivityTimer);
876
1247
  this.inactivityTimer = null;
877
1248
  }
878
- // Disposing llama cascades to models and contexts automatically
879
- // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
880
- // Note: llama.dispose() can hang indefinitely, so we use a timeout
881
- if (this.llama) {
882
- const disposePromise = this.llama.dispose();
883
- const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
884
- await Promise.race([disposePromise, timeoutPromise]);
1249
+ // Explicitly dispose in dependency order: contexts first, then models, then llama.
1250
+ // Relying only on llama.dispose() leaves Metal resource sets alive until process
1251
+ // finalization on Apple Silicon, where ggml_metal_device_free can abort after
1252
+ // otherwise-successful CLI output (#368).
1253
+ for (const ctx of this.embedContexts) {
1254
+ await disposeWithTimeout("embedding context", () => ctx.dispose());
885
1255
  }
886
- // Clear references
887
1256
  this.embedContexts = [];
1257
+ for (const ctx of this.rerankContexts) {
1258
+ await disposeWithTimeout("rerank context", () => ctx.dispose());
1259
+ }
888
1260
  this.rerankContexts = [];
889
- this.embedModel = null;
890
- this.generateModel = null;
891
- this.rerankModel = null;
892
- this.llama = null;
1261
+ if (this.embedModel) {
1262
+ await disposeWithTimeout("embedding model", () => this.embedModel.dispose());
1263
+ this.embedModel = null;
1264
+ }
1265
+ if (this.generateModel) {
1266
+ await disposeWithTimeout("generation model", () => this.generateModel.dispose());
1267
+ this.generateModel = null;
1268
+ }
1269
+ if (this.rerankModel) {
1270
+ await disposeWithTimeout("rerank model", () => this.rerankModel.dispose());
1271
+ this.rerankModel = null;
1272
+ }
1273
+ if (this.llama) {
1274
+ await disposeWithTimeout("llama runtime", () => this.llama.dispose());
1275
+ this.llama = null;
1276
+ }
893
1277
  // Clear any in-flight load/create promises
894
1278
  this.embedModelLoadPromise = null;
895
1279
  this.embedContextsCreatePromise = null;
@@ -1028,8 +1412,8 @@ class LLMSession {
1028
1412
  async embed(text, options) {
1029
1413
  return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
1030
1414
  }
1031
- async embedBatch(texts) {
1032
- return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
1415
+ async embedBatch(texts, options) {
1416
+ return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
1033
1417
  }
1034
1418
  async expandQuery(query, options) {
1035
1419
  return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
@@ -1106,8 +1490,7 @@ let defaultLlamaCpp = null;
1106
1490
  */
1107
1491
  export function getDefaultLlamaCpp() {
1108
1492
  if (!defaultLlamaCpp) {
1109
- const embedModel = process.env.QMD_EMBED_MODEL;
1110
- defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
1493
+ defaultLlamaCpp = new LlamaCpp();
1111
1494
  }
1112
1495
  return defaultLlamaCpp;
1113
1496
  }