@tobilu/qmd 2.1.0 → 2.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/llm.js CHANGED
@@ -3,10 +3,49 @@
3
3
  *
4
4
  * Provides embeddings, text generation, and reranking using local GGUF models.
5
5
  */
6
- import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
6
/** Cached promise for the lazily imported node-llama-cpp module; null until first requested. */
let nodeLlamaCppImport = null;
/**
 * Import node-llama-cpp on first use and hand back the same promise on every
 * later call. The dynamic import runs through withNativeStdoutRedirectedToStderr
 * so anything written to stdout while the module loads is sent to stderr.
 *
 * @returns {Promise<object>} the node-llama-cpp module namespace
 */
async function loadNodeLlamaCpp() {
    if (nodeLlamaCppImport == null) {
        const importModule = () => import("node-llama-cpp");
        nodeLlamaCppImport = withNativeStdoutRedirectedToStderr(importModule);
    }
    return nodeLlamaCppImport;
}
11
/**
 * Test hook: install a stand-in for the node-llama-cpp module, or pass a falsy
 * value to drop the cached import. Also clears the process-wide record of
 * failed GPU init modes and re-arms the one-time warning flags.
 *
 * @param {object|null|undefined} module replacement module, or falsy to reset
 */
export function setNodeLlamaCppModuleForTest(module) {
    if (module) {
        nodeLlamaCppImport = Promise.resolve(module);
    }
    else {
        nodeLlamaCppImport = null;
    }
    cpuForcedPrebuiltFallbackWarningShown = false;
    noGpuAccelerationWarningShown = false;
    failedGpuInitModes.clear();
}
17
/** Number of withNativeStdoutRedirectedToStderr calls currently in flight. */
let nativeStdoutRedirectDepth = 0;
/** The process.stdout.write function that was installed before redirection began. */
let originalStdoutWrite = null;
/**
 * Some node-llama-cpp native build/probe paths write library noise to stdout.
 * JSON APIs must reserve stdout for machine-readable payloads, so route that
 * noise to stderr while native llama initialization is in progress.
 *
 * Calls may nest or overlap: the redirect is installed by the first caller and
 * removed when the last one finishes, whether `fn` resolves, rejects or throws.
 *
 * @param {() => any} fn work to run while stdout writes are redirected
 * @returns {Promise<any>} whatever `fn` returns or resolves to
 */
export async function withNativeStdoutRedirectedToStderr(fn) {
    if (nativeStdoutRedirectDepth === 0) {
        // Keep the exact function reference rather than a bound copy. The saved
        // function is never invoked while redirected, and restoring the very same
        // reference leaves process.stdout.write identical to what it was before.
        originalStdoutWrite = process.stdout.write;
        process.stdout.write = ((chunk, encodingOrCallback, callback) => {
            if (typeof encodingOrCallback === "function") {
                return process.stderr.write(chunk, encodingOrCallback);
            }
            return process.stderr.write(chunk, encodingOrCallback, callback);
        });
    }
    nativeStdoutRedirectDepth++;
    try {
        return await fn();
    }
    finally {
        nativeStdoutRedirectDepth--;
        if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) {
            process.stdout.write = originalStdoutWrite;
            originalStdoutWrite = null;
        }
    }
}
7
46
  import { homedir } from "os";
8
47
  import { join } from "path";
9
- import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
48
+ import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
10
49
  // =============================================================================
11
50
  // Embedding Formatting Functions
12
51
  // =============================================================================
@@ -23,7 +62,7 @@ export function isQwen3EmbeddingModel(modelUri) {
23
62
  * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
24
63
  */
25
64
  export function formatQueryForEmbedding(query, modelUri) {
26
- const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
65
+ const uri = modelUri ?? resolveEmbedModel();
27
66
  if (isQwen3EmbeddingModel(uri)) {
28
67
  return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
29
68
  }
@@ -35,7 +74,7 @@ export function formatQueryForEmbedding(query, modelUri) {
35
74
  * Qwen3-Embedding encodes documents as raw text without special prefixes.
36
75
  */
37
76
  export function formatDocForEmbedding(text, title, modelUri) {
38
- const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
77
+ const uri = modelUri ?? resolveEmbedModel();
39
78
  if (isQwen3EmbeddingModel(uri)) {
40
79
  // Qwen3-Embedding: documents are raw text, no task prefix
41
80
  return title ? `${title}\n${text}` : text;
@@ -60,6 +99,22 @@ export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5
60
99
  export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
61
100
  export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
62
101
  export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
102
/**
 * Pick the embedding model URI: explicit config first, then the
 * QMD_EMBED_MODEL environment variable, then the built-in default.
 * Empty strings count as "not set" at every step.
 */
export function resolveEmbedModel(config) {
    const fromConfig = config?.embed;
    if (fromConfig) {
        return fromConfig;
    }
    const fromEnv = process.env.QMD_EMBED_MODEL;
    return fromEnv ? fromEnv : DEFAULT_EMBED_MODEL;
}
/**
 * Pick the generation model URI: explicit config first, then the
 * QMD_GENERATE_MODEL environment variable, then the built-in default.
 */
export function resolveGenerateModel(config) {
    const fromConfig = config?.generate;
    if (fromConfig) {
        return fromConfig;
    }
    const fromEnv = process.env.QMD_GENERATE_MODEL;
    return fromEnv ? fromEnv : DEFAULT_GENERATE_MODEL;
}
/**
 * Pick the rerank model URI: explicit config first, then the
 * QMD_RERANK_MODEL environment variable, then the built-in default.
 */
export function resolveRerankModel(config) {
    const fromConfig = config?.rerank;
    if (fromConfig) {
        return fromConfig;
    }
    const fromEnv = process.env.QMD_RERANK_MODEL;
    return fromEnv ? fromEnv : DEFAULT_RERANK_MODEL;
}
/**
 * Resolve all three model URIs at once.
 *
 * @returns {{ embed: string, generate: string, rerank: string }}
 */
export function resolveModels(config) {
    const embed = resolveEmbedModel(config);
    const generate = resolveGenerateModel(config);
    const rerank = resolveRerankModel(config);
    return { embed, generate, rerank };
}
63
118
  // Local model cache directory
64
119
  const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
65
120
  ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
@@ -89,6 +144,106 @@ async function getRemoteEtag(ref) {
89
144
  return null;
90
145
  }
91
146
  }
147
/** The four ASCII bytes every GGUF file starts with. */
const GGUF_MAGIC = Buffer.from("GGUF");
/** Render a byte count as whole kilobytes, e.g. 2048 -> "2 KB". */
function formatModelFileSize(sizeBytes) {
    return `${(sizeBytes / 1024).toFixed(0)} KB`;
}
/**
 * Describe the leading bytes of a file for error messages: the text itself
 * when it is 1-4 printable ASCII characters, otherwise a hex dump. An empty
 * header (zero-length file) is reported as "<empty>".
 */
function printableMagic(header) {
    if (header.length === 0) {
        return "<empty>";
    }
    const text = header.toString("utf-8");
    return /^[\x20-\x7e]{1,4}$/.test(text) ? text : `0x${header.toString("hex")}`;
}
/**
 * Inspect a potential GGUF model file without mutating it.
 * Used by doctor for early diagnostics and by runtime validation before load.
 *
 * Reads at most the first 512 bytes and classifies the file as:
 *   - "missing": path does not exist
 *   - "gguf":    starts with the GGUF magic
 *   - "html":    leading bytes look like an HTML document
 *   - "invalid": anything else, or the file could not be read (in which case
 *                `magic` is absent and `details` carries the error message)
 *
 * @param {string} filePath path of the file to inspect
 * @returns {{ exists: boolean, valid: boolean, kind: string, details: string, sizeBytes?: number, magic?: string }}
 */
export function inspectGgufFile(filePath) {
    if (!existsSync(filePath)) {
        return { exists: false, valid: false, kind: "missing", details: "file does not exist" };
    }
    let sizeBytes = 0;
    try {
        sizeBytes = statSync(filePath).size;
        const fd = openSync(filePath, "r");
        const buffer = Buffer.alloc(512);
        let bytesRead = 0;
        try {
            bytesRead = readSync(fd, buffer, 0, buffer.length, 0);
        }
        finally {
            closeSync(fd);
        }
        // Only look at bytes that actually came from the file; the rest of the
        // buffer is zero padding and must not leak into the reported magic.
        const sniff = buffer.subarray(0, bytesRead);
        const header = sniff.subarray(0, 4);
        if (header.equals(GGUF_MAGIC)) {
            return {
                exists: true,
                valid: true,
                kind: "gguf",
                sizeBytes,
                magic: "GGUF",
                details: `valid GGUF (${formatModelFileSize(sizeBytes)})`,
            };
        }
        const magic = printableMagic(header);
        const text = sniff.toString("utf-8").toLowerCase();
        const isHtml = text.includes("<!doctype") || text.includes("<html");
        if (isHtml) {
            return {
                exists: true,
                valid: false,
                kind: "html",
                sizeBytes,
                magic,
                details: `HTML page, not a GGUF model (${formatModelFileSize(sizeBytes)}); likely proxy/firewall/captive portal response`,
            };
        }
        return {
            exists: true,
            valid: false,
            kind: "invalid",
            sizeBytes,
            magic,
            details: `not valid GGUF (expected magic "GGUF", got "${magic}", ${formatModelFileSize(sizeBytes)})`,
        };
    }
    catch (error) {
        return {
            exists: true,
            valid: false,
            kind: "invalid",
            sizeBytes,
            details: `cannot read model file: ${error instanceof Error ? error.message : String(error)}`,
        };
    }
}
217
/**
 * Validate that a file is actually a GGUF model, not an HTML error page
 * from a proxy, firewall, or failed download.
 * Throws a descriptive error if the file is not valid GGUF.
 *
 * A bad file is deleted (so the next attempt re-downloads) only when the model
 * was requested through a remote URI (hf:, huggingface:, http:, https:). Files
 * referenced by a local path are never deleted, and nothing is deleted when the
 * file could not be read at all.
 *
 * @param {string} filePath resolved local path of the model file
 * @param {string} modelUri the model reference the path was resolved from
 * @throws {Error} when the file exists but is not a readable, valid GGUF file
 */
function validateGgufFile(filePath, modelUri) {
    const inspection = inspectGgufFile(filePath);
    if (!inspection.exists || inspection.valid)
        return; // let downstream handle missing files
    const sizeLabel = formatModelFileSize(inspection.sizeBytes ?? 0);
    // inspectGgufFile omits `magic` only when reading the file failed. The
    // content is unknown in that case, so do not delete anything.
    if (inspection.kind !== "html" && inspection.magic === undefined) {
        throw new Error(`Model file could not be inspected (${inspection.details}).\n` +
            `Model: ${modelUri}\n` +
            `Path: ${filePath}`);
    }
    // Only remove files that came from a download; a local path is the user's own file.
    const isRemoteModel = /^(?:hf|huggingface):|^https?:\/\//i.test(String(modelUri ?? ""));
    let removed = false;
    if (isRemoteModel) {
        try {
            unlinkSync(filePath);
            removed = true;
        }
        catch { /* best effort */ }
    }
    if (inspection.kind === "html") {
        throw new Error(`Downloaded model file is an HTML page, not a GGUF model (${sizeLabel}).\n` +
            `Something is intercepting the download from huggingface.co (a proxy, firewall, or captive portal).\n\n` +
            `Model: ${modelUri}\n` +
            `Path: ${filePath}\n\n` +
            `To fix this, either:\n` +
            ` 1. Try a HuggingFace mirror: HF_ENDPOINT=https://hf-mirror.com qmd embed\n` +
            ` 2. Download the model manually and set the env var, e.g.:\n` +
            ` QMD_EMBED_MODEL=/path/to/model.gguf qmd embed\n\n` +
            `Note: 'qmd search' works without any model downloads.`);
    }
    const nextStep = removed
        ? `The file has been removed. Run the command again to re-download.`
        : `The file was left in place; replace or remove it, then run the command again.`;
    throw new Error(`Model file is not valid GGUF (expected magic "GGUF", got "${inspection.magic ?? "unknown"}", file is ${sizeLabel}).\n` +
        `Model: ${modelUri}\n` +
        `Path: ${filePath}\n\n` +
        nextStep);
}
92
247
  export async function pullModels(models, options = {}) {
93
248
  const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
94
249
  if (!existsSync(cacheDir)) {
@@ -129,7 +284,9 @@ export async function pullModels(models, options = {}) {
129
284
  refreshed = true;
130
285
  }
131
286
  }
287
+ const { resolveModelFile } = await loadNodeLlamaCpp();
132
288
  const path = await resolveModelFile(model, cacheDir);
289
+ validateGgufFile(path, model);
133
290
  const sizeBytes = existsSync(path) ? statSync(path).size : 0;
134
291
  if (hfRef && filename) {
135
292
  const remoteEtag = await getRemoteEtag(hfRef);
@@ -148,6 +305,58 @@ export async function pullModels(models, options = {}) {
148
305
  // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
149
306
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
150
307
  const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
308
/**
 * Parse an explicit embedding-parallelism override.
 *
 * @param {string|undefined} envValue raw setting; defaults to QMD_EMBED_PARALLELISM
 * @returns {number|undefined} a whole number from 1 to 8 (larger requests are
 *   capped at 8), or undefined when the setting is blank or not a positive
 *   integer (the latter also prints a warning to stderr)
 */
export function resolveParallelismOverride(envValue = process.env.QMD_EMBED_PARALLELISM) {
    const raw = envValue?.trim() ?? "";
    if (raw === "") {
        return undefined;
    }
    const requested = Number(raw);
    const isUsable = Number.isInteger(requested) && requested >= 1;
    if (!isUsable) {
        process.stderr.write(`QMD Warning: invalid QMD_EMBED_PARALLELISM="${envValue}", using automatic parallelism.\n`);
        return undefined;
    }
    return requested > 8 ? 8 : requested;
}
319
/**
 * Decide how many contexts to run in parallel.
 *
 * An explicit override (options.envValue, else the environment default used by
 * resolveParallelismOverride) always wins. Otherwise Windows + CUDA is pinned
 * to 1 and every other combination uses options.computed, floored at 1.
 *
 * @param {{ envValue?: string, platform?: string, gpu?: string|false, computed: number }} options
 * @returns {number} the parallelism to use
 */
export function resolveSafeParallelism(options) {
    const forced = resolveParallelismOverride(options.envValue);
    if (forced !== undefined) {
        return forced;
    }
    // node-llama-cpp/llama.cpp CUDA on Windows is unstable with multiple
    // simultaneous contexts (ggml-cuda.cu:98 in #519). Vulkan and CPU do not
    // show the same failure mode, so only serialize Windows CUDA by default.
    const platform = options.platform ?? process.platform;
    const isWindowsCuda = platform === "win32" && options.gpu === "cuda";
    return isWindowsCuda ? 1 : Math.max(1, options.computed);
}
331
/**
 * Work out which GPU backend llama should be asked for.
 *
 * @param {string|undefined} envValue requested backend; defaults to QMD_LLAMA_GPU
 * @param {string|undefined} forceCpuValue CPU-forcing flag; defaults to QMD_FORCE_CPU
 * @returns {"auto"|"metal"|"vulkan"|"cuda"|false} `false` means CPU. Any
 *   non-blank forceCpuValue other than an "off" word forces `false`. A blank
 *   envValue yields "auto"; an unrecognized one warns on stderr and yields "auto".
 */
export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU, forceCpuValue = process.env.QMD_FORCE_CPU) {
    const offWords = new Set(["false", "off", "none", "disable", "disabled", "0"]);
    const forceFlag = forceCpuValue?.trim().toLowerCase() ?? "";
    if (forceFlag !== "" && !offWords.has(forceFlag)) {
        return false;
    }
    const mode = envValue?.trim().toLowerCase() ?? "";
    switch (mode) {
        case "":
            return "auto";
        case "metal":
        case "vulkan":
        case "cuda":
            return mode;
        default:
            if (offWords.has(mode)) {
                return false;
            }
            process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
            return "auto";
    }
}
346
/**
 * Run a dispose callback but stop waiting after `timeoutMs`.
 *
 * Never throws: a timeout or a failure inside `dispose` (thrown synchronously
 * or rejected) is reported as a warning on stderr so shutdown can continue.
 *
 * @param {string} resourceName label used in warning messages
 * @param {() => any} dispose callback that releases the resource
 * @param {number} [timeoutMs=1000] how long to wait before giving up
 * @returns {Promise<void>}
 */
async function disposeWithTimeout(resourceName, dispose, timeoutMs = 1000) {
    // Unique marker so a dispose() that happens to resolve to the string
    // "timeout" is not mistaken for the timer firing.
    const timedOut = Symbol("timeout");
    let timer;
    const timeoutPromise = new Promise((resolve) => {
        timer = setTimeout(() => resolve(timedOut), timeoutMs);
        // The timer alone must not keep the process alive.
        timer.unref();
    });
    try {
        const result = await Promise.race([dispose(), timeoutPromise]);
        if (result === timedOut) {
            process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
        }
    }
    catch (error) {
        process.stderr.write(`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`);
    }
    finally {
        // Do not leave a pending timer behind once the race is decided.
        clearTimeout(timer);
    }
}
151
360
  function resolveExpandContextSize(configValue) {
152
361
  if (configValue !== undefined) {
153
362
  if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -165,6 +374,12 @@ function resolveExpandContextSize(configValue) {
165
374
  }
166
375
  return parsed;
167
376
  }
377
/** GPU modes whose initialization has already failed in this process. */
const failedGpuInitModes = new Set();
/** Set once the "no GPU acceleration" warning has been printed. */
let noGpuAccelerationWarningShown = false;
/** Set once the "CPU-only prebuilt not available" warning has been printed. */
let cpuForcedPrebuiltFallbackWarningShown = false;
/**
 * Report whether the current settings ask for CPU-only operation, i.e.
 * whether resolveLlamaGpuMode() (called with its defaults) yields `false`.
 *
 * @returns {boolean}
 */
function isCpuModeRequested() {
    const requestedMode = resolveLlamaGpuMode();
    return requestedMode === false;
}
168
383
  export class LlamaCpp {
169
384
  _ciMode = !!process.env.CI;
170
385
  llama = null;
@@ -189,9 +404,9 @@ export class LlamaCpp {
189
404
  // Track disposal state to prevent double-dispose
190
405
  disposed = false;
191
406
  constructor(config = {}) {
192
- this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
193
- this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
194
- this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
407
+ this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
408
+ this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
409
+ this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
195
410
  this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
196
411
  this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
197
412
  this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
@@ -200,6 +415,12 @@ export class LlamaCpp {
200
415
  get embedModelName() {
201
416
  return this.embedModelUri;
202
417
  }
418
+ get generateModelName() {
419
+ return this.generateModelUri;
420
+ }
421
+ get rerankModelName() {
422
+ return this.rerankModelUri;
423
+ }
203
424
  /**
204
425
  * Reset the inactivity timer. Called after each model operation.
205
426
  * When timer fires, models are unloaded to free memory (if no active sessions).
@@ -292,45 +513,113 @@ export class LlamaCpp {
292
513
  /**
293
514
  * Initialize the llama instance (lazy)
294
515
  */
295
- async ensureLlama() {
516
+ async ensureLlama(allowBuild = true) {
296
517
  if (!this.llama) {
297
- // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
298
- const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
299
- const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
300
- const loadLlama = async (gpu) => await getLlama({
301
- build: "autoAttempt",
518
+ const gpuMode = resolveLlamaGpuMode();
519
+ const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
520
+ const loadLlama = async (gpu, sourceBuildAllowed = allowBuild, buildOverride) => await withNativeStdoutRedirectedToStderr(() => getLlama({
521
+ // Prefer packaged prebuilt bindings before compiling llama.cpp locally.
522
+ // node-llama-cpp documents gpu:"auto" as the best default: Metal on
523
+ // Apple Silicon, CUDA when fully available, Vulkan where available,
524
+ // then CPU. Use build:"auto" for normal loads and build:"never" for
525
+ // diagnostic/probe paths that must not compile llama.cpp.
526
+ build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
302
527
  logLevel: LlamaLogLevel.error,
303
528
  gpu,
304
- });
529
+ progressLogs: false,
530
+ skipDownload: !sourceBuildAllowed,
531
+ }));
532
+ const loadCpuCompatibleLlama = async () => {
533
+ try {
534
+ return await loadLlama(false, false);
535
+ }
536
+ catch (err) {
537
+ // Some platforms, notably Apple Silicon, ship a Metal prebuilt but no
538
+ // CPU-only prebuilt. Do a fast no-build lookup for an actual CPU
539
+ // binding first; if it does not exist, use the packaged auto/Metal
540
+ // binding and disable model offloading via gpuLayers: 0.
541
+ if (!cpuForcedPrebuiltFallbackWarningShown) {
542
+ cpuForcedPrebuiltFallbackWarningShown = true;
543
+ process.stderr.write(`QMD Warning: CPU-only llama.cpp prebuilt not available (${err instanceof Error ? err.message : String(err)}); using packaged backend with GPU offloading disabled.\n`);
544
+ }
545
+ return await loadLlama("auto", false);
546
+ }
547
+ };
305
548
  let llama;
306
- if (forceCpu) {
307
- llama = await loadLlama(false);
549
+ if (gpuMode === false) {
550
+ llama = await loadCpuCompatibleLlama();
551
+ }
552
+ else if (failedGpuInitModes.has(gpuMode)) {
553
+ process.stderr.write(`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`);
554
+ llama = await loadCpuCompatibleLlama();
308
555
  }
309
556
  else {
310
557
  try {
311
- llama = await loadLlama("auto");
558
+ llama = await loadLlama(gpuMode);
559
+ // If node-llama-cpp auto-detection chose CPU, do one no-build pass
560
+ // over all OS-valid packaged GPU backends. This preserves the
561
+ // documented auto mode for Metal/CUDA/Vulkan while recovering on
562
+ // systems where a packaged backend can load but detection is too
563
+ // conservative. Never compile during these extra probes.
564
+ if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
565
+ const candidates = (await getLlamaGpuTypes("allValid"))
566
+ .filter((candidate) => candidate !== false && candidate !== "auto");
567
+ for (const candidate of candidates) {
568
+ if (failedGpuInitModes.has(candidate))
569
+ continue;
570
+ try {
571
+ const gpuLlama = await loadLlama(candidate, false, "never");
572
+ if (gpuLlama.gpu !== false) {
573
+ await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
574
+ llama = gpuLlama;
575
+ break;
576
+ }
577
+ await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
578
+ }
579
+ catch {
580
+ failedGpuInitModes.add(candidate);
581
+ }
582
+ }
583
+ }
312
584
  }
313
585
  catch (err) {
314
- // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
315
- // Fall back to CPU so qmd still works.
316
- process.stderr.write(`QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
317
- llama = await loadLlama(false);
586
+ // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
587
+ // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
588
+ // expensive native build/probe attempts in this process.
589
+ failedGpuInitModes.add(gpuMode);
590
+ process.stderr.write(`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
591
+ llama = await loadCpuCompatibleLlama();
318
592
  }
319
593
  }
320
- if (llama.gpu === false) {
321
- process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
594
+ if (llama.gpu === false && !noGpuAccelerationWarningShown) {
595
+ noGpuAccelerationWarningShown = true;
596
+ process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n");
322
597
  }
323
598
  this.llama = llama;
324
599
  }
325
600
  return this.llama;
326
601
  }
602
+ isCpuOffloadForced() {
603
+ return isCpuModeRequested();
604
+ }
605
+ modelLoadOptions(modelPath) {
606
+ return {
607
+ modelPath,
608
+ ...(this.isCpuOffloadForced() ? { gpuLayers: 0 } : {}),
609
+ };
610
+ }
327
611
  /**
328
- * Resolve a model URI to a local path, downloading if needed
612
+ * Resolve a model URI to a local path, downloading if needed.
613
+ * Validates the downloaded file is actually a GGUF model (not an HTML error page
614
+ * from a proxy or firewall).
329
615
  */
330
616
  async resolveModel(modelUri) {
331
617
  this.ensureModelCacheDir();
332
618
  // resolveModelFile handles HF URIs and downloads to the cache dir
333
- return await resolveModelFile(modelUri, this.modelCacheDir);
619
+ const { resolveModelFile } = await loadNodeLlamaCpp();
620
+ const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
621
+ validateGgufFile(modelPath, modelUri);
622
+ return modelPath;
334
623
  }
335
624
  /**
336
625
  * Load embedding model (lazy)
@@ -345,7 +634,7 @@ export class LlamaCpp {
345
634
  this.embedModelLoadPromise = (async () => {
346
635
  const llama = await this.ensureLlama();
347
636
  const modelPath = await this.resolveModel(this.embedModelUri);
348
- const model = await llama.loadModel({ modelPath });
637
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
349
638
  this.embedModel = model;
350
639
  // Model loading counts as activity - ping to keep alive
351
640
  this.touchActivity();
@@ -369,21 +658,23 @@ export class LlamaCpp {
369
658
  */
370
659
  async computeParallelism(perContextMB) {
371
660
  const llama = await this.ensureLlama();
372
- if (llama.gpu) {
661
+ if (!this.isCpuOffloadForced() && llama.gpu) {
373
662
  try {
374
663
  const vram = await llama.getVramState();
375
664
  const freeMB = vram.free / (1024 * 1024);
376
665
  const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
377
- return Math.max(1, Math.min(8, maxByVram));
666
+ const computed = Math.max(1, Math.min(8, maxByVram));
667
+ return resolveSafeParallelism({ gpu: llama.gpu, computed });
378
668
  }
379
669
  catch {
380
- return 2;
670
+ return resolveSafeParallelism({ gpu: llama.gpu, computed: 2 });
381
671
  }
382
672
  }
383
673
  // CPU: split cores across contexts. At least 4 threads per context.
384
674
  const cores = llama.cpuMathCores || 4;
385
675
  const maxContexts = Math.floor(cores / 4);
386
- return Math.max(1, Math.min(4, maxContexts));
676
+ const computed = Math.max(1, Math.min(4, maxContexts));
677
+ return resolveSafeParallelism({ gpu: false, computed });
387
678
  }
388
679
  /**
389
680
  * Get the number of threads each context should use, given N parallel contexts.
@@ -391,7 +682,7 @@ export class LlamaCpp {
391
682
  */
392
683
  async threadsPerContext(parallelism) {
393
684
  const llama = await this.ensureLlama();
394
- if (llama.gpu)
685
+ if (!this.isCpuOffloadForced() && llama.gpu)
395
686
  return 0; // GPU: let the library decide
396
687
  const cores = llama.cpuMathCores || 4;
397
688
  return Math.max(1, Math.floor(cores / parallelism));
@@ -455,7 +746,7 @@ export class LlamaCpp {
455
746
  this.generateModelLoadPromise = (async () => {
456
747
  const llama = await this.ensureLlama();
457
748
  const modelPath = await this.resolveModel(this.generateModelUri);
458
- const model = await llama.loadModel({ modelPath });
749
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
459
750
  this.generateModel = model;
460
751
  return model;
461
752
  })();
@@ -485,7 +776,7 @@ export class LlamaCpp {
485
776
  this.rerankModelLoadPromise = (async () => {
486
777
  const llama = await this.ensureLlama();
487
778
  const modelPath = await this.resolveModel(this.rerankModelUri);
488
- const model = await llama.loadModel({ modelPath });
779
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
489
780
  this.rerankModel = model;
490
781
  // Model loading counts as activity - ping to keep alive
491
782
  this.touchActivity();
@@ -532,7 +823,6 @@ export class LlamaCpp {
532
823
  try {
533
824
  this.rerankContexts.push(await model.createRankingContext({
534
825
  contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
535
- flashAttention: true,
536
826
  ...(threads > 0 ? { threads } : {}),
537
827
  }));
538
828
  }
@@ -596,20 +886,27 @@ export class LlamaCpp {
596
886
  * detokenizes back to text if truncation is needed.
597
887
  * Returns the (possibly truncated) text and whether truncation occurred.
598
888
  */
889
+ resolveEmbedTokenLimit() {
890
+ const trainedContextSize = this.embedModel?.trainContextSize;
891
+ if (typeof trainedContextSize === "number" && Number.isFinite(trainedContextSize) && trainedContextSize > 0) {
892
+ return Math.max(1, Math.min(LlamaCpp.EMBED_CONTEXT_SIZE, trainedContextSize));
893
+ }
894
+ return LlamaCpp.EMBED_CONTEXT_SIZE;
895
+ }
599
896
  async truncateToContextSize(text) {
600
897
  if (!this.embedModel)
601
- return { text, truncated: false };
602
- const maxTokens = this.embedModel.trainContextSize;
898
+ return { text, truncated: false, limit: LlamaCpp.EMBED_CONTEXT_SIZE };
899
+ const maxTokens = this.resolveEmbedTokenLimit();
603
900
  if (maxTokens <= 0)
604
- return { text, truncated: false };
901
+ return { text, truncated: false, limit: maxTokens };
605
902
  const tokens = this.embedModel.tokenize(text);
606
903
  if (tokens.length <= maxTokens)
607
- return { text, truncated: false };
904
+ return { text, truncated: false, limit: maxTokens };
608
905
  // Leave a small margin (4 tokens) for BOS/EOS overhead
609
906
  const safeLimit = Math.max(1, maxTokens - 4);
610
907
  const truncatedTokens = tokens.slice(0, safeLimit);
611
908
  const truncatedText = this.embedModel.detokenize(truncatedTokens);
612
- return { text: truncatedText, truncated: true };
909
+ return { text: truncatedText, truncated: true, limit: maxTokens };
613
910
  }
614
911
  async embed(text, options = {}) {
615
912
  // Ping activity at start to keep models alive during this operation
@@ -617,9 +914,9 @@ export class LlamaCpp {
617
914
  try {
618
915
  const context = await this.ensureEmbedContext();
619
916
  // Guard: truncate text that exceeds model context window to prevent GGML crash
620
- const { text: safeText, truncated } = await this.truncateToContextSize(text);
917
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
621
918
  if (truncated) {
622
- console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
919
+ console.warn(`⚠ Text truncated to fit embedding context (${limit} tokens)`);
623
920
  }
624
921
  const embedding = await context.getEmbeddingFor(safeText);
625
922
  return {
@@ -652,9 +949,9 @@ export class LlamaCpp {
652
949
  const embeddings = [];
653
950
  for (const text of texts) {
654
951
  try {
655
- const { text: safeText, truncated } = await this.truncateToContextSize(text);
952
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
656
953
  if (truncated) {
657
- console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
954
+ console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
658
955
  }
659
956
  const embedding = await context.getEmbeddingFor(safeText);
660
957
  this.touchActivity();
@@ -675,9 +972,9 @@ export class LlamaCpp {
675
972
  const results = [];
676
973
  for (const text of chunk) {
677
974
  try {
678
- const { text: safeText, truncated } = await this.truncateToContextSize(text);
975
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
679
976
  if (truncated) {
680
- console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
977
+ console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
681
978
  }
682
979
  const embedding = await ctx.getEmbeddingFor(safeText);
683
980
  this.touchActivity();
@@ -707,6 +1004,7 @@ export class LlamaCpp {
707
1004
  // Create fresh context -> sequence -> session for each call
708
1005
  const context = await this.generateModel.createContext();
709
1006
  const sequence = context.getSequence();
1007
+ const { LlamaChatSession } = await loadNodeLlamaCpp();
710
1008
  const session = new LlamaChatSession({ contextSequence: sequence });
711
1009
  const maxTokens = options.maxTokens ?? 150;
712
1010
  // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
@@ -776,6 +1074,7 @@ export class LlamaCpp {
776
1074
  contextSize: this.expandContextSize,
777
1075
  });
778
1076
  const sequence = genContext.getSequence();
1077
+ const { LlamaChatSession } = await loadNodeLlamaCpp();
779
1078
  const session = new LlamaChatSession({ contextSequence: sequence });
780
1079
  try {
781
1080
  // Qwen3 recommended settings for non-thinking mode:
@@ -916,11 +1215,12 @@ export class LlamaCpp {
916
1215
  * Get device/GPU info for status display.
917
1216
  * Initializes llama if not already done.
918
1217
  */
919
- async getDeviceInfo() {
920
- const llama = await this.ensureLlama();
921
- const gpuDevices = await llama.getGpuDeviceNames();
1218
+ async getDeviceInfo(options = {}) {
1219
+ const llama = await this.ensureLlama(options.allowBuild ?? true);
1220
+ const cpuForced = this.isCpuOffloadForced();
1221
+ const gpuDevices = cpuForced ? [] : await llama.getGpuDeviceNames();
922
1222
  let vram;
923
- if (llama.gpu) {
1223
+ if (!cpuForced && llama.gpu) {
924
1224
  try {
925
1225
  const state = await llama.getVramState();
926
1226
  vram = { total: state.total, used: state.used, free: state.free };
@@ -928,8 +1228,8 @@ export class LlamaCpp {
928
1228
  catch { /* no vram info */ }
929
1229
  }
930
1230
  return {
931
- gpu: llama.gpu,
932
- gpuOffloading: llama.supportsGpuOffloading,
1231
+ gpu: cpuForced ? false : llama.gpu,
1232
+ gpuOffloading: !cpuForced && llama.supportsGpuOffloading,
933
1233
  gpuDevices,
934
1234
  vram,
935
1235
  cpuCores: llama.cpuMathCores,
@@ -946,21 +1246,34 @@ export class LlamaCpp {
946
1246
  clearTimeout(this.inactivityTimer);
947
1247
  this.inactivityTimer = null;
948
1248
  }
949
- // Disposing llama cascades to models and contexts automatically
950
- // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
951
- // Note: llama.dispose() can hang indefinitely, so we use a timeout
952
- if (this.llama) {
953
- const disposePromise = this.llama.dispose();
954
- const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
955
- await Promise.race([disposePromise, timeoutPromise]);
1249
+ // Explicitly dispose in dependency order: contexts first, then models, then llama.
1250
+ // Relying only on llama.dispose() leaves Metal resource sets alive until process
1251
+ // finalization on Apple Silicon, where ggml_metal_device_free can abort after
1252
+ // otherwise-successful CLI output (#368).
1253
+ for (const ctx of this.embedContexts) {
1254
+ await disposeWithTimeout("embedding context", () => ctx.dispose());
956
1255
  }
957
- // Clear references
958
1256
  this.embedContexts = [];
1257
+ for (const ctx of this.rerankContexts) {
1258
+ await disposeWithTimeout("rerank context", () => ctx.dispose());
1259
+ }
959
1260
  this.rerankContexts = [];
960
- this.embedModel = null;
961
- this.generateModel = null;
962
- this.rerankModel = null;
963
- this.llama = null;
1261
+ if (this.embedModel) {
1262
+ await disposeWithTimeout("embedding model", () => this.embedModel.dispose());
1263
+ this.embedModel = null;
1264
+ }
1265
+ if (this.generateModel) {
1266
+ await disposeWithTimeout("generation model", () => this.generateModel.dispose());
1267
+ this.generateModel = null;
1268
+ }
1269
+ if (this.rerankModel) {
1270
+ await disposeWithTimeout("rerank model", () => this.rerankModel.dispose());
1271
+ this.rerankModel = null;
1272
+ }
1273
+ if (this.llama) {
1274
+ await disposeWithTimeout("llama runtime", () => this.llama.dispose());
1275
+ this.llama = null;
1276
+ }
964
1277
  // Clear any in-flight load/create promises
965
1278
  this.embedModelLoadPromise = null;
966
1279
  this.embedContextsCreatePromise = null;