@joycodetech/qmd-ja 2.5.3-ja.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CHANGELOG.md +821 -0
  2. package/LICENSE +21 -0
  3. package/README.md +1143 -0
  4. package/bin/qmd-ja +162 -0
  5. package/dist/ast.d.ts +65 -0
  6. package/dist/ast.js +334 -0
  7. package/dist/bench/bench.d.ts +23 -0
  8. package/dist/bench/bench.js +280 -0
  9. package/dist/bench/score.d.ts +33 -0
  10. package/dist/bench/score.js +88 -0
  11. package/dist/bench/types.d.ts +80 -0
  12. package/dist/bench/types.js +8 -0
  13. package/dist/cli/formatter.d.ts +120 -0
  14. package/dist/cli/formatter.js +355 -0
  15. package/dist/cli/qmd.d.ts +43 -0
  16. package/dist/cli/qmd.js +4179 -0
  17. package/dist/collections.d.ts +166 -0
  18. package/dist/collections.js +410 -0
  19. package/dist/db.d.ts +44 -0
  20. package/dist/db.js +75 -0
  21. package/dist/index.d.ts +230 -0
  22. package/dist/index.js +242 -0
  23. package/dist/llm.d.ts +500 -0
  24. package/dist/llm.js +1615 -0
  25. package/dist/maintenance.d.ts +23 -0
  26. package/dist/maintenance.js +37 -0
  27. package/dist/mcp/server.d.ts +24 -0
  28. package/dist/mcp/server.js +702 -0
  29. package/dist/paths.d.ts +1 -0
  30. package/dist/paths.js +4 -0
  31. package/dist/store.d.ts +1002 -0
  32. package/dist/store.js +4208 -0
  33. package/models/vaporetto-bccwj.model +0 -0
  34. package/package.json +130 -0
  35. package/scripts/build.mjs +30 -0
  36. package/scripts/check-package-grammars.mjs +29 -0
  37. package/scripts/package-smoke.mjs +65 -0
  38. package/scripts/test-all.mjs +38 -0
  39. package/skills/qmd/SKILL.md +295 -0
  40. package/skills/qmd/references/mcp-setup.md +102 -0
  41. package/skills/release/SKILL.md +139 -0
  42. package/skills/release/scripts/install-hooks.sh +38 -0
  43. package/vendor/vaporetto-node-wasm/LICENSE +22 -0
  44. package/vendor/vaporetto-node-wasm/package.json +11 -0
  45. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm.d.ts +19 -0
  46. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm.js +202 -0
  47. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm_bg.wasm +0 -0
  48. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm_bg.wasm.d.ts +13 -0
package/dist/llm.js ADDED
@@ -0,0 +1,1615 @@
1
+ /**
2
+ * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
3
+ *
4
+ * Provides embeddings, text generation, and reranking using local GGUF models.
5
+ */
6
/** Memoized promise for the lazily imported node-llama-cpp module; null until first use. */
let nodeLlamaCppImport = null;
/**
 * Import node-llama-cpp on first use and hand back the same promise afterwards.
 * The dynamic import runs inside withNativeStdoutRedirectedToStderr so anything
 * written to stdout while the module loads is routed to stderr.
 *
 * @returns {Promise<object>} resolves to the node-llama-cpp module (or the test double).
 */
async function loadNodeLlamaCpp() {
    if (nodeLlamaCppImport === null || nodeLlamaCppImport === undefined) {
        const importModule = () => import("node-llama-cpp");
        nodeLlamaCppImport = withNativeStdoutRedirectedToStderr(importModule);
    }
    return nodeLlamaCppImport;
}
11
/**
 * Test hook: inject a stand-in node-llama-cpp module (or pass a falsy value to
 * drop the memoized import) and reset the module-level GPU-failure cache and
 * one-shot warning flags.
 *
 * @param {object|null|undefined} module - replacement module, or falsy to clear.
 */
export function setNodeLlamaCppModuleForTest(module) {
    if (module) {
        nodeLlamaCppImport = Promise.resolve(module);
    }
    else {
        nodeLlamaCppImport = null;
    }
    failedGpuInitModes.clear();
    cpuForcedPrebuiltFallbackWarningShown = false;
    noGpuAccelerationWarningShown = false;
}
17
/** Number of redirect scopes currently active; stdout stays patched while this is > 0. */
let nativeStdoutRedirectDepth = 0;
/** Bound copy of the real process.stdout.write, held only while a redirect is active. */
let originalStdoutWrite = null;
/**
 * Run `fn` with process.stdout.write temporarily forwarding to process.stderr.
 *
 * Some node-llama-cpp native build/probe paths write library noise to stdout,
 * while JSON APIs need stdout reserved for machine-readable payloads. Calls may
 * nest: only the outermost call installs the patch and only the outermost exit
 * restores the original writer.
 *
 * @param {Function} fn - sync or async callback to run while redirected.
 * @returns {Promise<*>} whatever `fn` returns/resolves to; rejections propagate.
 */
export async function withNativeStdoutRedirectedToStderr(fn) {
    const isOutermost = nativeStdoutRedirectDepth === 0;
    if (isOutermost) {
        originalStdoutWrite = process.stdout.write.bind(process.stdout);
        // Mirror the write(chunk, cb) / write(chunk, encoding, cb) overloads.
        process.stdout.write = (chunk, encodingOrCallback, callback) => typeof encodingOrCallback === "function"
            ? process.stderr.write(chunk, encodingOrCallback)
            : process.stderr.write(chunk, encodingOrCallback, callback);
    }
    nativeStdoutRedirectDepth += 1;
    try {
        return await fn();
    }
    finally {
        nativeStdoutRedirectDepth -= 1;
        const savedWrite = originalStdoutWrite;
        if (nativeStdoutRedirectDepth === 0 && savedWrite) {
            process.stdout.write = savedWrite;
            originalStdoutWrite = null;
        }
    }
}
46
+ import { homedir } from "os";
47
+ import { join } from "path";
48
+ import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
49
+ // =============================================================================
50
+ // Embedding Formatting Functions
51
+ // =============================================================================
52
/**
 * Report whether a model URI names a Qwen embedding model.
 * Matches, case-insensitively, "qwen" followed later by "embed" or the reverse.
 * Qwen3-Embedding needs a different prompt format than nomic/embeddinggemma.
 *
 * @param {string} modelUri - model URI or path to inspect.
 * @returns {boolean} true when the URI looks like a Qwen embedding model.
 */
export function isQwen3EmbeddingModel(modelUri) {
    const patterns = [/qwen.*embed/i, /embed.*qwen/i];
    return patterns.some((pattern) => pattern.test(modelUri));
}
59
/**
 * Wrap a search query in the prompt format the active embedding model expects.
 * Qwen embedding models get the "Instruct/Query" format; every other model gets
 * the nomic-style task prefix.
 *
 * @param {string} query - raw user query.
 * @param {string} [modelUri] - model to format for; defaults to resolveEmbedModel().
 * @returns {string} the formatted prompt.
 */
export function formatQueryForEmbedding(query, modelUri) {
    const activeModel = modelUri ?? resolveEmbedModel();
    return isQwen3EmbeddingModel(activeModel)
        ? `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`
        : `task: search result | query: ${query}`;
}
71
/**
 * Wrap a document chunk in the prompt format the active embedding model expects.
 * Qwen embedding models take raw text (title on its own first line when present);
 * every other model gets the nomic-style "title: … | text: …" form, with the
 * title falling back to "none".
 *
 * @param {string} text - document body.
 * @param {string} [title] - optional document title.
 * @param {string} [modelUri] - model to format for; defaults to resolveEmbedModel().
 * @returns {string} the formatted document.
 */
export function formatDocForEmbedding(text, title, modelUri) {
    const activeModel = modelUri ?? resolveEmbedModel();
    if (!isQwen3EmbeddingModel(activeModel)) {
        return `title: ${title || "none"} | text: ${text}`;
    }
    // Qwen3-Embedding: documents are raw text, no task prefix
    if (!title) {
        return text;
    }
    return `${title}\n${text}`;
}
84
+ // =============================================================================
85
+ // Model Configuration
86
+ // =============================================================================
87
+ // HuggingFace model URIs for node-llama-cpp
88
+ // Format: hf:<user>/<repo>/<file>
89
+ // Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
90
+ const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
91
+ const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
92
+ // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
93
+ const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
94
+ // Alternative generation models for query expansion:
95
+ // LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
96
+ // Use these as base for fine-tuning with configs/sft_lfm2.yaml
97
+ export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
98
+ export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
99
+ export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
100
+ export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
101
+ export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
102
/**
 * Pick the embedding model URI: explicit config, then QMD_EMBED_MODEL, then the default.
 * Empty strings count as "not set" at each step.
 *
 * @param {{embed?: string}} [config] - optional per-call model overrides.
 * @returns {string} the model URI to use.
 */
export function resolveEmbedModel(config) {
    const configured = config?.embed;
    if (configured) {
        return configured;
    }
    const fromEnv = process.env.QMD_EMBED_MODEL;
    return fromEnv ? fromEnv : DEFAULT_EMBED_MODEL;
}
105
/**
 * Pick the generation model URI: explicit config, then QMD_GENERATE_MODEL, then the default.
 * Empty strings count as "not set" at each step.
 *
 * @param {{generate?: string}} [config] - optional per-call model overrides.
 * @returns {string} the model URI to use.
 */
export function resolveGenerateModel(config) {
    const configured = config?.generate;
    if (configured) {
        return configured;
    }
    const fromEnv = process.env.QMD_GENERATE_MODEL;
    return fromEnv ? fromEnv : DEFAULT_GENERATE_MODEL;
}
108
/**
 * Pick the reranker model URI: explicit config, then QMD_RERANK_MODEL, then the default.
 * Empty strings count as "not set" at each step.
 *
 * @param {{rerank?: string}} [config] - optional per-call model overrides.
 * @returns {string} the model URI to use.
 */
export function resolveRerankModel(config) {
    const configured = config?.rerank;
    if (configured) {
        return configured;
    }
    const fromEnv = process.env.QMD_RERANK_MODEL;
    return fromEnv ? fromEnv : DEFAULT_RERANK_MODEL;
}
111
/**
 * Resolve all three model URIs (embed, generate, rerank) in one call.
 *
 * @param {{embed?: string, generate?: string, rerank?: string}} [config] - optional overrides.
 * @returns {{embed: string, generate: string, rerank: string}} resolved URIs.
 */
export function resolveModels(config) {
    const embed = resolveEmbedModel(config);
    const generate = resolveGenerateModel(config);
    const rerank = resolveRerankModel(config);
    return { embed, generate, rerank };
}
118
+ // Local model cache directory
119
+ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
120
+ ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
121
+ : join(homedir(), ".cache", "qmd", "models");
122
+ export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
123
/**
 * Split an `hf:<user>/<repo>/<file>` URI into its repository and file path.
 *
 * @param {string} model - model URI.
 * @returns {{repo: string, file: string}|null} parsed parts, or null when the
 *   URI lacks the `hf:` prefix or has fewer than three path segments.
 */
function parseHfUri(model) {
    const prefix = "hf:";
    if (!model.startsWith(prefix)) {
        return null;
    }
    const [user, repoName, ...fileSegments] = model.slice(prefix.length).split("/");
    if (fileSegments.length === 0) {
        return null;
    }
    return { repo: `${user}/${repoName}`, file: fileSegments.join("/") };
}
134
/**
 * Ask huggingface.co for the current ETag of a model file via a HEAD request.
 *
 * @param {{repo: string, file: string}} ref - parsed HuggingFace reference.
 * @returns {Promise<string|null>} the ETag header value, or null when the
 *   request fails, the response is not OK, or no ETag header is present.
 */
async function getRemoteEtag(ref) {
    const { repo, file } = ref;
    const target = `https://huggingface.co/${repo}/resolve/main/${file}`;
    try {
        const response = await fetch(target, { method: "HEAD" });
        if (response.ok) {
            return response.headers.get("etag") || null;
        }
    }
    catch {
        // Network errors are treated the same as "no ETag available".
    }
    return null;
}
147
/** The four ASCII bytes every GGUF file starts with. */
const GGUF_MAGIC = Buffer.from("GGUF");
/**
 * Render a byte count as whole kilobytes (1 KB = 1024 bytes) for diagnostics.
 *
 * @param {number} sizeBytes - size in bytes.
 * @returns {string} e.g. "2 KB".
 */
function formatModelFileSize(sizeBytes) {
    const kilobytes = sizeBytes / 1024;
    return kilobytes.toFixed(0) + " KB";
}
151
/**
 * Produce a log-friendly rendering of a file's leading magic bytes.
 *
 * @param {Buffer} header - the bytes to describe.
 * @returns {string} the bytes as text when they decode to 1–4 printable ASCII
 *   characters, otherwise a `0x…` hex string.
 */
function printableMagic(header) {
    const asText = header.toString("utf-8");
    const isPrintableAscii = /^[\x20-\x7e]{1,4}$/.test(asText);
    if (isPrintableAscii) {
        return asText;
    }
    return "0x" + header.toString("hex");
}
155
/**
 * Classify a file as a GGUF model, an HTML page, or something else, without
 * modifying it. Reads up to the first 512 bytes and checks the 4-byte magic.
 *
 * @param {string} filePath - path of the candidate model file.
 * @returns {{exists: boolean, valid: boolean, kind: string, sizeBytes?: number,
 *   magic?: string, details: string}} inspection result. `kind` is one of
 *   "missing", "gguf", "html" or "invalid"; read failures are reported as
 *   "invalid" with the error message in `details` and no `magic` field.
 */
export function inspectGgufFile(filePath) {
    if (!existsSync(filePath)) {
        return { exists: false, valid: false, kind: "missing", details: "file does not exist" };
    }
    const SNIFF_LENGTH = 512;
    let sizeBytes = 0;
    try {
        sizeBytes = statSync(filePath).size;
        const fd = openSync(filePath, "r");
        const sniff = Buffer.alloc(SNIFF_LENGTH);
        try {
            readSync(fd, sniff, 0, SNIFF_LENGTH, 0);
        }
        finally {
            closeSync(fd);
        }
        // Shared result shape for every outcome that managed to read the header.
        const report = (valid, kind, magic, details) => ({ exists: true, valid, kind, sizeBytes, magic, details });
        const prettySize = formatModelFileSize(sizeBytes);
        const header = sniff.subarray(0, 4);
        if (header.equals(GGUF_MAGIC)) {
            return report(true, "gguf", "GGUF", `valid GGUF (${prettySize})`);
        }
        const magic = printableMagic(header);
        const lowered = sniff.toString("utf-8").toLowerCase();
        const looksLikeHtml = ["<!doctype", "<html"].some((marker) => lowered.includes(marker));
        if (looksLikeHtml) {
            return report(false, "html", magic, `HTML page, not a GGUF model (${prettySize}); likely proxy/firewall/captive portal response`);
        }
        return report(false, "invalid", magic, `not valid GGUF (expected magic "GGUF", got "${magic}", ${prettySize})`);
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        return {
            exists: true,
            valid: false,
            kind: "invalid",
            sizeBytes,
            details: `cannot read model file: ${reason}`,
        };
    }
}
217
/**
 * Validate that a resolved model file is actually GGUF, not an HTML error page
 * from a proxy/firewall or some other failed download.
 *
 * Missing files and valid GGUF files return silently (missing files are left
 * for downstream code to report). For a file whose header was read and is
 * wrong, the file is deleted on a best-effort basis so the next attempt
 * re-downloads it, and a descriptive error is thrown. A file that could not be
 * read at all (inspection carries no `magic`) is NOT deleted, and the
 * underlying read error is surfaced instead of a misleading magic mismatch.
 *
 * NOTE(review): deletion also applies to a user-supplied local model path that
 * fails validation — confirm this is intended.
 *
 * @param {string} filePath - resolved local path of the model file.
 * @param {string} modelUri - URI the file was resolved from (for the message).
 * @throws {Error} when the file exists but is not a valid GGUF model.
 */
function validateGgufFile(filePath, modelUri) {
    const inspection = inspectGgufFile(filePath);
    if (!inspection.exists || inspection.valid)
        return; // let downstream handle missing files
    const prettySize = formatModelFileSize(inspection.sizeBytes ?? 0);
    // A read failure tells us nothing about the contents: report it as-is and
    // keep the file rather than deleting something we never inspected.
    if (inspection.kind !== "html" && inspection.magic === undefined) {
        throw new Error(`Model file could not be inspected (${inspection.details}).\n` +
            `Model: ${modelUri}\n` +
            `Path: ${filePath}`);
    }
    // Remove the bad file so the next attempt re-downloads
    let removed = false;
    try {
        unlinkSync(filePath);
        removed = true;
    }
    catch { /* best effort */ }
    if (inspection.kind === "html") {
        throw new Error(`Downloaded model file is an HTML page, not a GGUF model (${prettySize}).\n` +
            `Something is intercepting the download from huggingface.co (a proxy, firewall, or captive portal).\n\n` +
            `Model: ${modelUri}\n` +
            `Path: ${filePath}\n\n` +
            `To fix this, either:\n` +
            `  1. Try a HuggingFace mirror: HF_ENDPOINT=https://hf-mirror.com qmd embed\n` +
            `  2. Download the model manually and set the env var, e.g.:\n` +
            `     QMD_EMBED_MODEL=/path/to/model.gguf qmd embed\n\n` +
            `Note: 'qmd search' works without any model downloads.`);
    }
    // Only claim the file is gone when the unlink actually succeeded.
    const nextStep = removed
        ? `The file has been removed. Run the command again to re-download.`
        : `The file could not be removed automatically; delete it and run the command again to re-download.`;
    throw new Error(`Model file is not valid GGUF (expected magic "GGUF", got "${inspection.magic ?? "unknown"}", file is ${prettySize}).\n` +
        `Model: ${modelUri}\n` +
        `Path: ${filePath}\n\n` +
        nextStep);
}
247
/**
 * Ensure each requested model is present in the cache directory, refreshing
 * HuggingFace models whose remote ETag differs from the recorded one.
 *
 * For `hf:` models an ETag sidecar (`<filename>.etag`) is kept next to the
 * cached file. Cached copies are purged before re-resolving when
 * `options.refresh` is set, when the remote ETag is known and differs from the
 * sidecar, or when nothing is cached yet. If the remote ETag cannot be fetched
 * (offline, blocked host) the existing cache is kept rather than deleted.
 *
 * @param {string[]} models - model URIs or paths to pull.
 * @param {{cacheDir?: string, refresh?: boolean}} [options] - cache directory
 *   override and forced-refresh flag.
 * @returns {Promise<Array<{model: string, path: string, sizeBytes: number, refreshed: boolean}>>}
 *   one entry per model; `refreshed` is true when previously cached files were replaced.
 * @throws {Error} when a resolved file fails GGUF validation or resolution fails.
 */
export async function pullModels(models, options = {}) {
    const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
    if (!existsSync(cacheDir)) {
        mkdirSync(cacheDir, { recursive: true });
    }
    const removeIfPresent = (target) => {
        if (existsSync(target))
            unlinkSync(target);
    };
    const results = [];
    for (const model of models) {
        let refreshed = false;
        const hfRef = parseHfUri(model);
        const filename = model.split("/").pop();
        const entries = readdirSync(cacheDir, { withFileTypes: true });
        // The ETag sidecar contains the model filename too; it is bookkeeping,
        // not a cached model, so keep it out of the candidate list.
        const cached = filename
            ? entries
                .filter((entry) => entry.isFile() && entry.name.includes(filename) && !entry.name.endsWith(".etag"))
                .map((entry) => join(cacheDir, entry.name))
            : [];
        const etagPath = hfRef && filename ? join(cacheDir, `${filename}.etag`) : null;
        let remoteEtag = null;
        let purged = false;
        if (etagPath) {
            remoteEtag = await getRemoteEtag(hfRef);
            const localEtag = existsSync(etagPath)
                ? readFileSync(etagPath, "utf-8").trim()
                : null;
            // An unknown remote ETag is not evidence of staleness: never throw
            // away a working cache just because the HEAD request failed.
            const remoteChanged = remoteEtag !== null && remoteEtag !== localEtag;
            purged = Boolean(options.refresh) || remoteChanged || cached.length === 0;
            if (purged) {
                cached.forEach(removeIfPresent);
                removeIfPresent(etagPath);
                refreshed = cached.length > 0;
            }
        }
        else if (options.refresh && filename) {
            cached.forEach(removeIfPresent);
            refreshed = cached.length > 0;
        }
        const { resolveModelFile } = await loadNodeLlamaCpp();
        const path = await resolveModelFile(model, cacheDir);
        validateGgufFile(path, model);
        const sizeBytes = existsSync(path) ? statSync(path).size : 0;
        if (etagPath && purged) {
            // Record the ETag for the freshly resolved file; retry the lookup
            // if it was unavailable before the download.
            const etagToRecord = remoteEtag ?? await getRemoteEtag(hfRef);
            if (etagToRecord) {
                writeFileSync(etagPath, etagToRecord + "\n", "utf-8");
            }
        }
        results.push({ model, path, sizeBytes, refreshed });
    }
    return results;
}
302
+ /**
303
+ * LLM implementation using node-llama-cpp
304
+ */
305
+ // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
306
+ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
307
+ const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
308
/**
 * Parse the QMD_EMBED_PARALLELISM override.
 *
 * @param {string} [envValue] - raw value; defaults to the environment variable.
 * @returns {number|undefined} an integer clamped to at most 8, or undefined when
 *   the value is unset/blank or invalid (a warning goes to stderr if invalid).
 */
export function resolveParallelismOverride(envValue = process.env.QMD_EMBED_PARALLELISM) {
    const raw = envValue?.trim() ?? "";
    if (raw === "") {
        return undefined;
    }
    const requested = Number(raw);
    const isUsable = Number.isInteger(requested) && requested >= 1;
    if (!isUsable) {
        process.stderr.write(`QMD Warning: invalid QMD_EMBED_PARALLELISM="${envValue}", using automatic parallelism.\n`);
        return undefined;
    }
    return requested > 8 ? 8 : requested;
}
319
/**
 * Decide how many parallel contexts to use.
 *
 * A valid user override wins. Otherwise Windows + CUDA is serialized to one
 * context, because node-llama-cpp/llama.cpp CUDA on Windows is unstable with
 * multiple simultaneous contexts (ggml-cuda.cu:98 in #519); Vulkan and CPU do
 * not show that failure mode. Everything else uses the computed value, floored at 1.
 *
 * @param {{envValue?: string, platform?: string, gpu?: *, computed: number}} options
 * @returns {number} number of contexts to create.
 */
export function resolveSafeParallelism(options) {
    const forced = resolveParallelismOverride(options.envValue);
    if (forced !== undefined) {
        return forced;
    }
    const platform = options.platform ?? process.platform;
    const isWindowsCuda = platform === "win32" && options.gpu === "cuda";
    return isWindowsCuda ? 1 : Math.max(1, options.computed);
}
331
/**
 * Work out which GPU backend to request from node-llama-cpp.
 *
 * QMD_FORCE_CPU set to anything other than a "disabled" word forces CPU.
 * Otherwise QMD_LLAMA_GPU selects "metal", "vulkan" or "cuda", a "disabled"
 * word turns the GPU off, blank means "auto", and any other value falls back
 * to "auto" with a warning on stderr. Matching is trimmed and case-insensitive.
 *
 * @param {string} [envValue] - raw QMD_LLAMA_GPU value.
 * @param {string} [forceCpuValue] - raw QMD_FORCE_CPU value.
 * @returns {"auto"|"metal"|"vulkan"|"cuda"|false} requested mode; false means CPU.
 */
export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU, forceCpuValue = process.env.QMD_FORCE_CPU) {
    const disabledWords = new Set(["false", "off", "none", "disable", "disabled", "0"]);
    const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
    if (forceCpu !== "" && !disabledWords.has(forceCpu)) {
        return false;
    }
    const requested = envValue?.trim().toLowerCase() ?? "";
    switch (requested) {
        case "":
            return "auto";
        case "metal":
        case "vulkan":
        case "cuda":
            return requested;
        default:
            break;
    }
    if (disabledWords.has(requested)) {
        return false;
    }
    process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
    return "auto";
}
346
/**
 * Run a dispose callback but stop waiting after `timeoutMs`, so a hung native
 * teardown cannot block shutdown. Never throws: a timeout or a dispose failure
 * is reported as a warning on stderr.
 *
 * @param {string} resourceName - label used in warning messages.
 * @param {Function} dispose - callback performing the disposal (sync or async).
 * @param {number} [timeoutMs=1000] - how long to wait before giving up.
 * @returns {Promise<void>}
 */
async function disposeWithTimeout(resourceName, dispose, timeoutMs = 1000) {
    // Unique sentinel: a dispose() that happens to resolve to the string
    // "timeout" must not be mistaken for the timer firing.
    const TIMED_OUT = Symbol("timeout");
    let timer;
    const timeoutPromise = new Promise((resolve) => {
        timer = setTimeout(() => resolve(TIMED_OUT), timeoutMs);
        // Don't keep the process alive just for this timer.
        timer.unref();
    });
    try {
        const result = await Promise.race([dispose(), timeoutPromise]);
        if (result === TIMED_OUT) {
            process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
        }
    }
    catch (error) {
        process.stderr.write(`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`);
    }
    finally {
        // Release the timer once the race is settled instead of leaving it pending.
        clearTimeout(timer);
    }
}
360
/**
 * Determine the context size used for query expansion.
 *
 * An explicit config value must be a positive integer (otherwise this throws).
 * Without one, QMD_EXPAND_CONTEXT_SIZE is used when it is a positive integer;
 * an invalid environment value produces a warning on stderr and the default.
 *
 * The environment value is parsed with Number(), not parseInt(), so values with
 * trailing garbage or fractions ("4k", "2048abc", "4096.5") are rejected
 * instead of being silently truncated.
 *
 * @param {number} [configValue] - explicit context size from configuration.
 * @returns {number} the context size to use.
 * @throws {Error} when `configValue` is provided but is not a positive integer.
 */
function resolveExpandContextSize(configValue) {
    if (configValue !== undefined) {
        if (!Number.isInteger(configValue) || configValue <= 0) {
            throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
        }
        return configValue;
    }
    const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
    if (!envValue)
        return DEFAULT_EXPAND_CONTEXT_SIZE;
    const parsed = Number(envValue);
    if (!Number.isInteger(parsed) || parsed <= 0) {
        process.stderr.write(`QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`);
        return DEFAULT_EXPAND_CONTEXT_SIZE;
    }
    return parsed;
}
377
+ const failedGpuInitModes = new Set();
378
+ let noGpuAccelerationWarningShown = false;
379
+ let cpuForcedPrebuiltFallbackWarningShown = false;
380
/**
 * Whether the environment asks for CPU-only operation.
 *
 * @returns {boolean} true when resolveLlamaGpuMode() resolves to `false`.
 */
function isCpuModeRequested() {
    const gpuMode = resolveLlamaGpuMode();
    return gpuMode === false;
}
383
+ export class LlamaCpp {
384
+ _ciMode = !!process.env.CI;
385
+ llama = null;
386
+ embedModel = null;
387
+ embedContexts = [];
388
+ generateModel = null;
389
+ rerankModel = null;
390
+ rerankContexts = [];
391
+ embedModelUri;
392
+ generateModelUri;
393
+ rerankModelUri;
394
+ modelCacheDir;
395
+ expandContextSize;
396
+ // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
397
+ embedModelLoadPromise = null;
398
+ generateModelLoadPromise = null;
399
+ rerankModelLoadPromise = null;
400
+ // Guard against concurrent ensureLlama() calls creating duplicate Llama
401
+ // instances. Without this, two concurrent callers each build their own
402
+ // runtime and the last write to this.llama wins, leaving models/grammars
403
+ // bound to different Llama instances ("different Llama instance" errors).
404
+ llamaLoadPromise = null;
405
+ // Inactivity timer for auto-unloading models
406
+ inactivityTimer = null;
407
+ inactivityTimeoutMs;
408
+ disposeModelsOnInactivity;
409
+ // Track disposal state to prevent double-dispose
410
+ disposed = false;
411
+ constructor(config = {}) {
412
+ // STRUCTURAL INVARIANT: the launcher (bin/qmd) sets GGML_METAL_NO_RESIDENCY=1
413
+ // on darwin BEFORE the native binding loads, which prevents the libggml-metal
414
+ // static destructor assertion at process exit (ggml-org/llama.cpp#22593).
415
+ // See isDarwinMetalMitigationActive() for the runtime check exposed to
416
+ // diagnostics. No constructor-time guard installation is needed.
417
+ this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
418
+ this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
419
+ this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
420
+ this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
421
+ this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
422
+ this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
423
+ this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
424
+ }
425
+ get embedModelName() {
426
+ return this.embedModelUri;
427
+ }
428
+ get generateModelName() {
429
+ return this.generateModelUri;
430
+ }
431
+ get rerankModelName() {
432
+ return this.rerankModelUri;
433
+ }
434
+ /**
435
+ * Reset the inactivity timer. Called after each model operation.
436
+ * When timer fires, models are unloaded to free memory (if no active sessions).
437
+ */
438
+ touchActivity() {
439
+ // Clear existing timer
440
+ if (this.inactivityTimer) {
441
+ clearTimeout(this.inactivityTimer);
442
+ this.inactivityTimer = null;
443
+ }
444
+ // Only set timer if we have disposable contexts and timeout is enabled
445
+ if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
446
+ this.inactivityTimer = setTimeout(() => {
447
+ // Check if session manager allows unloading
448
+ // canUnloadLLM is defined later in this file - it checks the session manager
449
+ // We use dynamic import pattern to avoid circular dependency issues
450
+ if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
451
+ // Active sessions/operations - reschedule timer
452
+ this.touchActivity();
453
+ return;
454
+ }
455
+ this.unloadIdleResources().catch(err => {
456
+ console.error("Error unloading idle resources:", err);
457
+ });
458
+ }, this.inactivityTimeoutMs);
459
+ // Don't keep process alive just for this timer
460
+ this.inactivityTimer.unref();
461
+ }
462
+ }
463
+ /**
464
+ * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
465
+ */
466
+ hasLoadedContexts() {
467
+ return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
468
+ }
469
+ /**
470
+ * Unload idle resources but keep the instance alive for future use.
471
+ *
472
+ * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
473
+ * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
474
+ */
475
+ async unloadIdleResources() {
476
+ // Don't unload if already disposed
477
+ if (this.disposed) {
478
+ return;
479
+ }
480
+ // Clear timer
481
+ if (this.inactivityTimer) {
482
+ clearTimeout(this.inactivityTimer);
483
+ this.inactivityTimer = null;
484
+ }
485
+ // Dispose contexts first
486
+ for (const ctx of this.embedContexts) {
487
+ await ctx.dispose();
488
+ }
489
+ this.embedContexts = [];
490
+ for (const ctx of this.rerankContexts) {
491
+ await ctx.dispose();
492
+ }
493
+ this.rerankContexts = [];
494
+ // Optionally dispose models too (opt-in)
495
+ if (this.disposeModelsOnInactivity) {
496
+ if (this.embedModel) {
497
+ await this.embedModel.dispose();
498
+ this.embedModel = null;
499
+ }
500
+ if (this.generateModel) {
501
+ await this.generateModel.dispose();
502
+ this.generateModel = null;
503
+ }
504
+ if (this.rerankModel) {
505
+ await this.rerankModel.dispose();
506
+ this.rerankModel = null;
507
+ }
508
+ // Reset load promises so models can be reloaded later
509
+ this.embedModelLoadPromise = null;
510
+ this.generateModelLoadPromise = null;
511
+ this.rerankModelLoadPromise = null;
512
+ }
513
+ // Note: We keep llama instance alive - it's lightweight
514
+ }
515
+ /**
516
+ * Ensure model cache directory exists
517
+ */
518
+ ensureModelCacheDir() {
519
+ if (!existsSync(this.modelCacheDir)) {
520
+ mkdirSync(this.modelCacheDir, { recursive: true });
521
+ }
522
+ }
523
+ /**
524
+ * Initialize the llama instance (lazy)
525
+ */
526
+ async ensureLlama(allowBuild = true) {
527
+ if (this.llama) {
528
+ return this.llama;
529
+ }
530
+ if (this.llamaLoadPromise) {
531
+ return await this.llamaLoadPromise;
532
+ }
533
+ this.llamaLoadPromise = this.loadLlamaRuntime(allowBuild);
534
+ try {
535
+ return await this.llamaLoadPromise;
536
+ }
537
+ finally {
538
+ this.llamaLoadPromise = null;
539
+ }
540
+ }
541
+ async loadLlamaRuntime(allowBuild = true) {
542
+ if (!this.llama) {
543
+ const gpuMode = resolveLlamaGpuMode();
544
+ const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
545
+ const loadLlama = async (gpu, sourceBuildAllowed = allowBuild, buildOverride) => await withNativeStdoutRedirectedToStderr(() => getLlama({
546
+ // Prefer packaged prebuilt bindings before compiling llama.cpp locally.
547
+ // node-llama-cpp documents gpu:"auto" as the best default: Metal on
548
+ // Apple Silicon, CUDA when fully available, Vulkan where available,
549
+ // then CPU. Use build:"auto" for normal loads and build:"never" for
550
+ // diagnostic/probe paths that must not compile llama.cpp.
551
+ build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
552
+ logLevel: LlamaLogLevel.error,
553
+ gpu,
554
+ progressLogs: false,
555
+ skipDownload: !sourceBuildAllowed,
556
+ }));
557
+ const loadCpuCompatibleLlama = async () => {
558
+ try {
559
+ return await loadLlama(false, false);
560
+ }
561
+ catch (err) {
562
+ // Some platforms, notably Apple Silicon, ship a Metal prebuilt but no
563
+ // CPU-only prebuilt. Do a fast no-build lookup for an actual CPU
564
+ // binding first; if it does not exist, use the packaged auto/Metal
565
+ // binding and disable model offloading via gpuLayers: 0.
566
+ if (!cpuForcedPrebuiltFallbackWarningShown) {
567
+ cpuForcedPrebuiltFallbackWarningShown = true;
568
+ process.stderr.write(`QMD Warning: CPU-only llama.cpp prebuilt not available (${err instanceof Error ? err.message : String(err)}); using packaged backend with GPU offloading disabled.\n`);
569
+ }
570
+ return await loadLlama("auto", false);
571
+ }
572
+ };
573
+ let llama;
574
+ if (gpuMode === false) {
575
+ llama = await loadCpuCompatibleLlama();
576
+ }
577
+ else if (failedGpuInitModes.has(gpuMode)) {
578
+ process.stderr.write(`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`);
579
+ llama = await loadCpuCompatibleLlama();
580
+ }
581
+ else {
582
+ try {
583
+ llama = await loadLlama(gpuMode);
584
+ // If node-llama-cpp auto-detection chose CPU, do one no-build pass
585
+ // over all OS-valid packaged GPU backends. This preserves the
586
+ // documented auto mode for Metal/CUDA/Vulkan while recovering on
587
+ // systems where a packaged backend can load but detection is too
588
+ // conservative. Never compile during these extra probes.
589
+ if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
590
+ const candidates = (await getLlamaGpuTypes("allValid"))
591
+ .filter((candidate) => candidate !== false && candidate !== "auto");
592
+ for (const candidate of candidates) {
593
+ if (failedGpuInitModes.has(candidate))
594
+ continue;
595
+ try {
596
+ const gpuLlama = await loadLlama(candidate, false, "never");
597
+ if (gpuLlama.gpu !== false) {
598
+ await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
599
+ llama = gpuLlama;
600
+ break;
601
+ }
602
+ await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
603
+ }
604
+ catch {
605
+ failedGpuInitModes.add(candidate);
606
+ }
607
+ }
608
+ }
609
+ }
610
+ catch (err) {
611
+ // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
612
+ // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
613
+ // expensive native build/probe attempts in this process.
614
+ failedGpuInitModes.add(gpuMode);
615
+ process.stderr.write(`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
616
+ llama = await loadCpuCompatibleLlama();
617
+ }
618
+ }
619
+ if (llama.gpu === false && !noGpuAccelerationWarningShown) {
620
+ noGpuAccelerationWarningShown = true;
621
+ process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n");
622
+ }
623
+ this.llama = llama;
624
+ }
625
+ return this.llama;
626
+ }
627
+ isCpuOffloadForced() {
628
+ return isCpuModeRequested();
629
+ }
630
+ modelLoadOptions(modelPath) {
631
+ return {
632
+ modelPath,
633
+ ...(this.isCpuOffloadForced() ? { gpuLayers: 0 } : {}),
634
+ };
635
+ }
636
+ /**
637
+ * Resolve a model URI to a local path, downloading if needed.
638
+ * Validates the downloaded file is actually a GGUF model (not an HTML error page
639
+ * from a proxy or firewall).
640
+ */
641
+ async resolveModel(modelUri) {
642
+ this.ensureModelCacheDir();
643
+ // resolveModelFile handles HF URIs and downloads to the cache dir
644
+ const { resolveModelFile } = await loadNodeLlamaCpp();
645
+ const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
646
+ validateGgufFile(modelPath, modelUri);
647
+ return modelPath;
648
+ }
649
+ /**
650
+ * Load embedding model (lazy)
651
+ */
652
+ async ensureEmbedModel() {
653
+ if (this.embedModel) {
654
+ return this.embedModel;
655
+ }
656
+ if (this.embedModelLoadPromise) {
657
+ return await this.embedModelLoadPromise;
658
+ }
659
+ this.embedModelLoadPromise = (async () => {
660
+ const llama = await this.ensureLlama();
661
+ const modelPath = await this.resolveModel(this.embedModelUri);
662
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
663
+ this.embedModel = model;
664
+ // Model loading counts as activity - ping to keep alive
665
+ this.touchActivity();
666
+ return model;
667
+ })();
668
+ try {
669
+ return await this.embedModelLoadPromise;
670
+ }
671
+ finally {
672
+ // Keep the resolved model cached; clear only the in-flight promise.
673
+ this.embedModelLoadPromise = null;
674
+ }
675
+ }
676
+ /**
677
+ * Compute how many parallel contexts to create.
678
+ *
679
+ * GPU: constrained by VRAM (25% of free, capped at 8).
680
+ * CPU: constrained by cores. Splitting threads across contexts enables
681
+ * true parallelism (each context runs on its own cores). Use at most
682
+ * half the math cores, with at least 4 threads per context.
683
+ */
684
+ async computeParallelism(perContextMB) {
685
+ const llama = await this.ensureLlama();
686
+ if (!this.isCpuOffloadForced() && llama.gpu) {
687
+ try {
688
+ const vram = await llama.getVramState();
689
+ const freeMB = vram.free / (1024 * 1024);
690
+ const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
691
+ const computed = Math.max(1, Math.min(8, maxByVram));
692
+ return resolveSafeParallelism({ gpu: llama.gpu, computed });
693
+ }
694
+ catch {
695
+ return resolveSafeParallelism({ gpu: llama.gpu, computed: 2 });
696
+ }
697
+ }
698
+ // CPU: split cores across contexts. At least 4 threads per context.
699
+ const cores = llama.cpuMathCores || 4;
700
+ const maxContexts = Math.floor(cores / 4);
701
+ const computed = Math.max(1, Math.min(4, maxContexts));
702
+ return resolveSafeParallelism({ gpu: false, computed });
703
+ }
704
+ /**
705
+ * Get the number of threads each context should use, given N parallel contexts.
706
+ * Splits available math cores evenly across contexts.
707
+ */
708
+ async threadsPerContext(parallelism) {
709
+ const llama = await this.ensureLlama();
710
+ if (!this.isCpuOffloadForced() && llama.gpu)
711
+ return 0; // GPU: let the library decide
712
+ const cores = llama.cpuMathCores || 4;
713
+ return Math.max(1, Math.floor(cores / parallelism));
714
+ }
715
+ /**
716
+ * Load embedding contexts (lazy). Creates multiple for parallel embedding.
717
+ * Uses promise guard to prevent concurrent context creation race condition.
718
+ */
719
+ embedContextsCreatePromise = null;
720
+ async ensureEmbedContexts() {
721
+ if (this.embedContexts.length > 0) {
722
+ this.touchActivity();
723
+ return this.embedContexts;
724
+ }
725
+ if (this.embedContextsCreatePromise) {
726
+ return await this.embedContextsCreatePromise;
727
+ }
728
+ this.embedContextsCreatePromise = (async () => {
729
+ const model = await this.ensureEmbedModel();
730
+ // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
731
+ const n = await this.computeParallelism(150);
732
+ const threads = await this.threadsPerContext(n);
733
+ for (let i = 0; i < n; i++) {
734
+ try {
735
+ this.embedContexts.push(await model.createEmbeddingContext({
736
+ contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
737
+ ...(threads > 0 ? { threads } : {}),
738
+ }));
739
+ }
740
+ catch {
741
+ if (this.embedContexts.length === 0)
742
+ throw new Error("Failed to create any embedding context");
743
+ break;
744
+ }
745
+ }
746
+ this.touchActivity();
747
+ return this.embedContexts;
748
+ })();
749
+ try {
750
+ return await this.embedContextsCreatePromise;
751
+ }
752
+ finally {
753
+ this.embedContextsCreatePromise = null;
754
+ }
755
+ }
756
+ /**
757
+ * Get a single embed context (for single-embed calls). Uses first from pool.
758
+ */
759
+ async ensureEmbedContext() {
760
+ const contexts = await this.ensureEmbedContexts();
761
+ return contexts[0];
762
+ }
763
+ /**
764
+ * Load generation model (lazy) - context is created fresh per call
765
+ */
766
+ async ensureGenerateModel() {
767
+ if (!this.generateModel) {
768
+ if (this.generateModelLoadPromise) {
769
+ return await this.generateModelLoadPromise;
770
+ }
771
+ this.generateModelLoadPromise = (async () => {
772
+ const llama = await this.ensureLlama();
773
+ const modelPath = await this.resolveModel(this.generateModelUri);
774
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
775
+ this.generateModel = model;
776
+ return model;
777
+ })();
778
+ try {
779
+ await this.generateModelLoadPromise;
780
+ }
781
+ finally {
782
+ this.generateModelLoadPromise = null;
783
+ }
784
+ }
785
+ this.touchActivity();
786
+ if (!this.generateModel) {
787
+ throw new Error("Generate model not loaded");
788
+ }
789
+ return this.generateModel;
790
+ }
791
+ /**
792
+ * Load rerank model (lazy)
793
+ */
794
+ async ensureRerankModel() {
795
+ if (this.rerankModel) {
796
+ return this.rerankModel;
797
+ }
798
+ if (this.rerankModelLoadPromise) {
799
+ return await this.rerankModelLoadPromise;
800
+ }
801
+ this.rerankModelLoadPromise = (async () => {
802
+ const llama = await this.ensureLlama();
803
+ const modelPath = await this.resolveModel(this.rerankModelUri);
804
+ const model = await llama.loadModel(this.modelLoadOptions(modelPath));
805
+ this.rerankModel = model;
806
+ // Model loading counts as activity - ping to keep alive
807
+ this.touchActivity();
808
+ return model;
809
+ })();
810
+ try {
811
+ return await this.rerankModelLoadPromise;
812
+ }
813
+ finally {
814
+ this.rerankModelLoadPromise = null;
815
+ }
816
+ }
817
+ /**
818
+ * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
819
+ * Each context has its own sequence, so they can evaluate independently.
820
+ *
821
+ * Tuning choices:
822
+ * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
823
+ * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
824
+ * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
825
+ */
826
+ // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
827
+ // Default 2048 was too small for longer documents (e.g. session transcripts,
828
+ // CJK text, or large markdown files) — callers hit "input lengths exceed
829
+ // context size" errors even after truncation because the overhead estimate
830
+ // was insufficient. 4096 comfortably fits the largest real-world chunks
831
+ // while staying well below the 40 960-token auto size.
832
+ // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
833
+ static RERANK_CONTEXT_SIZE = (() => {
834
+ const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
835
+ return Number.isFinite(v) && v > 0 ? v : 4096;
836
+ })();
837
+ static EMBED_CONTEXT_SIZE = (() => {
838
+ const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
839
+ return Number.isFinite(v) && v > 0 ? v : 2048;
840
+ })();
841
+ async ensureRerankContexts() {
842
+ if (this.rerankContexts.length === 0) {
843
+ const model = await this.ensureRerankModel();
844
+ // ~960 MB per context with flash attention at contextSize 2048
845
+ const n = Math.min(await this.computeParallelism(1000), 4);
846
+ const threads = await this.threadsPerContext(n);
847
+ for (let i = 0; i < n; i++) {
848
+ try {
849
+ this.rerankContexts.push(await model.createRankingContext({
850
+ contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
851
+ ...(threads > 0 ? { threads } : {}),
852
+ }));
853
+ }
854
+ catch {
855
+ if (this.rerankContexts.length === 0) {
856
+ // Flash attention might not be supported — retry without it
857
+ try {
858
+ this.rerankContexts.push(await model.createRankingContext({
859
+ contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
860
+ ...(threads > 0 ? { threads } : {}),
861
+ }));
862
+ }
863
+ catch {
864
+ throw new Error("Failed to create any rerank context");
865
+ }
866
+ }
867
+ break;
868
+ }
869
+ }
870
+ }
871
+ this.touchActivity();
872
+ return this.rerankContexts;
873
+ }
874
+ // ==========================================================================
875
+ // Tokenization
876
+ // ==========================================================================
877
+ /**
878
+ * Tokenize text using the embedding model's tokenizer
879
+ * Returns tokenizer tokens (opaque type from node-llama-cpp)
880
+ */
881
+ async tokenize(text) {
882
+ await this.ensureEmbedContext(); // Ensure model is loaded
883
+ if (!this.embedModel) {
884
+ throw new Error("Embed model not loaded");
885
+ }
886
+ return this.embedModel.tokenize(text);
887
+ }
888
+ /**
889
+ * Count tokens in text using the embedding model's tokenizer
890
+ */
891
+ async countTokens(text) {
892
+ const tokens = await this.tokenize(text);
893
+ return tokens.length;
894
+ }
895
+ /**
896
+ * Detokenize token IDs back to text
897
+ */
898
+ async detokenize(tokens) {
899
+ await this.ensureEmbedContext();
900
+ if (!this.embedModel) {
901
+ throw new Error("Embed model not loaded");
902
+ }
903
+ return this.embedModel.detokenize(tokens);
904
+ }
905
+ // ==========================================================================
906
+ // Core API methods
907
+ // ==========================================================================
908
+ /**
909
+ * Truncate text to fit within the embedding model's context window.
910
+ * Uses the model's own tokenizer for accurate token counting, then
911
+ * detokenizes back to text if truncation is needed.
912
+ * Returns the (possibly truncated) text and whether truncation occurred.
913
+ */
914
+ resolveEmbedTokenLimit() {
915
+ const trainedContextSize = this.embedModel?.trainContextSize;
916
+ if (typeof trainedContextSize === "number" && Number.isFinite(trainedContextSize) && trainedContextSize > 0) {
917
+ return Math.max(1, Math.min(LlamaCpp.EMBED_CONTEXT_SIZE, trainedContextSize));
918
+ }
919
+ return LlamaCpp.EMBED_CONTEXT_SIZE;
920
+ }
921
+ async truncateToContextSize(text) {
922
+ if (!this.embedModel)
923
+ return { text, truncated: false, limit: LlamaCpp.EMBED_CONTEXT_SIZE };
924
+ const maxTokens = this.resolveEmbedTokenLimit();
925
+ if (maxTokens <= 0)
926
+ return { text, truncated: false, limit: maxTokens };
927
+ const tokens = this.embedModel.tokenize(text);
928
+ if (tokens.length <= maxTokens)
929
+ return { text, truncated: false, limit: maxTokens };
930
+ // Leave a small margin (4 tokens) for BOS/EOS overhead
931
+ const safeLimit = Math.max(1, maxTokens - 4);
932
+ const truncatedTokens = tokens.slice(0, safeLimit);
933
+ const truncatedText = this.embedModel.detokenize(truncatedTokens);
934
+ return { text: truncatedText, truncated: true, limit: maxTokens };
935
+ }
936
+ async embed(text, options = {}) {
937
+ // Ping activity at start to keep models alive during this operation
938
+ this.touchActivity();
939
+ try {
940
+ const context = await this.ensureEmbedContext();
941
+ // Guard: truncate text that exceeds model context window to prevent GGML crash
942
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
943
+ if (truncated) {
944
+ console.warn(`⚠ Text truncated to fit embedding context (${limit} tokens)`);
945
+ }
946
+ const embedding = await context.getEmbeddingFor(safeText);
947
+ return {
948
+ embedding: Array.from(embedding.vector),
949
+ model: options.model ?? this.embedModelUri,
950
+ };
951
+ }
952
+ catch (error) {
953
+ console.error("Embedding error:", error);
954
+ return null;
955
+ }
956
+ }
957
+ /**
958
+ * Batch embed multiple texts efficiently
959
+ * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
960
+ */
961
+ async embedBatch(texts, options = {}) {
962
+ if (this._ciMode)
963
+ throw new Error("LLM operations are disabled in CI (set CI=true)");
964
+ // Ping activity at start to keep models alive during this operation
965
+ this.touchActivity();
966
+ if (texts.length === 0)
967
+ return [];
968
+ try {
969
+ const contexts = await this.ensureEmbedContexts();
970
+ const n = contexts.length;
971
+ if (n === 1) {
972
+ // Single context: sequential (no point splitting)
973
+ const context = contexts[0];
974
+ const embeddings = [];
975
+ for (const text of texts) {
976
+ try {
977
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
978
+ if (truncated) {
979
+ console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
980
+ }
981
+ const embedding = await context.getEmbeddingFor(safeText);
982
+ this.touchActivity();
983
+ embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
984
+ }
985
+ catch (err) {
986
+ console.error("Embedding error for text:", err);
987
+ embeddings.push(null);
988
+ }
989
+ }
990
+ return embeddings;
991
+ }
992
+ // Multiple contexts: split texts across contexts for parallel evaluation
993
+ const chunkSize = Math.ceil(texts.length / n);
994
+ const chunks = Array.from({ length: n }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize));
995
+ const chunkResults = await Promise.all(chunks.map(async (chunk, i) => {
996
+ const ctx = contexts[i];
997
+ const results = [];
998
+ for (const text of chunk) {
999
+ try {
1000
+ const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
1001
+ if (truncated) {
1002
+ console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
1003
+ }
1004
+ const embedding = await ctx.getEmbeddingFor(safeText);
1005
+ this.touchActivity();
1006
+ results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
1007
+ }
1008
+ catch (err) {
1009
+ console.error("Embedding error for text:", err);
1010
+ results.push(null);
1011
+ }
1012
+ }
1013
+ return results;
1014
+ }));
1015
+ return chunkResults.flat();
1016
+ }
1017
+ catch (error) {
1018
+ console.error("Batch embedding error:", error);
1019
+ return texts.map(() => null);
1020
+ }
1021
+ }
1022
+ async generate(prompt, options = {}) {
1023
+ if (this._ciMode)
1024
+ throw new Error("LLM operations are disabled in CI (set CI=true)");
1025
+ // Ping activity at start to keep models alive during this operation
1026
+ this.touchActivity();
1027
+ // Ensure model is loaded
1028
+ await this.ensureGenerateModel();
1029
+ // Create fresh context -> sequence -> session for each call
1030
+ const context = await this.generateModel.createContext();
1031
+ const sequence = context.getSequence();
1032
+ const { LlamaChatSession } = await loadNodeLlamaCpp();
1033
+ const session = new LlamaChatSession({ contextSequence: sequence });
1034
+ const maxTokens = options.maxTokens ?? 150;
1035
+ // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
1036
+ // DO NOT use greedy decoding (temp=0) - causes repetition loops
1037
+ const temperature = options.temperature ?? 0.7;
1038
+ let result = "";
1039
+ try {
1040
+ await session.prompt(prompt, {
1041
+ maxTokens,
1042
+ temperature,
1043
+ topK: 20,
1044
+ topP: 0.8,
1045
+ onTextChunk: (text) => {
1046
+ result += text;
1047
+ },
1048
+ });
1049
+ return {
1050
+ text: result,
1051
+ model: this.generateModelUri,
1052
+ done: true,
1053
+ };
1054
+ }
1055
+ finally {
1056
+ // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
1057
+ await context.dispose();
1058
+ }
1059
+ }
1060
+ async modelExists(modelUri) {
1061
+ // For HuggingFace URIs, we assume they exist
1062
+ // For local paths, check if file exists
1063
+ if (modelUri.startsWith("hf:")) {
1064
+ return { name: modelUri, exists: true };
1065
+ }
1066
+ const exists = existsSync(modelUri);
1067
+ return {
1068
+ name: modelUri,
1069
+ exists,
1070
+ path: exists ? modelUri : undefined,
1071
+ };
1072
+ }
1073
+ // ==========================================================================
1074
+ // High-level abstractions
1075
+ // ==========================================================================
1076
+ async expandQuery(query, options = {}) {
1077
+ if (this._ciMode)
1078
+ throw new Error("LLM operations are disabled in CI (set CI=true)");
1079
+ // Ping activity at start to keep models alive during this operation
1080
+ this.touchActivity();
1081
+ const llama = await this.ensureLlama();
1082
+ await this.ensureGenerateModel();
1083
+ const includeLexical = options.includeLexical ?? true;
1084
+ const context = options.context;
1085
+ const intent = options.intent;
1086
+ const prompt = intent
1087
+ ? `/no_think Expand this search query: ${query}\nQuery intent: ${intent}`
1088
+ : `/no_think Expand this search query: ${query}`;
1089
+ // Set up inside the try so any failure (grammar creation, context
1090
+ // allocation/VRAM, session prompt) falls back to the original query
1091
+ // instead of propagating and failing the caller's operation.
1092
+ let genContext;
1093
+ try {
1094
+ const grammar = await llama.createGrammar({
1095
+ grammar: `
1096
+ root ::= line+
1097
+ line ::= type ": " content "\\n"
1098
+ type ::= "lex" | "vec" | "hyde"
1099
+ content ::= [^\\n]+
1100
+ `
1101
+ });
1102
+ // Create a bounded context for expansion to prevent large default VRAM allocations.
1103
+ genContext = await this.generateModel.createContext({
1104
+ contextSize: this.expandContextSize,
1105
+ });
1106
+ const sequence = genContext.getSequence();
1107
+ const { LlamaChatSession } = await loadNodeLlamaCpp();
1108
+ const session = new LlamaChatSession({ contextSequence: sequence });
1109
+ // Qwen3 recommended settings for non-thinking mode:
1110
+ // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
1111
+ // DO NOT use greedy decoding (temp=0) - causes infinite loops
1112
+ const result = await session.prompt(prompt, {
1113
+ grammar,
1114
+ maxTokens: 600,
1115
+ temperature: 0.7,
1116
+ topK: 20,
1117
+ topP: 0.8,
1118
+ repeatPenalty: {
1119
+ lastTokens: 64,
1120
+ presencePenalty: 0.5,
1121
+ },
1122
+ });
1123
+ const lines = result.trim().split("\n");
1124
+ const queryLower = query.toLowerCase();
1125
+ const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);
1126
+ const hasQueryTerm = (text) => {
1127
+ const lower = text.toLowerCase();
1128
+ if (queryTerms.length === 0)
1129
+ return true;
1130
+ return queryTerms.some(term => lower.includes(term));
1131
+ };
1132
+ const queryables = lines.map(line => {
1133
+ const colonIdx = line.indexOf(":");
1134
+ if (colonIdx === -1)
1135
+ return null;
1136
+ const type = line.slice(0, colonIdx).trim();
1137
+ if (type !== 'lex' && type !== 'vec' && type !== 'hyde')
1138
+ return null;
1139
+ const text = line.slice(colonIdx + 1).trim();
1140
+ if (!hasQueryTerm(text))
1141
+ return null;
1142
+ return { type: type, text };
1143
+ }).filter((q) => q !== null);
1144
+ // Filter out lex entries if not requested
1145
+ const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
1146
+ if (filtered.length > 0)
1147
+ return filtered;
1148
+ const fallback = [
1149
+ { type: 'hyde', text: `Information about ${query}` },
1150
+ { type: 'lex', text: query },
1151
+ { type: 'vec', text: query },
1152
+ ];
1153
+ return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
1154
+ }
1155
+ catch (error) {
1156
+ console.error("Structured query expansion failed:", error);
1157
+ // Fallback to original query
1158
+ const fallback = [{ type: 'vec', text: query }];
1159
+ if (includeLexical)
1160
+ fallback.unshift({ type: 'lex', text: query });
1161
+ return fallback;
1162
+ }
1163
+ finally {
1164
+ if (genContext)
1165
+ await genContext.dispose();
1166
+ }
1167
+ }
1168
+ // Qwen3 reranker chat template overhead (system prompt, tags, separators).
1169
+ // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
1170
+ // the truncation budget never lets a document slip past the context limit.
1171
+ static RERANK_TEMPLATE_OVERHEAD = 512;
1172
+ static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
1173
+ async rerank(query, documents, options = {}) {
1174
+ if (this._ciMode)
1175
+ throw new Error("LLM operations are disabled in CI (set CI=true)");
1176
+ // Ping activity at start to keep models alive during this operation
1177
+ this.touchActivity();
1178
+ const contexts = await this.ensureRerankContexts();
1179
+ const model = await this.ensureRerankModel();
1180
+ // Truncate documents that would exceed the rerank context size.
1181
+ // Budget = contextSize - template overhead - query tokens
1182
+ const queryTokens = model.tokenize(query).length;
1183
+ const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
1184
+ const truncationCache = new Map();
1185
+ const truncatedDocs = documents.map((doc) => {
1186
+ const cached = truncationCache.get(doc.text);
1187
+ if (cached !== undefined) {
1188
+ return cached === doc.text ? doc : { ...doc, text: cached };
1189
+ }
1190
+ const tokens = model.tokenize(doc.text);
1191
+ const truncatedText = tokens.length <= maxDocTokens
1192
+ ? doc.text
1193
+ : model.detokenize(tokens.slice(0, maxDocTokens));
1194
+ truncationCache.set(doc.text, truncatedText);
1195
+ if (truncatedText === doc.text)
1196
+ return doc;
1197
+ return { ...doc, text: truncatedText };
1198
+ });
1199
+ // Deduplicate identical effective texts before scoring.
1200
+ // This avoids redundant work for repeated chunks and fixes collisions where
1201
+ // multiple docs map to the same chunk text.
1202
+ const textToDocs = new Map();
1203
+ truncatedDocs.forEach((doc, index) => {
1204
+ const existing = textToDocs.get(doc.text);
1205
+ if (existing) {
1206
+ existing.push({ file: doc.file, index });
1207
+ }
1208
+ else {
1209
+ textToDocs.set(doc.text, [{ file: doc.file, index }]);
1210
+ }
1211
+ });
1212
+ // Extract just the text for ranking
1213
+ const texts = Array.from(textToDocs.keys());
1214
+ // Split documents across contexts for parallel evaluation.
1215
+ // Each context has its own sequence with a lock, so parallelism comes
1216
+ // from multiple contexts evaluating different chunks simultaneously.
1217
+ const activeContextCount = Math.max(1, Math.min(contexts.length, Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)));
1218
+ const activeContexts = contexts.slice(0, activeContextCount);
1219
+ const chunkSize = Math.ceil(texts.length / activeContexts.length);
1220
+ const chunks = Array.from({ length: activeContexts.length }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize)).filter(chunk => chunk.length > 0);
1221
+ const allScores = await Promise.all(chunks.map((chunk, i) => activeContexts[i].rankAll(query, chunk)));
1222
+ // Reassemble scores in original order and sort
1223
+ const flatScores = allScores.flat();
1224
+ const ranked = texts
1225
+ .map((text, i) => ({ document: text, score: flatScores[i] }))
1226
+ .sort((a, b) => b.score - a.score);
1227
+ // Map back to our result format.
1228
+ const results = [];
1229
+ for (const item of ranked) {
1230
+ const docInfos = textToDocs.get(item.document) ?? [];
1231
+ for (const docInfo of docInfos) {
1232
+ results.push({
1233
+ file: docInfo.file,
1234
+ score: item.score,
1235
+ index: docInfo.index,
1236
+ });
1237
+ }
1238
+ }
1239
+ return {
1240
+ results,
1241
+ model: this.rerankModelUri,
1242
+ };
1243
+ }
1244
+ /**
1245
+ * Get device/GPU info for status display.
1246
+ * Initializes llama if not already done.
1247
+ */
1248
+ async getDeviceInfo(options = {}) {
1249
+ const llama = await this.ensureLlama(options.allowBuild ?? true);
1250
+ const cpuForced = this.isCpuOffloadForced();
1251
+ const gpuDevices = cpuForced ? [] : await llama.getGpuDeviceNames();
1252
+ let vram;
1253
+ if (!cpuForced && llama.gpu) {
1254
+ try {
1255
+ const state = await llama.getVramState();
1256
+ vram = { total: state.total, used: state.used, free: state.free };
1257
+ }
1258
+ catch { /* no vram info */ }
1259
+ }
1260
+ return {
1261
+ gpu: cpuForced ? false : llama.gpu,
1262
+ gpuOffloading: !cpuForced && llama.supportsGpuOffloading,
1263
+ gpuDevices,
1264
+ vram,
1265
+ cpuCores: llama.cpuMathCores,
1266
+ };
1267
+ }
1268
+ async dispose() {
1269
+ // Prevent double-dispose
1270
+ if (this.disposed) {
1271
+ return;
1272
+ }
1273
+ this.disposed = true;
1274
+ // Clear inactivity timer
1275
+ if (this.inactivityTimer) {
1276
+ clearTimeout(this.inactivityTimer);
1277
+ this.inactivityTimer = null;
1278
+ }
1279
+ // Explicitly dispose in dependency order: contexts first, then models, then llama.
1280
+ // Relying only on llama.dispose() leaves Metal resource sets alive until process
1281
+ // finalization on Apple Silicon, where ggml_metal_device_free can abort after
1282
+ // otherwise-successful CLI output (#368).
1283
+ for (const ctx of this.embedContexts) {
1284
+ await disposeWithTimeout("embedding context", () => ctx.dispose());
1285
+ }
1286
+ this.embedContexts = [];
1287
+ for (const ctx of this.rerankContexts) {
1288
+ await disposeWithTimeout("rerank context", () => ctx.dispose());
1289
+ }
1290
+ this.rerankContexts = [];
1291
+ if (this.embedModel) {
1292
+ await disposeWithTimeout("embedding model", () => this.embedModel.dispose());
1293
+ this.embedModel = null;
1294
+ }
1295
+ if (this.generateModel) {
1296
+ await disposeWithTimeout("generation model", () => this.generateModel.dispose());
1297
+ this.generateModel = null;
1298
+ }
1299
+ if (this.rerankModel) {
1300
+ await disposeWithTimeout("rerank model", () => this.rerankModel.dispose());
1301
+ this.rerankModel = null;
1302
+ }
1303
+ if (this.llama) {
1304
+ await disposeWithTimeout("llama runtime", () => this.llama.dispose());
1305
+ this.llama = null;
1306
+ }
1307
+ // Clear any in-flight load/create promises
1308
+ this.embedModelLoadPromise = null;
1309
+ this.embedContextsCreatePromise = null;
1310
+ this.generateModelLoadPromise = null;
1311
+ this.rerankModelLoadPromise = null;
1312
+ this.llamaLoadPromise = null;
1313
+ }
1314
+ }
1315
+ // =============================================================================
1316
+ // Session Management Layer
1317
+ // =============================================================================
1318
/**
 * Reference-counting bookkeeper for LLM sessions.
 *
 * Tracks how many sessions are open and how many operations are currently
 * running against the wrapped LlamaCpp instance, so the idle-unload logic
 * can ask whether unloading is safe.
 */
class LLMSessionManager {
    llm;
    _activeSessionCount = 0;
    _inFlightOperations = 0;
    /**
     * @param llm The LlamaCpp instance this manager is bound to.
     */
    constructor(llm) {
        this.llm = llm;
    }
    /** Number of sessions that have been acquired and not yet released. */
    get activeSessionCount() {
        return this._activeSessionCount;
    }
    /** Number of operations that have started and not yet ended. */
    get inFlightOperations() {
        return this._inFlightOperations;
    }
    /**
     * True only when there are no open sessions and no running operations.
     */
    canUnload() {
        const busy = this._activeSessionCount !== 0 || this._inFlightOperations !== 0;
        return !busy;
    }
    /** Register one more open session. */
    acquire() {
        this._activeSessionCount += 1;
    }
    /** Unregister one session; the counter never drops below zero. */
    release() {
        const next = this._activeSessionCount - 1;
        this._activeSessionCount = Math.max(0, next);
    }
    /** Register the start of an operation. */
    operationStart() {
        this._inFlightOperations += 1;
    }
    /** Register the end of an operation; the counter never drops below zero. */
    operationEnd() {
        const next = this._inFlightOperations - 1;
        this._inFlightOperations = Math.max(0, next);
    }
    /** The LlamaCpp instance passed to the constructor. */
    getLlamaCpp() {
        return this.llm;
    }
}
1358
/**
 * Raised when an operation is attempted on a session that has already been
 * released or aborted.
 */
export class SessionReleasedError extends Error {
    name = "SessionReleasedError";
    /**
     * @param message Error message; defaults to a generic released/aborted
     *        notice.
     */
    constructor(message = "LLM session has been released or aborted") {
        super(message);
    }
}
1367
/**
 * Scoped LLM session with automatic lifecycle management.
 *
 * Construction acquires a lease on the manager; release() gives it back.
 * Every wrapped call is counted as an in-flight operation on the manager
 * and is refused once the session has been released or aborted.
 */
class LLMSession {
    manager;
    released = false;
    abortController;
    maxDurationTimer = null;
    name;
    /**
     * @param manager Session manager providing acquire/release and operation
     *        counters, plus access to the LlamaCpp instance.
     * @param options Optional `{ name, signal, maxDuration }`. `signal` is an
     *        external AbortSignal whose abort is forwarded to this session.
     *        `maxDuration` is in milliseconds (default 10 minutes); a value
     *        of 0 or less disables the timer.
     */
    constructor(manager, options = {}) {
        this.manager = manager;
        this.name = options.name || "unnamed";
        this.abortController = new AbortController();
        // Link external abort signal if provided.
        if (options.signal) {
            if (options.signal.aborted) {
                this.abortController.abort(options.signal.reason);
            }
            else {
                // Tie the listener's lifetime to this session's own signal:
                // release() aborts that signal, which removes the listener
                // from the external signal. Without this, a long-lived
                // external signal would accumulate one listener per session.
                options.signal.addEventListener("abort", () => {
                    this.abortController.abort(options.signal.reason);
                }, { once: true, signal: this.abortController.signal });
            }
        }
        // Set up max duration timer.
        const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
        if (maxDuration > 0) {
            this.maxDurationTimer = setTimeout(() => {
                this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
            }, maxDuration);
            this.maxDurationTimer.unref(); // Don't keep process alive
        }
        // Acquire session lease.
        this.manager.acquire();
    }
    /** True while the session is neither released nor aborted. */
    get isValid() {
        return !this.released && !this.abortController.signal.aborted;
    }
    /** AbortSignal that fires when the session is aborted or released. */
    get signal() {
        return this.abortController.signal;
    }
    /**
     * Release the session and decrement the manager's session count.
     * Later calls do nothing. Called automatically by withLLMSession when
     * the callback completes.
     */
    release() {
        if (this.released)
            return;
        this.released = true;
        if (this.maxDurationTimer) {
            clearTimeout(this.maxDurationTimer);
            this.maxDurationTimer = null;
        }
        this.abortController.abort(new Error("Session released"));
        this.manager.release();
    }
    /**
     * Run `fn` as a tracked operation.
     *
     * @param fn Async function to run.
     * @returns Whatever `fn` resolves to.
     * @throws SessionReleasedError if the session is released or aborted
     *         before `fn` starts.
     */
    async withOperation(fn) {
        if (!this.isValid) {
            throw new SessionReleasedError();
        }
        this.manager.operationStart();
        try {
            // Check abort before starting.
            if (this.abortController.signal.aborted) {
                throw new SessionReleasedError(this.abortController.signal.reason?.message || "Session aborted");
            }
            return await fn();
        }
        finally {
            this.manager.operationEnd();
        }
    }
    /** Tracked wrapper around LlamaCpp.embed(). */
    async embed(text, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
    }
    /** Tracked wrapper around LlamaCpp.embedBatch(). */
    async embedBatch(texts, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
    }
    /** Tracked wrapper around LlamaCpp.expandQuery(). */
    async expandQuery(query, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
    }
    /** Tracked wrapper around LlamaCpp.rerank(). */
    async rerank(query, documents, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
    }
}
1456
// Session manager bound to the default LlamaCpp instance (created lazily).
let defaultSessionManager = null;
/**
 * Return the session manager for the default LlamaCpp instance.
 *
 * A new manager is created on first use, and again whenever the default
 * LlamaCpp instance is no longer the one the cached manager wraps.
 */
function getSessionManager() {
    const llm = getDefaultLlamaCpp();
    const cached = defaultSessionManager;
    const reusable = Boolean(cached) && cached.getLlamaCpp() === llm;
    if (!reusable) {
        defaultSessionManager = new LLMSessionManager(llm);
    }
    return defaultSessionManager;
}
1468
/**
 * Run `fn` inside a scoped LLM session on the default LlamaCpp instance.
 * The session is released when `fn` settles, whether it resolves or throws.
 *
 * @example
 * ```typescript
 * await withLLMSession(async (session) => {
 *   const expanded = await session.expandQuery(query);
 *   const embeddings = await session.embedBatch(texts);
 *   const reranked = await session.rerank(query, docs);
 *   return reranked;
 * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
 * ```
 *
 * @param fn Callback receiving the session.
 * @param options Session options (see LLMSession).
 * @returns Whatever `fn` resolves to.
 */
export async function withLLMSession(fn, options) {
    const session = new LLMSession(getSessionManager(), options);
    try {
        const outcome = await fn(session);
        return outcome;
    }
    finally {
        session.release();
    }
}
1492
/**
 * Run `fn` inside a scoped LLM session bound to the given LlamaCpp
 * instance. A fresh manager is created for the call, so the global default
 * manager is not involved. The session is released when `fn` settles.
 *
 * @param llm LlamaCpp instance the session should operate on.
 * @param fn Callback receiving the session.
 * @param options Session options (see LLMSession).
 * @returns Whatever `fn` resolves to.
 */
export async function withLLMSessionForLlm(llm, fn, options) {
    const session = new LLMSession(new LLMSessionManager(llm), options);
    try {
        const outcome = await fn(session);
        return outcome;
    }
    finally {
        session.release();
    }
}
1506
/**
 * Tell whether idle unload is currently safe.
 *
 * @returns true when no default session manager exists yet; otherwise the
 *          manager's own canUnload() answer.
 */
export function canUnloadLLM() {
    return defaultSessionManager ? defaultSessionManager.canUnload() : true;
}
1515
+ // =============================================================================
1516
+ // Darwin Metal exit-crash mitigation
1517
+ // =============================================================================
1518
+ //
1519
+ // libggml-metal on macOS keeps allocated model memory wired via "residency
1520
+ // sets" with a 180-second keep_alive timer (added in ggml-org/llama.cpp#11427).
1521
+ // The process-static `std::vector<std::unique_ptr<ggml_metal_device>>`
1522
+ // destructor fires during libc `exit()` → `__cxa_finalize_ranges` and asserts
1523
+ // `[rsets->data count] == 0` — but the keep_alive hasn't expired, so the
1524
+ // assertion fails and `ggml_abort` dumps a multi-kilobyte stack trace to
1525
+ // stderr after the user-visible output. See ggml-org/llama.cpp#22593.
1526
+ //
1527
+ // No JS-side dispose call (`llama.dispose()`, `model.dispose()`, etc.) can
1528
+ // prevent it: the static destructor runs after every JS-reachable cleanup,
1529
+ // and `process.reallyExit` on Node calls libc `exit()` not `_exit()` (it
1530
+ // does NOT skip C++ static destructors — verified in
1531
+ // node/src/api/environment.cc).
1532
+ //
1533
+ // The actual fix is to disable residency sets via `GGML_METAL_NO_RESIDENCY=1`,
1534
+ // which we set from `bin/qmd` before Node loads the native binding. For QMD's
1535
+ // short-lived CLI workflow this has no measurable cost (subsequent calls
1536
+ // don't reuse the warm mapping). The functions below report whether that
1537
+ // mitigation is in effect — kept here, in the module that depends on the
1538
+ // underlying resource, so doctor can answer "is the protection active?"
1539
+ // without reaching into env handling directly.
1540
+ //
1541
+ // Setting `QMD_METAL_KEEP_RESIDENCY=1` opts back into residency sets (with
1542
+ // the visible-noise consequences). The legacy `QMD_DISABLE_DARWIN_SAFE_EXIT`
1543
+ // env var is accepted as a no-op alias for back-compat; it had no effect on
1544
+ // Node prior to this fix.
1545
/**
 * Tell whether the darwin Metal exit-crash mitigation applies to this process.
 *
 * All three conditions must hold for the mitigation to count as active:
 *   - the process runs on macOS (`process.platform === "darwin"`),
 *   - the user has not opted back into residency sets with
 *     `QMD_METAL_KEEP_RESIDENCY=1`,
 *   - `GGML_METAL_NO_RESIDENCY=1` is present in the environment.
 *
 * @returns {boolean} `true` when residency sets are disabled on darwin;
 *          `false` on any other platform, when the opt-out variable is set,
 *          or when `GGML_METAL_NO_RESIDENCY` is not exactly `"1"`.
 */
export function isDarwinMetalMitigationActive() {
    const { platform, env } = process;
    const onDarwin = platform === "darwin";
    const keepsResidency = env.QMD_METAL_KEEP_RESIDENCY === "1";
    const residencyDisabled = env.GGML_METAL_NO_RESIDENCY === "1";
    return onDarwin && !keepsResidency && residencyDisabled;
}
1558
/**
 * Backward-compatibility stub for the old darwin exit guard.
 *
 * Earlier releases registered a `process.on('exit')` hook here that called
 * `process.reallyExit` in an attempt to bypass the C++ static destructor.
 * That never worked on Node (Environment::Exit still goes through libc
 * `exit()`), and the launcher now sets `GGML_METAL_NO_RESIDENCY=1` instead.
 * The function remains so existing callers keep working; it can be deleted
 * once no production launcher predates the residency-set fix.
 *
 * @returns {void} Always `undefined`; no hooks are installed.
 */
export function installDarwinExitGuard() {
    // Deliberately empty: use isDarwinMetalMitigationActive() to check whether
    // the real mitigation is in effect.
    return undefined;
}
1569
/**
 * Legacy name for the mitigation status check.
 *
 * @deprecated Replaced by isDarwinMetalMitigationActive.
 * @returns The result of `isDarwinMetalMitigationActive()`, unchanged.
 */
export function isDarwinExitGuardInstalled() {
    const mitigationActive = isDarwinMetalMitigationActive();
    return mitigationActive;
}
1573
+ // =============================================================================
1574
+ // Singleton for default LlamaCpp instance
1575
+ // =============================================================================
1576
// Module-level slot holding the shared default LlamaCpp instance.
// Starts as `null`; the accessor functions in this section create, replace,
// inspect, and dispose the instance stored here.
let defaultLlamaCpp = null;
1577
/**
 * Return the shared default LlamaCpp instance, creating it on first use.
 *
 * If the module-level slot already holds an instance it is returned as-is.
 * Otherwise a new `LlamaCpp` is constructed with no arguments, stored in the
 * slot, and returned, so later calls yield the same object until the slot
 * is replaced or cleared.
 *
 * @returns The current default LlamaCpp instance.
 */
export function getDefaultLlamaCpp() {
    if (defaultLlamaCpp) {
        return defaultLlamaCpp;
    }
    defaultLlamaCpp = new LlamaCpp();
    return defaultLlamaCpp;
}
1588
/**
 * Replace the default LlamaCpp instance (useful for testing).
 *
 * Passing `null` clears the default. A missing/`undefined` argument is
 * treated the same as `null`, so the slot only ever holds `null` or an
 * instance; this keeps `hasDefaultLlamaCpp()` (which compares against
 * `null`) in agreement with the truthiness checks used when getting or
 * disposing the default.
 *
 * For any non-null instance, `installDarwinExitGuard()` is called first so
 * test doubles that bypass the real constructor follow the same code path.
 *
 * @param llm - The instance to install as the default, or `null` to clear it.
 * @returns {void}
 */
export function setDefaultLlamaCpp(llm) {
    // Normalize at the write boundary: storing `undefined` would make the
    // `!== null` presence check report an instance that does not exist.
    const nextDefault = llm === undefined ? null : llm;
    if (nextDefault !== null) {
        installDarwinExitGuard();
    }
    defaultLlamaCpp = nextDefault;
}
1599
/**
 * Check for a default LlamaCpp instance without creating one.
 *
 * Unlike the getter, this never instantiates anything; it only inspects the
 * module-level slot. Intended for doctor and lifecycle diagnostics.
 *
 * @returns {boolean} `true` when the slot holds anything other than `null`.
 */
export function hasDefaultLlamaCpp() {
    const slotIsEmpty = defaultLlamaCpp === null;
    return !slotIsEmpty;
}
1606
/**
 * Tear down the default LlamaCpp instance, if there is one.
 *
 * Awaits the instance's `dispose()` and then resets the module-level slot to
 * `null`. Does nothing when no default instance is set. Call this before
 * process exit to prevent NAPI crashes.
 *
 * @returns {Promise<void>} Resolves once disposal has finished (or
 *          immediately when there is nothing to dispose).
 */
export async function disposeDefaultLlamaCpp() {
    if (!defaultLlamaCpp) {
        return;
    }
    await defaultLlamaCpp.dispose();
    // Clear the slot only after dispose() has completed successfully.
    defaultLlamaCpp = null;
}