@tobilu/qmd 2.1.0 → 2.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +88 -0
- package/README.md +3 -0
- package/bin/qmd +111 -32
- package/dist/ast.d.ts +1 -0
- package/dist/ast.js +18 -8
- package/dist/bench/bench.d.ts +2 -0
- package/dist/bench/bench.js +108 -13
- package/dist/bench/score.d.ts +11 -4
- package/dist/bench/score.js +34 -13
- package/dist/bench/types.d.ts +13 -0
- package/dist/cli/qmd.d.ts +26 -0
- package/dist/cli/qmd.js +1172 -121
- package/dist/collections.d.ts +9 -0
- package/dist/collections.js +32 -7
- package/dist/db.d.ts +6 -3
- package/dist/db.js +1 -1
- package/dist/index.d.ts +4 -0
- package/dist/index.js +5 -2
- package/dist/llm.d.ts +65 -3
- package/dist/llm.js +376 -63
- package/dist/mcp/server.d.ts +6 -3
- package/dist/mcp/server.js +41 -26
- package/dist/paths.d.ts +1 -0
- package/dist/paths.js +4 -0
- package/dist/store.d.ts +92 -17
- package/dist/store.js +676 -176
- package/package.json +23 -12
- package/scripts/build.mjs +29 -0
- package/scripts/check-package-grammars.mjs +29 -0
- package/scripts/package-smoke.mjs +65 -0
- package/scripts/test-all.mjs +27 -0
- package/skills/qmd/SKILL.md +203 -0
- package/skills/qmd/references/mcp-setup.md +102 -0
- package/skills/release/SKILL.md +139 -0
- package/skills/release/scripts/install-hooks.sh +38 -0
- package/dist/embedded-skills.d.ts +0 -6
- package/dist/embedded-skills.js +0 -14
package/dist/llm.js
CHANGED
|
@@ -3,10 +3,49 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Provides embeddings, text generation, and reranking using local GGUF models.
|
|
5
5
|
*/
|
|
6
|
-
|
|
6
|
+
let nodeLlamaCppImport = null;
|
|
7
|
+
async function loadNodeLlamaCpp() {
|
|
8
|
+
nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr(() => import("node-llama-cpp"));
|
|
9
|
+
return nodeLlamaCppImport;
|
|
10
|
+
}
|
|
11
|
+
export function setNodeLlamaCppModuleForTest(module) {
|
|
12
|
+
nodeLlamaCppImport = module ? Promise.resolve(module) : null;
|
|
13
|
+
failedGpuInitModes.clear();
|
|
14
|
+
noGpuAccelerationWarningShown = false;
|
|
15
|
+
cpuForcedPrebuiltFallbackWarningShown = false;
|
|
16
|
+
}
|
|
17
|
+
let nativeStdoutRedirectDepth = 0;
|
|
18
|
+
let originalStdoutWrite = null;
|
|
19
|
+
/**
|
|
20
|
+
* Some node-llama-cpp native build/probe paths write library noise to stdout.
|
|
21
|
+
* JSON APIs must reserve stdout for machine-readable payloads, so route that
|
|
22
|
+
* noise to stderr while native llama initialization is in progress.
|
|
23
|
+
*/
|
|
24
|
+
export async function withNativeStdoutRedirectedToStderr(fn) {
|
|
25
|
+
if (nativeStdoutRedirectDepth === 0) {
|
|
26
|
+
originalStdoutWrite = process.stdout.write.bind(process.stdout);
|
|
27
|
+
process.stdout.write = ((chunk, encodingOrCallback, callback) => {
|
|
28
|
+
if (typeof encodingOrCallback === "function") {
|
|
29
|
+
return process.stderr.write(chunk, encodingOrCallback);
|
|
30
|
+
}
|
|
31
|
+
return process.stderr.write(chunk, encodingOrCallback, callback);
|
|
32
|
+
});
|
|
33
|
+
}
|
|
34
|
+
nativeStdoutRedirectDepth++;
|
|
35
|
+
try {
|
|
36
|
+
return await fn();
|
|
37
|
+
}
|
|
38
|
+
finally {
|
|
39
|
+
nativeStdoutRedirectDepth--;
|
|
40
|
+
if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) {
|
|
41
|
+
process.stdout.write = originalStdoutWrite;
|
|
42
|
+
originalStdoutWrite = null;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
7
46
|
import { homedir } from "os";
|
|
8
47
|
import { join } from "path";
|
|
9
|
-
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
|
|
48
|
+
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
|
|
10
49
|
// =============================================================================
|
|
11
50
|
// Embedding Formatting Functions
|
|
12
51
|
// =============================================================================
|
|
@@ -23,7 +62,7 @@ export function isQwen3EmbeddingModel(modelUri) {
|
|
|
23
62
|
* Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
|
|
24
63
|
*/
|
|
25
64
|
export function formatQueryForEmbedding(query, modelUri) {
|
|
26
|
-
const uri = modelUri ??
|
|
65
|
+
const uri = modelUri ?? resolveEmbedModel();
|
|
27
66
|
if (isQwen3EmbeddingModel(uri)) {
|
|
28
67
|
return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
|
|
29
68
|
}
|
|
@@ -35,7 +74,7 @@ export function formatQueryForEmbedding(query, modelUri) {
|
|
|
35
74
|
* Qwen3-Embedding encodes documents as raw text without special prefixes.
|
|
36
75
|
*/
|
|
37
76
|
export function formatDocForEmbedding(text, title, modelUri) {
|
|
38
|
-
const uri = modelUri ??
|
|
77
|
+
const uri = modelUri ?? resolveEmbedModel();
|
|
39
78
|
if (isQwen3EmbeddingModel(uri)) {
|
|
40
79
|
// Qwen3-Embedding: documents are raw text, no task prefix
|
|
41
80
|
return title ? `${title}\n${text}` : text;
|
|
@@ -60,6 +99,22 @@ export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5
|
|
|
60
99
|
export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
|
|
61
100
|
export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
|
|
62
101
|
export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
|
|
102
|
+
export function resolveEmbedModel(config) {
|
|
103
|
+
return config?.embed || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
|
|
104
|
+
}
|
|
105
|
+
export function resolveGenerateModel(config) {
|
|
106
|
+
return config?.generate || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
|
|
107
|
+
}
|
|
108
|
+
export function resolveRerankModel(config) {
|
|
109
|
+
return config?.rerank || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
|
|
110
|
+
}
|
|
111
|
+
export function resolveModels(config) {
|
|
112
|
+
return {
|
|
113
|
+
embed: resolveEmbedModel(config),
|
|
114
|
+
generate: resolveGenerateModel(config),
|
|
115
|
+
rerank: resolveRerankModel(config),
|
|
116
|
+
};
|
|
117
|
+
}
|
|
63
118
|
// Local model cache directory
|
|
64
119
|
const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
|
|
65
120
|
? join(process.env.XDG_CACHE_HOME, "qmd", "models")
|
|
@@ -89,6 +144,106 @@ async function getRemoteEtag(ref) {
|
|
|
89
144
|
return null;
|
|
90
145
|
}
|
|
91
146
|
}
|
|
147
|
+
const GGUF_MAGIC = Buffer.from("GGUF");
|
|
148
|
+
function formatModelFileSize(sizeBytes) {
|
|
149
|
+
return `${(sizeBytes / 1024).toFixed(0)} KB`;
|
|
150
|
+
}
|
|
151
|
+
function printableMagic(header) {
|
|
152
|
+
const text = header.toString("utf-8");
|
|
153
|
+
return /^[\x20-\x7e]{1,4}$/.test(text) ? text : `0x${header.toString("hex")}`;
|
|
154
|
+
}
|
|
155
|
+
/**
|
|
156
|
+
* Inspect a potential GGUF model file without mutating it.
|
|
157
|
+
* Used by doctor for early diagnostics and by runtime validation before load.
|
|
158
|
+
*/
|
|
159
|
+
export function inspectGgufFile(filePath) {
|
|
160
|
+
if (!existsSync(filePath)) {
|
|
161
|
+
return { exists: false, valid: false, kind: "missing", details: "file does not exist" };
|
|
162
|
+
}
|
|
163
|
+
let sizeBytes = 0;
|
|
164
|
+
try {
|
|
165
|
+
sizeBytes = statSync(filePath).size;
|
|
166
|
+
const fd = openSync(filePath, "r");
|
|
167
|
+
const sniff = Buffer.alloc(512);
|
|
168
|
+
try {
|
|
169
|
+
readSync(fd, sniff, 0, 512, 0);
|
|
170
|
+
}
|
|
171
|
+
finally {
|
|
172
|
+
closeSync(fd);
|
|
173
|
+
}
|
|
174
|
+
const header = sniff.subarray(0, 4);
|
|
175
|
+
if (header.equals(GGUF_MAGIC)) {
|
|
176
|
+
return {
|
|
177
|
+
exists: true,
|
|
178
|
+
valid: true,
|
|
179
|
+
kind: "gguf",
|
|
180
|
+
sizeBytes,
|
|
181
|
+
magic: "GGUF",
|
|
182
|
+
details: `valid GGUF (${formatModelFileSize(sizeBytes)})`,
|
|
183
|
+
};
|
|
184
|
+
}
|
|
185
|
+
const magic = printableMagic(header);
|
|
186
|
+
const text = sniff.toString("utf-8").toLowerCase();
|
|
187
|
+
const isHtml = text.includes("<!doctype") || text.includes("<html");
|
|
188
|
+
if (isHtml) {
|
|
189
|
+
return {
|
|
190
|
+
exists: true,
|
|
191
|
+
valid: false,
|
|
192
|
+
kind: "html",
|
|
193
|
+
sizeBytes,
|
|
194
|
+
magic,
|
|
195
|
+
details: `HTML page, not a GGUF model (${formatModelFileSize(sizeBytes)}); likely proxy/firewall/captive portal response`,
|
|
196
|
+
};
|
|
197
|
+
}
|
|
198
|
+
return {
|
|
199
|
+
exists: true,
|
|
200
|
+
valid: false,
|
|
201
|
+
kind: "invalid",
|
|
202
|
+
sizeBytes,
|
|
203
|
+
magic,
|
|
204
|
+
details: `not valid GGUF (expected magic "GGUF", got "${magic}", ${formatModelFileSize(sizeBytes)})`,
|
|
205
|
+
};
|
|
206
|
+
}
|
|
207
|
+
catch (error) {
|
|
208
|
+
return {
|
|
209
|
+
exists: true,
|
|
210
|
+
valid: false,
|
|
211
|
+
kind: "invalid",
|
|
212
|
+
sizeBytes,
|
|
213
|
+
details: `cannot read model file: ${error instanceof Error ? error.message : String(error)}`,
|
|
214
|
+
};
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
/**
|
|
218
|
+
* Validate that a file is actually a GGUF model, not an HTML error page
|
|
219
|
+
* from a proxy, firewall, or failed download.
|
|
220
|
+
* Throws a descriptive error if the file is not valid GGUF.
|
|
221
|
+
*/
|
|
222
|
+
function validateGgufFile(filePath, modelUri) {
|
|
223
|
+
const inspection = inspectGgufFile(filePath);
|
|
224
|
+
if (!inspection.exists || inspection.valid)
|
|
225
|
+
return; // let downstream handle missing files
|
|
226
|
+
// Remove the bad file so the next attempt re-downloads
|
|
227
|
+
try {
|
|
228
|
+
unlinkSync(filePath);
|
|
229
|
+
}
|
|
230
|
+
catch { /* best effort */ }
|
|
231
|
+
if (inspection.kind === "html") {
|
|
232
|
+
throw new Error(`Downloaded model file is an HTML page, not a GGUF model (${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
|
|
233
|
+
`Something is intercepting the download from huggingface.co (a proxy, firewall, or captive portal).\n\n` +
|
|
234
|
+
`Model: ${modelUri}\n` +
|
|
235
|
+
`Path: ${filePath}\n\n` +
|
|
236
|
+
`To fix this, either:\n` +
|
|
237
|
+
` 1. Try a HuggingFace mirror: HF_ENDPOINT=https://hf-mirror.com qmd embed\n` +
|
|
238
|
+
` 2. Download the model manually and set the env var, e.g.:\n` +
|
|
239
|
+
` QMD_EMBED_MODEL=/path/to/model.gguf qmd embed\n\n` +
|
|
240
|
+
`Note: 'qmd search' works without any model downloads.`);
|
|
241
|
+
}
|
|
242
|
+
throw new Error(`Model file is not valid GGUF (expected magic "GGUF", got "${inspection.magic ?? "unknown"}", file is ${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
|
|
243
|
+
`Model: ${modelUri}\n` +
|
|
244
|
+
`Path: ${filePath}\n\n` +
|
|
245
|
+
`The file has been removed. Run the command again to re-download.`);
|
|
246
|
+
}
|
|
92
247
|
export async function pullModels(models, options = {}) {
|
|
93
248
|
const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
|
|
94
249
|
if (!existsSync(cacheDir)) {
|
|
@@ -129,7 +284,9 @@ export async function pullModels(models, options = {}) {
|
|
|
129
284
|
refreshed = true;
|
|
130
285
|
}
|
|
131
286
|
}
|
|
287
|
+
const { resolveModelFile } = await loadNodeLlamaCpp();
|
|
132
288
|
const path = await resolveModelFile(model, cacheDir);
|
|
289
|
+
validateGgufFile(path, model);
|
|
133
290
|
const sizeBytes = existsSync(path) ? statSync(path).size : 0;
|
|
134
291
|
if (hfRef && filename) {
|
|
135
292
|
const remoteEtag = await getRemoteEtag(hfRef);
|
|
@@ -148,6 +305,58 @@ export async function pullModels(models, options = {}) {
|
|
|
148
305
|
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
|
|
149
306
|
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
|
|
150
307
|
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
|
|
308
|
+
export function resolveParallelismOverride(envValue = process.env.QMD_EMBED_PARALLELISM) {
|
|
309
|
+
const normalized = envValue?.trim() ?? "";
|
|
310
|
+
if (!normalized)
|
|
311
|
+
return undefined;
|
|
312
|
+
const parsed = Number(normalized);
|
|
313
|
+
if (!Number.isInteger(parsed) || parsed < 1) {
|
|
314
|
+
process.stderr.write(`QMD Warning: invalid QMD_EMBED_PARALLELISM="${envValue}", using automatic parallelism.\n`);
|
|
315
|
+
return undefined;
|
|
316
|
+
}
|
|
317
|
+
return Math.min(8, parsed);
|
|
318
|
+
}
|
|
319
|
+
export function resolveSafeParallelism(options) {
|
|
320
|
+
const override = resolveParallelismOverride(options.envValue);
|
|
321
|
+
if (override !== undefined)
|
|
322
|
+
return override;
|
|
323
|
+
// node-llama-cpp/llama.cpp CUDA on Windows is unstable with multiple
|
|
324
|
+
// simultaneous contexts (ggml-cuda.cu:98 in #519). Vulkan and CPU do not
|
|
325
|
+
// show the same failure mode, so only serialize Windows CUDA by default.
|
|
326
|
+
if ((options.platform ?? process.platform) === "win32" && options.gpu === "cuda") {
|
|
327
|
+
return 1;
|
|
328
|
+
}
|
|
329
|
+
return Math.max(1, options.computed);
|
|
330
|
+
}
|
|
331
|
+
export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU, forceCpuValue = process.env.QMD_FORCE_CPU) {
|
|
332
|
+
const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
|
|
333
|
+
if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) {
|
|
334
|
+
return false;
|
|
335
|
+
}
|
|
336
|
+
const normalized = envValue?.trim().toLowerCase() ?? "";
|
|
337
|
+
if (!normalized)
|
|
338
|
+
return "auto";
|
|
339
|
+
if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized))
|
|
340
|
+
return false;
|
|
341
|
+
if (normalized === "metal" || normalized === "vulkan" || normalized === "cuda")
|
|
342
|
+
return normalized;
|
|
343
|
+
process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
|
|
344
|
+
return "auto";
|
|
345
|
+
}
|
|
346
|
+
async function disposeWithTimeout(resourceName, dispose, timeoutMs = 1000) {
|
|
347
|
+
const timeoutPromise = new Promise((resolve) => {
|
|
348
|
+
setTimeout(() => resolve("timeout"), timeoutMs).unref();
|
|
349
|
+
});
|
|
350
|
+
try {
|
|
351
|
+
const result = await Promise.race([dispose(), timeoutPromise]);
|
|
352
|
+
if (result === "timeout") {
|
|
353
|
+
process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
catch (error) {
|
|
357
|
+
process.stderr.write(`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
151
360
|
function resolveExpandContextSize(configValue) {
|
|
152
361
|
if (configValue !== undefined) {
|
|
153
362
|
if (!Number.isInteger(configValue) || configValue <= 0) {
|
|
@@ -165,6 +374,12 @@ function resolveExpandContextSize(configValue) {
|
|
|
165
374
|
}
|
|
166
375
|
return parsed;
|
|
167
376
|
}
|
|
377
|
+
const failedGpuInitModes = new Set();
|
|
378
|
+
let noGpuAccelerationWarningShown = false;
|
|
379
|
+
let cpuForcedPrebuiltFallbackWarningShown = false;
|
|
380
|
+
function isCpuModeRequested() {
|
|
381
|
+
return resolveLlamaGpuMode() === false;
|
|
382
|
+
}
|
|
168
383
|
export class LlamaCpp {
|
|
169
384
|
_ciMode = !!process.env.CI;
|
|
170
385
|
llama = null;
|
|
@@ -189,9 +404,9 @@ export class LlamaCpp {
|
|
|
189
404
|
// Track disposal state to prevent double-dispose
|
|
190
405
|
disposed = false;
|
|
191
406
|
constructor(config = {}) {
|
|
192
|
-
this.embedModelUri = config.embedModel
|
|
193
|
-
this.generateModelUri = config.generateModel
|
|
194
|
-
this.rerankModelUri = config.rerankModel
|
|
407
|
+
this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
|
|
408
|
+
this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
|
|
409
|
+
this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
|
|
195
410
|
this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
|
|
196
411
|
this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
|
|
197
412
|
this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
|
|
@@ -200,6 +415,12 @@ export class LlamaCpp {
|
|
|
200
415
|
get embedModelName() {
|
|
201
416
|
return this.embedModelUri;
|
|
202
417
|
}
|
|
418
|
+
get generateModelName() {
|
|
419
|
+
return this.generateModelUri;
|
|
420
|
+
}
|
|
421
|
+
get rerankModelName() {
|
|
422
|
+
return this.rerankModelUri;
|
|
423
|
+
}
|
|
203
424
|
/**
|
|
204
425
|
* Reset the inactivity timer. Called after each model operation.
|
|
205
426
|
* When timer fires, models are unloaded to free memory (if no active sessions).
|
|
@@ -292,45 +513,113 @@ export class LlamaCpp {
|
|
|
292
513
|
/**
|
|
293
514
|
* Initialize the llama instance (lazy)
|
|
294
515
|
*/
|
|
295
|
-
async ensureLlama() {
|
|
516
|
+
async ensureLlama(allowBuild = true) {
|
|
296
517
|
if (!this.llama) {
|
|
297
|
-
|
|
298
|
-
const
|
|
299
|
-
const
|
|
300
|
-
|
|
301
|
-
|
|
518
|
+
const gpuMode = resolveLlamaGpuMode();
|
|
519
|
+
const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
|
|
520
|
+
const loadLlama = async (gpu, sourceBuildAllowed = allowBuild, buildOverride) => await withNativeStdoutRedirectedToStderr(() => getLlama({
|
|
521
|
+
// Prefer packaged prebuilt bindings before compiling llama.cpp locally.
|
|
522
|
+
// node-llama-cpp documents gpu:"auto" as the best default: Metal on
|
|
523
|
+
// Apple Silicon, CUDA when fully available, Vulkan where available,
|
|
524
|
+
// then CPU. Use build:"auto" for normal loads and build:"never" for
|
|
525
|
+
// diagnostic/probe paths that must not compile llama.cpp.
|
|
526
|
+
build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
|
|
302
527
|
logLevel: LlamaLogLevel.error,
|
|
303
528
|
gpu,
|
|
304
|
-
|
|
529
|
+
progressLogs: false,
|
|
530
|
+
skipDownload: !sourceBuildAllowed,
|
|
531
|
+
}));
|
|
532
|
+
const loadCpuCompatibleLlama = async () => {
|
|
533
|
+
try {
|
|
534
|
+
return await loadLlama(false, false);
|
|
535
|
+
}
|
|
536
|
+
catch (err) {
|
|
537
|
+
// Some platforms, notably Apple Silicon, ship a Metal prebuilt but no
|
|
538
|
+
// CPU-only prebuilt. Do a fast no-build lookup for an actual CPU
|
|
539
|
+
// binding first; if it does not exist, use the packaged auto/Metal
|
|
540
|
+
// binding and disable model offloading via gpuLayers: 0.
|
|
541
|
+
if (!cpuForcedPrebuiltFallbackWarningShown) {
|
|
542
|
+
cpuForcedPrebuiltFallbackWarningShown = true;
|
|
543
|
+
process.stderr.write(`QMD Warning: CPU-only llama.cpp prebuilt not available (${err instanceof Error ? err.message : String(err)}); using packaged backend with GPU offloading disabled.\n`);
|
|
544
|
+
}
|
|
545
|
+
return await loadLlama("auto", false);
|
|
546
|
+
}
|
|
547
|
+
};
|
|
305
548
|
let llama;
|
|
306
|
-
if (
|
|
307
|
-
llama = await
|
|
549
|
+
if (gpuMode === false) {
|
|
550
|
+
llama = await loadCpuCompatibleLlama();
|
|
551
|
+
}
|
|
552
|
+
else if (failedGpuInitModes.has(gpuMode)) {
|
|
553
|
+
process.stderr.write(`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`);
|
|
554
|
+
llama = await loadCpuCompatibleLlama();
|
|
308
555
|
}
|
|
309
556
|
else {
|
|
310
557
|
try {
|
|
311
|
-
llama = await loadLlama(
|
|
558
|
+
llama = await loadLlama(gpuMode);
|
|
559
|
+
// If node-llama-cpp auto-detection chose CPU, do one no-build pass
|
|
560
|
+
// over all OS-valid packaged GPU backends. This preserves the
|
|
561
|
+
// documented auto mode for Metal/CUDA/Vulkan while recovering on
|
|
562
|
+
// systems where a packaged backend can load but detection is too
|
|
563
|
+
// conservative. Never compile during these extra probes.
|
|
564
|
+
if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
|
|
565
|
+
const candidates = (await getLlamaGpuTypes("allValid"))
|
|
566
|
+
.filter((candidate) => candidate !== false && candidate !== "auto");
|
|
567
|
+
for (const candidate of candidates) {
|
|
568
|
+
if (failedGpuInitModes.has(candidate))
|
|
569
|
+
continue;
|
|
570
|
+
try {
|
|
571
|
+
const gpuLlama = await loadLlama(candidate, false, "never");
|
|
572
|
+
if (gpuLlama.gpu !== false) {
|
|
573
|
+
await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
|
|
574
|
+
llama = gpuLlama;
|
|
575
|
+
break;
|
|
576
|
+
}
|
|
577
|
+
await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
|
|
578
|
+
}
|
|
579
|
+
catch {
|
|
580
|
+
failedGpuInitModes.add(candidate);
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
}
|
|
312
584
|
}
|
|
313
585
|
catch (err) {
|
|
314
|
-
// GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
|
|
315
|
-
// Fall back to CPU so qmd still works
|
|
316
|
-
|
|
317
|
-
|
|
586
|
+
// GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
|
|
587
|
+
// Fall back to CPU so qmd still works, and cache the failure to avoid repeated
|
|
588
|
+
// expensive native build/probe attempts in this process.
|
|
589
|
+
failedGpuInitModes.add(gpuMode);
|
|
590
|
+
process.stderr.write(`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
|
|
591
|
+
llama = await loadCpuCompatibleLlama();
|
|
318
592
|
}
|
|
319
593
|
}
|
|
320
|
-
if (llama.gpu === false) {
|
|
321
|
-
|
|
594
|
+
if (llama.gpu === false && !noGpuAccelerationWarningShown) {
|
|
595
|
+
noGpuAccelerationWarningShown = true;
|
|
596
|
+
process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n");
|
|
322
597
|
}
|
|
323
598
|
this.llama = llama;
|
|
324
599
|
}
|
|
325
600
|
return this.llama;
|
|
326
601
|
}
|
|
602
|
+
isCpuOffloadForced() {
|
|
603
|
+
return isCpuModeRequested();
|
|
604
|
+
}
|
|
605
|
+
modelLoadOptions(modelPath) {
|
|
606
|
+
return {
|
|
607
|
+
modelPath,
|
|
608
|
+
...(this.isCpuOffloadForced() ? { gpuLayers: 0 } : {}),
|
|
609
|
+
};
|
|
610
|
+
}
|
|
327
611
|
/**
|
|
328
|
-
* Resolve a model URI to a local path, downloading if needed
|
|
612
|
+
* Resolve a model URI to a local path, downloading if needed.
|
|
613
|
+
* Validates the downloaded file is actually a GGUF model (not an HTML error page
|
|
614
|
+
* from a proxy or firewall).
|
|
329
615
|
*/
|
|
330
616
|
async resolveModel(modelUri) {
|
|
331
617
|
this.ensureModelCacheDir();
|
|
332
618
|
// resolveModelFile handles HF URIs and downloads to the cache dir
|
|
333
|
-
|
|
619
|
+
const { resolveModelFile } = await loadNodeLlamaCpp();
|
|
620
|
+
const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
|
|
621
|
+
validateGgufFile(modelPath, modelUri);
|
|
622
|
+
return modelPath;
|
|
334
623
|
}
|
|
335
624
|
/**
|
|
336
625
|
* Load embedding model (lazy)
|
|
@@ -345,7 +634,7 @@ export class LlamaCpp {
|
|
|
345
634
|
this.embedModelLoadPromise = (async () => {
|
|
346
635
|
const llama = await this.ensureLlama();
|
|
347
636
|
const modelPath = await this.resolveModel(this.embedModelUri);
|
|
348
|
-
const model = await llama.loadModel(
|
|
637
|
+
const model = await llama.loadModel(this.modelLoadOptions(modelPath));
|
|
349
638
|
this.embedModel = model;
|
|
350
639
|
// Model loading counts as activity - ping to keep alive
|
|
351
640
|
this.touchActivity();
|
|
@@ -369,21 +658,23 @@ export class LlamaCpp {
|
|
|
369
658
|
*/
|
|
370
659
|
async computeParallelism(perContextMB) {
|
|
371
660
|
const llama = await this.ensureLlama();
|
|
372
|
-
if (llama.gpu) {
|
|
661
|
+
if (!this.isCpuOffloadForced() && llama.gpu) {
|
|
373
662
|
try {
|
|
374
663
|
const vram = await llama.getVramState();
|
|
375
664
|
const freeMB = vram.free / (1024 * 1024);
|
|
376
665
|
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
|
|
377
|
-
|
|
666
|
+
const computed = Math.max(1, Math.min(8, maxByVram));
|
|
667
|
+
return resolveSafeParallelism({ gpu: llama.gpu, computed });
|
|
378
668
|
}
|
|
379
669
|
catch {
|
|
380
|
-
return 2;
|
|
670
|
+
return resolveSafeParallelism({ gpu: llama.gpu, computed: 2 });
|
|
381
671
|
}
|
|
382
672
|
}
|
|
383
673
|
// CPU: split cores across contexts. At least 4 threads per context.
|
|
384
674
|
const cores = llama.cpuMathCores || 4;
|
|
385
675
|
const maxContexts = Math.floor(cores / 4);
|
|
386
|
-
|
|
676
|
+
const computed = Math.max(1, Math.min(4, maxContexts));
|
|
677
|
+
return resolveSafeParallelism({ gpu: false, computed });
|
|
387
678
|
}
|
|
388
679
|
/**
|
|
389
680
|
* Get the number of threads each context should use, given N parallel contexts.
|
|
@@ -391,7 +682,7 @@ export class LlamaCpp {
|
|
|
391
682
|
*/
|
|
392
683
|
async threadsPerContext(parallelism) {
|
|
393
684
|
const llama = await this.ensureLlama();
|
|
394
|
-
if (llama.gpu)
|
|
685
|
+
if (!this.isCpuOffloadForced() && llama.gpu)
|
|
395
686
|
return 0; // GPU: let the library decide
|
|
396
687
|
const cores = llama.cpuMathCores || 4;
|
|
397
688
|
return Math.max(1, Math.floor(cores / parallelism));
|
|
@@ -455,7 +746,7 @@ export class LlamaCpp {
|
|
|
455
746
|
this.generateModelLoadPromise = (async () => {
|
|
456
747
|
const llama = await this.ensureLlama();
|
|
457
748
|
const modelPath = await this.resolveModel(this.generateModelUri);
|
|
458
|
-
const model = await llama.loadModel(
|
|
749
|
+
const model = await llama.loadModel(this.modelLoadOptions(modelPath));
|
|
459
750
|
this.generateModel = model;
|
|
460
751
|
return model;
|
|
461
752
|
})();
|
|
@@ -485,7 +776,7 @@ export class LlamaCpp {
|
|
|
485
776
|
this.rerankModelLoadPromise = (async () => {
|
|
486
777
|
const llama = await this.ensureLlama();
|
|
487
778
|
const modelPath = await this.resolveModel(this.rerankModelUri);
|
|
488
|
-
const model = await llama.loadModel(
|
|
779
|
+
const model = await llama.loadModel(this.modelLoadOptions(modelPath));
|
|
489
780
|
this.rerankModel = model;
|
|
490
781
|
// Model loading counts as activity - ping to keep alive
|
|
491
782
|
this.touchActivity();
|
|
@@ -532,7 +823,6 @@ export class LlamaCpp {
|
|
|
532
823
|
try {
|
|
533
824
|
this.rerankContexts.push(await model.createRankingContext({
|
|
534
825
|
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
|
|
535
|
-
flashAttention: true,
|
|
536
826
|
...(threads > 0 ? { threads } : {}),
|
|
537
827
|
}));
|
|
538
828
|
}
|
|
@@ -596,20 +886,27 @@ export class LlamaCpp {
|
|
|
596
886
|
* detokenizes back to text if truncation is needed.
|
|
597
887
|
* Returns the (possibly truncated) text and whether truncation occurred.
|
|
598
888
|
*/
|
|
889
|
+
resolveEmbedTokenLimit() {
|
|
890
|
+
const trainedContextSize = this.embedModel?.trainContextSize;
|
|
891
|
+
if (typeof trainedContextSize === "number" && Number.isFinite(trainedContextSize) && trainedContextSize > 0) {
|
|
892
|
+
return Math.max(1, Math.min(LlamaCpp.EMBED_CONTEXT_SIZE, trainedContextSize));
|
|
893
|
+
}
|
|
894
|
+
return LlamaCpp.EMBED_CONTEXT_SIZE;
|
|
895
|
+
}
|
|
599
896
|
async truncateToContextSize(text) {
|
|
600
897
|
if (!this.embedModel)
|
|
601
|
-
return { text, truncated: false };
|
|
602
|
-
const maxTokens = this.
|
|
898
|
+
return { text, truncated: false, limit: LlamaCpp.EMBED_CONTEXT_SIZE };
|
|
899
|
+
const maxTokens = this.resolveEmbedTokenLimit();
|
|
603
900
|
if (maxTokens <= 0)
|
|
604
|
-
return { text, truncated: false };
|
|
901
|
+
return { text, truncated: false, limit: maxTokens };
|
|
605
902
|
const tokens = this.embedModel.tokenize(text);
|
|
606
903
|
if (tokens.length <= maxTokens)
|
|
607
|
-
return { text, truncated: false };
|
|
904
|
+
return { text, truncated: false, limit: maxTokens };
|
|
608
905
|
// Leave a small margin (4 tokens) for BOS/EOS overhead
|
|
609
906
|
const safeLimit = Math.max(1, maxTokens - 4);
|
|
610
907
|
const truncatedTokens = tokens.slice(0, safeLimit);
|
|
611
908
|
const truncatedText = this.embedModel.detokenize(truncatedTokens);
|
|
612
|
-
return { text: truncatedText, truncated: true };
|
|
909
|
+
return { text: truncatedText, truncated: true, limit: maxTokens };
|
|
613
910
|
}
|
|
614
911
|
async embed(text, options = {}) {
|
|
615
912
|
// Ping activity at start to keep models alive during this operation
|
|
@@ -617,9 +914,9 @@ export class LlamaCpp {
|
|
|
617
914
|
try {
|
|
618
915
|
const context = await this.ensureEmbedContext();
|
|
619
916
|
// Guard: truncate text that exceeds model context window to prevent GGML crash
|
|
620
|
-
const { text: safeText, truncated } = await this.truncateToContextSize(text);
|
|
917
|
+
const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
|
|
621
918
|
if (truncated) {
|
|
622
|
-
console.warn(`⚠ Text truncated to fit embedding context (${
|
|
919
|
+
console.warn(`⚠ Text truncated to fit embedding context (${limit} tokens)`);
|
|
623
920
|
}
|
|
624
921
|
const embedding = await context.getEmbeddingFor(safeText);
|
|
625
922
|
return {
|
|
@@ -652,9 +949,9 @@ export class LlamaCpp {
|
|
|
652
949
|
const embeddings = [];
|
|
653
950
|
for (const text of texts) {
|
|
654
951
|
try {
|
|
655
|
-
const { text: safeText, truncated } = await this.truncateToContextSize(text);
|
|
952
|
+
const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
|
|
656
953
|
if (truncated) {
|
|
657
|
-
console.warn(`⚠ Batch text truncated to fit embedding context (${
|
|
954
|
+
console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
|
|
658
955
|
}
|
|
659
956
|
const embedding = await context.getEmbeddingFor(safeText);
|
|
660
957
|
this.touchActivity();
|
|
@@ -675,9 +972,9 @@ export class LlamaCpp {
|
|
|
675
972
|
const results = [];
|
|
676
973
|
for (const text of chunk) {
|
|
677
974
|
try {
|
|
678
|
-
const { text: safeText, truncated } = await this.truncateToContextSize(text);
|
|
975
|
+
const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
|
|
679
976
|
if (truncated) {
|
|
680
|
-
console.warn(`⚠ Batch text truncated to fit embedding context (${
|
|
977
|
+
console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
|
|
681
978
|
}
|
|
682
979
|
const embedding = await ctx.getEmbeddingFor(safeText);
|
|
683
980
|
this.touchActivity();
|
|
@@ -707,6 +1004,7 @@ export class LlamaCpp {
|
|
|
707
1004
|
// Create fresh context -> sequence -> session for each call
|
|
708
1005
|
const context = await this.generateModel.createContext();
|
|
709
1006
|
const sequence = context.getSequence();
|
|
1007
|
+
const { LlamaChatSession } = await loadNodeLlamaCpp();
|
|
710
1008
|
const session = new LlamaChatSession({ contextSequence: sequence });
|
|
711
1009
|
const maxTokens = options.maxTokens ?? 150;
|
|
712
1010
|
// Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
|
|
@@ -776,6 +1074,7 @@ export class LlamaCpp {
|
|
|
776
1074
|
contextSize: this.expandContextSize,
|
|
777
1075
|
});
|
|
778
1076
|
const sequence = genContext.getSequence();
|
|
1077
|
+
const { LlamaChatSession } = await loadNodeLlamaCpp();
|
|
779
1078
|
const session = new LlamaChatSession({ contextSequence: sequence });
|
|
780
1079
|
try {
|
|
781
1080
|
// Qwen3 recommended settings for non-thinking mode:
|
|
@@ -916,11 +1215,12 @@ export class LlamaCpp {
|
|
|
916
1215
|
* Get device/GPU info for status display.
|
|
917
1216
|
* Initializes llama if not already done.
|
|
918
1217
|
*/
|
|
919
|
-
async getDeviceInfo() {
|
|
920
|
-
const llama = await this.ensureLlama();
|
|
921
|
-
const
|
|
1218
|
+
async getDeviceInfo(options = {}) {
|
|
1219
|
+
const llama = await this.ensureLlama(options.allowBuild ?? true);
|
|
1220
|
+
const cpuForced = this.isCpuOffloadForced();
|
|
1221
|
+
const gpuDevices = cpuForced ? [] : await llama.getGpuDeviceNames();
|
|
922
1222
|
let vram;
|
|
923
|
-
if (llama.gpu) {
|
|
1223
|
+
if (!cpuForced && llama.gpu) {
|
|
924
1224
|
try {
|
|
925
1225
|
const state = await llama.getVramState();
|
|
926
1226
|
vram = { total: state.total, used: state.used, free: state.free };
|
|
@@ -928,8 +1228,8 @@ export class LlamaCpp {
|
|
|
928
1228
|
catch { /* no vram info */ }
|
|
929
1229
|
}
|
|
930
1230
|
return {
|
|
931
|
-
gpu: llama.gpu,
|
|
932
|
-
gpuOffloading: llama.supportsGpuOffloading,
|
|
1231
|
+
gpu: cpuForced ? false : llama.gpu,
|
|
1232
|
+
gpuOffloading: !cpuForced && llama.supportsGpuOffloading,
|
|
933
1233
|
gpuDevices,
|
|
934
1234
|
vram,
|
|
935
1235
|
cpuCores: llama.cpuMathCores,
|
|
@@ -946,21 +1246,34 @@ export class LlamaCpp {
|
|
|
946
1246
|
clearTimeout(this.inactivityTimer);
|
|
947
1247
|
this.inactivityTimer = null;
|
|
948
1248
|
}
|
|
949
|
-
//
|
|
950
|
-
//
|
|
951
|
-
//
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
await Promise.race([disposePromise, timeoutPromise]);
|
|
1249
|
+
// Explicitly dispose in dependency order: contexts first, then models, then llama.
|
|
1250
|
+
// Relying only on llama.dispose() leaves Metal resource sets alive until process
|
|
1251
|
+
// finalization on Apple Silicon, where ggml_metal_device_free can abort after
|
|
1252
|
+
// otherwise-successful CLI output (#368).
|
|
1253
|
+
for (const ctx of this.embedContexts) {
|
|
1254
|
+
await disposeWithTimeout("embedding context", () => ctx.dispose());
|
|
956
1255
|
}
|
|
957
|
-
// Clear references
|
|
958
1256
|
this.embedContexts = [];
|
|
1257
|
+
for (const ctx of this.rerankContexts) {
|
|
1258
|
+
await disposeWithTimeout("rerank context", () => ctx.dispose());
|
|
1259
|
+
}
|
|
959
1260
|
this.rerankContexts = [];
|
|
960
|
-
this.embedModel
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
1261
|
+
if (this.embedModel) {
|
|
1262
|
+
await disposeWithTimeout("embedding model", () => this.embedModel.dispose());
|
|
1263
|
+
this.embedModel = null;
|
|
1264
|
+
}
|
|
1265
|
+
if (this.generateModel) {
|
|
1266
|
+
await disposeWithTimeout("generation model", () => this.generateModel.dispose());
|
|
1267
|
+
this.generateModel = null;
|
|
1268
|
+
}
|
|
1269
|
+
if (this.rerankModel) {
|
|
1270
|
+
await disposeWithTimeout("rerank model", () => this.rerankModel.dispose());
|
|
1271
|
+
this.rerankModel = null;
|
|
1272
|
+
}
|
|
1273
|
+
if (this.llama) {
|
|
1274
|
+
await disposeWithTimeout("llama runtime", () => this.llama.dispose());
|
|
1275
|
+
this.llama = null;
|
|
1276
|
+
}
|
|
964
1277
|
// Clear any in-flight load/create promises
|
|
965
1278
|
this.embedModelLoadPromise = null;
|
|
966
1279
|
this.embedContextsCreatePromise = null;
|