@tobilu/qmd 2.0.1 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +177 -0
- package/README.md +64 -1
- package/bin/qmd +49 -4
- package/dist/ast.d.ts +65 -0
- package/dist/ast.js +334 -0
- package/dist/bench/bench.d.ts +23 -0
- package/dist/bench/bench.js +280 -0
- package/dist/bench/score.d.ts +33 -0
- package/dist/bench/score.js +88 -0
- package/dist/bench/types.d.ts +80 -0
- package/dist/bench/types.js +8 -0
- package/dist/cli/formatter.js +5 -1
- package/dist/cli/qmd.d.ts +27 -0
- package/dist/cli/qmd.js +1328 -115
- package/dist/collections.d.ts +20 -0
- package/dist/collections.js +32 -7
- package/dist/db.d.ts +14 -3
- package/dist/db.js +45 -4
- package/dist/index.d.ts +11 -1
- package/dist/index.js +18 -5
- package/dist/llm.d.ts +77 -6
- package/dist/llm.js +445 -62
- package/dist/mcp/server.d.ts +6 -3
- package/dist/mcp/server.js +68 -29
- package/dist/paths.d.ts +1 -0
- package/dist/paths.js +4 -0
- package/dist/store.d.ts +148 -23
- package/dist/store.js +1018 -255
- package/package.json +48 -20
- package/scripts/build.mjs +29 -0
- package/scripts/check-package-grammars.mjs +29 -0
- package/scripts/package-smoke.mjs +65 -0
- package/scripts/test-all.mjs +27 -0
- package/skills/qmd/SKILL.md +203 -0
- package/skills/qmd/references/mcp-setup.md +102 -0
- package/skills/release/SKILL.md +139 -0
- package/skills/release/scripts/install-hooks.sh +38 -0
- package/dist/embedded-skills.d.ts +0 -6
- package/dist/embedded-skills.js +0 -14
package/dist/llm.js
CHANGED
|
@@ -3,10 +3,49 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Provides embeddings, text generation, and reranking using local GGUF models.
|
|
5
5
|
*/
|
|
6
|
-
|
|
6
|
+
// Cached promise for the lazily imported node-llama-cpp module; null until first use.
let nodeLlamaCppImport = null;
/**
 * Import node-llama-cpp on first use and hand back the same promise on every
 * later call. The import itself runs with stdout redirected to stderr.
 *
 * @returns {Promise<object>} the node-llama-cpp module namespace (or the test double).
 */
async function loadNodeLlamaCpp() {
    if (nodeLlamaCppImport === null || nodeLlamaCppImport === undefined) {
        nodeLlamaCppImport = withNativeStdoutRedirectedToStderr(() => import("node-llama-cpp"));
    }
    return nodeLlamaCppImport;
}
|
|
11
|
+
/**
 * Test hook: install a replacement node-llama-cpp module (or clear the cached
 * import when called with a falsy value) and reset the process-wide GPU
 * bookkeeping so each test starts from a clean slate.
 *
 * @param {object|null|undefined} module - stand-in module, or a falsy value to reset.
 */
export function setNodeLlamaCppModuleForTest(module) {
    if (module) {
        nodeLlamaCppImport = Promise.resolve(module);
    }
    else {
        nodeLlamaCppImport = null;
    }
    // Forget cached GPU failures and one-shot warnings.
    noGpuAccelerationWarningShown = false;
    cpuForcedPrebuiltFallbackWarningShown = false;
    failedGpuInitModes.clear();
}
|
|
17
|
+
// Number of redirect scopes currently active (calls may nest).
let nativeStdoutRedirectDepth = 0;
// Bound copy of the real stdout writer, kept while a redirect is active.
let originalStdoutWrite = null;
/**
 * Some node-llama-cpp native build/probe paths write library noise to stdout.
 * JSON APIs must reserve stdout for machine-readable payloads, so that noise
 * is sent to stderr for as long as `fn` is running.
 *
 * Calls may nest: only the outermost call swaps `process.stdout.write`, and
 * only the outermost call puts it back, whether `fn` resolves or throws.
 *
 * @param {Function} fn - sync or async callback to run under the redirect.
 * @returns {Promise<*>} whatever `fn` returns or resolves to.
 */
export async function withNativeStdoutRedirectedToStderr(fn) {
    const isOutermostCall = nativeStdoutRedirectDepth === 0;
    if (isOutermostCall) {
        originalStdoutWrite = process.stdout.write.bind(process.stdout);
        // Mirror the write(chunk, [encoding], [callback]) overloads onto stderr.
        const forwardToStderr = (chunk, encodingOrCallback, callback) => typeof encodingOrCallback === "function"
            ? process.stderr.write(chunk, encodingOrCallback)
            : process.stderr.write(chunk, encodingOrCallback, callback);
        process.stdout.write = forwardToStderr;
    }
    nativeStdoutRedirectDepth += 1;
    try {
        return await fn();
    }
    finally {
        nativeStdoutRedirectDepth -= 1;
        const shouldRestore = nativeStdoutRedirectDepth === 0 && originalStdoutWrite;
        if (shouldRestore) {
            process.stdout.write = originalStdoutWrite;
            originalStdoutWrite = null;
        }
    }
}
|
|
7
46
|
import { homedir } from "os";
|
|
8
47
|
import { join } from "path";
|
|
9
|
-
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
|
|
48
|
+
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
|
|
10
49
|
// =============================================================================
|
|
11
50
|
// Embedding Formatting Functions
|
|
12
51
|
// =============================================================================
|
|
@@ -23,7 +62,7 @@ export function isQwen3EmbeddingModel(modelUri) {
|
|
|
23
62
|
* Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
|
|
24
63
|
*/
|
|
25
64
|
export function formatQueryForEmbedding(query, modelUri) {
|
|
26
|
-
const uri = modelUri ??
|
|
65
|
+
const uri = modelUri ?? resolveEmbedModel();
|
|
27
66
|
if (isQwen3EmbeddingModel(uri)) {
|
|
28
67
|
return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
|
|
29
68
|
}
|
|
@@ -35,7 +74,7 @@ export function formatQueryForEmbedding(query, modelUri) {
|
|
|
35
74
|
* Qwen3-Embedding encodes documents as raw text without special prefixes.
|
|
36
75
|
*/
|
|
37
76
|
export function formatDocForEmbedding(text, title, modelUri) {
|
|
38
|
-
const uri = modelUri ??
|
|
77
|
+
const uri = modelUri ?? resolveEmbedModel();
|
|
39
78
|
if (isQwen3EmbeddingModel(uri)) {
|
|
40
79
|
// Qwen3-Embedding: documents are raw text, no task prefix
|
|
41
80
|
return title ? `${title}\n${text}` : text;
|
|
@@ -48,7 +87,7 @@ export function formatDocForEmbedding(text, title, modelUri) {
|
|
|
48
87
|
// HuggingFace model URIs for node-llama-cpp
|
|
49
88
|
// Format: hf:<user>/<repo>/<file>
|
|
50
89
|
// Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
|
|
51
|
-
const DEFAULT_EMBED_MODEL =
|
|
90
|
+
const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
|
|
52
91
|
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
|
53
92
|
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
|
|
54
93
|
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
|
@@ -60,8 +99,26 @@ export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5
|
|
|
60
99
|
export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
|
|
61
100
|
export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
|
|
62
101
|
export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
|
|
102
|
+
/**
 * Pick the embedding model URI: explicit config first, then the
 * QMD_EMBED_MODEL environment variable, then the built-in default.
 * Empty strings count as "not set".
 *
 * @param {{embed?: string}} [config]
 * @returns {string}
 */
export function resolveEmbedModel(config) {
    const configured = config?.embed;
    if (configured) {
        return configured;
    }
    const fromEnv = process.env.QMD_EMBED_MODEL;
    return fromEnv ? fromEnv : DEFAULT_EMBED_MODEL;
}
|
|
105
|
+
/**
 * Pick the generation model URI: explicit config first, then the
 * QMD_GENERATE_MODEL environment variable, then the built-in default.
 * Empty strings count as "not set".
 *
 * @param {{generate?: string}} [config]
 * @returns {string}
 */
export function resolveGenerateModel(config) {
    const configured = config?.generate;
    if (configured) {
        return configured;
    }
    const fromEnv = process.env.QMD_GENERATE_MODEL;
    return fromEnv ? fromEnv : DEFAULT_GENERATE_MODEL;
}
|
|
108
|
+
/**
 * Pick the reranker model URI: explicit config first, then the
 * QMD_RERANK_MODEL environment variable, then the built-in default.
 * Empty strings count as "not set".
 *
 * @param {{rerank?: string}} [config]
 * @returns {string}
 */
export function resolveRerankModel(config) {
    const configured = config?.rerank;
    if (configured) {
        return configured;
    }
    const fromEnv = process.env.QMD_RERANK_MODEL;
    return fromEnv ? fromEnv : DEFAULT_RERANK_MODEL;
}
|
|
111
|
+
/**
 * Resolve all three model URIs (embed, generate, rerank) from one config
 * object, applying the same config → environment → default precedence to each.
 *
 * @param {{embed?: string, generate?: string, rerank?: string}} [config]
 * @returns {{embed: string, generate: string, rerank: string}}
 */
export function resolveModels(config) {
    const embed = resolveEmbedModel(config);
    const generate = resolveGenerateModel(config);
    const rerank = resolveRerankModel(config);
    return { embed, generate, rerank };
}
|
|
63
118
|
// Local model cache directory
|
|
64
|
-
const MODEL_CACHE_DIR =
|
|
119
|
+
// Models live under "<cache root>/qmd/models", where the cache root is
// $XDG_CACHE_HOME when set and non-empty, otherwise "~/.cache".
const MODEL_CACHE_DIR = join(process.env.XDG_CACHE_HOME ? process.env.XDG_CACHE_HOME : join(homedir(), ".cache"), "qmd", "models");
|
|
65
122
|
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
|
|
66
123
|
function parseHfUri(model) {
|
|
67
124
|
if (!model.startsWith("hf:"))
|
|
@@ -87,6 +144,106 @@ async function getRemoteEtag(ref) {
|
|
|
87
144
|
return null;
|
|
88
145
|
}
|
|
89
146
|
}
|
|
147
|
+
const GGUF_MAGIC = Buffer.from("GGUF");
|
|
148
|
+
/**
 * Render a byte count as whole kilobytes (1 KB = 1024 bytes), e.g. "2 KB".
 *
 * @param {number} sizeBytes
 * @returns {string}
 */
function formatModelFileSize(sizeBytes) {
    const kilobytes = sizeBytes / 1024;
    const rounded = kilobytes.toFixed(0);
    return `${rounded} KB`;
}
|
|
151
|
+
/**
 * Turn a file's leading bytes into something safe to show in a message:
 * the text itself when it is 1-4 printable ASCII characters, otherwise a
 * "0x"-prefixed hex dump of the bytes.
 *
 * @param {Buffer} header
 * @returns {string}
 */
function printableMagic(header) {
    const asText = header.toString("utf-8");
    const isPrintableAscii = /^[\x20-\x7e]{1,4}$/.test(asText);
    if (isPrintableAscii) {
        return asText;
    }
    return `0x${header.toString("hex")}`;
}
|
|
155
|
+
/**
 * Inspect a potential GGUF model file without mutating it.
 * Used by doctor for early diagnostics and by runtime validation before load.
 *
 * Only the first 512 bytes are read. Classification is based on the bytes
 * actually read from disk, so files shorter than the 4-byte magic (including
 * empty files) are reported as they are instead of being padded with zeros.
 *
 * @param {string} filePath - path of the file to inspect.
 * @returns {{exists: boolean, valid: boolean, kind: "missing"|"gguf"|"html"|"invalid",
 *            sizeBytes?: number, magic?: string, details: string}}
 *          Never throws; read failures are reported as kind "invalid".
 */
export function inspectGgufFile(filePath) {
    if (!existsSync(filePath)) {
        return { exists: false, valid: false, kind: "missing", details: "file does not exist" };
    }
    let sizeBytes = 0;
    try {
        sizeBytes = statSync(filePath).size;
        const sniff = Buffer.alloc(512);
        let bytesRead = 0;
        const fd = openSync(filePath, "r");
        try {
            bytesRead = readSync(fd, sniff, 0, sniff.length, 0);
        }
        finally {
            closeSync(fd);
        }
        // Restrict every check to the bytes that really came from the file.
        const content = sniff.subarray(0, bytesRead);
        const header = content.subarray(0, GGUF_MAGIC.length);
        if (header.equals(GGUF_MAGIC)) {
            return {
                exists: true,
                valid: true,
                kind: "gguf",
                sizeBytes,
                magic: "GGUF",
                details: `valid GGUF (${formatModelFileSize(sizeBytes)})`,
            };
        }
        // An empty file has no magic at all; say so rather than printing "0x".
        const magic = header.length > 0 ? printableMagic(header) : "empty";
        const text = content.toString("utf-8").toLowerCase();
        const isHtml = text.includes("<!doctype") || text.includes("<html");
        if (isHtml) {
            return {
                exists: true,
                valid: false,
                kind: "html",
                sizeBytes,
                magic,
                details: `HTML page, not a GGUF model (${formatModelFileSize(sizeBytes)}); likely proxy/firewall/captive portal response`,
            };
        }
        return {
            exists: true,
            valid: false,
            kind: "invalid",
            sizeBytes,
            magic,
            details: `not valid GGUF (expected magic "GGUF", got "${magic}", ${formatModelFileSize(sizeBytes)})`,
        };
    }
    catch (error) {
        return {
            exists: true,
            valid: false,
            kind: "invalid",
            sizeBytes,
            details: `cannot read model file: ${error instanceof Error ? error.message : String(error)}`,
        };
    }
}
|
|
217
|
+
/**
 * Validate that a file is actually a GGUF model, not an HTML error page
 * from a proxy, firewall, or failed download.
 *
 * Missing files and valid GGUF files return silently (a missing file is left
 * for downstream code to report). For anything else a descriptive Error is
 * thrown. A bad file is deleted only when the model was requested through a
 * remote URI (hf:, huggingface:, http:, https:), so that the next attempt
 * re-downloads it; a model given as a local path is never deleted, because
 * that file belongs to the user.
 *
 * @param {string} filePath - resolved on-disk location of the model.
 * @param {string} modelUri - the URI or path the model was requested with.
 * @throws {Error} when the file exists but is not valid GGUF.
 */
function validateGgufFile(filePath, modelUri) {
    const inspection = inspectGgufFile(filePath);
    if (!inspection.exists || inspection.valid)
        return; // let downstream handle missing files
    const isRemoteModel = /^(?:hf|huggingface|https?):/i.test(String(modelUri));
    let removed = false;
    if (isRemoteModel) {
        // Remove the bad download so the next attempt re-downloads
        try {
            unlinkSync(filePath);
            removed = true;
        }
        catch {
            // Best effort: the error text below reports that the file is still there.
        }
    }
    if (inspection.kind === "html") {
        throw new Error(`Downloaded model file is an HTML page, not a GGUF model (${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
            `Something is intercepting the download from huggingface.co (a proxy, firewall, or captive portal).\n\n` +
            `Model: ${modelUri}\n` +
            `Path: ${filePath}\n\n` +
            `To fix this, either:\n` +
            ` 1. Try a HuggingFace mirror: HF_ENDPOINT=https://hf-mirror.com qmd embed\n` +
            ` 2. Download the model manually and set the env var, e.g.:\n` +
            ` QMD_EMBED_MODEL=/path/to/model.gguf qmd embed\n\n` +
            `Note: 'qmd search' works without any model downloads.`);
    }
    // Only promise a re-download when the file really is gone.
    let followUp;
    if (removed) {
        followUp = `The file has been removed. Run the command again to re-download.`;
    }
    else if (isRemoteModel) {
        followUp = `The file could not be removed automatically. Delete it manually, then run the command again to re-download.`;
    }
    else {
        followUp = `The file was left in place because the model was given as a local path. Point the model setting at a valid GGUF file.`;
    }
    throw new Error(`Model file is not valid GGUF (expected magic "GGUF", got "${inspection.magic ?? "unknown"}", file is ${formatModelFileSize(inspection.sizeBytes ?? 0)}).\n` +
        `Model: ${modelUri}\n` +
        `Path: ${filePath}\n\n` +
        followUp);
}
|
|
90
247
|
export async function pullModels(models, options = {}) {
|
|
91
248
|
const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
|
|
92
249
|
if (!existsSync(cacheDir)) {
|
|
@@ -127,7 +284,9 @@ export async function pullModels(models, options = {}) {
|
|
|
127
284
|
refreshed = true;
|
|
128
285
|
}
|
|
129
286
|
}
|
|
287
|
+
const { resolveModelFile } = await loadNodeLlamaCpp();
|
|
130
288
|
const path = await resolveModelFile(model, cacheDir);
|
|
289
|
+
validateGgufFile(path, model);
|
|
131
290
|
const sizeBytes = existsSync(path) ? statSync(path).size : 0;
|
|
132
291
|
if (hfRef && filename) {
|
|
133
292
|
const remoteEtag = await getRemoteEtag(hfRef);
|
|
@@ -146,6 +305,58 @@ export async function pullModels(models, options = {}) {
|
|
|
146
305
|
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
|
|
147
306
|
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
|
|
148
307
|
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
|
|
308
|
+
/**
 * Parse the QMD_EMBED_PARALLELISM override.
 *
 * @param {string} [envValue] - raw value; defaults to the environment variable.
 * @returns {number|undefined} an integer from 1 to 8 (larger values are capped
 *          at 8), or undefined when the value is blank or invalid. An invalid
 *          value also produces a warning on stderr.
 */
export function resolveParallelismOverride(envValue = process.env.QMD_EMBED_PARALLELISM) {
    const trimmed = envValue?.trim() ?? "";
    if (!trimmed) {
        return undefined;
    }
    const requested = Number(trimmed);
    const isUsable = Number.isInteger(requested) && requested >= 1;
    if (isUsable) {
        return requested > 8 ? 8 : requested;
    }
    process.stderr.write(`QMD Warning: invalid QMD_EMBED_PARALLELISM="${envValue}", using automatic parallelism.\n`);
    return undefined;
}
|
|
319
|
+
/**
 * Decide how many contexts to run in parallel.
 *
 * Precedence: a valid explicit override wins; otherwise Windows + CUDA is
 * forced to 1; otherwise the caller's computed value is used (minimum 1).
 *
 * @param {{envValue?: string, platform?: string, gpu?: string|false, computed: number}} options
 * @returns {number}
 */
export function resolveSafeParallelism(options) {
    const override = resolveParallelismOverride(options.envValue);
    if (override !== undefined) {
        return override;
    }
    // node-llama-cpp/llama.cpp CUDA on Windows is unstable with multiple
    // simultaneous contexts (ggml-cuda.cu:98 in #519). Vulkan and CPU do not
    // show the same failure mode, so only serialize Windows CUDA by default.
    const platform = options.platform ?? process.platform;
    const isWindowsCuda = platform === "win32" && options.gpu === "cuda";
    return isWindowsCuda ? 1 : Math.max(1, options.computed);
}
|
|
331
|
+
/**
 * Work out which GPU backend to request from node-llama-cpp.
 *
 * QMD_FORCE_CPU wins: any non-blank value other than a "disabled" keyword
 * forces CPU. Otherwise QMD_LLAMA_GPU is read: blank means "auto", a
 * "disabled" keyword means CPU, and "metal" / "vulkan" / "cuda" select that
 * backend. Anything else warns on stderr and falls back to "auto".
 * Matching ignores case and surrounding whitespace.
 *
 * @param {string} [envValue] - defaults to process.env.QMD_LLAMA_GPU.
 * @param {string} [forceCpuValue] - defaults to process.env.QMD_FORCE_CPU.
 * @returns {"auto"|"metal"|"vulkan"|"cuda"|false} false means CPU only.
 */
export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU, forceCpuValue = process.env.QMD_FORCE_CPU) {
    const disabledKeywords = ["false", "off", "none", "disable", "disabled", "0"];
    const supportedBackends = ["metal", "vulkan", "cuda"];
    const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
    const cpuIsForced = Boolean(forceCpu) && !disabledKeywords.includes(forceCpu);
    if (cpuIsForced) {
        return false;
    }
    const requested = envValue?.trim().toLowerCase() ?? "";
    if (!requested) {
        return "auto";
    }
    if (disabledKeywords.includes(requested)) {
        return false;
    }
    if (supportedBackends.includes(requested)) {
        return requested;
    }
    process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
    return "auto";
}
|
|
346
|
+
/**
 * Run a dispose callback, but give up waiting after `timeoutMs` so shutdown
 * cannot hang on a stuck native resource. Never throws or rejects: a timeout
 * or a dispose failure is reported as a warning on stderr and shutdown
 * continues.
 *
 * @param {string} resourceName - label used in warning messages.
 * @param {Function} dispose - callback returning a value or promise.
 * @param {number} [timeoutMs=1000] - how long to wait before giving up.
 * @returns {Promise<void>}
 */
async function disposeWithTimeout(resourceName, dispose, timeoutMs = 1000) {
    // Unique marker so a dispose() that happens to resolve to the string
    // "timeout" is not mistaken for an actual timeout.
    const timedOut = Symbol("timeout");
    let timer;
    const timeoutPromise = new Promise((resolve) => {
        timer = setTimeout(() => resolve(timedOut), timeoutMs);
        // Do not keep the process alive just for this guard timer.
        timer.unref();
    });
    try {
        const result = await Promise.race([dispose(), timeoutPromise]);
        if (result === timedOut) {
            process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
        }
    }
    catch (error) {
        process.stderr.write(`QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`);
    }
    finally {
        // Release the guard timer as soon as the race is decided.
        clearTimeout(timer);
    }
}
|
|
149
360
|
function resolveExpandContextSize(configValue) {
|
|
150
361
|
if (configValue !== undefined) {
|
|
151
362
|
if (!Number.isInteger(configValue) || configValue <= 0) {
|
|
@@ -163,6 +374,12 @@ function resolveExpandContextSize(configValue) {
|
|
|
163
374
|
}
|
|
164
375
|
return parsed;
|
|
165
376
|
}
|
|
377
|
+
const failedGpuInitModes = new Set();
|
|
378
|
+
let noGpuAccelerationWarningShown = false;
|
|
379
|
+
let cpuForcedPrebuiltFallbackWarningShown = false;
|
|
380
|
+
/**
 * True when the current environment settings resolve to CPU-only mode.
 *
 * @returns {boolean}
 */
function isCpuModeRequested() {
    const gpuMode = resolveLlamaGpuMode();
    return gpuMode === false;
}
|
|
166
383
|
export class LlamaCpp {
|
|
167
384
|
_ciMode = !!process.env.CI;
|
|
168
385
|
llama = null;
|
|
@@ -187,14 +404,23 @@ export class LlamaCpp {
|
|
|
187
404
|
// Track disposal state to prevent double-dispose
|
|
188
405
|
disposed = false;
|
|
189
406
|
constructor(config = {}) {
|
|
190
|
-
this.embedModelUri = config.embedModel
|
|
191
|
-
this.generateModelUri = config.generateModel
|
|
192
|
-
this.rerankModelUri = config.rerankModel
|
|
407
|
+
this.embedModelUri = resolveEmbedModel({ embed: config.embedModel });
|
|
408
|
+
this.generateModelUri = resolveGenerateModel({ generate: config.generateModel });
|
|
409
|
+
this.rerankModelUri = resolveRerankModel({ rerank: config.rerankModel });
|
|
193
410
|
this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
|
|
194
411
|
this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
|
|
195
412
|
this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
|
|
196
413
|
this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
|
|
197
414
|
}
|
|
415
|
+
get embedModelName() {
|
|
416
|
+
return this.embedModelUri;
|
|
417
|
+
}
|
|
418
|
+
get generateModelName() {
|
|
419
|
+
return this.generateModelUri;
|
|
420
|
+
}
|
|
421
|
+
get rerankModelName() {
|
|
422
|
+
return this.rerankModelUri;
|
|
423
|
+
}
|
|
198
424
|
/**
|
|
199
425
|
* Reset the inactivity timer. Called after each model operation.
|
|
200
426
|
* When timer fires, models are unloaded to free memory (if no active sessions).
|
|
@@ -287,27 +513,113 @@ export class LlamaCpp {
|
|
|
287
513
|
/**
|
|
288
514
|
* Initialize the llama instance (lazy)
|
|
289
515
|
*/
|
|
290
|
-
async ensureLlama() {
|
|
516
|
+
async ensureLlama(allowBuild = true) {
|
|
291
517
|
if (!this.llama) {
|
|
292
|
-
const
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
518
|
+
const gpuMode = resolveLlamaGpuMode();
|
|
519
|
+
const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
|
|
520
|
+
const loadLlama = async (gpu, sourceBuildAllowed = allowBuild, buildOverride) => await withNativeStdoutRedirectedToStderr(() => getLlama({
|
|
521
|
+
// Prefer packaged prebuilt bindings before compiling llama.cpp locally.
|
|
522
|
+
// node-llama-cpp documents gpu:"auto" as the best default: Metal on
|
|
523
|
+
// Apple Silicon, CUDA when fully available, Vulkan where available,
|
|
524
|
+
// then CPU. Use build:"auto" for normal loads and build:"never" for
|
|
525
|
+
// diagnostic/probe paths that must not compile llama.cpp.
|
|
526
|
+
build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
|
|
527
|
+
logLevel: LlamaLogLevel.error,
|
|
528
|
+
gpu,
|
|
529
|
+
progressLogs: false,
|
|
530
|
+
skipDownload: !sourceBuildAllowed,
|
|
531
|
+
}));
|
|
532
|
+
const loadCpuCompatibleLlama = async () => {
|
|
533
|
+
try {
|
|
534
|
+
return await loadLlama(false, false);
|
|
535
|
+
}
|
|
536
|
+
catch (err) {
|
|
537
|
+
// Some platforms, notably Apple Silicon, ship a Metal prebuilt but no
|
|
538
|
+
// CPU-only prebuilt. Do a fast no-build lookup for an actual CPU
|
|
539
|
+
// binding first; if it does not exist, use the packaged auto/Metal
|
|
540
|
+
// binding and disable model offloading via gpuLayers: 0.
|
|
541
|
+
if (!cpuForcedPrebuiltFallbackWarningShown) {
|
|
542
|
+
cpuForcedPrebuiltFallbackWarningShown = true;
|
|
543
|
+
process.stderr.write(`QMD Warning: CPU-only llama.cpp prebuilt not available (${err instanceof Error ? err.message : String(err)}); using packaged backend with GPU offloading disabled.\n`);
|
|
544
|
+
}
|
|
545
|
+
return await loadLlama("auto", false);
|
|
546
|
+
}
|
|
547
|
+
};
|
|
548
|
+
let llama;
|
|
549
|
+
if (gpuMode === false) {
|
|
550
|
+
llama = await loadCpuCompatibleLlama();
|
|
551
|
+
}
|
|
552
|
+
else if (failedGpuInitModes.has(gpuMode)) {
|
|
553
|
+
process.stderr.write(`QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`);
|
|
554
|
+
llama = await loadCpuCompatibleLlama();
|
|
555
|
+
}
|
|
556
|
+
else {
|
|
557
|
+
try {
|
|
558
|
+
llama = await loadLlama(gpuMode);
|
|
559
|
+
// If node-llama-cpp auto-detection chose CPU, do one no-build pass
|
|
560
|
+
// over all OS-valid packaged GPU backends. This preserves the
|
|
561
|
+
// documented auto mode for Metal/CUDA/Vulkan while recovering on
|
|
562
|
+
// systems where a packaged backend can load but detection is too
|
|
563
|
+
// conservative. Never compile during these extra probes.
|
|
564
|
+
if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
|
|
565
|
+
const candidates = (await getLlamaGpuTypes("allValid"))
|
|
566
|
+
.filter((candidate) => candidate !== false && candidate !== "auto");
|
|
567
|
+
for (const candidate of candidates) {
|
|
568
|
+
if (failedGpuInitModes.has(candidate))
|
|
569
|
+
continue;
|
|
570
|
+
try {
|
|
571
|
+
const gpuLlama = await loadLlama(candidate, false, "never");
|
|
572
|
+
if (gpuLlama.gpu !== false) {
|
|
573
|
+
await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
|
|
574
|
+
llama = gpuLlama;
|
|
575
|
+
break;
|
|
576
|
+
}
|
|
577
|
+
await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
|
|
578
|
+
}
|
|
579
|
+
catch {
|
|
580
|
+
failedGpuInitModes.add(candidate);
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
catch (err) {
|
|
586
|
+
// GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
|
|
587
|
+
// Fall back to CPU so qmd still works, and cache the failure to avoid repeated
|
|
588
|
+
// expensive native build/probe attempts in this process.
|
|
589
|
+
failedGpuInitModes.add(gpuMode);
|
|
590
|
+
process.stderr.write(`QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
|
|
591
|
+
llama = await loadCpuCompatibleLlama();
|
|
592
|
+
}
|
|
593
|
+
}
|
|
594
|
+
if (llama.gpu === false && !noGpuAccelerationWarningShown) {
|
|
595
|
+
noGpuAccelerationWarningShown = true;
|
|
596
|
+
process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n");
|
|
299
597
|
}
|
|
300
598
|
this.llama = llama;
|
|
301
599
|
}
|
|
302
600
|
return this.llama;
|
|
303
601
|
}
|
|
602
|
+
isCpuOffloadForced() {
|
|
603
|
+
return isCpuModeRequested();
|
|
604
|
+
}
|
|
605
|
+
modelLoadOptions(modelPath) {
|
|
606
|
+
return {
|
|
607
|
+
modelPath,
|
|
608
|
+
...(this.isCpuOffloadForced() ? { gpuLayers: 0 } : {}),
|
|
609
|
+
};
|
|
610
|
+
}
|
|
304
611
|
/**
|
|
305
|
-
* Resolve a model URI to a local path, downloading if needed
|
|
612
|
+
* Resolve a model URI to a local path, downloading if needed.
|
|
613
|
+
* Validates the downloaded file is actually a GGUF model (not an HTML error page
|
|
614
|
+
* from a proxy or firewall).
|
|
306
615
|
*/
|
|
307
616
|
async resolveModel(modelUri) {
|
|
308
617
|
this.ensureModelCacheDir();
|
|
309
618
|
// resolveModelFile handles HF URIs and downloads to the cache dir
|
|
310
|
-
|
|
619
|
+
const { resolveModelFile } = await loadNodeLlamaCpp();
|
|
620
|
+
const modelPath = await resolveModelFile(modelUri, this.modelCacheDir);
|
|
621
|
+
validateGgufFile(modelPath, modelUri);
|
|
622
|
+
return modelPath;
|
|
311
623
|
}
|
|
312
624
|
/**
|
|
313
625
|
* Load embedding model (lazy)
|
|
@@ -322,7 +634,7 @@ export class LlamaCpp {
|
|
|
322
634
|
this.embedModelLoadPromise = (async () => {
|
|
323
635
|
const llama = await this.ensureLlama();
|
|
324
636
|
const modelPath = await this.resolveModel(this.embedModelUri);
|
|
325
|
-
const model = await llama.loadModel(
|
|
637
|
+
const model = await llama.loadModel(this.modelLoadOptions(modelPath));
|
|
326
638
|
this.embedModel = model;
|
|
327
639
|
// Model loading counts as activity - ping to keep alive
|
|
328
640
|
this.touchActivity();
|
|
@@ -346,21 +658,23 @@ export class LlamaCpp {
|
|
|
346
658
|
*/
|
|
347
659
|
async computeParallelism(perContextMB) {
|
|
348
660
|
const llama = await this.ensureLlama();
|
|
349
|
-
if (llama.gpu) {
|
|
661
|
+
if (!this.isCpuOffloadForced() && llama.gpu) {
|
|
350
662
|
try {
|
|
351
663
|
const vram = await llama.getVramState();
|
|
352
664
|
const freeMB = vram.free / (1024 * 1024);
|
|
353
665
|
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
|
|
354
|
-
|
|
666
|
+
const computed = Math.max(1, Math.min(8, maxByVram));
|
|
667
|
+
return resolveSafeParallelism({ gpu: llama.gpu, computed });
|
|
355
668
|
}
|
|
356
669
|
catch {
|
|
357
|
-
return 2;
|
|
670
|
+
return resolveSafeParallelism({ gpu: llama.gpu, computed: 2 });
|
|
358
671
|
}
|
|
359
672
|
}
|
|
360
673
|
// CPU: split cores across contexts. At least 4 threads per context.
|
|
361
674
|
const cores = llama.cpuMathCores || 4;
|
|
362
675
|
const maxContexts = Math.floor(cores / 4);
|
|
363
|
-
|
|
676
|
+
const computed = Math.max(1, Math.min(4, maxContexts));
|
|
677
|
+
return resolveSafeParallelism({ gpu: false, computed });
|
|
364
678
|
}
|
|
365
679
|
/**
|
|
366
680
|
* Get the number of threads each context should use, given N parallel contexts.
|
|
@@ -368,7 +682,7 @@ export class LlamaCpp {
|
|
|
368
682
|
*/
|
|
369
683
|
async threadsPerContext(parallelism) {
|
|
370
684
|
const llama = await this.ensureLlama();
|
|
371
|
-
if (llama.gpu)
|
|
685
|
+
if (!this.isCpuOffloadForced() && llama.gpu)
|
|
372
686
|
return 0; // GPU: let the library decide
|
|
373
687
|
const cores = llama.cpuMathCores || 4;
|
|
374
688
|
return Math.max(1, Math.floor(cores / parallelism));
|
|
@@ -394,6 +708,7 @@ export class LlamaCpp {
|
|
|
394
708
|
for (let i = 0; i < n; i++) {
|
|
395
709
|
try {
|
|
396
710
|
this.embedContexts.push(await model.createEmbeddingContext({
|
|
711
|
+
contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
|
|
397
712
|
...(threads > 0 ? { threads } : {}),
|
|
398
713
|
}));
|
|
399
714
|
}
|
|
@@ -431,7 +746,7 @@ export class LlamaCpp {
|
|
|
431
746
|
this.generateModelLoadPromise = (async () => {
|
|
432
747
|
const llama = await this.ensureLlama();
|
|
433
748
|
const modelPath = await this.resolveModel(this.generateModelUri);
|
|
434
|
-
const model = await llama.loadModel(
|
|
749
|
+
const model = await llama.loadModel(this.modelLoadOptions(modelPath));
|
|
435
750
|
this.generateModel = model;
|
|
436
751
|
return model;
|
|
437
752
|
})();
|
|
@@ -461,7 +776,7 @@ export class LlamaCpp {
|
|
|
461
776
|
this.rerankModelLoadPromise = (async () => {
|
|
462
777
|
const llama = await this.ensureLlama();
|
|
463
778
|
const modelPath = await this.resolveModel(this.rerankModelUri);
|
|
464
|
-
const model = await llama.loadModel(
|
|
779
|
+
const model = await llama.loadModel(this.modelLoadOptions(modelPath));
|
|
465
780
|
this.rerankModel = model;
|
|
466
781
|
// Model loading counts as activity - ping to keep alive
|
|
467
782
|
this.touchActivity();
|
|
@@ -484,9 +799,20 @@ export class LlamaCpp {
|
|
|
484
799
|
* - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
|
|
485
800
|
*/
|
|
486
801
|
// Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
|
|
487
|
-
//
|
|
488
|
-
//
|
|
489
|
-
|
|
802
|
+
// Default 2048 was too small for longer documents (e.g. session transcripts,
|
|
803
|
+
// CJK text, or large markdown files) — callers hit "input lengths exceed
|
|
804
|
+
// context size" errors even after truncation because the overhead estimate
|
|
805
|
+
// was insufficient. 4096 comfortably fits the largest real-world chunks
|
|
806
|
+
// while staying well below the 40 960-token auto size.
|
|
807
|
+
// Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
|
|
808
|
+
static RERANK_CONTEXT_SIZE = (() => {
|
|
809
|
+
const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
|
|
810
|
+
return Number.isFinite(v) && v > 0 ? v : 4096;
|
|
811
|
+
})();
|
|
812
|
+
static EMBED_CONTEXT_SIZE = (() => {
|
|
813
|
+
const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
|
|
814
|
+
return Number.isFinite(v) && v > 0 ? v : 2048;
|
|
815
|
+
})();
|
|
490
816
|
async ensureRerankContexts() {
|
|
491
817
|
if (this.rerankContexts.length === 0) {
|
|
492
818
|
const model = await this.ensureRerankModel();
|
|
@@ -497,7 +823,6 @@ export class LlamaCpp {
|
|
|
497
823
|
try {
|
|
498
824
|
this.rerankContexts.push(await model.createRankingContext({
|
|
499
825
|
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
|
|
500
|
-
flashAttention: true,
|
|
501
826
|
...(threads > 0 ? { threads } : {}),
|
|
502
827
|
}));
|
|
503
828
|
}
|
|
@@ -555,15 +880,48 @@ export class LlamaCpp {
|
|
|
555
880
|
// ==========================================================================
|
|
556
881
|
// Core API methods
|
|
557
882
|
// ==========================================================================
|
|
883
|
+
/**
|
|
884
|
+
* Truncate text to fit within the embedding model's context window.
|
|
885
|
+
* Uses the model's own tokenizer for accurate token counting, then
|
|
886
|
+
* detokenizes back to text if truncation is needed.
|
|
887
|
+
* Returns the (possibly truncated) text and whether truncation occurred.
|
|
888
|
+
*/
|
|
889
|
+
resolveEmbedTokenLimit() {
|
|
890
|
+
const trainedContextSize = this.embedModel?.trainContextSize;
|
|
891
|
+
if (typeof trainedContextSize === "number" && Number.isFinite(trainedContextSize) && trainedContextSize > 0) {
|
|
892
|
+
return Math.max(1, Math.min(LlamaCpp.EMBED_CONTEXT_SIZE, trainedContextSize));
|
|
893
|
+
}
|
|
894
|
+
return LlamaCpp.EMBED_CONTEXT_SIZE;
|
|
895
|
+
}
|
|
896
|
+
async truncateToContextSize(text) {
|
|
897
|
+
if (!this.embedModel)
|
|
898
|
+
return { text, truncated: false, limit: LlamaCpp.EMBED_CONTEXT_SIZE };
|
|
899
|
+
const maxTokens = this.resolveEmbedTokenLimit();
|
|
900
|
+
if (maxTokens <= 0)
|
|
901
|
+
return { text, truncated: false, limit: maxTokens };
|
|
902
|
+
const tokens = this.embedModel.tokenize(text);
|
|
903
|
+
if (tokens.length <= maxTokens)
|
|
904
|
+
return { text, truncated: false, limit: maxTokens };
|
|
905
|
+
// Leave a small margin (4 tokens) for BOS/EOS overhead
|
|
906
|
+
const safeLimit = Math.max(1, maxTokens - 4);
|
|
907
|
+
const truncatedTokens = tokens.slice(0, safeLimit);
|
|
908
|
+
const truncatedText = this.embedModel.detokenize(truncatedTokens);
|
|
909
|
+
return { text: truncatedText, truncated: true, limit: maxTokens };
|
|
910
|
+
}
|
|
558
911
|
async embed(text, options = {}) {
|
|
559
912
|
// Ping activity at start to keep models alive during this operation
|
|
560
913
|
this.touchActivity();
|
|
561
914
|
try {
|
|
562
915
|
const context = await this.ensureEmbedContext();
|
|
563
|
-
|
|
916
|
+
// Guard: truncate text that exceeds model context window to prevent GGML crash
|
|
917
|
+
const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
|
|
918
|
+
if (truncated) {
|
|
919
|
+
console.warn(`⚠ Text truncated to fit embedding context (${limit} tokens)`);
|
|
920
|
+
}
|
|
921
|
+
const embedding = await context.getEmbeddingFor(safeText);
|
|
564
922
|
return {
|
|
565
923
|
embedding: Array.from(embedding.vector),
|
|
566
|
-
model: this.embedModelUri,
|
|
924
|
+
model: options.model ?? this.embedModelUri,
|
|
567
925
|
};
|
|
568
926
|
}
|
|
569
927
|
catch (error) {
|
|
@@ -575,7 +933,7 @@ export class LlamaCpp {
|
|
|
575
933
|
* Batch embed multiple texts efficiently
|
|
576
934
|
* Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
|
|
577
935
|
*/
|
|
578
|
-
async embedBatch(texts) {
|
|
936
|
+
async embedBatch(texts, options = {}) {
|
|
579
937
|
if (this._ciMode)
|
|
580
938
|
throw new Error("LLM operations are disabled in CI (set CI=true)");
|
|
581
939
|
// Ping activity at start to keep models alive during this operation
|
|
@@ -591,9 +949,13 @@ export class LlamaCpp {
|
|
|
591
949
|
const embeddings = [];
|
|
592
950
|
for (const text of texts) {
|
|
593
951
|
try {
|
|
594
|
-
const
|
|
952
|
+
const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
|
|
953
|
+
if (truncated) {
|
|
954
|
+
console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
|
|
955
|
+
}
|
|
956
|
+
const embedding = await context.getEmbeddingFor(safeText);
|
|
595
957
|
this.touchActivity();
|
|
596
|
-
embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
|
|
958
|
+
embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
|
|
597
959
|
}
|
|
598
960
|
catch (err) {
|
|
599
961
|
console.error("Embedding error for text:", err);
|
|
@@ -610,9 +972,13 @@ export class LlamaCpp {
|
|
|
610
972
|
const results = [];
|
|
611
973
|
for (const text of chunk) {
|
|
612
974
|
try {
|
|
613
|
-
const
|
|
975
|
+
const { text: safeText, truncated, limit } = await this.truncateToContextSize(text);
|
|
976
|
+
if (truncated) {
|
|
977
|
+
console.warn(`⚠ Batch text truncated to fit embedding context (${limit} tokens)`);
|
|
978
|
+
}
|
|
979
|
+
const embedding = await ctx.getEmbeddingFor(safeText);
|
|
614
980
|
this.touchActivity();
|
|
615
|
-
results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
|
|
981
|
+
results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
|
|
616
982
|
}
|
|
617
983
|
catch (err) {
|
|
618
984
|
console.error("Embedding error for text:", err);
|
|
@@ -638,6 +1004,7 @@ export class LlamaCpp {
|
|
|
638
1004
|
// Create fresh context -> sequence -> session for each call
|
|
639
1005
|
const context = await this.generateModel.createContext();
|
|
640
1006
|
const sequence = context.getSequence();
|
|
1007
|
+
const { LlamaChatSession } = await loadNodeLlamaCpp();
|
|
641
1008
|
const session = new LlamaChatSession({ contextSequence: sequence });
|
|
642
1009
|
const maxTokens = options.maxTokens ?? 150;
|
|
643
1010
|
// Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
|
|
@@ -707,6 +1074,7 @@ export class LlamaCpp {
|
|
|
707
1074
|
contextSize: this.expandContextSize,
|
|
708
1075
|
});
|
|
709
1076
|
const sequence = genContext.getSequence();
|
|
1077
|
+
const { LlamaChatSession } = await loadNodeLlamaCpp();
|
|
710
1078
|
const session = new LlamaChatSession({ contextSequence: sequence });
|
|
711
1079
|
try {
|
|
712
1080
|
// Qwen3 recommended settings for non-thinking mode:
|
|
@@ -767,8 +1135,10 @@ export class LlamaCpp {
|
|
|
767
1135
|
await genContext.dispose();
|
|
768
1136
|
}
|
|
769
1137
|
}
|
|
770
|
-
// Qwen3 reranker chat template overhead (system prompt, tags, separators)
|
|
771
|
-
|
|
1138
|
+
// Qwen3 reranker chat template overhead (system prompt, tags, separators).
|
|
1139
|
+
// Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
|
|
1140
|
+
// the truncation budget never lets a document slip past the context limit.
|
|
1141
|
+
static RERANK_TEMPLATE_OVERHEAD = 512;
|
|
772
1142
|
static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
|
|
773
1143
|
async rerank(query, documents, options = {}) {
|
|
774
1144
|
if (this._ciMode)
|
|
@@ -845,11 +1215,12 @@ export class LlamaCpp {
|
|
|
845
1215
|
* Get device/GPU info for status display.
|
|
846
1216
|
* Initializes llama if not already done.
|
|
847
1217
|
*/
|
|
848
|
-
async getDeviceInfo() {
|
|
849
|
-
const llama = await this.ensureLlama();
|
|
850
|
-
const
|
|
1218
|
+
async getDeviceInfo(options = {}) {
|
|
1219
|
+
const llama = await this.ensureLlama(options.allowBuild ?? true);
|
|
1220
|
+
const cpuForced = this.isCpuOffloadForced();
|
|
1221
|
+
const gpuDevices = cpuForced ? [] : await llama.getGpuDeviceNames();
|
|
851
1222
|
let vram;
|
|
852
|
-
if (llama.gpu) {
|
|
1223
|
+
if (!cpuForced && llama.gpu) {
|
|
853
1224
|
try {
|
|
854
1225
|
const state = await llama.getVramState();
|
|
855
1226
|
vram = { total: state.total, used: state.used, free: state.free };
|
|
@@ -857,8 +1228,8 @@ export class LlamaCpp {
|
|
|
857
1228
|
catch { /* no vram info */ }
|
|
858
1229
|
}
|
|
859
1230
|
return {
|
|
860
|
-
gpu: llama.gpu,
|
|
861
|
-
gpuOffloading: llama.supportsGpuOffloading,
|
|
1231
|
+
gpu: cpuForced ? false : llama.gpu,
|
|
1232
|
+
gpuOffloading: !cpuForced && llama.supportsGpuOffloading,
|
|
862
1233
|
gpuDevices,
|
|
863
1234
|
vram,
|
|
864
1235
|
cpuCores: llama.cpuMathCores,
|
|
@@ -875,21 +1246,34 @@ export class LlamaCpp {
|
|
|
875
1246
|
clearTimeout(this.inactivityTimer);
|
|
876
1247
|
this.inactivityTimer = null;
|
|
877
1248
|
}
|
|
878
|
-
//
|
|
879
|
-
//
|
|
880
|
-
//
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
await Promise.race([disposePromise, timeoutPromise]);
|
|
1249
|
+
// Explicitly dispose in dependency order: contexts first, then models, then llama.
|
|
1250
|
+
// Relying only on llama.dispose() leaves Metal resource sets alive until process
|
|
1251
|
+
// finalization on Apple Silicon, where ggml_metal_device_free can abort after
|
|
1252
|
+
// otherwise-successful CLI output (#368).
|
|
1253
|
+
for (const ctx of this.embedContexts) {
|
|
1254
|
+
await disposeWithTimeout("embedding context", () => ctx.dispose());
|
|
885
1255
|
}
|
|
886
|
-
// Clear references
|
|
887
1256
|
this.embedContexts = [];
|
|
1257
|
+
for (const ctx of this.rerankContexts) {
|
|
1258
|
+
await disposeWithTimeout("rerank context", () => ctx.dispose());
|
|
1259
|
+
}
|
|
888
1260
|
this.rerankContexts = [];
|
|
889
|
-
this.embedModel
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
1261
|
+
if (this.embedModel) {
|
|
1262
|
+
await disposeWithTimeout("embedding model", () => this.embedModel.dispose());
|
|
1263
|
+
this.embedModel = null;
|
|
1264
|
+
}
|
|
1265
|
+
if (this.generateModel) {
|
|
1266
|
+
await disposeWithTimeout("generation model", () => this.generateModel.dispose());
|
|
1267
|
+
this.generateModel = null;
|
|
1268
|
+
}
|
|
1269
|
+
if (this.rerankModel) {
|
|
1270
|
+
await disposeWithTimeout("rerank model", () => this.rerankModel.dispose());
|
|
1271
|
+
this.rerankModel = null;
|
|
1272
|
+
}
|
|
1273
|
+
if (this.llama) {
|
|
1274
|
+
await disposeWithTimeout("llama runtime", () => this.llama.dispose());
|
|
1275
|
+
this.llama = null;
|
|
1276
|
+
}
|
|
893
1277
|
// Clear any in-flight load/create promises
|
|
894
1278
|
this.embedModelLoadPromise = null;
|
|
895
1279
|
this.embedContextsCreatePromise = null;
|
|
@@ -1028,8 +1412,8 @@ class LLMSession {
|
|
|
1028
1412
|
async embed(text, options) {
|
|
1029
1413
|
return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
|
|
1030
1414
|
}
|
|
1031
|
-
async embedBatch(texts) {
|
|
1032
|
-
return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
|
|
1415
|
+
async embedBatch(texts, options) {
|
|
1416
|
+
return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
|
|
1033
1417
|
}
|
|
1034
1418
|
async expandQuery(query, options) {
|
|
1035
1419
|
return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
|
|
@@ -1106,8 +1490,7 @@ let defaultLlamaCpp = null;
|
|
|
1106
1490
|
*/
|
|
1107
1491
|
export function getDefaultLlamaCpp() {
    // Reuse the shared instance once it exists.
    if (defaultLlamaCpp)
        return defaultLlamaCpp;
    // First call: build the shared instance with default construction.
    defaultLlamaCpp = new LlamaCpp();
    return defaultLlamaCpp;
}
|