@ambicuity/kindx 0.1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +51 -0
- package/README.md +409 -129
- package/bin/kindx +38 -0
- package/capabilities/kindx/SKILL.md +127 -0
- package/capabilities/kindx/references/mcp-setup.md +102 -0
- package/dist/catalogs.js +57 -16
- package/dist/inference.d.ts +82 -7
- package/dist/inference.js +241 -49
- package/dist/kindx.js +425 -91
- package/dist/migrate.d.ts +2 -0
- package/dist/migrate.js +133 -0
- package/dist/protocol.d.ts +2 -1
- package/dist/protocol.js +110 -6
- package/dist/remote-llm.d.ts +23 -0
- package/dist/remote-llm.js +307 -0
- package/dist/repository.d.ts +18 -1
- package/dist/repository.js +260 -35
- package/dist/watcher.d.ts +29 -0
- package/dist/watcher.js +243 -0
- package/package.json +26 -11
package/dist/inference.js
CHANGED
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
* Provides embeddings, text generation, and reranking using local GGUF models.
|
|
5
5
|
*/
|
|
6
6
|
import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
|
|
7
|
-
import {
|
|
7
|
+
import { RemoteLLM } from "./remote-llm.js";
|
|
8
|
+
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
9
|
+
import * as os from "node:os";
|
|
8
10
|
import { join } from "path";
|
|
9
|
-
import {
|
|
11
|
+
import { homedir } from "node:os";
|
|
10
12
|
// =============================================================================
|
|
11
13
|
// Embedding Formatting Functions
|
|
12
14
|
// =============================================================================
|
|
@@ -49,8 +51,8 @@ export function formatDocForEmbedding(text, title, modelUri) {
|
|
|
49
51
|
// Format: hf:<user>/<repo>/<file>
|
|
50
52
|
// Override via KINDX_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf)
|
|
51
53
|
const DEFAULT_EMBED_MODEL = process.env.KINDX_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
|
|
52
|
-
const DEFAULT_RERANK_MODEL = process.env.KINDX_RERANK_MODEL ?? "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-
|
|
53
|
-
const DEFAULT_GENERATE_MODEL = process.env.KINDX_GENERATE_MODEL ?? "hf:
|
|
54
|
+
const DEFAULT_RERANK_MODEL = process.env.KINDX_RERANK_MODEL ?? "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-Q8_0.gguf";
|
|
55
|
+
const DEFAULT_GENERATE_MODEL = process.env.KINDX_GENERATE_MODEL ?? "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
|
|
54
56
|
// Alternative generation models for query expansion:
|
|
55
57
|
// LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
|
|
56
58
|
// Use these as base for fine-tuning with configs/sft_lfm2.yaml
|
|
@@ -146,6 +148,40 @@ export async function pullModels(models, options = {}) {
|
|
|
146
148
|
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
|
|
147
149
|
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
|
|
148
150
|
const DEFAULT_RERANK_CONTEXT_SIZE = 4096;
|
|
151
|
+
const DEFAULT_LOW_VRAM_THRESHOLD_MB = 6144;
|
|
152
|
+
const DEFAULT_LOW_VRAM_EMBED_PARALLELISM = 2;
|
|
153
|
+
const DEFAULT_LOW_VRAM_RERANK_PARALLELISM = 1;
|
|
154
|
+
const DEFAULT_LOW_VRAM_EXPAND_CONTEXT_SIZE = 1024;
|
|
155
|
+
const DEFAULT_LOW_VRAM_RERANK_CONTEXT_SIZE = 1024;
|
|
156
|
+
/**
 * Parse a boolean-like setting, typically an environment variable.
 *
 * Matching ignores case and surrounding whitespace:
 * "1" / "true" / "yes" / "on" yield true, "0" / "false" / "no" / "off" yield false.
 *
 * @param {string | undefined} raw - Raw value to parse (e.g. process.env[envName]).
 * @param {string} envName - Variable name; used only in the warning text.
 * @returns {boolean | undefined} The parsed flag, or undefined when the value is
 *   unset, blank, or unrecognised. Only the unrecognised case writes a warning
 *   to stderr.
 */
function parseOptionalBoolean(raw, envName) {
    if (raw === undefined)
        return undefined;
    const value = raw.trim().toLowerCase();
    // A blank value (e.g. `KINDX_LOW_VRAM=`) means "not configured". Treat it the
    // same as an unset variable rather than warning about an invalid value, which
    // matches how the integer env settings in this file handle blank input.
    if (value === "")
        return undefined;
    if (value === "1" || value === "true" || value === "yes" || value === "on")
        return true;
    if (value === "0" || value === "false" || value === "no" || value === "off")
        return false;
    process.stderr.write(`KINDX Warning: invalid ${envName}="${raw}", ignoring.\n`);
    return undefined;
}
|
|
167
|
+
/**
 * Parse a positive integer setting, falling back (with a warning) on bad input.
 *
 * @param {string | undefined} raw - Raw value to parse (e.g. process.env[envName]).
 * @param {string} envName - Variable name; used only in the warning text.
 * @param {number} fallback - Value returned when raw is unset, blank, or invalid.
 * @returns {number} The parsed integer (> 0), or fallback. An unset or blank
 *   value falls back silently; any other invalid value also writes a warning
 *   to stderr.
 */
function parsePositiveIntOrWarn(raw, envName, fallback) {
    if (raw === undefined)
        return fallback;
    const text = raw.trim();
    if (text === "")
        return fallback;
    // Require the entire value to be decimal digits (optionally "+"-prefixed).
    // Number.parseInt on its own stops at the first non-digit, so it would
    // silently accept "12abc" as 12, "1.5" as 1 and "1e3" as 1.
    if (/^\+?\d+$/.test(text)) {
        const parsed = Number.parseInt(text, 10);
        if (Number.isInteger(parsed) && parsed > 0)
            return parsed;
    }
    process.stderr.write(`KINDX Warning: invalid ${envName}="${raw}", using ${fallback}.\n`);
    return fallback;
}
|
|
176
|
+
/**
 * Resolve a positive-integer setting from explicit config, then the environment.
 *
 * @param {number | undefined} configValue - Value supplied programmatically.
 * @param {string} envName - Environment variable consulted when configValue is undefined.
 * @param {number} fallback - Passed through to the environment parser as its default.
 * @returns {number} configValue when it is a positive integer, otherwise whatever
 *   parsePositiveIntOrWarn returns for process.env[envName].
 * @throws {Error} When configValue is defined but is not a positive integer.
 */
function resolvePositiveIntWithEnv(configValue, envName, fallback) {
    // No explicit config: defer entirely to the (lenient, warning-only) env parser.
    if (configValue === undefined) {
        return parsePositiveIntOrWarn(process.env[envName], envName, fallback);
    }
    // Explicit config is validated strictly: a bad value is a programming error.
    const isPositiveInteger = Number.isInteger(configValue) && configValue > 0;
    if (isPositiveInteger) {
        return configValue;
    }
    throw new Error(`Invalid ${envName}: ${configValue}. Must be a positive integer.`);
}
|
|
149
185
|
function resolveExpandContextSize(configValue) {
|
|
150
186
|
if (configValue !== undefined) {
|
|
151
187
|
if (!Number.isInteger(configValue) || configValue <= 0) {
|
|
@@ -193,6 +229,15 @@ export class LlamaCpp {
|
|
|
193
229
|
modelCacheDir;
|
|
194
230
|
rerankContextSize;
|
|
195
231
|
expandContextSize;
|
|
232
|
+
lowVramOverride;
|
|
233
|
+
vramBudgetMB;
|
|
234
|
+
lowVramThresholdMB;
|
|
235
|
+
lowVramEmbedParallelism;
|
|
236
|
+
lowVramRerankParallelism;
|
|
237
|
+
lowVramExpandContextSize;
|
|
238
|
+
lowVramRerankContextSize;
|
|
239
|
+
memoryPolicyPromise = null;
|
|
240
|
+
lowVramWarningShown = false;
|
|
196
241
|
// Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
|
|
197
242
|
embedModelLoadPromise = null;
|
|
198
243
|
generateModelLoadPromise = null;
|
|
@@ -210,6 +255,13 @@ export class LlamaCpp {
|
|
|
210
255
|
this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
|
|
211
256
|
this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
|
|
212
257
|
this.rerankContextSize = resolveRerankContextSize(config.rerankContextSize);
|
|
258
|
+
this.lowVramOverride = config.lowVram ?? parseOptionalBoolean(process.env.KINDX_LOW_VRAM, "KINDX_LOW_VRAM");
|
|
259
|
+
this.vramBudgetMB = resolvePositiveIntWithEnv(config.vramBudgetMB, "KINDX_VRAM_BUDGET_MB", 0) || null;
|
|
260
|
+
this.lowVramThresholdMB = resolvePositiveIntWithEnv(config.lowVramThresholdMB, "KINDX_LOW_VRAM_THRESHOLD_MB", DEFAULT_LOW_VRAM_THRESHOLD_MB);
|
|
261
|
+
this.lowVramEmbedParallelism = resolvePositiveIntWithEnv(config.lowVramEmbedParallelism, "KINDX_LOW_VRAM_EMBED_PARALLELISM", DEFAULT_LOW_VRAM_EMBED_PARALLELISM);
|
|
262
|
+
this.lowVramRerankParallelism = resolvePositiveIntWithEnv(config.lowVramRerankParallelism, "KINDX_LOW_VRAM_RERANK_PARALLELISM", DEFAULT_LOW_VRAM_RERANK_PARALLELISM);
|
|
263
|
+
this.lowVramExpandContextSize = resolvePositiveIntWithEnv(config.lowVramExpandContextSize, "KINDX_LOW_VRAM_EXPAND_CONTEXT_SIZE", DEFAULT_LOW_VRAM_EXPAND_CONTEXT_SIZE);
|
|
264
|
+
this.lowVramRerankContextSize = resolvePositiveIntWithEnv(config.lowVramRerankContextSize, "KINDX_LOW_VRAM_RERANK_CONTEXT_SIZE", DEFAULT_LOW_VRAM_RERANK_CONTEXT_SIZE);
|
|
213
265
|
this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
|
|
214
266
|
this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
|
|
215
267
|
}
|
|
@@ -322,6 +374,12 @@ export class LlamaCpp {
|
|
|
322
374
|
else if (llama.gpu === false) {
|
|
323
375
|
process.stderr.write("KINDX Warning: no GPU acceleration, running on CPU (slow). Run 'kindx status' for details.\n");
|
|
324
376
|
}
|
|
377
|
+
else if (llama.gpu === "vulkan" && os.release().toLowerCase().includes("microsoft")) {
|
|
378
|
+
process.stderr.write("\nKINDX Warning: Vulkan backend detected on WSL2. This is often slow or unstable (using 'dzn').\n" +
|
|
379
|
+
"For native NVIDIA GPU acceleration on WSL2, please install the CUDA toolkit:\n" +
|
|
380
|
+
" sudo apt-get install cuda-toolkit-13-1 (or cuda-toolkit-12-6)\n" +
|
|
381
|
+
"See https://github.com/ambicuity/KINDX/issues/141 for more details.\n\n");
|
|
382
|
+
}
|
|
325
383
|
this.llama = llama;
|
|
326
384
|
}
|
|
327
385
|
return this.llama;
|
|
@@ -361,26 +419,104 @@ export class LlamaCpp {
|
|
|
361
419
|
this.embedModelLoadPromise = null;
|
|
362
420
|
}
|
|
363
421
|
}
|
|
422
|
+
showLowVramWarning(policy) {
|
|
423
|
+
if (!policy.lowVram || this.lowVramWarningShown)
|
|
424
|
+
return;
|
|
425
|
+
const freeText = policy.freeMB === null ? "unknown free VRAM" : `${Math.round(policy.freeMB)} MB free VRAM`;
|
|
426
|
+
const budgetText = policy.budgetMB === null ? "auto budget" : `${Math.round(policy.budgetMB)} MB budget`;
|
|
427
|
+
process.stderr.write(`KINDX Warning: low VRAM mode enabled (${freeText}, ${budgetText}, ${policy.reason}).\n`);
|
|
428
|
+
this.lowVramWarningShown = true;
|
|
429
|
+
}
|
|
430
|
+
async resolveMemoryPolicy() {
|
|
431
|
+
if (this.memoryPolicyPromise)
|
|
432
|
+
return await this.memoryPolicyPromise;
|
|
433
|
+
this.memoryPolicyPromise = (async () => {
|
|
434
|
+
const llama = await this.ensureLlama();
|
|
435
|
+
if (!llama.gpu) {
|
|
436
|
+
return { lowVram: false, freeMB: null, budgetMB: this.vramBudgetMB, reason: "cpu_or_no_gpu" };
|
|
437
|
+
}
|
|
438
|
+
let freeMB = null;
|
|
439
|
+
try {
|
|
440
|
+
const vram = await llama.getVramState();
|
|
441
|
+
freeMB = vram.free / (1024 * 1024);
|
|
442
|
+
}
|
|
443
|
+
catch {
|
|
444
|
+
freeMB = null;
|
|
445
|
+
}
|
|
446
|
+
if (this.lowVramOverride !== undefined) {
|
|
447
|
+
const policy = {
|
|
448
|
+
lowVram: this.lowVramOverride,
|
|
449
|
+
freeMB,
|
|
450
|
+
budgetMB: this.vramBudgetMB,
|
|
451
|
+
reason: "forced_by_KINDX_LOW_VRAM_or_config",
|
|
452
|
+
};
|
|
453
|
+
this.showLowVramWarning(policy);
|
|
454
|
+
return policy;
|
|
455
|
+
}
|
|
456
|
+
if (this.vramBudgetMB !== null) {
|
|
457
|
+
const policy = {
|
|
458
|
+
lowVram: true,
|
|
459
|
+
freeMB,
|
|
460
|
+
budgetMB: this.vramBudgetMB,
|
|
461
|
+
reason: "budget_set_by_KINDX_VRAM_BUDGET_MB_or_config",
|
|
462
|
+
};
|
|
463
|
+
this.showLowVramWarning(policy);
|
|
464
|
+
return policy;
|
|
465
|
+
}
|
|
466
|
+
if (freeMB !== null && freeMB < this.lowVramThresholdMB) {
|
|
467
|
+
const policy = {
|
|
468
|
+
lowVram: true,
|
|
469
|
+
freeMB,
|
|
470
|
+
budgetMB: null,
|
|
471
|
+
reason: `auto_detected_below_${this.lowVramThresholdMB}MB_threshold`,
|
|
472
|
+
};
|
|
473
|
+
this.showLowVramWarning(policy);
|
|
474
|
+
return policy;
|
|
475
|
+
}
|
|
476
|
+
return { lowVram: false, freeMB, budgetMB: null, reason: "auto_high_vram" };
|
|
477
|
+
})();
|
|
478
|
+
return await this.memoryPolicyPromise;
|
|
479
|
+
}
|
|
480
|
+
async effectiveExpandContextSize() {
|
|
481
|
+
const policy = await this.resolveMemoryPolicy();
|
|
482
|
+
if (!policy.lowVram)
|
|
483
|
+
return this.expandContextSize;
|
|
484
|
+
return Math.min(this.expandContextSize, this.lowVramExpandContextSize);
|
|
485
|
+
}
|
|
486
|
+
async effectiveRerankContextSize() {
|
|
487
|
+
const policy = await this.resolveMemoryPolicy();
|
|
488
|
+
if (!policy.lowVram)
|
|
489
|
+
return this.rerankContextSize;
|
|
490
|
+
return Math.min(this.rerankContextSize, this.lowVramRerankContextSize);
|
|
491
|
+
}
|
|
364
492
|
/**
|
|
365
493
|
* Compute how many parallel contexts to create.
|
|
366
494
|
*
|
|
367
|
-
* GPU: constrained by VRAM
|
|
495
|
+
* GPU: constrained by free VRAM / budget and low-VRAM policy caps.
|
|
368
496
|
* CPU: constrained by cores. Splitting threads across contexts enables
|
|
369
497
|
* true parallelism (each context runs on its own cores). Use at most
|
|
370
498
|
* half the math cores, with at least 4 threads per context.
|
|
371
499
|
*/
|
|
372
|
-
async computeParallelism(perContextMB) {
|
|
500
|
+
async computeParallelism(perContextMB, kind) {
|
|
373
501
|
const llama = await this.ensureLlama();
|
|
502
|
+
const policy = await this.resolveMemoryPolicy();
|
|
374
503
|
if (llama.gpu) {
|
|
375
|
-
|
|
376
|
-
const
|
|
377
|
-
const
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
catch {
|
|
382
|
-
return 2;
|
|
504
|
+
if (policy.freeMB === null) {
|
|
505
|
+
const fallback = kind === "embed" ? 2 : 1;
|
|
506
|
+
const cap = policy.lowVram
|
|
507
|
+
? (kind === "embed" ? this.lowVramEmbedParallelism : this.lowVramRerankParallelism)
|
|
508
|
+
: 8;
|
|
509
|
+
return Math.max(1, Math.min(cap, fallback));
|
|
383
510
|
}
|
|
511
|
+
const availableMB = policy.budgetMB === null
|
|
512
|
+
? policy.freeMB
|
|
513
|
+
: Math.min(policy.budgetMB, policy.freeMB);
|
|
514
|
+
const maxByVram = Math.floor((availableMB * 0.25) / perContextMB);
|
|
515
|
+
const base = Math.max(1, Math.min(8, maxByVram));
|
|
516
|
+
if (!policy.lowVram)
|
|
517
|
+
return base;
|
|
518
|
+
const cap = kind === "embed" ? this.lowVramEmbedParallelism : this.lowVramRerankParallelism;
|
|
519
|
+
return Math.max(1, Math.min(base, cap));
|
|
384
520
|
}
|
|
385
521
|
// CPU: split cores across contexts. At least 4 threads per context.
|
|
386
522
|
const cores = llama.cpuMathCores || 4;
|
|
@@ -414,7 +550,7 @@ export class LlamaCpp {
|
|
|
414
550
|
this.embedContextsCreatePromise = (async () => {
|
|
415
551
|
const model = await this.ensureEmbedModel();
|
|
416
552
|
// Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
|
|
417
|
-
const n = await this.computeParallelism(150);
|
|
553
|
+
const n = await this.computeParallelism(150, "embed");
|
|
418
554
|
const threads = await this.threadsPerContext(n);
|
|
419
555
|
for (let i = 0; i < n; i++) {
|
|
420
556
|
try {
|
|
@@ -515,12 +651,13 @@ export class LlamaCpp {
|
|
|
515
651
|
if (this.rerankContexts.length === 0) {
|
|
516
652
|
const model = await this.ensureRerankModel();
|
|
517
653
|
// ~960 MB per context with flash attention at default contextSize 4096
|
|
518
|
-
const n = Math.min(await this.computeParallelism(1000), 4);
|
|
654
|
+
const n = Math.min(await this.computeParallelism(1000, "rerank"), 4);
|
|
655
|
+
const rerankContextSize = await this.effectiveRerankContextSize();
|
|
519
656
|
const threads = await this.threadsPerContext(n);
|
|
520
657
|
for (let i = 0; i < n; i++) {
|
|
521
658
|
try {
|
|
522
659
|
this.rerankContexts.push(await model.createRankingContext({
|
|
523
|
-
contextSize:
|
|
660
|
+
contextSize: rerankContextSize,
|
|
524
661
|
flashAttention: true,
|
|
525
662
|
...(threads > 0 ? { threads } : {}),
|
|
526
663
|
}));
|
|
@@ -530,7 +667,7 @@ export class LlamaCpp {
|
|
|
530
667
|
// Flash attention might not be supported — retry without it
|
|
531
668
|
try {
|
|
532
669
|
this.rerankContexts.push(await model.createRankingContext({
|
|
533
|
-
contextSize:
|
|
670
|
+
contextSize: rerankContextSize,
|
|
534
671
|
...(threads > 0 ? { threads } : {}),
|
|
535
672
|
}));
|
|
536
673
|
}
|
|
@@ -704,10 +841,25 @@ export class LlamaCpp {
|
|
|
704
841
|
async expandQuery(query, options = {}) {
|
|
705
842
|
// Ping activity at start to keep models alive during this operation
|
|
706
843
|
this.touchActivity();
|
|
707
|
-
const llama = await this.ensureLlama();
|
|
708
|
-
await this.ensureGenerateModel();
|
|
709
844
|
const includeLexical = options.includeLexical ?? true;
|
|
710
845
|
const context = options.context;
|
|
846
|
+
// -------------------------------------------------------------------------
|
|
847
|
+
// Task 2: Dynamic HyDE Bypass
|
|
848
|
+
// Short entity-lookup queries (≤3 tokens) lack sufficient semantic surface
|
|
849
|
+
// area for the LLM to extrapolate a meaningful hypothetical document.
|
|
850
|
+
// Generating HyDE passages for these actively harms precision by shifting
|
|
851
|
+
// the embedding centroid away from the actual target.
|
|
852
|
+
// Bypass generation entirely and return direct lex + vec targets.
|
|
853
|
+
// -------------------------------------------------------------------------
|
|
854
|
+
const tokenCount = query.trim().split(/\s+/).filter(Boolean).length;
|
|
855
|
+
if (tokenCount <= 3) {
|
|
856
|
+
const bypass = [{ type: 'vec', text: query }];
|
|
857
|
+
if (includeLexical)
|
|
858
|
+
bypass.unshift({ type: 'lex', text: query });
|
|
859
|
+
return bypass;
|
|
860
|
+
}
|
|
861
|
+
const llama = await this.ensureLlama();
|
|
862
|
+
await this.ensureGenerateModel();
|
|
711
863
|
const grammar = await llama.createGrammar({
|
|
712
864
|
grammar: `
|
|
713
865
|
root ::= line+
|
|
@@ -717,12 +869,46 @@ export class LlamaCpp {
|
|
|
717
869
|
`
|
|
718
870
|
});
|
|
719
871
|
const prompt = `/no_think Expand this search query: ${query}`;
|
|
720
|
-
//
|
|
872
|
+
// -------------------------------------------------------------------------
|
|
873
|
+
// Task 1: Strict System Prompt Injection
|
|
874
|
+
// Enforces three constraints on the generation model:
|
|
875
|
+
// 1. Grammar adherence: output must strictly match the EBNF grammar.
|
|
876
|
+
// 2. No conversational filler: eliminates "Of course!", apologies, etc.
|
|
877
|
+
// 3. Domain anchoring: if options.context is provided, inject it so the
|
|
878
|
+
// model stays within the bounded knowledge domain and does not
|
|
879
|
+
// hallucinate external concepts (e.g., "blockchain" for a code repo).
|
|
880
|
+
// -------------------------------------------------------------------------
|
|
881
|
+
const domainInstruction = context
|
|
882
|
+
? `Domain context: ${context}\nYour expansions MUST stay within this domain. Do not introduce concepts from outside it.`
|
|
883
|
+
: `Stay strictly within the semantic domain implied by the query itself.`;
|
|
884
|
+
const systemPrompt = [
|
|
885
|
+
`You are a search query expansion engine. Your ONLY task is to output structured query variations.`,
|
|
886
|
+
``,
|
|
887
|
+
`OUTPUT FORMAT (strict — do not deviate):`,
|
|
888
|
+
` lex: <exact keyword phrase>`,
|
|
889
|
+
` vec: <semantically equivalent rephrasing>`,
|
|
890
|
+
` hyde: <a verbatim 1-2 sentence excerpt that would appear in a relevant technical document>`,
|
|
891
|
+
``,
|
|
892
|
+
`RULES (violations will break the downstream parser):`,
|
|
893
|
+
` - Do NOT write greetings, apologies, explanations, or any prose outside the format.`,
|
|
894
|
+
` - Do NOT write "Here is...", "Of course!", "I'd be happy to...", or similar filler.`,
|
|
895
|
+
` - Each line MUST start with "lex:", "vec:", or "hyde:" followed by a single space.`,
|
|
896
|
+
` - hyde entries MUST read like an excerpt from a technical document, NOT a question or summary.`,
|
|
897
|
+
` - Output 2–4 lines maximum. Output NOTHING else.`,
|
|
898
|
+
``,
|
|
899
|
+
domainInstruction,
|
|
900
|
+
].join('\n');
|
|
901
|
+
// Create a bounded context for expansion.
|
|
902
|
+
// The system prompt adds ~400 tokens of overhead. We allocate a 512-token buffer
|
|
903
|
+
// on top of expandContextSize to prevent native llama.cpp from aborting on context
|
|
904
|
+
// overflow when the combined system prompt + user query exceeds the window.
|
|
905
|
+
const SYSTEM_PROMPT_TOKEN_OVERHEAD = 512;
|
|
906
|
+
const effectiveExpandContextSize = await this.effectiveExpandContextSize();
|
|
721
907
|
const genContext = await this.generateModel.createContext({
|
|
722
|
-
contextSize:
|
|
908
|
+
contextSize: effectiveExpandContextSize + SYSTEM_PROMPT_TOKEN_OVERHEAD,
|
|
723
909
|
});
|
|
724
910
|
const sequence = genContext.getSequence();
|
|
725
|
-
const session = new LlamaChatSession({ contextSequence: sequence });
|
|
911
|
+
const session = new LlamaChatSession({ contextSequence: sequence, systemPrompt });
|
|
726
912
|
try {
|
|
727
913
|
// Qwen3 recommended settings for non-thinking mode:
|
|
728
914
|
// temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
|
|
@@ -793,7 +979,8 @@ export class LlamaCpp {
|
|
|
793
979
|
// Truncate documents that would exceed the rerank context size.
|
|
794
980
|
// Budget = contextSize - template overhead - query tokens
|
|
795
981
|
const queryTokens = model.tokenize(query).length;
|
|
796
|
-
const
|
|
982
|
+
const effectiveRerankContextSize = await this.effectiveRerankContextSize();
|
|
983
|
+
const rawMaxDocTokens = effectiveRerankContextSize - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
|
|
797
984
|
// Guard against non-positive budget (e.g., very long queries or CJK content with
|
|
798
985
|
// high token density). Allow at minimum 128 tokens per document to avoid crashes.
|
|
799
986
|
const maxDocTokens = Math.max(128, rawMaxDocTokens);
|
|
@@ -952,7 +1139,7 @@ class LLMSessionManager {
|
|
|
952
1139
|
operationEnd() {
|
|
953
1140
|
this._inFlightOperations = Math.max(0, this._inFlightOperations - 1);
|
|
954
1141
|
}
|
|
955
|
-
|
|
1142
|
+
getLLM() {
|
|
956
1143
|
return this.llm;
|
|
957
1144
|
}
|
|
958
1145
|
}
|
|
@@ -1042,26 +1229,26 @@ class LLMSession {
|
|
|
1042
1229
|
}
|
|
1043
1230
|
}
|
|
1044
1231
|
async embed(text, options) {
|
|
1045
|
-
return this.withOperation(() => this.manager.
|
|
1232
|
+
return this.withOperation(() => this.manager.getLLM().embed(text, options));
|
|
1046
1233
|
}
|
|
1047
1234
|
async embedBatch(texts) {
|
|
1048
|
-
return this.withOperation(() => this.manager.
|
|
1235
|
+
return this.withOperation(() => this.manager.getLLM().embedBatch(texts));
|
|
1049
1236
|
}
|
|
1050
1237
|
async expandQuery(query, options) {
|
|
1051
|
-
return this.withOperation(() => this.manager.
|
|
1238
|
+
return this.withOperation(() => this.manager.getLLM().expandQuery(query, options));
|
|
1052
1239
|
}
|
|
1053
1240
|
async rerank(query, documents, options) {
|
|
1054
|
-
return this.withOperation(() => this.manager.
|
|
1241
|
+
return this.withOperation(() => this.manager.getLLM().rerank(query, documents, options));
|
|
1055
1242
|
}
|
|
1056
1243
|
}
|
|
1057
|
-
// Session manager for the default
|
|
1244
|
+
// Session manager for the default LLM instance
|
|
1058
1245
|
let defaultSessionManager = null;
|
|
1059
1246
|
/**
|
|
1060
|
-
* Get the session manager for the default
|
|
1247
|
+
* Get the session manager for the default LLM instance.
|
|
1061
1248
|
*/
|
|
1062
1249
|
function getSessionManager() {
|
|
1063
|
-
const llm =
|
|
1064
|
-
if (!defaultSessionManager || defaultSessionManager.
|
|
1250
|
+
const llm = getDefaultLLM();
|
|
1251
|
+
if (!defaultSessionManager || defaultSessionManager.getLLM() !== llm) {
|
|
1065
1252
|
defaultSessionManager = new LLMSessionManager(llm);
|
|
1066
1253
|
}
|
|
1067
1254
|
return defaultSessionManager;
|
|
@@ -1100,32 +1287,37 @@ export function canUnloadLLM() {
|
|
|
1100
1287
|
return defaultSessionManager.canUnload();
|
|
1101
1288
|
}
|
|
1102
1289
|
// =============================================================================
|
|
1103
|
-
// Singleton for default
|
|
1290
|
+
// Singleton for default LLM instance
|
|
1104
1291
|
// =============================================================================
|
|
1105
|
-
let
|
|
1292
|
+
let defaultLLM = null;
|
|
1106
1293
|
/**
|
|
1107
|
-
* Get the default
|
|
1294
|
+
* Get the default LLM instance (creates one if needed)
|
|
1108
1295
|
*/
|
|
1109
|
-
export function
|
|
1110
|
-
if (!
|
|
1111
|
-
|
|
1112
|
-
|
|
1296
|
+
export function getDefaultLLM() {
|
|
1297
|
+
if (!defaultLLM) {
|
|
1298
|
+
if (process.env.KINDX_LLM_BACKEND === "remote") {
|
|
1299
|
+
defaultLLM = new RemoteLLM();
|
|
1300
|
+
}
|
|
1301
|
+
else {
|
|
1302
|
+
const embedModel = process.env.KINDX_EMBED_MODEL;
|
|
1303
|
+
defaultLLM = new LlamaCpp(embedModel ? { embedModel } : {});
|
|
1304
|
+
}
|
|
1113
1305
|
}
|
|
1114
|
-
return
|
|
1306
|
+
return defaultLLM;
|
|
1115
1307
|
}
|
|
1116
1308
|
/**
|
|
1117
|
-
* Set a custom default
|
|
1309
|
+
* Set a custom default LLM instance (useful for testing)
|
|
1118
1310
|
*/
|
|
1119
|
-
export function
|
|
1120
|
-
|
|
1311
|
+
export function setDefaultLLM(llm) {
    // Swap the module-level singleton. Nothing is disposed here: the previous
    // instance, if any, remains the caller's responsibility.
    defaultLLM = llm;
}
|
|
1122
1314
|
/**
|
|
1123
|
-
* Dispose the default
|
|
1315
|
+
* Dispose the default LLM instance if it exists.
|
|
1124
1316
|
* Call this before process exit to prevent NAPI crashes.
|
|
1125
1317
|
*/
|
|
1126
|
-
export async function
|
|
1127
|
-
if (
|
|
1128
|
-
await
|
|
1129
|
-
|
|
1318
|
+
export async function disposeDefaultLLM() {
    if (!defaultLLM)
        return;
    // Detach the singleton BEFORE awaiting dispose(), so that:
    //  - a dispose() that throws cannot leave a torn-down instance installed,
    //  - concurrent callers do not dispose the same instance twice,
    //  - an instance installed while the await is pending is not wiped afterwards.
    const llm = defaultLLM;
    defaultLLM = null;
    await llm.dispose();
}
|