@ambicuity/kindx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1131 @@
1
+ /**
2
+ * inference.ts - LLM abstraction layer for KINDX using node-llama-cpp
3
+ *
4
+ * Provides embeddings, text generation, and reranking using local GGUF models.
5
+ */
6
+ import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
7
+ import { homedir } from "os";
8
+ import { join } from "path";
9
+ import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
10
+ // =============================================================================
11
+ // Embedding Formatting Functions
12
+ // =============================================================================
13
/**
 * Detect if a model URI uses the Qwen3-Embedding format.
 * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
 *
 * @param {string} modelUri - model URI or path to inspect
 * @returns {boolean} true when "qwen" and "embed" both appear (in either order)
 */
export function isQwen3EmbeddingModel(modelUri) {
    const qwenThenEmbed = /qwen.*embed/i;
    const embedThenQwen = /embed.*qwen/i;
    return qwenThenEmbed.test(modelUri) || embedThenQwen.test(modelUri);
}
20
/**
 * Format a query for embedding.
 * Uses nomic-style task prefix format for embeddinggemma (default).
 * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
 *
 * @param {string} query - raw search query
 * @param {string} [modelUri] - embedding model URI; falls back to env, then default
 * @returns {string} prompt-formatted query text
 */
export function formatQueryForEmbedding(query, modelUri) {
    const activeUri = modelUri ?? process.env.KINDX_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    if (!isQwen3EmbeddingModel(activeUri)) {
        return `task: search result | query: ${query}`;
    }
    return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
}
32
/**
 * Format a document for embedding.
 * Uses nomic-style format with title and text fields (default).
 * Qwen3-Embedding encodes documents as raw text without special prefixes.
 *
 * @param {string} text - document body
 * @param {string} [title] - optional document title ("none" placeholder in nomic format)
 * @param {string} [modelUri] - embedding model URI; falls back to env, then default
 * @returns {string} prompt-formatted document text
 */
export function formatDocForEmbedding(text, title, modelUri) {
    const activeUri = modelUri ?? process.env.KINDX_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    if (!isQwen3EmbeddingModel(activeUri)) {
        return `title: ${title || "none"} | text: ${text}`;
    }
    // Qwen3-Embedding: documents are raw text, no task prefix
    return title ? `${title}\n${text}` : text;
}
45
// =============================================================================
// Model Configuration
// =============================================================================
// HuggingFace model URIs for node-llama-cpp
// Format: hf:<user>/<repo>/<file>
// Override via KINDX_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf)
const DEFAULT_EMBED_MODEL = process.env.KINDX_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
// Reranker model; override via KINDX_RERANK_MODEL
const DEFAULT_RERANK_MODEL = process.env.KINDX_RERANK_MODEL ?? "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
// Generation / query-expansion model; override via KINDX_GENERATE_MODEL
const DEFAULT_GENERATE_MODEL = process.env.KINDX_GENERATE_MODEL ?? "hf:rr1904/kindx-query-expansion-1.7B-gguf/kindx-query-expansion-1.7B-q4_k_m.gguf";
// Alternative generation models for query expansion:
// LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
// Use these as base for fine-tuning with configs/sft_lfm2.yaml
export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
// Public aliases for the active (env-resolved) default model URIs
export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
// Local model cache directory (GGUF downloads and .etag sidecars live here)
const MODEL_CACHE_DIR = join(homedir(), ".cache", "kindx", "models");
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
65
/**
 * Parse an "hf:<user>/<repo>/<file>" model URI.
 *
 * @param {string} model - candidate model URI
 * @returns {{repo: string, file: string} | null} repo ("user/repo") and file
 *   path (may contain slashes), or null when not a valid hf: reference.
 */
function parseHfUri(model) {
    if (!model.startsWith("hf:")) {
        return null;
    }
    const segments = model.slice(3).split("/");
    if (segments.length < 3) {
        return null;
    }
    const [user, repoName, ...fileParts] = segments;
    return { repo: `${user}/${repoName}`, file: fileParts.join("/") };
}
76
/**
 * Fetch the current ETag for a HuggingFace-hosted model file via a HEAD request.
 * Best-effort: returns null on network failure, non-OK status, or missing header.
 *
 * @param {{repo: string, file: string}} ref - parsed hf: reference
 * @returns {Promise<string | null>} the ETag header value, or null
 */
async function getRemoteEtag(ref) {
    const url = `https://huggingface.co/${ref.repo}/resolve/main/${ref.file}`;
    let response;
    try {
        response = await fetch(url, { method: "HEAD" });
    }
    catch {
        // Offline / DNS / TLS errors all degrade to "unknown etag"
        return null;
    }
    if (!response.ok) {
        return null;
    }
    return response.headers.get("etag") || null;
}
89
/**
 * Download (or refresh) model files into the local cache directory.
 *
 * For hf: URIs an ETag sidecar file ("<filename>.etag") records the remote
 * version; when the remote ETag differs from the recorded one (or
 * options.refresh is set, or no cached file exists) the cached copies are
 * deleted and the model is re-resolved (re-downloaded) via resolveModelFile.
 *
 * @param {string[]} models - model URIs (hf:... or local paths)
 * @param {{cacheDir?: string, refresh?: boolean}} [options]
 * @returns {Promise<Array<{model: string, path: string, sizeBytes: number, refreshed: boolean}>>}
 */
export async function pullModels(models, options = {}) {
    const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
    if (!existsSync(cacheDir)) {
        mkdirSync(cacheDir, { recursive: true });
    }
    const results = [];
    for (const model of models) {
        let refreshed = false;
        const hfRef = parseHfUri(model);
        const filename = model.split("/").pop();
        const entries = readdirSync(cacheDir, { withFileTypes: true });
        const cached = filename
            ? entries
                .filter((entry) => entry.isFile() && entry.name.includes(filename))
                .map((entry) => join(cacheDir, entry.name))
            : [];
        // Bug fix: key the ETag sidecar by the model's filename. The previous
        // template was broken, so every model wrote to the same sidecar path,
        // defeating per-model staleness detection.
        const etagPath = filename ? join(cacheDir, `${filename}.etag`) : null;
        if (hfRef && filename && etagPath) {
            const remoteEtag = await getRemoteEtag(hfRef);
            const localEtag = existsSync(etagPath)
                ? readFileSync(etagPath, "utf-8").trim()
                : null;
            // Refresh when forced, when the remote version is unknown or changed,
            // or when nothing is cached yet.
            const shouldRefresh = options.refresh || !remoteEtag || remoteEtag !== localEtag || cached.length === 0;
            if (shouldRefresh) {
                for (const candidate of cached) {
                    if (existsSync(candidate))
                        unlinkSync(candidate);
                }
                if (existsSync(etagPath))
                    unlinkSync(etagPath);
                // Only report "refreshed" when we actually evicted a cached file
                refreshed = cached.length > 0;
            }
        }
        else if (options.refresh && filename) {
            // Non-HF model: only explicit refresh evicts cached copies
            for (const candidate of cached) {
                if (existsSync(candidate))
                    unlinkSync(candidate);
                refreshed = true;
            }
        }
        // resolveModelFile downloads into cacheDir when the file is absent
        const path = await resolveModelFile(model, cacheDir);
        const sizeBytes = existsSync(path) ? statSync(path).size : 0;
        if (hfRef && etagPath) {
            // Record the ETag corresponding to the file we just resolved
            const remoteEtag = await getRemoteEtag(hfRef);
            if (remoteEtag) {
                writeFileSync(etagPath, remoteEtag + "\n", "utf-8");
            }
        }
        results.push({ model, path, sizeBytes, refreshed });
    }
    return results;
}
142
/**
 * LLM implementation using node-llama-cpp
 */
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
const DEFAULT_RERANK_CONTEXT_SIZE = 4096;
/**
 * Shared resolver for context-size settings (expand / rerank were duplicated).
 *
 * Resolution order: explicit config value (validated, throws on invalid) →
 * environment variable (warns and falls back on invalid) → default.
 *
 * @param {number | undefined} configValue - explicit config override
 * @param {string} configName - config key name, used in the thrown error message
 * @param {string} envName - environment variable name to consult
 * @param {number} defaultValue - fallback when neither source provides a value
 * @returns {number} a positive integer context size
 * @throws {Error} when configValue is provided but not a positive integer
 */
function resolveContextSizeSetting(configValue, configName, envName, defaultValue) {
    if (configValue !== undefined) {
        if (!Number.isInteger(configValue) || configValue <= 0) {
            throw new Error(`Invalid ${configName}: ${configValue}. Must be a positive integer.`);
        }
        return configValue;
    }
    const envValue = process.env[envName]?.trim();
    if (!envValue)
        return defaultValue;
    const parsed = Number.parseInt(envValue, 10);
    if (!Number.isInteger(parsed) || parsed <= 0) {
        // Env misconfiguration is non-fatal: warn and use the default
        process.stderr.write(`KINDX Warning: invalid ${envName}="${envValue}", using default ${defaultValue}.\n`);
        return defaultValue;
    }
    return parsed;
}
/** Resolve the query-expansion context size (config → KINDX_EXPAND_CONTEXT_SIZE → 2048). */
function resolveExpandContextSize(configValue) {
    return resolveContextSizeSetting(configValue, "expandContextSize", "KINDX_EXPAND_CONTEXT_SIZE", DEFAULT_EXPAND_CONTEXT_SIZE);
}
/** Resolve the rerank context size (config → KINDX_RERANK_CONTEXT_SIZE → 4096). */
function resolveRerankContextSize(configValue) {
    return resolveContextSizeSetting(configValue, "rerankContextSize", "KINDX_RERANK_CONTEXT_SIZE", DEFAULT_RERANK_CONTEXT_SIZE);
}
183
export class LlamaCpp {
    // Lazily-initialized llama.cpp runtime handle (shared by all models)
    llama = null;
    // Embedding model and its pool of parallel embedding contexts
    embedModel = null;
    embedContexts = [];
    // Generation model (contexts for it are created fresh per call)
    generateModel = null;
    // Rerank model and its pool of parallel ranking contexts
    rerankModel = null;
    rerankContexts = [];
    // Active model URIs (hf:... or local paths), fixed at construction
    embedModelUri;
    generateModelUri;
    rerankModelUri;
    // Directory where GGUF files are cached
    modelCacheDir;
    // Validated context-size limits (see resolve*ContextSize)
    rerankContextSize;
    expandContextSize;
    // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
    embedModelLoadPromise = null;
    generateModelLoadPromise = null;
    rerankModelLoadPromise = null;
    // Inactivity timer for auto-unloading models
    inactivityTimer = null;
    inactivityTimeoutMs;
    disposeModelsOnInactivity;
    // Track disposal state to prevent double-dispose
    disposed = false;
    /**
     * @param {object} [config] - optional overrides; each falls back to the
     *   env-driven module defaults when omitted.
     *   - embedModel / generateModel / rerankModel: model URIs
     *   - modelCacheDir: GGUF cache directory
     *   - expandContextSize / rerankContextSize: positive integers (throws otherwise)
     *   - inactivityTimeoutMs: idle-unload delay; 0 or negative disables the timer
     *   - disposeModelsOnInactivity: when true, idle unload also frees models,
     *     not just contexts (default false)
     */
    constructor(config = {}) {
        this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
        this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
        this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
        this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
        this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
        this.rerankContextSize = resolveRerankContextSize(config.rerankContextSize);
        this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
        this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
    }
216
    /**
     * Reset the inactivity timer. Called after each model operation.
     * When the timer fires, contexts (and optionally models) are unloaded to
     * free memory — unless the session manager reports active work.
     */
    touchActivity() {
        // Clear existing timer
        if (this.inactivityTimer) {
            clearTimeout(this.inactivityTimer);
            this.inactivityTimer = null;
        }
        // Only set timer if we have disposable contexts and timeout is enabled
        if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
            this.inactivityTimer = setTimeout(() => {
                // Check if session manager allows unloading
                // canUnloadLLM is defined later in this file - it checks the session manager
                // We use dynamic import pattern to avoid circular dependency issues
                // (the typeof guard keeps this safe even if canUnloadLLM is absent)
                if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
                    // Active sessions/operations - reschedule timer
                    this.touchActivity();
                    return;
                }
                // Best-effort unload; log instead of crashing the process
                this.unloadIdleResources().catch(err => {
                    console.error("Error unloading idle resources:", err);
                });
            }, this.inactivityTimeoutMs);
            // Don't keep process alive just for this timer
            this.inactivityTimer.unref();
        }
    }
245
+ /**
246
+ * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
247
+ */
248
+ hasLoadedContexts() {
249
+ return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
250
+ }
251
    /**
     * Unload idle resources but keep the instance alive for future use.
     *
     * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
     * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
     */
    async unloadIdleResources() {
        // Don't unload if already disposed
        if (this.disposed) {
            return;
        }
        // Clear timer so a pending fire doesn't re-enter during disposal
        if (this.inactivityTimer) {
            clearTimeout(this.inactivityTimer);
            this.inactivityTimer = null;
        }
        // Dispose contexts first (they depend on their models)
        for (const ctx of this.embedContexts) {
            await ctx.dispose();
        }
        this.embedContexts = [];
        for (const ctx of this.rerankContexts) {
            await ctx.dispose();
        }
        this.rerankContexts = [];
        // Optionally dispose models too (opt-in via disposeModelsOnInactivity)
        if (this.disposeModelsOnInactivity) {
            if (this.embedModel) {
                await this.embedModel.dispose();
                this.embedModel = null;
            }
            if (this.generateModel) {
                await this.generateModel.dispose();
                this.generateModel = null;
            }
            if (this.rerankModel) {
                await this.rerankModel.dispose();
                this.rerankModel = null;
            }
            // Reset load promises so models can be reloaded later
            this.embedModelLoadPromise = null;
            this.generateModelLoadPromise = null;
            this.rerankModelLoadPromise = null;
        }
        // Note: We keep llama instance alive - it's lightweight
    }
297
+ /**
298
+ * Ensure model cache directory exists
299
+ */
300
+ ensureModelCacheDir() {
301
+ if (!existsSync(this.modelCacheDir)) {
302
+ mkdirSync(this.modelCacheDir, { recursive: true });
303
+ }
304
+ }
305
    /**
     * Initialize the llama instance (lazy). Subsequent calls reuse the cached handle.
     */
    async ensureLlama() {
        if (!this.llama) {
            // KINDX_CPU_ONLY=1 forces CPU execution (useful for older GPU architectures
            // like Pascal that fail with node-llama-cpp GPU acceleration).
            const forceCPU = process.env.KINDX_CPU_ONLY === '1';
            const llama = await getLlama({
                // attempt to build
                build: "autoAttempt",
                logLevel: LlamaLogLevel.error,
                ...(forceCPU ? { gpu: false } : {}),
            });
            if (forceCPU) {
                process.stderr.write("KINDX: CPU-only mode enabled via KINDX_CPU_ONLY=1.\n");
            }
            else if (llama.gpu === false) {
                // Surface the CPU fallback loudly — it dominates perceived performance
                process.stderr.write("KINDX Warning: no GPU acceleration, running on CPU (slow). Run 'kindx status' for details.\n");
            }
            this.llama = llama;
        }
        return this.llama;
    }
329
+ /**
330
+ * Resolve a model URI to a local path, downloading if needed
331
+ */
332
+ async resolveModel(modelUri) {
333
+ this.ensureModelCacheDir();
334
+ // resolveModelFile handles HF URIs and downloads to the cache dir
335
+ return await resolveModelFile(modelUri, this.modelCacheDir);
336
+ }
337
+ /**
338
+ * Load embedding model (lazy)
339
+ */
340
+ async ensureEmbedModel() {
341
+ if (this.embedModel) {
342
+ return this.embedModel;
343
+ }
344
+ if (this.embedModelLoadPromise) {
345
+ return await this.embedModelLoadPromise;
346
+ }
347
+ this.embedModelLoadPromise = (async () => {
348
+ const llama = await this.ensureLlama();
349
+ const modelPath = await this.resolveModel(this.embedModelUri);
350
+ const model = await llama.loadModel({ modelPath });
351
+ this.embedModel = model;
352
+ // Model loading counts as activity - ping to keep alive
353
+ this.touchActivity();
354
+ return model;
355
+ })();
356
+ try {
357
+ return await this.embedModelLoadPromise;
358
+ }
359
+ finally {
360
+ // Keep the resolved model cached; clear only the in-flight promise.
361
+ this.embedModelLoadPromise = null;
362
+ }
363
+ }
364
+ /**
365
+ * Compute how many parallel contexts to create.
366
+ *
367
+ * GPU: constrained by VRAM (25% of free, capped at 8).
368
+ * CPU: constrained by cores. Splitting threads across contexts enables
369
+ * true parallelism (each context runs on its own cores). Use at most
370
+ * half the math cores, with at least 4 threads per context.
371
+ */
372
+ async computeParallelism(perContextMB) {
373
+ const llama = await this.ensureLlama();
374
+ if (llama.gpu) {
375
+ try {
376
+ const vram = await llama.getVramState();
377
+ const freeMB = vram.free / (1024 * 1024);
378
+ const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
379
+ return Math.max(1, Math.min(8, maxByVram));
380
+ }
381
+ catch {
382
+ return 2;
383
+ }
384
+ }
385
+ // CPU: split cores across contexts. At least 4 threads per context.
386
+ const cores = llama.cpuMathCores || 4;
387
+ const maxContexts = Math.floor(cores / 4);
388
+ return Math.max(1, Math.min(4, maxContexts));
389
+ }
390
+ /**
391
+ * Get the number of threads each context should use, given N parallel contexts.
392
+ * Splits available math cores evenly across contexts.
393
+ */
394
+ async threadsPerContext(parallelism) {
395
+ const llama = await this.ensureLlama();
396
+ if (llama.gpu)
397
+ return 0; // GPU: let the library decide
398
+ const cores = llama.cpuMathCores || 4;
399
+ return Math.max(1, Math.floor(cores / parallelism));
400
+ }
401
    /**
     * Load embedding contexts (lazy). Creates multiple for parallel embedding.
     * Uses promise guard to prevent concurrent context creation race condition.
     */
    // In-flight pool-creation promise; non-null only while contexts are being built
    embedContextsCreatePromise = null;
    async ensureEmbedContexts() {
        if (this.embedContexts.length > 0) {
            this.touchActivity();
            return this.embedContexts;
        }
        if (this.embedContextsCreatePromise) {
            return await this.embedContextsCreatePromise;
        }
        this.embedContextsCreatePromise = (async () => {
            const model = await this.ensureEmbedModel();
            // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
            const n = await this.computeParallelism(150);
            const threads = await this.threadsPerContext(n);
            for (let i = 0; i < n; i++) {
                try {
                    this.embedContexts.push(await model.createEmbeddingContext({
                        ...(threads > 0 ? { threads } : {}),
                    }));
                }
                catch {
                    // Partial success is acceptable: keep whatever contexts were
                    // created; only fail hard when not even one could be built.
                    if (this.embedContexts.length === 0)
                        throw new Error("Failed to create any embedding context");
                    break;
                }
            }
            this.touchActivity();
            return this.embedContexts;
        })();
        try {
            return await this.embedContextsCreatePromise;
        }
        finally {
            // Clear only the in-flight promise; the pool itself stays cached
            this.embedContextsCreatePromise = null;
        }
    }
441
+ /**
442
+ * Get a single embed context (for single-embed calls). Uses first from pool.
443
+ */
444
+ async ensureEmbedContext() {
445
+ const contexts = await this.ensureEmbedContexts();
446
+ return contexts[0];
447
+ }
448
+ /**
449
+ * Load generation model (lazy) - context is created fresh per call
450
+ */
451
+ async ensureGenerateModel() {
452
+ if (!this.generateModel) {
453
+ if (this.generateModelLoadPromise) {
454
+ return await this.generateModelLoadPromise;
455
+ }
456
+ this.generateModelLoadPromise = (async () => {
457
+ const llama = await this.ensureLlama();
458
+ const modelPath = await this.resolveModel(this.generateModelUri);
459
+ const model = await llama.loadModel({ modelPath });
460
+ this.generateModel = model;
461
+ return model;
462
+ })();
463
+ try {
464
+ await this.generateModelLoadPromise;
465
+ }
466
+ finally {
467
+ this.generateModelLoadPromise = null;
468
+ }
469
+ }
470
+ this.touchActivity();
471
+ if (!this.generateModel) {
472
+ throw new Error("Generate model not loaded");
473
+ }
474
+ return this.generateModel;
475
+ }
476
+ /**
477
+ * Load rerank model (lazy)
478
+ */
479
+ async ensureRerankModel() {
480
+ if (this.rerankModel) {
481
+ return this.rerankModel;
482
+ }
483
+ if (this.rerankModelLoadPromise) {
484
+ return await this.rerankModelLoadPromise;
485
+ }
486
+ this.rerankModelLoadPromise = (async () => {
487
+ const llama = await this.ensureLlama();
488
+ const modelPath = await this.resolveModel(this.rerankModelUri);
489
+ const model = await llama.loadModel({ modelPath });
490
+ this.rerankModel = model;
491
+ // Model loading counts as activity - ping to keep alive
492
+ this.touchActivity();
493
+ return model;
494
+ })();
495
+ try {
496
+ return await this.rerankModelLoadPromise;
497
+ }
498
+ finally {
499
+ this.rerankModelLoadPromise = null;
500
+ }
501
+ }
502
    /**
     * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
     * Each context has its own sequence, so they can evaluate independently.
     *
     * Tuning choices:
     * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
     * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
     * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
     */
    // Rerank context size is configurable via KINDX_RERANK_CONTEXT_SIZE env var.
    // Default: 4096 tokens. Raised from 2048 to handle CJK content and longer chunks
    // without truncation crashes. Still far less than auto (40960).
    async ensureRerankContexts() {
        if (this.rerankContexts.length === 0) {
            const model = await this.ensureRerankModel();
            // ~960 MB per context with flash attention at default contextSize 4096
            const n = Math.min(await this.computeParallelism(1000), 4);
            const threads = await this.threadsPerContext(n);
            for (let i = 0; i < n; i++) {
                try {
                    this.rerankContexts.push(await model.createRankingContext({
                        contextSize: this.rerankContextSize,
                        flashAttention: true,
                        ...(threads > 0 ? { threads } : {}),
                    }));
                }
                catch {
                    if (this.rerankContexts.length === 0) {
                        // Flash attention might not be supported — retry without it
                        try {
                            this.rerankContexts.push(await model.createRankingContext({
                                contextSize: this.rerankContextSize,
                                ...(threads > 0 ? { threads } : {}),
                            }));
                        }
                        catch {
                            throw new Error("Failed to create any rerank context");
                        }
                    }
                    // Stop growing the pool after the first failure; a smaller
                    // pool is acceptable.
                    break;
                }
            }
        }
        this.touchActivity();
        return this.rerankContexts;
    }
548
+ // ==========================================================================
549
+ // Tokenization
550
+ // ==========================================================================
551
+ /**
552
+ * Tokenize text using the embedding model's tokenizer
553
+ * Returns tokenizer tokens (opaque type from node-llama-cpp)
554
+ */
555
+ async tokenize(text) {
556
+ await this.ensureEmbedContext(); // Ensure model is loaded
557
+ if (!this.embedModel) {
558
+ throw new Error("Embed model not loaded");
559
+ }
560
+ return this.embedModel.tokenize(text);
561
+ }
562
+ /**
563
+ * Count tokens in text using the embedding model's tokenizer
564
+ */
565
+ async countTokens(text) {
566
+ const tokens = await this.tokenize(text);
567
+ return tokens.length;
568
+ }
569
+ /**
570
+ * Detokenize token IDs back to text
571
+ */
572
+ async detokenize(tokens) {
573
+ await this.ensureEmbedContext();
574
+ if (!this.embedModel) {
575
+ throw new Error("Embed model not loaded");
576
+ }
577
+ return this.embedModel.detokenize(tokens);
578
+ }
579
+ // ==========================================================================
580
+ // Core API methods
581
+ // ==========================================================================
582
+ async embed(text, options = {}) {
583
+ // Ping activity at start to keep models alive during this operation
584
+ this.touchActivity();
585
+ try {
586
+ const context = await this.ensureEmbedContext();
587
+ const embedding = await context.getEmbeddingFor(text);
588
+ return {
589
+ embedding: Array.from(embedding.vector),
590
+ model: this.embedModelUri,
591
+ };
592
+ }
593
+ catch (error) {
594
+ console.error("Embedding error:", error);
595
+ return null;
596
+ }
597
+ }
598
    /**
     * Batch embed multiple texts efficiently
     * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
     *
     * Returns one entry per input text, in input order; a failed text yields
     * null at its position instead of failing the whole batch.
     */
    async embedBatch(texts) {
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        if (texts.length === 0)
            return [];
        try {
            const contexts = await this.ensureEmbedContexts();
            const n = contexts.length;
            if (n === 1) {
                // Single context: sequential (no point splitting)
                const context = contexts[0];
                const embeddings = [];
                for (const text of texts) {
                    try {
                        const embedding = await context.getEmbeddingFor(text);
                        this.touchActivity();
                        embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
                    }
                    catch (err) {
                        // Per-text failure: record null and continue
                        console.error("Embedding error for text:", err);
                        embeddings.push(null);
                    }
                }
                return embeddings;
            }
            // Multiple contexts: split texts across contexts for parallel evaluation
            const chunkSize = Math.ceil(texts.length / n);
            const chunks = Array.from({ length: n }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize));
            const chunkResults = await Promise.all(chunks.map(async (chunk, i) => {
                const ctx = contexts[i];
                const results = [];
                for (const text of chunk) {
                    try {
                        const embedding = await ctx.getEmbeddingFor(text);
                        this.touchActivity();
                        results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
                    }
                    catch (err) {
                        console.error("Embedding error for text:", err);
                        results.push(null);
                    }
                }
                return results;
            }));
            // flat() preserves input order because chunks are contiguous slices
            return chunkResults.flat();
        }
        catch (error) {
            // Pool/model setup failed: every input maps to null
            console.error("Batch embedding error:", error);
            return texts.map(() => null);
        }
    }
653
    /**
     * Generate text from a prompt using the generation model.
     *
     * A fresh context/sequence/session is created for every call and disposed
     * in the finally block, so calls never share KV-cache state.
     *
     * @param {string} prompt - prompt text
     * @param {{maxTokens?: number, temperature?: number}} [options]
     * @returns {Promise<{text: string, model: string, done: boolean}>}
     */
    async generate(prompt, options = {}) {
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        // Ensure model is loaded
        await this.ensureGenerateModel();
        // Create fresh context -> sequence -> session for each call
        const context = await this.generateModel.createContext();
        const sequence = context.getSequence();
        const session = new LlamaChatSession({ contextSequence: sequence });
        const maxTokens = options.maxTokens ?? 150;
        // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
        // DO NOT use greedy decoding (temp=0) - causes repetition loops
        const temperature = options.temperature ?? 0.7;
        let result = "";
        try {
            await session.prompt(prompt, {
                maxTokens,
                temperature,
                topK: 20,
                topP: 0.8,
                // Accumulate streamed chunks into the final result string
                onTextChunk: (text) => {
                    result += text;
                },
            });
            return {
                text: result,
                model: this.generateModelUri,
                done: true,
            };
        }
        finally {
            // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
            await context.dispose();
        }
    }
688
+ async modelExists(modelUri) {
689
+ // For HuggingFace URIs, we assume they exist
690
+ // For local paths, check if file exists
691
+ if (modelUri.startsWith("hf:")) {
692
+ return { name: modelUri, exists: true };
693
+ }
694
+ const exists = existsSync(modelUri);
695
+ return {
696
+ name: modelUri,
697
+ exists,
698
+ path: exists ? modelUri : undefined,
699
+ };
700
+ }
701
+ // ==========================================================================
702
+ // High-level abstractions
703
+ // ==========================================================================
704
    /**
     * Expand a search query into typed sub-queries ("lex", "vec", "hyde")
     * using the generation model constrained by a GBNF grammar.
     *
     * Output lines are parsed back into { type, text } entries; entries whose
     * text shares no term with the original query are dropped. If nothing
     * usable is produced (or generation throws), deterministic fallbacks
     * built from the original query are returned instead.
     *
     * @param {string} query - the user's search query
     * @param {{includeLexical?: boolean, context?: unknown}} [options]
     * @returns {Promise<Array<{type: 'lex'|'vec'|'hyde', text: string}>>}
     */
    async expandQuery(query, options = {}) {
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        const llama = await this.ensureLlama();
        await this.ensureGenerateModel();
        const includeLexical = options.includeLexical ?? true;
        // NOTE(review): `context` is read from options but not used below — confirm intent
        const context = options.context;
        // Grammar forces "type: content" lines with only the three known types
        const grammar = await llama.createGrammar({
            grammar: `
root ::= line+
line ::= type ": " content "\\n"
type ::= "lex" | "vec" | "hyde"
content ::= [^\\n]+
`
        });
        const prompt = `/no_think Expand this search query: ${query}`;
        // Create a bounded context for expansion to prevent large default VRAM allocations.
        const genContext = await this.generateModel.createContext({
            contextSize: this.expandContextSize,
        });
        const sequence = genContext.getSequence();
        const session = new LlamaChatSession({ contextSequence: sequence });
        try {
            // Qwen3 recommended settings for non-thinking mode:
            // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
            // DO NOT use greedy decoding (temp=0) - causes infinite loops
            const result = await session.prompt(prompt, {
                grammar,
                maxTokens: 600,
                temperature: 0.7,
                topK: 20,
                topP: 0.8,
                repeatPenalty: {
                    lastTokens: 64,
                    presencePenalty: 0.5,
                },
            });
            const lines = result.trim().split("\n");
            const queryLower = query.toLowerCase();
            // Tokenize the query into lowercase alphanumeric terms for overlap checks
            const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);
            // An expansion is kept only if it mentions at least one query term
            // (guards against off-topic hallucinated expansions)
            const hasQueryTerm = (text) => {
                const lower = text.toLowerCase();
                if (queryTerms.length === 0)
                    return true;
                return queryTerms.some(term => lower.includes(term));
            };
            const queryables = lines.map(line => {
                const colonIdx = line.indexOf(":");
                if (colonIdx === -1)
                    return null;
                const type = line.slice(0, colonIdx).trim();
                if (type !== 'lex' && type !== 'vec' && type !== 'hyde')
                    return null;
                const text = line.slice(colonIdx + 1).trim();
                if (!hasQueryTerm(text))
                    return null;
                return { type: type, text };
            }).filter((q) => q !== null);
            // Filter out lex entries if not requested
            const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
            if (filtered.length > 0)
                return filtered;
            // Model produced nothing usable: fall back to query-derived entries
            const fallback = [
                { type: 'hyde', text: `Information about ${query}` },
                { type: 'lex', text: query },
                { type: 'vec', text: query },
            ];
            return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
        }
        catch (error) {
            console.error("Structured query expansion failed:", error);
            // Fallback to original query
            const fallback = [{ type: 'vec', text: query }];
            if (includeLexical)
                fallback.unshift({ type: 'lex', text: query });
            return fallback;
        }
        finally {
            // Always release the bounded generation context
            await genContext.dispose();
        }
    }
785
    // Qwen3 reranker chat template overhead (system prompt, tags, separators)
    static RERANK_TEMPLATE_OVERHEAD = 200;
    // Soft target used to decide how many pool contexts to activate for a batch
    static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
    /**
     * Rerank documents against a query using the ranking model.
     *
     * Documents are token-truncated to fit the rerank context, deduplicated by
     * effective text, scored across the context pool in parallel, then mapped
     * back to every original document (duplicates share a score). Results are
     * sorted by descending score.
     *
     * @param {string} query - the search query
     * @param {Array<{file: string, text: string}>} documents - candidates to rank
     * @param {object} [options] - currently unused
     * @returns {Promise<{results: Array<{file, score, index}>, model: string}>}
     */
    async rerank(query, documents, options = {}) {
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        const contexts = await this.ensureRerankContexts();
        const model = await this.ensureRerankModel();
        // Truncate documents that would exceed the rerank context size.
        // Budget = contextSize - template overhead - query tokens
        const queryTokens = model.tokenize(query).length;
        const rawMaxDocTokens = this.rerankContextSize - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
        // Guard against non-positive budget (e.g., very long queries or CJK content with
        // high token density). Allow at minimum 128 tokens per document to avoid crashes.
        const maxDocTokens = Math.max(128, rawMaxDocTokens);
        // Cache truncation results so identical texts are tokenized only once
        const truncationCache = new Map();
        const truncatedDocs = documents.map((doc) => {
            const cached = truncationCache.get(doc.text);
            if (cached !== undefined) {
                return cached === doc.text ? doc : { ...doc, text: cached };
            }
            const tokens = model.tokenize(doc.text);
            const truncatedText = tokens.length <= maxDocTokens
                ? doc.text
                : model.detokenize(tokens.slice(0, maxDocTokens));
            truncationCache.set(doc.text, truncatedText);
            if (truncatedText === doc.text)
                return doc;
            return { ...doc, text: truncatedText };
        });
        // Deduplicate identical effective texts before scoring.
        // This avoids redundant work for repeated chunks and fixes collisions where
        // multiple docs map to the same chunk text.
        const textToDocs = new Map();
        truncatedDocs.forEach((doc, index) => {
            const existing = textToDocs.get(doc.text);
            if (existing) {
                existing.push({ file: doc.file, index });
            }
            else {
                textToDocs.set(doc.text, [{ file: doc.file, index }]);
            }
        });
        // Extract just the text for ranking
        const texts = Array.from(textToDocs.keys());
        // Split documents across contexts for parallel evaluation.
        // Each context has its own sequence with a lock, so parallelism comes
        // from multiple contexts evaluating different chunks simultaneously.
        const activeContextCount = Math.max(1, Math.min(contexts.length, Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)));
        const activeContexts = contexts.slice(0, activeContextCount);
        const chunkSize = Math.ceil(texts.length / activeContexts.length);
        const chunks = Array.from({ length: activeContexts.length }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize)).filter(chunk => chunk.length > 0);
        const allScores = await Promise.all(chunks.map((chunk, i) => activeContexts[i].rankAll(query, chunk)));
        // Reassemble scores in original order and sort
        // (flat() preserves order because chunks are contiguous slices of texts)
        const flatScores = allScores.flat();
        const ranked = texts
            .map((text, i) => ({ document: text, score: flatScores[i] }))
            .sort((a, b) => b.score - a.score);
        // Map back to our result format.
        const results = [];
        for (const item of ranked) {
            const docInfos = textToDocs.get(item.document) ?? [];
            for (const docInfo of docInfos) {
                results.push({
                    file: docInfo.file,
                    score: item.score,
                    index: docInfo.index,
                });
            }
        }
        return {
            results,
            model: this.rerankModelUri,
        };
    }
860
+ /**
861
+ * Get device/GPU info for status display.
862
+ * Initializes llama if not already done.
863
+ */
864
+ async getDeviceInfo() {
865
+ const llama = await this.ensureLlama();
866
+ const gpuDevices = await llama.getGpuDeviceNames();
867
+ let vram;
868
+ if (llama.gpu) {
869
+ try {
870
+ const state = await llama.getVramState();
871
+ vram = { total: state.total, used: state.used, free: state.free };
872
+ }
873
+ catch { /* no vram info */ }
874
+ }
875
+ return {
876
+ gpu: llama.gpu,
877
+ gpuOffloading: llama.supportsGpuOffloading,
878
+ gpuDevices,
879
+ vram,
880
+ cpuCores: llama.cpuMathCores,
881
+ };
882
+ }
883
+ async dispose() {
884
+ // Prevent double-dispose
885
+ if (this.disposed) {
886
+ return;
887
+ }
888
+ this.disposed = true;
889
+ // Clear inactivity timer
890
+ if (this.inactivityTimer) {
891
+ clearTimeout(this.inactivityTimer);
892
+ this.inactivityTimer = null;
893
+ }
894
+ // Disposing llama cascades to models and contexts automatically
895
+ // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
896
+ // Note: llama.dispose() can hang indefinitely, so we use a timeout
897
+ if (this.llama) {
898
+ const disposePromise = this.llama.dispose();
899
+ const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
900
+ await Promise.race([disposePromise, timeoutPromise]);
901
+ }
902
+ // Clear references
903
+ this.embedContexts = [];
904
+ this.rerankContexts = [];
905
+ this.embedModel = null;
906
+ this.generateModel = null;
907
+ this.rerankModel = null;
908
+ this.llama = null;
909
+ // Clear any in-flight load/create promises
910
+ this.embedModelLoadPromise = null;
911
+ this.embedContextsCreatePromise = null;
912
+ this.generateModelLoadPromise = null;
913
+ this.rerankModelLoadPromise = null;
914
+ }
915
+ }
916
// =============================================================================
// Session Management Layer
// =============================================================================
/**
 * Reference-counts scoped sessions and individual LLM operations against a
 * LlamaCpp instance. The idle-unload timer consults canUnload() so models
 * are never disposed while work is in flight.
 */
class LLMSessionManager {
    llm;
    _activeSessionCount = 0;
    _inFlightOperations = 0;
    constructor(llm) {
        this.llm = llm;
    }
    /** Number of currently held session leases. */
    get activeSessionCount() {
        return this._activeSessionCount;
    }
    /** Number of LLM calls currently executing. */
    get inFlightOperations() {
        return this._inFlightOperations;
    }
    /**
     * True only when no session lease is held AND no operation is running.
     * LlamaCpp uses this to decide whether idle unload is safe.
     */
    canUnload() {
        return this._activeSessionCount === 0 && this._inFlightOperations === 0;
    }
    /** Take a session lease. */
    acquire() {
        this._activeSessionCount += 1;
    }
    /** Return a session lease; the count is clamped so it never goes negative. */
    release() {
        if (this._activeSessionCount > 0) {
            this._activeSessionCount -= 1;
        }
    }
    /** Record that an LLM operation has started. */
    operationStart() {
        this._inFlightOperations += 1;
    }
    /** Record that an LLM operation has finished; clamped at zero. */
    operationEnd() {
        if (this._inFlightOperations > 0) {
            this._inFlightOperations -= 1;
        }
    }
    /** The LlamaCpp instance this manager guards. */
    getLlamaCpp() {
        return this.llm;
    }
}
959
/**
 * Raised when a caller attempts to use an LLM session that has already been
 * released, aborted, or timed out.
 */
export class SessionReleasedError extends Error {
    /**
     * @param message Optional override for the default explanation.
     */
    constructor(message = "LLM session has been released or aborted") {
        super(message);
        this.name = "SessionReleasedError";
    }
}
968
/**
 * Scoped LLM session with automatic lifecycle management.
 *
 * Holds one session lease on its manager for its whole lifetime and wraps
 * every LlamaCpp call with in-flight tracking plus abort checking, so the
 * underlying models cannot be unloaded mid-operation.
 */
class LLMSession {
    manager;
    released = false;
    abortController;
    maxDurationTimer = null;
    name;
    /**
     * @param manager LLMSessionManager to lease from.
     * @param options { name?, signal?, maxDuration? } — maxDuration defaults
     *                to 10 minutes; pass 0 to disable the watchdog timer.
     */
    constructor(manager, options = {}) {
        this.manager = manager;
        this.name = options.name || "unnamed";
        this.abortController = new AbortController();
        // Mirror an external abort signal onto our own controller.
        const externalSignal = options.signal;
        if (externalSignal) {
            if (externalSignal.aborted) {
                this.abortController.abort(externalSignal.reason);
            }
            else {
                externalSignal.addEventListener("abort", () => {
                    this.abortController.abort(externalSignal.reason);
                }, { once: true });
            }
        }
        // Watchdog: abort the session if it runs past its time budget.
        const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
        if (maxDuration > 0) {
            this.maxDurationTimer = setTimeout(() => {
                this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
            }, maxDuration);
            // Don't let the watchdog keep the process alive.
            this.maxDurationTimer.unref();
        }
        // Hold a lease for the session's lifetime.
        this.manager.acquire();
    }
    /** Usable only while neither released nor aborted. */
    get isValid() {
        return !this.released && !this.abortController.signal.aborted;
    }
    /** Combined abort signal (external signal, watchdog, or release). */
    get signal() {
        return this.abortController.signal;
    }
    /**
     * Release the lease and abort any in-flight work. Idempotent; invoked
     * automatically by withLLMSession once the callback settles.
     */
    release() {
        if (this.released) {
            return;
        }
        this.released = true;
        if (this.maxDurationTimer) {
            clearTimeout(this.maxDurationTimer);
            this.maxDurationTimer = null;
        }
        this.abortController.abort(new Error("Session released"));
        this.manager.release();
    }
    /**
     * Run fn with in-flight tracking, rejecting up-front if the session is
     * no longer usable.
     * @throws {SessionReleasedError} when released or aborted.
     */
    async withOperation(fn) {
        if (!this.isValid) {
            throw new SessionReleasedError();
        }
        this.manager.operationStart();
        try {
            // Re-check abort just before the call starts.
            const { signal } = this.abortController;
            if (signal.aborted) {
                throw new SessionReleasedError(signal.reason?.message || "Session aborted");
            }
            return await fn();
        }
        finally {
            this.manager.operationEnd();
        }
    }
    /** Embed a single text; tracked and abort-aware. */
    async embed(text, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
    }
    /** Embed a batch of texts; tracked and abort-aware. */
    async embedBatch(texts) {
        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
    }
    /** Expand a search query; tracked and abort-aware. */
    async expandQuery(query, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
    }
    /** Rerank documents against a query; tracked and abort-aware. */
    async rerank(query, documents, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
    }
}
1057
// Session manager for the default LlamaCpp instance
let defaultSessionManager = null;
/**
 * Lazily build (or rebuild) the session manager bound to the default
 * LlamaCpp instance. If the default instance has been swapped since the
 * cached manager was created, a fresh manager is constructed for it.
 */
function getSessionManager() {
    const llm = getDefaultLlamaCpp();
    const stale = !defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm;
    if (stale) {
        defaultSessionManager = new LLMSessionManager(llm);
    }
    return defaultSessionManager;
}
1069
/**
 * Execute a function with a scoped LLM session.
 * The session provides lifecycle guarantees - resources won't be disposed mid-operation.
 * The session is always released when the callback settles, whether it
 * resolves or throws.
 *
 * @example
 * ```typescript
 * await withLLMSession(async (session) => {
 *   const expanded = await session.expandQuery(query);
 *   const embeddings = await session.embedBatch(texts);
 *   const reranked = await session.rerank(query, docs);
 *   return reranked;
 * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
 * ```
 */
export async function withLLMSession(fn, options) {
    const session = new LLMSession(getSessionManager(), options);
    try {
        return await fn(session);
    }
    finally {
        session.release();
    }
}
1093
/**
 * Check if idle unload is safe (no active sessions or operations).
 * Used internally by LlamaCpp idle timer. When no session manager has been
 * created yet, unloading is trivially safe.
 */
export function canUnloadLLM() {
    return defaultSessionManager ? defaultSessionManager.canUnload() : true;
}
1102
// =============================================================================
// Singleton for default LlamaCpp instance
// =============================================================================
let defaultLlamaCpp = null;
/**
 * Get the default LlamaCpp instance, lazily constructing one on first use.
 * Honors the KINDX_EMBED_MODEL environment variable when present.
 */
export function getDefaultLlamaCpp() {
    if (!defaultLlamaCpp) {
        const embedModel = process.env.KINDX_EMBED_MODEL;
        defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
    }
    return defaultLlamaCpp;
}
/**
 * Set a custom default LlamaCpp instance (useful for testing)
 */
export function setDefaultLlamaCpp(llm) {
    defaultLlamaCpp = llm;
}
/**
 * Dispose the default LlamaCpp instance if it exists.
 * Call this before process exit to prevent NAPI crashes.
 */
export async function disposeDefaultLlamaCpp() {
    if (defaultLlamaCpp) {
        await defaultLlamaCpp.dispose();
        defaultLlamaCpp = null;
    }
}