@ambicuity/kindx 0.1.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as published in their respective public registries.
@@ -0,0 +1,398 @@
1
+ /**
2
+ * inference.ts - LLM abstraction layer for KINDX using node-llama-cpp
3
+ *
4
+ * Provides embeddings, text generation, and reranking using local GGUF models.
5
+ */
6
+ import { type Token as LlamaToken } from "node-llama-cpp";
7
+ /**
8
+ * Detect if a model URI uses the Qwen3-Embedding format.
9
+ * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
10
+ */
11
+ export declare function isQwen3EmbeddingModel(modelUri: string): boolean;
12
+ /**
13
+ * Format a query for embedding.
14
+ * Uses nomic-style task prefix format for embeddinggemma (default).
15
+ * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
16
+ */
17
+ export declare function formatQueryForEmbedding(query: string, modelUri?: string): string;
18
+ /**
19
+ * Format a document for embedding.
20
+ * Uses nomic-style format with title and text fields (default).
21
+ * Qwen3-Embedding encodes documents as raw text without special prefixes.
22
+ */
23
+ export declare function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string;
24
+ /**
25
+ * Token with log probability
26
+ */
27
+ export type TokenLogProb = {
28
+ token: string;
29
+ logprob: number;
30
+ };
31
+ /**
32
+ * Embedding result
33
+ */
34
+ export type EmbeddingResult = {
35
+ embedding: number[];
36
+ model: string;
37
+ };
38
+ /**
39
+ * Generation result with optional logprobs
40
+ */
41
+ export type GenerateResult = {
42
+ text: string;
43
+ model: string;
44
+ logprobs?: TokenLogProb[];
45
+ done: boolean;
46
+ };
47
+ /**
48
+ * Rerank result for a single document
49
+ */
50
+ export type RerankDocumentResult = {
51
+ file: string;
52
+ score: number;
53
+ index: number;
54
+ };
55
+ /**
56
+ * Batch rerank result
57
+ */
58
+ export type RerankResult = {
59
+ results: RerankDocumentResult[];
60
+ model: string;
61
+ };
62
+ /**
63
+ * Model info
64
+ */
65
+ export type ModelInfo = {
66
+ name: string;
67
+ exists: boolean;
68
+ path?: string;
69
+ };
70
+ /**
71
+ * Options for embedding
72
+ */
73
+ export type EmbedOptions = {
74
+ model?: string;
75
+ isQuery?: boolean;
76
+ title?: string;
77
+ };
78
+ /**
79
+ * Options for text generation
80
+ */
81
+ export type GenerateOptions = {
82
+ model?: string;
83
+ maxTokens?: number;
84
+ temperature?: number;
85
+ };
86
+ /**
87
+ * Options for reranking
88
+ */
89
+ export type RerankOptions = {
90
+ model?: string;
91
+ };
92
+ /**
93
+ * Options for LLM sessions
94
+ */
95
+ export type LLMSessionOptions = {
96
+ /** Max session duration in ms (default: 10 minutes) */
97
+ maxDuration?: number;
98
+ /** External abort signal */
99
+ signal?: AbortSignal;
100
+ /** Debug name for logging */
101
+ name?: string;
102
+ };
103
+ /**
104
+ * Session interface for scoped LLM access with lifecycle guarantees
105
+ */
106
+ export interface ILLMSession {
107
+ embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
108
+ embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
109
+ expandQuery(query: string, options?: {
110
+ context?: string;
111
+ includeLexical?: boolean;
112
+ }): Promise<Queryable[]>;
113
+ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
114
+ /** Whether this session is still valid (not released or aborted) */
115
+ readonly isValid: boolean;
116
+ /** Abort signal for this session (aborts on release or maxDuration) */
117
+ readonly signal: AbortSignal;
118
+ }
119
+ /**
120
+ * Supported query types for different search backends
121
+ */
122
+ export type QueryType = 'lex' | 'vec' | 'hyde';
123
+ /**
124
+ * A single query and its target backend type
125
+ */
126
+ export type Queryable = {
127
+ type: QueryType;
128
+ text: string;
129
+ };
130
+ /**
131
+ * Document to rerank
132
+ */
133
+ export type RerankDocument = {
134
+ file: string;
135
+ text: string;
136
+ title?: string;
137
+ };
138
+ export declare const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
139
+ export declare const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
140
+ export declare const DEFAULT_EMBED_MODEL_URI: string;
141
+ export declare const DEFAULT_RERANK_MODEL_URI: string;
142
+ export declare const DEFAULT_GENERATE_MODEL_URI: string;
143
+ export declare const DEFAULT_MODEL_CACHE_DIR: string;
144
+ export type PullResult = {
145
+ model: string;
146
+ path: string;
147
+ sizeBytes: number;
148
+ refreshed: boolean;
149
+ };
150
+ export declare function pullModels(models: string[], options?: {
151
+ refresh?: boolean;
152
+ cacheDir?: string;
153
+ }): Promise<PullResult[]>;
154
+ /**
155
+ * Abstract LLM interface - implement this for different backends
156
+ */
157
+ export interface LLM {
158
+ /**
159
+ * Get embeddings for text
160
+ */
161
+ embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
162
+ /**
163
+ * Generate text completion
164
+ */
165
+ generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
166
+ /**
167
+ * Check if a model exists/is available
168
+ */
169
+ modelExists(model: string): Promise<ModelInfo>;
170
+ /**
171
+ * Expand a search query into multiple variations for different backends.
172
+ * Returns a list of Queryable objects.
173
+ */
174
+ expandQuery(query: string, options?: {
175
+ context?: string;
176
+ includeLexical?: boolean;
177
+ }): Promise<Queryable[]>;
178
+ /**
179
+ * Rerank documents by relevance to a query
180
+ * Returns list of documents with relevance scores (higher = more relevant)
181
+ */
182
+ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
183
+ /**
184
+ * Dispose of resources
185
+ */
186
+ dispose(): Promise<void>;
187
+ }
188
+ export type LlamaCppConfig = {
189
+ embedModel?: string;
190
+ generateModel?: string;
191
+ rerankModel?: string;
192
+ modelCacheDir?: string;
193
+ /**
194
+ * Context size used for query expansion generation contexts.
195
+ * Default: 2048. Can also be set via KINDX_EXPAND_CONTEXT_SIZE.
196
+ */
197
+ expandContextSize?: number;
198
+ /**
199
+ * Context size for reranker contexts.
200
+ * Default: 4096. Can also be set via KINDX_RERANK_CONTEXT_SIZE.
201
+ * Increase this if reranking CJK or long-form content that exceeds the window.
202
+ */
203
+ rerankContextSize?: number;
204
+ /**
205
+ * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
206
+ *
207
+ * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
208
+ * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
209
+ * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
210
+ */
211
+ inactivityTimeoutMs?: number;
212
+ /**
213
+ * Whether to dispose models on inactivity (default: false).
214
+ *
215
+ * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
216
+ * memory reclaim.
217
+ */
218
+ disposeModelsOnInactivity?: boolean;
219
+ };
220
+ export declare class LlamaCpp implements LLM {
221
+ private llama;
222
+ private embedModel;
223
+ private embedContexts;
224
+ private generateModel;
225
+ private rerankModel;
226
+ private rerankContexts;
227
+ private embedModelUri;
228
+ private generateModelUri;
229
+ private rerankModelUri;
230
+ private modelCacheDir;
231
+ private rerankContextSize;
232
+ private expandContextSize;
233
+ private embedModelLoadPromise;
234
+ private generateModelLoadPromise;
235
+ private rerankModelLoadPromise;
236
+ private inactivityTimer;
237
+ private inactivityTimeoutMs;
238
+ private disposeModelsOnInactivity;
239
+ private disposed;
240
+ constructor(config?: LlamaCppConfig);
241
+ /**
242
+ * Reset the inactivity timer. Called after each model operation.
243
+ * When timer fires, models are unloaded to free memory (if no active sessions).
244
+ */
245
+ private touchActivity;
246
+ /**
247
+ * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
248
+ */
249
+ private hasLoadedContexts;
250
+ /**
251
+ * Unload idle resources but keep the instance alive for future use.
252
+ *
253
+ * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
254
+ * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
255
+ */
256
+ unloadIdleResources(): Promise<void>;
257
+ /**
258
+ * Ensure model cache directory exists
259
+ */
260
+ private ensureModelCacheDir;
261
+ /**
262
+ * Initialize the llama instance (lazy)
263
+ */
264
+ private ensureLlama;
265
+ /**
266
+ * Resolve a model URI to a local path, downloading if needed
267
+ */
268
+ private resolveModel;
269
+ /**
270
+ * Load embedding model (lazy)
271
+ */
272
+ private ensureEmbedModel;
273
+ /**
274
+ * Compute how many parallel contexts to create.
275
+ *
276
+ * GPU: constrained by VRAM (25% of free, capped at 8).
277
+ * CPU: constrained by cores. Splitting threads across contexts enables
278
+ * true parallelism (each context runs on its own cores). Use at most
279
+ * half the math cores, with at least 4 threads per context.
280
+ */
281
+ private computeParallelism;
282
+ /**
283
+ * Get the number of threads each context should use, given N parallel contexts.
284
+ * Splits available math cores evenly across contexts.
285
+ */
286
+ private threadsPerContext;
287
+ /**
288
+ * Load embedding contexts (lazy). Creates multiple for parallel embedding.
289
+ * Uses promise guard to prevent concurrent context creation race condition.
290
+ */
291
+ private embedContextsCreatePromise;
292
+ private ensureEmbedContexts;
293
+ /**
294
+ * Get a single embed context (for single-embed calls). Uses first from pool.
295
+ */
296
+ private ensureEmbedContext;
297
+ /**
298
+ * Load generation model (lazy) - context is created fresh per call
299
+ */
300
+ private ensureGenerateModel;
301
+ /**
302
+ * Load rerank model (lazy)
303
+ */
304
+ private ensureRerankModel;
305
+ /**
306
+ * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
307
+ * Each context has its own sequence, so they can evaluate independently.
308
+ *
309
+ * Tuning choices:
310
+ * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
311
+ * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
312
+ * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
313
+ */
314
+ private ensureRerankContexts;
315
+ /**
316
+ * Tokenize text using the embedding model's tokenizer
317
+ * Returns tokenizer tokens (opaque type from node-llama-cpp)
318
+ */
319
+ tokenize(text: string): Promise<readonly LlamaToken[]>;
320
+ /**
321
+ * Count tokens in text using the embedding model's tokenizer
322
+ */
323
+ countTokens(text: string): Promise<number>;
324
+ /**
325
+ * Detokenize token IDs back to text
326
+ */
327
+ detokenize(tokens: readonly LlamaToken[]): Promise<string>;
328
+ embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
329
+ /**
330
+ * Batch embed multiple texts efficiently
331
+ * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
332
+ */
333
+ embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
334
+ generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
335
+ modelExists(modelUri: string): Promise<ModelInfo>;
336
+ expandQuery(query: string, options?: {
337
+ context?: string;
338
+ includeLexical?: boolean;
339
+ }): Promise<Queryable[]>;
340
+ private static readonly RERANK_TEMPLATE_OVERHEAD;
341
+ private static readonly RERANK_TARGET_DOCS_PER_CONTEXT;
342
+ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
343
+ /**
344
+ * Get device/GPU info for status display.
345
+ * Initializes llama if not already done.
346
+ */
347
+ getDeviceInfo(): Promise<{
348
+ gpu: string | false;
349
+ gpuOffloading: boolean;
350
+ gpuDevices: string[];
351
+ vram?: {
352
+ total: number;
353
+ used: number;
354
+ free: number;
355
+ };
356
+ cpuCores: number;
357
+ }>;
358
+ dispose(): Promise<void>;
359
+ }
360
+ /**
361
+ * Error thrown when an operation is attempted on a released or aborted session.
362
+ */
363
+ export declare class SessionReleasedError extends Error {
364
+ constructor(message?: string);
365
+ }
366
+ /**
367
+ * Execute a function with a scoped LLM session.
368
+ * The session provides lifecycle guarantees - resources won't be disposed mid-operation.
369
+ *
370
+ * @example
371
+ * ```typescript
372
+ * await withLLMSession(async (session) => {
373
+ * const expanded = await session.expandQuery(query);
374
+ * const embeddings = await session.embedBatch(texts);
375
+ * const reranked = await session.rerank(query, docs);
376
+ * return reranked;
377
+ * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
378
+ * ```
379
+ */
380
+ export declare function withLLMSession<T>(fn: (session: ILLMSession) => Promise<T>, options?: LLMSessionOptions): Promise<T>;
381
+ /**
382
+ * Check if idle unload is safe (no active sessions or operations).
383
+ * Used internally by LlamaCpp idle timer.
384
+ */
385
+ export declare function canUnloadLLM(): boolean;
386
+ /**
387
+ * Get the default LlamaCpp instance (creates one if needed)
388
+ */
389
+ export declare function getDefaultLlamaCpp(): LlamaCpp;
390
+ /**
391
+ * Set a custom default LlamaCpp instance (useful for testing)
392
+ */
393
+ export declare function setDefaultLlamaCpp(llm: LlamaCpp | null): void;
394
+ /**
395
+ * Dispose the default LlamaCpp instance if it exists.
396
+ * Call this before process exit to prevent NAPI crashes.
397
+ */
398
+ export declare function disposeDefaultLlamaCpp(): Promise<void>;