@tobilu/qmd 1.0.0 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/llm.ts DELETED
@@ -1,1397 +0,0 @@
1
- /**
2
- * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
3
- *
4
- * Provides embeddings, text generation, and reranking using local GGUF models.
5
- */
6
-
7
- import {
8
- getLlama,
9
- getLlamaGpuTypes,
10
- resolveModelFile,
11
- LlamaChatSession,
12
- LlamaLogLevel,
13
- type Llama,
14
- type LlamaModel,
15
- type LlamaEmbeddingContext,
16
- type Token as LlamaToken,
17
- } from "node-llama-cpp";
18
- import { homedir } from "os";
19
- import { join } from "path";
20
- import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
21
-
22
- // =============================================================================
23
- // Embedding Formatting Functions
24
- // =============================================================================
25
-
26
/**
 * Wrap a search query in the task-prefixed form expected by the embedding model.
 *
 * @param query - Raw user query text.
 * @returns The query prefixed with `task: search result | query: `.
 */
export function formatQueryForEmbedding(query: string): string {
  const prefix = "task: search result | query: ";
  return prefix + query;
}
33
-
34
/**
 * Wrap a document in the `title: … | text: …` form used for embedding.
 *
 * @param text - Document body.
 * @param title - Optional title; a missing or empty title is rendered as `none`.
 * @returns The formatted string passed to the embedding model.
 */
export function formatDocForEmbedding(text: string, title?: string): string {
  const heading = title ? title : "none";
  return "title: " + heading + " | text: " + text;
}
41
-
42
- // =============================================================================
43
- // Types
44
- // =============================================================================
45
-
46
/**
 * A token paired with its log probability.
 */
export type TokenLogProb = {
  /** Token text. */
  token: string;
  /** Log probability associated with the token. */
  logprob: number;
};
53
-
54
/**
 * Result of embedding a single text.
 */
export type EmbeddingResult = {
  /** Embedding vector as a plain number array. */
  embedding: number[];
  /** Identifier of the model that produced the vector. */
  model: string;
};
61
-
62
/**
 * Result of a text generation call.
 */
export type GenerateResult = {
  /** Generated text. */
  text: string;
  /** Identifier of the model that produced the text. */
  model: string;
  /** Per-token log probabilities, when the backend provides them. */
  logprobs?: TokenLogProb[];
  /** Completion flag reported by the backend. */
  done: boolean;
};
71
-
72
/**
 * Score assigned to one document by a rerank call.
 */
export type RerankDocumentResult = {
  /** File identifier of the scored document. */
  file: string;
  /** Relevance score for the document. */
  score: number;
  /** Index of the document (presumably its position in the input list — confirm against the rerank implementation). */
  index: number;
};
80
-
81
/**
 * Outcome of reranking a batch of documents.
 */
export type RerankResult = {
  /** One scored entry per reranked document. */
  results: RerankDocumentResult[];
  /** Identifier of the model that produced the scores. */
  model: string;
};
88
-
89
/**
 * Availability information for a model.
 */
export type ModelInfo = {
  /** Model name or URI as supplied by the caller. */
  name: string;
  /** Whether the model is considered available. */
  exists: boolean;
  /** Local filesystem path, when one is known. */
  path?: string;
};
97
-
98
/**
 * Optional settings accepted by embedding calls.
 */
export type EmbedOptions = {
  /** Model override. */
  model?: string;
  /** Marks the text as a query rather than a document. */
  isQuery?: boolean;
  /** Title associated with the text. */
  title?: string;
};
106
-
107
/**
 * Optional settings accepted by text generation calls.
 */
export type GenerateOptions = {
  /** Model override. */
  model?: string;
  /** Upper bound on the number of generated tokens. */
  maxTokens?: number;
  /** Sampling temperature. */
  temperature?: number;
};
115
-
116
/**
 * Optional settings accepted by rerank calls.
 */
export type RerankOptions = {
  /** Model override. */
  model?: string;
};
122
-
123
/**
 * Settings for creating an LLM session.
 */
export type LLMSessionOptions = {
  /** Maximum session lifetime in milliseconds (default: 10 minutes). */
  maxDuration?: number;
  /** Caller-supplied abort signal. */
  signal?: AbortSignal;
  /** Name used to identify the session in debug logs. */
  name?: string;
};
134
-
135
/**
 * Scoped handle for LLM access with lifecycle guarantees.
 */
export interface ILLMSession {
  /** Embed a single text. */
  embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
  /** Embed several texts; the result has one slot per input text. */
  embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
  /** Expand a query into typed sub-queries. */
  expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;
  /** Score documents against a query. */
  rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
  /** True while the session has been neither released nor aborted. */
  readonly isValid: boolean;
  /** Signal that aborts when the session is released or exceeds its maxDuration. */
  readonly signal: AbortSignal;
}
148
-
149
/**
 * Query kinds, one per search backend: 'lex', 'vec' or 'hyde'.
 */
export type QueryType = 'lex' | 'vec' | 'hyde';
153
-
154
/**
 * One query string together with the backend type it targets.
 */
export type Queryable = {
  /** Backend this query is meant for. */
  type: QueryType;
  /** Query text. */
  text: string;
};
161
-
162
/**
 * A document submitted for reranking.
 */
export type RerankDocument = {
  /** File identifier of the document. */
  file: string;
  /** Document text to score. */
  text: string;
  /** Optional document title. */
  title?: string;
};
170
-
171
- // =============================================================================
172
- // Model Configuration
173
- // =============================================================================
174
-
175
// Directory where downloaded model files are cached.
const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;

// Default models, written as HuggingFace URIs for node-llama-cpp: hf:<user>/<repo>/<file>
const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
// Alternative generation model (disabled):
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";

// Public aliases of the defaults above.
export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
189
-
190
/**
 * Outcome of pulling one model with `pullModels`.
 */
export type PullResult = {
  /** Model URI or path as requested. */
  model: string;
  /** Resolved local path of the model file. */
  path: string;
  /** Size of the file at `path` in bytes, or 0 when it does not exist. */
  sizeBytes: number;
  /** True when previously cached files were deleted before resolving. */
  refreshed: boolean;
};
196
-
197
/**
 * Parsed HuggingFace model reference.
 */
type HfRef = {
  /** Repository in `<user>/<repo>` form. */
  repo: string;
  /** File path inside the repository (may contain `/`). */
  file: string;
};

/**
 * Parse a `hf:<user>/<repo>/<file>` URI.
 *
 * @param model - Model URI or local path.
 * @returns The repository and file parts, or `null` when `model` is not an
 *   `hf:` URI or any of its user, repo or file segments is missing or empty.
 */
function parseHfUri(model: string): HfRef | null {
  const prefix = "hf:";
  if (!model.startsWith(prefix)) return null;

  const [user, repoName, ...fileParts] = model.slice(prefix.length).split("/");
  if (!user || !repoName || fileParts.length === 0) return null;
  // Reject empty path segments (e.g. a trailing slash): they cannot name a file
  // and would otherwise produce a malformed download URL.
  if (fileParts.some((segment) => segment.length === 0)) return null;

  return { repo: `${user}/${repoName}`, file: fileParts.join("/") };
}
211
-
212
/**
 * Fetch the ETag of a model file on HuggingFace with a HEAD request.
 *
 * @param ref - Repository and file to look up on the `main` revision.
 * @returns The `etag` header value, or `null` when the request fails, the
 *   response is not OK, or the header is absent or empty.
 */
async function getRemoteEtag(ref: HfRef): Promise<string | null> {
  const headUrl = `https://huggingface.co/${ref.repo}/resolve/main/${ref.file}`;
  try {
    const response = await fetch(headUrl, { method: "HEAD" });
    return response.ok ? response.headers.get("etag") || null : null;
  } catch {
    // Best effort: any network error is reported as "no ETag available".
    return null;
  }
}
223
-
224
/**
 * Resolve (and download if needed) each model into the cache directory,
 * refreshing cached copies whose remote ETag has changed.
 *
 * For `hf:` URIs a `<filename>.etag` sidecar records the ETag of the cached
 * copy. Cached files are deleted before resolving when `options.refresh` is
 * set, when the remote ETag is known and differs from the recorded one, or
 * when no cached model file exists (which clears a leftover sidecar). If the
 * remote ETag cannot be fetched (e.g. offline), the existing cache is kept
 * rather than deleted.
 *
 * @param models - Model URIs or paths to pull.
 * @param options - `refresh` forces deletion of cached copies; `cacheDir`
 *   overrides the default cache directory.
 * @returns One result per model, in input order.
 */
export async function pullModels(
  models: string[],
  options: { refresh?: boolean; cacheDir?: string } = {}
): Promise<PullResult[]> {
  const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
  if (!existsSync(cacheDir)) {
    mkdirSync(cacheDir, { recursive: true });
  }

  const removeIfPresent = (target: string): void => {
    if (existsSync(target)) unlinkSync(target);
  };

  const results: PullResult[] = [];
  for (const model of models) {
    const hfRef = parseHfUri(model);
    const filename = model.split("/").pop();
    const etagPath = filename ? join(cacheDir, `${filename}.etag`) : null;

    // Cached model files whose name contains the requested filename. The
    // ".etag" sidecar also contains it, so it is excluded explicitly; otherwise
    // a lone sidecar would be mistaken for a cached model.
    const cached = filename
      ? readdirSync(cacheDir, { withFileTypes: true })
          .filter(
            (entry) =>
              entry.isFile() && entry.name.includes(filename) && !entry.name.endsWith(".etag")
          )
          .map((entry) => join(cacheDir, entry.name))
      : [];

    let refreshed = false;
    let remoteEtag: string | null = null;

    if (hfRef && etagPath) {
      remoteEtag = await getRemoteEtag(hfRef);
      const localEtag = existsSync(etagPath)
        ? readFileSync(etagPath, "utf-8").trim()
        : null;
      // Only a *known* remote ETag can prove the cache stale. A failed lookup
      // must not delete a working cached model.
      const stale = remoteEtag !== null && remoteEtag !== localEtag;

      if (options.refresh || stale || cached.length === 0) {
        for (const candidate of cached) removeIfPresent(candidate);
        removeIfPresent(etagPath);
        refreshed = cached.length > 0;
      }
    } else if (options.refresh) {
      for (const candidate of cached) removeIfPresent(candidate);
      refreshed = cached.length > 0;
    }

    const path = await resolveModelFile(model, cacheDir);
    const sizeBytes = existsSync(path) ? statSync(path).size : 0;

    if (hfRef && etagPath) {
      // Record the ETag observed before the download; retry the lookup only if
      // the first attempt failed.
      const etag = remoteEtag ?? (await getRemoteEtag(hfRef));
      if (etag) {
        writeFileSync(etagPath, etag + "\n", "utf-8");
      }
    }
    results.push({ model, path, sizeBytes, refreshed });
  }
  return results;
}
281
-
282
- // =============================================================================
283
- // LLM Interface
284
- // =============================================================================
285
-
286
/**
 * Backend-agnostic LLM contract; each backend provides its own implementation.
 */
export interface LLM {
  /** Compute an embedding for `text`. */
  embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;

  /** Produce a text completion for `prompt`. */
  generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;

  /** Report whether `model` exists / is available. */
  modelExists(model: string): Promise<ModelInfo>;

  /**
   * Turn a search query into several variations aimed at different backends.
   * Resolves to a list of Queryable objects.
   */
  expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;

  /**
   * Score documents by relevance to `query`.
   * Higher scores mean more relevant.
   */
  rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;

  /** Release all resources held by the backend. */
  dispose(): Promise<void>;
}
322
-
323
- // =============================================================================
324
- // node-llama-cpp Implementation
325
- // =============================================================================
326
-
327
/**
 * Configuration for the node-llama-cpp backed LLM implementation.
 */
export type LlamaCppConfig = {
  /** Embedding model URI or path (defaults to the built-in embed model). */
  embedModel?: string;
  /** Generation model URI or path (defaults to the built-in generate model). */
  generateModel?: string;
  /** Rerank model URI or path (defaults to the built-in rerank model). */
  rerankModel?: string;
  /** Directory used to cache downloaded models. */
  modelCacheDir?: string;
  /**
   * Inactivity timeout in ms before unloading contexts (default: 5 minutes, 0 to disable).
   *
   * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
   * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
   * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
   */
  inactivityTimeoutMs?: number;
  /**
   * Whether to dispose models on inactivity (default: false).
   *
   * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
   * memory reclaim.
   */
  disposeModelsOnInactivity?: boolean;
};
348
-
349
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;

/**
 * LLM implementation using node-llama-cpp
 */
355
- export class LlamaCpp implements LLM {
356
- private llama: Llama | null = null;
357
- private embedModel: LlamaModel | null = null;
358
- private embedContexts: LlamaEmbeddingContext[] = [];
359
- private generateModel: LlamaModel | null = null;
360
- private rerankModel: LlamaModel | null = null;
361
- private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
362
-
363
- private embedModelUri: string;
364
- private generateModelUri: string;
365
- private rerankModelUri: string;
366
- private modelCacheDir: string;
367
-
368
- // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
369
- private embedModelLoadPromise: Promise<LlamaModel> | null = null;
370
- private generateModelLoadPromise: Promise<LlamaModel> | null = null;
371
- private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
372
-
373
- // Inactivity timer for auto-unloading models
374
- private inactivityTimer: ReturnType<typeof setTimeout> | null = null;
375
- private inactivityTimeoutMs: number;
376
- private disposeModelsOnInactivity: boolean;
377
-
378
- // Track disposal state to prevent double-dispose
379
- private disposed = false;
380
-
381
-
382
/**
 * Store the configuration, falling back to the module defaults for anything
 * omitted. Only fields are assigned here; nothing is loaded.
 *
 * @param config - Optional overrides for model URIs, cache directory and idle behaviour.
 */
constructor(config: LlamaCppConfig = {}) {
  const {
    embedModel,
    generateModel,
    rerankModel,
    modelCacheDir,
    inactivityTimeoutMs,
    disposeModelsOnInactivity,
  } = config;

  // Empty strings fall back to the defaults as well (|| rather than ??).
  this.embedModelUri = embedModel || DEFAULT_EMBED_MODEL;
  this.generateModelUri = generateModel || DEFAULT_GENERATE_MODEL;
  this.rerankModelUri = rerankModel || DEFAULT_RERANK_MODEL;
  this.modelCacheDir = modelCacheDir || MODEL_CACHE_DIR;

  // A timeout of 0 is meaningful (disables the timer), so only undefined/null use the default.
  this.inactivityTimeoutMs = inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
  this.disposeModelsOnInactivity = disposeModelsOnInactivity ?? false;
}
390
-
391
/**
 * Restart the inactivity countdown; called around model operations.
 *
 * Any pending timer is cancelled. A new one is armed only when the timeout is
 * positive and at least one context is loaded. When it fires, idle resources
 * are unloaded unless `canUnloadLLM()` reports that unloading is not allowed,
 * in which case the countdown is simply restarted.
 */
private touchActivity(): void {
  if (this.inactivityTimer) {
    clearTimeout(this.inactivityTimer);
    this.inactivityTimer = null;
  }

  // Nothing to schedule when the timeout is disabled or no context is loaded.
  if (!(this.inactivityTimeoutMs > 0) || !this.hasLoadedContexts()) {
    return;
  }

  const onIdle = (): void => {
    // canUnloadLLM is defined later in this file; the typeof guard tolerates
    // it not being available.
    const unloadBlocked = typeof canUnloadLLM === 'function' && !canUnloadLLM();
    if (unloadBlocked) {
      // Still in use: try again after another full timeout.
      this.touchActivity();
      return;
    }
    this.unloadIdleResources().catch(err => {
      console.error("Error unloading idle resources:", err);
    });
  };

  const timer = setTimeout(onIdle, this.inactivityTimeoutMs);
  this.inactivityTimer = timer;
  // The timer alone must not keep the process alive.
  timer.unref();
}
421
-
422
/**
 * Report whether any embedding or rerank context is currently pooled, i.e.
 * whether there is something for the inactivity timer to unload.
 */
private hasLoadedContexts(): boolean {
  const pooled = this.embedContexts.length + this.rerankContexts.length;
  return pooled > 0;
}
428
-
429
/**
 * Unload idle resources but keep the instance alive for future use.
 *
 * Contexts are always disposed; models are disposed only when
 * `disposeModelsOnInactivity` is set. The llama instance itself is kept.
 *
 * Pools and model fields are cleared *before* the asynchronous disposal
 * starts, so concurrent callers never receive an object that is being (or has
 * been) disposed, and a failing dispose cannot leave stale entries behind.
 * Every resource is attempted; the first disposal error, if any, is rethrown
 * once all attempts have finished.
 */
async unloadIdleResources(): Promise<void> {
  // Nothing to do once the instance has been disposed.
  if (this.disposed) {
    return;
  }

  if (this.inactivityTimer) {
    clearTimeout(this.inactivityTimer);
    this.inactivityTimer = null;
  }

  // Detach everything that will be disposed. Contexts come first so they are
  // disposed before the models they belong to.
  const idle: { dispose(): Promise<void> }[] = [
    ...this.embedContexts,
    ...this.rerankContexts,
  ];
  this.embedContexts = [];
  this.rerankContexts = [];

  if (this.disposeModelsOnInactivity) {
    for (const model of [this.embedModel, this.generateModel, this.rerankModel]) {
      if (model) idle.push(model);
    }
    this.embedModel = null;
    this.generateModel = null;
    this.rerankModel = null;
    // Reset load promises so models can be reloaded later.
    this.embedModelLoadPromise = null;
    this.generateModelLoadPromise = null;
    this.rerankModelLoadPromise = null;
  }

  let failed = false;
  let firstError: unknown;
  for (const resource of idle) {
    try {
      await resource.dispose();
    } catch (err) {
      if (!failed) {
        failed = true;
        firstError = err;
      }
    }
  }
  if (failed) {
    throw firstError;
  }
}
479
-
480
/**
 * Create the model cache directory (including parents) if it is missing.
 */
private ensureModelCacheDir(): void {
  const dir = this.modelCacheDir;
  if (existsSync(dir)) {
    return;
  }
  mkdirSync(dir, { recursive: true });
}
488
-
489
/**
 * Return the llama instance, creating it on first use.
 *
 * The backend is chosen explicitly from the GPU types reported as available,
 * in the order CUDA, Metal, Vulkan, with CPU as the last resort. The original
 * author notes that gpu:"auto" could not be relied on because it returned
 * false even when CUDA was available. If the chosen GPU backend fails to
 * initialize, CPU is used and a warning is written to stderr; a second warning
 * is written whenever the resulting instance has no GPU.
 */
private async ensureLlama(): Promise<Llama> {
  if (this.llama) {
    return this.llama;
  }

  const available = await getLlamaGpuTypes();
  const backendOrder = ["cuda", "metal", "vulkan"] as const;
  const preferred = backendOrder.find(backend => available.includes(backend));

  const loadCpu = (): Promise<Llama> =>
    getLlama({ gpu: false, logLevel: LlamaLogLevel.error });

  let instance: Llama;
  if (!preferred) {
    instance = await loadCpu();
  } else {
    try {
      instance = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
    } catch {
      instance = await loadCpu();
      process.stderr.write(
        `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
      );
    }
  }

  if (!instance.gpu) {
    process.stderr.write(
      "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
    );
  }

  this.llama = instance;
  return instance;
}
524
-
525
/**
 * Turn a model URI into a local file path inside the cache directory.
 * `resolveModelFile` understands HF URIs and downloads the file when needed.
 *
 * @param modelUri - Model URI or path to resolve.
 * @returns Local path of the model file.
 */
private async resolveModel(modelUri: string): Promise<string> {
  this.ensureModelCacheDir();
  const localPath = await resolveModelFile(modelUri, this.modelCacheDir);
  return localPath;
}
533
-
534
/**
 * Return the embedding model, loading it on first use.
 *
 * Concurrent callers share one in-flight load through `embedModelLoadPromise`,
 * so the model is not loaded twice.
 */
private async ensureEmbedModel(): Promise<LlamaModel> {
  if (this.embedModel) {
    return this.embedModel;
  }
  if (this.embedModelLoadPromise) {
    return await this.embedModelLoadPromise;
  }

  const load = async (): Promise<LlamaModel> => {
    const llama = await this.ensureLlama();
    const modelPath = await this.resolveModel(this.embedModelUri);
    const loaded = await llama.loadModel({ modelPath });
    this.embedModel = loaded;
    // Loading counts as activity.
    this.touchActivity();
    return loaded;
  };

  const pending = load();
  this.embedModelLoadPromise = pending;
  try {
    return await pending;
  } finally {
    // The loaded model stays cached in this.embedModel; only the in-flight marker is cleared.
    this.embedModelLoadPromise = null;
  }
}
562
-
563
/**
 * Decide how many parallel contexts to create.
 *
 * GPU: as many contexts as fit into 25% of the free VRAM at `perContextMB`
 * each, clamped to 1..8; 2 if the VRAM state cannot be queried.
 * CPU: one context per 4 math cores, clamped to 1..4 (the core count defaults
 * to 4 when it is not reported).
 *
 * @param perContextMB - Estimated memory per context in MB.
 * @returns Number of contexts to create (at least 1).
 */
private async computeParallelism(perContextMB: number): Promise<number> {
  const llama = await this.ensureLlama();

  if (!llama.gpu) {
    const cores = llama.cpuMathCores || 4;
    const byCores = Math.floor(cores / 4);
    return Math.max(1, Math.min(4, byCores));
  }

  try {
    const { free } = await llama.getVramState();
    const freeMB = free / (1024 * 1024);
    const byVram = Math.floor((freeMB * 0.25) / perContextMB);
    return Math.max(1, Math.min(8, byVram));
  } catch {
    return 2;
  }
}
590
-
591
/**
 * Thread count for each of `parallelism` contexts.
 *
 * @param parallelism - Number of contexts that will run side by side.
 * @returns 0 on GPU (meaning: let the library decide); on CPU the math cores
 *   divided evenly across contexts, at least 1.
 */
private async threadsPerContext(parallelism: number): Promise<number> {
  const llama = await this.ensureLlama();
  if (llama.gpu) {
    return 0;
  }
  const cores = llama.cpuMathCores || 4;
  const share = Math.floor(cores / parallelism);
  return Math.max(1, share);
}
601
-
602
// In-flight creation of the embedding context pool, shared by concurrent callers.
private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;

/**
 * Return the pool of embedding contexts, creating it on first use.
 *
 * The pool size comes from `computeParallelism(150)`. If creating a context
 * fails after at least one succeeded, the smaller pool is used; if the very
 * first one fails, an error is thrown. Concurrent callers share a single
 * creation run through `embedContextsCreatePromise`.
 */
private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
  if (this.embedContexts.length > 0) {
    this.touchActivity();
    return this.embedContexts;
  }
  if (this.embedContextsCreatePromise) {
    return await this.embedContextsCreatePromise;
  }

  const buildPool = async (): Promise<LlamaEmbeddingContext[]> => {
    const model = await this.ensureEmbedModel();
    // Author's estimate: embed contexts are ~143 MB each (nomic-embed 2048 ctx).
    const wanted = await this.computeParallelism(150);
    const threads = await this.threadsPerContext(wanted);
    // threads === 0 means "let the library decide", so the option is omitted.
    const threadOption = threads > 0 ? { threads } : {};

    for (let created = 0; created < wanted; created++) {
      try {
        this.embedContexts.push(await model.createEmbeddingContext({ ...threadOption }));
      } catch {
        if (this.embedContexts.length === 0) {
          throw new Error("Failed to create any embedding context");
        }
        break;
      }
    }
    this.touchActivity();
    return this.embedContexts;
  };

  const pending = buildPool();
  this.embedContextsCreatePromise = pending;
  try {
    return await pending;
  } finally {
    this.embedContextsCreatePromise = null;
  }
}
643
-
644
/**
 * Return one embedding context for single-text calls: the first of the pool.
 */
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
  const [first] = await this.ensureEmbedContexts();
  return first!;
}
651
-
652
/**
 * Return the generation model, loading it on first use. Generation contexts
 * are not cached here; callers create a fresh one per call.
 *
 * Concurrent callers share one in-flight load through
 * `generateModelLoadPromise`.
 *
 * @throws Error if the model is still missing after the load attempt.
 */
private async ensureGenerateModel(): Promise<LlamaModel> {
  if (!this.generateModel) {
    if (this.generateModelLoadPromise) {
      return await this.generateModelLoadPromise;
    }

    const load = async (): Promise<LlamaModel> => {
      const llama = await this.ensureLlama();
      const modelPath = await this.resolveModel(this.generateModelUri);
      const loaded = await llama.loadModel({ modelPath });
      this.generateModel = loaded;
      return loaded;
    };

    const pending = load();
    this.generateModelLoadPromise = pending;
    try {
      await pending;
    } finally {
      this.generateModelLoadPromise = null;
    }
  }

  this.touchActivity();
  const model = this.generateModel;
  if (!model) {
    throw new Error("Generate model not loaded");
  }
  return model;
}
681
-
682
/**
 * Return the rerank model, loading it on first use.
 *
 * Concurrent callers share one in-flight load through `rerankModelLoadPromise`.
 */
private async ensureRerankModel(): Promise<LlamaModel> {
  if (this.rerankModel) {
    return this.rerankModel;
  }
  if (this.rerankModelLoadPromise) {
    return await this.rerankModelLoadPromise;
  }

  const load = async (): Promise<LlamaModel> => {
    const llama = await this.ensureLlama();
    const modelPath = await this.resolveModel(this.rerankModelUri);
    const loaded = await llama.loadModel({ modelPath });
    this.rerankModel = loaded;
    // Loading counts as activity.
    this.touchActivity();
    return loaded;
  };

  const pending = load();
  this.rerankModelLoadPromise = pending;
  try {
    return await pending;
  } finally {
    this.rerankModelLoadPromise = null;
  }
}
709
-
710
// Context size for rerank contexts. Per the original author's notes: the Qwen3
// reranker template adds ~200 tokens of overhead and chunks are at most 800
// tokens, so ~1100 tokens is typical; 2048 leaves a safety margin and is still
// far smaller than the automatic size (40960).
private static readonly RERANK_CONTEXT_SIZE = 2048;

// In-flight creation of the rerank context pool, shared by concurrent callers.
private rerankContextsCreatePromise: Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> | null = null;

/**
 * Return the pool of rerank contexts, creating it on first use.
 *
 * The pool size comes from `computeParallelism(1000)` (author's estimate:
 * ~960 MB per context with flash attention at contextSize 2048). Contexts are
 * created with flash attention first; if the very first context cannot be
 * created that way, flash attention is turned off for the whole pool and
 * creation continues, so devices without flash attention still get a full
 * pool. A failure after at least one context exists stops creation and the
 * smaller pool is used.
 *
 * Concurrent callers share a single creation run through
 * `rerankContextsCreatePromise`, mirroring `ensureEmbedContexts`, so the pool
 * is never built twice in parallel.
 *
 * @throws Error if no context can be created with or without flash attention.
 */
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
  if (this.rerankContexts.length > 0) {
    this.touchActivity();
    return this.rerankContexts;
  }
  if (this.rerankContextsCreatePromise) {
    return await this.rerankContextsCreatePromise;
  }

  this.rerankContextsCreatePromise = (async () => {
    const model = await this.ensureRerankModel();
    const n = await this.computeParallelism(1000);
    const threads = await this.threadsPerContext(n);
    let flashAttention = true;

    for (let i = 0; i < n; ) {
      try {
        this.rerankContexts.push(await model.createRankingContext({
          contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
          ...(flashAttention ? { flashAttention: true } : {}),
          ...(threads > 0 ? { threads } : {}),
        }));
        i++;
      } catch {
        // Some contexts exist already: keep the smaller pool.
        if (this.rerankContexts.length > 0) break;
        // Already retried without flash attention: give up.
        if (!flashAttention) {
          throw new Error("Failed to create any rerank context");
        }
        // Flash attention might not be supported — retry this slot without it.
        flashAttention = false;
      }
    }

    this.touchActivity();
    return this.rerankContexts;
  })();

  try {
    return await this.rerankContextsCreatePromise;
  } finally {
    this.rerankContextsCreatePromise = null;
  }
}
756
-
757
- // ==========================================================================
758
- // Tokenization
759
- // ==========================================================================
760
-
761
/**
 * Split text into tokens with the embedding model's tokenizer.
 *
 * @param text - Text to tokenize.
 * @returns Tokens in node-llama-cpp's opaque token type.
 * @throws Error if the embedding model is not available after loading.
 */
async tokenize(text: string): Promise<readonly LlamaToken[]> {
  // Obtaining an embed context also loads the embedding model.
  await this.ensureEmbedContext();
  const model = this.embedModel;
  if (!model) {
    throw new Error("Embed model not loaded");
  }
  return model.tokenize(text);
}
772
-
773
/**
 * Number of tokens `text` produces with the embedding model's tokenizer.
 *
 * @param text - Text to measure.
 */
async countTokens(text: string): Promise<number> {
  return (await this.tokenize(text)).length;
}
780
-
781
/**
 * Convert tokens back into text with the embedding model's tokenizer.
 *
 * @param tokens - Tokens previously produced by the tokenizer.
 * @returns The decoded text.
 * @throws Error if the embedding model is not available after loading.
 */
async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
  // Obtaining an embed context also loads the embedding model.
  await this.ensureEmbedContext();
  const model = this.embedModel;
  if (!model) {
    throw new Error("Embed model not loaded");
  }
  return model.detokenize(tokens);
}
791
-
792
- // ==========================================================================
793
- // Core API methods
794
- // ==========================================================================
795
-
796
/**
 * Embed a single text with the first pooled embedding context.
 *
 * @param text - Text to embed, used as given.
 * @param options - Accepted for interface compatibility; not read by this implementation.
 * @returns The embedding and model URI, or `null` if embedding fails (the error is logged).
 */
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
  // Mark activity up front so the idle timer restarts for this operation.
  this.touchActivity();

  try {
    const ctx = await this.ensureEmbedContext();
    const { vector } = await ctx.getEmbeddingFor(text);
    return { embedding: Array.from(vector), model: this.embedModelUri };
  } catch (error) {
    console.error("Embedding error:", error);
    return null;
  }
}
813
-
814
/**
 * Embed several texts, spreading them over the pooled embedding contexts.
 *
 * With one context the texts are embedded one after another. With several,
 * the texts are cut into contiguous slices, one per context; slices run in
 * parallel and each slice is processed sequentially. Results keep input order.
 *
 * @param texts - Texts to embed.
 * @returns One entry per text; `null` where that text failed. If setup fails,
 *   every entry is `null`.
 */
async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
  // Mark activity up front so the idle timer restarts for this operation.
  this.touchActivity();

  if (texts.length === 0) return [];

  // Embed a slice sequentially on one context, recording null per failed text.
  const embedSequentially = async (
    ctx: LlamaEmbeddingContext,
    slice: string[]
  ): Promise<(EmbeddingResult | null)[]> => {
    const out: (EmbeddingResult | null)[] = [];
    for (const text of slice) {
      try {
        const { vector } = await ctx.getEmbeddingFor(text);
        this.touchActivity();
        out.push({ embedding: Array.from(vector), model: this.embedModelUri });
      } catch (err) {
        console.error("Embedding error for text:", err);
        out.push(null);
      }
    }
    return out;
  };

  try {
    const contexts = await this.ensureEmbedContexts();
    const poolSize = contexts.length;

    if (poolSize === 1) {
      return await embedSequentially(contexts[0]!, texts);
    }

    const perContext = Math.ceil(texts.length / poolSize);
    const jobs = contexts.map((ctx, i) =>
      embedSequentially(ctx, texts.slice(i * perContext, (i + 1) * perContext))
    );
    const slices = await Promise.all(jobs);
    return slices.flat();
  } catch (error) {
    console.error("Batch embedding error:", error);
    return texts.map(() => null);
  }
}
875
-
876
/**
 * Generate a completion for `prompt` using a fresh context per call.
 *
 * The context is disposed in all cases, including when creating the sequence
 * or chat session fails.
 *
 * @param prompt - Prompt text sent to the chat session.
 * @param options - `maxTokens` (default 150) and `temperature` (default 0.7).
 * @returns The generated text with the model URI and `done: true`.
 */
async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
  // Mark activity up front so the idle timer restarts for this operation.
  this.touchActivity();

  // Use the model returned by the loader instead of re-reading a nullable field.
  const model = await this.ensureGenerateModel();

  const maxTokens = options.maxTokens ?? 150;
  // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode.
  // DO NOT use greedy decoding (temp=0) - causes repetition loops.
  const temperature = options.temperature ?? 0.7;

  // Fresh context -> sequence -> session for each call.
  const context = await model.createContext();
  try {
    const session = new LlamaChatSession({ contextSequence: context.getSequence() });

    let text = "";
    await session.prompt(prompt, {
      maxTokens,
      temperature,
      topK: 20,
      topP: 0.8,
      onTextChunk: (chunk) => {
        text += chunk;
      },
    });

    return {
      text,
      model: this.generateModelUri,
      done: true,
    };
  } finally {
    // Disposing the context also disposes its dependent sequences/sessions.
    await context.dispose();
  }
}
915
-
916
/**
 * Report whether a model is available.
 *
 * `hf:` URIs are assumed to exist and are not checked. Anything else is
 * treated as a local path and checked on disk.
 *
 * @param modelUri - HuggingFace URI or local file path.
 * @returns Model info; `path` is set only for a local file that exists.
 */
async modelExists(modelUri: string): Promise<ModelInfo> {
  const isRemote = modelUri.startsWith("hf:");
  if (isRemote) {
    return { name: modelUri, exists: true };
  }

  const onDisk = existsSync(modelUri);
  return {
    name: modelUri,
    exists: onDisk,
    path: onDisk ? modelUri : undefined,
  };
}
930
-
931
- // ==========================================================================
932
- // High-level abstractions
933
- // ==========================================================================
934
-
935
/**
 * Expand a search query into typed sub-queries (`lex`, `vec`, `hyde`) using the generation model.
 *
 * The model output is constrained by a grammar to `type: content` lines. Each line is parsed,
 * and lines whose text shares no term with the original query are discarded. When nothing
 * survives, a fixed fallback derived from the original query is returned. When generation itself
 * throws, the error is logged and a smaller lex/vec fallback is returned.
 *
 * Failures that happen before generation starts (loading llama or the model, building the
 * grammar, creating the context) are not caught and reject the returned promise.
 *
 * @param query - The user's search query.
 * @param options - `includeLexical` (default `true`) controls whether `lex` entries are returned.
 *   `context` is accepted by the signature but is not used by this implementation.
 * @returns A non-empty list of queryables.
 */
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean } = {}): Promise<Queryable[]> {
  // Ping activity at start to keep models alive during this operation
  this.touchActivity();

  const llama = await this.ensureLlama();
  await this.ensureGenerateModel();

  const includeLexical = options.includeLexical ?? true;

  const grammar = await llama.createGrammar({
    grammar: `
      root ::= line+
      line ::= type ": " content "\\n"
      type ::= "lex" | "vec" | "hyde"
      content ::= [^\\n]+
    `
  });

  const prompt = `/no_think Expand this search query: ${query}`;

  // Create fresh context for each call
  const genContext = await this.generateModel!.createContext();
  const sequence = genContext.getSequence();
  const session = new LlamaChatSession({ contextSequence: sequence });

  try {
    // Qwen3 recommended settings for non-thinking mode:
    // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
    // DO NOT use greedy decoding (temp=0) - causes infinite loops
    const result = await session.prompt(prompt, {
      grammar,
      maxTokens: 600,
      temperature: 0.7,
      topK: 20,
      topP: 0.8,
      repeatPenalty: {
        lastTokens: 64,
        presencePenalty: 0.5,
      },
    });

    const lines = result.trim().split("\n");
    const queryLower = query.toLowerCase();
    // Only ASCII letters/digits survive as terms; a query with none of them yields no terms.
    const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);

    // With no usable terms every line is accepted; otherwise one substring match is enough.
    const hasQueryTerm = (text: string): boolean => {
      const lower = text.toLowerCase();
      if (queryTerms.length === 0) return true;
      return queryTerms.some(term => lower.includes(term));
    };

    const queryables: Queryable[] = lines.map(line => {
      const colonIdx = line.indexOf(":");
      if (colonIdx === -1) return null;
      const type = line.slice(0, colonIdx).trim();
      if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null;
      const text = line.slice(colonIdx + 1).trim();
      if (!hasQueryTerm(text)) return null;
      return { type: type as QueryType, text };
    }).filter((q): q is Queryable => q !== null);

    // Filter out lex entries if not requested
    const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
    if (filtered.length > 0) return filtered;

    // Nothing usable came back from the model: fall back to the raw query.
    const fallback: Queryable[] = [
      { type: 'hyde', text: `Information about ${query}` },
      { type: 'lex', text: query },
      { type: 'vec', text: query },
    ];
    return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
  } catch (error) {
    console.error("Structured query expansion failed:", error);
    // Fallback to original query
    const fallback: Queryable[] = [{ type: 'vec', text: query }];
    if (includeLexical) fallback.unshift({ type: 'lex', text: query });
    return fallback;
  } finally {
    // Disposing the context also releases the sequence/session created from it.
    await genContext.dispose();
  }
}
1017
-
1018
/**
 * Score `documents` against `query` with the rerank model and return them best-first.
 *
 * Documents are split into contiguous chunks, one per rerank context, and the chunks are scored
 * concurrently. Scores are paired with documents by position, so documents that happen to share
 * identical text still keep their own `file` and `index`.
 *
 * @param query - The search query to rank against.
 * @param documents - Candidate documents; each result's `index` refers to a position in this array.
 * @param options - Accepted by the signature; not used by this implementation.
 * @returns Results sorted by descending score, plus the rerank model URI.
 */
async rerank(
  query: string,
  documents: RerankDocument[],
  options: RerankOptions = {}
): Promise<RerankResult> {
  // Ping activity at start to keep models alive during this operation
  this.touchActivity();

  const contexts = await this.ensureRerankContexts();

  // Extract just the text for ranking
  const texts = documents.map((doc) => doc.text);

  // Split documents across contexts for parallel evaluation.
  // Each context has its own sequence with a lock, so parallelism comes
  // from multiple contexts evaluating different chunks simultaneously.
  const n = contexts.length;
  const chunkSize = Math.ceil(texts.length / n);
  const chunks = Array.from({ length: n }, (_, i) =>
    texts.slice(i * chunkSize, (i + 1) * chunkSize)
  ).filter(chunk => chunk.length > 0);

  const allScores = await Promise.all(
    chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk))
  );

  // Chunks are contiguous and in order, so the flattened scores line up with `documents`.
  const flatScores = allScores.flat();

  // Pair by position rather than by text: a text-keyed lookup collapses duplicate texts
  // onto a single document and reports the wrong file/index for the others.
  const results: RerankDocumentResult[] = documents
    .map((doc, index) => ({ file: doc.file, score: flatScores[index]!, index }))
    .sort((a, b) => b.score - a.score);

  return {
    results,
    model: this.rerankModelUri,
  };
}
1071
-
1072
/**
 * Collect device/GPU details for status display.
 * Initializes llama first if that has not happened yet.
 *
 * VRAM figures are only queried when llama reports a GPU backend; if that query throws,
 * `vram` is left undefined instead of failing the whole call.
 */
async getDeviceInfo(): Promise<{
  gpu: string | false;
  gpuOffloading: boolean;
  gpuDevices: string[];
  vram?: { total: number; used: number; free: number };
  cpuCores: number;
}> {
  const llama = await this.ensureLlama();
  const gpuDevices = await llama.getGpuDeviceNames();

  const readVram = async (): Promise<{ total: number; used: number; free: number } | undefined> => {
    try {
      const { total, used, free } = await llama.getVramState();
      return { total, used, free };
    } catch {
      // no vram info
      return undefined;
    }
  };

  const vram = llama.gpu ? await readVram() : undefined;

  return {
    gpu: llama.gpu,
    gpuOffloading: llama.supportsGpuOffloading,
    gpuDevices,
    vram,
    cpuCores: llama.cpuMathCores,
  };
}
1100
-
1101
/**
 * Dispose the llama instance and drop every cached model, context and pending load promise.
 *
 * Safe to call more than once: only the first call does any work. Waiting for the native
 * dispose is capped at one second because `llama.dispose()` can hang indefinitely.
 */
async dispose(): Promise<void> {
  // Prevent double-dispose
  if (this.disposed) {
    return;
  }
  this.disposed = true;

  // Clear inactivity timer
  if (this.inactivityTimer) {
    clearTimeout(this.inactivityTimer);
    this.inactivityTimer = null;
  }

  // Disposing llama cascades to models and contexts automatically
  // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
  // Note: llama.dispose() can hang indefinitely, so we use a timeout
  if (this.llama) {
    const disposeTimeoutMs = 1000;
    const disposePromise = this.llama.dispose();
    // If the timeout wins the race, a rejection arriving later would have no handler.
    // A rejection that arrives first still propagates through Promise.race below.
    disposePromise.catch(() => {});

    let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<void>((resolve) => {
      timeoutHandle = setTimeout(resolve, disposeTimeoutMs);
    });

    try {
      await Promise.race([disposePromise, timeoutPromise]);
    } finally {
      // Do not leave the timer pending once the race has settled.
      clearTimeout(timeoutHandle);
    }
  }

  // Clear references
  this.embedContexts = [];
  this.rerankContexts = [];
  this.embedModel = null;
  this.generateModel = null;
  this.rerankModel = null;
  this.llama = null;

  // Clear any in-flight load/create promises
  this.embedModelLoadPromise = null;
  this.embedContextsCreatePromise = null;
  this.generateModelLoadPromise = null;
  this.rerankModelLoadPromise = null;
}
1137
- }
1138
-
1139
- // =============================================================================
1140
- // Session Management Layer
1141
- // =============================================================================
1142
-
1143
/**
 * Reference-counting bookkeeping for LLM sessions built on one LlamaCpp instance.
 *
 * Two independent counters are kept: open session leases and operations currently running.
 * `canUnload()` reports true only when both are zero. Neither counter ever goes below zero,
 * even if `release()` / `operationEnd()` are called more often than their counterparts.
 */
class LLMSessionManager {
  private sessionLeases = 0;
  private pendingOperations = 0;

  constructor(private llm: LlamaCpp) {}

  /** Number of sessions acquired and not yet released. */
  get activeSessionCount(): number {
    return this.sessionLeases;
  }

  /** Number of operations started and not yet ended. */
  get inFlightOperations(): number {
    return this.pendingOperations;
  }

  /**
   * True only when there are no open sessions and no running operations.
   */
  canUnload(): boolean {
    const busy = this.sessionLeases !== 0 || this.pendingOperations !== 0;
    return !busy;
  }

  /** Register one more open session. */
  acquire(): void {
    this.sessionLeases += 1;
  }

  /** Drop one session lease; a surplus call is ignored. */
  release(): void {
    if (this.sessionLeases > 0) {
      this.sessionLeases -= 1;
    }
  }

  /** Register one more running operation. */
  operationStart(): void {
    this.pendingOperations += 1;
  }

  /** Mark one operation as finished; a surplus call is ignored. */
  operationEnd(): void {
    if (this.pendingOperations > 0) {
      this.pendingOperations -= 1;
    }
  }

  /** The LlamaCpp instance this manager was constructed with. */
  getLlamaCpp(): LlamaCpp {
    return this.llm;
  }
}
1192
-
1193
/**
 * Raised when work is attempted through a session that is no longer usable,
 * i.e. one that has been released or whose abort signal has fired.
 */
export class SessionReleasedError extends Error {
  /**
   * @param message - Optional override for the default explanation.
   */
  constructor(message = "LLM session has been released or aborted") {
    super(message);
    // Replace the inherited "Error" label so logs and `error.name` checks identify this type.
    this.name = "SessionReleasedError";
  }
}
1202
-
1203
/**
 * Scoped LLM session with automatic lifecycle management.
 *
 * On construction the session takes a lease on the manager; `release()` returns it exactly once.
 * Every LLM call goes through `withOperation`, which refuses to start on a released or aborted
 * session and reports the call to the manager as an in-flight operation while it runs.
 *
 * The session aborts when the caller-supplied signal aborts, when `maxDuration` elapses, or when
 * it is released. Aborting only prevents new operations from starting; operations that are
 * already running are not interrupted by this class.
 */
class LLMSession implements ILLMSession {
  private manager: LLMSessionManager;
  private released = false;
  private abortController: AbortController;
  private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
  private name: string;
  /** Removes the listener installed on the caller-supplied signal; null when none is installed. */
  private detachExternalSignal: (() => void) | null = null;

  /**
   * @param manager - Manager whose session/operation counters this session updates.
   * @param options - `name` (defaults to "unnamed"), optional external `signal`, and
   *   `maxDuration` in milliseconds (defaults to 10 minutes; values <= 0 disable the timer).
   */
  constructor(manager: LLMSessionManager, options: LLMSessionOptions = {}) {
    this.manager = manager;
    this.name = options.name || "unnamed";
    this.abortController = new AbortController();

    // Link external abort signal if provided
    const externalSignal = options.signal;
    if (externalSignal) {
      if (externalSignal.aborted) {
        this.abortController.abort(externalSignal.reason);
      } else {
        const onExternalAbort = () => {
          this.abortController.abort(externalSignal.reason);
        };
        externalSignal.addEventListener("abort", onExternalAbort, { once: true });
        // Keep a way to unhook: a long-lived signal would otherwise retain this session
        // (through the listener closure) after the session has been released.
        this.detachExternalSignal = () => {
          externalSignal.removeEventListener("abort", onExternalAbort);
        };
      }
    }

    // Set up max duration timer
    const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
    if (maxDuration > 0) {
      this.maxDurationTimer = setTimeout(() => {
        this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
      }, maxDuration);
      this.maxDurationTimer.unref(); // Don't keep process alive
    }

    // Acquire session lease
    this.manager.acquire();
  }

  /** True while the session has been neither released nor aborted. */
  get isValid(): boolean {
    return !this.released && !this.abortController.signal.aborted;
  }

  /** Signal that fires when the session is aborted or released. */
  get signal(): AbortSignal {
    return this.abortController.signal;
  }

  /**
   * Release the session and decrement ref count.
   * Called automatically by withLLMSession when the callback completes.
   * Calling it again is a no-op.
   */
  release(): void {
    if (this.released) return;
    this.released = true;

    if (this.maxDurationTimer) {
      clearTimeout(this.maxDurationTimer);
      this.maxDurationTimer = null;
    }

    if (this.detachExternalSignal) {
      this.detachExternalSignal();
      this.detachExternalSignal = null;
    }

    this.abortController.abort(new Error("Session released"));
    this.manager.release();
  }

  /**
   * Run `fn` as a tracked operation.
   *
   * @throws SessionReleasedError if the session was released (default message) or aborted
   *   (message taken from the abort reason when it has one, otherwise "Session aborted").
   */
  private async withOperation<T>(fn: () => Promise<T>): Promise<T> {
    if (this.released) {
      throw new SessionReleasedError();
    }

    // Checked separately from `released` so the abort reason (external abort, max duration)
    // actually reaches the caller instead of being masked by the generic message.
    const { signal } = this.abortController;
    if (signal.aborted) {
      throw new SessionReleasedError(signal.reason?.message || "Session aborted");
    }

    this.manager.operationStart();
    try {
      return await fn();
    } finally {
      this.manager.operationEnd();
    }
  }

  /** Tracked wrapper around `LlamaCpp.embed`. */
  async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
    return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
  }

  /** Tracked wrapper around `LlamaCpp.embedBatch`. */
  async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
    return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
  }

  /** Tracked wrapper around `LlamaCpp.expandQuery`. */
  async expandQuery(
    query: string,
    options?: { context?: string; includeLexical?: boolean }
  ): Promise<Queryable[]> {
    return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
  }

  /** Tracked wrapper around `LlamaCpp.rerank`. */
  async rerank(
    query: string,
    documents: RerankDocument[],
    options?: RerankOptions
  ): Promise<RerankResult> {
    return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
  }
}
1313
-
1314
// Cached session manager; tied to whichever LlamaCpp instance was the default when it was built
let defaultSessionManager: LLMSessionManager | null = null;

/**
 * Return the session manager for the current default LlamaCpp instance.
 *
 * The cached manager is reused as long as it wraps the instance that
 * `getDefaultLlamaCpp()` currently returns; otherwise a new manager is created
 * for that instance and cached in its place.
 */
function getSessionManager(): LLMSessionManager {
  const currentLlm = getDefaultLlamaCpp();
  const cached = defaultSessionManager;

  if (cached !== null && cached.getLlamaCpp() === currentLlm) {
    return cached;
  }

  const fresh = new LLMSessionManager(currentLlm);
  defaultSessionManager = fresh;
  return fresh;
}
1327
-
1328
/**
 * Run `fn` inside a scoped LLM session.
 *
 * A session is created against the default session manager, handed to `fn`, and always
 * released afterwards, whether `fn` resolves or rejects. The result (or rejection) of `fn`
 * is passed through unchanged.
 *
 * @example
 * ```typescript
 * await withLLMSession(async (session) => {
 *   const expanded = await session.expandQuery(query);
 *   const embeddings = await session.embedBatch(texts);
 *   const reranked = await session.rerank(query, docs);
 *   return reranked;
 * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
 * ```
 *
 * @param fn - Callback that receives the session.
 * @param options - Session options forwarded to the session constructor.
 * @returns Whatever `fn` resolves to.
 */
export async function withLLMSession<T>(
  fn: (session: ILLMSession) => Promise<T>,
  options?: LLMSessionOptions
): Promise<T> {
  const scoped = new LLMSession(getSessionManager(), options);

  try {
    const outcome = await fn(scoped);
    return outcome;
  } finally {
    // Always hand the lease back, including when fn throws.
    scoped.release();
  }
}
1355
-
1356
/**
 * Report whether unloading is currently safe.
 *
 * With no session manager created yet there is nothing to wait for, so the answer is true.
 * Otherwise the answer is delegated to the manager's `canUnload()`.
 */
export function canUnloadLLM(): boolean {
  const manager = defaultSessionManager;
  return manager ? manager.canUnload() : true;
}
1364
-
1365
- // =============================================================================
1366
- // Singleton for default LlamaCpp instance
1367
- // =============================================================================
1368
-
1369
// Process-wide default instance; created on first use
let defaultLlamaCpp: LlamaCpp | null = null;

/**
 * Return the default LlamaCpp instance, constructing it on first access.
 * Subsequent calls return the same instance until it is replaced or disposed.
 */
export function getDefaultLlamaCpp(): LlamaCpp {
  const existing = defaultLlamaCpp;
  if (existing) {
    return existing;
  }

  const created = new LlamaCpp();
  defaultLlamaCpp = created;
  return created;
}
1380
-
1381
/**
 * Replace the default LlamaCpp instance (useful for testing).
 *
 * The previous instance, if any, is not disposed here. Passing `null` clears the
 * default, so the next `getDefaultLlamaCpp()` call constructs a new one.
 *
 * @param llm - The instance to install, or `null` to clear.
 */
export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
  defaultLlamaCpp = llm;
}
1387
-
1388
/**
 * Dispose the default LlamaCpp instance if it exists.
 * Call this before process exit to prevent NAPI crashes.
 *
 * The singleton is detached before disposal starts. This way a dispose that throws does not
 * leave the half-disposed instance installed as the default, and an instance installed while
 * disposal is still in progress is not overwritten with `null` afterwards.
 */
export async function disposeDefaultLlamaCpp(): Promise<void> {
  const instance = defaultLlamaCpp;
  if (!instance) {
    return;
  }

  defaultLlamaCpp = null;
  await instance.dispose();
}