@ambicuity/kindx 0.1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/inference.js CHANGED
@@ -4,9 +4,11 @@
4
4
  * Provides embeddings, text generation, and reranking using local GGUF models.
5
5
  */
6
6
  import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
7
- import { homedir } from "os";
7
+ import { RemoteLLM } from "./remote-llm.js";
8
+ import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
9
+ import * as os from "node:os";
8
10
  import { join } from "path";
9
- import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
11
+ import { homedir } from "node:os";
10
12
  // =============================================================================
11
13
  // Embedding Formatting Functions
12
14
  // =============================================================================
@@ -49,8 +51,8 @@ export function formatDocForEmbedding(text, title, modelUri) {
49
51
  // Format: hf:<user>/<repo>/<file>
50
52
  // Override via KINDX_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf)
51
53
  const DEFAULT_EMBED_MODEL = process.env.KINDX_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
52
- const DEFAULT_RERANK_MODEL = process.env.KINDX_RERANK_MODEL ?? "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
53
- const DEFAULT_GENERATE_MODEL = process.env.KINDX_GENERATE_MODEL ?? "hf:rr1904/kindx-query-expansion-1.7B-gguf/kindx-query-expansion-1.7B-q4_k_m.gguf";
54
+ const DEFAULT_RERANK_MODEL = process.env.KINDX_RERANK_MODEL ?? "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-Q8_0.gguf";
55
+ const DEFAULT_GENERATE_MODEL = process.env.KINDX_GENERATE_MODEL ?? "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
54
56
  // Alternative generation models for query expansion:
55
57
  // LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
56
58
  // Use these as base for fine-tuning with configs/sft_lfm2.yaml
@@ -146,6 +148,40 @@ export async function pullModels(models, options = {}) {
146
148
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
147
149
  const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
148
150
  const DEFAULT_RERANK_CONTEXT_SIZE = 4096;
151
+ const DEFAULT_LOW_VRAM_THRESHOLD_MB = 6144;
152
+ const DEFAULT_LOW_VRAM_EMBED_PARALLELISM = 2;
153
+ const DEFAULT_LOW_VRAM_RERANK_PARALLELISM = 1;
154
+ const DEFAULT_LOW_VRAM_EXPAND_CONTEXT_SIZE = 1024;
155
+ const DEFAULT_LOW_VRAM_RERANK_CONTEXT_SIZE = 1024;
156
+ function parseOptionalBoolean(raw, envName) {
157
+ if (raw === undefined)
158
+ return undefined;
159
+ const value = raw.trim().toLowerCase();
160
+ if (value === "1" || value === "true" || value === "yes" || value === "on")
161
+ return true;
162
+ if (value === "0" || value === "false" || value === "no" || value === "off")
163
+ return false;
164
+ process.stderr.write(`KINDX Warning: invalid ${envName}="${raw}", ignoring.\n`);
165
+ return undefined;
166
+ }
167
+ function parsePositiveIntOrWarn(raw, envName, fallback) {
168
+ if (raw === undefined || raw.trim() === "")
169
+ return fallback;
170
+ const parsed = Number.parseInt(raw.trim(), 10);
171
+ if (Number.isInteger(parsed) && parsed > 0)
172
+ return parsed;
173
+ process.stderr.write(`KINDX Warning: invalid ${envName}="${raw}", using ${fallback}.\n`);
174
+ return fallback;
175
+ }
176
+ function resolvePositiveIntWithEnv(configValue, envName, fallback) {
177
+ if (configValue !== undefined) {
178
+ if (!Number.isInteger(configValue) || configValue <= 0) {
179
+ throw new Error(`Invalid ${envName}: ${configValue}. Must be a positive integer.`);
180
+ }
181
+ return configValue;
182
+ }
183
+ return parsePositiveIntOrWarn(process.env[envName], envName, fallback);
184
+ }
149
185
  function resolveExpandContextSize(configValue) {
150
186
  if (configValue !== undefined) {
151
187
  if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -193,6 +229,15 @@ export class LlamaCpp {
193
229
  modelCacheDir;
194
230
  rerankContextSize;
195
231
  expandContextSize;
232
+ lowVramOverride;
233
+ vramBudgetMB;
234
+ lowVramThresholdMB;
235
+ lowVramEmbedParallelism;
236
+ lowVramRerankParallelism;
237
+ lowVramExpandContextSize;
238
+ lowVramRerankContextSize;
239
+ memoryPolicyPromise = null;
240
+ lowVramWarningShown = false;
196
241
  // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
197
242
  embedModelLoadPromise = null;
198
243
  generateModelLoadPromise = null;
@@ -210,6 +255,13 @@ export class LlamaCpp {
210
255
  this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
211
256
  this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
212
257
  this.rerankContextSize = resolveRerankContextSize(config.rerankContextSize);
258
+ this.lowVramOverride = config.lowVram ?? parseOptionalBoolean(process.env.KINDX_LOW_VRAM, "KINDX_LOW_VRAM");
259
+ this.vramBudgetMB = resolvePositiveIntWithEnv(config.vramBudgetMB, "KINDX_VRAM_BUDGET_MB", 0) || null;
260
+ this.lowVramThresholdMB = resolvePositiveIntWithEnv(config.lowVramThresholdMB, "KINDX_LOW_VRAM_THRESHOLD_MB", DEFAULT_LOW_VRAM_THRESHOLD_MB);
261
+ this.lowVramEmbedParallelism = resolvePositiveIntWithEnv(config.lowVramEmbedParallelism, "KINDX_LOW_VRAM_EMBED_PARALLELISM", DEFAULT_LOW_VRAM_EMBED_PARALLELISM);
262
+ this.lowVramRerankParallelism = resolvePositiveIntWithEnv(config.lowVramRerankParallelism, "KINDX_LOW_VRAM_RERANK_PARALLELISM", DEFAULT_LOW_VRAM_RERANK_PARALLELISM);
263
+ this.lowVramExpandContextSize = resolvePositiveIntWithEnv(config.lowVramExpandContextSize, "KINDX_LOW_VRAM_EXPAND_CONTEXT_SIZE", DEFAULT_LOW_VRAM_EXPAND_CONTEXT_SIZE);
264
+ this.lowVramRerankContextSize = resolvePositiveIntWithEnv(config.lowVramRerankContextSize, "KINDX_LOW_VRAM_RERANK_CONTEXT_SIZE", DEFAULT_LOW_VRAM_RERANK_CONTEXT_SIZE);
213
265
  this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
214
266
  this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
215
267
  }
@@ -322,6 +374,12 @@ export class LlamaCpp {
322
374
  else if (llama.gpu === false) {
323
375
  process.stderr.write("KINDX Warning: no GPU acceleration, running on CPU (slow). Run 'kindx status' for details.\n");
324
376
  }
377
+ else if (llama.gpu === "vulkan" && os.release().toLowerCase().includes("microsoft")) {
378
+ process.stderr.write("\nKINDX Warning: Vulkan backend detected on WSL2. This is often slow or unstable (using 'dzn').\n" +
379
+ "For native NVIDIA GPU acceleration on WSL2, please install the CUDA toolkit:\n" +
380
+ " sudo apt-get install cuda-toolkit-13-1 (or cuda-toolkit-12-6)\n" +
381
+ "See https://github.com/ambicuity/KINDX/issues/141 for more details.\n\n");
382
+ }
325
383
  this.llama = llama;
326
384
  }
327
385
  return this.llama;
@@ -361,26 +419,104 @@ export class LlamaCpp {
361
419
  this.embedModelLoadPromise = null;
362
420
  }
363
421
  }
422
+ showLowVramWarning(policy) {
423
+ if (!policy.lowVram || this.lowVramWarningShown)
424
+ return;
425
+ const freeText = policy.freeMB === null ? "unknown free VRAM" : `${Math.round(policy.freeMB)} MB free VRAM`;
426
+ const budgetText = policy.budgetMB === null ? "auto budget" : `${Math.round(policy.budgetMB)} MB budget`;
427
+ process.stderr.write(`KINDX Warning: low VRAM mode enabled (${freeText}, ${budgetText}, ${policy.reason}).\n`);
428
+ this.lowVramWarningShown = true;
429
+ }
430
+ async resolveMemoryPolicy() {
431
+ if (this.memoryPolicyPromise)
432
+ return await this.memoryPolicyPromise;
433
+ this.memoryPolicyPromise = (async () => {
434
+ const llama = await this.ensureLlama();
435
+ if (!llama.gpu) {
436
+ return { lowVram: false, freeMB: null, budgetMB: this.vramBudgetMB, reason: "cpu_or_no_gpu" };
437
+ }
438
+ let freeMB = null;
439
+ try {
440
+ const vram = await llama.getVramState();
441
+ freeMB = vram.free / (1024 * 1024);
442
+ }
443
+ catch {
444
+ freeMB = null;
445
+ }
446
+ if (this.lowVramOverride !== undefined) {
447
+ const policy = {
448
+ lowVram: this.lowVramOverride,
449
+ freeMB,
450
+ budgetMB: this.vramBudgetMB,
451
+ reason: "forced_by_KINDX_LOW_VRAM_or_config",
452
+ };
453
+ this.showLowVramWarning(policy);
454
+ return policy;
455
+ }
456
+ if (this.vramBudgetMB !== null) {
457
+ const policy = {
458
+ lowVram: true,
459
+ freeMB,
460
+ budgetMB: this.vramBudgetMB,
461
+ reason: "budget_set_by_KINDX_VRAM_BUDGET_MB_or_config",
462
+ };
463
+ this.showLowVramWarning(policy);
464
+ return policy;
465
+ }
466
+ if (freeMB !== null && freeMB < this.lowVramThresholdMB) {
467
+ const policy = {
468
+ lowVram: true,
469
+ freeMB,
470
+ budgetMB: null,
471
+ reason: `auto_detected_below_${this.lowVramThresholdMB}MB_threshold`,
472
+ };
473
+ this.showLowVramWarning(policy);
474
+ return policy;
475
+ }
476
+ return { lowVram: false, freeMB, budgetMB: null, reason: "auto_high_vram" };
477
+ })();
478
+ return await this.memoryPolicyPromise;
479
+ }
480
+ async effectiveExpandContextSize() {
481
+ const policy = await this.resolveMemoryPolicy();
482
+ if (!policy.lowVram)
483
+ return this.expandContextSize;
484
+ return Math.min(this.expandContextSize, this.lowVramExpandContextSize);
485
+ }
486
+ async effectiveRerankContextSize() {
487
+ const policy = await this.resolveMemoryPolicy();
488
+ if (!policy.lowVram)
489
+ return this.rerankContextSize;
490
+ return Math.min(this.rerankContextSize, this.lowVramRerankContextSize);
491
+ }
364
492
  /**
365
493
  * Compute how many parallel contexts to create.
366
494
  *
367
- * GPU: constrained by VRAM (25% of free, capped at 8).
495
+ * GPU: constrained by free VRAM / budget and low-VRAM policy caps.
368
496
  * CPU: constrained by cores. Splitting threads across contexts enables
369
497
  * true parallelism (each context runs on its own cores). Use at most
370
498
  * half the math cores, with at least 4 threads per context.
371
499
  */
372
- async computeParallelism(perContextMB) {
500
+ async computeParallelism(perContextMB, kind) {
373
501
  const llama = await this.ensureLlama();
502
+ const policy = await this.resolveMemoryPolicy();
374
503
  if (llama.gpu) {
375
- try {
376
- const vram = await llama.getVramState();
377
- const freeMB = vram.free / (1024 * 1024);
378
- const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
379
- return Math.max(1, Math.min(8, maxByVram));
380
- }
381
- catch {
382
- return 2;
504
+ if (policy.freeMB === null) {
505
+ const fallback = kind === "embed" ? 2 : 1;
506
+ const cap = policy.lowVram
507
+ ? (kind === "embed" ? this.lowVramEmbedParallelism : this.lowVramRerankParallelism)
508
+ : 8;
509
+ return Math.max(1, Math.min(cap, fallback));
383
510
  }
511
+ const availableMB = policy.budgetMB === null
512
+ ? policy.freeMB
513
+ : Math.min(policy.budgetMB, policy.freeMB);
514
+ const maxByVram = Math.floor((availableMB * 0.25) / perContextMB);
515
+ const base = Math.max(1, Math.min(8, maxByVram));
516
+ if (!policy.lowVram)
517
+ return base;
518
+ const cap = kind === "embed" ? this.lowVramEmbedParallelism : this.lowVramRerankParallelism;
519
+ return Math.max(1, Math.min(base, cap));
384
520
  }
385
521
  // CPU: split cores across contexts. At least 4 threads per context.
386
522
  const cores = llama.cpuMathCores || 4;
@@ -414,7 +550,7 @@ export class LlamaCpp {
414
550
  this.embedContextsCreatePromise = (async () => {
415
551
  const model = await this.ensureEmbedModel();
416
552
  // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
417
- const n = await this.computeParallelism(150);
553
+ const n = await this.computeParallelism(150, "embed");
418
554
  const threads = await this.threadsPerContext(n);
419
555
  for (let i = 0; i < n; i++) {
420
556
  try {
@@ -515,12 +651,13 @@ export class LlamaCpp {
515
651
  if (this.rerankContexts.length === 0) {
516
652
  const model = await this.ensureRerankModel();
517
653
  // ~960 MB per context with flash attention at default contextSize 4096
518
- const n = Math.min(await this.computeParallelism(1000), 4);
654
+ const n = Math.min(await this.computeParallelism(1000, "rerank"), 4);
655
+ const rerankContextSize = await this.effectiveRerankContextSize();
519
656
  const threads = await this.threadsPerContext(n);
520
657
  for (let i = 0; i < n; i++) {
521
658
  try {
522
659
  this.rerankContexts.push(await model.createRankingContext({
523
- contextSize: this.rerankContextSize,
660
+ contextSize: rerankContextSize,
524
661
  flashAttention: true,
525
662
  ...(threads > 0 ? { threads } : {}),
526
663
  }));
@@ -530,7 +667,7 @@ export class LlamaCpp {
530
667
  // Flash attention might not be supported — retry without it
531
668
  try {
532
669
  this.rerankContexts.push(await model.createRankingContext({
533
- contextSize: this.rerankContextSize,
670
+ contextSize: rerankContextSize,
534
671
  ...(threads > 0 ? { threads } : {}),
535
672
  }));
536
673
  }
@@ -704,10 +841,25 @@ export class LlamaCpp {
704
841
  async expandQuery(query, options = {}) {
705
842
  // Ping activity at start to keep models alive during this operation
706
843
  this.touchActivity();
707
- const llama = await this.ensureLlama();
708
- await this.ensureGenerateModel();
709
844
  const includeLexical = options.includeLexical ?? true;
710
845
  const context = options.context;
846
+ // -------------------------------------------------------------------------
847
+ // Task 2: Dynamic HyDE Bypass
848
+ // Short entity-lookup queries (≤3 tokens) lack sufficient semantic surface
849
+ // area for the LLM to extrapolate a meaningful hypothetical document.
850
+ // Generating HyDE passages for these actively harms precision by shifting
851
+ // the embedding centroid away from the actual target.
852
+ // Bypass generation entirely and return direct lex + vec targets.
853
+ // -------------------------------------------------------------------------
854
+ const tokenCount = query.trim().split(/\s+/).filter(Boolean).length;
855
+ if (tokenCount <= 3) {
856
+ const bypass = [{ type: 'vec', text: query }];
857
+ if (includeLexical)
858
+ bypass.unshift({ type: 'lex', text: query });
859
+ return bypass;
860
+ }
861
+ const llama = await this.ensureLlama();
862
+ await this.ensureGenerateModel();
711
863
  const grammar = await llama.createGrammar({
712
864
  grammar: `
713
865
  root ::= line+
@@ -717,12 +869,46 @@ export class LlamaCpp {
717
869
  `
718
870
  });
719
871
  const prompt = `/no_think Expand this search query: ${query}`;
720
- // Create a bounded context for expansion to prevent large default VRAM allocations.
872
+ // -------------------------------------------------------------------------
873
+ // Task 1: Strict System Prompt Injection
874
+ // Enforces three constraints on the generation model:
875
+ // 1. Grammar adherence: output must strictly match the EBNF grammar.
876
+ // 2. No conversational filler: eliminates "Of course!", apologies, etc.
877
+ // 3. Domain anchoring: if options.context is provided, inject it so the
878
+ // model stays within the bounded knowledge domain and does not
879
+ // hallucinate external concepts (e.g., "blockchain" for a code repo).
880
+ // -------------------------------------------------------------------------
881
+ const domainInstruction = context
882
+ ? `Domain context: ${context}\nYour expansions MUST stay within this domain. Do not introduce concepts from outside it.`
883
+ : `Stay strictly within the semantic domain implied by the query itself.`;
884
+ const systemPrompt = [
885
+ `You are a search query expansion engine. Your ONLY task is to output structured query variations.`,
886
+ ``,
887
+ `OUTPUT FORMAT (strict — do not deviate):`,
888
+ ` lex: <exact keyword phrase>`,
889
+ ` vec: <semantically equivalent rephrasing>`,
890
+ ` hyde: <a verbatim 1-2 sentence excerpt that would appear in a relevant technical document>`,
891
+ ``,
892
+ `RULES (violations will break the downstream parser):`,
893
+ ` - Do NOT write greetings, apologies, explanations, or any prose outside the format.`,
894
+ ` - Do NOT write "Here is...", "Of course!", "I'd be happy to...", or similar filler.`,
895
+ ` - Each line MUST start with "lex:", "vec:", or "hyde:" followed by a single space.`,
896
+ ` - hyde entries MUST read like an excerpt from a technical document, NOT a question or summary.`,
897
+ ` - Output 2–4 lines maximum. Output NOTHING else.`,
898
+ ``,
899
+ domainInstruction,
900
+ ].join('\n');
901
+ // Create a bounded context for expansion.
902
+ // The system prompt adds ~400 tokens of overhead. We allocate a 512-token buffer
903
+ // on top of expandContextSize to prevent native llama.cpp from aborting on context
904
+ // overflow when the combined system prompt + user query exceeds the window.
905
+ const SYSTEM_PROMPT_TOKEN_OVERHEAD = 512;
906
+ const effectiveExpandContextSize = await this.effectiveExpandContextSize();
721
907
  const genContext = await this.generateModel.createContext({
722
- contextSize: this.expandContextSize,
908
+ contextSize: effectiveExpandContextSize + SYSTEM_PROMPT_TOKEN_OVERHEAD,
723
909
  });
724
910
  const sequence = genContext.getSequence();
725
- const session = new LlamaChatSession({ contextSequence: sequence });
911
+ const session = new LlamaChatSession({ contextSequence: sequence, systemPrompt });
726
912
  try {
727
913
  // Qwen3 recommended settings for non-thinking mode:
728
914
  // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
@@ -793,7 +979,8 @@ export class LlamaCpp {
793
979
  // Truncate documents that would exceed the rerank context size.
794
980
  // Budget = contextSize - template overhead - query tokens
795
981
  const queryTokens = model.tokenize(query).length;
796
- const rawMaxDocTokens = this.rerankContextSize - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
982
+ const effectiveRerankContextSize = await this.effectiveRerankContextSize();
983
+ const rawMaxDocTokens = effectiveRerankContextSize - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
797
984
  // Guard against non-positive budget (e.g., very long queries or CJK content with
798
985
  // high token density). Allow at minimum 128 tokens per document to avoid crashes.
799
986
  const maxDocTokens = Math.max(128, rawMaxDocTokens);
@@ -952,7 +1139,7 @@ class LLMSessionManager {
952
1139
  operationEnd() {
953
1140
  this._inFlightOperations = Math.max(0, this._inFlightOperations - 1);
954
1141
  }
955
- getLlamaCpp() {
1142
+ getLLM() {
956
1143
  return this.llm;
957
1144
  }
958
1145
  }
@@ -1042,26 +1229,26 @@ class LLMSession {
1042
1229
  }
1043
1230
  }
1044
1231
  async embed(text, options) {
1045
- return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
1232
+ return this.withOperation(() => this.manager.getLLM().embed(text, options));
1046
1233
  }
1047
1234
  async embedBatch(texts) {
1048
- return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
1235
+ return this.withOperation(() => this.manager.getLLM().embedBatch(texts));
1049
1236
  }
1050
1237
  async expandQuery(query, options) {
1051
- return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
1238
+ return this.withOperation(() => this.manager.getLLM().expandQuery(query, options));
1052
1239
  }
1053
1240
  async rerank(query, documents, options) {
1054
- return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
1241
+ return this.withOperation(() => this.manager.getLLM().rerank(query, documents, options));
1055
1242
  }
1056
1243
  }
1057
- // Session manager for the default LlamaCpp instance
1244
+ // Session manager for the default LLM instance
1058
1245
  let defaultSessionManager = null;
1059
1246
  /**
1060
- * Get the session manager for the default LlamaCpp instance.
1247
+ * Get the session manager for the default LLM instance.
1061
1248
  */
1062
1249
  function getSessionManager() {
1063
- const llm = getDefaultLlamaCpp();
1064
- if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) {
1250
+ const llm = getDefaultLLM();
1251
+ if (!defaultSessionManager || defaultSessionManager.getLLM() !== llm) {
1065
1252
  defaultSessionManager = new LLMSessionManager(llm);
1066
1253
  }
1067
1254
  return defaultSessionManager;
@@ -1100,32 +1287,37 @@ export function canUnloadLLM() {
1100
1287
  return defaultSessionManager.canUnload();
1101
1288
  }
1102
1289
  // =============================================================================
1103
- // Singleton for default LlamaCpp instance
1290
+ // Singleton for default LLM instance
1104
1291
  // =============================================================================
1105
- let defaultLlamaCpp = null;
1292
+ let defaultLLM = null;
1106
1293
  /**
1107
- * Get the default LlamaCpp instance (creates one if needed)
1294
+ * Get the default LLM instance (creates one if needed)
1108
1295
  */
1109
- export function getDefaultLlamaCpp() {
1110
- if (!defaultLlamaCpp) {
1111
- const embedModel = process.env.KINDX_EMBED_MODEL;
1112
- defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
1296
+ export function getDefaultLLM() {
1297
+ if (!defaultLLM) {
1298
+ if (process.env.KINDX_LLM_BACKEND === "remote") {
1299
+ defaultLLM = new RemoteLLM();
1300
+ }
1301
+ else {
1302
+ const embedModel = process.env.KINDX_EMBED_MODEL;
1303
+ defaultLLM = new LlamaCpp(embedModel ? { embedModel } : {});
1304
+ }
1113
1305
  }
1114
- return defaultLlamaCpp;
1306
+ return defaultLLM;
1115
1307
  }
1116
1308
  /**
1117
- * Set a custom default LlamaCpp instance (useful for testing)
1309
+ * Set a custom default LLM instance (useful for testing)
1118
1310
  */
1119
- export function setDefaultLlamaCpp(llm) {
1120
- defaultLlamaCpp = llm;
1311
+ export function setDefaultLLM(llm) {
1312
+ defaultLLM = llm;
1121
1313
  }
1122
1314
  /**
1123
- * Dispose the default LlamaCpp instance if it exists.
1315
+ * Dispose the default LLM instance if it exists.
1124
1316
  * Call this before process exit to prevent NAPI crashes.
1125
1317
  */
1126
- export async function disposeDefaultLlamaCpp() {
1127
- if (defaultLlamaCpp) {
1128
- await defaultLlamaCpp.dispose();
1129
- defaultLlamaCpp = null;
1318
+ export async function disposeDefaultLLM() {
1319
+ if (defaultLLM) {
1320
+ await defaultLLM.dispose();
1321
+ defaultLLM = null;
1130
1322
  }
1131
1323
  }