npm - @ambicuity/kindx - Versions diffs - 0.1.0 → 1.1.0 - Mend

@ambicuity/kindx 0.1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/CHANGELOG.md +51 -0
package/README.md +409 -129
package/bin/kindx +38 -0
package/capabilities/kindx/SKILL.md +127 -0
package/capabilities/kindx/references/mcp-setup.md +102 -0
package/dist/catalogs.js +57 -16
package/dist/inference.d.ts +82 -7
package/dist/inference.js +241 -49
package/dist/kindx.js +425 -91
package/dist/migrate.d.ts +2 -0
package/dist/migrate.js +133 -0
package/dist/protocol.d.ts +2 -1
package/dist/protocol.js +110 -6
package/dist/remote-llm.d.ts +23 -0
package/dist/remote-llm.js +307 -0
package/dist/repository.d.ts +18 -1
package/dist/repository.js +260 -35
package/dist/watcher.d.ts +29 -0
package/dist/watcher.js +243 -0
package/package.json +26 -11

package/dist/inference.js CHANGED

@@ -4,9 +4,11 @@
  * Provides embeddings, text generation, and reranking using local GGUF models.
  */
 import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
-import { homedir } from "os";
+import { RemoteLLM } from "./remote-llm.js";
+import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
+import * as os from "node:os";
 import { join } from "path";
-import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
+import { homedir } from "node:os";
 // =============================================================================
 // Embedding Formatting Functions
 // =============================================================================
@@ -49,8 +51,8 @@ export function formatDocForEmbedding(text, title, modelUri) {
 // Format: hf:<user>/<repo>/<file>
 // Override via KINDX_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf)
 const DEFAULT_EMBED_MODEL = process.env.KINDX_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
-const DEFAULT_RERANK_MODEL = process.env.KINDX_RERANK_MODEL ?? "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
-const DEFAULT_GENERATE_MODEL = process.env.KINDX_GENERATE_MODEL ?? "hf:rr1904/kindx-query-expansion-1.7B-gguf/kindx-query-expansion-1.7B-q4_k_m.gguf";
+const DEFAULT_RERANK_MODEL = process.env.KINDX_RERANK_MODEL ?? "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-Q8_0.gguf";
+const DEFAULT_GENERATE_MODEL = process.env.KINDX_GENERATE_MODEL ?? "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
 // Alternative generation models for query expansion:
 // LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
 // Use these as base for fine-tuning with configs/sft_lfm2.yaml
@@ -146,6 +148,40 @@ export async function pullModels(models, options = {}) {
 const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
 const DEFAULT_RERANK_CONTEXT_SIZE = 4096;
+const DEFAULT_LOW_VRAM_THRESHOLD_MB = 6144;
+const DEFAULT_LOW_VRAM_EMBED_PARALLELISM = 2;
+const DEFAULT_LOW_VRAM_RERANK_PARALLELISM = 1;
+const DEFAULT_LOW_VRAM_EXPAND_CONTEXT_SIZE = 1024;
+const DEFAULT_LOW_VRAM_RERANK_CONTEXT_SIZE = 1024;
+function parseOptionalBoolean(raw, envName) {
+    if (raw === undefined)
+        return undefined;
+    const value = raw.trim().toLowerCase();
+    if (value === "1" || value === "true" || value === "yes" || value === "on")
+        return true;
+    if (value === "0" || value === "false" || value === "no" || value === "off")
+        return false;
+    process.stderr.write(`KINDX Warning: invalid ${envName}="${raw}", ignoring.\n`);
+    return undefined;
+}
+function parsePositiveIntOrWarn(raw, envName, fallback) {
+    if (raw === undefined || raw.trim() === "")
+        return fallback;
+    const parsed = Number.parseInt(raw.trim(), 10);
+    if (Number.isInteger(parsed) && parsed > 0)
+        return parsed;
+    process.stderr.write(`KINDX Warning: invalid ${envName}="${raw}", using ${fallback}.\n`);
+    return fallback;
+}
+function resolvePositiveIntWithEnv(configValue, envName, fallback) {
+    if (configValue !== undefined) {
+        if (!Number.isInteger(configValue) || configValue <= 0) {
+            throw new Error(`Invalid ${envName}: ${configValue}. Must be a positive integer.`);
+        }
+        return configValue;
+    }
+    return parsePositiveIntOrWarn(process.env[envName], envName, fallback);
+}
 function resolveExpandContextSize(configValue) {
     if (configValue !== undefined) {
         if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -193,6 +229,15 @@ export class LlamaCpp {
     modelCacheDir;
     rerankContextSize;
     expandContextSize;
+    lowVramOverride;
+    vramBudgetMB;
+    lowVramThresholdMB;
+    lowVramEmbedParallelism;
+    lowVramRerankParallelism;
+    lowVramExpandContextSize;
+    lowVramRerankContextSize;
+    memoryPolicyPromise = null;
+    lowVramWarningShown = false;
     // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
     embedModelLoadPromise = null;
     generateModelLoadPromise = null;
@@ -210,6 +255,13 @@ export class LlamaCpp {
         this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
         this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
         this.rerankContextSize = resolveRerankContextSize(config.rerankContextSize);
+        this.lowVramOverride = config.lowVram ?? parseOptionalBoolean(process.env.KINDX_LOW_VRAM, "KINDX_LOW_VRAM");
+        this.vramBudgetMB = resolvePositiveIntWithEnv(config.vramBudgetMB, "KINDX_VRAM_BUDGET_MB", 0) || null;
+        this.lowVramThresholdMB = resolvePositiveIntWithEnv(config.lowVramThresholdMB, "KINDX_LOW_VRAM_THRESHOLD_MB", DEFAULT_LOW_VRAM_THRESHOLD_MB);
+        this.lowVramEmbedParallelism = resolvePositiveIntWithEnv(config.lowVramEmbedParallelism, "KINDX_LOW_VRAM_EMBED_PARALLELISM", DEFAULT_LOW_VRAM_EMBED_PARALLELISM);
+        this.lowVramRerankParallelism = resolvePositiveIntWithEnv(config.lowVramRerankParallelism, "KINDX_LOW_VRAM_RERANK_PARALLELISM", DEFAULT_LOW_VRAM_RERANK_PARALLELISM);
+        this.lowVramExpandContextSize = resolvePositiveIntWithEnv(config.lowVramExpandContextSize, "KINDX_LOW_VRAM_EXPAND_CONTEXT_SIZE", DEFAULT_LOW_VRAM_EXPAND_CONTEXT_SIZE);
+        this.lowVramRerankContextSize = resolvePositiveIntWithEnv(config.lowVramRerankContextSize, "KINDX_LOW_VRAM_RERANK_CONTEXT_SIZE", DEFAULT_LOW_VRAM_RERANK_CONTEXT_SIZE);
         this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
         this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
     }
@@ -322,6 +374,12 @@ export class LlamaCpp {
             else if (llama.gpu === false) {
                 process.stderr.write("KINDX Warning: no GPU acceleration, running on CPU (slow). Run 'kindx status' for details.\n");
             }
+            else if (llama.gpu === "vulkan" && os.release().toLowerCase().includes("microsoft")) {
+                process.stderr.write("\nKINDX Warning: Vulkan backend detected on WSL2. This is often slow or unstable (using 'dzn').\n" +
+                    "For native NVIDIA GPU acceleration on WSL2, please install the CUDA toolkit:\n" +
+                    "  sudo apt-get install cuda-toolkit-13-1  (or cuda-toolkit-12-6)\n" +
+                    "See https://github.com/ambicuity/KINDX/issues/141 for more details.\n\n");
+            }
             this.llama = llama;
         }
         return this.llama;
@@ -361,26 +419,104 @@ export class LlamaCpp {
             this.embedModelLoadPromise = null;
         }
     }
+    showLowVramWarning(policy) {
+        if (!policy.lowVram || this.lowVramWarningShown)
+            return;
+        const freeText = policy.freeMB === null ? "unknown free VRAM" : `${Math.round(policy.freeMB)} MB free VRAM`;
+        const budgetText = policy.budgetMB === null ? "auto budget" : `${Math.round(policy.budgetMB)} MB budget`;
+        process.stderr.write(`KINDX Warning: low VRAM mode enabled (${freeText}, ${budgetText}, ${policy.reason}).\n`);
+        this.lowVramWarningShown = true;
+    }
+    async resolveMemoryPolicy() {
+        if (this.memoryPolicyPromise)
+            return await this.memoryPolicyPromise;
+        this.memoryPolicyPromise = (async () => {
+            const llama = await this.ensureLlama();
+            if (!llama.gpu) {
+                return { lowVram: false, freeMB: null, budgetMB: this.vramBudgetMB, reason: "cpu_or_no_gpu" };
+            }
+            let freeMB = null;
+            try {
+                const vram = await llama.getVramState();
+                freeMB = vram.free / (1024 * 1024);
+            }
+            catch {
+                freeMB = null;
+            }
+            if (this.lowVramOverride !== undefined) {
+                const policy = {
+                    lowVram: this.lowVramOverride,
+                    freeMB,
+                    budgetMB: this.vramBudgetMB,
+                    reason: "forced_by_KINDX_LOW_VRAM_or_config",
+                };
+                this.showLowVramWarning(policy);
+                return policy;
+            }
+            if (this.vramBudgetMB !== null) {
+                const policy = {
+                    lowVram: true,
+                    freeMB,
+                    budgetMB: this.vramBudgetMB,
+                    reason: "budget_set_by_KINDX_VRAM_BUDGET_MB_or_config",
+                };
+                this.showLowVramWarning(policy);
+                return policy;
+            }
+            if (freeMB !== null && freeMB < this.lowVramThresholdMB) {
+                const policy = {
+                    lowVram: true,
+                    freeMB,
+                    budgetMB: null,
+                    reason: `auto_detected_below_${this.lowVramThresholdMB}MB_threshold`,
+                };
+                this.showLowVramWarning(policy);
+                return policy;
+            }
+            return { lowVram: false, freeMB, budgetMB: null, reason: "auto_high_vram" };
+        })();
+        return await this.memoryPolicyPromise;
+    }
+    async effectiveExpandContextSize() {
+        const policy = await this.resolveMemoryPolicy();
+        if (!policy.lowVram)
+            return this.expandContextSize;
+        return Math.min(this.expandContextSize, this.lowVramExpandContextSize);
+    }
+    async effectiveRerankContextSize() {
+        const policy = await this.resolveMemoryPolicy();
+        if (!policy.lowVram)
+            return this.rerankContextSize;
+        return Math.min(this.rerankContextSize, this.lowVramRerankContextSize);
+    }
     /**
      * Compute how many parallel contexts to create.
      *
-     * GPU: constrained by VRAM (25% of free, capped at 8).
+     * GPU: constrained by free VRAM / budget and low-VRAM policy caps.
      * CPU: constrained by cores. Splitting threads across contexts enables
      *      true parallelism (each context runs on its own cores). Use at most
      *      half the math cores, with at least 4 threads per context.
      */
-    async computeParallelism(perContextMB) {
+    async computeParallelism(perContextMB, kind) {
         const llama = await this.ensureLlama();
+        const policy = await this.resolveMemoryPolicy();
         if (llama.gpu) {
-            try {
-                const vram = await llama.getVramState();
-                const freeMB = vram.free / (1024 * 1024);
-                const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
-                return Math.max(1, Math.min(8, maxByVram));
-            }
-            catch {
-                return 2;
+            if (policy.freeMB === null) {
+                const fallback = kind === "embed" ? 2 : 1;
+                const cap = policy.lowVram
+                    ? (kind === "embed" ? this.lowVramEmbedParallelism : this.lowVramRerankParallelism)
+                    : 8;
+                return Math.max(1, Math.min(cap, fallback));
             }
+            const availableMB = policy.budgetMB === null
+                ? policy.freeMB
+                : Math.min(policy.budgetMB, policy.freeMB);
+            const maxByVram = Math.floor((availableMB * 0.25) / perContextMB);
+            const base = Math.max(1, Math.min(8, maxByVram));
+            if (!policy.lowVram)
+                return base;
+            const cap = kind === "embed" ? this.lowVramEmbedParallelism : this.lowVramRerankParallelism;
+            return Math.max(1, Math.min(base, cap));
         }
         // CPU: split cores across contexts. At least 4 threads per context.
         const cores = llama.cpuMathCores || 4;
@@ -414,7 +550,7 @@ export class LlamaCpp {
         this.embedContextsCreatePromise = (async () => {
             const model = await this.ensureEmbedModel();
             // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
-            const n = await this.computeParallelism(150);
+            const n = await this.computeParallelism(150, "embed");
             const threads = await this.threadsPerContext(n);
             for (let i = 0; i < n; i++) {
                 try {
@@ -515,12 +651,13 @@ export class LlamaCpp {
         if (this.rerankContexts.length === 0) {
             const model = await this.ensureRerankModel();
             // ~960 MB per context with flash attention at default contextSize 4096
-            const n = Math.min(await this.computeParallelism(1000), 4);
+            const n = Math.min(await this.computeParallelism(1000, "rerank"), 4);
+            const rerankContextSize = await this.effectiveRerankContextSize();
             const threads = await this.threadsPerContext(n);
             for (let i = 0; i < n; i++) {
                 try {
                     this.rerankContexts.push(await model.createRankingContext({
-                        contextSize: this.rerankContextSize,
+                        contextSize: rerankContextSize,
                         flashAttention: true,
                         ...(threads > 0 ? { threads } : {}),
                     }));
@@ -530,7 +667,7 @@ export class LlamaCpp {
                         // Flash attention might not be supported — retry without it
                         try {
                             this.rerankContexts.push(await model.createRankingContext({
-                                contextSize: this.rerankContextSize,
+                                contextSize: rerankContextSize,
                                 ...(threads > 0 ? { threads } : {}),
                             }));
                         }
@@ -704,10 +841,25 @@ export class LlamaCpp {
     async expandQuery(query, options = {}) {
         // Ping activity at start to keep models alive during this operation
         this.touchActivity();
-        const llama = await this.ensureLlama();
-        await this.ensureGenerateModel();
         const includeLexical = options.includeLexical ?? true;
         const context = options.context;
+        // -------------------------------------------------------------------------
+        // Task 2: Dynamic HyDE Bypass
+        // Short entity-lookup queries (≤3 tokens) lack sufficient semantic surface
+        // area for the LLM to extrapolate a meaningful hypothetical document.
+        // Generating HyDE passages for these actively harms precision by shifting
+        // the embedding centroid away from the actual target.
+        // Bypass generation entirely and return direct lex + vec targets.
+        // -------------------------------------------------------------------------
+        const tokenCount = query.trim().split(/\s+/).filter(Boolean).length;
+        if (tokenCount <= 3) {
+            const bypass = [{ type: 'vec', text: query }];
+            if (includeLexical)
+                bypass.unshift({ type: 'lex', text: query });
+            return bypass;
+        }
+        const llama = await this.ensureLlama();
+        await this.ensureGenerateModel();
         const grammar = await llama.createGrammar({
             grammar: `
         root ::= line+
@@ -717,12 +869,46 @@ export class LlamaCpp {
       `
         });
         const prompt = `/no_think Expand this search query: ${query}`;
-        // Create a bounded context for expansion to prevent large default VRAM allocations.
+        // -------------------------------------------------------------------------
+        // Task 1: Strict System Prompt Injection
+        // Enforces three constraints on the generation model:
+        //   1. Grammar adherence: output must strictly match the EBNF grammar.
+        //   2. No conversational filler: eliminates "Of course!", apologies, etc.
+        //   3. Domain anchoring: if options.context is provided, inject it so the
+        //      model stays within the bounded knowledge domain and does not
+        //      hallucinate external concepts (e.g., "blockchain" for a code repo).
+        // -------------------------------------------------------------------------
+        const domainInstruction = context
+            ? `Domain context: ${context}\nYour expansions MUST stay within this domain. Do not introduce concepts from outside it.`
+            : `Stay strictly within the semantic domain implied by the query itself.`;
+        const systemPrompt = [
+            `You are a search query expansion engine. Your ONLY task is to output structured query variations.`,
+            ``,
+            `OUTPUT FORMAT (strict — do not deviate):`,
+            `  lex: <exact keyword phrase>`,
+            `  vec: <semantically equivalent rephrasing>`,
+            `  hyde: <a verbatim 1-2 sentence excerpt that would appear in a relevant technical document>`,
+            ``,
+            `RULES (violations will break the downstream parser):`,
+            `  - Do NOT write greetings, apologies, explanations, or any prose outside the format.`,
+            `  - Do NOT write "Here is...", "Of course!", "I'd be happy to...", or similar filler.`,
+            `  - Each line MUST start with "lex:", "vec:", or "hyde:" followed by a single space.`,
+            `  - hyde entries MUST read like an excerpt from a technical document, NOT a question or summary.`,
+            `  - Output 2–4 lines maximum. Output NOTHING else.`,
+            ``,
+            domainInstruction,
+        ].join('\n');
+        // Create a bounded context for expansion.
+        // The system prompt adds ~400 tokens of overhead. We allocate a 512-token buffer
+        // on top of expandContextSize to prevent native llama.cpp from aborting on context
+        // overflow when the combined system prompt + user query exceeds the window.
+        const SYSTEM_PROMPT_TOKEN_OVERHEAD = 512;
+        const effectiveExpandContextSize = await this.effectiveExpandContextSize();
         const genContext = await this.generateModel.createContext({
-            contextSize: this.expandContextSize,
+            contextSize: effectiveExpandContextSize + SYSTEM_PROMPT_TOKEN_OVERHEAD,
         });
         const sequence = genContext.getSequence();
-        const session = new LlamaChatSession({ contextSequence: sequence });
+        const session = new LlamaChatSession({ contextSequence: sequence, systemPrompt });
         try {
             // Qwen3 recommended settings for non-thinking mode:
             // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
@@ -793,7 +979,8 @@ export class LlamaCpp {
         // Truncate documents that would exceed the rerank context size.
         // Budget = contextSize - template overhead - query tokens
         const queryTokens = model.tokenize(query).length;
-        const rawMaxDocTokens = this.rerankContextSize - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
+        const effectiveRerankContextSize = await this.effectiveRerankContextSize();
+        const rawMaxDocTokens = effectiveRerankContextSize - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
         // Guard against non-positive budget (e.g., very long queries or CJK content with
         // high token density). Allow at minimum 128 tokens per document to avoid crashes.
         const maxDocTokens = Math.max(128, rawMaxDocTokens);
@@ -952,7 +1139,7 @@ class LLMSessionManager {
     operationEnd() {
         this._inFlightOperations = Math.max(0, this._inFlightOperations - 1);
     }
-    getLlamaCpp() {
+    getLLM() {
         return this.llm;
     }
 }
@@ -1042,26 +1229,26 @@ class LLMSession {
         }
     }
     async embed(text, options) {
-        return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
+        return this.withOperation(() => this.manager.getLLM().embed(text, options));
     }
     async embedBatch(texts) {
-        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
+        return this.withOperation(() => this.manager.getLLM().embedBatch(texts));
     }
     async expandQuery(query, options) {
-        return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
+        return this.withOperation(() => this.manager.getLLM().expandQuery(query, options));
     }
     async rerank(query, documents, options) {
-        return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
+        return this.withOperation(() => this.manager.getLLM().rerank(query, documents, options));
     }
 }
-// Session manager for the default LlamaCpp instance
+// Session manager for the default LLM instance
 let defaultSessionManager = null;
 /**
- * Get the session manager for the default LlamaCpp instance.
+ * Get the session manager for the default LLM instance.
  */
 function getSessionManager() {
-    const llm = getDefaultLlamaCpp();
-    if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) {
+    const llm = getDefaultLLM();
+    if (!defaultSessionManager || defaultSessionManager.getLLM() !== llm) {
         defaultSessionManager = new LLMSessionManager(llm);
     }
     return defaultSessionManager;
@@ -1100,32 +1287,37 @@ export function canUnloadLLM() {
     return defaultSessionManager.canUnload();
 }
 // =============================================================================
-// Singleton for default LlamaCpp instance
+// Singleton for default LLM instance
 // =============================================================================
-let defaultLlamaCpp = null;
+let defaultLLM = null;
 /**
- * Get the default LlamaCpp instance (creates one if needed)
+ * Get the default LLM instance (creates one if needed)
  */
-export function getDefaultLlamaCpp() {
-    if (!defaultLlamaCpp) {
-        const embedModel = process.env.KINDX_EMBED_MODEL;
-        defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
+export function getDefaultLLM() {
+    if (!defaultLLM) {
+        if (process.env.KINDX_LLM_BACKEND === "remote") {
+            defaultLLM = new RemoteLLM();
+        }
+        else {
+            const embedModel = process.env.KINDX_EMBED_MODEL;
+            defaultLLM = new LlamaCpp(embedModel ? { embedModel } : {});
+        }
     }
-    return defaultLlamaCpp;
+    return defaultLLM;
 }
 /**
- * Set a custom default LlamaCpp instance (useful for testing)
+ * Set a custom default LLM instance (useful for testing)
  */
-export function setDefaultLlamaCpp(llm) {
-    defaultLlamaCpp = llm;
+export function setDefaultLLM(llm) {
+    defaultLLM = llm;
 }
 /**
- * Dispose the default LlamaCpp instance if it exists.
+ * Dispose the default LLM instance if it exists.
  * Call this before process exit to prevent NAPI crashes.
  */
-export async function disposeDefaultLlamaCpp() {
-    if (defaultLlamaCpp) {
-        await defaultLlamaCpp.dispose();
-        defaultLlamaCpp = null;
+export async function disposeDefaultLLM() {
+    if (defaultLLM) {
+        await defaultLLM.dispose();
+        defaultLLM = null;
     }
 }