npm - @tobilu/qmd - Versions diffs - 2.0.1 → 2.1.0 - Mend

@tobilu/qmd 2.0.1 → 2.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (25)

package/dist/llm.js CHANGED Viewed

@@ -48,7 +48,7 @@ export function formatDocForEmbedding(text, title, modelUri) {
 // HuggingFace model URIs for node-llama-cpp
 // Format: hf:<user>/<repo>/<file>
 // Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
-const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@@ -61,7 +61,9 @@ export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
 export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
 export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
 // Local model cache directory
-const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
+const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
+    ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
+    : join(homedir(), ".cache", "qmd", "models");
 export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
 function parseHfUri(model) {
     if (!model.startsWith("hf:"))
@@ -187,14 +189,17 @@ export class LlamaCpp {
     // Track disposal state to prevent double-dispose
     disposed = false;
     constructor(config = {}) {
-        this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
-        this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
-        this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
+        this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
+        this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
+        this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
         this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
         this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
         this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
         this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
     }
+    get embedModelName() {
+        return this.embedModelUri;
+    }
     /**
      * Reset the inactivity timer. Called after each model operation.
      * When timer fires, models are unloaded to free memory (if no active sessions).
@@ -289,11 +294,29 @@ export class LlamaCpp {
      */
     async ensureLlama() {
         if (!this.llama) {
-            const llama = await getLlama({
-                // attempt to build
+            // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
+            const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
+            const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
+            const loadLlama = async (gpu) => await getLlama({
                 build: "autoAttempt",
-                logLevel: LlamaLogLevel.error
+                logLevel: LlamaLogLevel.error,
+                gpu,
             });
+            let llama;
+            if (forceCpu) {
+                llama = await loadLlama(false);
+            }
+            else {
+                try {
+                    llama = await loadLlama("auto");
+                }
+                catch (err) {
+                    // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
+                    // Fall back to CPU so qmd still works.
+                    process.stderr.write(`QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
+                    llama = await loadLlama(false);
+                }
+            }
             if (llama.gpu === false) {
                 process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
             }
@@ -394,6 +417,7 @@ export class LlamaCpp {
             for (let i = 0; i < n; i++) {
                 try {
                     this.embedContexts.push(await model.createEmbeddingContext({
+                        contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
                         ...(threads > 0 ? { threads } : {}),
                     }));
                 }
@@ -484,9 +508,20 @@ export class LlamaCpp {
      * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
      */
     // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
-    // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
-    // Use 2048 for safety margin. Still 17× less than auto (40960).
-    static RERANK_CONTEXT_SIZE = 2048;
+    // Default 2048 was too small for longer documents (e.g. session transcripts,
+    // CJK text, or large markdown files) — callers hit "input lengths exceed
+    // context size" errors even after truncation because the overhead estimate
+    // was insufficient.  4096 comfortably fits the largest real-world chunks
+    // while staying well below the 40 960-token auto size.
+    // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
+    static RERANK_CONTEXT_SIZE = (() => {
+        const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
+        return Number.isFinite(v) && v > 0 ? v : 4096;
+    })();
+    static EMBED_CONTEXT_SIZE = (() => {
+        const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
+        return Number.isFinite(v) && v > 0 ? v : 2048;
+    })();
     async ensureRerankContexts() {
         if (this.rerankContexts.length === 0) {
             const model = await this.ensureRerankModel();
@@ -555,15 +590,41 @@ export class LlamaCpp {
     // ==========================================================================
     // Core API methods
     // ==========================================================================
+    /**
+     * Truncate text to fit within the embedding model's context window.
+     * Uses the model's own tokenizer for accurate token counting, then
+     * detokenizes back to text if truncation is needed.
+     * Returns the (possibly truncated) text and whether truncation occurred.
+     */
+    async truncateToContextSize(text) {
+        if (!this.embedModel)
+            return { text, truncated: false };
+        const maxTokens = this.embedModel.trainContextSize;
+        if (maxTokens <= 0)
+            return { text, truncated: false };
+        const tokens = this.embedModel.tokenize(text);
+        if (tokens.length <= maxTokens)
+            return { text, truncated: false };
+        // Leave a small margin (4 tokens) for BOS/EOS overhead
+        const safeLimit = Math.max(1, maxTokens - 4);
+        const truncatedTokens = tokens.slice(0, safeLimit);
+        const truncatedText = this.embedModel.detokenize(truncatedTokens);
+        return { text: truncatedText, truncated: true };
+    }
     async embed(text, options = {}) {
         // Ping activity at start to keep models alive during this operation
         this.touchActivity();
         try {
             const context = await this.ensureEmbedContext();
-            const embedding = await context.getEmbeddingFor(text);
+            // Guard: truncate text that exceeds model context window to prevent GGML crash
+            const { text: safeText, truncated } = await this.truncateToContextSize(text);
+            if (truncated) {
+                console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
+            }
+            const embedding = await context.getEmbeddingFor(safeText);
             return {
                 embedding: Array.from(embedding.vector),
-                model: this.embedModelUri,
+                model: options.model ?? this.embedModelUri,
             };
         }
         catch (error) {
@@ -575,7 +636,7 @@ export class LlamaCpp {
      * Batch embed multiple texts efficiently
      * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
      */
-    async embedBatch(texts) {
+    async embedBatch(texts, options = {}) {
         if (this._ciMode)
             throw new Error("LLM operations are disabled in CI (set CI=true)");
         // Ping activity at start to keep models alive during this operation
@@ -591,9 +652,13 @@ export class LlamaCpp {
                 const embeddings = [];
                 for (const text of texts) {
                     try {
-                        const embedding = await context.getEmbeddingFor(text);
+                        const { text: safeText, truncated } = await this.truncateToContextSize(text);
+                        if (truncated) {
+                            console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
+                        }
+                        const embedding = await context.getEmbeddingFor(safeText);
                         this.touchActivity();
-                        embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
+                        embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
                     }
                     catch (err) {
                         console.error("Embedding error for text:", err);
@@ -610,9 +675,13 @@ export class LlamaCpp {
                 const results = [];
                 for (const text of chunk) {
                     try {
-                        const embedding = await ctx.getEmbeddingFor(text);
+                        const { text: safeText, truncated } = await this.truncateToContextSize(text);
+                        if (truncated) {
+                            console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
+                        }
+                        const embedding = await ctx.getEmbeddingFor(safeText);
                         this.touchActivity();
-                        results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
+                        results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
                     }
                     catch (err) {
                         console.error("Embedding error for text:", err);
@@ -767,8 +836,10 @@ export class LlamaCpp {
             await genContext.dispose();
         }
     }
-    // Qwen3 reranker chat template overhead (system prompt, tags, separators)
-    static RERANK_TEMPLATE_OVERHEAD = 200;
+    // Qwen3 reranker chat template overhead (system prompt, tags, separators).
+    // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
+    // the truncation budget never lets a document slip past the context limit.
+    static RERANK_TEMPLATE_OVERHEAD = 512;
     static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
     async rerank(query, documents, options = {}) {
         if (this._ciMode)
@@ -1028,8 +1099,8 @@ class LLMSession {
     async embed(text, options) {
         return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
     }
-    async embedBatch(texts) {
-        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
+    async embedBatch(texts, options) {
+        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
     }
     async expandQuery(query, options) {
         return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
@@ -1106,8 +1177,7 @@ let defaultLlamaCpp = null;
  */
 export function getDefaultLlamaCpp() {
     if (!defaultLlamaCpp) {
-        const embedModel = process.env.QMD_EMBED_MODEL;
-        defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
+        defaultLlamaCpp = new LlamaCpp();
     }
     return defaultLlamaCpp;
 }

package/dist/mcp/server.js CHANGED Viewed

@@ -8,13 +8,17 @@
  */
 import { createServer } from "node:http";
 import { randomUUID } from "node:crypto";
+import { readFileSync } from "node:fs";
+import { join, dirname } from "node:path";
 import { fileURLToPath } from "url";
 import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
 import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
 import { isInitializeRequest } from "@modelcontextprotocol/sdk/types.js";
 import { z } from "zod";
+import { existsSync } from "fs";
 import { createStore, extractSnippet, addLineNumbers, getDefaultDbPath, DEFAULT_MULTI_GET_MAX_BYTES, } from "../index.js";
+import { getConfigPath } from "../collections.js";
 // =============================================================================
 // Helper functions
 // =============================================================================
@@ -39,6 +43,16 @@ function formatSearchSummary(results, query) {
     }
     return lines.join('\n');
 }
+function getPackageVersion() {
+    try {
+        const pkgPath = join(dirname(fileURLToPath(import.meta.url)), "../../package.json");
+        const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
+        return pkg.version ?? "unknown";
+    }
+    catch {
+        return "unknown";
+    }
+}
 // =============================================================================
 // MCP Server
 // =============================================================================
@@ -108,7 +122,7 @@ async function buildInstructions(store) {
  * Shared by both stdio and HTTP transports.
  */
 async function createMcpServer(store) {
-    const server = new McpServer({ name: "qmd", version: "0.9.9" }, { instructions: await buildInstructions(store) });
+    const server = new McpServer({ name: "qmd", version: getPackageVersion() }, { instructions: await buildInstructions(store) });
     // Pre-fetch default collection names for search tools
     const defaultCollectionNames = await store.getDefaultCollectionNames();
     // ---------------------------------------------------------------------------
@@ -218,8 +232,9 @@ Intent-aware lex (C++ performance, not sports):
             candidateLimit: z.number().optional().describe("Maximum candidates to rerank (default: 40, lower = faster but may miss results)"),
             collections: z.array(z.string()).optional().describe("Filter to collections (OR match)"),
             intent: z.string().optional().describe("Background context to disambiguate the query. Example: query='performance', intent='web page load times and Core Web Vitals'. Does not search on its own."),
+            rerank: z.boolean().optional().default(true).describe("Rerank results using LLM (default: true). Set to false for faster results on CPU-only machines."),
         },
-    }, async ({ searches, limit, minScore, candidateLimit, collections, intent }) => {
+    }, async ({ searches, limit, minScore, candidateLimit, collections, intent, rerank }) => {
         // Map to internal format
         const queries = searches.map(s => ({
             type: s.type,
@@ -232,6 +247,7 @@ Intent-aware lex (C++ performance, not sports):
             collections: effectiveCollections.length > 0 ? effectiveCollections : undefined,
             limit,
             minScore,
+            rerank,
             intent,
         });
         // Use first lex or vec query for snippet extraction
@@ -387,7 +403,7 @@ Intent-aware lex (C++ performance, not sports):
             `  Collections: ${status.collections.length}`,
         ];
         for (const col of status.collections) {
-            summary.push(`    - ${col.path} (${col.documents} docs)`);
+            summary.push(`    - ${col.name}: ${col.path} (${col.documents} docs)`);
         }
         return {
             content: [{ type: "text", text: summary.join('\n') }],
@@ -400,7 +416,11 @@ Intent-aware lex (C++ performance, not sports):
 // Transport: stdio (default)
 // =============================================================================
 export async function startMcpServer() {
-    const store = await createStore({ dbPath: getDefaultDbPath() });
+    const configPath = getConfigPath();
+    const store = await createStore({
+        dbPath: getDefaultDbPath(),
+        ...(existsSync(configPath) ? { configPath } : {}),
+    });
     const server = await createMcpServer(store);
     const transport = new StdioServerTransport();
     await server.connect(transport);
@@ -410,7 +430,11 @@ export async function startMcpServer() {
  * Binds to localhost only. Returns a handle for shutdown and port discovery.
  */
 export async function startMcpHttpServer(port, options) {
-    const store = await createStore({ dbPath: getDefaultDbPath() });
+    const configPath = getConfigPath();
+    const store = await createStore({
+        dbPath: getDefaultDbPath(),
+        ...(existsSync(configPath) ? { configPath } : {}),
+    });
     // Pre-fetch default collection names for REST endpoint
     const defaultCollectionNames = await store.getDefaultCollectionNames();
     // Session map: each client gets its own McpServer + Transport pair (MCP spec requirement).

package/dist/store.d.ts CHANGED Viewed

@@ -18,6 +18,8 @@ export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b
 export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
 export declare const DEFAULT_GLOB = "**/*.md";
 export declare const DEFAULT_MULTI_GET_MAX_BYTES: number;
+export declare const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
+export declare const DEFAULT_EMBED_MAX_BATCH_BYTES: number;
 export declare const CHUNK_SIZE_TOKENS = 900;
 export declare const CHUNK_OVERLAP_TOKENS: number;
 export declare const CHUNK_SIZE_CHARS: number;
@@ -76,6 +78,20 @@ export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]
  * @returns The best position to cut at
  */
 export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
+export type ChunkStrategy = "auto" | "regex";
+/**
+ * Merge two sets of break points (e.g. regex + AST), keeping the highest
+ * score at each position. Result is sorted by position.
+ */
+export declare function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[];
+/**
+ * Core chunk algorithm that operates on precomputed break points and code fences.
+ * This is the shared implementation used by both regex-only and AST-aware chunking.
+ */
+export declare function chunkDocumentWithBreakPoints(content: string, breakPoints: BreakPoint[], codeFences: CodeFenceRegion[], maxChars?: number, overlapChars?: number, windowChars?: number): {
+    text: string;
+    pos: number;
+}[];
 export declare const STRONG_SIGNAL_MIN_SCORE = 0.85;
 export declare const STRONG_SIGNAL_MIN_GAP = 0.15;
 export declare const RERANK_CANDIDATE_LIMIT = 40;
@@ -118,6 +134,8 @@ export declare function normalizePathSeparators(path: string): string;
 export declare function getRelativePathFromPrefix(path: string, prefix: string): string | null;
 export declare function resolve(...paths: string[]): string;
 export declare function enableProductionMode(): void;
+/** Reset production mode flag — only for testing. */
+export declare function _resetProductionModeForTesting(): void;
 export declare function getDefaultDbPath(indexName?: string): string;
 export declare function getPwd(): string;
 export declare function getRealPath(path: string): string;
@@ -311,16 +329,20 @@ export type EmbedResult = {
     errors: number;
     durationMs: number;
 };
+export type EmbedOptions = {
+    force?: boolean;
+    model?: string;
+    maxDocsPerBatch?: number;
+    maxBatchBytes?: number;
+    chunkStrategy?: ChunkStrategy;
+    onProgress?: (info: EmbedProgress) => void;
+};
 /**
  * Generate vector embeddings for documents that need them.
  * Pure function — no console output, no db lifecycle management.
  * Uses the store's LlamaCpp instance if set, otherwise the global singleton.
  */
-export declare function generateEmbeddings(store: Store, options?: {
-    force?: boolean;
-    model?: string;
-    onProgress?: (info: EmbedProgress) => void;
-}): Promise<EmbedResult>;
+export declare function generateEmbeddings(store: Store, options?: EmbedOptions): Promise<EmbedResult>;
 /**
  * Create a new store instance with the given database path.
  * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
@@ -505,15 +527,34 @@ export declare function deactivateDocument(db: Database, collectionName: string,
  */
 export declare function getActiveDocumentPaths(db: Database, collectionName: string): string[];
 export { formatQueryForEmbedding, formatDocForEmbedding };
+/**
+ * Chunk a document using regex-only break point detection.
+ * This is the sync, backward-compatible API used by tests and legacy callers.
+ */
 export declare function chunkDocument(content: string, maxChars?: number, overlapChars?: number, windowChars?: number): {
     text: string;
     pos: number;
 }[];
+/**
+ * Async AST-aware chunking. Detects language from filepath, computes AST
+ * break points for supported code files, merges with regex break points,
+ * and delegates to the shared chunk algorithm.
+ *
+ * Falls back to regex-only when strategy is "regex", filepath is absent,
+ * or language is unsupported.
+ */
+export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
+    text: string;
+    pos: number;
+}[]>;
 /**
  * Chunk a document by actual token count using the LLM tokenizer.
  * More accurate than character-based chunking but requires async.
+ *
+ * When filepath and chunkStrategy are provided, uses AST-aware break points
+ * for supported code files.
  */
-export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number): Promise<{
+export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal): Promise<{
     text: string;
     pos: number;
     tokens: number;
@@ -640,6 +681,7 @@ export declare function getCollectionsWithoutContext(db: Database): {
  * Useful for suggesting where context might be needed.
  */
 export declare function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[];
+export declare function sanitizeFTS5Term(term: string): string;
 /**
  * Validate that a vec/hyde query doesn't use lex-only syntax.
  * Returns error message if invalid, null if valid.
@@ -665,6 +707,12 @@ export declare function clearAllEmbeddings(db: Database): void;
 /**
  * Insert a single embedding into both content_vectors and vectors_vec tables.
  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
+ *
+ * content_vectors is inserted first so that getHashesForEmbedding (which checks
+ * only content_vectors) won't re-select the hash on a crash between the two inserts.
+ *
+ * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
+ * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
  */
 export declare function insertEmbedding(db: Database, hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string): void;
 export declare function expandQuery(query: string, model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]>;
@@ -763,6 +811,7 @@ export interface HybridQueryOptions {
     explain?: boolean;
     intent?: string;
     skipRerank?: boolean;
+    chunkStrategy?: ChunkStrategy;
     hooks?: SearchHooks;
 }
 export interface HybridQueryResult {
@@ -836,6 +885,7 @@ export interface StructuredSearchOptions {
     intent?: string;
     /** Skip LLM reranking, use only RRF scores */
     skipRerank?: boolean;
+    chunkStrategy?: ChunkStrategy;
     hooks?: SearchHooks;
 }
 /**