npm - @geminilight/mindos - Versions diffs - 0.6.32 → 0.6.33 - Mend

@geminilight/mindos 0.6.32 → 0.6.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/app/app/api/ask/route.ts +69 -29
package/app/app/api/graph/route.ts +5 -76
package/app/app/trash/page.tsx +1 -0
package/app/app/view/[...path]/ViewPageClient.tsx +22 -8
package/app/components/ExportModal.tsx +2 -2
package/app/components/FileTree.tsx +26 -5
package/app/components/HomeContent.tsx +4 -0
package/app/components/SystemPulse.tsx +318 -0
package/app/components/TrashPageClient.tsx +9 -9
package/app/components/agents/AgentsSkillsSection.tsx +173 -102
package/app/components/ui/Toaster.tsx +11 -2
package/app/lib/actions.ts +20 -9
package/app/lib/agent/context.ts +22 -11
package/app/lib/agent/loop-detection.ts +52 -0
package/app/lib/agent/retry.ts +19 -0
package/app/lib/core/backlinks.ts +33 -9
package/app/lib/core/index.ts +4 -1
package/app/lib/core/link-index.ts +224 -0
package/app/lib/core/search-index.ts +310 -14
package/app/lib/core/search.ts +180 -29
package/app/lib/fs.ts +67 -10
package/app/lib/hooks/usePinnedFiles.ts +7 -2
package/app/lib/i18n/modules/knowledge.ts +62 -0
package/app/lib/toast.ts +7 -1
package/app/next-env.d.ts +1 -1
package/app/package.json +2 -0
package/package.json +1 -1
package/scripts/parse-syncinclude.sh +92 -0
package/scripts/write-build-stamp.js +40 -0

package/app/lib/core/search-index.ts CHANGED Viewed

@@ -1,3 +1,5 @@
+import fs from 'fs';
+import path from 'path';
 import { collectAllFiles } from './tree';
 import { readFile } from './fs-ops';
@@ -6,11 +8,18 @@ const MAX_CONTENT_LENGTH = 50_000;
 // CJK Unicode ranges: Chinese, Japanese Hiragana/Katakana, Korean
 const CJK_REGEX = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/;
+// Intl.Segmenter for proper CJK word segmentation (available in Node 16+)
+const zhSegmenter = typeof Intl !== 'undefined' && Intl.Segmenter
+  ? new Intl.Segmenter('zh', { granularity: 'word' })
+  : null;
 /**
- * Tokenize text for indexing: split on word boundaries + CJK bigrams.
+ * Tokenize text for indexing: split on word boundaries + CJK word segmentation.
  *
  * Latin/ASCII: split on non-alphanumeric characters, lowercased.
- * CJK: generate character-level bigrams (overlapping pairs).
+ * CJK: uses Intl.Segmenter for proper word boundaries (e.g. "知识管理"
+ *   → ["知识", "管理"] instead of bigrams ["知识", "识管", "管理"]).
+ *   Falls back to bigrams if Intl.Segmenter is unavailable.
  * Mixed text: both strategies applied, tokens merged.
  */
 function tokenize(text: string): Set<string> {
@@ -27,27 +36,42 @@ function tokenize(text: string): Set<string> {
     }
   }
-  // CJK bigrams + single chars (unigrams carry meaning in CJK scripts)
+  // CJK word segmentation
   if (CJK_REGEX.test(lower)) {
-    const cjkChars: string[] = [];
-    for (const ch of lower) {
-      if (CJK_REGEX.test(ch)) {
-        cjkChars.push(ch);
-      } else {
-        // Emit bigrams for accumulated CJK run
-        if (cjkChars.length > 0) {
-          emitCjkTokens(cjkChars, tokens);
-          cjkChars.length = 0;
+    if (zhSegmenter) {
+      // Intl.Segmenter: proper word boundaries
+      for (const { segment, isWordLike } of zhSegmenter.segment(lower)) {
+        if (!isWordLike) continue;
+        const word = segment.trim();
+        if (!word) continue;
+        tokens.add(word);
+        // Also add individual CJK characters as unigrams (for single-char queries)
+        for (const ch of word) {
+          if (CJK_REGEX.test(ch)) tokens.add(ch);
+        }
+      }
+    } else {
+      // Fallback: bigrams + unigrams
+      const cjkChars: string[] = [];
+      for (const ch of lower) {
+        if (CJK_REGEX.test(ch)) {
+          cjkChars.push(ch);
+        } else {
+          if (cjkChars.length > 0) {
+            emitCjkBigrams(cjkChars, tokens);
+            cjkChars.length = 0;
+          }
         }
       }
+      if (cjkChars.length > 0) emitCjkBigrams(cjkChars, tokens);
     }
-    if (cjkChars.length > 0) emitCjkTokens(cjkChars, tokens);
   }
   return tokens;
 }
-function emitCjkTokens(chars: string[], tokens: Set<string>): void {
+/** Fallback CJK tokenizer: bigrams + unigrams (when Intl.Segmenter unavailable) */
+function emitCjkBigrams(chars: string[], tokens: Set<string>): void {
   for (let i = 0; i < chars.length; i++) {
     tokens.add(chars[i]); // unigram
     if (i + 1 < chars.length) {
@@ -73,10 +97,19 @@ export class SearchIndex {
   private builtForRoot: string | null = null;
   private fileCount = 0;
+  /** BM25 statistics — populated during rebuild() */
+  private docLengths = new Map<string, number>();  // filePath → char count
+  private totalChars = 0;
+  /** Reverse mapping: filePath → Set<token> for efficient removeFile. */
+  private fileTokens = new Map<string, Set<string>>();
   /** Full rebuild: read all files and build inverted index. */
   rebuild(mindRoot: string): void {
     const allFiles = collectAllFiles(mindRoot);
     const inverted = new Map<string, Set<string>>();
+    const docLengths = new Map<string, number>();
+    const fileTokensMap = new Map<string, Set<string>>();
+    let totalChars = 0;
     for (const filePath of allFiles) {
       let content: string;
@@ -86,6 +119,10 @@ export class SearchIndex {
         continue;
       }
+      // Store original length for BM25 before truncation
+      docLengths.set(filePath, content.length);
+      totalChars += content.length;
       if (content.length > MAX_CONTENT_LENGTH) {
         content = content.slice(0, MAX_CONTENT_LENGTH);
       }
@@ -93,6 +130,7 @@ export class SearchIndex {
       // Also index the file path itself
       const allText = filePath + '\n' + content;
       const tokens = tokenize(allText);
+      fileTokensMap.set(filePath, tokens);
       for (const token of tokens) {
         let set = inverted.get(token);
@@ -107,6 +145,9 @@ export class SearchIndex {
     this.invertedIndex = inverted;
     this.builtForRoot = mindRoot;
     this.fileCount = allFiles.length;
+    this.docLengths = docLengths;
+    this.totalChars = totalChars;
+    this.fileTokens = fileTokensMap;
   }
   /** Clear the index. Next search will trigger a lazy rebuild. */
@@ -114,6 +155,77 @@ export class SearchIndex {
     this.invertedIndex = null;
     this.builtForRoot = null;
     this.fileCount = 0;
+    this.docLengths.clear();
+    this.totalChars = 0;
+    this.fileTokens.clear();
+  }
+  // ── Incremental updates ──────────────────────────────────────────────
+  /**
+   * Remove a single file from the index (e.g. after deletion).
+   * O(tokens-in-file) — much faster than full rebuild.
+   */
+  removeFile(filePath: string): void {
+    if (!this.invertedIndex) return;
+    // Use reverse mapping for O(tokens-in-file) instead of O(all-tokens)
+    const tokens = this.fileTokens.get(filePath);
+    if (tokens) {
+      for (const token of tokens) {
+        this.invertedIndex.get(token)?.delete(filePath);
+      }
+      this.fileTokens.delete(filePath);
+    }
+    // Update BM25 stats
+    const oldLen = this.docLengths.get(filePath) ?? 0;
+    this.totalChars -= oldLen;
+    this.docLengths.delete(filePath);
+    this.fileCount = Math.max(0, this.fileCount - 1);
+  }
+  /**
+   * Add a new file to the index (e.g. after creation).
+   * O(tokens-in-file) — much faster than full rebuild.
+   */
+  addFile(mindRoot: string, filePath: string): void {
+    if (!this.invertedIndex) return;
+    let content: string;
+    try { content = readFile(mindRoot, filePath); } catch { return; }
+    // Update BM25 stats
+    this.docLengths.set(filePath, content.length);
+    this.totalChars += content.length;
+    this.fileCount++;
+    // Index tokens
+    if (content.length > MAX_CONTENT_LENGTH) {
+      content = content.slice(0, MAX_CONTENT_LENGTH);
+    }
+    const allText = filePath + '\n' + content;
+    const tokens = tokenize(allText);
+    this.fileTokens.set(filePath, tokens);
+    for (const token of tokens) {
+      let set = this.invertedIndex.get(token);
+      if (!set) {
+        set = new Set<string>();
+        this.invertedIndex.set(token, set);
+      }
+      set.add(filePath);
+    }
+  }
+  /**
+   * Re-index a single file after modification.
+   * Implemented as removeFile followed by addFile; each step touches only this file's tokens.
+   */
+  updateFile(mindRoot: string, filePath: string): void {
+    if (!this.invertedIndex) return;
+    this.removeFile(filePath);
+    this.addFile(mindRoot, filePath);
   }
   /** Whether the index has been built for the given mindRoot. */
@@ -131,6 +243,66 @@ export class SearchIndex {
     return this.fileCount;
   }
+  /** Average document length in chars. */
+  getAvgDocLength(): number {
+    return this.fileCount > 0 ? this.totalChars / this.fileCount : 0;
+  }
+  /** Character count of a specific document. Returns 0 if unknown. */
+  getDocLength(filePath: string): number {
+    return this.docLengths.get(filePath) ?? 0;
+  }
+  /** Number of documents containing a specific token (document frequency). */
+  getDocFrequency(token: string): number {
+    if (!this.invertedIndex) return 0;
+    return this.invertedIndex.get(token)?.size ?? 0;
+  }
+  /**
+   * Get candidates via UNION of token sets (for BM25 multi-term scoring).
+   * Unlike getCandidates (intersection), this returns any file matching any token.
+   *
+   * Optimization: when the query produces three or more tokens (common with CJK
+   * text, where each segmented word also contributes its single characters — or
+   * bigrams in the fallback tokenizer), the number of distinct query tokens each
+   * file matches is counted. Files matching fewer than half the tokens (rounded
+   * down) are pruned — unless that would leave zero results, in which case all
+   * matching files are returned. This prevents CJK token explosion from creating
+   * massive candidate sets full of low-quality matches.
+   */
+  getCandidatesUnion(query: string): string[] | null {
+    if (!query.trim()) return null;
+    if (!this.invertedIndex) return null;
+    const tokens = tokenize(query.toLowerCase().trim());
+    if (tokens.size === 0) return null;
+    // Count how many query tokens each file matches
+    const hitCount = new Map<string, number>();
+    for (const token of tokens) {
+      const set = this.invertedIndex.get(token);
+      if (set) {
+        for (const filePath of set) {
+          hitCount.set(filePath, (hitCount.get(filePath) ?? 0) + 1);
+        }
+      }
+    }
+    if (hitCount.size === 0) return [];
+    // When query has many tokens (e.g. CJK bigrams), prune low-overlap files
+    const tokenCount = tokens.size;
+    if (tokenCount >= 3) {
+      const threshold = Math.max(1, Math.floor(tokenCount / 2));
+      const filtered = [...hitCount.entries()]
+        .filter(([, count]) => count >= threshold)
+        .map(([path]) => path);
+      // Only apply pruning if it doesn't eliminate everything
+      if (filtered.length > 0) return filtered;
+    }
+    return [...hitCount.keys()];
+  }
   /**
    * Get candidate file paths for a query (single or multi-word).
    *
@@ -171,4 +343,128 @@ export class SearchIndex {
     return result ? Array.from(result) : [];
   }
+  // ── Persistence ──────────────────────────────────────────────────────
+  /**
+   * Serialize the index to a JSON file for persistence across restarts.
+   * Stored at `<mindosDir>/search-index.json`.
+   */
+  persist(mindosDir: string): void {
+    if (!this.invertedIndex) return;
+    const data: PersistedIndex = {
+      version: 1,
+      builtForRoot: this.builtForRoot ?? '',
+      fileCount: this.fileCount,
+      totalChars: this.totalChars,
+      docLengths: Object.fromEntries(this.docLengths),
+      invertedIndex: {},
+      timestamp: Date.now(),
+    };
+    for (const [token, fileSet] of this.invertedIndex) {
+      data.invertedIndex[token] = [...fileSet];
+    }
+    const filePath = path.join(mindosDir, 'search-index.json');
+    try {
+      fs.mkdirSync(mindosDir, { recursive: true });
+      fs.writeFileSync(filePath, JSON.stringify(data), 'utf-8');
+    } catch {
+      // Non-critical — index will be rebuilt on next search
+    }
+  }
+  /**
+   * Load a previously persisted index from disk.
+   * Returns true if loaded successfully, false if stale/missing/corrupt.
+   *
+   * Staleness checks (all must pass):
+   * 1. Version and mindRoot match
+   * 2. Actual file count on disk matches indexed file count (detects adds/deletes)
+   * 3. No sampled file's mtime is newer than the persisted timestamp
+   */
+  load(mindosDir: string, mindRoot: string): boolean {
+    const filePath = path.join(mindosDir, 'search-index.json');
+    let raw: string;
+    try { raw = fs.readFileSync(filePath, 'utf-8'); } catch { return false; }
+    let data: PersistedIndex;
+    try { data = JSON.parse(raw); } catch { return false; }
+    if (data.version !== 1 || data.builtForRoot !== mindRoot) return false;
+    // File-count check: file count on disk must match indexed count
+    // This catches new files created or files deleted while process was down
+    const currentFiles = collectAllFiles(mindRoot);
+    if (currentFiles.length !== data.fileCount) return false;
+    // Mtime check — stat every file if ≤50, otherwise an evenly spaced sample plus the last 10
+    const docPaths = Object.keys(data.docLengths);
+    const sampleSize = Math.min(50, docPaths.length);
+    if (sampleSize === docPaths.length) {
+      // Small index: check all files
+      for (const dp of docPaths) {
+        try {
+          const stat = fs.statSync(path.join(mindRoot, dp));
+          if (stat.mtimeMs > data.timestamp) return false;
+        } catch {
+          return false; // file deleted
+        }
+      }
+    } else {
+      // Large index: sample evenly + always check the last few (most likely to be recent)
+      const step = Math.max(1, Math.floor(docPaths.length / 40));
+      const sampled = new Set<number>();
+      // Evenly spaced samples
+      for (let i = 0; i < docPaths.length; i += step) sampled.add(i);
+      // Always check the last 10 files (most recently added to the index)
+      for (let i = Math.max(0, docPaths.length - 10); i < docPaths.length; i++) sampled.add(i);
+      for (const idx of sampled) {
+        try {
+          const stat = fs.statSync(path.join(mindRoot, docPaths[idx]));
+          if (stat.mtimeMs > data.timestamp) return false;
+        } catch {
+          return false;
+        }
+      }
+    }
+    // Restore state
+    this.builtForRoot = data.builtForRoot;
+    this.fileCount = data.fileCount;
+    this.totalChars = data.totalChars;
+    this.docLengths = new Map(Object.entries(data.docLengths).map(([k, v]) => [k, v as number]));
+    const inverted = new Map<string, Set<string>>();
+    const fileTokensMap = new Map<string, Set<string>>();
+    for (const [token, files] of Object.entries(data.invertedIndex)) {
+      const fileSet = new Set(files as string[]);
+      inverted.set(token, fileSet);
+      // Rebuild reverse mapping
+      for (const f of fileSet) {
+        let tokens = fileTokensMap.get(f);
+        if (!tokens) { tokens = new Set(); fileTokensMap.set(f, tokens); }
+        tokens.add(token);
+      }
+    }
+    this.invertedIndex = inverted;
+    this.fileTokens = fileTokensMap;
+    return true;
+  }
+}
+/** Shape of the persisted index JSON. */
+interface PersistedIndex {
+  version: number;
+  builtForRoot: string;
+  fileCount: number;
+  totalChars: number;
+  docLengths: Record<string, number>;
+  invertedIndex: Record<string, string[]>;
+  timestamp: number;
 }

package/app/lib/core/search.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import fs from 'fs';
 import path from 'path';
+import os from 'os';
 import { collectAllFiles } from './tree';
 import { readFile } from './fs-ops';
 import { SearchIndex } from './search-index';
@@ -11,24 +12,122 @@ import type { SearchResult, SearchOptions } from './types';
  */
 const searchIndex = new SearchIndex();
+/** Path to ~/.mindos/ for index persistence. */
+function getMindosDir(): string {
+  return path.join(os.homedir(), '.mindos');
+}
 /** Invalidate the core search index. Called from `lib/fs.ts` on write operations. */
 export function invalidateSearchIndex(): void {
   searchIndex.invalidate();
 }
+/** Incrementally update a single file in the search index (after write/edit). */
+export function updateSearchIndexFile(mindRoot: string, filePath: string): void {
+  if (!searchIndex.isBuilt()) return;
+  searchIndex.updateFile(mindRoot, filePath);
+  schedulePersist();
+}
+/** Incrementally add a new file to the search index (after create). */
+export function addSearchIndexFile(mindRoot: string, filePath: string): void {
+  if (!searchIndex.isBuilt()) return;
+  searchIndex.addFile(mindRoot, filePath);
+  schedulePersist();
+}
+/** Incrementally remove a file from the search index (after delete). */
+export function removeSearchIndexFile(filePath: string): void {
+  if (!searchIndex.isBuilt()) return;
+  searchIndex.removeFile(filePath);
+  schedulePersist();
+}
+/** Debounced persist — writes index to disk 5s after last write operation. */
+let _persistTimer: ReturnType<typeof setTimeout> | null = null;
+let _persistDirty = false;
+function schedulePersist(): void {
+  _persistDirty = true;
+  if (_persistTimer) clearTimeout(_persistTimer);
+  _persistTimer = setTimeout(flushPersist, 5000);
+}
+/** Immediately flush pending index to disk (used by the debounce timer and exit hooks). */
+function flushPersist(): void {
+  if (_persistTimer) { clearTimeout(_persistTimer); _persistTimer = null; }
+  if (!_persistDirty) return;
+  _persistDirty = false;
+  try { searchIndex.persist(getMindosDir()); } catch { /* non-critical */ }
+}
+// Ensure index is persisted before process exits
+if (typeof process !== 'undefined') {
+  process.on('beforeExit', flushPersist);
+  process.on('SIGTERM', () => { flushPersist(); process.exit(0); });
+  process.on('SIGINT', () => { flushPersist(); process.exit(0); });
+}
+/* ── BM25 Parameters ── */
+const BM25_K1 = 1.2;  // Term frequency saturation
+const BM25_B = 0.75;  // Document length normalization
+/**
+ * Compute BM25 score for a single term in a single document.
+ *
+ * @param tf          - raw term frequency (occurrences of term in doc)
+ * @param df          - document frequency (number of docs containing term)
+ * @param docLength   - length of this document (chars)
+ * @param avgDocLength - average document length across corpus (chars)
+ * @param totalDocs   - total number of documents in corpus
+ */
+export function bm25Score(
+  tf: number,
+  df: number,
+  docLength: number,
+  avgDocLength: number,
+  totalDocs: number,
+): number {
+  if (tf === 0 || totalDocs === 0 || avgDocLength === 0) return 0;
+  // IDF: log((N - df + 0.5) / (df + 0.5) + 1) — the +1 prevents negative IDF
+  // when df > N/2 (common terms)
+  const idf = Math.log((totalDocs - df + 0.5) / (df + 0.5) + 1);
+  // Normalized TF with saturation and length normalization
+  const tfNorm = (tf * (BM25_K1 + 1)) / (tf + BM25_K1 * (1 - BM25_B + BM25_B * docLength / avgDocLength));
+  return idf * tfNorm;
+}
+/**
+ * Split a query into individual search terms for multi-term BM25 scoring.
+ * Each term is scored independently, then scores are summed per document.
+ */
+function splitQueryTerms(query: string): string[] {
+  const lower = query.toLowerCase().trim();
+  // Split on whitespace, filter empty
+  const terms = lower.split(/\s+/).filter(t => t.length > 0);
+  // Deduplicate
+  return [...new Set(terms)];
+}
 /**
  * Core literal search — used by MCP tools via REST API.
  *
- * This is a **case-insensitive literal string match** with occurrence-density scoring.
- * It supports scope, file_type, and modified_after filters that MCP tools expose.
+ * Scoring: **BM25** (Best Matching 25) — the standard information retrieval
+ * ranking function. For multi-term queries, each term is scored independently
+ * and scores are summed. This means:
+ * - Rare terms (low document frequency) contribute more to the score
+ * - Term frequency has diminishing returns (saturation rate controlled by k1)
+ * - Shorter documents score higher when term frequency is equal
  *
- * Performance: uses an in-memory inverted index to narrow the candidate file set
- * before doing full-text scanning. The index is built lazily on the first query
- * and invalidated on any write operation.
+ * Candidate narrowing: uses an in-memory inverted index with UNION semantics
+ * for multi-term queries (a document matching ANY term is a candidate).
  *
  * NOTE: The App also has a separate Fuse.js fuzzy search in `lib/fs.ts` for the
  * browser `⌘K` search overlay. The two coexist intentionally:
- * - Core search (here): exact literal match, supports filters, used by MCP/API
+ * - Core search (here): exact literal match + BM25 ranking, used by MCP/API
  * - App search (lib/fs.ts): Fuse.js fuzzy match with CJK support, used by frontend
  */
 export function searchFiles(mindRoot: string, query: string, opts: SearchOptions = {}): SearchResult[] {
@@ -37,11 +136,21 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
   // Ensure search index is built for this mindRoot
   if (!searchIndex.isBuiltFor(mindRoot)) {
-    searchIndex.rebuild(mindRoot);
+    // Try loading from disk first (fast path — avoids full rebuild)
+    const loaded = searchIndex.load(getMindosDir(), mindRoot);
+    if (!loaded) {
+      searchIndex.rebuild(mindRoot);
+      // Persist for next cold start (best-effort: synchronous write, errors ignored)
+      try { searchIndex.persist(getMindosDir()); } catch { /* non-critical */ }
+    }
   }
-  // Use index to get candidate files (or null if index unavailable → full scan)
-  const candidates = searchIndex.getCandidates(query);
+  const totalDocs = searchIndex.getFileCount();
+  const avgDocLength = searchIndex.getAvgDocLength();
+  const queryTerms = splitQueryTerms(query);
+  // Use UNION index to get candidate files (any file matching any term)
+  const candidates = searchIndex.getCandidatesUnion(query);
   const candidateSet = candidates ? new Set(candidates) : null;
   let allFiles = collectAllFiles(mindRoot);
@@ -72,10 +181,16 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
   const results: SearchResult[] = [];
   const lowerQuery = query.toLowerCase();
-  const escapedQuery = lowerQuery.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+  // ── Pre-scan: compute document frequency for each query term ──
+  // We count how many candidate files contain each term via literal match.
+  // This is more accurate than using the inverted index token df, because
+  // the index tokenizes via Intl.Segmenter (CJK word boundaries) which may
+  // split query terms differently than our literal substring match.
+  const termDf = new Map<string, number>();
+  const fileContents = new Map<string, string>();
   for (const filePath of allFiles) {
-    // Check mtime filter before reading content
     if (mtimeThreshold > 0) {
       try {
         const abs = path.join(mindRoot, filePath);
@@ -87,34 +202,70 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
     let content: string;
     try { content = readFile(mindRoot, filePath); } catch { continue; }
+    const lower = content.toLowerCase();
+    fileContents.set(filePath, content);
+    for (const term of queryTerms) {
+      if (lower.includes(term)) {
+        termDf.set(term, (termDf.get(term) ?? 0) + 1);
+      }
+    }
+  }
+  // ── Score each document with BM25 ──
+  for (const [filePath, content] of fileContents) {
     const lowerContent = content.toLowerCase();
-    const index = lowerContent.indexOf(lowerQuery);
-    if (index === -1) continue;
-    // Try to find natural boundaries (paragraphs) around the match
-    let snippetStart = content.lastIndexOf('\n\n', index);
-    if (snippetStart === -1) snippetStart = Math.max(0, index - 200);
-    else snippetStart += 2; // skip the newlines
+    // Check if document matches any term (full-text verification after index narrowing)
+    let matchedAnyTerm = false;
+    let firstMatchIndex = -1;
-    let snippetEnd = content.indexOf('\n\n', index);
-    if (snippetEnd === -1) snippetEnd = Math.min(content.length, index + query.length + 200);
+    // Compute BM25 score: sum of per-term scores
+    let totalScore = 0;
+    let totalOccurrences = 0;
+    const docLength = content.length;
-    // Prevent massive blocks (cap at ~400 chars total)
-    if (index - snippetStart > 200) snippetStart = index - 200;
-    if (snippetEnd - index > 200) snippetEnd = index + query.length + 200;
+    for (const term of queryTerms) {
+      const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+      const matches = lowerContent.match(new RegExp(escapedTerm, 'g'));
+      const tf = matches ? matches.length : 0;
+      if (tf === 0) continue;
+      matchedAnyTerm = true;
+      totalOccurrences += tf;
+      if (firstMatchIndex === -1) {
+        firstMatchIndex = lowerContent.indexOf(term);
+      }
+      // Get document frequency for this term (computed in pre-scan)
+      const df = termDf.get(term) ?? 0;
+      totalScore += bm25Score(tf, df, docLength, avgDocLength, totalDocs);
+    }
+    if (!matchedAnyTerm) continue;
+    // Build snippet around the first match
+    const index = firstMatchIndex >= 0 ? firstMatchIndex : lowerContent.indexOf(lowerQuery);
+    const snippetAnchor = index >= 0 ? index : 0;
+    let snippetStart = content.lastIndexOf('\n\n', snippetAnchor);
+    if (snippetStart === -1) snippetStart = Math.max(0, snippetAnchor - 200);
+    else snippetStart += 2;
+    let snippetEnd = content.indexOf('\n\n', snippetAnchor);
+    if (snippetEnd === -1) snippetEnd = Math.min(content.length, snippetAnchor + query.length + 200);
+    if (snippetAnchor - snippetStart > 200) snippetStart = snippetAnchor - 200;
+    if (snippetEnd - snippetAnchor > 200) snippetEnd = snippetAnchor + query.length + 200;
     let snippet = content.slice(snippetStart, snippetEnd).trim();
-    // Collapse internal whitespace for cleaner search result presentation, but preserve some structure
     snippet = snippet.replace(/\n{3,}/g, '\n\n');
     if (snippetStart > 0) snippet = '...' + snippet;
     if (snippetEnd < content.length) snippet += '...';
-    const occurrences = (lowerContent.match(new RegExp(escapedQuery, 'g')) ?? []).length;
-    const score = occurrences / content.length;
-    results.push({ path: filePath, snippet, score, occurrences });
+    results.push({ path: filePath, snippet, score: totalScore, occurrences: totalOccurrences });
   }
   results.sort((a, b) => b.score - a.score);