@geminilight/mindos 0.6.31 → 0.6.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
1
3
  import { collectAllFiles } from './tree';
2
4
  import { readFile } from './fs-ops';
3
5
 
@@ -6,11 +8,18 @@ const MAX_CONTENT_LENGTH = 50_000;
6
8
  // CJK Unicode ranges: Chinese, Japanese Hiragana/Katakana, Korean
7
9
  const CJK_REGEX = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/;
8
10
 
11
+ // Intl.Segmenter for proper CJK word segmentation (available in Node 16+)
12
+ const zhSegmenter = typeof Intl !== 'undefined' && Intl.Segmenter
13
+ ? new Intl.Segmenter('zh', { granularity: 'word' })
14
+ : null;
15
+
9
16
  /**
10
- * Tokenize text for indexing: split on word boundaries + CJK bigrams.
17
+ * Tokenize text for indexing: split on word boundaries + CJK word segmentation.
11
18
  *
12
19
  * Latin/ASCII: split on non-alphanumeric characters, lowercased.
13
- * CJK: generate character-level bigrams (overlapping pairs).
20
+ * CJK: uses Intl.Segmenter for proper word boundaries (e.g. "知识管理"
21
+ * → ["知识", "管理"] instead of bigrams ["知识", "识管", "管理"]).
22
+ * Falls back to bigrams if Intl.Segmenter is unavailable.
14
23
  * Mixed text: both strategies applied, tokens merged.
15
24
  */
16
25
  function tokenize(text: string): Set<string> {
@@ -27,27 +36,42 @@ function tokenize(text: string): Set<string> {
27
36
  }
28
37
  }
29
38
 
30
- // CJK bigrams + single chars (unigrams carry meaning in CJK scripts)
39
+ // CJK word segmentation
31
40
  if (CJK_REGEX.test(lower)) {
32
- const cjkChars: string[] = [];
33
- for (const ch of lower) {
34
- if (CJK_REGEX.test(ch)) {
35
- cjkChars.push(ch);
36
- } else {
37
- // Emit bigrams for accumulated CJK run
38
- if (cjkChars.length > 0) {
39
- emitCjkTokens(cjkChars, tokens);
40
- cjkChars.length = 0;
41
+ if (zhSegmenter) {
42
+ // Intl.Segmenter: proper word boundaries
43
+ for (const { segment, isWordLike } of zhSegmenter.segment(lower)) {
44
+ if (!isWordLike) continue;
45
+ const word = segment.trim();
46
+ if (!word) continue;
47
+ tokens.add(word);
48
+ // Also add individual CJK characters as unigrams (for single-char queries)
49
+ for (const ch of word) {
50
+ if (CJK_REGEX.test(ch)) tokens.add(ch);
51
+ }
52
+ }
53
+ } else {
54
+ // Fallback: bigrams + unigrams
55
+ const cjkChars: string[] = [];
56
+ for (const ch of lower) {
57
+ if (CJK_REGEX.test(ch)) {
58
+ cjkChars.push(ch);
59
+ } else {
60
+ if (cjkChars.length > 0) {
61
+ emitCjkBigrams(cjkChars, tokens);
62
+ cjkChars.length = 0;
63
+ }
41
64
  }
42
65
  }
66
+ if (cjkChars.length > 0) emitCjkBigrams(cjkChars, tokens);
43
67
  }
44
- if (cjkChars.length > 0) emitCjkTokens(cjkChars, tokens);
45
68
  }
46
69
 
47
70
  return tokens;
48
71
  }
49
72
 
50
- function emitCjkTokens(chars: string[], tokens: Set<string>): void {
73
+ /** Fallback CJK tokenizer: bigrams + unigrams (when Intl.Segmenter unavailable) */
74
+ function emitCjkBigrams(chars: string[], tokens: Set<string>): void {
51
75
  for (let i = 0; i < chars.length; i++) {
52
76
  tokens.add(chars[i]); // unigram
53
77
  if (i + 1 < chars.length) {
@@ -73,10 +97,19 @@ export class SearchIndex {
73
97
  private builtForRoot: string | null = null;
74
98
  private fileCount = 0;
75
99
 
100
+ /** BM25 statistics — populated by rebuild() and load(), kept in sync by addFile()/removeFile() */
101
+ private docLengths = new Map<string, number>(); // filePath → content.length (measured before MAX_CONTENT_LENGTH truncation)
102
+ private totalChars = 0;
103
+ /** Reverse mapping: filePath → Set<token> for efficient removeFile. */
104
+ private fileTokens = new Map<string, Set<string>>();
105
+
76
106
  /** Full rebuild: read all files and build inverted index. */
77
107
  rebuild(mindRoot: string): void {
78
108
  const allFiles = collectAllFiles(mindRoot);
79
109
  const inverted = new Map<string, Set<string>>();
110
+ const docLengths = new Map<string, number>();
111
+ const fileTokensMap = new Map<string, Set<string>>();
112
+ let totalChars = 0;
80
113
 
81
114
  for (const filePath of allFiles) {
82
115
  let content: string;
@@ -86,6 +119,10 @@ export class SearchIndex {
86
119
  continue;
87
120
  }
88
121
 
122
+ // Store original length for BM25 before truncation
123
+ docLengths.set(filePath, content.length);
124
+ totalChars += content.length;
125
+
89
126
  if (content.length > MAX_CONTENT_LENGTH) {
90
127
  content = content.slice(0, MAX_CONTENT_LENGTH);
91
128
  }
@@ -93,6 +130,7 @@ export class SearchIndex {
93
130
  // Also index the file path itself
94
131
  const allText = filePath + '\n' + content;
95
132
  const tokens = tokenize(allText);
133
+ fileTokensMap.set(filePath, tokens);
96
134
 
97
135
  for (const token of tokens) {
98
136
  let set = inverted.get(token);
@@ -107,6 +145,9 @@ export class SearchIndex {
107
145
  this.invertedIndex = inverted;
108
146
  this.builtForRoot = mindRoot;
109
147
  this.fileCount = allFiles.length;
148
+ this.docLengths = docLengths;
149
+ this.totalChars = totalChars;
150
+ this.fileTokens = fileTokensMap;
110
151
  }
111
152
 
112
153
  /** Clear the index. Next search will trigger a lazy rebuild. */
@@ -114,6 +155,77 @@ export class SearchIndex {
114
155
  this.invertedIndex = null;
115
156
  this.builtForRoot = null;
116
157
  this.fileCount = 0;
158
+ this.docLengths.clear();
159
+ this.totalChars = 0;
160
+ this.fileTokens.clear();
161
+ }
162
+
163
+ // ── Incremental updates ──────────────────────────────────────────────
164
+
165
+ /**
166
+ * Remove a single file from the index (e.g. after deletion). No-op if the index is not built.
167
+ * O(tokens-in-file) — much faster than full rebuild.
168
+ */
169
+ removeFile(filePath: string): void {
170
+ if (!this.invertedIndex) return;
171
+
172
+ // Use reverse mapping for O(tokens-in-file) instead of O(all-tokens); token sets that become empty are left in the map
173
+ const tokens = this.fileTokens.get(filePath);
174
+ if (tokens) {
175
+ for (const token of tokens) {
176
+ this.invertedIndex.get(token)?.delete(filePath);
177
+ }
178
+ this.fileTokens.delete(filePath);
179
+ }
180
+
181
+ // Update BM25 stats. NOTE(review): fileCount is decremented even when filePath was never indexed — confirm callers only pass indexed paths.
182
+ const oldLen = this.docLengths.get(filePath) ?? 0;
183
+ this.totalChars -= oldLen;
184
+ this.docLengths.delete(filePath);
185
+ this.fileCount = Math.max(0, this.fileCount - 1);
186
+ }
187
+
188
+ /**
189
+ * Add a new file to the index (e.g. after creation). Silently returns if the index is not built or the file cannot be read.
190
+ * O(tokens-in-file) — much faster than full rebuild.
191
+ */
192
+ addFile(mindRoot: string, filePath: string): void {
193
+ if (!this.invertedIndex) return;
194
+
195
+ let content: string;
196
+ try { content = readFile(mindRoot, filePath); } catch { return; }
197
+
198
+ // Update BM25 stats (full length, before truncation). NOTE(review): an already-indexed path is not detected, so re-adding it double-counts totalChars and fileCount.
199
+ this.docLengths.set(filePath, content.length);
200
+ this.totalChars += content.length;
201
+ this.fileCount++;
202
+
203
+ // Index tokens from the path plus at most MAX_CONTENT_LENGTH chars of content
204
+ if (content.length > MAX_CONTENT_LENGTH) {
205
+ content = content.slice(0, MAX_CONTENT_LENGTH);
206
+ }
207
+ const allText = filePath + '\n' + content;
208
+ const tokens = tokenize(allText);
209
+ this.fileTokens.set(filePath, tokens);
210
+
211
+ for (const token of tokens) {
212
+ let set = this.invertedIndex.get(token);
213
+ if (!set) {
214
+ set = new Set<string>();
215
+ this.invertedIndex.set(token, set);
216
+ }
217
+ set.add(filePath);
218
+ }
219
+ }
220
+
221
+ /**
222
+ * Re-index a single file after modification.
223
+ * Implemented as removeFile followed by addFile; a no-op when the index has not been built.
224
+ */
225
+ updateFile(mindRoot: string, filePath: string): void {
226
+ if (!this.invertedIndex) return;
227
+ this.removeFile(filePath);
228
+ this.addFile(mindRoot, filePath);
117
229
  }
118
230
 
119
231
  /** Whether the index has been built for the given mindRoot. */
@@ -131,6 +243,66 @@ export class SearchIndex {
131
243
  return this.fileCount;
132
244
  }
133
245
 
246
+ /** Average document length in chars. */
247
+ getAvgDocLength(): number {
248
+ return this.fileCount > 0 ? this.totalChars / this.fileCount : 0;
249
+ }
250
+
251
+ /** Character count of a specific document. Returns 0 if unknown. */
252
+ getDocLength(filePath: string): number {
253
+ return this.docLengths.get(filePath) ?? 0;
254
+ }
255
+
256
+ /** Number of documents containing a specific token (document frequency). */
257
+ getDocFrequency(token: string): number {
258
+ if (!this.invertedIndex) return 0;
259
+ return this.invertedIndex.get(token)?.size ?? 0;
260
+ }
261
+
262
+ /**
263
+ * Get candidates via UNION of token sets (for BM25 multi-term scoring).
264
+ * Unlike getCandidates (intersection), this returns any file matching any token.
265
+ *
266
+ * Optimization: when the query produces 3 or more tokens (common for CJK text),
267
+ * each file's number of distinct matching query tokens is counted. Files matching
268
+ * fewer than floor(tokenCount / 2) tokens are pruned — unless that would leave zero
269
+ * results, in which case all matching files are returned. This keeps token-heavy
270
+ * queries from creating massive candidate sets full of low-quality matches. Returns null for a blank query, a query that yields no tokens, or an unbuilt index.
271
+ */
272
+ getCandidatesUnion(query: string): string[] | null {
273
+ if (!query.trim()) return null;
274
+ if (!this.invertedIndex) return null;
275
+
276
+ const tokens = tokenize(query.toLowerCase().trim());
277
+ if (tokens.size === 0) return null;
278
+
279
+ // Count how many query tokens each file matches
280
+ const hitCount = new Map<string, number>();
281
+ for (const token of tokens) {
282
+ const set = this.invertedIndex.get(token);
283
+ if (set) {
284
+ for (const filePath of set) {
285
+ hitCount.set(filePath, (hitCount.get(filePath) ?? 0) + 1);
286
+ }
287
+ }
288
+ }
289
+
290
+ if (hitCount.size === 0) return [];
291
+
292
+ // When query has 3+ tokens (e.g. CJK text), prune files matching fewer than floor(tokenCount / 2) of them
293
+ const tokenCount = tokens.size;
294
+ if (tokenCount >= 3) {
295
+ const threshold = Math.max(1, Math.floor(tokenCount / 2));
296
+ const filtered = [...hitCount.entries()]
297
+ .filter(([, count]) => count >= threshold)
298
+ .map(([path]) => path);
299
+ // Only apply pruning if it doesn't eliminate everything
300
+ if (filtered.length > 0) return filtered;
301
+ }
302
+
303
+ return [...hitCount.keys()];
304
+ }
305
+
134
306
  /**
135
307
  * Get candidate file paths for a query (single or multi-word).
136
308
  *
@@ -171,4 +343,128 @@ export class SearchIndex {
171
343
 
172
344
  return result ? Array.from(result) : [];
173
345
  }
346
+
347
+ // ── Persistence ──────────────────────────────────────────────────────
348
+
349
+ /**
350
+ * Serialize the index to a JSON file for persistence across restarts.
351
+ * Stored at `<mindosDir>/search-index.json`.
352
+ */
353
+ persist(mindosDir: string): void {
354
+ if (!this.invertedIndex) return;
355
+
356
+ const data: PersistedIndex = {
357
+ version: 1,
358
+ builtForRoot: this.builtForRoot ?? '',
359
+ fileCount: this.fileCount,
360
+ totalChars: this.totalChars,
361
+ docLengths: Object.fromEntries(this.docLengths),
362
+ invertedIndex: {},
363
+ timestamp: Date.now(),
364
+ };
365
+
366
+ for (const [token, fileSet] of this.invertedIndex) {
367
+ data.invertedIndex[token] = [...fileSet];
368
+ }
369
+
370
+ const filePath = path.join(mindosDir, 'search-index.json');
371
+ try {
372
+ fs.mkdirSync(mindosDir, { recursive: true });
373
+ fs.writeFileSync(filePath, JSON.stringify(data), 'utf-8');
374
+ } catch {
375
+ // Non-critical — index will be rebuilt on next search
376
+ }
377
+ }
378
+
379
+ /**
380
+ * Load a previously persisted index from disk.
381
+ * Returns true if loaded successfully, false if stale, missing, or unparseable. NOTE(review): field shapes are not validated after JSON.parse — confirm a malformed-but-valid JSON file cannot reach the Object.keys/Object.entries calls below.
382
+ *
383
+ * Staleness checks (all must pass):
384
+ * 1. Version and mindRoot match
385
+ * 2. Actual file count on disk matches indexed file count (detects a net change from adds/deletes)
386
+ * 3. Sampled files' mtimes are not newer than the persisted timestamp
387
+ */
388
+ load(mindosDir: string, mindRoot: string): boolean {
389
+ const filePath = path.join(mindosDir, 'search-index.json');
390
+
391
+ let raw: string;
392
+ try { raw = fs.readFileSync(filePath, 'utf-8'); } catch { return false; }
393
+
394
+ let data: PersistedIndex;
395
+ try { data = JSON.parse(raw); } catch { return false; }
396
+
397
+ if (data.version !== 1 || data.builtForRoot !== mindRoot) return false;
398
+
399
+ // Check 2: file count on disk must match indexed count
400
+ // This catches files created or deleted while the process was down, as long as the net count changed
401
+ const currentFiles = collectAllFiles(mindRoot);
402
+ if (currentFiles.length !== data.fileCount) return false;
403
+
404
+ // Check 3: mtime sampling — check every file if ≤50, otherwise an evenly spaced sample plus the last 10 entries
405
+ const docPaths = Object.keys(data.docLengths);
406
+ const sampleSize = Math.min(50, docPaths.length);
407
+ if (sampleSize === docPaths.length) {
408
+ // Small index: check all files
409
+ for (const dp of docPaths) {
410
+ try {
411
+ const stat = fs.statSync(path.join(mindRoot, dp));
412
+ if (stat.mtimeMs > data.timestamp) return false;
413
+ } catch {
414
+ return false; // file deleted
415
+ }
416
+ }
417
+ } else {
418
+ // Large index: sample evenly + always check the last few (most likely to be recent)
419
+ const step = Math.max(1, Math.floor(docPaths.length / 40));
420
+ const sampled = new Set<number>();
421
+ // Evenly spaced samples (every `step`-th entry, so the sample can exceed 40 when the length is not a multiple of 40)
422
+ for (let i = 0; i < docPaths.length; i += step) sampled.add(i);
423
+ // Always check the last 10 files (most recently added to the index)
424
+ for (let i = Math.max(0, docPaths.length - 10); i < docPaths.length; i++) sampled.add(i);
425
+
426
+ for (const idx of sampled) {
427
+ try {
428
+ const stat = fs.statSync(path.join(mindRoot, docPaths[idx]));
429
+ if (stat.mtimeMs > data.timestamp) return false;
430
+ } catch {
431
+ return false;
432
+ }
433
+ }
434
+ }
435
+
436
+ // Restore state (only reached once every staleness check has passed)
437
+ this.builtForRoot = data.builtForRoot;
438
+ this.fileCount = data.fileCount;
439
+ this.totalChars = data.totalChars;
440
+ this.docLengths = new Map(Object.entries(data.docLengths).map(([k, v]) => [k, v as number]));
441
+
442
+ const inverted = new Map<string, Set<string>>();
443
+ const fileTokensMap = new Map<string, Set<string>>();
444
+ for (const [token, files] of Object.entries(data.invertedIndex)) {
445
+ const fileSet = new Set(files as string[]);
446
+ inverted.set(token, fileSet);
447
+ // Rebuild reverse mapping (fileTokens is not part of the persisted JSON)
448
+ for (const f of fileSet) {
449
+ let tokens = fileTokensMap.get(f);
450
+ if (!tokens) { tokens = new Set(); fileTokensMap.set(f, tokens); }
451
+ tokens.add(token);
452
+ }
453
+ }
454
+ this.invertedIndex = inverted;
455
+ this.fileTokens = fileTokensMap;
456
+
457
+ return true;
458
+ }
459
+ }
460
+
461
+ /** Shape of the persisted index JSON. */
462
+ interface PersistedIndex {
463
+ version: number;
464
+ builtForRoot: string;
465
+ fileCount: number;
466
+ totalChars: number;
467
+ docLengths: Record<string, number>;
468
+ invertedIndex: Record<string, string[]>;
469
+ timestamp: number;
174
470
  }
@@ -1,5 +1,6 @@
1
1
  import fs from 'fs';
2
2
  import path from 'path';
3
+ import os from 'os';
3
4
  import { collectAllFiles } from './tree';
4
5
  import { readFile } from './fs-ops';
5
6
  import { SearchIndex } from './search-index';
@@ -11,24 +12,122 @@ import type { SearchResult, SearchOptions } from './types';
11
12
  */
12
13
  const searchIndex = new SearchIndex();
13
14
 
15
+ /** Path to ~/.mindos/ for index persistence. */
16
+ function getMindosDir(): string {
17
+ return path.join(os.homedir(), '.mindos');
18
+ }
19
+
14
20
  /** Invalidate the core search index. Called from `lib/fs.ts` on write operations. */
15
21
  export function invalidateSearchIndex(): void {
16
22
  searchIndex.invalidate();
17
23
  }
18
24
 
25
+ /** Incrementally update a single file in the search index (after write/edit). */
26
+ export function updateSearchIndexFile(mindRoot: string, filePath: string): void {
27
+ if (!searchIndex.isBuilt()) return;
28
+ searchIndex.updateFile(mindRoot, filePath);
29
+ schedulePersist();
30
+ }
31
+
32
+ /** Incrementally add a new file to the search index (after create). */
33
+ export function addSearchIndexFile(mindRoot: string, filePath: string): void {
34
+ if (!searchIndex.isBuilt()) return;
35
+ searchIndex.addFile(mindRoot, filePath);
36
+ schedulePersist();
37
+ }
38
+
39
+ /** Incrementally remove a file from the search index (after delete). */
40
+ export function removeSearchIndexFile(filePath: string): void {
41
+ if (!searchIndex.isBuilt()) return;
42
+ searchIndex.removeFile(filePath);
43
+ schedulePersist();
44
+ }
45
+
46
+ /** Debounced persist — writes index to disk 5s after last write operation. */
47
+ let _persistTimer: ReturnType<typeof setTimeout> | null = null;
48
+ let _persistDirty = false;
49
+
50
+ function schedulePersist(): void {
51
+ _persistDirty = true;
52
+ if (_persistTimer) clearTimeout(_persistTimer);
53
+ _persistTimer = setTimeout(flushPersist, 5000);
54
+ }
55
+
56
+ /** Immediately flush pending index to disk; does nothing unless schedulePersist() marked it dirty. Used as the debounce timer callback and by exit hooks. */
57
+ function flushPersist(): void {
58
+ if (_persistTimer) { clearTimeout(_persistTimer); _persistTimer = null; }
59
+ if (!_persistDirty) return;
60
+ _persistDirty = false;
61
+ try { searchIndex.persist(getMindosDir()); } catch { /* non-critical */ }
62
+ }
63
+
64
+ // Ensure index is persisted before process exits. NOTE(review): the SIGTERM/SIGINT handlers call process.exit(0), so a signal-terminated run reports success — confirm that is intended.
65
+ if (typeof process !== 'undefined') {
66
+ process.on('beforeExit', flushPersist);
67
+ process.on('SIGTERM', () => { flushPersist(); process.exit(0); });
68
+ process.on('SIGINT', () => { flushPersist(); process.exit(0); });
69
+ }
70
+
71
+ /* ── BM25 Parameters ── */
72
+ const BM25_K1 = 1.2; // Term frequency saturation
73
+ const BM25_B = 0.75; // Document length normalization
74
+
75
+ /**
76
+ * Compute BM25 score for a single term in a single document.
77
+ *
78
+ * @param tf - raw term frequency (occurrences of term in doc)
79
+ * @param df - document frequency (number of docs containing term)
80
+ * @param docLength - length of this document (chars)
81
+ * @param avgDocLength - average document length across corpus (chars)
82
+ * @param totalDocs - total number of documents in corpus
83
+ */
84
+ export function bm25Score(
85
+ tf: number,
86
+ df: number,
87
+ docLength: number,
88
+ avgDocLength: number,
89
+ totalDocs: number,
90
+ ): number {
91
+ if (tf === 0 || totalDocs === 0 || avgDocLength === 0) return 0;
92
+
93
+ // IDF: log((N - df + 0.5) / (df + 0.5) + 1) — the +1 prevents negative IDF
94
+ // when df > N/2 (common terms)
95
+ const idf = Math.log((totalDocs - df + 0.5) / (df + 0.5) + 1);
96
+
97
+ // Normalized TF with saturation and length normalization
98
+ const tfNorm = (tf * (BM25_K1 + 1)) / (tf + BM25_K1 * (1 - BM25_B + BM25_B * docLength / avgDocLength));
99
+
100
+ return idf * tfNorm;
101
+ }
102
+
103
+ /**
104
+ * Split a query into individual search terms for multi-term BM25 scoring.
105
+ * Each term is scored independently, then scores are summed per document.
106
+ */
107
+ function splitQueryTerms(query: string): string[] {
108
+ const lower = query.toLowerCase().trim();
109
+ // Split on whitespace, filter empty
110
+ const terms = lower.split(/\s+/).filter(t => t.length > 0);
111
+ // Deduplicate
112
+ return [...new Set(terms)];
113
+ }
114
+
19
115
  /**
20
116
  * Core literal search — used by MCP tools via REST API.
21
117
  *
22
- * This is a **case-insensitive literal string match** with occurrence-density scoring.
23
- * It supports scope, file_type, and modified_after filters that MCP tools expose.
118
+ * Scoring: **BM25** (Best Matching 25) the standard information retrieval
119
+ * ranking function. For multi-term queries, each term is scored independently
120
+ * and scores are summed. This means:
121
+ * - Rare terms (low document frequency) contribute more to the score
122
+ * - Term frequency has diminishing returns (saturation at k1)
123
+ * - Shorter documents score higher when term frequency is equal
24
124
  *
25
- * Performance: uses an in-memory inverted index to narrow the candidate file set
26
- * before doing full-text scanning. The index is built lazily on the first query
27
- * and invalidated on any write operation.
125
+ * Candidate narrowing: uses an in-memory inverted index with UNION semantics
126
+ * for multi-term queries (a document matching ANY term is a candidate).
28
127
  *
29
128
  * NOTE: The App also has a separate Fuse.js fuzzy search in `lib/fs.ts` for the
30
129
  * browser `⌘K` search overlay. The two coexist intentionally:
31
- * - Core search (here): exact literal match, supports filters, used by MCP/API
130
+ * - Core search (here): exact literal match + BM25 ranking, used by MCP/API
32
131
  * - App search (lib/fs.ts): Fuse.js fuzzy match with CJK support, used by frontend
33
132
  */
34
133
  export function searchFiles(mindRoot: string, query: string, opts: SearchOptions = {}): SearchResult[] {
@@ -37,11 +136,21 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
37
136
 
38
137
  // Ensure search index is built for this mindRoot
39
138
  if (!searchIndex.isBuiltFor(mindRoot)) {
40
- searchIndex.rebuild(mindRoot);
139
+ // Try loading from disk first (fast path — avoids full rebuild)
140
+ const loaded = searchIndex.load(getMindosDir(), mindRoot);
141
+ if (!loaded) {
142
+ searchIndex.rebuild(mindRoot);
143
+ // Persist for next cold start (fire-and-forget)
144
+ try { searchIndex.persist(getMindosDir()); } catch { /* non-critical */ }
145
+ }
41
146
  }
42
147
 
43
- // Use index to get candidate files (or null if index unavailable → full scan)
44
- const candidates = searchIndex.getCandidates(query);
148
+ const totalDocs = searchIndex.getFileCount();
149
+ const avgDocLength = searchIndex.getAvgDocLength();
150
+ const queryTerms = splitQueryTerms(query);
151
+
152
+ // Use UNION index to get candidate files (any file matching any term)
153
+ const candidates = searchIndex.getCandidatesUnion(query);
45
154
  const candidateSet = candidates ? new Set(candidates) : null;
46
155
 
47
156
  let allFiles = collectAllFiles(mindRoot);
@@ -72,10 +181,16 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
72
181
 
73
182
  const results: SearchResult[] = [];
74
183
  const lowerQuery = query.toLowerCase();
75
- const escapedQuery = lowerQuery.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
184
+
185
+ // ── Pre-scan: compute document frequency for each query term ──
186
+ // We count how many candidate files contain each term via literal match.
187
+ // This is more accurate than using the inverted index token df, because
188
+ // the index tokenizes via Intl.Segmenter (CJK word boundaries) which may
189
+ // split query terms differently than our literal substring match.
190
+ const termDf = new Map<string, number>();
191
+ const fileContents = new Map<string, string>();
76
192
 
77
193
  for (const filePath of allFiles) {
78
- // Check mtime filter before reading content
79
194
  if (mtimeThreshold > 0) {
80
195
  try {
81
196
  const abs = path.join(mindRoot, filePath);
@@ -87,34 +202,70 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
87
202
  let content: string;
88
203
  try { content = readFile(mindRoot, filePath); } catch { continue; }
89
204
 
205
+ const lower = content.toLowerCase();
206
+ fileContents.set(filePath, content);
207
+
208
+ for (const term of queryTerms) {
209
+ if (lower.includes(term)) {
210
+ termDf.set(term, (termDf.get(term) ?? 0) + 1);
211
+ }
212
+ }
213
+ }
214
+
215
+ // ── Score each document with BM25 ──
216
+ for (const [filePath, content] of fileContents) {
90
217
  const lowerContent = content.toLowerCase();
91
- const index = lowerContent.indexOf(lowerQuery);
92
- if (index === -1) continue;
93
218
 
94
- // Try to find natural boundaries (paragraphs) around the match
95
- let snippetStart = content.lastIndexOf('\n\n', index);
96
- if (snippetStart === -1) snippetStart = Math.max(0, index - 200);
97
- else snippetStart += 2; // skip the newlines
219
+ // Check if document matches any term (full-text verification after index narrowing)
220
+ let matchedAnyTerm = false;
221
+ let firstMatchIndex = -1;
98
222
 
99
- let snippetEnd = content.indexOf('\n\n', index);
100
- if (snippetEnd === -1) snippetEnd = Math.min(content.length, index + query.length + 200);
223
+ // Compute BM25 score: sum of per-term scores
224
+ let totalScore = 0;
225
+ let totalOccurrences = 0;
226
+ const docLength = content.length;
101
227
 
102
- // Prevent massive blocks (cap at ~400 chars total)
103
- if (index - snippetStart > 200) snippetStart = index - 200;
104
- if (snippetEnd - index > 200) snippetEnd = index + query.length + 200;
228
+ for (const term of queryTerms) {
229
+ const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
230
+ const matches = lowerContent.match(new RegExp(escapedTerm, 'g'));
231
+ const tf = matches ? matches.length : 0;
232
+ if (tf === 0) continue;
233
+
234
+ matchedAnyTerm = true;
235
+ totalOccurrences += tf;
236
+
237
+ if (firstMatchIndex === -1) {
238
+ firstMatchIndex = lowerContent.indexOf(term);
239
+ }
240
+
241
+ // Get document frequency for this term (computed in pre-scan)
242
+ const df = termDf.get(term) ?? 0;
243
+
244
+ totalScore += bm25Score(tf, df, docLength, avgDocLength, totalDocs);
245
+ }
246
+
247
+ if (!matchedAnyTerm) continue;
248
+
249
+ // Build snippet around the first match
250
+ const index = firstMatchIndex >= 0 ? firstMatchIndex : lowerContent.indexOf(lowerQuery);
251
+ const snippetAnchor = index >= 0 ? index : 0;
252
+
253
+ let snippetStart = content.lastIndexOf('\n\n', snippetAnchor);
254
+ if (snippetStart === -1) snippetStart = Math.max(0, snippetAnchor - 200);
255
+ else snippetStart += 2;
256
+
257
+ let snippetEnd = content.indexOf('\n\n', snippetAnchor);
258
+ if (snippetEnd === -1) snippetEnd = Math.min(content.length, snippetAnchor + query.length + 200);
259
+
260
+ if (snippetAnchor - snippetStart > 200) snippetStart = snippetAnchor - 200;
261
+ if (snippetEnd - snippetAnchor > 200) snippetEnd = snippetAnchor + query.length + 200;
105
262
 
106
263
  let snippet = content.slice(snippetStart, snippetEnd).trim();
107
-
108
- // Collapse internal whitespace for cleaner search result presentation, but preserve some structure
109
264
  snippet = snippet.replace(/\n{3,}/g, '\n\n');
110
-
111
265
  if (snippetStart > 0) snippet = '...' + snippet;
112
266
  if (snippetEnd < content.length) snippet += '...';
113
267
 
114
- const occurrences = (lowerContent.match(new RegExp(escapedQuery, 'g')) ?? []).length;
115
- const score = occurrences / content.length;
116
-
117
- results.push({ path: filePath, snippet, score, occurrences });
268
+ results.push({ path: filePath, snippet, score: totalScore, occurrences: totalOccurrences });
118
269
  }
119
270
 
120
271
  results.sort((a, b) => b.score - a.score);