@geminilight/mindos 0.6.32 → 0.6.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app/app/api/ask/route.ts +69 -29
- package/app/app/api/graph/route.ts +5 -76
- package/app/app/trash/page.tsx +1 -0
- package/app/app/view/[...path]/ViewPageClient.tsx +22 -8
- package/app/components/ExportModal.tsx +2 -2
- package/app/components/FileTree.tsx +26 -5
- package/app/components/HomeContent.tsx +4 -0
- package/app/components/SystemPulse.tsx +318 -0
- package/app/components/TrashPageClient.tsx +9 -9
- package/app/components/agents/AgentsSkillsSection.tsx +173 -102
- package/app/components/ui/Toaster.tsx +11 -2
- package/app/lib/actions.ts +20 -9
- package/app/lib/agent/context.ts +22 -11
- package/app/lib/agent/loop-detection.ts +52 -0
- package/app/lib/agent/retry.ts +19 -0
- package/app/lib/core/backlinks.ts +33 -9
- package/app/lib/core/index.ts +4 -1
- package/app/lib/core/link-index.ts +224 -0
- package/app/lib/core/search-index.ts +310 -14
- package/app/lib/core/search.ts +180 -29
- package/app/lib/fs.ts +67 -10
- package/app/lib/hooks/usePinnedFiles.ts +7 -2
- package/app/lib/i18n/modules/knowledge.ts +62 -0
- package/app/lib/toast.ts +7 -1
- package/app/next-env.d.ts +1 -1
- package/app/package.json +2 -0
- package/package.json +1 -1
- package/scripts/parse-syncinclude.sh +92 -0
- package/scripts/write-build-stamp.js +40 -0
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
1
3
|
import { collectAllFiles } from './tree';
|
|
2
4
|
import { readFile } from './fs-ops';
|
|
3
5
|
|
|
@@ -6,11 +8,18 @@ const MAX_CONTENT_LENGTH = 50_000;
|
|
|
6
8
|
// CJK Unicode ranges: Chinese, Japanese Hiragana/Katakana, Korean
|
|
7
9
|
const CJK_REGEX = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/;
|
|
8
10
|
|
|
11
|
+
// Word-granularity segmenter used for CJK text. Resolves to null on runtimes
// that do not expose Intl.Segmenter, in which case callers fall back to bigrams.
const zhSegmenter =
  typeof Intl === 'undefined' || !Intl.Segmenter
    ? null
    : new Intl.Segmenter('zh', { granularity: 'word' });
|
|
15
|
+
|
|
9
16
|
/**
|
|
10
|
-
* Tokenize text for indexing: split on word boundaries + CJK
|
|
17
|
+
* Tokenize text for indexing: split on word boundaries + CJK word segmentation.
|
|
11
18
|
*
|
|
12
19
|
* Latin/ASCII: split on non-alphanumeric characters, lowercased.
|
|
13
|
-
* CJK:
|
|
20
|
+
* CJK: uses Intl.Segmenter for proper word boundaries (e.g. "知识管理"
|
|
21
|
+
* → ["知识", "管理"] instead of bigrams ["知识", "识管", "管理"]).
|
|
22
|
+
* Falls back to bigrams if Intl.Segmenter is unavailable.
|
|
14
23
|
* Mixed text: both strategies applied, tokens merged.
|
|
15
24
|
*/
|
|
16
25
|
function tokenize(text: string): Set<string> {
|
|
@@ -27,27 +36,42 @@ function tokenize(text: string): Set<string> {
|
|
|
27
36
|
}
|
|
28
37
|
}
|
|
29
38
|
|
|
30
|
-
// CJK
|
|
39
|
+
// CJK word segmentation
|
|
31
40
|
if (CJK_REGEX.test(lower)) {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
+
if (zhSegmenter) {
|
|
42
|
+
// Intl.Segmenter: proper word boundaries
|
|
43
|
+
for (const { segment, isWordLike } of zhSegmenter.segment(lower)) {
|
|
44
|
+
if (!isWordLike) continue;
|
|
45
|
+
const word = segment.trim();
|
|
46
|
+
if (!word) continue;
|
|
47
|
+
tokens.add(word);
|
|
48
|
+
// Also add individual CJK characters as unigrams (for single-char queries)
|
|
49
|
+
for (const ch of word) {
|
|
50
|
+
if (CJK_REGEX.test(ch)) tokens.add(ch);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
} else {
|
|
54
|
+
// Fallback: bigrams + unigrams
|
|
55
|
+
const cjkChars: string[] = [];
|
|
56
|
+
for (const ch of lower) {
|
|
57
|
+
if (CJK_REGEX.test(ch)) {
|
|
58
|
+
cjkChars.push(ch);
|
|
59
|
+
} else {
|
|
60
|
+
if (cjkChars.length > 0) {
|
|
61
|
+
emitCjkBigrams(cjkChars, tokens);
|
|
62
|
+
cjkChars.length = 0;
|
|
63
|
+
}
|
|
41
64
|
}
|
|
42
65
|
}
|
|
66
|
+
if (cjkChars.length > 0) emitCjkBigrams(cjkChars, tokens);
|
|
43
67
|
}
|
|
44
|
-
if (cjkChars.length > 0) emitCjkTokens(cjkChars, tokens);
|
|
45
68
|
}
|
|
46
69
|
|
|
47
70
|
return tokens;
|
|
48
71
|
}
|
|
49
72
|
|
|
50
|
-
|
|
73
|
+
/** Fallback CJK tokenizer: bigrams + unigrams (when Intl.Segmenter unavailable) */
|
|
74
|
+
function emitCjkBigrams(chars: string[], tokens: Set<string>): void {
|
|
51
75
|
for (let i = 0; i < chars.length; i++) {
|
|
52
76
|
tokens.add(chars[i]); // unigram
|
|
53
77
|
if (i + 1 < chars.length) {
|
|
@@ -73,10 +97,19 @@ export class SearchIndex {
|
|
|
73
97
|
private builtForRoot: string | null = null;
|
|
74
98
|
private fileCount = 0;
|
|
75
99
|
|
|
100
|
+
/** BM25 statistics — populated during rebuild() */
|
|
101
|
+
private docLengths = new Map<string, number>(); // filePath → char count
|
|
102
|
+
private totalChars = 0;
|
|
103
|
+
/** Reverse mapping: filePath → Set<token> for efficient removeFile. */
|
|
104
|
+
private fileTokens = new Map<string, Set<string>>();
|
|
105
|
+
|
|
76
106
|
/** Full rebuild: read all files and build inverted index. */
|
|
77
107
|
rebuild(mindRoot: string): void {
|
|
78
108
|
const allFiles = collectAllFiles(mindRoot);
|
|
79
109
|
const inverted = new Map<string, Set<string>>();
|
|
110
|
+
const docLengths = new Map<string, number>();
|
|
111
|
+
const fileTokensMap = new Map<string, Set<string>>();
|
|
112
|
+
let totalChars = 0;
|
|
80
113
|
|
|
81
114
|
for (const filePath of allFiles) {
|
|
82
115
|
let content: string;
|
|
@@ -86,6 +119,10 @@ export class SearchIndex {
|
|
|
86
119
|
continue;
|
|
87
120
|
}
|
|
88
121
|
|
|
122
|
+
// Store original length for BM25 before truncation
|
|
123
|
+
docLengths.set(filePath, content.length);
|
|
124
|
+
totalChars += content.length;
|
|
125
|
+
|
|
89
126
|
if (content.length > MAX_CONTENT_LENGTH) {
|
|
90
127
|
content = content.slice(0, MAX_CONTENT_LENGTH);
|
|
91
128
|
}
|
|
@@ -93,6 +130,7 @@ export class SearchIndex {
|
|
|
93
130
|
// Also index the file path itself
|
|
94
131
|
const allText = filePath + '\n' + content;
|
|
95
132
|
const tokens = tokenize(allText);
|
|
133
|
+
fileTokensMap.set(filePath, tokens);
|
|
96
134
|
|
|
97
135
|
for (const token of tokens) {
|
|
98
136
|
let set = inverted.get(token);
|
|
@@ -107,6 +145,9 @@ export class SearchIndex {
|
|
|
107
145
|
this.invertedIndex = inverted;
|
|
108
146
|
this.builtForRoot = mindRoot;
|
|
109
147
|
this.fileCount = allFiles.length;
|
|
148
|
+
this.docLengths = docLengths;
|
|
149
|
+
this.totalChars = totalChars;
|
|
150
|
+
this.fileTokens = fileTokensMap;
|
|
110
151
|
}
|
|
111
152
|
|
|
112
153
|
/** Clear the index. Next search will trigger a lazy rebuild. */
|
|
@@ -114,6 +155,77 @@ export class SearchIndex {
|
|
|
114
155
|
this.invertedIndex = null;
|
|
115
156
|
this.builtForRoot = null;
|
|
116
157
|
this.fileCount = 0;
|
|
158
|
+
this.docLengths.clear();
|
|
159
|
+
this.totalChars = 0;
|
|
160
|
+
this.fileTokens.clear();
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// ── Incremental updates ──────────────────────────────────────────────
|
|
164
|
+
|
|
165
|
+
/**
 * Remove a single file from the index (e.g. after deletion).
 *
 * Uses the reverse `fileTokens` mapping, so the cost is proportional to the
 * number of tokens in the removed file rather than to the whole vocabulary.
 *
 * No-op when the index has not been built, or when `filePath` is not known to
 * the index (no token set and no recorded length). The latter guard keeps
 * `fileCount` / `totalChars` from drifting when the method is called for a
 * path that was never indexed (e.g. a repeated delete).
 *
 * @param filePath - Index key of the file, as used when it was added.
 */
removeFile(filePath: string): void {
  const inverted = this.invertedIndex;
  if (!inverted) return;

  const tokens = this.fileTokens.get(filePath);
  const wasIndexed = tokens !== undefined || this.docLengths.has(filePath);
  if (!wasIndexed) return;

  if (tokens) {
    for (const token of tokens) {
      const postings = inverted.get(token);
      if (!postings) continue;
      postings.delete(filePath);
      // Drop posting lists that became empty so they do not accumulate in
      // memory or get written out by persist().
      if (postings.size === 0) inverted.delete(token);
    }
    this.fileTokens.delete(filePath);
  }

  // Update BM25 stats (clamped so bookkeeping can never go negative)
  const oldLen = this.docLengths.get(filePath) ?? 0;
  this.totalChars = Math.max(0, this.totalChars - oldLen);
  this.docLengths.delete(filePath);
  this.fileCount = Math.max(0, this.fileCount - 1);
}
|
|
187
|
+
|
|
188
|
+
/**
 * Add a file to the index (e.g. after creation).
 *
 * Work is proportional to the size of the one file being added; no other
 * file is re-read.
 *
 * No-op when the index has not been built or when the file cannot be read.
 * If `filePath` is already present in the index, the old entry is removed
 * first so that `totalChars`, `fileCount` and the posting lists are not
 * double-counted or left with stale tokens.
 *
 * @param mindRoot - Root directory passed through to `readFile`.
 * @param filePath - Path of the file; also used as the index key.
 */
addFile(mindRoot: string, filePath: string): void {
  const inverted = this.invertedIndex;
  if (!inverted) return;

  let content: string;
  try { content = readFile(mindRoot, filePath); } catch { return; }

  // Read succeeded: only now is it safe to discard any previous entry.
  if (this.docLengths.has(filePath) || this.fileTokens.has(filePath)) {
    this.removeFile(filePath);
  }

  // Update BM25 stats using the full (untruncated) length, as rebuild() does
  this.docLengths.set(filePath, content.length);
  this.totalChars += content.length;
  this.fileCount++;

  // Index tokens from the path plus at most MAX_CONTENT_LENGTH chars of content
  if (content.length > MAX_CONTENT_LENGTH) {
    content = content.slice(0, MAX_CONTENT_LENGTH);
  }
  const tokens = tokenize(filePath + '\n' + content);
  this.fileTokens.set(filePath, tokens);

  for (const token of tokens) {
    let postings = inverted.get(token);
    if (!postings) {
      postings = new Set<string>();
      inverted.set(token, postings);
    }
    postings.add(filePath);
  }
}
|
|
220
|
+
|
|
221
|
+
/**
 * Re-index a single file after modification.
 *
 * Implemented as `removeFile` followed by `addFile`. The removal is skipped
 * when the file is not currently in the index, so updating a file the index
 * has not seen yet behaves like a plain add and is counted exactly once.
 *
 * No-op when the index has not been built.
 *
 * @param mindRoot - Root directory passed through to `addFile`.
 * @param filePath - Path of the modified file (index key).
 */
updateFile(mindRoot: string, filePath: string): void {
  if (!this.invertedIndex) return;
  const known = this.docLengths.has(filePath) || this.fileTokens.has(filePath);
  if (known) this.removeFile(filePath);
  this.addFile(mindRoot, filePath);
}
|
|
118
230
|
|
|
119
231
|
/** Whether the index has been built for the given mindRoot. */
|
|
@@ -131,6 +243,66 @@ export class SearchIndex {
|
|
|
131
243
|
return this.fileCount;
|
|
132
244
|
}
|
|
133
245
|
|
|
246
|
+
/** Mean document length in characters (`totalChars / fileCount`); 0 when no files are counted. */
getAvgDocLength(): number {
  if (this.fileCount > 0) {
    return this.totalChars / this.fileCount;
  }
  return 0;
}
|
|
250
|
+
|
|
251
|
+
/** Recorded character count for `filePath`, or 0 when the index has no length for it. */
getDocLength(filePath: string): number {
  const recorded = this.docLengths.get(filePath);
  return recorded == null ? 0 : recorded;
}
|
|
255
|
+
|
|
256
|
+
/**
 * Document frequency of `token`: the size of its posting set.
 * Returns 0 when the index is not built or the token is absent.
 */
getDocFrequency(token: string): number {
  const postings = this.invertedIndex?.get(token);
  return postings ? postings.size : 0;
}
|
|
261
|
+
|
|
262
|
+
/**
 * Candidate files via UNION of posting sets (for BM25 multi-term scoring).
 * In contrast to an intersection, any file containing at least one query
 * token qualifies.
 *
 * Pruning: when the query tokenizes into 3 or more tokens, files matching
 * fewer than `floor(tokenCount / 2)` distinct tokens are dropped, unless that
 * would drop every file, in which case the unpruned union is returned. This
 * keeps token-heavy queries (e.g. CJK) from producing huge low-overlap
 * candidate sets.
 *
 * @returns `null` when the query is blank, tokenizes to nothing, or the index
 *          is not built; `[]` when no file matches any token; otherwise the
 *          matching paths in first-seen order.
 */
getCandidatesUnion(query: string): string[] | null {
  const index = this.invertedIndex;
  if (!query.trim() || !index) return null;

  const queryTokens = tokenize(query.toLowerCase().trim());
  if (queryTokens.size === 0) return null;

  // Tally, per file, how many distinct query tokens it contains
  const matchesPerFile = new Map<string, number>();
  for (const token of queryTokens) {
    const postings = index.get(token);
    if (!postings) continue;
    for (const file of postings) {
      matchesPerFile.set(file, (matchesPerFile.get(file) ?? 0) + 1);
    }
  }

  if (matchesPerFile.size === 0) return [];

  if (queryTokens.size >= 3) {
    const minHits = Math.max(1, Math.floor(queryTokens.size / 2));
    const survivors: string[] = [];
    for (const [file, hits] of matchesPerFile) {
      if (hits >= minHits) survivors.push(file);
    }
    // Keep the pruned list only if something survived
    if (survivors.length > 0) return survivors;
  }

  return Array.from(matchesPerFile.keys());
}
|
|
305
|
+
|
|
134
306
|
/**
|
|
135
307
|
* Get candidate file paths for a query (single or multi-word).
|
|
136
308
|
*
|
|
@@ -171,4 +343,128 @@ export class SearchIndex {
|
|
|
171
343
|
|
|
172
344
|
return result ? Array.from(result) : [];
|
|
173
345
|
}
|
|
346
|
+
|
|
347
|
+
// ── Persistence ──────────────────────────────────────────────────────
|
|
348
|
+
|
|
349
|
+
/**
 * Serialize the index to a JSON file for persistence across restarts.
 * Stored at `<mindosDir>/search-index.json`.
 *
 * The JSON is written to a sibling `.tmp` file and then renamed over the
 * target, so an interrupted write does not leave a truncated
 * `search-index.json` behind.
 *
 * Best-effort: does nothing when the index is not built, and swallows any
 * filesystem error (the index is simply rebuilt on a later cold start).
 *
 * @param mindosDir - Directory to write into; created if missing.
 */
persist(mindosDir: string): void {
  if (!this.invertedIndex) return;

  const data: PersistedIndex = {
    version: 1,
    builtForRoot: this.builtForRoot ?? '',
    fileCount: this.fileCount,
    totalChars: this.totalChars,
    docLengths: Object.fromEntries(this.docLengths),
    // Object.fromEntries defines own properties, so a token such as
    // "__proto__" (should the tokenizer ever emit one) is stored like any
    // other key instead of being swallowed by the prototype setter.
    invertedIndex: Object.fromEntries(
      Array.from(
        this.invertedIndex,
        ([token, fileSet]): [string, string[]] => [token, [...fileSet]],
      ),
    ),
    timestamp: Date.now(),
  };

  const filePath = path.join(mindosDir, 'search-index.json');
  const tmpPath = `${filePath}.tmp`;
  try {
    fs.mkdirSync(mindosDir, { recursive: true });
    fs.writeFileSync(tmpPath, JSON.stringify(data), 'utf-8');
    fs.renameSync(tmpPath, filePath);
  } catch {
    // Non-critical — index will be rebuilt on next search.
    // Try not to leave a partial temp file around.
    try { fs.unlinkSync(tmpPath); } catch { /* nothing to clean up */ }
  }
}
|
|
378
|
+
|
|
379
|
+
/**
 * Load a previously persisted index from `<mindosDir>/search-index.json`.
 *
 * Returns true if the index was restored, false if the file is missing,
 * unreadable, malformed, or stale. Instance state is only replaced after
 * every check has passed, so a failed load leaves the index untouched.
 *
 * Checks (all must pass):
 *  0. The parsed JSON has the expected shape (object with numeric counters,
 *     object-valued `docLengths` / `invertedIndex`, string[] posting lists).
 *  1. Version is 1 and `builtForRoot` equals `mindRoot`.
 *  2. The number of files currently on disk equals the persisted file count
 *     (detects files added or deleted while the process was down).
 *  3. Sampled files exist and have an mtime no newer than the persisted
 *     timestamp: every file when there are at most 50, otherwise an evenly
 *     spaced sample plus the last 10 entries.
 *
 * @param mindosDir - Directory containing `search-index.json`.
 * @param mindRoot - Root the index must have been built for.
 */
load(mindosDir: string, mindRoot: string): boolean {
  const filePath = path.join(mindosDir, 'search-index.json');

  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
  } catch {
    return false; // missing, unreadable, or not valid JSON
  }

  // JSON.parse can legally yield null, arrays, primitives, or objects with
  // missing fields — validate before touching any property.
  const isPlainObject = (v: unknown): v is Record<string, unknown> =>
    typeof v === 'object' && v !== null && !Array.isArray(v);

  if (!isPlainObject(parsed)) return false;
  const { version, builtForRoot, fileCount, totalChars, timestamp, docLengths, invertedIndex } = parsed;

  if (version !== 1 || builtForRoot !== mindRoot) return false;
  if (
    typeof fileCount !== 'number' ||
    typeof totalChars !== 'number' ||
    typeof timestamp !== 'number'
  ) return false;
  if (!isPlainObject(docLengths) || !isPlainObject(invertedIndex)) return false;

  const restoredLengths = new Map<string, number>();
  for (const [docPath, len] of Object.entries(docLengths)) {
    if (typeof len !== 'number') return false;
    restoredLengths.set(docPath, len);
  }

  // Check: file count on disk must match indexed count
  if (collectAllFiles(mindRoot).length !== fileCount) return false;

  // Check: mtime sampling — every file if ≤50, otherwise evenly spaced
  // samples plus the last 10 entries.
  const docPaths = [...restoredLengths.keys()];
  const toCheck = new Set<number>();
  if (docPaths.length <= 50) {
    for (let i = 0; i < docPaths.length; i++) toCheck.add(i);
  } else {
    const step = Math.max(1, Math.floor(docPaths.length / 40));
    for (let i = 0; i < docPaths.length; i += step) toCheck.add(i);
    for (let i = docPaths.length - 10; i < docPaths.length; i++) toCheck.add(i);
  }
  for (const idx of toCheck) {
    try {
      const stat = fs.statSync(path.join(mindRoot, docPaths[idx]));
      if (stat.mtimeMs > timestamp) return false; // modified after persist
    } catch {
      return false; // file deleted or not stat-able
    }
  }

  // Rebuild both the inverted index and the reverse (file → tokens) mapping
  const inverted = new Map<string, Set<string>>();
  const fileTokensMap = new Map<string, Set<string>>();
  for (const [token, files] of Object.entries(invertedIndex)) {
    if (!Array.isArray(files)) return false;
    const fileSet = new Set<string>();
    for (const f of files) {
      if (typeof f !== 'string') return false;
      fileSet.add(f);
      let tokens = fileTokensMap.get(f);
      if (!tokens) { tokens = new Set(); fileTokensMap.set(f, tokens); }
      tokens.add(token);
    }
    inverted.set(token, fileSet);
  }

  // All checks passed — commit restored state
  this.builtForRoot = mindRoot;
  this.fileCount = fileCount;
  this.totalChars = totalChars;
  this.docLengths = restoredLengths;
  this.invertedIndex = inverted;
  this.fileTokens = fileTokensMap;

  return true;
}
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
/**
 * JSON shape written by `SearchIndex.persist()` and read back by
 * `SearchIndex.load()`.
 */
interface PersistedIndex {
  /** Format version; `load()` accepts only 1. */
  version: number;
  /** Root the index was built for ('' when unset at persist time). */
  builtForRoot: string;
  /** Number of files counted by the index. */
  fileCount: number;
  /** Sum of recorded document lengths, in characters. */
  totalChars: number;
  /** File path → character count. */
  docLengths: { [filePath: string]: number };
  /** Token → paths of files containing it. */
  invertedIndex: { [token: string]: string[] };
  /** `Date.now()` at the moment the index was persisted. */
  timestamp: number;
}
|
package/app/lib/core/search.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
2
|
import path from 'path';
|
|
3
|
+
import os from 'os';
|
|
3
4
|
import { collectAllFiles } from './tree';
|
|
4
5
|
import { readFile } from './fs-ops';
|
|
5
6
|
import { SearchIndex } from './search-index';
|
|
@@ -11,24 +12,122 @@ import type { SearchResult, SearchOptions } from './types';
|
|
|
11
12
|
*/
|
|
12
13
|
const searchIndex = new SearchIndex();
|
|
13
14
|
|
|
15
|
+
/** Directory used for index persistence: `.mindos` inside the current user's home directory. */
function getMindosDir(): string {
  const home = os.homedir();
  return path.join(home, '.mindos');
}
|
|
19
|
+
|
|
14
20
|
/**
 * Invalidate the core search index. Called from `lib/fs.ts` on write operations.
 *
 * Delegates to `SearchIndex.invalidate()` on the module-level index instance;
 * the next `searchFiles` call finds the index not built for its root and
 * reloads or rebuilds it.
 */
export function invalidateSearchIndex(): void {
  searchIndex.invalidate();
}
|
|
18
24
|
|
|
25
|
+
/**
 * Re-index one file in the shared search index after a write/edit and
 * schedule a debounced persist. Does nothing while the index is not built.
 */
export function updateSearchIndexFile(mindRoot: string, filePath: string): void {
  if (searchIndex.isBuilt()) {
    searchIndex.updateFile(mindRoot, filePath);
    schedulePersist();
  }
}
|
|
31
|
+
|
|
32
|
+
/**
 * Add one newly created file to the shared search index and schedule a
 * debounced persist. Does nothing while the index is not built.
 */
export function addSearchIndexFile(mindRoot: string, filePath: string): void {
  if (searchIndex.isBuilt()) {
    searchIndex.addFile(mindRoot, filePath);
    schedulePersist();
  }
}
|
|
38
|
+
|
|
39
|
+
/**
 * Drop one deleted file from the shared search index and schedule a
 * debounced persist. Does nothing while the index is not built.
 */
export function removeSearchIndexFile(filePath: string): void {
  if (searchIndex.isBuilt()) {
    searchIndex.removeFile(filePath);
    schedulePersist();
  }
}
|
|
45
|
+
|
|
46
|
+
/** Pending debounce timer, or null when no persist is currently scheduled. */
let _persistTimer: ReturnType<typeof setTimeout> | null = null;
/** Set when the index has changed since the last flush to disk. */
let _persistDirty = false;

/**
 * Debounced persist: marks the index dirty and (re)starts a 5 s timer, so the
 * index is written to disk 5 s after the most recent write operation.
 */
function schedulePersist(): void {
  if (_persistTimer) {
    clearTimeout(_persistTimer);
  }
  _persistTimer = setTimeout(flushPersist, 5000);
  _persistDirty = true;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Write the index to disk right away if it is dirty, cancelling any pending
 * debounce timer. Used both as the timer callback and by the exit hooks.
 * Persist failures are ignored.
 */
function flushPersist(): void {
  if (_persistTimer) {
    clearTimeout(_persistTimer);
    _persistTimer = null;
  }

  if (_persistDirty) {
    // Clear the flag before writing, exactly once per flush
    _persistDirty = false;
    try {
      searchIndex.persist(getMindosDir());
    } catch {
      /* non-critical */
    }
  }
}
|
|
63
|
+
|
|
64
|
+
// Flush any pending index write before the process goes away: on a natural
// exit ('beforeExit') and on SIGTERM / SIGINT, after which the process exits
// with code 0.
if (typeof process !== 'undefined') {
  process.on('beforeExit', flushPersist);
  for (const signal of ['SIGTERM', 'SIGINT'] as const) {
    process.on(signal, () => {
      flushPersist();
      process.exit(0);
    });
  }
}
|
|
70
|
+
|
|
71
|
+
/* ── BM25 Parameters ── */
/** k1: controls how quickly repeated occurrences of a term stop adding score. */
const BM25_K1 = 1.2;
/** b: strength of document-length normalization (0 = none, 1 = full). */
const BM25_B = 0.75;

/**
 * BM25 contribution of one term to one document's score.
 *
 * score = idf * tfNorm, where
 *   idf    = ln((N - df + 0.5) / (df + 0.5) + 1)   (the +1 keeps idf from going
 *            negative for terms present in more than half the documents)
 *   tfNorm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * docLength / avgDocLength))
 *
 * @param tf - raw term frequency (occurrences of the term in the document)
 * @param df - document frequency (number of documents containing the term)
 * @param docLength - length of this document, in chars
 * @param avgDocLength - average document length across the corpus, in chars
 * @param totalDocs - total number of documents in the corpus (N)
 * @returns the score, or 0 when `tf`, `totalDocs` or `avgDocLength` is 0
 */
export function bm25Score(
  tf: number,
  df: number,
  docLength: number,
  avgDocLength: number,
  totalDocs: number,
): number {
  const nothingToScore = tf === 0 || totalDocs === 0 || avgDocLength === 0;
  if (nothingToScore) return 0;

  const idf = Math.log((totalDocs - df + 0.5) / (df + 0.5) + 1);

  // Longer-than-average documents get a larger denominator, i.e. a lower score
  const lengthPenalty = 1 - BM25_B + BM25_B * docLength / avgDocLength;
  const saturatedTf = (tf * (BM25_K1 + 1)) / (tf + BM25_K1 * lengthPenalty);

  return idf * saturatedTf;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Break a query into the distinct terms used for multi-term BM25 scoring.
 * The query is lowercased, trimmed, and split on runs of whitespace; empty
 * pieces are dropped and duplicates removed, keeping first-occurrence order.
 * Each returned term is scored on its own and the scores are summed per document.
 */
function splitQueryTerms(query: string): string[] {
  const distinct = new Set<string>();
  for (const piece of query.toLowerCase().trim().split(/\s+/)) {
    if (piece.length > 0) distinct.add(piece);
  }
  return Array.from(distinct);
}
|
|
114
|
+
|
|
19
115
|
/**
|
|
20
116
|
* Core literal search — used by MCP tools via REST API.
|
|
21
117
|
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
118
|
+
* Scoring: **BM25** (Best Matching 25) — the standard information retrieval
|
|
119
|
+
* ranking function. For multi-term queries, each term is scored independently
|
|
120
|
+
* and scores are summed. This means:
|
|
121
|
+
* - Rare terms (low document frequency) contribute more to the score
|
|
122
|
+
* - Term frequency has diminishing returns (saturation at k1)
|
|
123
|
+
* - Shorter documents score higher when term frequency is equal
|
|
24
124
|
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
* and invalidated on any write operation.
|
|
125
|
+
* Candidate narrowing: uses an in-memory inverted index with UNION semantics
|
|
126
|
+
* for multi-term queries (a document matching ANY term is a candidate).
|
|
28
127
|
*
|
|
29
128
|
* NOTE: The App also has a separate Fuse.js fuzzy search in `lib/fs.ts` for the
|
|
30
129
|
* browser `⌘K` search overlay. The two coexist intentionally:
|
|
31
|
-
* - Core search (here): exact literal match
|
|
130
|
+
* - Core search (here): exact literal match + BM25 ranking, used by MCP/API
|
|
32
131
|
* - App search (lib/fs.ts): Fuse.js fuzzy match with CJK support, used by frontend
|
|
33
132
|
*/
|
|
34
133
|
export function searchFiles(mindRoot: string, query: string, opts: SearchOptions = {}): SearchResult[] {
|
|
@@ -37,11 +136,21 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
|
|
|
37
136
|
|
|
38
137
|
// Ensure search index is built for this mindRoot
|
|
39
138
|
if (!searchIndex.isBuiltFor(mindRoot)) {
|
|
40
|
-
|
|
139
|
+
// Try loading from disk first (fast path — avoids full rebuild)
|
|
140
|
+
const loaded = searchIndex.load(getMindosDir(), mindRoot);
|
|
141
|
+
if (!loaded) {
|
|
142
|
+
searchIndex.rebuild(mindRoot);
|
|
143
|
+
// Persist for next cold start (fire-and-forget)
|
|
144
|
+
try { searchIndex.persist(getMindosDir()); } catch { /* non-critical */ }
|
|
145
|
+
}
|
|
41
146
|
}
|
|
42
147
|
|
|
43
|
-
|
|
44
|
-
const
|
|
148
|
+
const totalDocs = searchIndex.getFileCount();
|
|
149
|
+
const avgDocLength = searchIndex.getAvgDocLength();
|
|
150
|
+
const queryTerms = splitQueryTerms(query);
|
|
151
|
+
|
|
152
|
+
// Use UNION index to get candidate files (any file matching any term)
|
|
153
|
+
const candidates = searchIndex.getCandidatesUnion(query);
|
|
45
154
|
const candidateSet = candidates ? new Set(candidates) : null;
|
|
46
155
|
|
|
47
156
|
let allFiles = collectAllFiles(mindRoot);
|
|
@@ -72,10 +181,16 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
|
|
|
72
181
|
|
|
73
182
|
const results: SearchResult[] = [];
|
|
74
183
|
const lowerQuery = query.toLowerCase();
|
|
75
|
-
|
|
184
|
+
|
|
185
|
+
// ── Pre-scan: compute document frequency for each query term ──
|
|
186
|
+
// We count how many candidate files contain each term via literal match.
|
|
187
|
+
// This is more accurate than using the inverted index token df, because
|
|
188
|
+
// the index tokenizes via Intl.Segmenter (CJK word boundaries) which may
|
|
189
|
+
// split query terms differently than our literal substring match.
|
|
190
|
+
const termDf = new Map<string, number>();
|
|
191
|
+
const fileContents = new Map<string, string>();
|
|
76
192
|
|
|
77
193
|
for (const filePath of allFiles) {
|
|
78
|
-
// Check mtime filter before reading content
|
|
79
194
|
if (mtimeThreshold > 0) {
|
|
80
195
|
try {
|
|
81
196
|
const abs = path.join(mindRoot, filePath);
|
|
@@ -87,34 +202,70 @@ export function searchFiles(mindRoot: string, query: string, opts: SearchOptions
|
|
|
87
202
|
let content: string;
|
|
88
203
|
try { content = readFile(mindRoot, filePath); } catch { continue; }
|
|
89
204
|
|
|
205
|
+
const lower = content.toLowerCase();
|
|
206
|
+
fileContents.set(filePath, content);
|
|
207
|
+
|
|
208
|
+
for (const term of queryTerms) {
|
|
209
|
+
if (lower.includes(term)) {
|
|
210
|
+
termDf.set(term, (termDf.get(term) ?? 0) + 1);
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
// ── Score each document with BM25 ──
|
|
216
|
+
for (const [filePath, content] of fileContents) {
|
|
90
217
|
const lowerContent = content.toLowerCase();
|
|
91
|
-
const index = lowerContent.indexOf(lowerQuery);
|
|
92
|
-
if (index === -1) continue;
|
|
93
218
|
|
|
94
|
-
//
|
|
95
|
-
let
|
|
96
|
-
|
|
97
|
-
else snippetStart += 2; // skip the newlines
|
|
219
|
+
// Check if document matches any term (full-text verification after index narrowing)
|
|
220
|
+
let matchedAnyTerm = false;
|
|
221
|
+
let firstMatchIndex = -1;
|
|
98
222
|
|
|
99
|
-
|
|
100
|
-
|
|
223
|
+
// Compute BM25 score: sum of per-term scores
|
|
224
|
+
let totalScore = 0;
|
|
225
|
+
let totalOccurrences = 0;
|
|
226
|
+
const docLength = content.length;
|
|
101
227
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
228
|
+
for (const term of queryTerms) {
|
|
229
|
+
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
230
|
+
const matches = lowerContent.match(new RegExp(escapedTerm, 'g'));
|
|
231
|
+
const tf = matches ? matches.length : 0;
|
|
232
|
+
if (tf === 0) continue;
|
|
233
|
+
|
|
234
|
+
matchedAnyTerm = true;
|
|
235
|
+
totalOccurrences += tf;
|
|
236
|
+
|
|
237
|
+
if (firstMatchIndex === -1) {
|
|
238
|
+
firstMatchIndex = lowerContent.indexOf(term);
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// Get document frequency for this term (computed in pre-scan)
|
|
242
|
+
const df = termDf.get(term) ?? 0;
|
|
243
|
+
|
|
244
|
+
totalScore += bm25Score(tf, df, docLength, avgDocLength, totalDocs);
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
if (!matchedAnyTerm) continue;
|
|
248
|
+
|
|
249
|
+
// Build snippet around the first match
|
|
250
|
+
const index = firstMatchIndex >= 0 ? firstMatchIndex : lowerContent.indexOf(lowerQuery);
|
|
251
|
+
const snippetAnchor = index >= 0 ? index : 0;
|
|
252
|
+
|
|
253
|
+
let snippetStart = content.lastIndexOf('\n\n', snippetAnchor);
|
|
254
|
+
if (snippetStart === -1) snippetStart = Math.max(0, snippetAnchor - 200);
|
|
255
|
+
else snippetStart += 2;
|
|
256
|
+
|
|
257
|
+
let snippetEnd = content.indexOf('\n\n', snippetAnchor);
|
|
258
|
+
if (snippetEnd === -1) snippetEnd = Math.min(content.length, snippetAnchor + query.length + 200);
|
|
259
|
+
|
|
260
|
+
if (snippetAnchor - snippetStart > 200) snippetStart = snippetAnchor - 200;
|
|
261
|
+
if (snippetEnd - snippetAnchor > 200) snippetEnd = snippetAnchor + query.length + 200;
|
|
105
262
|
|
|
106
263
|
let snippet = content.slice(snippetStart, snippetEnd).trim();
|
|
107
|
-
|
|
108
|
-
// Collapse internal whitespace for cleaner search result presentation, but preserve some structure
|
|
109
264
|
snippet = snippet.replace(/\n{3,}/g, '\n\n');
|
|
110
|
-
|
|
111
265
|
if (snippetStart > 0) snippet = '...' + snippet;
|
|
112
266
|
if (snippetEnd < content.length) snippet += '...';
|
|
113
267
|
|
|
114
|
-
|
|
115
|
-
const score = occurrences / content.length;
|
|
116
|
-
|
|
117
|
-
results.push({ path: filePath, snippet, score, occurrences });
|
|
268
|
+
results.push({ path: filePath, snippet, score: totalScore, occurrences: totalOccurrences });
|
|
118
269
|
}
|
|
119
270
|
|
|
120
271
|
results.sort((a, b) => b.score - a.score);
|