@softerist/heuristic-mcp 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +287 -0
- package/CONTRIBUTING.md +308 -0
- package/LICENSE +21 -0
- package/README.md +249 -0
- package/config.json +66 -0
- package/example.png +0 -0
- package/features/clear-cache.js +75 -0
- package/features/find-similar-code.js +127 -0
- package/features/hybrid-search.js +173 -0
- package/features/index-codebase.js +811 -0
- package/how-its-works.png +0 -0
- package/index.js +208 -0
- package/lib/cache.js +163 -0
- package/lib/config.js +257 -0
- package/lib/embedding-worker.js +67 -0
- package/lib/ignore-patterns.js +314 -0
- package/lib/project-detector.js +75 -0
- package/lib/tokenizer.js +142 -0
- package/lib/utils.js +301 -0
- package/package.json +65 -0
- package/scripts/clear-cache.js +31 -0
- package/test/clear-cache.test.js +288 -0
- package/test/embedding-model.test.js +230 -0
- package/test/helpers.js +128 -0
- package/test/hybrid-search.test.js +243 -0
- package/test/index-codebase.test.js +246 -0
- package/test/integration.test.js +223 -0
- package/test/tokenizer.test.js +225 -0
- package/vitest.config.js +29 -0
package/features/index-codebase.js
@@ -0,0 +1,811 @@
+import { fdir } from "fdir";
+import fs from "fs/promises";
+import chokidar from "chokidar";
+import path from "path";
+import os from "os";
+import { Worker } from "worker_threads";
+import { fileURLToPath } from "url";
+import { smartChunk, hashContent } from "../lib/utils.js";
+
+function escapeRegExp(value) {
+  return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+function globToRegExp(pattern) {
+  let regex = "^";
+  for (let i = 0; i < pattern.length; ) {
+    const char = pattern[i];
+    if (char === "*") {
+      if (pattern[i + 1] === "*") {
+        if (pattern[i + 2] === "/") {
+          regex += "(?:.*/)?";
+          i += 3;
+        } else {
+          regex += ".*";
+          i += 2;
+        }
+      } else {
+        regex += "[^/]*";
+        i += 1;
+      }
+      continue;
+    }
+    if (char === "?") {
+      regex += "[^/]";
+      i += 1;
+      continue;
+    }
+    regex += escapeRegExp(char);
+    i += 1;
+  }
+  regex += "$";
+  return new RegExp(regex);
+}
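For reference, here is what that translation produces; a quick sketch, illustrative only, using just the functions defined above:

```js
// Expected behavior of globToRegExp as defined in the diff above.
globToRegExp("**/node_modules/**"); // => /^(?:.*\/)?node_modules\/.*$/
globToRegExp("*.test.js").test("a.test.js");     // true  ("*" stops at "/")
globToRegExp("*.test.js").test("src/a.test.js"); // false without a "**/" prefix
```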
+
+function normalizePath(filePath) {
+  return filePath.split(path.sep).join("/");
+}
+
+function buildExcludeMatchers(patterns) {
+  return [...new Set(patterns)]
+    .filter(Boolean)
+    .map(pattern => ({
+      matchBase: !pattern.includes("/"),
+      regex: globToRegExp(pattern)
+    }));
+}
+
+function matchesExcludePatterns(filePath, matchers) {
+  if (matchers.length === 0) return false;
+  const normalized = normalizePath(filePath);
+  const basename = path.posix.basename(normalized);
+
+  for (const matcher of matchers) {
+    const target = matcher.matchBase ? basename : normalized;
+    if (matcher.regex.test(target)) {
+      return true;
+    }
+  }
+  return false;
+}
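How the `matchBase` split plays out (illustrative, using the helpers above):

```js
const matchers = buildExcludeMatchers(["*.min.js", "dist/**"]);
matchesExcludePatterns("src/app.min.js", matchers); // true: no "/" in pattern, so the basename is matched
matchesExcludePatterns("dist/app.js", matchers);    // true: path-anchored pattern
matchesExcludePatterns("src/app.js", matchers);     // false
```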
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+export class CodebaseIndexer {
+  constructor(embedder, cache, config, server = null) {
+    this.embedder = embedder;
+    this.cache = cache;
+    this.config = config;
+    this.server = server;
+    this.watcher = null;
+    this.workers = [];
+    this.workerReady = [];
+    this.isIndexing = false;
+    this.excludeMatchers = buildExcludeMatchers(this.config.excludePatterns || []);
+  }
+
+  /**
+   * Initialize worker thread pool for parallel embedding
+   */
+  async initializeWorkers() {
+    const numWorkers = this.config.workerThreads === "auto"
+      ? Math.min(4, Math.max(1, os.cpus().length - 1)) // Cap 'auto' at 4 workers
+      : (this.config.workerThreads || 1);
+
+    // Only use workers if we have more than 1 CPU
+    if (numWorkers <= 1) {
+      console.error("[Indexer] Single-threaded mode (1 CPU detected)");
+      return;
+    }
+
+    if (this.config.verbose) {
+      console.error(`[Indexer] Worker config: workerThreads=${this.config.workerThreads}, resolved to ${numWorkers}`);
+    }
+
+    console.error(`[Indexer] Initializing ${numWorkers} worker threads...`);
+
+    const workerPath = path.join(__dirname, "../lib/embedding-worker.js");
+
+    for (let i = 0; i < numWorkers; i++) {
+      try {
+        const worker = new Worker(workerPath, {
+          workerData: {
+            embeddingModel: this.config.embeddingModel,
+            verbose: this.config.verbose
+          }
+        });
+
+        const readyPromise = new Promise((resolve, reject) => {
+          const timeout = setTimeout(() => reject(new Error("Worker init timeout")), 120000);
+
+          worker.once("message", (msg) => {
+            clearTimeout(timeout);
+            if (msg.type === "ready") {
+              resolve(worker);
+            } else if (msg.type === "error") {
+              reject(new Error(msg.error));
+            }
+          });
+
+          worker.once("error", (err) => {
+            clearTimeout(timeout);
+            reject(err);
+          });
+        });
+
+        this.workers.push(worker);
+        this.workerReady.push(readyPromise);
+      } catch (err) {
+        console.error(`[Indexer] Failed to create worker ${i}: ${err.message}`);
+      }
+    }
+
+    // Wait for all workers to be ready
+    try {
+      await Promise.all(this.workerReady);
+      console.error(`[Indexer] ${this.workers.length} workers ready`);
+      if (this.config.verbose) {
+        console.error(`[Indexer] Each worker loaded model: ${this.config.embeddingModel}`);
+      }
+    } catch (err) {
+      console.error(`[Indexer] Worker initialization failed: ${err.message}, falling back to single-threaded`);
+      this.terminateWorkers();
+    }
+  }
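lib/embedding-worker.js ships in this package (see the file list above) but its diff is not part of this excerpt. From the message protocol visible here (`ready`/`error` on startup, `process` answered by `results` keyed on `batchId`, and `shutdown`), a protocol-compatible worker would look roughly like the sketch below; the model loading is an assumption, not the package's actual code:

```js
// Hypothetical sketch of a protocol-compatible ESM worker; NOT the package's
// actual lib/embedding-worker.js.
import { parentPort, workerData } from "worker_threads";

let embed;
try {
  embed = await loadModel(workerData.embeddingModel); // placeholder loader (assumption)
  parentPort.postMessage({ type: "ready" });
} catch (err) {
  parentPort.postMessage({ type: "error", error: err.message });
}

parentPort.on("message", async (msg) => {
  if (msg.type === "shutdown") process.exit(0);
  if (msg.type !== "process") return;
  const results = [];
  for (const chunk of msg.chunks) {
    try {
      const output = await embed(chunk.text, { pooling: "mean", normalize: true });
      results.push({
        file: chunk.file,
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        content: chunk.text,
        vector: Array.from(output.data),
        success: true
      });
    } catch (err) {
      results.push({ file: chunk.file, error: err.message, success: false });
    }
  }
  parentPort.postMessage({ type: "results", batchId: msg.batchId, results });
});
```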
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Terminate all worker threads
|
|
158
|
+
*/
|
|
159
|
+
terminateWorkers() {
|
|
160
|
+
for (const worker of this.workers) {
|
|
161
|
+
worker.postMessage({ type: "shutdown" });
|
|
162
|
+
}
|
|
163
|
+
this.workers = [];
|
|
164
|
+
this.workerReady = [];
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
isExcluded(filePath) {
|
|
168
|
+
return matchesExcludePatterns(filePath, this.excludeMatchers);
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Send MCP progress notification to connected clients
|
|
173
|
+
*/
|
|
174
|
+
sendProgress(progress, total, message) {
|
|
175
|
+
if (this.server) {
|
|
176
|
+
try {
|
|
177
|
+
this.server.sendNotification("notifications/progress", {
|
|
178
|
+
progressToken: "indexing",
|
|
179
|
+
progress,
|
|
180
|
+
total,
|
|
181
|
+
message
|
|
182
|
+
});
|
|
183
|
+
} catch (err) {
|
|
184
|
+
// Silently ignore if client doesn't support progress notifications
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* Process chunks using worker thread pool with timeout and error recovery
|
|
191
|
+
*/
|
|
192
|
+
async processChunksWithWorkers(allChunks) {
|
|
193
|
+
if (this.workers.length === 0) {
|
|
194
|
+
// Fallback to single-threaded processing
|
|
195
|
+
return this.processChunksSingleThreaded(allChunks);
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
const results = [];
|
|
199
|
+
const chunkSize = Math.ceil(allChunks.length / this.workers.length);
|
|
200
|
+
const workerPromises = [];
|
|
201
|
+
const WORKER_TIMEOUT = 300000; // 5 minutes per batch
|
|
202
|
+
|
|
203
|
+
if (this.config.verbose) {
|
|
204
|
+
console.error(`[Indexer] Distributing ${allChunks.length} chunks across ${this.workers.length} workers (~${chunkSize} chunks each)`);
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
for (let i = 0; i < this.workers.length; i++) {
|
|
208
|
+
const workerChunks = allChunks.slice(i * chunkSize, (i + 1) * chunkSize);
|
|
209
|
+
if (workerChunks.length === 0) continue;
|
|
210
|
+
|
|
211
|
+
if (this.config.verbose) {
|
|
212
|
+
console.error(`[Indexer] Worker ${i}: processing ${workerChunks.length} chunks`);
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
const promise = new Promise((resolve, reject) => {
|
|
216
|
+
const worker = this.workers[i];
|
|
217
|
+
const batchId = `batch-${i}-${Date.now()}`;
|
|
218
|
+
|
|
219
|
+
// Timeout handler
|
|
220
|
+
const timeout = setTimeout(() => {
|
|
221
|
+
worker.off("message", handler);
|
|
222
|
+
console.error(`[Indexer] Worker ${i} timed out, falling back to single-threaded for this batch`);
|
|
223
|
+
// Return empty and let fallback handle it
|
|
224
|
+
resolve([]);
|
|
225
|
+
}, WORKER_TIMEOUT);
|
|
226
|
+
|
|
227
|
+
const handler = (msg) => {
|
|
228
|
+
if (msg.batchId === batchId) {
|
|
229
|
+
clearTimeout(timeout);
|
|
230
|
+
worker.off("message", handler);
|
|
231
|
+
if (msg.type === "results") {
|
|
232
|
+
resolve(msg.results);
|
|
233
|
+
} else if (msg.type === "error") {
|
|
234
|
+
console.error(`[Indexer] Worker ${i} error: ${msg.error}`);
|
|
235
|
+
resolve([]); // Return empty, don't reject - let fallback handle
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
};
|
|
239
|
+
|
|
240
|
+
// Handle worker crash
|
|
241
|
+
const errorHandler = (err) => {
|
|
242
|
+
clearTimeout(timeout);
|
|
243
|
+
worker.off("message", handler);
|
|
244
|
+
console.error(`[Indexer] Worker ${i} crashed: ${err.message}`);
|
|
245
|
+
resolve([]); // Return empty, don't reject
|
|
246
|
+
};
|
|
247
|
+
worker.once("error", errorHandler);
|
|
248
|
+
|
|
249
|
+
worker.on("message", handler);
|
|
250
|
+
worker.postMessage({ type: "process", chunks: workerChunks, batchId });
|
|
251
|
+
});
|
|
252
|
+
|
|
253
|
+
workerPromises.push({ promise, chunks: workerChunks });
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
// Wait for all workers with error recovery
|
|
257
|
+
const workerResults = await Promise.all(workerPromises.map(p => p.promise));
|
|
258
|
+
|
|
259
|
+
// Collect results and identify failed chunks that need retry
|
|
260
|
+
const failedChunks = [];
|
|
261
|
+
for (let i = 0; i < workerResults.length; i++) {
|
|
262
|
+
if (workerResults[i].length > 0) {
|
|
263
|
+
results.push(...workerResults[i]);
|
|
264
|
+
} else if (workerPromises[i].chunks.length > 0) {
|
|
265
|
+
// Worker failed or timed out, need to retry these chunks
|
|
266
|
+
failedChunks.push(...workerPromises[i].chunks);
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
// Retry failed chunks with single-threaded fallback
|
|
271
|
+
if (failedChunks.length > 0) {
|
|
272
|
+
console.error(`[Indexer] Retrying ${failedChunks.length} chunks with single-threaded fallback...`);
|
|
273
|
+
const retryResults = await this.processChunksSingleThreaded(failedChunks);
|
|
274
|
+
results.push(...retryResults);
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
return results;
|
|
278
|
+
}
|
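The slicing hands each worker a contiguous block, with the last worker taking the remainder. For example, 10 chunks across 3 workers gives `chunkSize = Math.ceil(10 / 3) = 4`:

```js
// Illustrative only: how the slice bounds fall out for 10 chunks, 3 workers.
allChunks.slice(0, 4);  // worker 0: chunks 0-3
allChunks.slice(4, 8);  // worker 1: chunks 4-7
allChunks.slice(8, 12); // worker 2: chunks 8-9 (slice clamps to the array length)
```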
+
+  /**
+   * Single-threaded chunk processing (fallback)
+   */
+  async processChunksSingleThreaded(chunks) {
+    const results = [];
+
+    for (const chunk of chunks) {
+      try {
+        const output = await this.embedder(chunk.text, { pooling: "mean", normalize: true });
+        results.push({
+          file: chunk.file,
+          startLine: chunk.startLine,
+          endLine: chunk.endLine,
+          content: chunk.text,
+          vector: Array.from(output.data),
+          success: true
+        });
+      } catch (error) {
+        results.push({
+          file: chunk.file,
+          startLine: chunk.startLine,
+          endLine: chunk.endLine,
+          error: error.message,
+          success: false
+        });
+      }
+    }
+
+    return results;
+  }
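The embedder itself is injected through the constructor rather than created in this file. The call shape `embedder(text, { pooling: "mean", normalize: true })` matches a transformers.js feature-extraction pipeline, so a plausible construction (an assumption, since the actual setup lives elsewhere in the package) is:

```js
// Assumed wiring: the package's real embedder setup is not in this excerpt.
import { pipeline } from "@xenova/transformers";

const embedder = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");
const output = await embedder("function add(a, b) { return a + b; }", {
  pooling: "mean",
  normalize: true
});
console.log(Array.from(output.data).length); // embedding dimension (384 for MiniLM-class models)
```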
+
+  async indexFile(file) {
+    const fileName = path.basename(file);
+    if (this.isExcluded(file)) {
+      if (this.config.verbose) {
+        console.error(`[Indexer] Skipped ${fileName} (excluded by pattern)`);
+      }
+      return 0;
+    }
+    if (this.config.verbose) {
+      console.error(`[Indexer] Processing: ${fileName}...`);
+    }
+
+    try {
+      // Check file size first
+      const stats = await fs.stat(file);
+
+      // Skip directories
+      if (stats.isDirectory()) {
+        return 0;
+      }
+
+      if (stats.size > this.config.maxFileSize) {
+        if (this.config.verbose) {
+          console.error(`[Indexer] Skipped ${fileName} (too large: ${(stats.size / 1024 / 1024).toFixed(2)}MB)`);
+        }
+        return 0;
+      }
+
+      const content = await fs.readFile(file, "utf-8");
+      const hash = hashContent(content);
+
+      // Skip if file hasn't changed
+      if (this.cache.getFileHash(file) === hash) {
+        if (this.config.verbose) {
+          console.error(`[Indexer] Skipped ${fileName} (unchanged)`);
+        }
+        return 0;
+      }
+
+      if (this.config.verbose) {
+        console.error(`[Indexer] Indexing ${fileName}...`);
+      }
+
+      // Remove old chunks for this file
+      this.cache.removeFileFromStore(file);
+
+      const chunks = smartChunk(content, file, this.config);
+      let addedChunks = 0;
+      let failedChunks = 0;
+
+      for (const chunk of chunks) {
+        try {
+          const output = await this.embedder(chunk.text, { pooling: "mean", normalize: true });
+
+          this.cache.addToStore({
+            file,
+            startLine: chunk.startLine,
+            endLine: chunk.endLine,
+            content: chunk.text,
+            vector: Array.from(output.data)
+          });
+          addedChunks++;
+        } catch (embeddingError) {
+          failedChunks++;
+          console.error(`[Indexer] Failed to embed chunk in ${fileName}:`, embeddingError.message);
+        }
+      }
+
+      if (chunks.length === 0 || failedChunks === 0) {
+        this.cache.setFileHash(file, hash);
+      } else if (this.config.verbose) {
+        console.error(`[Indexer] Skipped hash update for ${fileName} (${addedChunks}/${chunks.length} chunks embedded)`);
+      }
+      if (this.config.verbose) {
+        console.error(`[Indexer] Completed ${fileName} (${addedChunks} chunks)`);
+      }
+      return addedChunks;
+    } catch (error) {
+      console.error(`[Indexer] Error indexing ${fileName}:`, error.message);
+      return 0;
+    }
+  }
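`indexFile` returns the number of chunks it embedded and 0 on every skip path, so a caller can treat the return value as incremental work done. A hypothetical call site (the path is illustrative):

```js
const added = await indexer.indexFile("/repo/src/app.js"); // hypothetical path
if (added > 0) await indexer.cache.save(); // the file watcher below pairs indexFile with cache.save() the same way
```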
+
+  /**
+   * Discover files using fdir (3-5x faster than glob)
+   * Uses config.excludePatterns which includes smart patterns from ignore-patterns.js
+   */
+  async discoverFiles() {
+    const startTime = Date.now();
+
+    // Build extension filter from config
+    const extensions = new Set(this.config.fileExtensions.map(ext => `.${ext}`));
+
+    // Extract directory names from glob patterns in config.excludePatterns
+    // Patterns like "**/node_modules/**" -> "node_modules"
+    const excludeDirs = new Set();
+    for (const pattern of this.config.excludePatterns) {
+      // Extract directory names from glob patterns
+      const match = pattern.match(/\*\*\/([^/*]+)\/?\*?\*?$/);
+      if (match) {
+        excludeDirs.add(match[1]);
+      }
+      // Also handle patterns like "**/dirname/**"
+      const match2 = pattern.match(/\*\*\/([^/*]+)\/\*\*$/);
+      if (match2) {
+        excludeDirs.add(match2[1]);
+      }
+    }
+
+    // Always exclude cache directory
+    excludeDirs.add(".smart-coding-cache");
+
+    if (this.config.verbose) {
+      console.error(`[Indexer] Using ${excludeDirs.size} exclude directories from config`);
+    }
+
+    const api = new fdir()
+      .withFullPaths()
+      .exclude((dirName) => excludeDirs.has(dirName))
+      .filter((filePath) => extensions.has(path.extname(filePath)) && !this.isExcluded(filePath))
+      .crawl(this.config.searchDirectory);
+
+    const files = await api.withPromise();
+
+    console.error(`[Indexer] File discovery: ${files.length} files in ${Date.now() - startTime}ms`);
+    return files;
+  }
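Note the two extraction regexes overlap: the first already captures the directory name from both `**/dist` and `**/dist/**` shapes, so the second is effectively a subset of it. What they yield (illustrative):

```js
// Illustrative only: what the extraction regexes above capture.
"**/node_modules/**".match(/\*\*\/([^/*]+)\/?\*?\*?$/)[1]; // "node_modules"
"**/dist".match(/\*\*\/([^/*]+)\/?\*?\*?$/)[1];            // "dist"
"*.log".match(/\*\*\/([^/*]+)\/?\*?\*?$/);                 // null: file globs are left to isExcluded()
```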
+
+  /**
+   * Pre-filter files by hash (skip unchanged files before processing)
+   */
+  async preFilterFiles(files) {
+    const startTime = Date.now();
+    const filesToProcess = [];
+    const skippedCount = { unchanged: 0, tooLarge: 0, error: 0 };
+
+    // Process in parallel batches for speed
+    const BATCH_SIZE = 500;
+
+    for (let i = 0; i < files.length; i += BATCH_SIZE) {
+      const batch = files.slice(i, i + BATCH_SIZE);
+
+      const results = await Promise.all(
+        batch.map(async (file) => {
+          try {
+            const stats = await fs.stat(file);
+
+            if (stats.isDirectory()) {
+              return null;
+            }
+
+            if (stats.size > this.config.maxFileSize) {
+              skippedCount.tooLarge++;
+              return null;
+            }
+
+            const content = await fs.readFile(file, "utf-8");
+            const hash = hashContent(content);
+
+            if (this.cache.getFileHash(file) === hash) {
+              skippedCount.unchanged++;
+              return null;
+            }
+
+            return { file, content, hash };
+          } catch (error) {
+            skippedCount.error++;
+            return null;
+          }
+        })
+      );
+
+      for (const result of results) {
+        if (result) filesToProcess.push(result);
+      }
+    }
+
+    console.error(`[Indexer] Pre-filter: ${filesToProcess.length} changed, ${skippedCount.unchanged} unchanged, ${skippedCount.tooLarge} too large, ${skippedCount.error} errors (${Date.now() - startTime}ms)`);
+    return filesToProcess;
+  }
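Batching the `Promise.all` caps the number of concurrently open files at `BATCH_SIZE`. The same idea as a standalone helper (a sketch, not part of the package):

```js
// Generic batched map: at most batchSize promises in flight per round.
async function mapInBatches(items, batchSize, fn) {
  const out = [];
  for (let i = 0; i < items.length; i += batchSize) {
    const settled = await Promise.all(items.slice(i, i + batchSize).map(fn));
    out.push(...settled);
  }
  return out;
}
```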
+
+  async indexAll(force = false) {
+    if (this.isIndexing) {
+      console.error("[Indexer] Indexing already in progress, skipping concurrent request");
+      return { skipped: true, reason: "Indexing already in progress" };
+    }
+
+    this.isIndexing = true;
+
+    try {
+      if (force) {
+        console.error("[Indexer] Force reindex requested: clearing cache");
+        this.cache.setVectorStore([]);
+        this.cache.fileHashes = new Map();
+      }
+
+      const totalStartTime = Date.now();
+      console.error(`[Indexer] Starting optimized indexing in ${this.config.searchDirectory}...`);
+
+      // Step 1: Fast file discovery with fdir
+      const files = await this.discoverFiles();
+
+      if (files.length === 0) {
+        console.error("[Indexer] No files found to index");
+        this.sendProgress(100, 100, "No files found to index");
+        return { skipped: false, filesProcessed: 0, chunksCreated: 0, message: "No files found to index" };
+      }
+
+      // Send progress: discovery complete
+      this.sendProgress(5, 100, `Discovered ${files.length} files`);
+
+      // Step 1.5: Prune deleted or excluded files from cache
+      if (!force) {
+        const currentFilesSet = new Set(files);
+        const cachedFiles = Array.from(this.cache.fileHashes.keys());
+        let prunedCount = 0;
+
+        for (const cachedFile of cachedFiles) {
+          if (!currentFilesSet.has(cachedFile)) {
+            this.cache.removeFileFromStore(cachedFile);
+            this.cache.deleteFileHash(cachedFile);
+            prunedCount++;
+          }
+        }
+
+        if (prunedCount > 0) {
+          if (this.config.verbose) {
+            console.error(`[Indexer] Pruned ${prunedCount} deleted/excluded files from index`);
+          }
+          // If we pruned files, we should save these changes even if no other files changed
+        }
+      }
+
+      // Step 2: Pre-filter unchanged files (early hash check)
+      const filesToProcess = await this.preFilterFiles(files);
+
+      if (filesToProcess.length === 0) {
+        console.error("[Indexer] All files unchanged, nothing to index");
+        this.sendProgress(100, 100, "All files up to date");
+        await this.cache.save();
+        const vectorStore = this.cache.getVectorStore();
+        return {
+          skipped: false,
+          filesProcessed: 0,
+          chunksCreated: 0,
+          totalFiles: new Set(vectorStore.map(v => v.file)).size,
+          totalChunks: vectorStore.length,
+          message: "All files up to date"
+        };
+      }
+
+      // Send progress: filtering complete
+      this.sendProgress(10, 100, `Processing ${filesToProcess.length} changed files`);
+
+      // Step 3: Determine batch size based on project size
+      const adaptiveBatchSize = files.length > 10000 ? 500 :
+        files.length > 1000 ? 200 :
+        this.config.batchSize || 100;
+
+      console.error(`[Indexer] Processing ${filesToProcess.length} files (batch size: ${adaptiveBatchSize})`);
+
+      // Step 4: Initialize worker threads (always use when multi-core available)
+      const useWorkers = os.cpus().length > 1;
+
+      if (useWorkers) {
+        await this.initializeWorkers();
+        console.error(`[Indexer] Multi-threaded mode: ${this.workers.length} workers active`);
+      } else {
+        console.error(`[Indexer] Single-threaded mode (single-core system)`);
+      }
+
+      let totalChunks = 0;
+      let processedFiles = 0;
+
+      // Step 5: Process files in adaptive batches
+      for (let i = 0; i < filesToProcess.length; i += adaptiveBatchSize) {
+        const batch = filesToProcess.slice(i, i + adaptiveBatchSize);
+
+        // Generate all chunks for this batch
+        const allChunks = [];
+        const fileStats = new Map();
+
+        for (const { file, content, hash } of batch) {
+          // Remove old chunks for this file
+          this.cache.removeFileFromStore(file);
+
+          const chunks = smartChunk(content, file, this.config);
+          fileStats.set(file, { hash, totalChunks: 0, successChunks: 0 });
+
+          for (const chunk of chunks) {
+            allChunks.push({
+              file,
+              text: chunk.text,
+              startLine: chunk.startLine,
+              endLine: chunk.endLine
+            });
+            const stats = fileStats.get(file);
+            if (stats) {
+              stats.totalChunks++;
+            }
+          }
+        }
+
+        // Process chunks (with workers if available, otherwise single-threaded)
+        let results;
+        if (useWorkers && this.workers.length > 0) {
+          results = await this.processChunksWithWorkers(allChunks);
+        } else {
+          results = await this.processChunksSingleThreaded(allChunks);
+        }
+
+        // Store successful results
+        for (const result of results) {
+          const stats = fileStats.get(result.file);
+          if (result.success) {
+            this.cache.addToStore({
+              file: result.file,
+              startLine: result.startLine,
+              endLine: result.endLine,
+              content: result.content,
+              vector: result.vector
+            });
+            totalChunks++;
+            if (stats) {
+              stats.successChunks++;
+            }
+          }
+        }
+
+        // Update file hashes
+        for (const [file, stats] of fileStats) {
+          if (stats.totalChunks === 0 || stats.successChunks === stats.totalChunks) {
+            this.cache.setFileHash(file, stats.hash);
+          } else if (this.config.verbose) {
+            console.error(`[Indexer] Skipped hash update for ${path.basename(file)} (${stats.successChunks}/${stats.totalChunks} chunks embedded)`);
+          }
+        }
+
+        processedFiles += batch.length;
+
+        // Progress indicator every batch
+        if (processedFiles % (adaptiveBatchSize * 2) === 0 || processedFiles === filesToProcess.length) {
+          const elapsed = ((Date.now() - totalStartTime) / 1000).toFixed(1);
+          const rate = (processedFiles / parseFloat(elapsed)).toFixed(0);
+          console.error(`[Indexer] Progress: ${processedFiles}/${filesToProcess.length} files (${rate} files/sec)`);
+
+          // Send MCP progress notification (10-95% range for batch processing)
+          const progressPercent = Math.floor(10 + (processedFiles / filesToProcess.length) * 85);
+          this.sendProgress(progressPercent, 100, `Indexed ${processedFiles}/${filesToProcess.length} files (${rate}/sec)`);
+        }
+      }
+
+      // Cleanup workers
+      if (useWorkers) {
+        this.terminateWorkers();
+      }
+
+      const totalTime = ((Date.now() - totalStartTime) / 1000).toFixed(1);
+      console.error(`[Indexer] Complete: ${totalChunks} chunks from ${filesToProcess.length} files in ${totalTime}s`);
+
+      // Send completion progress
+      this.sendProgress(100, 100, `Complete: ${totalChunks} chunks from ${filesToProcess.length} files in ${totalTime}s`);
+
+      await this.cache.save();
+
+      const vectorStore = this.cache.getVectorStore();
+      return {
+        skipped: false,
+        filesProcessed: filesToProcess.length,
+        chunksCreated: totalChunks,
+        totalFiles: new Set(vectorStore.map(v => v.file)).size,
+        totalChunks: vectorStore.length,
+        duration: totalTime,
+        message: `Indexed ${filesToProcess.length} files (${totalChunks} chunks) in ${totalTime}s`
+      };
+    } finally {
+      this.isIndexing = false;
+    }
+  }
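`indexAll` always resolves to a summary object; the field shapes below are taken from its return statements, while the call itself is a hypothetical sketch:

```js
const result = await indexer.indexAll(false);
if (result.skipped) {
  console.log(result.reason); // "Indexing already in progress"
} else {
  console.log(`${result.filesProcessed} files, ${result.chunksCreated} chunks`);
  console.log(result.message); // e.g. "Indexed 12 files (96 chunks) in 3.4s"
}
```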
+
+  setupFileWatcher() {
+    if (!this.config.watchFiles) return;
+
+    const pattern = this.config.fileExtensions.map(ext => `**/*.${ext}`);
+
+    this.watcher = chokidar.watch(pattern, {
+      cwd: this.config.searchDirectory,
+      ignored: this.config.excludePatterns,
+      persistent: true,
+      ignoreInitial: true
+    });
+
+    this.watcher
+      .on("add", async (filePath) => {
+        const fullPath = path.join(this.config.searchDirectory, filePath);
+        console.error(`[Indexer] New file detected: ${filePath}`);
+
+        // Invalidate recency cache
+        if (this.server && this.server.hybridSearch) {
+          this.server.hybridSearch.clearFileModTime(fullPath);
+        }
+
+        await this.indexFile(fullPath);
+        await this.cache.save();
+      })
+      .on("change", async (filePath) => {
+        const fullPath = path.join(this.config.searchDirectory, filePath);
+        console.error(`[Indexer] File changed: ${filePath}`);
+
+        // Invalidate recency cache
+        if (this.server && this.server.hybridSearch) {
+          this.server.hybridSearch.clearFileModTime(fullPath);
+        }
+
+        await this.indexFile(fullPath);
+        await this.cache.save();
+      })
+      .on("unlink", (filePath) => {
+        const fullPath = path.join(this.config.searchDirectory, filePath);
+        console.error(`[Indexer] File deleted: ${filePath}`);
+
+        // Invalidate recency cache
+        if (this.server && this.server.hybridSearch) {
+          this.server.hybridSearch.clearFileModTime(fullPath);
+        }
+
+        this.cache.removeFileFromStore(fullPath);
+        this.cache.deleteFileHash(fullPath);
+        this.cache.save();
+      });
+
+    console.error("[Indexer] File watcher enabled for incremental indexing");
+  }
+}
+
+// MCP Tool definition for this feature
+export function getToolDefinition() {
+  return {
+    name: "b_index_codebase",
+    description: "Manually trigger a full reindex of the codebase. This will scan all files and update the embeddings cache. Useful after large code changes or if the index seems out of date.",
+    inputSchema: {
+      type: "object",
+      properties: {
+        force: {
+          type: "boolean",
+          description: "Force reindex even if files haven't changed",
+          default: false
+        }
+      }
+    },
+    annotations: {
+      title: "Reindex Codebase",
+      readOnlyHint: false,
+      destructiveHint: false,
+      idempotentHint: true,
+      openWorldHint: false
+    }
+  };
+}
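index.js (also in this package, but outside this excerpt) presumably wires these two exports into the MCP server. With the official SDK, that registration would look roughly like the sketch below; this is an assumption, not the package's actual index.js:

```js
// Hypothetical registration sketch using @modelcontextprotocol/sdk.
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { ListToolsRequestSchema, CallToolRequestSchema } from "@modelcontextprotocol/sdk/types.js";
import * as indexCodebase from "./features/index-codebase.js";

const server = new Server(
  { name: "heuristic-mcp", version: "2.0.0" },
  { capabilities: { tools: {} } }
);

server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [indexCodebase.getToolDefinition()]
}));

server.setRequestHandler(CallToolRequestSchema, async (request) => {
  if (request.params.name === "b_index_codebase") {
    return indexCodebase.handleToolCall(request, indexer); // indexer constructed at startup
  }
  throw new Error(`Unknown tool: ${request.params.name}`);
});
```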
+
+// Tool handler
+export async function handleToolCall(request, indexer) {
+  const force = request.params.arguments?.force || false;
+  const result = await indexer.indexAll(force);
+
+  // Handle case when indexing was skipped due to concurrent request
+  if (result?.skipped) {
+    return {
+      content: [{
+        type: "text",
+        text: `Indexing skipped: ${result.reason}\n\nPlease wait for the current indexing operation to complete before requesting another reindex.`
+      }]
+    };
+  }
+
+  // Get current stats from cache
+  const vectorStore = indexer.cache.getVectorStore();
+  const stats = {
+    totalChunks: result?.totalChunks ?? vectorStore.length,
+    totalFiles: result?.totalFiles ?? new Set(vectorStore.map(v => v.file)).size,
+    filesProcessed: result?.filesProcessed ?? 0,
+    chunksCreated: result?.chunksCreated ?? 0
+  };
+
+  let message = result?.message
+    ? `Codebase reindexed successfully.\n\n${result.message}`
+    : `Codebase reindexed successfully.`;
+
+  message += `\n\nStatistics:\n- Total files in index: ${stats.totalFiles}\n- Total code chunks: ${stats.totalChunks}`;
+
+  if (stats.filesProcessed > 0) {
+    message += `\n- Files processed this run: ${stats.filesProcessed}\n- Chunks created this run: ${stats.chunksCreated}`;
+  }
+
+  return {
+    content: [{
+      type: "text",
+      text: message
+    }]
+  };
+}