semantic-code-mcp 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,999 @@
1
+ import { fdir } from "fdir";
2
+ import fs from "fs/promises";
3
+ import chokidar from "chokidar";
4
+ import path from "path";
5
+ import os from "os";
6
+ import { Worker } from "worker_threads";
7
+ import { fileURLToPath } from "url";
8
+ import { smartChunk, hashContent } from "../lib/utils.js";
9
+ import { ResourceThrottle } from "../lib/resource-throttle.js";
10
+
11
// ESM equivalent of CommonJS __dirname: directory containing this module.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
12
+
13
/**
 * Derive { totalChunks, totalFiles } from a cache object.
 * Prefers the modern async getStats() contract; falls back to the legacy
 * getVectorStore() array when getStats is absent or throws.
 */
async function resolveCacheStats(cache) {
  if (typeof cache?.getStats === "function") {
    try {
      const stats = await cache.getStats();
      return {
        totalChunks: Number(stats?.totalChunks || 0),
        totalFiles: Number(stats?.totalFiles || 0)
      };
    } catch {
      // getStats failed - fall through to the legacy vectorStore contract.
    }
  }

  const entries = cache?.getVectorStore?.() || [];
  const uniqueFiles = new Set();
  for (const entry of entries) {
    uniqueFiles.add(entry.file);
  }
  return { totalChunks: entries.length, totalFiles: uniqueFiles.size };
}
32
+
33
+ export class CodebaseIndexer {
34
/**
 * @param {Function} embedder - Embedding function: (text, opts) => { data: Float32Array-like }.
 * @param {object} cache - Vector store / file-hash cache (SQLite-backed or legacy in-memory).
 * @param {object} config - Indexer configuration (directories, patterns, worker settings).
 * @param {object|null} server - Optional MCP server used for progress notifications.
 */
constructor(embedder, cache, config, server = null) {
  this.embedder = embedder;
  this.cache = cache;
  this.config = config;
  this.server = server;
  this.watcher = null;        // chokidar watcher, set by setupFileWatcher()
  this.workers = [];          // worker_threads pool, set by initializeWorkers()
  this.workerReady = [];      // per-worker readiness promises
  this.isIndexing = false;    // guards against concurrent indexAll() runs

  // Initialize resource throttling
  this.throttle = new ResourceThrottle(config);

  // Track indexing status for progressive search
  this.indexingStatus = {
    inProgress: false,
    totalFiles: 0,
    processedFiles: 0,
    percentage: 0
  };
}
55
+
56
/**
 * Initialize worker thread pool for parallel embedding.
 *
 * Stays single-threaded (returns early, this.workers left empty) when:
 *  - the embedding model is a nomic model (ONNX runtime thread-safety issues),
 *  - workers are disabled via config (workerThreads === 0 or disableWorkers),
 *  - the throttled worker count resolves to <= 1.
 * If any worker fails to become ready, the whole pool is torn down and the
 * caller falls back to single-threaded processing.
 */
async initializeWorkers() {
  // Workers don't work with nomic/transformers.js due to ONNX WASM thread-safety issues
  const isNomicModel = this.config.embeddingModel?.includes('nomic');
  if (isNomicModel) {
    console.error("[Indexer] Single-threaded mode (nomic model - ONNX workers incompatible)");
    return;
  }

  // API providers: allow workers for parallel embedding
  // Each worker makes independent API calls concurrently
  const provider = (this.config.embeddingProvider || 'local').toLowerCase();
  if (['gemini', 'openai', 'openai-compatible', 'vertex'].includes(provider)) {
    console.error("[Indexer] API provider detected - using parallel workers for faster embedding");
  }

  // Check if workers are explicitly disabled
  if (this.config.workerThreads === 0 || this.config.disableWorkers) {
    console.error("[Indexer] Single-threaded mode (workers disabled by config)");
    return;
  }

  const numWorkers = this.config.workerThreads === "auto"
    ? this.throttle.maxWorkers // Use throttled worker count
    : this.throttle.getWorkerCount(this.config.workerThreads);

  // Only use workers if we have more than 1 CPU
  // NOTE(review): numWorkers <= 1 can also result from throttling, not only
  // a single-CPU machine, so this message may be imprecise.
  if (numWorkers <= 1) {
    console.error("[Indexer] Single-threaded mode (1 CPU detected)");
    return;
  }

  if (this.config.verbose) {
    console.error(`[Indexer] Worker config: workerThreads=${this.config.workerThreads}, resolved to ${numWorkers}`);
  }

  console.error(`[Indexer] Initializing ${numWorkers} worker threads...`);

  const workerPath = path.join(__dirname, "../lib/embedding-worker.js");

  for (let i = 0; i < numWorkers; i++) {
    try {
      // Pass the full embedding configuration so each worker can construct
      // its own embedder independently of the main thread.
      const worker = new Worker(workerPath, {
        workerData: {
          embeddingProvider: this.config.embeddingProvider,
          embeddingModel: this.config.embeddingModel,
          embeddingDimension: this.config.embeddingDimension,
          geminiApiKey: this.config.geminiApiKey,
          geminiModel: this.config.geminiModel,
          geminiBaseURL: this.config.geminiBaseURL,
          embeddingApiKey: this.config.embeddingApiKey,
          embeddingBaseURL: this.config.embeddingBaseURL,
          openaiApiKey: this.config.openaiApiKey || process.env.OPENAI_API_KEY,
          vertexProject: this.config.vertexProject,
          vertexLocation: this.config.vertexLocation,
          googleApplicationCredentials: process.env.GOOGLE_APPLICATION_CREDENTIALS,
          geminiDimensions: this.config.geminiDimensions,
          geminiBatchSize: this.config.geminiBatchSize,
          geminiBatchFlushMs: this.config.geminiBatchFlushMs,
          geminiMaxRetries: this.config.geminiMaxRetries,
          verbose: this.config.verbose
        }
      });

      // Resolves when the worker reports "ready"; rejects on "error", worker
      // crash, or a 2-minute timeout (model loading can be slow).
      const readyPromise = new Promise((resolve, reject) => {
        const timeout = setTimeout(() => reject(new Error("Worker init timeout")), 120000);

        worker.once("message", (msg) => {
          clearTimeout(timeout);
          if (msg.type === "ready") {
            resolve(worker);
          } else if (msg.type === "error") {
            reject(new Error(msg.error));
          }
        });

        worker.once("error", (err) => {
          clearTimeout(timeout);
          reject(err);
        });
      });

      this.workers.push(worker);
      this.workerReady.push(readyPromise);
    } catch (err) {
      console.error(`[Indexer] Failed to create worker ${i}: ${err.message}`);
    }
  }

  // Wait for all workers to be ready
  try {
    await Promise.all(this.workerReady);
    console.error(`[Indexer] ${this.workers.length} workers ready`);
    if (this.config.verbose) {
      console.error(`[Indexer] Each worker loaded model: ${this.config.embeddingModel}`);
    }
  } catch (err) {
    // One bad worker poisons the pool: tear everything down and fall back.
    console.error(`[Indexer] Worker initialization failed: ${err.message}, falling back to single-threaded`);
    this.terminateWorkers();
  }
}
160
+
161
+ /**
162
+ * Terminate all worker threads
163
+ */
164
+ terminateWorkers() {
165
+ for (const worker of this.workers) {
166
+ worker.postMessage({ type: "shutdown" });
167
+ }
168
+ this.workers = [];
169
+ this.workerReady = [];
170
+ }
171
+
172
+ /**
173
+ * Send MCP progress notification to connected clients
174
+ */
175
+ sendProgress(progress, total, message) {
176
+ if (this.server) {
177
+ try {
178
+ this.server.sendNotification("notifications/progress", {
179
+ progressToken: "indexing",
180
+ progress,
181
+ total,
182
+ message
183
+ });
184
+ } catch (err) {
185
+ // Silently ignore if client doesn't support progress notifications
186
+ }
187
+ }
188
+ }
189
+
190
+ /**
191
+ * Process chunks using worker thread pool with timeout and error recovery
192
+ */
193
+ async processChunksWithWorkers(allChunks) {
194
+ if (this.workers.length === 0) {
195
+ // Fallback to single-threaded processing
196
+ return this.processChunksSingleThreaded(allChunks);
197
+ }
198
+
199
+ const results = [];
200
+ const chunkSize = Math.ceil(allChunks.length / this.workers.length);
201
+ const workerPromises = [];
202
+ const WORKER_TIMEOUT = 300000; // 5 minutes per batch
203
+
204
+ if (this.config.verbose) {
205
+ console.error(`[Indexer] Distributing ${allChunks.length} chunks across ${this.workers.length} workers (~${chunkSize} chunks each)`);
206
+ }
207
+
208
+ for (let i = 0; i < this.workers.length; i++) {
209
+ const workerChunks = allChunks.slice(i * chunkSize, (i + 1) * chunkSize);
210
+ if (workerChunks.length === 0) continue;
211
+
212
+ if (this.config.verbose) {
213
+ console.error(`[Indexer] Worker ${i}: processing ${workerChunks.length} chunks`);
214
+ }
215
+
216
+ const promise = new Promise((resolve, reject) => {
217
+ const worker = this.workers[i];
218
+ const batchId = `batch-${i}-${Date.now()}`;
219
+
220
+ // Timeout handler
221
+ const timeout = setTimeout(() => {
222
+ worker.off("message", handler);
223
+ console.error(`[Indexer] Worker ${i} timed out, falling back to single-threaded for this batch`);
224
+ // Return empty and let fallback handle it
225
+ resolve([]);
226
+ }, WORKER_TIMEOUT);
227
+
228
+ const handler = (msg) => {
229
+ if (msg.batchId === batchId) {
230
+ clearTimeout(timeout);
231
+ worker.off("message", handler);
232
+ if (msg.type === "results") {
233
+ resolve(msg.results);
234
+ } else if (msg.type === "error") {
235
+ console.error(`[Indexer] Worker ${i} error: ${msg.error}`);
236
+ resolve([]); // Return empty, don't reject - let fallback handle
237
+ }
238
+ }
239
+ };
240
+
241
+ // Handle worker crash
242
+ const errorHandler = (err) => {
243
+ clearTimeout(timeout);
244
+ worker.off("message", handler);
245
+ console.error(`[Indexer] Worker ${i} crashed: ${err.message}`);
246
+ resolve([]); // Return empty, don't reject
247
+ };
248
+ worker.once("error", errorHandler);
249
+
250
+ worker.on("message", handler);
251
+ worker.postMessage({ type: "process", chunks: workerChunks, batchId });
252
+ });
253
+
254
+ workerPromises.push({ promise, chunks: workerChunks });
255
+ }
256
+
257
+ // Wait for all workers with error recovery
258
+ const workerResults = await Promise.all(workerPromises.map(p => p.promise));
259
+
260
+ // Collect results and identify failed chunks that need retry
261
+ const failedChunks = [];
262
+ for (let i = 0; i < workerResults.length; i++) {
263
+ if (workerResults[i].length > 0) {
264
+ results.push(...workerResults[i]);
265
+ } else if (workerPromises[i].chunks.length > 0) {
266
+ // Worker failed or timed out, need to retry these chunks
267
+ failedChunks.push(...workerPromises[i].chunks);
268
+ }
269
+ }
270
+
271
+ // Retry failed chunks with single-threaded fallback
272
+ if (failedChunks.length > 0) {
273
+ console.error(`[Indexer] Retrying ${failedChunks.length} chunks with single-threaded fallback...`);
274
+ const retryResults = await this.processChunksSingleThreaded(failedChunks);
275
+ results.push(...retryResults);
276
+ }
277
+
278
+ return results;
279
+ }
280
+
281
+ /**
282
+ * Single-threaded chunk processing (fallback)
283
+ */
284
+ async processChunksSingleThreaded(chunks) {
285
+ const results = [];
286
+
287
+ for (const chunk of chunks) {
288
+ try {
289
+ const output = await this.embedder(chunk.text, { pooling: "mean", normalize: true });
290
+ results.push({
291
+ file: chunk.file,
292
+ startLine: chunk.startLine,
293
+ endLine: chunk.endLine,
294
+ content: chunk.text,
295
+ vector: Array.from(output.data),
296
+ success: true
297
+ });
298
+ } catch (error) {
299
+ results.push({
300
+ file: chunk.file,
301
+ startLine: chunk.startLine,
302
+ endLine: chunk.endLine,
303
+ error: error.message,
304
+ success: false
305
+ });
306
+ }
307
+ }
308
+
309
+ return results;
310
+ }
311
+
312
/**
 * Index a single file end-to-end on the main thread.
 *
 * Skips work when the file is a directory, exceeds maxFileSize, or is
 * unchanged (fast mtime check first, then content-hash check). Otherwise
 * removes the file's old chunks, re-chunks, embeds each chunk, and stores
 * the vectors. Does NOT persist the cache; callers are expected to save().
 *
 * @param {string} file - Absolute path of the file to index.
 * @returns {Promise<number>} number of chunks added (0 when skipped or on error)
 */
async indexFile(file) {
  const fileName = path.basename(file);
  if (this.config.verbose) {
    console.error(`[Indexer] Processing: ${fileName}...`);
  }

  try {
    // Check file size first
    const stats = await fs.stat(file);

    // Skip directories
    if (stats.isDirectory()) {
      return 0;
    }

    if (stats.size > this.config.maxFileSize) {
      if (this.config.verbose) {
        console.error(`[Indexer] Skipped ${fileName} (too large: ${(stats.size / 1024 / 1024).toFixed(2)}MB)`);
      }
      return 0;
    }

    // OPTIMIZATION: Check mtime first (fast) before reading file content
    const currentMtime = stats.mtimeMs;
    const cachedMtime = this.cache.getFileMtime(file);

    // If mtime unchanged, file definitely unchanged - skip without reading
    if (cachedMtime && currentMtime === cachedMtime) {
      if (this.config.verbose) {
        console.error(`[Indexer] Skipped ${fileName} (unchanged - mtime)`);
      }
      return 0;
    }

    const content = await fs.readFile(file, "utf-8");
    const hash = hashContent(content);

    // Skip if file hasn't changed (content check after mtime indicated change)
    if (this.cache.getFileHash(file) === hash) {
      // Content same but mtime different - update cached mtime
      this.cache.setFileHash(file, hash, currentMtime);
      if (this.config.verbose) {
        console.error(`[Indexer] Skipped ${fileName} (unchanged - hash)`);
      }
      return 0;
    }

    if (this.config.verbose) {
      console.error(`[Indexer] Indexing ${fileName}...`);
    }

    // Remove old chunks for this file
    this.cache.removeFileFromStore(file);

    const chunks = smartChunk(content, file, this.config);
    let addedChunks = 0;

    // Embed and store each chunk; a single failed chunk is logged and
    // skipped rather than failing the whole file.
    for (const chunk of chunks) {
      try {
        const output = await this.embedder(chunk.text, { pooling: "mean", normalize: true });

        this.cache.addToStore({
          file,
          startLine: chunk.startLine,
          endLine: chunk.endLine,
          content: chunk.text,
          vector: Array.from(output.data)
        });
        addedChunks++;
      } catch (embeddingError) {
        console.error(`[Indexer] Failed to embed chunk in ${fileName}:`, embeddingError.message);
      }
    }

    // Record the new hash/mtime only after chunks are stored.
    this.cache.setFileHash(file, hash, currentMtime);
    if (this.config.verbose) {
      console.error(`[Indexer] Completed ${fileName} (${addedChunks} chunks)`);
    }
    return addedChunks;
  } catch (error) {
    console.error(`[Indexer] Error indexing ${fileName}:`, error.message);
    return 0;
  }
}
396
+
397
+ /**
398
+ * Discover files using fdir (3-5x faster than glob)
399
+ * Uses config.excludePatterns which includes smart patterns from ignore-patterns.js
400
+ */
401
+ async discoverFiles() {
402
+ const startTime = Date.now();
403
+
404
+ // Build extension filter from config
405
+ const extensions = new Set(this.config.fileExtensions.map(ext => `.${ext}`));
406
+
407
+ // Extract directory names from glob patterns in config.excludePatterns
408
+ // Patterns like "**/node_modules/**" -> "node_modules"
409
+ const excludeDirs = new Set();
410
+ const excludeFilePatterns = [];
411
+ for (const pattern of this.config.excludePatterns) {
412
+ // Extract directory names from glob patterns
413
+ const match = pattern.match(/\*\*\/([^/*]+)\/?\*?\*?$/);
414
+ if (match) {
415
+ excludeDirs.add(match[1]);
416
+ }
417
+ // Also handle patterns like "**/dirname/**"
418
+ const match2 = pattern.match(/\*\*\/([^/*]+)\/\*\*$/);
419
+ if (match2) {
420
+ excludeDirs.add(match2[1]);
421
+ }
422
+ // Extract file-level glob patterns like **/*.test.js, **/test_*.py
423
+ const fileMatch = pattern.match(/\*\*\/(\*[^/]+|[^/*]+\*[^/]*)$/);
424
+ if (fileMatch) {
425
+ const glob = fileMatch[1];
426
+ // Convert glob to regex: *.test.js -> /\.test\.js$/, test_*.py -> /^test_.*\.py$/
427
+ const escaped = glob
428
+ .replace(/[.+^${}()|[\]\\]/g, '\\$&')
429
+ .replace(/\*/g, '.*');
430
+ try {
431
+ excludeFilePatterns.push(new RegExp(`^${escaped}$`));
432
+ } catch {
433
+ // skip invalid patterns
434
+ }
435
+ }
436
+ }
437
+
438
+ // Always exclude cache directory
439
+ excludeDirs.add(".smart-coding-cache");
440
+
441
+ const isExcludedDirectory = (dirName) => {
442
+ if (!dirName) {
443
+ return false;
444
+ }
445
+
446
+ const normalized = dirName.replace(/[\\/]+$/g, "");
447
+ if (excludeDirs.has(normalized)) {
448
+ return true;
449
+ }
450
+
451
+ const normalizedSegments = normalized.split(/[\\/]+/);
452
+ if (normalizedSegments.some((segment) => excludeDirs.has(segment))) {
453
+ return true;
454
+ }
455
+
456
+ const basename = path.basename(normalized);
457
+ if (excludeDirs.has(basename)) {
458
+ return true;
459
+ }
460
+
461
+ return false;
462
+ };
463
+
464
+ if (this.config.verbose) {
465
+ console.error(`[Indexer] Using ${excludeDirs.size} exclude directories, ${excludeFilePatterns.length} file patterns from config`);
466
+ }
467
+ // Debug: always log for diagnosing test exclusion
468
+ console.error(`[Indexer] excludeDirs: ${[...excludeDirs].join(', ')}`);
469
+ console.error(`[Indexer] excludeFilePatterns: ${excludeFilePatterns.map(r => r.source).join(', ')}`);
470
+
471
+ const api = new fdir()
472
+ .withFullPaths()
473
+ .exclude(isExcludedDirectory)
474
+ .filter((filePath) => {
475
+ if (!extensions.has(path.extname(filePath))) return false;
476
+ // Apply file-level exclusion patterns
477
+ if (excludeFilePatterns.length > 0) {
478
+ const basename = path.basename(filePath);
479
+ for (const re of excludeFilePatterns) {
480
+ if (re.test(basename)) return false;
481
+ }
482
+ }
483
+ return true;
484
+ })
485
+ .crawl(this.config.searchDirectory);
486
+
487
+ const files = await api.withPromise();
488
+
489
+ console.error(`[Indexer] File discovery: ${files.length} files in ${Date.now() - startTime}ms`);
490
+ return files;
491
+ }
492
+
493
+ /**
494
+ * Sort files by priority for progressive indexing
495
+ * Priority: recently modified files first (users likely searching for recent work)
496
+ */
497
+ async sortFilesByPriority(files) {
498
+ const startTime = Date.now();
499
+
500
+ // Get mtime for all files in parallel
501
+ const filesWithMtime = await Promise.all(
502
+ files.map(async (file) => {
503
+ try {
504
+ const stats = await fs.stat(file);
505
+ return { file, mtime: stats.mtimeMs };
506
+ } catch {
507
+ return { file, mtime: 0 };
508
+ }
509
+ })
510
+ );
511
+
512
+ // Sort by mtime descending (most recently modified first)
513
+ filesWithMtime.sort((a, b) => b.mtime - a.mtime);
514
+
515
+ if (this.config.verbose) {
516
+ console.error(`[Indexer] Priority sort: ${files.length} files in ${Date.now() - startTime}ms`);
517
+ }
518
+
519
+ return filesWithMtime.map(f => f.file);
520
+ }
521
+
522
+ /**
523
+ * Start background indexing (non-blocking)
524
+ * Allows search to work immediately with partial results
525
+ */
526
+ startBackgroundIndexing(force = false) {
527
+ if (this.isIndexing) {
528
+ console.error("[Indexer] Background indexing already in progress");
529
+ return;
530
+ }
531
+
532
+ console.error("[Indexer] Starting background indexing...");
533
+
534
+ // Run indexAll in background (don't await)
535
+ this.indexAll(force).then(result => {
536
+ console.error(`[Indexer] Background indexing complete: ${result.message || 'done'}`);
537
+ }).catch(err => {
538
+ console.error(`[Indexer] Background indexing error: ${err.message}`);
539
+ });
540
+ }
541
+
542
+ /**
543
+ * Get current indexing status for progressive search
544
+ */
545
+ getIndexingStatus() {
546
+ return {
547
+ ...this.indexingStatus,
548
+ isReady: !this.indexingStatus.inProgress || this.indexingStatus.processedFiles > 0
549
+ };
550
+ }
551
+
552
/**
 * Pre-filter files by hash (skip unchanged files before processing).
 *
 * Stats and reads files in parallel batches of 500, dropping directories,
 * oversized files, unchanged files (content hash matches cache), and
 * unreadable files. Returns { file, content, hash } for each changed file.
 *
 * NOTE(review): indexAll() performs this filtering inline (with the faster
 * mtime pre-check) rather than calling this helper - confirm remaining
 * callers before relying on or removing this method.
 */
async preFilterFiles(files) {
  const startTime = Date.now();
  const filesToProcess = [];
  const skippedCount = { unchanged: 0, tooLarge: 0, error: 0 };

  // Process in parallel batches for speed
  const BATCH_SIZE = 500;

  for (let i = 0; i < files.length; i += BATCH_SIZE) {
    const batch = files.slice(i, i + BATCH_SIZE);

    const results = await Promise.all(
      batch.map(async (file) => {
        try {
          const stats = await fs.stat(file);

          if (stats.isDirectory()) {
            return null;
          }

          if (stats.size > this.config.maxFileSize) {
            skippedCount.tooLarge++;
            return null;
          }

          const content = await fs.readFile(file, "utf-8");
          const hash = hashContent(content);

          if (this.cache.getFileHash(file) === hash) {
            skippedCount.unchanged++;
            return null;
          }

          return { file, content, hash };
        } catch (error) {
          skippedCount.error++;
          return null;
        }
      })
    );

    for (const result of results) {
      if (result) filesToProcess.push(result);
    }
  }

  console.error(`[Indexer] Pre-filter: ${filesToProcess.length} changed, ${skippedCount.unchanged} unchanged, ${skippedCount.tooLarge} too large, ${skippedCount.error} errors (${Date.now() - startTime}ms)`);
  return filesToProcess;
}
604
+
605
+ async indexAll(force = false) {
606
+ if (this.isIndexing) {
607
+ console.error("[Indexer] Indexing already in progress, skipping concurrent request");
608
+ return { skipped: true, reason: "Indexing already in progress" };
609
+ }
610
+
611
+ this.isIndexing = true;
612
+
613
+ // Initialize indexing status for progressive search
614
+ this.indexingStatus = {
615
+ inProgress: true,
616
+ totalFiles: 0,
617
+ processedFiles: 0,
618
+ percentage: 0
619
+ };
620
+
621
+ // Declare counters outside try block so they're accessible in finally
622
+ let processedFiles = 0;
623
+ let skippedFiles = 0;
624
+
625
+ try {
626
+ if (force) {
627
+ console.error("[Indexer] Force reindex requested: clearing cache");
628
+ if (typeof this.cache.resetForFullReindex === "function") {
629
+ await this.cache.resetForFullReindex();
630
+ } else {
631
+ this.cache.setVectorStore([]);
632
+ this.cache.clearAllFileHashes();
633
+ }
634
+ }
635
+
636
+ const totalStartTime = Date.now();
637
+ console.error(`[Indexer] Starting optimized indexing in ${this.config.searchDirectory}...`);
638
+
639
+ // Step 1: Fast file discovery with fdir
640
+ let files = await this.discoverFiles();
641
+
642
+ if (files.length === 0) {
643
+ console.error("[Indexer] No files found to index");
644
+ this.sendProgress(100, 100, "No files found to index");
645
+ return { skipped: false, filesProcessed: 0, chunksCreated: 0, message: "No files found to index" };
646
+ }
647
+
648
+ // Step 1.1: Sort files by priority (recently modified first) for progressive indexing
649
+ // This ensures search results are useful even while indexing is in progress
650
+ files = await this.sortFilesByPriority(files);
651
+ console.error(`[Indexer] Progressive mode: recently modified files will be indexed first`);
652
+
653
+ // Send progress: discovery complete
654
+ this.sendProgress(5, 100, `Discovered ${files.length} files (sorted by priority)`);
655
+
656
+ // Step 1.5: Prune deleted or excluded files from cache
657
+ if (!force) {
658
+ const currentFilesSet = new Set(files);
659
+ const cachedFiles = Array.from(this.cache.getAllFileHashes().keys());
660
+ let prunedCount = 0;
661
+
662
+ for (const cachedFile of cachedFiles) {
663
+ if (!currentFilesSet.has(cachedFile)) {
664
+ this.cache.removeFileFromStore(cachedFile);
665
+ this.cache.deleteFileHash(cachedFile);
666
+ prunedCount++;
667
+ }
668
+ }
669
+
670
+ if (prunedCount > 0) {
671
+ if (this.config.verbose) {
672
+ console.error(`[Indexer] Pruned ${prunedCount} deleted/excluded files from index`);
673
+ }
674
+ // If we pruned files, we should save these changes even if no other files changed
675
+ }
676
+ }
677
+
678
+ // Step 2: Process files with progressive indexing
679
+ // Use batch size of 1 for immediate search availability (progressive indexing)
680
+ // Each file is processed, embedded, and saved immediately so search can find it
681
+ const adaptiveBatchSize = this.config.progressiveIndexing !== false ? 1 :
682
+ files.length > 10000 ? 500 :
683
+ files.length > 1000 ? 200 :
684
+ this.config.batchSize || 100;
685
+
686
+ console.error(`[Indexer] Processing ${files.length} files (progressive mode: batch size ${adaptiveBatchSize})`);
687
+
688
+ // Step 3: Initialize worker threads (always use when multi-core available)
689
+ const useWorkers = os.cpus().length > 1;
690
+
691
+ if (useWorkers) {
692
+ await this.initializeWorkers();
693
+ console.error(`[Indexer] Multi-threaded mode: ${this.workers.length} workers active`);
694
+ } else {
695
+ console.error(`[Indexer] Single-threaded mode (single-core system)`);
696
+ }
697
+
698
+ let totalChunks = 0;
699
+ let batchCounter = 0; // Track batches for incremental saves
700
+
701
+ // Update total file count for status tracking (estimated, will adjust as we filter)
702
+ this.indexingStatus.totalFiles = files.length;
703
+
704
+ // Step 4: Process files in adaptive batches with inline lazy filtering
705
+ for (let i = 0; i < files.length; i += adaptiveBatchSize) {
706
+ const batch = files.slice(i, i + adaptiveBatchSize);
707
+
708
+ // Lazy filter and generate chunks for this batch
709
+ const allChunks = [];
710
+ const fileHashes = new Map();
711
+
712
+ for (const file of batch) {
713
+ try {
714
+ const stats = await fs.stat(file);
715
+
716
+ // Skip directories and oversized files
717
+ if (stats.isDirectory()) continue;
718
+ if (stats.size > this.config.maxFileSize) {
719
+ skippedFiles++;
720
+ continue;
721
+ }
722
+
723
+ // OPTIMIZATION: Check mtime first (fast) before reading file content
724
+ const currentMtime = stats.mtimeMs;
725
+ const cachedMtime = this.cache.getFileMtime(file);
726
+
727
+ // If mtime unchanged, file definitely unchanged - skip without reading
728
+ if (cachedMtime && currentMtime === cachedMtime) {
729
+ skippedFiles++;
730
+ continue;
731
+ }
732
+
733
+ // mtime changed (or new file) - read content and verify with hash
734
+ const content = await fs.readFile(file, "utf-8");
735
+ const hash = hashContent(content);
736
+
737
+ // Check if content actually changed (mtime can change without content change)
738
+ if (this.cache.getFileHash(file) === hash) {
739
+ // Content same but mtime different - update cached mtime
740
+ this.cache.setFileHash(file, hash, currentMtime);
741
+ skippedFiles++;
742
+ continue;
743
+ }
744
+
745
+ // File changed - remove old chunks and prepare new ones
746
+ this.cache.removeFileFromStore(file);
747
+ const chunks = smartChunk(content, file, this.config);
748
+
749
+ for (const chunk of chunks) {
750
+ allChunks.push({
751
+ file,
752
+ text: chunk.text,
753
+ startLine: chunk.startLine,
754
+ endLine: chunk.endLine,
755
+ hash,
756
+ mtime: currentMtime
757
+ });
758
+ }
759
+
760
+ fileHashes.set(file, { hash, mtime: currentMtime });
761
+ } catch (error) {
762
+ // Skip files with read errors
763
+ skippedFiles++;
764
+ if (this.config.verbose) {
765
+ console.error(`[Indexer] Error reading ${path.basename(file)}: ${error.message}`);
766
+ }
767
+ }
768
+ }
769
+
770
+ // Skip this batch if no chunks to process
771
+ if (allChunks.length === 0) {
772
+ continue;
773
+ }
774
+
775
+ // Process chunks (with workers if available, otherwise single-threaded)
776
+ let results;
777
+ if (useWorkers && this.workers.length > 0) {
778
+ results = await this.processChunksWithWorkers(allChunks);
779
+ } else {
780
+ results = await this.processChunksSingleThreaded(allChunks);
781
+ }
782
+
783
+ // Collect successful results for batch insert
784
+ const chunksToInsert = [];
785
+ const filesProcessedInBatch = new Set();
786
+
787
+ for (const result of results) {
788
+ if (result.success) {
789
+ chunksToInsert.push({
790
+ file: result.file,
791
+ startLine: result.startLine,
792
+ endLine: result.endLine,
793
+ content: result.content,
794
+ vector: result.vector
795
+ });
796
+ totalChunks++;
797
+ filesProcessedInBatch.add(result.file);
798
+ }
799
+ }
800
+
801
+ // Batch insert to SQLite (much faster than individual inserts)
802
+ if (chunksToInsert.length > 0 && typeof this.cache.addBatchToStore === 'function') {
803
+ this.cache.addBatchToStore(chunksToInsert);
804
+ } else {
805
+ // Fallback for old cache implementation
806
+ for (const chunk of chunksToInsert) {
807
+ this.cache.addToStore(chunk);
808
+ }
809
+ }
810
+
811
+ // Update file hashes with mtime
812
+ for (const [file, { hash, mtime }] of fileHashes) {
813
+ this.cache.setFileHash(file, hash, mtime);
814
+ }
815
+
816
+ processedFiles += filesProcessedInBatch.size;
817
+ batchCounter++;
818
+
819
+ // Update indexing status for progressive search
820
+ const estimatedTotal = files.length - skippedFiles;
821
+ this.indexingStatus.processedFiles = processedFiles;
822
+ this.indexingStatus.totalFiles = Math.max(estimatedTotal, processedFiles);
823
+ this.indexingStatus.percentage = estimatedTotal > 0 ? Math.floor((processedFiles / estimatedTotal) * 100) : 100;
824
+
825
+ // Progressive indexing: save after EVERY batch so search can find new results immediately
826
+ // This is critical for background indexing - users can search while indexing continues
827
+ if (chunksToInsert.length > 0) {
828
+ if (typeof this.cache.saveIncremental === 'function') {
829
+ await this.cache.saveIncremental();
830
+ } else {
831
+ // Fallback: full save (slower but ensures data is persisted)
832
+ await this.cache.save();
833
+ }
834
+ }
835
+
836
+ // Apply CPU throttling (delay between batches)
837
+ await this.throttle.throttledBatch(null);
838
+
839
+ // Progress indicator - show progress after each file in progressive mode
840
+ const progressInterval = adaptiveBatchSize === 1 ? 1 : adaptiveBatchSize * 2;
841
+ if (processedFiles > 0 && ((processedFiles + skippedFiles) % progressInterval === 0 || i + adaptiveBatchSize >= files.length)) {
842
+ const elapsed = ((Date.now() - totalStartTime) / 1000).toFixed(1);
843
+ const totalProcessed = processedFiles + skippedFiles;
844
+ const rate = totalProcessed > 0 ? (totalProcessed / parseFloat(elapsed)).toFixed(1) : '0';
845
+ console.error(`[Indexer] Progress: ${processedFiles} indexed, ${skippedFiles} skipped of ${files.length} (${rate} files/sec)`);
846
+
847
+ // Send MCP progress notification (10-95% range for batch processing)
848
+ const progressPercent = Math.min(95, Math.floor(10 + (totalProcessed / files.length) * 85));
849
+ this.sendProgress(progressPercent, 100, `Indexed ${processedFiles} files, ${skippedFiles} skipped (${rate}/sec)`);
850
+ }
851
+ }
852
+
853
+ // Cleanup workers
854
+ if (useWorkers) {
855
+ this.terminateWorkers();
856
+ }
857
+
858
+ const totalTime = ((Date.now() - totalStartTime) / 1000).toFixed(1);
859
+ const changedFiles = processedFiles;
860
+ console.error(`[Indexer] Complete: ${totalChunks} chunks from ${changedFiles} changed files (${skippedFiles} unchanged) in ${totalTime}s`);
861
+
862
+ // Mark indexing as complete
863
+ this.indexingStatus.inProgress = false;
864
+ this.indexingStatus.percentage = 100;
865
+
866
+ // Send completion progress
867
+ const summaryMsg = changedFiles > 0
868
+ ? `Complete: ${totalChunks} chunks from ${changedFiles} changed files (${skippedFiles} unchanged) in ${totalTime}s`
869
+ : `Complete: No files changed (${skippedFiles} files up to date)`;
870
+ this.sendProgress(100, 100, summaryMsg);
871
+
872
+ await this.cache.save();
873
+
874
+ const stats = await resolveCacheStats(this.cache);
875
+ const resolvedTotalChunks =
876
+ stats.totalChunks === 0 && totalChunks > 0 ? totalChunks : stats.totalChunks;
877
+ const resolvedTotalFiles =
878
+ stats.totalFiles === 0 && changedFiles > 0 ? changedFiles : stats.totalFiles;
879
+ return {
880
+ skipped: false,
881
+ filesProcessed: changedFiles,
882
+ chunksCreated: totalChunks,
883
+ totalFiles: resolvedTotalFiles,
884
+ totalChunks: resolvedTotalChunks,
885
+ duration: totalTime,
886
+ message: changedFiles > 0
887
+ ? `Indexed ${changedFiles} files (${totalChunks} chunks, ${skippedFiles} unchanged) in ${totalTime}s`
888
+ : `All ${skippedFiles} files up to date`
889
+ };
890
+ } finally {
891
+ this.isIndexing = false;
892
+ // Adjust estimated total after completion
893
+ this.indexingStatus.totalFiles = processedFiles + skippedFiles;
894
+ }
895
+ }
896
+
897
+ setupFileWatcher() {
898
+ if (!this.config.watchFiles) return;
899
+
900
+ const pattern = this.config.fileExtensions.map(ext => `**/*.${ext}`);
901
+
902
+ this.watcher = chokidar.watch(pattern, {
903
+ cwd: this.config.searchDirectory,
904
+ ignored: this.config.excludePatterns,
905
+ persistent: true,
906
+ ignoreInitial: true
907
+ });
908
+
909
+ this.watcher
910
+ .on("add", async (filePath) => {
911
+ const fullPath = path.join(this.config.searchDirectory, filePath);
912
+ console.error(`[Indexer] New file detected: ${filePath}`);
913
+ await this.indexFile(fullPath);
914
+ await this.cache.save();
915
+ })
916
+ .on("change", async (filePath) => {
917
+ const fullPath = path.join(this.config.searchDirectory, filePath);
918
+ console.error(`[Indexer] File changed: ${filePath}`);
919
+ await this.indexFile(fullPath);
920
+ await this.cache.save();
921
+ })
922
+ .on("unlink", (filePath) => {
923
+ const fullPath = path.join(this.config.searchDirectory, filePath);
924
+ console.error(`[Indexer] File deleted: ${filePath}`);
925
+ this.cache.removeFileFromStore(fullPath);
926
+ this.cache.deleteFileHash(fullPath);
927
+ this.cache.save();
928
+ });
929
+
930
+ console.error("[Indexer] File watcher enabled for incremental indexing");
931
+ }
932
+ }
933
+
934
// MCP Tool definition for this feature
export function getToolDefinition() {
  const forceParam = {
    type: "boolean",
    description: "Force reindex even if files haven't changed",
    default: false
  };

  return {
    name: "b_index_codebase",
    description: "Manually trigger a full reindex of the codebase. This will scan all files and update the embeddings cache. Useful after large code changes or if the index seems out of date.",
    inputSchema: {
      type: "object",
      properties: {
        force: forceParam
      }
    },
    annotations: {
      title: "Reindex Codebase",
      readOnlyHint: false,
      destructiveHint: false,
      idempotentHint: true,
      openWorldHint: false
    }
  };
}
958
+
959
// Tool handler: runs a reindex and formats the result for the MCP client.
export async function handleToolCall(request, indexer) {
  const force = request.params.arguments?.force || false;
  const result = await indexer.indexAll(force);

  // Indexing was refused because another run is already active.
  if (result?.skipped) {
    const text = `Indexing skipped: ${result.reason}\n\nPlease wait for the current indexing operation to complete before requesting another reindex.`;
    return { content: [{ type: "text", text }] };
  }

  // Prefer stats from the run's own result; fall back to the cache totals.
  const cacheStats = await resolveCacheStats(indexer.cache);
  const totalChunks = result?.totalChunks ?? cacheStats.totalChunks;
  const totalFiles = result?.totalFiles ?? cacheStats.totalFiles;
  const filesProcessed = result?.filesProcessed ?? 0;
  const chunksCreated = result?.chunksCreated ?? 0;

  const parts = [];
  parts.push(result?.message
    ? `Codebase reindexed successfully.\n\n${result.message}`
    : `Codebase reindexed successfully.`);
  parts.push(`\n\nStatistics:\n- Total files in index: ${totalFiles}\n- Total code chunks: ${totalChunks}`);
  if (filesProcessed > 0) {
    parts.push(`\n- Files processed this run: ${filesProcessed}\n- Chunks created this run: ${chunksCreated}`);
  }

  return {
    content: [{
      type: "text",
      text: parts.join("")
    }]
  };
}