@matperez/coderag 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/README.md +154 -0
  2. package/dist/.tsbuildinfo +1 -0
  3. package/dist/ast-chunking.d.ts +40 -0
  4. package/dist/ast-chunking.d.ts.map +1 -0
  5. package/dist/ast-chunking.js +88 -0
  6. package/dist/ast-chunking.js.map +1 -0
  7. package/dist/ast-chunking.test.d.ts +5 -0
  8. package/dist/ast-chunking.test.d.ts.map +1 -0
  9. package/dist/ast-chunking.test.js +173 -0
  10. package/dist/ast-chunking.test.js.map +1 -0
  11. package/dist/code-tokenizer.d.ts +62 -0
  12. package/dist/code-tokenizer.d.ts.map +1 -0
  13. package/dist/code-tokenizer.js +129 -0
  14. package/dist/code-tokenizer.js.map +1 -0
  15. package/dist/code-tokenizer.test.d.ts +5 -0
  16. package/dist/code-tokenizer.test.d.ts.map +1 -0
  17. package/dist/code-tokenizer.test.js +96 -0
  18. package/dist/code-tokenizer.test.js.map +1 -0
  19. package/dist/db/client-pg.d.ts +16 -0
  20. package/dist/db/client-pg.d.ts.map +1 -0
  21. package/dist/db/client-pg.js +38 -0
  22. package/dist/db/client-pg.js.map +1 -0
  23. package/dist/db/client.d.ts +36 -0
  24. package/dist/db/client.d.ts.map +1 -0
  25. package/dist/db/client.js +81 -0
  26. package/dist/db/client.js.map +1 -0
  27. package/dist/db/migrations-pg.d.ts +6 -0
  28. package/dist/db/migrations-pg.d.ts.map +1 -0
  29. package/dist/db/migrations-pg.js +88 -0
  30. package/dist/db/migrations-pg.js.map +1 -0
  31. package/dist/db/migrations.d.ts +9 -0
  32. package/dist/db/migrations.d.ts.map +1 -0
  33. package/dist/db/migrations.js +164 -0
  34. package/dist/db/migrations.js.map +1 -0
  35. package/dist/db/schema-pg.d.ts +611 -0
  36. package/dist/db/schema-pg.d.ts.map +1 -0
  37. package/dist/db/schema-pg.js +66 -0
  38. package/dist/db/schema-pg.js.map +1 -0
  39. package/dist/db/schema.d.ts +630 -0
  40. package/dist/db/schema.d.ts.map +1 -0
  41. package/dist/db/schema.js +85 -0
  42. package/dist/db/schema.js.map +1 -0
  43. package/dist/embeddings.d.ts +92 -0
  44. package/dist/embeddings.d.ts.map +1 -0
  45. package/dist/embeddings.js +275 -0
  46. package/dist/embeddings.js.map +1 -0
  47. package/dist/embeddings.test.d.ts +5 -0
  48. package/dist/embeddings.test.d.ts.map +1 -0
  49. package/dist/embeddings.test.js +255 -0
  50. package/dist/embeddings.test.js.map +1 -0
  51. package/dist/hybrid-search.d.ts +47 -0
  52. package/dist/hybrid-search.d.ts.map +1 -0
  53. package/dist/hybrid-search.js +215 -0
  54. package/dist/hybrid-search.js.map +1 -0
  55. package/dist/hybrid-search.test.d.ts +5 -0
  56. package/dist/hybrid-search.test.d.ts.map +1 -0
  57. package/dist/hybrid-search.test.js +252 -0
  58. package/dist/hybrid-search.test.js.map +1 -0
  59. package/dist/incremental-tfidf.d.ts +77 -0
  60. package/dist/incremental-tfidf.d.ts.map +1 -0
  61. package/dist/incremental-tfidf.js +248 -0
  62. package/dist/incremental-tfidf.js.map +1 -0
  63. package/dist/incremental-tfidf.test.d.ts +5 -0
  64. package/dist/incremental-tfidf.test.d.ts.map +1 -0
  65. package/dist/incremental-tfidf.test.js +276 -0
  66. package/dist/incremental-tfidf.test.js.map +1 -0
  67. package/dist/index.d.ts +18 -0
  68. package/dist/index.d.ts.map +1 -0
  69. package/dist/index.js +19 -0
  70. package/dist/index.js.map +1 -0
  71. package/dist/indexer.d.ts +205 -0
  72. package/dist/indexer.d.ts.map +1 -0
  73. package/dist/indexer.js +1331 -0
  74. package/dist/indexer.js.map +1 -0
  75. package/dist/indexer.test.d.ts +12 -0
  76. package/dist/indexer.test.d.ts.map +1 -0
  77. package/dist/indexer.test.js +471 -0
  78. package/dist/indexer.test.js.map +1 -0
  79. package/dist/language-config.d.ts +54 -0
  80. package/dist/language-config.d.ts.map +1 -0
  81. package/dist/language-config.js +75 -0
  82. package/dist/language-config.js.map +1 -0
  83. package/dist/search-cache.d.ts +63 -0
  84. package/dist/search-cache.d.ts.map +1 -0
  85. package/dist/search-cache.js +118 -0
  86. package/dist/search-cache.js.map +1 -0
  87. package/dist/search-cache.test.d.ts +5 -0
  88. package/dist/search-cache.test.d.ts.map +1 -0
  89. package/dist/search-cache.test.js +194 -0
  90. package/dist/search-cache.test.js.map +1 -0
  91. package/dist/storage-factory.d.ts +11 -0
  92. package/dist/storage-factory.d.ts.map +1 -0
  93. package/dist/storage-factory.js +17 -0
  94. package/dist/storage-factory.js.map +1 -0
  95. package/dist/storage-persistent-pg.d.ts +75 -0
  96. package/dist/storage-persistent-pg.d.ts.map +1 -0
  97. package/dist/storage-persistent-pg.js +579 -0
  98. package/dist/storage-persistent-pg.js.map +1 -0
  99. package/dist/storage-persistent-pg.test.d.ts +7 -0
  100. package/dist/storage-persistent-pg.test.d.ts.map +1 -0
  101. package/dist/storage-persistent-pg.test.js +90 -0
  102. package/dist/storage-persistent-pg.test.js.map +1 -0
  103. package/dist/storage-persistent-types.d.ts +110 -0
  104. package/dist/storage-persistent-types.d.ts.map +1 -0
  105. package/dist/storage-persistent-types.js +5 -0
  106. package/dist/storage-persistent-types.js.map +1 -0
  107. package/dist/storage-persistent.d.ts +231 -0
  108. package/dist/storage-persistent.d.ts.map +1 -0
  109. package/dist/storage-persistent.js +897 -0
  110. package/dist/storage-persistent.js.map +1 -0
  111. package/dist/storage-persistent.test.d.ts +5 -0
  112. package/dist/storage-persistent.test.d.ts.map +1 -0
  113. package/dist/storage-persistent.test.js +325 -0
  114. package/dist/storage-persistent.test.js.map +1 -0
  115. package/dist/storage.d.ts +63 -0
  116. package/dist/storage.d.ts.map +1 -0
  117. package/dist/storage.js +67 -0
  118. package/dist/storage.js.map +1 -0
  119. package/dist/storage.test.d.ts +5 -0
  120. package/dist/storage.test.d.ts.map +1 -0
  121. package/dist/storage.test.js +157 -0
  122. package/dist/storage.test.js.map +1 -0
  123. package/dist/tfidf.d.ts +97 -0
  124. package/dist/tfidf.d.ts.map +1 -0
  125. package/dist/tfidf.js +308 -0
  126. package/dist/tfidf.js.map +1 -0
  127. package/dist/tfidf.test.d.ts +5 -0
  128. package/dist/tfidf.test.d.ts.map +1 -0
  129. package/dist/tfidf.test.js +181 -0
  130. package/dist/tfidf.test.js.map +1 -0
  131. package/dist/utils.d.ts +61 -0
  132. package/dist/utils.d.ts.map +1 -0
  133. package/dist/utils.js +264 -0
  134. package/dist/utils.js.map +1 -0
  135. package/dist/utils.test.d.ts +5 -0
  136. package/dist/utils.test.d.ts.map +1 -0
  137. package/dist/utils.test.js +94 -0
  138. package/dist/utils.test.js.map +1 -0
  139. package/dist/vector-storage.d.ts +120 -0
  140. package/dist/vector-storage.d.ts.map +1 -0
  141. package/dist/vector-storage.js +264 -0
  142. package/dist/vector-storage.js.map +1 -0
  143. package/dist/vector-storage.test.d.ts +5 -0
  144. package/dist/vector-storage.test.d.ts.map +1 -0
  145. package/dist/vector-storage.test.js +345 -0
  146. package/dist/vector-storage.test.js.map +1 -0
  147. package/package.json +85 -0
@@ -0,0 +1,1331 @@
1
+ /**
2
+ * Codebase indexer service
3
+ * Uses chunk-level indexing for better search granularity
4
+ */
5
+ import fs from 'node:fs/promises';
6
+ import path from 'node:path';
7
+ import { chunkCodeByAST } from './ast-chunking.js';
8
+ import { getCoderagDataDir } from './db/client.js';
9
+ import { IncrementalTFIDF } from './incremental-tfidf.js';
10
+ import { createCacheKey, LRUCache } from './search-cache.js';
11
+ import { MemoryStorage } from './storage.js';
12
+ import { PersistentStorage } from './storage-persistent.js';
13
+ import { buildSearchIndex, getQueryTokens, tokenize } from './tfidf.js';
14
+ import { detectLanguage, isTextFile, loadGitignore, readFileContent, scanFileMetadata, simpleHash, } from './utils.js';
15
+ import { VectorStorage } from './vector-storage.js';
16
+ export class CodebaseIndexer {
17
+ codebaseRoot;
18
+ maxFileSize;
19
+ storage;
20
+ searchIndex = null;
21
+ incrementalEngine = null;
22
+ pendingFileChanges = [];
23
+ searchCache;
24
+ watcher = null;
25
+ isWatching = false;
26
+ onFileChangeCallback;
27
+ pendingUpdates = new Map();
28
+ ignoreFilter = null;
29
+ status = {
30
+ isIndexing: false,
31
+ progress: 0,
32
+ totalFiles: 0,
33
+ processedFiles: 0,
34
+ totalChunks: 0,
35
+ indexedChunks: 0,
36
+ };
37
+ vectorStorage;
38
+ embeddingProvider;
39
+ vectorBatchSize;
40
+ indexingBatchSize;
41
+ lowMemoryMode;
42
+ constructor(options = {}) {
43
+ this.codebaseRoot = options.codebaseRoot || process.cwd();
44
+ this.maxFileSize = options.maxFileSize || 1048576; // 1MB
45
+ this.storage = options.storage || new MemoryStorage();
46
+ this.onFileChangeCallback = options.onFileChange;
47
+ this.searchCache = new LRUCache(100, 5); // 100 entries, 5 min TTL
48
+ this.embeddingProvider = options.embeddingProvider;
49
+ this.vectorBatchSize = options.vectorBatchSize || 10;
50
+ this.indexingBatchSize = options.indexingBatchSize || 50; // Memory optimization
51
+ // Default to low memory mode when using persistent storage
52
+ this.lowMemoryMode = options.lowMemoryMode ?? options.storage instanceof PersistentStorage;
53
+ // Initialize vector storage if embedding provider is available
54
+ if (this.embeddingProvider) {
55
+ // Use global ~/.coderag/projects/<hash>/ directory for vector storage
56
+ const dataDir = getCoderagDataDir(this.codebaseRoot);
57
+ const vectorDbPath = path.join(dataDir, 'vectors.lance');
58
+ this.vectorStorage = new VectorStorage({
59
+ dimensions: this.embeddingProvider.dimensions,
60
+ dbPath: vectorDbPath,
61
+ });
62
+ console.error(`[INFO] Vector storage initialized: ${this.embeddingProvider.dimensions} dimensions`);
63
+ }
64
+ }
65
+ /**
66
+ * Get current indexing status
67
+ */
68
+ getStatus() {
69
+ return { ...this.status };
70
+ }
71
+ /**
72
+ * Compare filesystem with database to find changes
73
+ * Used for incremental updates after long periods of inactivity
74
+ */
75
+ async diffFilesystem(dbMetadata) {
76
+ if (!this.ignoreFilter) {
77
+ this.ignoreFilter = loadGitignore(this.codebaseRoot);
78
+ }
79
+ const added = [];
80
+ const changed = [];
81
+ const deleted = [];
82
+ let unchanged = 0;
83
+ // Track which db files we've seen in filesystem
84
+ const seenPaths = new Set();
85
+ // Scan filesystem
86
+ for (const metadata of scanFileMetadata(this.codebaseRoot, {
87
+ ignoreFilter: this.ignoreFilter,
88
+ codebaseRoot: this.codebaseRoot,
89
+ maxFileSize: this.maxFileSize,
90
+ })) {
91
+ seenPaths.add(metadata.path);
92
+ const dbEntry = dbMetadata.get(metadata.path);
93
+ if (!dbEntry) {
94
+ // New file
95
+ added.push(metadata);
96
+ }
97
+ else if (Math.abs(metadata.mtime - dbEntry.mtime) > 1000) {
98
+ // mtime changed (1 second tolerance for filesystem precision)
99
+ // File might have changed, need to verify with hash
100
+ changed.push(metadata);
101
+ }
102
+ else {
103
+ unchanged++;
104
+ }
105
+ }
106
+ // Find deleted files (in db but not in filesystem)
107
+ for (const dbPath of dbMetadata.keys()) {
108
+ if (!seenPaths.has(dbPath)) {
109
+ deleted.push(dbPath);
110
+ }
111
+ }
112
+ return { added, changed, deleted, unchanged };
113
+ }
114
+ /**
115
+ * Process incremental changes (add, update, delete files)
116
+ * Uses chunk-level indexing with SQL-based updates
117
+ */
118
+ async processIncrementalChanges(diff, dbMetadata, options) {
119
+ const persistentStorage = this.storage;
120
+ // Step 1: Get terms for deleted files (before deleting, for IDF recalculation)
121
+ let _deletedTerms = new Set();
122
+ if (diff.deleted.length > 0) {
123
+ console.error(`[INFO] Getting terms for ${diff.deleted.length} deleted files...`);
124
+ _deletedTerms = await persistentStorage.getTermsForFiles(diff.deleted);
125
+ console.error(`[INFO] Deleting ${diff.deleted.length} removed files...`);
126
+ await persistentStorage.deleteFiles(diff.deleted);
127
+ }
128
+ // Step 2: Process added and changed files - chunk and index
129
+ const filesToProcess = [...diff.added, ...diff.changed];
130
+ let totalChunks = 0;
131
+ if (filesToProcess.length > 0) {
132
+ console.error(`[INFO] Processing ${filesToProcess.length} files...`);
133
+ const batchSize = this.indexingBatchSize;
134
+ let processedCount = 0;
135
+ for (let i = 0; i < filesToProcess.length; i += batchSize) {
136
+ const batchMetadata = filesToProcess.slice(i, i + batchSize);
137
+ const batchFiles = [];
138
+ const fileChunks = [];
139
+ for (const metadata of batchMetadata) {
140
+ const content = readFileContent(metadata.absolutePath);
141
+ if (content === null)
142
+ continue;
143
+ const newHash = simpleHash(content);
144
+ // For changed files, verify content actually changed using hash
145
+ const dbEntry = dbMetadata.get(metadata.path);
146
+ if (dbEntry && dbEntry.hash === newHash) {
147
+ // File content unchanged, just mtime difference - skip
148
+ processedCount++;
149
+ continue;
150
+ }
151
+ const codebaseFile = {
152
+ path: metadata.path,
153
+ content,
154
+ size: metadata.size,
155
+ mtime: new Date(metadata.mtime),
156
+ language: metadata.language,
157
+ hash: newHash,
158
+ };
159
+ batchFiles.push(codebaseFile);
160
+ // Chunk the file using AST
161
+ const chunks = await chunkCodeByAST(content, metadata.path);
162
+ const chunkData = chunks.map((chunk) => ({
163
+ content: chunk.content,
164
+ type: chunk.type,
165
+ startLine: chunk.startLine,
166
+ endLine: chunk.endLine,
167
+ metadata: chunk.metadata,
168
+ }));
169
+ fileChunks.push({ filePath: metadata.path, chunks: chunkData });
170
+ totalChunks += chunkData.length;
171
+ processedCount++;
172
+ this.status.currentFile = metadata.path;
173
+ this.status.progress = Math.round((processedCount / filesToProcess.length) * 30);
174
+ options.onProgress?.(processedCount, filesToProcess.length, metadata.path);
175
+ }
176
+ // Store batch to database (file content)
177
+ if (batchFiles.length > 0) {
178
+ await persistentStorage.storeFiles(batchFiles);
179
+ }
180
+ // Store chunks for this batch
181
+ if (fileChunks.length > 0) {
182
+ const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks);
183
+ // Build TF-IDF vectors for chunks
184
+ const chunkVectors = [];
185
+ for (const fc of fileChunks) {
186
+ const chunkIds = chunkIdMap.get(fc.filePath);
187
+ if (!chunkIds)
188
+ continue;
189
+ for (let j = 0; j < fc.chunks.length; j++) {
190
+ const chunk = fc.chunks[j];
191
+ const chunkId = chunkIds[j];
192
+ if (!chunkId)
193
+ continue;
194
+ // Tokenize chunk content
195
+ const tokens = await tokenize(chunk.content);
196
+ const termFreq = new Map();
197
+ for (const token of tokens) {
198
+ termFreq.set(token, (termFreq.get(token) || 0) + 1);
199
+ }
200
+ // Calculate TF
201
+ const totalTerms = tokens.length;
202
+ if (totalTerms === 0)
203
+ continue;
204
+ const terms = new Map();
205
+ for (const [term, freq] of termFreq) {
206
+ terms.set(term, {
207
+ tf: freq / totalTerms,
208
+ tfidf: 0, // Will be calculated after IDF rebuild
209
+ rawFreq: freq,
210
+ });
211
+ }
212
+ chunkVectors.push({ chunkId, terms, tokenCount: totalTerms });
213
+ }
214
+ }
215
+ // Store chunk vectors
216
+ if (chunkVectors.length > 0) {
217
+ await persistentStorage.storeManyChunkVectors(chunkVectors);
218
+ }
219
+ }
220
+ }
221
+ }
222
+ this.status.progress = 50;
223
+ // Step 3: Rebuild IDF scores from vectors (SQL-based)
224
+ console.error('[INFO] Recalculating IDF scores...');
225
+ await persistentStorage.rebuildIdfScoresFromVectors();
226
+ this.status.progress = 70;
227
+ // Step 4: Recalculate TF-IDF scores (SQL-based batch update)
228
+ console.error('[INFO] Updating TF-IDF scores...');
229
+ await persistentStorage.recalculateTfidfScores();
230
+ this.status.progress = 80;
231
+ // Step 5: Update pre-computed magnitudes (for cosine similarity search)
232
+ console.error('[INFO] Updating chunk magnitudes...');
233
+ await persistentStorage.updateChunkMagnitudes();
234
+ this.status.progress = 90;
235
+ // Step 6: Update average document length (for BM25)
236
+ console.error('[INFO] Updating average document length...');
237
+ await persistentStorage.updateAverageDocLength();
238
+ this.status.progress = 95;
239
+ // Step 7: Invalidate search cache
240
+ this.searchCache.invalidate();
241
+ console.error('[INFO] Search cache invalidated');
242
+ // Log summary
243
+ console.error(`[SUCCESS] Incremental update complete: ${filesToProcess.length - diff.changed.length} files added, ${diff.changed.length} changed, ${diff.deleted.length} deleted, ${totalChunks} chunks indexed`);
244
+ }
245
+ /**
246
+ * Get search index
247
+ */
248
+ getSearchIndex() {
249
+ return this.searchIndex;
250
+ }
251
+ /**
252
+ * Index the codebase
253
+ */
254
+ async index(options = {}) {
255
+ this.status.isIndexing = true;
256
+ this.status.progress = 0;
257
+ this.status.processedFiles = 0;
258
+ this.status.indexedChunks = 0;
259
+ try {
260
+ // Try to load existing index from persistent storage
261
+ if (this.storage instanceof PersistentStorage) {
262
+ const existingFileCount = await this.storage.count();
263
+ const existingChunkCount = (await this.storage.getChunkCount?.()) ?? 0;
264
+ if (existingFileCount > 0) {
265
+ console.error(`[INFO] Found existing index: ${existingFileCount} files, ${existingChunkCount} chunks`);
266
+ // Verify IDF scores exist (index is valid)
267
+ const idf = await this.storage.getIdfScores();
268
+ if (idf.size > 0) {
269
+ // Incremental diff: compare filesystem vs database
270
+ console.error('[INFO] Checking for file changes since last index...');
271
+ const dbMetadata = await this.storage.getAllFileMetadata();
272
+ const diff = await this.diffFilesystem(dbMetadata);
273
+ const totalChanges = diff.added.length + diff.changed.length + diff.deleted.length;
274
+ if (totalChanges === 0) {
275
+ // No changes - use existing index
276
+ console.error(`[SUCCESS] No changes detected (${diff.unchanged} files, ${existingChunkCount} chunks)`);
277
+ this.status.progress = 100;
278
+ this.status.totalFiles = existingFileCount;
279
+ this.status.processedFiles = existingFileCount;
280
+ this.status.totalChunks = existingChunkCount;
281
+ this.status.indexedChunks = existingChunkCount;
282
+ // Start watching if requested
283
+ if (options.watch) {
284
+ await this.startWatch();
285
+ }
286
+ this.status.isIndexing = false;
287
+ return;
288
+ }
289
+ // Process incremental changes
290
+ console.error(`[INFO] Incremental update: ${diff.added.length} added, ${diff.changed.length} changed, ${diff.deleted.length} deleted`);
291
+ await this.processIncrementalChanges(diff, dbMetadata, options);
292
+ // Get updated chunk count after incremental changes
293
+ const updatedChunkCount = (await this.storage.getChunkCount?.()) ?? 0;
294
+ const updatedFileCount = existingFileCount + diff.added.length - diff.deleted.length;
295
+ this.status.progress = 100;
296
+ this.status.totalFiles = updatedFileCount;
297
+ this.status.processedFiles = updatedFileCount;
298
+ this.status.totalChunks = updatedChunkCount;
299
+ this.status.indexedChunks = updatedChunkCount;
300
+ // Start watching if requested
301
+ if (options.watch) {
302
+ await this.startWatch();
303
+ }
304
+ this.status.isIndexing = false;
305
+ return;
306
+ }
307
+ console.error('[WARN] Index verification failed, rebuilding...');
308
+ }
309
+ }
310
+ // Load .gitignore
311
+ this.ignoreFilter = loadGitignore(this.codebaseRoot);
312
+ const ignoreFilter = this.ignoreFilter;
313
+ // Phase 1: Scan file metadata only (no content loaded - Memory optimization)
314
+ console.error('[INFO] Scanning codebase (metadata only)...');
315
+ const fileMetadataList = [];
316
+ for (const metadata of scanFileMetadata(this.codebaseRoot, {
317
+ ignoreFilter,
318
+ codebaseRoot: this.codebaseRoot,
319
+ maxFileSize: this.maxFileSize,
320
+ })) {
321
+ fileMetadataList.push(metadata);
322
+ }
323
+ this.status.totalFiles = fileMetadataList.length;
324
+ console.error(`[INFO] Found ${fileMetadataList.length} files`);
325
+ // Sync deleted files when doing full rebuild (e.g. after interrupted first index)
326
+ if (this.storage instanceof PersistentStorage && (await this.storage.count()) > 0) {
327
+ const dbMetadata = await this.storage.getAllFileMetadata();
328
+ const dbPaths = Array.from(dbMetadata.keys());
329
+ const currentPaths = new Set(fileMetadataList.map((m) => m.path));
330
+ const deleted = dbPaths.filter((p) => !currentPaths.has(p));
331
+ if (deleted.length > 0) {
332
+ await this.storage.deleteFiles(deleted);
333
+ console.error(`[INFO] Removed ${deleted.length} deleted files from index (full rebuild).`);
334
+ }
335
+ }
336
+ // Phase 2: Process files in batches with chunk-level indexing
337
+ // Only batch content is in memory at any time
338
+ console.error(`[INFO] Processing files in batches of ${this.indexingBatchSize}...`);
339
+ const batchSize = this.indexingBatchSize;
340
+ let processedCount = 0;
341
+ let totalChunks = 0;
342
+ // Check if we're using persistent storage for chunk-based indexing
343
+ const isPersistent = this.storage instanceof PersistentStorage;
344
+ const persistentStorage = isPersistent ? this.storage : null;
345
+ // For non-persistent storage, still use incremental engine
346
+ if (!isPersistent) {
347
+ this.incrementalEngine = new IncrementalTFIDF();
348
+ }
349
+ const existingHashes = options.skipUnchanged !== false && this.storage.getFileHashes
350
+ ? await this.storage.getFileHashes()
351
+ : new Map();
352
+ for (let i = 0; i < fileMetadataList.length; i += batchSize) {
353
+ const batchMetadata = fileMetadataList.slice(i, i + batchSize);
354
+ const batchFiles = [];
355
+ const fileChunks = [];
356
+ const batchUpdates = [];
357
+ // Read content for this batch only
358
+ for (const metadata of batchMetadata) {
359
+ const content = readFileContent(metadata.absolutePath);
360
+ if (content === null)
361
+ continue;
362
+ const hash = simpleHash(content);
363
+ if (existingHashes.get(metadata.path) === hash) {
364
+ processedCount++;
365
+ this.status.currentFile = metadata.path;
366
+ this.status.processedFiles = processedCount;
367
+ this.status.progress = Math.round((processedCount / fileMetadataList.length) * 40);
368
+ options.onProgress?.(processedCount, fileMetadataList.length, metadata.path);
369
+ continue;
370
+ }
371
+ const codebaseFile = {
372
+ path: metadata.path,
373
+ content,
374
+ size: metadata.size,
375
+ mtime: new Date(metadata.mtime),
376
+ language: metadata.language,
377
+ hash,
378
+ };
379
+ batchFiles.push(codebaseFile);
380
+ // Chunk the file using AST
381
+ const chunks = await chunkCodeByAST(content, metadata.path);
382
+ const chunkData = chunks.map((chunk) => ({
383
+ content: chunk.content,
384
+ type: chunk.type,
385
+ startLine: chunk.startLine,
386
+ endLine: chunk.endLine,
387
+ metadata: chunk.metadata,
388
+ }));
389
+ fileChunks.push({ filePath: metadata.path, chunks: chunkData, content });
390
+ totalChunks += chunkData.length;
391
+ // For non-persistent storage, use incremental engine
392
+ if (!isPersistent) {
393
+ batchUpdates.push({
394
+ type: 'add',
395
+ uri: `file://${metadata.path}`,
396
+ newContent: content,
397
+ });
398
+ }
399
+ processedCount++;
400
+ this.status.currentFile = metadata.path;
401
+ this.status.processedFiles = processedCount;
402
+ this.status.indexedChunks = totalChunks;
403
+ this.status.progress = Math.round((processedCount / fileMetadataList.length) * 40);
404
+ options.onProgress?.(processedCount, fileMetadataList.length, metadata.path);
405
+ }
406
+ // Store batch to database
407
+ if (batchFiles.length > 0) {
408
+ if (this.storage.storeFiles) {
409
+ await this.storage.storeFiles(batchFiles);
410
+ }
411
+ else {
412
+ for (const file of batchFiles) {
413
+ await this.storage.storeFile(file);
414
+ }
415
+ }
416
+ // For persistent storage: store chunks and build TF-IDF vectors per chunk
417
+ if (persistentStorage && fileChunks.length > 0) {
418
+ const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks.map((fc) => ({ filePath: fc.filePath, chunks: fc.chunks })));
419
+ // Build flat list of tokenize tasks (same order as chunks)
420
+ const tokenTasks = [];
421
+ for (const fc of fileChunks) {
422
+ const chunkIds = chunkIdMap.get(fc.filePath);
423
+ if (!chunkIds)
424
+ continue;
425
+ for (let j = 0; j < fc.chunks.length; j++) {
426
+ const chunkId = chunkIds[j];
427
+ if (chunkId)
428
+ tokenTasks.push({ chunkId, content: fc.chunks[j].content });
429
+ }
430
+ }
431
+ const allTokenResults = [];
432
+ if (options.useParallelTokenize === true) {
433
+ // Tokenize in parallel (sub-batches to avoid too many concurrent)
434
+ const TOKENIZE_CONCURRENCY = 25;
435
+ for (let i = 0; i < tokenTasks.length; i += TOKENIZE_CONCURRENCY) {
436
+ const batch = tokenTasks.slice(i, i + TOKENIZE_CONCURRENCY);
437
+ const batchResults = await Promise.all(batch.map((t) => tokenize(t.content)));
438
+ allTokenResults.push(...batchResults);
439
+ }
440
+ }
441
+ else {
442
+ // Sequential tokenize (default)
443
+ for (const t of tokenTasks) {
444
+ allTokenResults.push(await tokenize(t.content));
445
+ }
446
+ }
447
+ // Build chunkVectors from results (same order)
448
+ const chunkVectors = [];
449
+ for (let k = 0; k < tokenTasks.length; k++) {
450
+ const tokens = allTokenResults[k];
451
+ const chunkId = tokenTasks[k].chunkId;
452
+ const termFreq = new Map();
453
+ for (const token of tokens) {
454
+ termFreq.set(token, (termFreq.get(token) || 0) + 1);
455
+ }
456
+ const totalTerms = tokens.length;
457
+ if (totalTerms === 0)
458
+ continue;
459
+ const terms = new Map();
460
+ for (const [term, freq] of termFreq) {
461
+ terms.set(term, {
462
+ tf: freq / totalTerms,
463
+ tfidf: 0,
464
+ rawFreq: freq,
465
+ });
466
+ }
467
+ chunkVectors.push({ chunkId, terms, tokenCount: totalTerms });
468
+ }
469
+ if (chunkVectors.length > 0) {
470
+ await persistentStorage.storeManyChunkVectors(chunkVectors);
471
+ }
472
+ }
473
+ // For non-persistent storage: use incremental engine
474
+ if (this.incrementalEngine && batchUpdates.length > 0) {
475
+ await this.incrementalEngine.applyUpdates(batchUpdates);
476
+ }
477
+ }
478
+ // Clear batch references for GC
479
+ batchFiles.length = 0;
480
+ fileChunks.length = 0;
481
+ batchUpdates.length = 0;
482
+ }
483
+ console.error(`[INFO] Total chunks created: ${totalChunks}`);
484
+ this.status.totalChunks = totalChunks;
485
+ this.status.progress = 50;
486
+ // Finalize index based on storage type
487
+ if (persistentStorage) {
488
+ // Persistent storage: rebuild IDF and TF-IDF scores
489
+ console.error('[INFO] Rebuilding IDF scores...');
490
+ await persistentStorage.rebuildIdfScoresFromVectors();
491
+ this.status.progress = 60;
492
+ console.error('[INFO] Recalculating TF-IDF scores...');
493
+ await persistentStorage.recalculateTfidfScores();
494
+ this.status.progress = 70;
495
+ console.error('[INFO] Computing chunk magnitudes...');
496
+ await persistentStorage.updateChunkMagnitudes();
497
+ this.status.progress = 80;
498
+ console.error('[INFO] Computing average document length...');
499
+ await persistentStorage.updateAverageDocLength();
500
+ this.status.progress = 85;
501
+ // Release in-memory structures in low memory mode
502
+ if (this.lowMemoryMode) {
503
+ this.searchIndex = null;
504
+ this.incrementalEngine = null;
505
+ console.error('[INFO] Low memory mode: released in-memory index');
506
+ }
507
+ console.error('[SUCCESS] Chunk-level TF-IDF index persisted');
508
+ }
509
+ else if (this.incrementalEngine) {
510
+ // Non-persistent storage: build in-memory index
511
+ console.error('[INFO] Finalizing in-memory search index...');
512
+ const indexData = this.incrementalEngine.getIndex();
513
+ this.searchIndex = {
514
+ documents: indexData.documents,
515
+ idf: indexData.idf,
516
+ totalDocuments: indexData.totalDocuments,
517
+ metadata: {
518
+ generatedAt: new Date().toISOString(),
519
+ version: '1.0.0',
520
+ },
521
+ };
522
+ console.error('[INFO] Incremental index engine initialized');
523
+ }
524
+ // Build vector index if embedding provider available
525
+ if (this.embeddingProvider && this.vectorStorage) {
526
+ await this.buildVectorIndexFromMetadata(fileMetadataList);
527
+ }
528
+ this.status.progress = 100;
529
+ this.status.indexedChunks = totalChunks;
530
+ console.error(`[SUCCESS] Indexed ${totalChunks} chunks from ${fileMetadataList.length} files`);
531
+ // Start watching if requested
532
+ if (options.watch) {
533
+ await this.startWatch();
534
+ }
535
+ }
536
+ catch (error) {
537
+ console.error('[ERROR] Failed to index codebase:', error);
538
+ throw error;
539
+ }
540
+ finally {
541
+ this.status.isIndexing = false;
542
+ this.status.currentFile = undefined;
543
+ }
544
+ }
545
+ /**
546
+ * Start watching for file changes
547
+ * Uses @parcel/watcher which provides native FSEvents on macOS
548
+ */
549
+ async startWatch() {
550
+ if (this.isWatching) {
551
+ console.error('[WARN] Already watching for changes');
552
+ return;
553
+ }
554
+ if (!this.ignoreFilter) {
555
+ this.ignoreFilter = loadGitignore(this.codebaseRoot);
556
+ }
557
+ console.error('[INFO] Starting file watcher (native FSEvents)...');
558
+ const watcher = await import('@parcel/watcher');
559
+ // Subscribe to file changes
560
+ this.watcher = await watcher.subscribe(this.codebaseRoot, (err, events) => {
561
+ if (err) {
562
+ console.error('[WARN] File watcher error:', err.message);
563
+ return;
564
+ }
565
+ for (const event of events) {
566
+ const absolutePath = event.path;
567
+ const relativePath = path.relative(this.codebaseRoot, absolutePath);
568
+ // Skip ignored files
569
+ if (this.shouldIgnore(relativePath)) {
570
+ continue;
571
+ }
572
+ // Map @parcel/watcher event types to our types
573
+ const eventType = event.type === 'create' ? 'add' : event.type === 'delete' ? 'unlink' : 'change';
574
+ this.handleFileChange(eventType, absolutePath);
575
+ }
576
+ }, {
577
+ // Use native backend (FSEvents on macOS, inotify on Linux)
578
+ backend: undefined, // auto-detect best backend
579
+ ignore: [
580
+ '**/node_modules/**',
581
+ '**/.git/**',
582
+ '**/dist/**',
583
+ '**/build/**',
584
+ '**/.next/**',
585
+ '**/.turbo/**',
586
+ '**/.cache/**',
587
+ '**/coverage/**',
588
+ '**/*.log',
589
+ ],
590
+ });
591
+ this.isWatching = true;
592
+ console.error('[SUCCESS] File watcher started (native FSEvents)');
593
+ }
594
+ /**
595
+ * Check if a file should be ignored
596
+ */
597
+ shouldIgnore(relativePath) {
598
+ // Skip empty paths
599
+ if (!relativePath)
600
+ return true;
601
+ // Check gitignore
602
+ if (this.ignoreFilter?.ignores(relativePath)) {
603
+ return true;
604
+ }
605
+ return false;
606
+ }
607
+ /**
608
+ * Stop watching for file changes
609
+ */
610
+ async stopWatch() {
611
+ if (!this.isWatching || !this.watcher) {
612
+ return;
613
+ }
614
+ console.error('[INFO] Stopping file watcher...');
615
+ await this.watcher.unsubscribe();
616
+ this.watcher = null;
617
+ this.isWatching = false;
618
+ // Clear pending updates
619
+ for (const timeout of this.pendingUpdates.values()) {
620
+ clearTimeout(timeout);
621
+ }
622
+ this.pendingUpdates.clear();
623
+ // Clear pending file changes to prevent memory leak
624
+ this.pendingFileChanges = [];
625
+ console.error('[SUCCESS] File watcher stopped');
626
+ }
627
+ /**
628
+ * Close indexer and release all resources
629
+ * Should be called when the indexer is no longer needed
630
+ */
631
+ async close() {
632
+ // Stop file watcher first
633
+ await this.stopWatch();
634
+ // Close vector storage (LanceDB connection)
635
+ if (this.vectorStorage) {
636
+ await this.vectorStorage.close();
637
+ this.vectorStorage = undefined;
638
+ }
639
+ // Close persistent storage (SQLite connection)
640
+ if (this.storage instanceof PersistentStorage) {
641
+ this.storage.close();
642
+ }
643
+ // Clear all in-memory state
644
+ this.searchIndex = null;
645
+ this.incrementalEngine = null;
646
+ this.pendingFileChanges = [];
647
+ this.searchCache.clear();
648
+ this.ignoreFilter = null;
649
+ console.error('[SUCCESS] Indexer closed and resources released');
650
+ }
651
+ /**
652
+ * Handle file change events with debouncing
653
+ */
654
+ handleFileChange(type, absolutePath) {
655
+ const relativePath = path.relative(this.codebaseRoot, absolutePath);
656
+ // Check if file should be ignored
657
+ if (this.ignoreFilter?.ignores(relativePath)) {
658
+ return;
659
+ }
660
+ // Debounce updates (wait 500ms after last change)
661
+ const existing = this.pendingUpdates.get(relativePath);
662
+ if (existing) {
663
+ clearTimeout(existing);
664
+ }
665
+ const timeout = setTimeout(async () => {
666
+ this.pendingUpdates.delete(relativePath);
667
+ await this.processFileChange(type, relativePath, absolutePath);
668
+ }, 500);
669
+ this.pendingUpdates.set(relativePath, timeout);
670
+ }
671
    /**
     * Process a debounced file change: update the file store, the vector
     * store, queue an incremental-index change record, and rebuild the
     * search index. All errors are caught and logged (never rethrown),
     * because this runs from a setTimeout callback.
     *
     * @param {'add'|'change'|'unlink'} type - event type
     * @param {string} relativePath - path relative to the codebase root
     * @param {string} absolutePath - absolute path on disk
     */
    async processFileChange(type, relativePath, absolutePath) {
        // Event object handed to the onFileChangeCallback subscriber
        const event = {
            type,
            path: relativePath,
            timestamp: Date.now(),
        };
        try {
            if (type === 'unlink') {
                // Track deletion for incremental update: record the old
                // in-memory document so the TF-IDF engine can subtract it
                const existingFile = await this.storage.getFile(relativePath);
                if (existingFile && this.searchIndex) {
                    const oldDoc = this.searchIndex.documents.find((d) => d.uri === `file://${relativePath}`);
                    if (oldDoc) {
                        this.pendingFileChanges.push({
                            type: 'delete',
                            uri: `file://${relativePath}`,
                            oldDocument: oldDoc,
                        });
                    }
                }
                // Remove from storage
                await this.storage.deleteFile(relativePath);
                // Remove from vector storage
                await this.deleteFileVector(relativePath);
                console.error(`[FILE] Removed: ${relativePath}`);
            }
            else {
                // add/change path: check size and binary-ness before reading
                const stats = await fs.stat(absolutePath);
                if (stats.size > this.maxFileSize) {
                    console.error(`[FILE] Skipped (too large): ${relativePath}`);
                    return;
                }
                if (!isTextFile(absolutePath)) {
                    console.error(`[FILE] Skipped (binary): ${relativePath}`);
                    return;
                }
                // Read file content
                const content = await fs.readFile(absolutePath, 'utf-8');
                const hash = simpleHash(content);
                // OPTIMIZATION: Check if file actually changed using hash comparison
                const existingFile = await this.storage.getFile(relativePath);
                if (existingFile && existingFile.hash === hash) {
                    console.error(`[FILE] Skipped (unchanged): ${relativePath}`);
                    // File hasn't changed, skip indexing — but still notify
                    // the subscriber that an event occurred
                    this.onFileChangeCallback?.(event);
                    return;
                }
                // Track change for incremental update: 'update' when the doc
                // is already in the in-memory index, 'add' otherwise
                if (this.searchIndex) {
                    const uri = `file://${relativePath}`;
                    const oldDoc = this.searchIndex.documents.find((d) => d.uri === uri);
                    if (oldDoc) {
                        // Update existing document
                        this.pendingFileChanges.push({
                            type: 'update',
                            uri,
                            oldDocument: oldDoc,
                            newContent: content,
                        });
                    }
                    else {
                        // Add new document
                        this.pendingFileChanges.push({
                            type: 'add',
                            uri,
                            newContent: content,
                        });
                    }
                }
                // File changed or new, persist it to storage
                const codebaseFile = {
                    path: relativePath,
                    content,
                    size: stats.size,
                    mtime: stats.mtime,
                    language: detectLanguage(relativePath),
                    hash,
                };
                await this.storage.storeFile(codebaseFile);
                // Update vector storage (re-chunks and re-embeds the file)
                await this.updateFileVector(relativePath, content);
                console.error(`[FILE] ${type === 'add' ? 'Added' : 'Updated'}: ${relativePath}`);
            }
            // Rebuild search index (incremental when possible)
            await this.rebuildSearchIndex();
            // Notify callback
            this.onFileChangeCallback?.(event);
        }
        catch (error) {
            // Swallow deliberately: a single bad file must not kill the watcher
            console.error(`[ERROR] Failed to process file change (${relativePath}):`, error);
        }
    }
767
    /**
     * Rebuild the search index from current storage.
     *
     * Uses the incremental TF-IDF engine when it exists and there are
     * pending change records; otherwise (including lowMemoryMode, where
     * the engine is null) falls back to a full rebuild. In every path,
     * pendingFileChanges is cleared before returning — that invariant is
     * what prevents the change queue from leaking memory.
     */
    async rebuildSearchIndex() {
        // If no incremental engine or no pending changes, do full rebuild
        if (!this.incrementalEngine || this.pendingFileChanges.length === 0) {
            // CRITICAL: Clear pending changes to prevent memory leak
            // In lowMemoryMode, incrementalEngine is null, so we must clear here
            this.pendingFileChanges = [];
            return this.fullRebuildSearchIndex();
        }
        // Use try/finally to ensure pendingFileChanges is always cleared
        // This prevents memory leak if an exception occurs during rebuild
        try {
            // Ask the engine whether the change set is too large to apply
            // incrementally (it may decide a full rebuild is cheaper)
            if (await this.incrementalEngine.shouldFullRebuild(this.pendingFileChanges)) {
                console.error('[INFO] Changes too extensive, performing full rebuild instead of incremental');
                this.pendingFileChanges = [];
                return this.fullRebuildSearchIndex();
            }
            // Perform incremental update
            const stats = await this.incrementalEngine.applyUpdates(this.pendingFileChanges);
            // Re-materialize this.searchIndex from the engine's state so
            // readers always see a consistent snapshot
            const indexData = this.incrementalEngine.getIndex();
            this.searchIndex = {
                documents: indexData.documents,
                idf: indexData.idf,
                totalDocuments: indexData.totalDocuments,
                metadata: {
                    generatedAt: new Date().toISOString(),
                    version: '1.0.0',
                },
            };
            console.error(`[SUCCESS] Incremental update: ${stats.affectedDocuments} docs, ${stats.affectedTerms} terms, ${stats.updateTime}ms`);
            // Invalidate search cache (index changed)
            this.searchCache.invalidate();
            console.error('[INFO] Search cache invalidated');
            // Persist if using persistent storage
            if (this.storage instanceof PersistentStorage) {
                await this.persistSearchIndex();
            }
        }
        finally {
            // Always clear pending changes to prevent memory leak
            this.pendingFileChanges = [];
        }
    }
815
    /**
     * Full rebuild of the search index (fallback when incremental update
     * is not possible).
     *
     * Persistent storage: re-chunks every stored file via AST chunking,
     * stores chunks and their raw term-frequency vectors, then recomputes
     * IDF, TF-IDF, vector magnitudes and the average document length in
     * SQL. In-memory storage: rebuilds the file-level TF-IDF index and a
     * fresh incremental engine. Either way the search cache is invalidated.
     */
    async fullRebuildSearchIndex() {
        // For persistent storage, rebuild chunk index
        if (this.storage instanceof PersistentStorage) {
            const persistentStorage = this.storage;
            const allFiles = await this.storage.getAllFiles();
            console.error(`[INFO] Full rebuild: re-chunking ${allFiles.length} files...`);
            // Re-chunk all files
            const fileChunks = [];
            for (const file of allFiles) {
                const chunks = await chunkCodeByAST(file.content, file.path);
                const chunkData = chunks.map((chunk) => ({
                    content: chunk.content,
                    type: chunk.type,
                    startLine: chunk.startLine,
                    endLine: chunk.endLine,
                    metadata: chunk.metadata,
                }));
                fileChunks.push({ filePath: file.path, chunks: chunkData });
            }
            // Store all chunks (this also deletes old chunks);
            // returns filePath -> ordered chunk-id array
            const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks);
            // Build raw term-frequency vectors for all chunks
            // (tfidf left at 0 here; recalculated in SQL below)
            const chunkVectors = [];
            for (const fc of fileChunks) {
                const chunkIds = chunkIdMap.get(fc.filePath);
                if (!chunkIds)
                    continue;
                // chunkIds[j] corresponds positionally to fc.chunks[j]
                for (let j = 0; j < fc.chunks.length; j++) {
                    const chunk = fc.chunks[j];
                    const chunkId = chunkIds[j];
                    if (!chunkId)
                        continue;
                    const tokens = await tokenize(chunk.content);
                    const termFreq = new Map();
                    for (const token of tokens) {
                        termFreq.set(token, (termFreq.get(token) || 0) + 1);
                    }
                    const totalTerms = tokens.length;
                    // Skip chunks that tokenize to nothing (avoids div-by-zero)
                    if (totalTerms === 0)
                        continue;
                    const terms = new Map();
                    for (const [term, freq] of termFreq) {
                        terms.set(term, {
                            tf: freq / totalTerms,
                            tfidf: 0,
                            rawFreq: freq,
                        });
                    }
                    chunkVectors.push({ chunkId, terms, tokenCount: totalTerms });
                }
            }
            if (chunkVectors.length > 0) {
                await persistentStorage.storeManyChunkVectors(chunkVectors);
            }
            // Rebuild IDF and TF-IDF scores — order matters: IDF must exist
            // before TF-IDF; magnitudes and avg length depend on the scores
            await persistentStorage.rebuildIdfScoresFromVectors();
            await persistentStorage.recalculateTfidfScores();
            await persistentStorage.updateChunkMagnitudes();
            await persistentStorage.updateAverageDocLength();
            console.error('[SUCCESS] Full chunk index rebuild complete');
        }
        else {
            // For non-persistent storage, use in-memory file-level index
            const allFiles = await this.storage.getAllFiles();
            const documents = allFiles.map((file) => ({
                uri: `file://${file.path}`,
                content: file.content,
            }));
            this.searchIndex = await buildSearchIndex(documents);
            // Fresh incremental engine seeded from the rebuilt index
            this.incrementalEngine = new IncrementalTFIDF(this.searchIndex.documents, this.searchIndex.idf);
        }
        // Invalidate search cache (index changed)
        this.searchCache.invalidate();
        console.error('[INFO] Search cache invalidated');
    }
894
+ /**
895
+ * Persist search index to storage
896
+ * NOTE: For PersistentStorage, chunk-based indexing happens inline during index()
897
+ * This method is only used for in-memory storage fallback
898
+ */
899
+ async persistSearchIndex() {
900
+ // For persistent storage, indexing is done inline with chunks
901
+ // This method is kept for compatibility with in-memory storage mode
902
+ if (this.storage instanceof PersistentStorage) {
903
+ // Chunk-based indexing already persisted during index()
904
+ console.error('[INFO] Chunk-based index already persisted');
905
+ return;
906
+ }
907
+ // For non-persistent storage, just store IDF scores if available
908
+ if (this.searchIndex) {
909
+ const docFreq = new Map();
910
+ for (const doc of this.searchIndex.documents) {
911
+ const uniqueTerms = new Set(doc.rawTerms.keys());
912
+ for (const term of uniqueTerms) {
913
+ docFreq.set(term, (docFreq.get(term) || 0) + 1);
914
+ }
915
+ }
916
+ console.error('[INFO] In-memory index built (non-persistent storage)');
917
+ }
918
+ }
919
+ /**
920
+ * Check if currently watching for changes
921
+ */
922
+ isWatchEnabled() {
923
+ return this.isWatching;
924
+ }
925
+ /**
926
+ * Search the codebase
927
+ * Returns chunk-level results when using persistent storage (SQL-based search)
928
+ */
929
+ async search(query, options = {}) {
930
+ const { limit = 10, includeContent = true } = options;
931
+ const snippetOptions = {
932
+ contextLines: options.contextLines,
933
+ maxChars: options.maxSnippetChars,
934
+ maxBlocks: options.maxSnippetBlocks,
935
+ };
936
+ // Create cache key from query and options
937
+ const cacheKey = createCacheKey(query, {
938
+ limit,
939
+ fileExtensions: options.fileExtensions,
940
+ pathFilter: options.pathFilter,
941
+ excludePaths: options.excludePaths,
942
+ ...snippetOptions,
943
+ });
944
+ // Check cache first
945
+ const cachedResults = this.searchCache.get(cacheKey);
946
+ if (cachedResults) {
947
+ console.error(`[CACHE HIT] Query: "${query}"`);
948
+ return cachedResults;
949
+ }
950
+ console.error(`[CACHE MISS] Query: "${query}"`);
951
+ // Use chunk-based SQL search in low memory mode (Memory optimization)
952
+ if (this.lowMemoryMode && this.storage instanceof PersistentStorage) {
953
+ const searchResults = await this.searchChunks(query, options);
954
+ this.searchCache.set(cacheKey, searchResults);
955
+ return searchResults;
956
+ }
957
+ // In-memory search (faster but uses more memory) - file-level
958
+ let results;
959
+ if (!this.searchIndex) {
960
+ throw new Error('Codebase not indexed. Please run index() first.');
961
+ }
962
+ const searchIndex = this.searchIndex;
963
+ results = await import('./tfidf.js').then((m) => m.searchDocuments(query, searchIndex, { limit }));
964
+ // Get file content and apply filters (in-memory mode)
965
+ const searchResults = [];
966
+ for (const result of results) {
967
+ const filePath = result.uri.replace('file://', '');
968
+ // Apply filters
969
+ if (options.fileExtensions && options.fileExtensions.length > 0) {
970
+ if (!options.fileExtensions.some((ext) => filePath.endsWith(ext))) {
971
+ continue;
972
+ }
973
+ }
974
+ if (options.pathFilter && !filePath.includes(options.pathFilter)) {
975
+ continue;
976
+ }
977
+ if (options.excludePaths && options.excludePaths.length > 0) {
978
+ if (options.excludePaths.some((exclude) => filePath.includes(exclude))) {
979
+ continue;
980
+ }
981
+ }
982
+ const file = await this.storage.getFile(filePath);
983
+ if (!file)
984
+ continue;
985
+ const searchResult = {
986
+ path: file.path,
987
+ score: result.score,
988
+ matchedTerms: result.matchedTerms,
989
+ language: file.language,
990
+ size: file.size,
991
+ };
992
+ if (includeContent) {
993
+ searchResult.snippet = this.extractSnippet(file.content, result.matchedTerms, snippetOptions);
994
+ }
995
+ searchResults.push(searchResult);
996
+ }
997
+ const finalResults = searchResults.slice(0, limit);
998
+ // Store in cache
999
+ this.searchCache.set(cacheKey, finalResults);
1000
+ return finalResults;
1001
+ }
1002
    /**
     * Chunk-based search with BM25 scoring against persistent storage.
     *
     * Candidate chunks come from a SQL term lookup (over-fetched at 3x the
     * requested limit so post-filtering still fills the result set), then
     * each candidate is scored with BM25 (k1=1.2, b=0.75). Chunk content
     * doubles as the snippet, so no separate snippet extraction is needed.
     *
     * NOTE(review): assumes this.storage is a PersistentStorage — the
     * caller (search) only routes here in that mode.
     *
     * @param {string} query - free-text query
     * @param {object} options - limit, includeContent, and path filters
     * @returns {Promise<Array>} ranked chunk-level results
     */
    async searchChunks(query, options) {
        const { limit = 10, includeContent = true } = options;
        const persistentStorage = this.storage;
        // Tokenize query
        const queryTokens = await getQueryTokens(query);
        if (queryTokens.length === 0) {
            return [];
        }
        // Get matching chunks from storage (already includes content);
        // over-fetch so filters below don't starve the result list
        const candidates = await persistentStorage.searchByTerms(queryTokens, { limit: limit * 3 });
        // Get IDF scores for query terms
        const idf = await persistentStorage.getIdfScoresForTerms(queryTokens);
        // Get average document length for BM25 length normalization
        const avgDocLength = await persistentStorage.getAverageDocLength();
        // BM25 parameters (standard defaults)
        const k1 = 1.2;
        const b = 0.75;
        // Calculate BM25 scores for each chunk
        const scoredResults = [];
        for (const chunk of candidates) {
            // Apply extension / path-inclusion / path-exclusion filters
            if (options.fileExtensions && options.fileExtensions.length > 0) {
                if (!options.fileExtensions.some((ext) => chunk.filePath.endsWith(ext))) {
                    continue;
                }
            }
            if (options.pathFilter && !chunk.filePath.includes(options.pathFilter)) {
                continue;
            }
            if (options.excludePaths && options.excludePaths.length > 0) {
                if (options.excludePaths.some((exclude) => chunk.filePath.includes(exclude))) {
                    continue;
                }
            }
            // Calculate BM25 score: sum over query terms present in chunk
            let score = 0;
            const matchedTerms = [];
            for (const term of queryTokens) {
                const termData = chunk.matchedTerms.get(term);
                if (!termData)
                    continue;
                matchedTerms.push(term);
                // Missing IDF defaults to 1 rather than dropping the term
                const termIdf = idf.get(term) || 1;
                // BM25 formula with length normalization; || 1 guards
                // against zero token counts / uninitialized avg length
                const tf = termData.rawFreq;
                const docLen = chunk.tokenCount || 1;
                const numerator = tf * (k1 + 1);
                const denominator = tf + k1 * (1 - b + b * (docLen / (avgDocLength || 1)));
                score += termIdf * (numerator / denominator);
            }
            if (matchedTerms.length > 0) {
                scoredResults.push({ chunk, score, matchedTerms });
            }
        }
        // Sort by score descending
        scoredResults.sort((a, b) => b.score - a.score);
        // Convert to SearchResult format
        const results = [];
        for (const { chunk, score, matchedTerms } of scoredResults.slice(0, limit)) {
            const result = {
                path: chunk.filePath,
                score,
                matchedTerms,
                language: detectLanguage(chunk.filePath),
                size: chunk.content.length,
                // Include chunk metadata
                chunkType: chunk.type,
                startLine: chunk.startLine,
                endLine: chunk.endLine,
            };
            if (includeContent) {
                // Chunk content is the snippet - prefix original line numbers
                const lines = chunk.content.split('\n');
                result.snippet = lines.map((line, i) => `${chunk.startLine + i}: ${line}`).join('\n');
            }
            results.push(result);
        }
        console.error(`[BM25 CHUNK SEARCH] Found ${results.length} chunks`);
        return results;
    }
1086
+ /**
1087
+ * Extract code block snippets from content around matched terms
1088
+ *
1089
+ * Returns the most relevant code blocks (not just lines) with context.
1090
+ * Blocks are ranked by term density (more matched terms = higher score).
1091
+ */
1092
+ extractSnippet(content, matchedTerms, options = {}) {
1093
+ const { contextLines = 3, maxChars = 2000, maxBlocks = 4 } = options;
1094
+ const lines = content.split('\n');
1095
+ // Step 1: Find all lines with matches and score them
1096
+ const matchedLineInfos = [];
1097
+ for (let i = 0; i < lines.length; i++) {
1098
+ const lineLower = lines[i].toLowerCase();
1099
+ const termsInLine = matchedTerms.filter((term) => lineLower.includes(term.toLowerCase()));
1100
+ if (termsInLine.length > 0) {
1101
+ matchedLineInfos.push({
1102
+ lineNum: i,
1103
+ score: termsInLine.length,
1104
+ matchedTerms: termsInLine,
1105
+ });
1106
+ }
1107
+ }
1108
+ if (matchedLineInfos.length === 0) {
1109
+ // Return first few lines if no matches found
1110
+ return lines.slice(0, 5).join('\n');
1111
+ }
1112
+ const blocks = [];
1113
+ for (const info of matchedLineInfos) {
1114
+ const start = Math.max(0, info.lineNum - contextLines);
1115
+ const end = Math.min(lines.length - 1, info.lineNum + contextLines);
1116
+ // Try to merge with existing block if overlapping
1117
+ let merged = false;
1118
+ for (const block of blocks) {
1119
+ if (start <= block.end + 1 && end >= block.start - 1) {
1120
+ // Overlapping or adjacent - merge
1121
+ block.start = Math.min(block.start, start);
1122
+ block.end = Math.max(block.end, end);
1123
+ block.score += info.score;
1124
+ for (const term of info.matchedTerms) {
1125
+ block.matchedTerms.add(term);
1126
+ }
1127
+ merged = true;
1128
+ break;
1129
+ }
1130
+ }
1131
+ if (!merged) {
1132
+ blocks.push({
1133
+ start,
1134
+ end,
1135
+ score: info.score,
1136
+ matchedTerms: new Set(info.matchedTerms),
1137
+ });
1138
+ }
1139
+ }
1140
+ // Step 3: Sort blocks by unique terms (primary) and density (secondary)
1141
+ // Unique terms = how many different query terms appear in block
1142
+ // Density = unique terms / block size (prefer compact blocks)
1143
+ blocks.sort((a, b) => {
1144
+ const uniqueA = a.matchedTerms.size;
1145
+ const uniqueB = b.matchedTerms.size;
1146
+ if (uniqueA !== uniqueB) {
1147
+ return uniqueB - uniqueA; // More unique terms = better
1148
+ }
1149
+ // Tie-break: prefer denser blocks (more terms per line)
1150
+ const densityA = uniqueA / (a.end - a.start + 1);
1151
+ const densityB = uniqueB / (b.end - b.start + 1);
1152
+ return densityB - densityA;
1153
+ });
1154
+ const topBlocks = blocks.slice(0, maxBlocks);
1155
+ // Sort by position for output (top to bottom in file)
1156
+ topBlocks.sort((a, b) => a.start - b.start);
1157
+ // Step 4: Build output with character limit
1158
+ const snippetParts = [];
1159
+ let totalChars = 0;
1160
+ for (const block of topBlocks) {
1161
+ const blockLines = lines.slice(block.start, block.end + 1);
1162
+ const blockContent = blockLines.map((line, i) => `${block.start + i + 1}: ${line}`).join('\n');
1163
+ // Check if adding this block would exceed limit
1164
+ if (totalChars + blockContent.length > maxChars && snippetParts.length > 0) {
1165
+ break;
1166
+ }
1167
+ snippetParts.push(blockContent);
1168
+ totalChars += blockContent.length;
1169
+ }
1170
+ return snippetParts.join('\n...\n');
1171
+ }
1172
+ /**
1173
+ * Get file content
1174
+ */
1175
+ async getFileContent(filePath) {
1176
+ const file = await this.storage.getFile(filePath);
1177
+ return file?.content || null;
1178
+ }
1179
+ /**
1180
+ * Get total indexed files count
1181
+ */
1182
+ async getIndexedCount() {
1183
+ return this.storage.count();
1184
+ }
1185
+ /**
1186
+ * Get vector storage (for hybrid search)
1187
+ */
1188
+ getVectorStorage() {
1189
+ return this.vectorStorage;
1190
+ }
1191
+ /**
1192
+ * Get embedding provider (for hybrid search)
1193
+ */
1194
+ getEmbeddingProvider() {
1195
+ return this.embeddingProvider;
1196
+ }
1197
    /**
     * Build the vector index from file metadata, generating embeddings
     * per CHUNK (not per file). No-op unless both an embedding provider
     * and a vector store are configured.
     *
     * NOTE(review): the first pass reads and chunks every file up front,
     * holding all chunk contents in memory at once — verify this is
     * acceptable for very large codebases.
     *
     * @param {Array} files - file metadata entries with path/absolutePath/language
     */
    async buildVectorIndexFromMetadata(files) {
        if (!this.embeddingProvider || !this.vectorStorage) {
            return;
        }
        console.error('[INFO] Generating embeddings for vector search (chunk-level)...');
        const startTime = Date.now();
        let totalChunks = 0;
        let processedChunks = 0;
        // First pass: chunk every file and collect all chunks (also gives
        // us the total count for progress logging)
        const allChunks = [];
        for (const metadata of files) {
            const content = readFileContent(metadata.absolutePath);
            // readFileContent returns null for unreadable files — skip them
            if (content === null)
                continue;
            // Chunk the file using AST
            const chunks = await chunkCodeByAST(content, metadata.path);
            for (let i = 0; i < chunks.length; i++) {
                const chunk = chunks[i];
                allChunks.push({
                    // Stable chunk id: chunk://<path>:<start>-<end>
                    id: `chunk://${metadata.path}:${chunk.startLine}-${chunk.endLine}`,
                    content: chunk.content,
                    metadata,
                    chunkType: chunk.type,
                    startLine: chunk.startLine,
                    endLine: chunk.endLine,
                });
            }
        }
        totalChunks = allChunks.length;
        console.error(`[INFO] Total chunks to embed: ${totalChunks}`);
        // Second pass: embed chunks in batches of vectorBatchSize
        const batchSize = this.vectorBatchSize;
        for (let i = 0; i < allChunks.length; i += batchSize) {
            const batch = allChunks.slice(i, i + batchSize);
            try {
                // Generate embeddings for the whole batch in one provider call
                const embeddings = await this.embeddingProvider.generateEmbeddings(batch.map((c) => c.content));
                // Add each embedded chunk to vector storage
                for (let j = 0; j < batch.length; j++) {
                    const chunk = batch[j];
                    const embedding = embeddings[j];
                    const doc = {
                        id: chunk.id,
                        embedding,
                        metadata: {
                            type: 'code',
                            chunkType: chunk.chunkType,
                            language: chunk.metadata.language,
                            content: chunk.content.substring(0, 500), // Preview only, not full content
                            path: chunk.metadata.path,
                            startLine: chunk.startLine,
                            endLine: chunk.endLine,
                        },
                    };
                    await this.vectorStorage.addDocument(doc);
                }
                processedChunks += batch.length;
                console.error(`[INFO] Generated embeddings: ${processedChunks}/${totalChunks} chunks`);
            }
            catch (error) {
                // Best-effort: a failed batch is logged and skipped so the
                // rest of the index can still be built
                console.error(`[ERROR] Failed to generate embeddings for batch ${i}:`, error);
                // Continue with next batch
            }
        }
        // LanceDB auto-persists, no need to save
        const elapsedTime = Date.now() - startTime;
        console.error(`[SUCCESS] Vector index built (${processedChunks} chunks from ${files.length} files, ${elapsedTime}ms)`);
    }
1269
    /**
     * Re-embed a single file at chunk level: delete its old chunk vectors,
     * re-chunk the new content via AST, and store one embedding per chunk.
     * No-op unless both an embedding provider and a vector store exist.
     * Errors are logged, never rethrown (called from the watcher path).
     *
     * @param {string} filePath - path relative to the codebase root
     * @param {string} content - current file content
     */
    async updateFileVector(filePath, content) {
        if (!this.embeddingProvider || !this.vectorStorage) {
            return;
        }
        try {
            // Delete existing chunks for this file
            // NOTE(review): deleteFileVector is currently a logging stub,
            // so stale chunk vectors may remain — see its TODO.
            await this.deleteFileVector(filePath);
            // Chunk the file using AST
            const chunks = await chunkCodeByAST(content, filePath);
            const language = detectLanguage(filePath);
            // Generate embeddings for all chunks in one provider call
            const embeddings = await this.embeddingProvider.generateEmbeddings(chunks.map((c) => c.content));
            // Add each chunk to vector storage
            for (let i = 0; i < chunks.length; i++) {
                const chunk = chunks[i];
                const embedding = embeddings[i];
                const doc = {
                    // Same id scheme as the bulk index path:
                    // chunk://<path>:<start>-<end>
                    id: `chunk://${filePath}:${chunk.startLine}-${chunk.endLine}`,
                    embedding,
                    metadata: {
                        type: 'code',
                        chunkType: chunk.type,
                        language,
                        content: chunk.content.substring(0, 500), // Preview only
                        path: filePath,
                        startLine: chunk.startLine,
                        endLine: chunk.endLine,
                    },
                };
                await this.vectorStorage.addDocument(doc);
            }
            console.error(`[VECTOR] Updated: ${filePath} (${chunks.length} chunks)`);
        }
        catch (error) {
            // Best-effort: vector update failure must not break file indexing
            console.error(`[ERROR] Failed to update vector for ${filePath}:`, error);
        }
    }
1310
    /**
     * Delete the vector-store entries (all chunks) for a file.
     *
     * WARNING: this is currently a STUB — it only logs and does NOT
     * actually remove anything. Chunk ids use the format
     * chunk://<path>:<startLine>-<endLine>, and the underlying store
     * exposes no prefix-delete, so real deletion is deferred (see TODO).
     * Callers such as updateFileVector therefore may leave stale chunk
     * vectors behind.
     *
     * @param {string} filePath - path relative to the codebase root
     */
    async deleteFileVector(filePath) {
        if (!this.vectorStorage) {
            return;
        }
        // Delete all chunks that belong to this file
        // Vector IDs are in format: chunk://path:startLine-endLine
        // We need to query and delete all matching the path prefix
        try {
            // LanceDB doesn't have a prefix delete, so we search and delete individually
            // For now, we'll rely on the addDocument overwriting or use a workaround
            // TODO: Implement proper chunk deletion in VectorStorage
            console.error(`[VECTOR] Deleting chunks for: ${filePath}`);
        }
        catch (error) {
            console.error(`[ERROR] Failed to delete vectors for ${filePath}:`, error);
        }
    }
1330
+ }
1331
+ //# sourceMappingURL=indexer.js.map