@matperez/coderag 0.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +154 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/ast-chunking.d.ts +40 -0
- package/dist/ast-chunking.d.ts.map +1 -0
- package/dist/ast-chunking.js +88 -0
- package/dist/ast-chunking.js.map +1 -0
- package/dist/ast-chunking.test.d.ts +5 -0
- package/dist/ast-chunking.test.d.ts.map +1 -0
- package/dist/ast-chunking.test.js +173 -0
- package/dist/ast-chunking.test.js.map +1 -0
- package/dist/code-tokenizer.d.ts +62 -0
- package/dist/code-tokenizer.d.ts.map +1 -0
- package/dist/code-tokenizer.js +129 -0
- package/dist/code-tokenizer.js.map +1 -0
- package/dist/code-tokenizer.test.d.ts +5 -0
- package/dist/code-tokenizer.test.d.ts.map +1 -0
- package/dist/code-tokenizer.test.js +96 -0
- package/dist/code-tokenizer.test.js.map +1 -0
- package/dist/db/client-pg.d.ts +16 -0
- package/dist/db/client-pg.d.ts.map +1 -0
- package/dist/db/client-pg.js +38 -0
- package/dist/db/client-pg.js.map +1 -0
- package/dist/db/client.d.ts +36 -0
- package/dist/db/client.d.ts.map +1 -0
- package/dist/db/client.js +81 -0
- package/dist/db/client.js.map +1 -0
- package/dist/db/migrations-pg.d.ts +6 -0
- package/dist/db/migrations-pg.d.ts.map +1 -0
- package/dist/db/migrations-pg.js +88 -0
- package/dist/db/migrations-pg.js.map +1 -0
- package/dist/db/migrations.d.ts +9 -0
- package/dist/db/migrations.d.ts.map +1 -0
- package/dist/db/migrations.js +164 -0
- package/dist/db/migrations.js.map +1 -0
- package/dist/db/schema-pg.d.ts +611 -0
- package/dist/db/schema-pg.d.ts.map +1 -0
- package/dist/db/schema-pg.js +66 -0
- package/dist/db/schema-pg.js.map +1 -0
- package/dist/db/schema.d.ts +630 -0
- package/dist/db/schema.d.ts.map +1 -0
- package/dist/db/schema.js +85 -0
- package/dist/db/schema.js.map +1 -0
- package/dist/embeddings.d.ts +92 -0
- package/dist/embeddings.d.ts.map +1 -0
- package/dist/embeddings.js +275 -0
- package/dist/embeddings.js.map +1 -0
- package/dist/embeddings.test.d.ts +5 -0
- package/dist/embeddings.test.d.ts.map +1 -0
- package/dist/embeddings.test.js +255 -0
- package/dist/embeddings.test.js.map +1 -0
- package/dist/hybrid-search.d.ts +47 -0
- package/dist/hybrid-search.d.ts.map +1 -0
- package/dist/hybrid-search.js +215 -0
- package/dist/hybrid-search.js.map +1 -0
- package/dist/hybrid-search.test.d.ts +5 -0
- package/dist/hybrid-search.test.d.ts.map +1 -0
- package/dist/hybrid-search.test.js +252 -0
- package/dist/hybrid-search.test.js.map +1 -0
- package/dist/incremental-tfidf.d.ts +77 -0
- package/dist/incremental-tfidf.d.ts.map +1 -0
- package/dist/incremental-tfidf.js +248 -0
- package/dist/incremental-tfidf.js.map +1 -0
- package/dist/incremental-tfidf.test.d.ts +5 -0
- package/dist/incremental-tfidf.test.d.ts.map +1 -0
- package/dist/incremental-tfidf.test.js +276 -0
- package/dist/incremental-tfidf.test.js.map +1 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +19 -0
- package/dist/index.js.map +1 -0
- package/dist/indexer.d.ts +205 -0
- package/dist/indexer.d.ts.map +1 -0
- package/dist/indexer.js +1331 -0
- package/dist/indexer.js.map +1 -0
- package/dist/indexer.test.d.ts +12 -0
- package/dist/indexer.test.d.ts.map +1 -0
- package/dist/indexer.test.js +471 -0
- package/dist/indexer.test.js.map +1 -0
- package/dist/language-config.d.ts +54 -0
- package/dist/language-config.d.ts.map +1 -0
- package/dist/language-config.js +75 -0
- package/dist/language-config.js.map +1 -0
- package/dist/search-cache.d.ts +63 -0
- package/dist/search-cache.d.ts.map +1 -0
- package/dist/search-cache.js +118 -0
- package/dist/search-cache.js.map +1 -0
- package/dist/search-cache.test.d.ts +5 -0
- package/dist/search-cache.test.d.ts.map +1 -0
- package/dist/search-cache.test.js +194 -0
- package/dist/search-cache.test.js.map +1 -0
- package/dist/storage-factory.d.ts +11 -0
- package/dist/storage-factory.d.ts.map +1 -0
- package/dist/storage-factory.js +17 -0
- package/dist/storage-factory.js.map +1 -0
- package/dist/storage-persistent-pg.d.ts +75 -0
- package/dist/storage-persistent-pg.d.ts.map +1 -0
- package/dist/storage-persistent-pg.js +579 -0
- package/dist/storage-persistent-pg.js.map +1 -0
- package/dist/storage-persistent-pg.test.d.ts +7 -0
- package/dist/storage-persistent-pg.test.d.ts.map +1 -0
- package/dist/storage-persistent-pg.test.js +90 -0
- package/dist/storage-persistent-pg.test.js.map +1 -0
- package/dist/storage-persistent-types.d.ts +110 -0
- package/dist/storage-persistent-types.d.ts.map +1 -0
- package/dist/storage-persistent-types.js +5 -0
- package/dist/storage-persistent-types.js.map +1 -0
- package/dist/storage-persistent.d.ts +231 -0
- package/dist/storage-persistent.d.ts.map +1 -0
- package/dist/storage-persistent.js +897 -0
- package/dist/storage-persistent.js.map +1 -0
- package/dist/storage-persistent.test.d.ts +5 -0
- package/dist/storage-persistent.test.d.ts.map +1 -0
- package/dist/storage-persistent.test.js +325 -0
- package/dist/storage-persistent.test.js.map +1 -0
- package/dist/storage.d.ts +63 -0
- package/dist/storage.d.ts.map +1 -0
- package/dist/storage.js +67 -0
- package/dist/storage.js.map +1 -0
- package/dist/storage.test.d.ts +5 -0
- package/dist/storage.test.d.ts.map +1 -0
- package/dist/storage.test.js +157 -0
- package/dist/storage.test.js.map +1 -0
- package/dist/tfidf.d.ts +97 -0
- package/dist/tfidf.d.ts.map +1 -0
- package/dist/tfidf.js +308 -0
- package/dist/tfidf.js.map +1 -0
- package/dist/tfidf.test.d.ts +5 -0
- package/dist/tfidf.test.d.ts.map +1 -0
- package/dist/tfidf.test.js +181 -0
- package/dist/tfidf.test.js.map +1 -0
- package/dist/utils.d.ts +61 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +264 -0
- package/dist/utils.js.map +1 -0
- package/dist/utils.test.d.ts +5 -0
- package/dist/utils.test.d.ts.map +1 -0
- package/dist/utils.test.js +94 -0
- package/dist/utils.test.js.map +1 -0
- package/dist/vector-storage.d.ts +120 -0
- package/dist/vector-storage.d.ts.map +1 -0
- package/dist/vector-storage.js +264 -0
- package/dist/vector-storage.js.map +1 -0
- package/dist/vector-storage.test.d.ts +5 -0
- package/dist/vector-storage.test.d.ts.map +1 -0
- package/dist/vector-storage.test.js +345 -0
- package/dist/vector-storage.test.js.map +1 -0
- package/package.json +85 -0
package/dist/indexer.js
ADDED
|
@@ -0,0 +1,1331 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Codebase indexer service
|
|
3
|
+
* Uses chunk-level indexing for better search granularity
|
|
4
|
+
*/
|
|
5
|
+
import fs from 'node:fs/promises';
|
|
6
|
+
import path from 'node:path';
|
|
7
|
+
import { chunkCodeByAST } from './ast-chunking.js';
|
|
8
|
+
import { getCoderagDataDir } from './db/client.js';
|
|
9
|
+
import { IncrementalTFIDF } from './incremental-tfidf.js';
|
|
10
|
+
import { createCacheKey, LRUCache } from './search-cache.js';
|
|
11
|
+
import { MemoryStorage } from './storage.js';
|
|
12
|
+
import { PersistentStorage } from './storage-persistent.js';
|
|
13
|
+
import { buildSearchIndex, getQueryTokens, tokenize } from './tfidf.js';
|
|
14
|
+
import { detectLanguage, isTextFile, loadGitignore, readFileContent, scanFileMetadata, simpleHash, } from './utils.js';
|
|
15
|
+
import { VectorStorage } from './vector-storage.js';
|
|
16
|
+
export class CodebaseIndexer {
    // Root directory of the codebase being indexed (defaults to process.cwd()).
    codebaseRoot;
    // Maximum file size in bytes eligible for indexing (default 1 MiB).
    maxFileSize;
    // Storage backend; either MemoryStorage or PersistentStorage.
    storage;
    // In-memory TF-IDF search index; only populated for non-persistent storage.
    searchIndex = null;
    // Incremental TF-IDF engine; only used for non-persistent storage.
    incrementalEngine = null;
    // NOTE(review): not referenced in this chunk — presumably a queue of
    // file-change events; confirm usage elsewhere in the file.
    pendingFileChanges = [];
    // LRU cache for search results (invalidated after incremental updates).
    searchCache;
    // @parcel/watcher subscription handle while watching, else null.
    watcher = null;
    // True while the file watcher subscription is active.
    isWatching = false;
    // Optional callback supplied via options.onFileChange.
    onFileChangeCallback;
    // NOTE(review): not referenced in this chunk — presumably debounced
    // per-path updates keyed by file path; confirm against handleFileChange.
    pendingUpdates = new Map();
    // gitignore-based filter; loaded lazily via loadGitignore().
    ignoreFilter = null;
    // Progress snapshot exposed (as a copy) via getStatus().
    status = {
        isIndexing: false,
        progress: 0,
        totalFiles: 0,
        processedFiles: 0,
        totalChunks: 0,
        indexedChunks: 0,
    };
    // Vector (embedding) storage; only created when an embedding provider is set.
    vectorStorage;
    // Optional embedding provider used to build the vector index.
    embeddingProvider;
    // NOTE(review): not referenced in this chunk — presumably the batch size
    // for embedding generation; confirm in buildVectorIndexFromMetadata.
    vectorBatchSize;
    // Number of files read/processed per indexing batch (memory optimization).
    indexingBatchSize;
    // When true, in-memory index structures are released after persisting.
    lowMemoryMode;
|
|
42
|
+
constructor(options = {}) {
|
|
43
|
+
this.codebaseRoot = options.codebaseRoot || process.cwd();
|
|
44
|
+
this.maxFileSize = options.maxFileSize || 1048576; // 1MB
|
|
45
|
+
this.storage = options.storage || new MemoryStorage();
|
|
46
|
+
this.onFileChangeCallback = options.onFileChange;
|
|
47
|
+
this.searchCache = new LRUCache(100, 5); // 100 entries, 5 min TTL
|
|
48
|
+
this.embeddingProvider = options.embeddingProvider;
|
|
49
|
+
this.vectorBatchSize = options.vectorBatchSize || 10;
|
|
50
|
+
this.indexingBatchSize = options.indexingBatchSize || 50; // Memory optimization
|
|
51
|
+
// Default to low memory mode when using persistent storage
|
|
52
|
+
this.lowMemoryMode = options.lowMemoryMode ?? options.storage instanceof PersistentStorage;
|
|
53
|
+
// Initialize vector storage if embedding provider is available
|
|
54
|
+
if (this.embeddingProvider) {
|
|
55
|
+
// Use global ~/.coderag/projects/<hash>/ directory for vector storage
|
|
56
|
+
const dataDir = getCoderagDataDir(this.codebaseRoot);
|
|
57
|
+
const vectorDbPath = path.join(dataDir, 'vectors.lance');
|
|
58
|
+
this.vectorStorage = new VectorStorage({
|
|
59
|
+
dimensions: this.embeddingProvider.dimensions,
|
|
60
|
+
dbPath: vectorDbPath,
|
|
61
|
+
});
|
|
62
|
+
console.error(`[INFO] Vector storage initialized: ${this.embeddingProvider.dimensions} dimensions`);
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* Get current indexing status
|
|
67
|
+
*/
|
|
68
|
+
getStatus() {
|
|
69
|
+
return { ...this.status };
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* Compare filesystem with database to find changes
|
|
73
|
+
* Used for incremental updates after long periods of inactivity
|
|
74
|
+
*/
|
|
75
|
+
async diffFilesystem(dbMetadata) {
|
|
76
|
+
if (!this.ignoreFilter) {
|
|
77
|
+
this.ignoreFilter = loadGitignore(this.codebaseRoot);
|
|
78
|
+
}
|
|
79
|
+
const added = [];
|
|
80
|
+
const changed = [];
|
|
81
|
+
const deleted = [];
|
|
82
|
+
let unchanged = 0;
|
|
83
|
+
// Track which db files we've seen in filesystem
|
|
84
|
+
const seenPaths = new Set();
|
|
85
|
+
// Scan filesystem
|
|
86
|
+
for (const metadata of scanFileMetadata(this.codebaseRoot, {
|
|
87
|
+
ignoreFilter: this.ignoreFilter,
|
|
88
|
+
codebaseRoot: this.codebaseRoot,
|
|
89
|
+
maxFileSize: this.maxFileSize,
|
|
90
|
+
})) {
|
|
91
|
+
seenPaths.add(metadata.path);
|
|
92
|
+
const dbEntry = dbMetadata.get(metadata.path);
|
|
93
|
+
if (!dbEntry) {
|
|
94
|
+
// New file
|
|
95
|
+
added.push(metadata);
|
|
96
|
+
}
|
|
97
|
+
else if (Math.abs(metadata.mtime - dbEntry.mtime) > 1000) {
|
|
98
|
+
// mtime changed (1 second tolerance for filesystem precision)
|
|
99
|
+
// File might have changed, need to verify with hash
|
|
100
|
+
changed.push(metadata);
|
|
101
|
+
}
|
|
102
|
+
else {
|
|
103
|
+
unchanged++;
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
// Find deleted files (in db but not in filesystem)
|
|
107
|
+
for (const dbPath of dbMetadata.keys()) {
|
|
108
|
+
if (!seenPaths.has(dbPath)) {
|
|
109
|
+
deleted.push(dbPath);
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
return { added, changed, deleted, unchanged };
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* Process incremental changes (add, update, delete files)
|
|
116
|
+
* Uses chunk-level indexing with SQL-based updates
|
|
117
|
+
*/
|
|
118
|
+
async processIncrementalChanges(diff, dbMetadata, options) {
|
|
119
|
+
const persistentStorage = this.storage;
|
|
120
|
+
// Step 1: Get terms for deleted files (before deleting, for IDF recalculation)
|
|
121
|
+
let _deletedTerms = new Set();
|
|
122
|
+
if (diff.deleted.length > 0) {
|
|
123
|
+
console.error(`[INFO] Getting terms for ${diff.deleted.length} deleted files...`);
|
|
124
|
+
_deletedTerms = await persistentStorage.getTermsForFiles(diff.deleted);
|
|
125
|
+
console.error(`[INFO] Deleting ${diff.deleted.length} removed files...`);
|
|
126
|
+
await persistentStorage.deleteFiles(diff.deleted);
|
|
127
|
+
}
|
|
128
|
+
// Step 2: Process added and changed files - chunk and index
|
|
129
|
+
const filesToProcess = [...diff.added, ...diff.changed];
|
|
130
|
+
let totalChunks = 0;
|
|
131
|
+
if (filesToProcess.length > 0) {
|
|
132
|
+
console.error(`[INFO] Processing ${filesToProcess.length} files...`);
|
|
133
|
+
const batchSize = this.indexingBatchSize;
|
|
134
|
+
let processedCount = 0;
|
|
135
|
+
for (let i = 0; i < filesToProcess.length; i += batchSize) {
|
|
136
|
+
const batchMetadata = filesToProcess.slice(i, i + batchSize);
|
|
137
|
+
const batchFiles = [];
|
|
138
|
+
const fileChunks = [];
|
|
139
|
+
for (const metadata of batchMetadata) {
|
|
140
|
+
const content = readFileContent(metadata.absolutePath);
|
|
141
|
+
if (content === null)
|
|
142
|
+
continue;
|
|
143
|
+
const newHash = simpleHash(content);
|
|
144
|
+
// For changed files, verify content actually changed using hash
|
|
145
|
+
const dbEntry = dbMetadata.get(metadata.path);
|
|
146
|
+
if (dbEntry && dbEntry.hash === newHash) {
|
|
147
|
+
// File content unchanged, just mtime difference - skip
|
|
148
|
+
processedCount++;
|
|
149
|
+
continue;
|
|
150
|
+
}
|
|
151
|
+
const codebaseFile = {
|
|
152
|
+
path: metadata.path,
|
|
153
|
+
content,
|
|
154
|
+
size: metadata.size,
|
|
155
|
+
mtime: new Date(metadata.mtime),
|
|
156
|
+
language: metadata.language,
|
|
157
|
+
hash: newHash,
|
|
158
|
+
};
|
|
159
|
+
batchFiles.push(codebaseFile);
|
|
160
|
+
// Chunk the file using AST
|
|
161
|
+
const chunks = await chunkCodeByAST(content, metadata.path);
|
|
162
|
+
const chunkData = chunks.map((chunk) => ({
|
|
163
|
+
content: chunk.content,
|
|
164
|
+
type: chunk.type,
|
|
165
|
+
startLine: chunk.startLine,
|
|
166
|
+
endLine: chunk.endLine,
|
|
167
|
+
metadata: chunk.metadata,
|
|
168
|
+
}));
|
|
169
|
+
fileChunks.push({ filePath: metadata.path, chunks: chunkData });
|
|
170
|
+
totalChunks += chunkData.length;
|
|
171
|
+
processedCount++;
|
|
172
|
+
this.status.currentFile = metadata.path;
|
|
173
|
+
this.status.progress = Math.round((processedCount / filesToProcess.length) * 30);
|
|
174
|
+
options.onProgress?.(processedCount, filesToProcess.length, metadata.path);
|
|
175
|
+
}
|
|
176
|
+
// Store batch to database (file content)
|
|
177
|
+
if (batchFiles.length > 0) {
|
|
178
|
+
await persistentStorage.storeFiles(batchFiles);
|
|
179
|
+
}
|
|
180
|
+
// Store chunks for this batch
|
|
181
|
+
if (fileChunks.length > 0) {
|
|
182
|
+
const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks);
|
|
183
|
+
// Build TF-IDF vectors for chunks
|
|
184
|
+
const chunkVectors = [];
|
|
185
|
+
for (const fc of fileChunks) {
|
|
186
|
+
const chunkIds = chunkIdMap.get(fc.filePath);
|
|
187
|
+
if (!chunkIds)
|
|
188
|
+
continue;
|
|
189
|
+
for (let j = 0; j < fc.chunks.length; j++) {
|
|
190
|
+
const chunk = fc.chunks[j];
|
|
191
|
+
const chunkId = chunkIds[j];
|
|
192
|
+
if (!chunkId)
|
|
193
|
+
continue;
|
|
194
|
+
// Tokenize chunk content
|
|
195
|
+
const tokens = await tokenize(chunk.content);
|
|
196
|
+
const termFreq = new Map();
|
|
197
|
+
for (const token of tokens) {
|
|
198
|
+
termFreq.set(token, (termFreq.get(token) || 0) + 1);
|
|
199
|
+
}
|
|
200
|
+
// Calculate TF
|
|
201
|
+
const totalTerms = tokens.length;
|
|
202
|
+
if (totalTerms === 0)
|
|
203
|
+
continue;
|
|
204
|
+
const terms = new Map();
|
|
205
|
+
for (const [term, freq] of termFreq) {
|
|
206
|
+
terms.set(term, {
|
|
207
|
+
tf: freq / totalTerms,
|
|
208
|
+
tfidf: 0, // Will be calculated after IDF rebuild
|
|
209
|
+
rawFreq: freq,
|
|
210
|
+
});
|
|
211
|
+
}
|
|
212
|
+
chunkVectors.push({ chunkId, terms, tokenCount: totalTerms });
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
// Store chunk vectors
|
|
216
|
+
if (chunkVectors.length > 0) {
|
|
217
|
+
await persistentStorage.storeManyChunkVectors(chunkVectors);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
this.status.progress = 50;
|
|
223
|
+
// Step 3: Rebuild IDF scores from vectors (SQL-based)
|
|
224
|
+
console.error('[INFO] Recalculating IDF scores...');
|
|
225
|
+
await persistentStorage.rebuildIdfScoresFromVectors();
|
|
226
|
+
this.status.progress = 70;
|
|
227
|
+
// Step 4: Recalculate TF-IDF scores (SQL-based batch update)
|
|
228
|
+
console.error('[INFO] Updating TF-IDF scores...');
|
|
229
|
+
await persistentStorage.recalculateTfidfScores();
|
|
230
|
+
this.status.progress = 80;
|
|
231
|
+
// Step 5: Update pre-computed magnitudes (for cosine similarity search)
|
|
232
|
+
console.error('[INFO] Updating chunk magnitudes...');
|
|
233
|
+
await persistentStorage.updateChunkMagnitudes();
|
|
234
|
+
this.status.progress = 90;
|
|
235
|
+
// Step 6: Update average document length (for BM25)
|
|
236
|
+
console.error('[INFO] Updating average document length...');
|
|
237
|
+
await persistentStorage.updateAverageDocLength();
|
|
238
|
+
this.status.progress = 95;
|
|
239
|
+
// Step 7: Invalidate search cache
|
|
240
|
+
this.searchCache.invalidate();
|
|
241
|
+
console.error('[INFO] Search cache invalidated');
|
|
242
|
+
// Log summary
|
|
243
|
+
console.error(`[SUCCESS] Incremental update complete: ${filesToProcess.length - diff.changed.length} files added, ${diff.changed.length} changed, ${diff.deleted.length} deleted, ${totalChunks} chunks indexed`);
|
|
244
|
+
}
|
|
245
|
+
/**
|
|
246
|
+
* Get search index
|
|
247
|
+
*/
|
|
248
|
+
getSearchIndex() {
|
|
249
|
+
return this.searchIndex;
|
|
250
|
+
}
|
|
251
|
+
/**
|
|
252
|
+
* Index the codebase
|
|
253
|
+
*/
|
|
254
|
+
/**
 * Index the codebase.
 *
 * Three paths, chosen at runtime:
 *   1. Persistent storage with a valid existing index -> incremental diff
 *      (no changes = no work; otherwise processIncrementalChanges).
 *   2. Persistent storage without a valid index -> full chunk-level rebuild.
 *   3. Non-persistent storage -> in-memory incremental TF-IDF index.
 *
 * @param {object} [options] - watch, onProgress, skipUnchanged,
 *   useParallelTokenize (all optional).
 * @throws Re-throws any indexing error after logging it.
 */
async index(options = {}) {
    this.status.isIndexing = true;
    this.status.progress = 0;
    this.status.processedFiles = 0;
    this.status.indexedChunks = 0;
    try {
        // --- Path 1: try the incremental route against an existing index ---
        if (this.storage instanceof PersistentStorage) {
            const existingFileCount = await this.storage.count();
            const existingChunkCount = (await this.storage.getChunkCount?.()) ?? 0;
            if (existingFileCount > 0) {
                console.error(`[INFO] Found existing index: ${existingFileCount} files, ${existingChunkCount} chunks`);
                // A non-empty IDF table is used as the "index is valid" marker.
                const idf = await this.storage.getIdfScores();
                if (idf.size > 0) {
                    // Incremental diff: compare filesystem vs database.
                    console.error('[INFO] Checking for file changes since last index...');
                    const dbMetadata = await this.storage.getAllFileMetadata();
                    const diff = await this.diffFilesystem(dbMetadata);
                    const totalChanges = diff.added.length + diff.changed.length + diff.deleted.length;
                    if (totalChanges === 0) {
                        // No changes - the existing index is already current.
                        console.error(`[SUCCESS] No changes detected (${diff.unchanged} files, ${existingChunkCount} chunks)`);
                        this.status.progress = 100;
                        this.status.totalFiles = existingFileCount;
                        this.status.processedFiles = existingFileCount;
                        this.status.totalChunks = existingChunkCount;
                        this.status.indexedChunks = existingChunkCount;
                        // Start watching if requested.
                        if (options.watch) {
                            await this.startWatch();
                        }
                        this.status.isIndexing = false;
                        return;
                    }
                    // Apply the incremental changes.
                    console.error(`[INFO] Incremental update: ${diff.added.length} added, ${diff.changed.length} changed, ${diff.deleted.length} deleted`);
                    await this.processIncrementalChanges(diff, dbMetadata, options);
                    // Refresh counters after the update.
                    const updatedChunkCount = (await this.storage.getChunkCount?.()) ?? 0;
                    const updatedFileCount = existingFileCount + diff.added.length - diff.deleted.length;
                    this.status.progress = 100;
                    this.status.totalFiles = updatedFileCount;
                    this.status.processedFiles = updatedFileCount;
                    this.status.totalChunks = updatedChunkCount;
                    this.status.indexedChunks = updatedChunkCount;
                    // Start watching if requested.
                    if (options.watch) {
                        await this.startWatch();
                    }
                    this.status.isIndexing = false;
                    return;
                }
                // IDF empty -> index is incomplete/corrupt; fall through to rebuild.
                console.error('[WARN] Index verification failed, rebuilding...');
            }
        }
        // --- Paths 2/3: full (re)build ---
        // Load .gitignore rules.
        this.ignoreFilter = loadGitignore(this.codebaseRoot);
        const ignoreFilter = this.ignoreFilter;
        // Phase 1: scan file metadata only (no content loaded - memory optimization).
        console.error('[INFO] Scanning codebase (metadata only)...');
        const fileMetadataList = [];
        for (const metadata of scanFileMetadata(this.codebaseRoot, {
            ignoreFilter,
            codebaseRoot: this.codebaseRoot,
            maxFileSize: this.maxFileSize,
        })) {
            fileMetadataList.push(metadata);
        }
        this.status.totalFiles = fileMetadataList.length;
        console.error(`[INFO] Found ${fileMetadataList.length} files`);
        // Sync deleted files when doing a full rebuild over an existing DB
        // (e.g. after an interrupted first index).
        if (this.storage instanceof PersistentStorage && (await this.storage.count()) > 0) {
            const dbMetadata = await this.storage.getAllFileMetadata();
            const dbPaths = Array.from(dbMetadata.keys());
            const currentPaths = new Set(fileMetadataList.map((m) => m.path));
            const deleted = dbPaths.filter((p) => !currentPaths.has(p));
            if (deleted.length > 0) {
                await this.storage.deleteFiles(deleted);
                console.error(`[INFO] Removed ${deleted.length} deleted files from index (full rebuild).`);
            }
        }
        // Phase 2: process files in batches with chunk-level indexing.
        // Only one batch's content is held in memory at any time.
        console.error(`[INFO] Processing files in batches of ${this.indexingBatchSize}...`);
        const batchSize = this.indexingBatchSize;
        let processedCount = 0;
        let totalChunks = 0;
        // Persistent storage stores chunk vectors; otherwise an in-memory
        // incremental engine is used.
        const isPersistent = this.storage instanceof PersistentStorage;
        const persistentStorage = isPersistent ? this.storage : null;
        if (!isPersistent) {
            this.incrementalEngine = new IncrementalTFIDF();
        }
        // Known content hashes allow skipping unchanged files (opt-out via
        // options.skipUnchanged === false).
        const existingHashes = options.skipUnchanged !== false && this.storage.getFileHashes
            ? await this.storage.getFileHashes()
            : new Map();
        for (let i = 0; i < fileMetadataList.length; i += batchSize) {
            const batchMetadata = fileMetadataList.slice(i, i + batchSize);
            const batchFiles = [];
            const fileChunks = [];
            const batchUpdates = [];
            // Read content for this batch only.
            for (const metadata of batchMetadata) {
                const content = readFileContent(metadata.absolutePath);
                if (content === null)
                    continue;
                const hash = simpleHash(content);
                if (existingHashes.get(metadata.path) === hash) {
                    // Unchanged file: count it for progress and move on.
                    processedCount++;
                    this.status.currentFile = metadata.path;
                    this.status.processedFiles = processedCount;
                    // Batch processing covers the first 40% of progress.
                    this.status.progress = Math.round((processedCount / fileMetadataList.length) * 40);
                    options.onProgress?.(processedCount, fileMetadataList.length, metadata.path);
                    continue;
                }
                const codebaseFile = {
                    path: metadata.path,
                    content,
                    size: metadata.size,
                    mtime: new Date(metadata.mtime),
                    language: metadata.language,
                    hash,
                };
                batchFiles.push(codebaseFile);
                // Chunk the file using AST-aware splitting.
                const chunks = await chunkCodeByAST(content, metadata.path);
                const chunkData = chunks.map((chunk) => ({
                    content: chunk.content,
                    type: chunk.type,
                    startLine: chunk.startLine,
                    endLine: chunk.endLine,
                    metadata: chunk.metadata,
                }));
                fileChunks.push({ filePath: metadata.path, chunks: chunkData, content });
                totalChunks += chunkData.length;
                // Non-persistent storage feeds the incremental engine instead.
                if (!isPersistent) {
                    batchUpdates.push({
                        type: 'add',
                        uri: `file://${metadata.path}`,
                        newContent: content,
                    });
                }
                processedCount++;
                this.status.currentFile = metadata.path;
                this.status.processedFiles = processedCount;
                this.status.indexedChunks = totalChunks;
                this.status.progress = Math.round((processedCount / fileMetadataList.length) * 40);
                options.onProgress?.(processedCount, fileMetadataList.length, metadata.path);
            }
            // Store this batch to the database.
            if (batchFiles.length > 0) {
                if (this.storage.storeFiles) {
                    await this.storage.storeFiles(batchFiles);
                }
                else {
                    // Fallback for storage backends without bulk insert.
                    for (const file of batchFiles) {
                        await this.storage.storeFile(file);
                    }
                }
                // Persistent storage: store chunks and build TF vectors per chunk.
                if (persistentStorage && fileChunks.length > 0) {
                    const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks.map((fc) => ({ filePath: fc.filePath, chunks: fc.chunks })));
                    // Flatten tokenize tasks in chunk order so results can be
                    // matched back by index.
                    const tokenTasks = [];
                    for (const fc of fileChunks) {
                        const chunkIds = chunkIdMap.get(fc.filePath);
                        if (!chunkIds)
                            continue;
                        for (let j = 0; j < fc.chunks.length; j++) {
                            const chunkId = chunkIds[j];
                            if (chunkId)
                                tokenTasks.push({ chunkId, content: fc.chunks[j].content });
                        }
                    }
                    const allTokenResults = [];
                    if (options.useParallelTokenize === true) {
                        // Tokenize in parallel sub-batches to bound concurrency.
                        const TOKENIZE_CONCURRENCY = 25;
                        // NOTE(review): this inner `let i` shadows the outer batch
                        // loop's `i`; harmless under block scoping but easy to misread.
                        for (let i = 0; i < tokenTasks.length; i += TOKENIZE_CONCURRENCY) {
                            const batch = tokenTasks.slice(i, i + TOKENIZE_CONCURRENCY);
                            const batchResults = await Promise.all(batch.map((t) => tokenize(t.content)));
                            allTokenResults.push(...batchResults);
                        }
                    }
                    else {
                        // Sequential tokenize (default).
                        for (const t of tokenTasks) {
                            allTokenResults.push(await tokenize(t.content));
                        }
                    }
                    // Build chunkVectors from results (same order as tokenTasks).
                    const chunkVectors = [];
                    for (let k = 0; k < tokenTasks.length; k++) {
                        const tokens = allTokenResults[k];
                        const chunkId = tokenTasks[k].chunkId;
                        const termFreq = new Map();
                        for (const token of tokens) {
                            termFreq.set(token, (termFreq.get(token) || 0) + 1);
                        }
                        const totalTerms = tokens.length;
                        if (totalTerms === 0)
                            continue;
                        const terms = new Map();
                        for (const [term, freq] of termFreq) {
                            terms.set(term, {
                                tf: freq / totalTerms,
                                tfidf: 0, // filled in by recalculateTfidfScores below
                                rawFreq: freq,
                            });
                        }
                        chunkVectors.push({ chunkId, terms, tokenCount: totalTerms });
                    }
                    if (chunkVectors.length > 0) {
                        await persistentStorage.storeManyChunkVectors(chunkVectors);
                    }
                }
                // Non-persistent storage: feed the incremental engine.
                if (this.incrementalEngine && batchUpdates.length > 0) {
                    await this.incrementalEngine.applyUpdates(batchUpdates);
                }
            }
            // Clear batch references so the GC can reclaim batch content.
            batchFiles.length = 0;
            fileChunks.length = 0;
            batchUpdates.length = 0;
        }
        // NOTE(review): totalChunks only counts newly-created chunks; when
        // skipUnchanged skips files, this under-reports the full index size.
        console.error(`[INFO] Total chunks created: ${totalChunks}`);
        this.status.totalChunks = totalChunks;
        this.status.progress = 50;
        // Phase 3: finalize the index based on storage type.
        if (persistentStorage) {
            // Persistent storage: rebuild IDF / TF-IDF / magnitudes / avg length.
            console.error('[INFO] Rebuilding IDF scores...');
            await persistentStorage.rebuildIdfScoresFromVectors();
            this.status.progress = 60;
            console.error('[INFO] Recalculating TF-IDF scores...');
            await persistentStorage.recalculateTfidfScores();
            this.status.progress = 70;
            console.error('[INFO] Computing chunk magnitudes...');
            await persistentStorage.updateChunkMagnitudes();
            this.status.progress = 80;
            console.error('[INFO] Computing average document length...');
            await persistentStorage.updateAverageDocLength();
            this.status.progress = 85;
            // Release in-memory structures in low memory mode.
            if (this.lowMemoryMode) {
                this.searchIndex = null;
                this.incrementalEngine = null;
                console.error('[INFO] Low memory mode: released in-memory index');
            }
            console.error('[SUCCESS] Chunk-level TF-IDF index persisted');
        }
        else if (this.incrementalEngine) {
            // Non-persistent storage: materialize the in-memory index.
            console.error('[INFO] Finalizing in-memory search index...');
            const indexData = this.incrementalEngine.getIndex();
            this.searchIndex = {
                documents: indexData.documents,
                idf: indexData.idf,
                totalDocuments: indexData.totalDocuments,
                metadata: {
                    generatedAt: new Date().toISOString(),
                    version: '1.0.0',
                },
            };
            console.error('[INFO] Incremental index engine initialized');
        }
        // Phase 4: build the vector index if an embedding provider is available.
        if (this.embeddingProvider && this.vectorStorage) {
            await this.buildVectorIndexFromMetadata(fileMetadataList);
        }
        this.status.progress = 100;
        this.status.indexedChunks = totalChunks;
        console.error(`[SUCCESS] Indexed ${totalChunks} chunks from ${fileMetadataList.length} files`);
        // Start watching if requested.
        if (options.watch) {
            await this.startWatch();
        }
    }
    catch (error) {
        console.error('[ERROR] Failed to index codebase:', error);
        throw error;
    }
    finally {
        // Always clear transient status, even on failure.
        this.status.isIndexing = false;
        this.status.currentFile = undefined;
    }
}
|
|
545
|
+
/**
|
|
546
|
+
* Start watching for file changes
|
|
547
|
+
* Uses @parcel/watcher which provides native FSEvents on macOS
|
|
548
|
+
*/
|
|
549
|
+
/**
 * Start watching for file changes.
 * Uses @parcel/watcher, which provides native backends (FSEvents on macOS,
 * inotify on Linux). No-op if already watching.
 */
async startWatch() {
    if (this.isWatching) {
        console.error('[WARN] Already watching for changes');
        return;
    }
    // Ensure the gitignore filter exists before events start arriving.
    if (!this.ignoreFilter) {
        this.ignoreFilter = loadGitignore(this.codebaseRoot);
    }
    console.error('[INFO] Starting file watcher (native FSEvents)...');
    // Dynamic import keeps @parcel/watcher out of the startup path when
    // watching is never requested.
    const watcher = await import('@parcel/watcher');
    // Subscribe to file changes under the codebase root.
    this.watcher = await watcher.subscribe(this.codebaseRoot, (err, events) => {
        if (err) {
            // Log and drop this batch; the subscription itself stays alive.
            console.error('[WARN] File watcher error:', err.message);
            return;
        }
        for (const event of events) {
            const absolutePath = event.path;
            const relativePath = path.relative(this.codebaseRoot, absolutePath);
            // Skip ignored files (gitignore + empty paths).
            if (this.shouldIgnore(relativePath)) {
                continue;
            }
            // Map @parcel/watcher event types (create/update/delete) to the
            // chokidar-style names used internally (add/change/unlink).
            const eventType = event.type === 'create' ? 'add' : event.type === 'delete' ? 'unlink' : 'change';
            this.handleFileChange(eventType, absolutePath);
        }
    }, {
        // Use the native backend (FSEvents on macOS, inotify on Linux).
        backend: undefined, // auto-detect best backend
        // Coarse pre-filter at the watcher level; fine-grained filtering is
        // still done per-event via shouldIgnore().
        ignore: [
            '**/node_modules/**',
            '**/.git/**',
            '**/dist/**',
            '**/build/**',
            '**/.next/**',
            '**/.turbo/**',
            '**/.cache/**',
            '**/coverage/**',
            '**/*.log',
        ],
    });
    this.isWatching = true;
    console.error('[SUCCESS] File watcher started (native FSEvents)');
}
|
|
594
|
+
/**
|
|
595
|
+
* Check if a file should be ignored
|
|
596
|
+
*/
|
|
597
|
+
shouldIgnore(relativePath) {
|
|
598
|
+
// Skip empty paths
|
|
599
|
+
if (!relativePath)
|
|
600
|
+
return true;
|
|
601
|
+
// Check gitignore
|
|
602
|
+
if (this.ignoreFilter?.ignores(relativePath)) {
|
|
603
|
+
return true;
|
|
604
|
+
}
|
|
605
|
+
return false;
|
|
606
|
+
}
|
|
607
|
+
/**
|
|
608
|
+
* Stop watching for file changes
|
|
609
|
+
*/
|
|
610
|
+
async stopWatch() {
|
|
611
|
+
if (!this.isWatching || !this.watcher) {
|
|
612
|
+
return;
|
|
613
|
+
}
|
|
614
|
+
console.error('[INFO] Stopping file watcher...');
|
|
615
|
+
await this.watcher.unsubscribe();
|
|
616
|
+
this.watcher = null;
|
|
617
|
+
this.isWatching = false;
|
|
618
|
+
// Clear pending updates
|
|
619
|
+
for (const timeout of this.pendingUpdates.values()) {
|
|
620
|
+
clearTimeout(timeout);
|
|
621
|
+
}
|
|
622
|
+
this.pendingUpdates.clear();
|
|
623
|
+
// Clear pending file changes to prevent memory leak
|
|
624
|
+
this.pendingFileChanges = [];
|
|
625
|
+
console.error('[SUCCESS] File watcher stopped');
|
|
626
|
+
}
|
|
627
|
+
/**
 * Close indexer and release all resources.
 * Should be called when the indexer is no longer needed.
 * Teardown order: watcher (and its timers) first, then the LanceDB
 * vector store, then SQLite, then in-memory state.
 */
async close() {
    // Stop file watcher first (also clears debounce timers and pending changes)
    await this.stopWatch();
    // Close vector storage (LanceDB connection)
    if (this.vectorStorage) {
        await this.vectorStorage.close();
        this.vectorStorage = undefined;
    }
    // Close persistent storage (SQLite connection); close() is synchronous here
    if (this.storage instanceof PersistentStorage) {
        this.storage.close();
    }
    // Clear all in-memory state so large indexes can be garbage-collected
    this.searchIndex = null;
    this.incrementalEngine = null;
    this.pendingFileChanges = [];
    this.searchCache.clear();
    this.ignoreFilter = null;
    console.error('[SUCCESS] Indexer closed and resources released');
}
/**
|
|
652
|
+
* Handle file change events with debouncing
|
|
653
|
+
*/
|
|
654
|
+
handleFileChange(type, absolutePath) {
|
|
655
|
+
const relativePath = path.relative(this.codebaseRoot, absolutePath);
|
|
656
|
+
// Check if file should be ignored
|
|
657
|
+
if (this.ignoreFilter?.ignores(relativePath)) {
|
|
658
|
+
return;
|
|
659
|
+
}
|
|
660
|
+
// Debounce updates (wait 500ms after last change)
|
|
661
|
+
const existing = this.pendingUpdates.get(relativePath);
|
|
662
|
+
if (existing) {
|
|
663
|
+
clearTimeout(existing);
|
|
664
|
+
}
|
|
665
|
+
const timeout = setTimeout(async () => {
|
|
666
|
+
this.pendingUpdates.delete(relativePath);
|
|
667
|
+
await this.processFileChange(type, relativePath, absolutePath);
|
|
668
|
+
}, 500);
|
|
669
|
+
this.pendingUpdates.set(relativePath, timeout);
|
|
670
|
+
}
|
|
671
|
+
/**
 * Process file change and update index.
 * For deletions: records the change for incremental update, then removes
 * the file from storage and vector storage. For adds/changes: skips large
 * and binary files, skips unchanged content via hash comparison, records
 * the change, stores the file, and refreshes the file's vectors. In all
 * processed cases the search index is rebuilt and the change callback fired.
 * Errors are logged and swallowed so one bad file cannot kill the watcher.
 * @param {'add'|'change'|'unlink'} type - change kind
 * @param {string} relativePath - path relative to the codebase root
 * @param {string} absolutePath - absolute filesystem path
 */
async processFileChange(type, relativePath, absolutePath) {
    const event = {
        type,
        path: relativePath,
        timestamp: Date.now(),
    };
    try {
        if (type === 'unlink') {
            // Track deletion for incremental update (only if the file was
            // indexed and an in-memory search index exists)
            const existingFile = await this.storage.getFile(relativePath);
            if (existingFile && this.searchIndex) {
                const oldDoc = this.searchIndex.documents.find((d) => d.uri === `file://${relativePath}`);
                if (oldDoc) {
                    this.pendingFileChanges.push({
                        type: 'delete',
                        uri: `file://${relativePath}`,
                        oldDocument: oldDoc,
                    });
                }
            }
            // Remove from storage
            await this.storage.deleteFile(relativePath);
            // Remove from vector storage
            await this.deleteFileVector(relativePath);
            console.error(`[FILE] Removed: ${relativePath}`);
        }
        else {
            // Check if file is text and within size limit
            const stats = await fs.stat(absolutePath);
            if (stats.size > this.maxFileSize) {
                console.error(`[FILE] Skipped (too large): ${relativePath}`);
                return;
            }
            if (!isTextFile(absolutePath)) {
                console.error(`[FILE] Skipped (binary): ${relativePath}`);
                return;
            }
            // Read file content
            const content = await fs.readFile(absolutePath, 'utf-8');
            const hash = simpleHash(content);
            // OPTIMIZATION: Check if file actually changed using hash comparison
            const existingFile = await this.storage.getFile(relativePath);
            if (existingFile && existingFile.hash === hash) {
                console.error(`[FILE] Skipped (unchanged): ${relativePath}`);
                // File hasn't changed, skip indexing but still notify listeners
                this.onFileChangeCallback?.(event);
                return;
            }
            // Track change for incremental update (in-memory index only)
            if (this.searchIndex) {
                const uri = `file://${relativePath}`;
                const oldDoc = this.searchIndex.documents.find((d) => d.uri === uri);
                if (oldDoc) {
                    // Update existing document
                    this.pendingFileChanges.push({
                        type: 'update',
                        uri,
                        oldDocument: oldDoc,
                        newContent: content,
                    });
                }
                else {
                    // Add new document
                    this.pendingFileChanges.push({
                        type: 'add',
                        uri,
                        newContent: content,
                    });
                }
            }
            // File changed or new, process it
            const codebaseFile = {
                path: relativePath,
                content,
                size: stats.size,
                mtime: stats.mtime,
                language: detectLanguage(relativePath),
                hash,
            };
            await this.storage.storeFile(codebaseFile);
            // Update vector storage
            await this.updateFileVector(relativePath, content);
            console.error(`[FILE] ${type === 'add' ? 'Added' : 'Updated'}: ${relativePath}`);
        }
        // Rebuild search index (incremental when possible)
        await this.rebuildSearchIndex();
        // Notify callback
        this.onFileChangeCallback?.(event);
    }
    catch (error) {
        // Intentionally log-and-continue: a failing file must not stop watching.
        console.error(`[ERROR] Failed to process file change (${relativePath}):`, error);
    }
}
/**
 * Rebuild search index from current storage.
 * Uses incremental update when possible for performance; falls back to a
 * full rebuild when no incremental engine exists, when there are no
 * pending changes, or when the engine judges the change set too large.
 * pendingFileChanges is always cleared on every path to avoid leaks.
 */
async rebuildSearchIndex() {
    // If no incremental engine or no pending changes, do full rebuild
    if (!this.incrementalEngine || this.pendingFileChanges.length === 0) {
        // CRITICAL: Clear pending changes to prevent memory leak
        // In lowMemoryMode, incrementalEngine is null, so we must clear here
        this.pendingFileChanges = [];
        return this.fullRebuildSearchIndex();
    }
    // Use try/finally to ensure pendingFileChanges is always cleared
    // This prevents memory leak if an exception occurs during rebuild
    try {
        // Check if incremental update is recommended
        if (await this.incrementalEngine.shouldFullRebuild(this.pendingFileChanges)) {
            console.error('[INFO] Changes too extensive, performing full rebuild instead of incremental');
            this.pendingFileChanges = [];
            return this.fullRebuildSearchIndex();
        }
        // Perform incremental update
        const stats = await this.incrementalEngine.applyUpdates(this.pendingFileChanges);
        // Update search index from incremental engine
        const indexData = this.incrementalEngine.getIndex();
        this.searchIndex = {
            documents: indexData.documents,
            idf: indexData.idf,
            totalDocuments: indexData.totalDocuments,
            metadata: {
                generatedAt: new Date().toISOString(),
                version: '1.0.0',
            },
        };
        console.error(`[SUCCESS] Incremental update: ${stats.affectedDocuments} docs, ${stats.affectedTerms} terms, ${stats.updateTime}ms`);
        // Invalidate search cache (index changed)
        this.searchCache.invalidate();
        console.error('[INFO] Search cache invalidated');
        // Persist if using persistent storage
        if (this.storage instanceof PersistentStorage) {
            await this.persistSearchIndex();
        }
    }
    finally {
        // Always clear pending changes to prevent memory leak
        this.pendingFileChanges = [];
    }
}
/**
 * Full rebuild of search index (fallback when incremental not possible).
 * For persistent storage: re-chunks every file via AST, stores chunks,
 * builds per-chunk term-frequency vectors, then rebuilds IDF/TF-IDF,
 * chunk magnitudes, and the average document length in SQLite.
 * For non-persistent storage: rebuilds the in-memory file-level index and
 * a fresh incremental engine. The search cache is invalidated either way.
 */
async fullRebuildSearchIndex() {
    // For persistent storage, rebuild chunk index
    if (this.storage instanceof PersistentStorage) {
        const persistentStorage = this.storage;
        const allFiles = await this.storage.getAllFiles();
        console.error(`[INFO] Full rebuild: re-chunking ${allFiles.length} files...`);
        // Re-chunk all files
        const fileChunks = [];
        for (const file of allFiles) {
            const chunks = await chunkCodeByAST(file.content, file.path);
            const chunkData = chunks.map((chunk) => ({
                content: chunk.content,
                type: chunk.type,
                startLine: chunk.startLine,
                endLine: chunk.endLine,
                metadata: chunk.metadata,
            }));
            fileChunks.push({ filePath: file.path, chunks: chunkData });
        }
        // Store all chunks (this also deletes old chunks)
        const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks);
        // Build TF-IDF vectors for all chunks
        const chunkVectors = [];
        for (const fc of fileChunks) {
            const chunkIds = chunkIdMap.get(fc.filePath);
            if (!chunkIds)
                continue;
            // chunkIds[j] corresponds positionally to fc.chunks[j]
            for (let j = 0; j < fc.chunks.length; j++) {
                const chunk = fc.chunks[j];
                const chunkId = chunkIds[j];
                if (!chunkId)
                    continue;
                const tokens = await tokenize(chunk.content);
                // Raw term frequencies within this chunk
                const termFreq = new Map();
                for (const token of tokens) {
                    termFreq.set(token, (termFreq.get(token) || 0) + 1);
                }
                const totalTerms = tokens.length;
                // Empty chunks contribute nothing to the index
                if (totalTerms === 0)
                    continue;
                const terms = new Map();
                for (const [term, freq] of termFreq) {
                    terms.set(term, {
                        tf: freq / totalTerms,
                        tfidf: 0, // filled in by recalculateTfidfScores() below
                        rawFreq: freq,
                    });
                }
                chunkVectors.push({ chunkId, terms, tokenCount: totalTerms });
            }
        }
        if (chunkVectors.length > 0) {
            await persistentStorage.storeManyChunkVectors(chunkVectors);
        }
        // Rebuild IDF and TF-IDF scores (order matters: IDF before TF-IDF,
        // then magnitudes and average doc length for BM25 scoring)
        await persistentStorage.rebuildIdfScoresFromVectors();
        await persistentStorage.recalculateTfidfScores();
        await persistentStorage.updateChunkMagnitudes();
        await persistentStorage.updateAverageDocLength();
        console.error('[SUCCESS] Full chunk index rebuild complete');
    }
    else {
        // For non-persistent storage, use in-memory index
        const allFiles = await this.storage.getAllFiles();
        const documents = allFiles.map((file) => ({
            uri: `file://${file.path}`,
            content: file.content,
        }));
        this.searchIndex = await buildSearchIndex(documents);
        // Recreate the incremental engine from the fresh index
        this.incrementalEngine = new IncrementalTFIDF(this.searchIndex.documents, this.searchIndex.idf);
    }
    // Invalidate search cache (index changed)
    this.searchCache.invalidate();
    console.error('[INFO] Search cache invalidated');
}
/**
|
|
895
|
+
* Persist search index to storage
|
|
896
|
+
* NOTE: For PersistentStorage, chunk-based indexing happens inline during index()
|
|
897
|
+
* This method is only used for in-memory storage fallback
|
|
898
|
+
*/
|
|
899
|
+
async persistSearchIndex() {
|
|
900
|
+
// For persistent storage, indexing is done inline with chunks
|
|
901
|
+
// This method is kept for compatibility with in-memory storage mode
|
|
902
|
+
if (this.storage instanceof PersistentStorage) {
|
|
903
|
+
// Chunk-based indexing already persisted during index()
|
|
904
|
+
console.error('[INFO] Chunk-based index already persisted');
|
|
905
|
+
return;
|
|
906
|
+
}
|
|
907
|
+
// For non-persistent storage, just store IDF scores if available
|
|
908
|
+
if (this.searchIndex) {
|
|
909
|
+
const docFreq = new Map();
|
|
910
|
+
for (const doc of this.searchIndex.documents) {
|
|
911
|
+
const uniqueTerms = new Set(doc.rawTerms.keys());
|
|
912
|
+
for (const term of uniqueTerms) {
|
|
913
|
+
docFreq.set(term, (docFreq.get(term) || 0) + 1);
|
|
914
|
+
}
|
|
915
|
+
}
|
|
916
|
+
console.error('[INFO] In-memory index built (non-persistent storage)');
|
|
917
|
+
}
|
|
918
|
+
}
|
|
919
|
+
/**
 * Check if currently watching for changes.
 * @returns {boolean} true while the file watcher subscription is active
 */
isWatchEnabled() {
    return this.isWatching;
}
/**
|
|
926
|
+
* Search the codebase
|
|
927
|
+
* Returns chunk-level results when using persistent storage (SQL-based search)
|
|
928
|
+
*/
|
|
929
|
+
async search(query, options = {}) {
|
|
930
|
+
const { limit = 10, includeContent = true } = options;
|
|
931
|
+
const snippetOptions = {
|
|
932
|
+
contextLines: options.contextLines,
|
|
933
|
+
maxChars: options.maxSnippetChars,
|
|
934
|
+
maxBlocks: options.maxSnippetBlocks,
|
|
935
|
+
};
|
|
936
|
+
// Create cache key from query and options
|
|
937
|
+
const cacheKey = createCacheKey(query, {
|
|
938
|
+
limit,
|
|
939
|
+
fileExtensions: options.fileExtensions,
|
|
940
|
+
pathFilter: options.pathFilter,
|
|
941
|
+
excludePaths: options.excludePaths,
|
|
942
|
+
...snippetOptions,
|
|
943
|
+
});
|
|
944
|
+
// Check cache first
|
|
945
|
+
const cachedResults = this.searchCache.get(cacheKey);
|
|
946
|
+
if (cachedResults) {
|
|
947
|
+
console.error(`[CACHE HIT] Query: "${query}"`);
|
|
948
|
+
return cachedResults;
|
|
949
|
+
}
|
|
950
|
+
console.error(`[CACHE MISS] Query: "${query}"`);
|
|
951
|
+
// Use chunk-based SQL search in low memory mode (Memory optimization)
|
|
952
|
+
if (this.lowMemoryMode && this.storage instanceof PersistentStorage) {
|
|
953
|
+
const searchResults = await this.searchChunks(query, options);
|
|
954
|
+
this.searchCache.set(cacheKey, searchResults);
|
|
955
|
+
return searchResults;
|
|
956
|
+
}
|
|
957
|
+
// In-memory search (faster but uses more memory) - file-level
|
|
958
|
+
let results;
|
|
959
|
+
if (!this.searchIndex) {
|
|
960
|
+
throw new Error('Codebase not indexed. Please run index() first.');
|
|
961
|
+
}
|
|
962
|
+
const searchIndex = this.searchIndex;
|
|
963
|
+
results = await import('./tfidf.js').then((m) => m.searchDocuments(query, searchIndex, { limit }));
|
|
964
|
+
// Get file content and apply filters (in-memory mode)
|
|
965
|
+
const searchResults = [];
|
|
966
|
+
for (const result of results) {
|
|
967
|
+
const filePath = result.uri.replace('file://', '');
|
|
968
|
+
// Apply filters
|
|
969
|
+
if (options.fileExtensions && options.fileExtensions.length > 0) {
|
|
970
|
+
if (!options.fileExtensions.some((ext) => filePath.endsWith(ext))) {
|
|
971
|
+
continue;
|
|
972
|
+
}
|
|
973
|
+
}
|
|
974
|
+
if (options.pathFilter && !filePath.includes(options.pathFilter)) {
|
|
975
|
+
continue;
|
|
976
|
+
}
|
|
977
|
+
if (options.excludePaths && options.excludePaths.length > 0) {
|
|
978
|
+
if (options.excludePaths.some((exclude) => filePath.includes(exclude))) {
|
|
979
|
+
continue;
|
|
980
|
+
}
|
|
981
|
+
}
|
|
982
|
+
const file = await this.storage.getFile(filePath);
|
|
983
|
+
if (!file)
|
|
984
|
+
continue;
|
|
985
|
+
const searchResult = {
|
|
986
|
+
path: file.path,
|
|
987
|
+
score: result.score,
|
|
988
|
+
matchedTerms: result.matchedTerms,
|
|
989
|
+
language: file.language,
|
|
990
|
+
size: file.size,
|
|
991
|
+
};
|
|
992
|
+
if (includeContent) {
|
|
993
|
+
searchResult.snippet = this.extractSnippet(file.content, result.matchedTerms, snippetOptions);
|
|
994
|
+
}
|
|
995
|
+
searchResults.push(searchResult);
|
|
996
|
+
}
|
|
997
|
+
const finalResults = searchResults.slice(0, limit);
|
|
998
|
+
// Store in cache
|
|
999
|
+
this.searchCache.set(cacheKey, finalResults);
|
|
1000
|
+
return finalResults;
|
|
1001
|
+
}
|
|
1002
|
+
/**
 * Chunk-based search with BM25 scoring.
 * Returns chunk content directly (no separate snippet extraction needed).
 * Candidates come from SQL term matching (3x over-fetch so post-filtering
 * can still fill `limit`); BM25 is computed here from stored term stats.
 * NOTE(review): assumes this.storage is a PersistentStorage — the caller
 * guards on `lowMemoryMode && storage instanceof PersistentStorage`.
 * @param {string} query - free-text search query
 * @param {object} options - limit, includeContent, file/path filters
 * @returns {Promise<Array>} ranked chunk-level results
 */
async searchChunks(query, options) {
    const { limit = 10, includeContent = true } = options;
    const persistentStorage = this.storage;
    // Tokenize query
    const queryTokens = await getQueryTokens(query);
    if (queryTokens.length === 0) {
        return [];
    }
    // Get matching chunks from storage (already includes content)
    const candidates = await persistentStorage.searchByTerms(queryTokens, { limit: limit * 3 });
    // Get IDF scores for query terms
    const idf = await persistentStorage.getIdfScoresForTerms(queryTokens);
    // Get average document length for BM25
    const avgDocLength = await persistentStorage.getAverageDocLength();
    // BM25 parameters (standard Okapi defaults)
    const k1 = 1.2;
    const b = 0.75;
    // Calculate BM25 scores for each chunk
    const scoredResults = [];
    for (const chunk of candidates) {
        // Apply filters
        if (options.fileExtensions && options.fileExtensions.length > 0) {
            if (!options.fileExtensions.some((ext) => chunk.filePath.endsWith(ext))) {
                continue;
            }
        }
        if (options.pathFilter && !chunk.filePath.includes(options.pathFilter)) {
            continue;
        }
        if (options.excludePaths && options.excludePaths.length > 0) {
            if (options.excludePaths.some((exclude) => chunk.filePath.includes(exclude))) {
                continue;
            }
        }
        // Calculate BM25 score
        let score = 0;
        const matchedTerms = [];
        for (const term of queryTokens) {
            const termData = chunk.matchedTerms.get(term);
            if (!termData)
                continue;
            matchedTerms.push(term);
            // Missing IDF falls back to 1 (neutral weight)
            const termIdf = idf.get(term) || 1;
            // BM25 formula: idf * tf*(k1+1) / (tf + k1*(1 - b + b*docLen/avgLen))
            const tf = termData.rawFreq;
            const docLen = chunk.tokenCount || 1;
            const numerator = tf * (k1 + 1);
            const denominator = tf + k1 * (1 - b + b * (docLen / (avgDocLength || 1)));
            score += termIdf * (numerator / denominator);
        }
        if (matchedTerms.length > 0) {
            scoredResults.push({ chunk, score, matchedTerms });
        }
    }
    // Sort by score descending
    scoredResults.sort((a, b) => b.score - a.score);
    // Convert to SearchResult format
    const results = [];
    for (const { chunk, score, matchedTerms } of scoredResults.slice(0, limit)) {
        const result = {
            path: chunk.filePath,
            score,
            matchedTerms,
            language: detectLanguage(chunk.filePath),
            size: chunk.content.length,
            // Include chunk metadata
            chunkType: chunk.type,
            startLine: chunk.startLine,
            endLine: chunk.endLine,
        };
        if (includeContent) {
            // Chunk content is the snippet - add line numbers
            const lines = chunk.content.split('\n');
            result.snippet = lines.map((line, i) => `${chunk.startLine + i}: ${line}`).join('\n');
        }
        results.push(result);
    }
    console.error(`[BM25 CHUNK SEARCH] Found ${results.length} chunks`);
    return results;
}
/**
 * Extract code block snippets from content around matched terms.
 *
 * Returns the most relevant code blocks (not just lines) with context.
 * Blocks are ranked by term density (more matched terms = higher score).
 * Output lines are prefixed with their 1-based line number; blocks are
 * joined with a '...' separator.
 * @param {string} content - full file content
 * @param {string[]} matchedTerms - query terms (matched case-insensitively)
 * @param {object} [options] - contextLines, maxChars, maxBlocks
 * @returns {string} snippet text (first 5 lines of the file if no term matches)
 */
extractSnippet(content, matchedTerms, options = {}) {
    const { contextLines = 3, maxChars = 2000, maxBlocks = 4 } = options;
    const lines = content.split('\n');
    // Step 1: Find all lines with matches and score them
    const matchedLineInfos = [];
    for (let i = 0; i < lines.length; i++) {
        const lineLower = lines[i].toLowerCase();
        const termsInLine = matchedTerms.filter((term) => lineLower.includes(term.toLowerCase()));
        if (termsInLine.length > 0) {
            matchedLineInfos.push({
                lineNum: i,
                score: termsInLine.length,
                matchedTerms: termsInLine,
            });
        }
    }
    if (matchedLineInfos.length === 0) {
        // Return first few lines if no matches found
        return lines.slice(0, 5).join('\n');
    }
    // Step 2: Expand each matched line into a context window and merge
    // overlapping/adjacent windows into blocks. Infos are produced in
    // ascending line order, so each window merges with at most one block.
    const blocks = [];
    for (const info of matchedLineInfos) {
        const start = Math.max(0, info.lineNum - contextLines);
        const end = Math.min(lines.length - 1, info.lineNum + contextLines);
        // Try to merge with existing block if overlapping
        let merged = false;
        for (const block of blocks) {
            if (start <= block.end + 1 && end >= block.start - 1) {
                // Overlapping or adjacent - merge
                block.start = Math.min(block.start, start);
                block.end = Math.max(block.end, end);
                block.score += info.score;
                for (const term of info.matchedTerms) {
                    block.matchedTerms.add(term);
                }
                merged = true;
                break;
            }
        }
        if (!merged) {
            blocks.push({
                start,
                end,
                score: info.score,
                matchedTerms: new Set(info.matchedTerms),
            });
        }
    }
    // Step 3: Sort blocks by unique terms (primary) and density (secondary)
    // Unique terms = how many different query terms appear in block
    // Density = unique terms / block size (prefer compact blocks)
    blocks.sort((a, b) => {
        const uniqueA = a.matchedTerms.size;
        const uniqueB = b.matchedTerms.size;
        if (uniqueA !== uniqueB) {
            return uniqueB - uniqueA; // More unique terms = better
        }
        // Tie-break: prefer denser blocks (more terms per line)
        const densityA = uniqueA / (a.end - a.start + 1);
        const densityB = uniqueB / (b.end - b.start + 1);
        return densityB - densityA;
    });
    const topBlocks = blocks.slice(0, maxBlocks);
    // Sort by position for output (top to bottom in file)
    topBlocks.sort((a, b) => a.start - b.start);
    // Step 4: Build output with character limit
    const snippetParts = [];
    let totalChars = 0;
    for (const block of topBlocks) {
        const blockLines = lines.slice(block.start, block.end + 1);
        // +1 converts the 0-based index to a 1-based line number prefix
        const blockContent = blockLines.map((line, i) => `${block.start + i + 1}: ${line}`).join('\n');
        // Check if adding this block would exceed limit
        // (the first block is always kept, even if it alone exceeds maxChars)
        if (totalChars + blockContent.length > maxChars && snippetParts.length > 0) {
            break;
        }
        snippetParts.push(blockContent);
        totalChars += blockContent.length;
    }
    return snippetParts.join('\n...\n');
}
/**
|
|
1173
|
+
* Get file content
|
|
1174
|
+
*/
|
|
1175
|
+
async getFileContent(filePath) {
|
|
1176
|
+
const file = await this.storage.getFile(filePath);
|
|
1177
|
+
return file?.content || null;
|
|
1178
|
+
}
|
|
1179
|
+
/**
 * Get total indexed files count.
 * @returns {Promise<number>} number of files currently in storage
 */
async getIndexedCount() {
    return this.storage.count();
}
/**
 * Get vector storage (for hybrid search).
 * @returns the vector storage instance, or undefined when vector search
 *   is not configured
 */
getVectorStorage() {
    return this.vectorStorage;
}
/**
 * Get embedding provider (for hybrid search).
 * @returns the embedding provider instance, or undefined when embeddings
 *   are not configured
 */
getEmbeddingProvider() {
    return this.embeddingProvider;
}
/**
 * Build vector index from file metadata (Memory optimization).
 * Generates embeddings per CHUNK, not per file.
 * No-op unless both an embedding provider and vector storage are set.
 * Embedding failures are logged per batch and do not abort the build.
 * @param {Array} files - file metadata entries (reads .path and .absolutePath)
 */
async buildVectorIndexFromMetadata(files) {
    if (!this.embeddingProvider || !this.vectorStorage) {
        return;
    }
    console.error('[INFO] Generating embeddings for vector search (chunk-level)...');
    const startTime = Date.now();
    let totalChunks = 0;
    let processedChunks = 0;
    // First pass: collect all chunks so totals can be reported up front
    const allChunks = [];
    for (const metadata of files) {
        const content = readFileContent(metadata.absolutePath);
        // Unreadable files are silently skipped
        if (content === null)
            continue;
        // Chunk the file using AST
        const chunks = await chunkCodeByAST(content, metadata.path);
        for (let i = 0; i < chunks.length; i++) {
            const chunk = chunks[i];
            allChunks.push({
                // Stable id encodes file path and line range
                id: `chunk://${metadata.path}:${chunk.startLine}-${chunk.endLine}`,
                content: chunk.content,
                metadata,
                chunkType: chunk.type,
                startLine: chunk.startLine,
                endLine: chunk.endLine,
            });
        }
    }
    totalChunks = allChunks.length;
    console.error(`[INFO] Total chunks to embed: ${totalChunks}`);
    // Process chunks in batches (batch size from this.vectorBatchSize)
    const batchSize = this.vectorBatchSize;
    for (let i = 0; i < allChunks.length; i += batchSize) {
        const batch = allChunks.slice(i, i + batchSize);
        try {
            // Generate embeddings for batch
            const embeddings = await this.embeddingProvider.generateEmbeddings(batch.map((c) => c.content));
            // Add to vector storage (one addDocument call per chunk)
            for (let j = 0; j < batch.length; j++) {
                const chunk = batch[j];
                const embedding = embeddings[j];
                const doc = {
                    id: chunk.id,
                    embedding,
                    metadata: {
                        type: 'code',
                        chunkType: chunk.chunkType,
                        language: chunk.metadata.language,
                        content: chunk.content.substring(0, 500), // Preview
                        path: chunk.metadata.path,
                        startLine: chunk.startLine,
                        endLine: chunk.endLine,
                    },
                };
                await this.vectorStorage.addDocument(doc);
            }
            processedChunks += batch.length;
            console.error(`[INFO] Generated embeddings: ${processedChunks}/${totalChunks} chunks`);
        }
        catch (error) {
            console.error(`[ERROR] Failed to generate embeddings for batch ${i}:`, error);
            // Continue with next batch
        }
    }
    // LanceDB auto-persists, no need to save
    const elapsedTime = Date.now() - startTime;
    console.error(`[SUCCESS] Vector index built (${processedChunks} chunks from ${files.length} files, ${elapsedTime}ms)`);
}
/**
 * Update vectors for a single file (chunk-level).
 * Deletes old chunks and adds new ones.
 * NOTE(review): deleteFileVector is currently a logged no-op (see its
 * TODO), so stale chunk vectors may remain until it is implemented.
 * Errors are logged and swallowed so watch processing continues.
 * @param {string} filePath - relative file path
 * @param {string} content - full file content
 */
async updateFileVector(filePath, content) {
    if (!this.embeddingProvider || !this.vectorStorage) {
        return;
    }
    try {
        // Delete existing chunks for this file
        await this.deleteFileVector(filePath);
        // Chunk the file using AST
        const chunks = await chunkCodeByAST(content, filePath);
        const language = detectLanguage(filePath);
        // Generate embeddings for all chunks in one provider call
        const embeddings = await this.embeddingProvider.generateEmbeddings(chunks.map((c) => c.content));
        // Add each chunk to vector storage
        for (let i = 0; i < chunks.length; i++) {
            const chunk = chunks[i];
            const embedding = embeddings[i];
            const doc = {
                // Stable id encodes file path and line range
                id: `chunk://${filePath}:${chunk.startLine}-${chunk.endLine}`,
                embedding,
                metadata: {
                    type: 'code',
                    chunkType: chunk.type,
                    language,
                    content: chunk.content.substring(0, 500),
                    path: filePath,
                    startLine: chunk.startLine,
                    endLine: chunk.endLine,
                },
            };
            await this.vectorStorage.addDocument(doc);
        }
        console.error(`[VECTOR] Updated: ${filePath} (${chunks.length} chunks)`);
    }
    catch (error) {
        console.error(`[ERROR] Failed to update vector for ${filePath}:`, error);
    }
}
/**
 * Delete vectors for a file (all chunks).
 * Currently a stub: it only logs the intent — no vectors are actually
 * removed (see the TODO below).
 * @param {string} filePath - relative file path
 */
async deleteFileVector(filePath) {
    if (!this.vectorStorage) {
        return;
    }
    // Delete all chunks that belong to this file
    // Vector IDs are in format: chunk://path:startLine-endLine
    // We need to query and delete all matching the path prefix
    try {
        // LanceDB doesn't have a prefix delete, so we search and delete individually
        // For now, we'll rely on the addDocument overwriting or use a workaround
        // TODO: Implement proper chunk deletion in VectorStorage
        console.error(`[VECTOR] Deleting chunks for: ${filePath}`);
    }
    catch (error) {
        console.error(`[ERROR] Failed to delete vectors for ${filePath}:`, error);
    }
}
|
|
1330
|
+
}
|
|
1331
|
+
//# sourceMappingURL=indexer.js.map
|