@mhalder/qdrant-mcp-server 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.codecov.yml +16 -0
- package/CHANGELOG.md +18 -0
- package/README.md +236 -9
- package/build/code/chunker/base.d.ts +19 -0
- package/build/code/chunker/base.d.ts.map +1 -0
- package/build/code/chunker/base.js +5 -0
- package/build/code/chunker/base.js.map +1 -0
- package/build/code/chunker/character-chunker.d.ts +22 -0
- package/build/code/chunker/character-chunker.d.ts.map +1 -0
- package/build/code/chunker/character-chunker.js +111 -0
- package/build/code/chunker/character-chunker.js.map +1 -0
- package/build/code/chunker/tree-sitter-chunker.d.ts +29 -0
- package/build/code/chunker/tree-sitter-chunker.d.ts.map +1 -0
- package/build/code/chunker/tree-sitter-chunker.js +213 -0
- package/build/code/chunker/tree-sitter-chunker.js.map +1 -0
- package/build/code/config.d.ts +11 -0
- package/build/code/config.d.ts.map +1 -0
- package/build/code/config.js +145 -0
- package/build/code/config.js.map +1 -0
- package/build/code/indexer.d.ts +42 -0
- package/build/code/indexer.d.ts.map +1 -0
- package/build/code/indexer.js +508 -0
- package/build/code/indexer.js.map +1 -0
- package/build/code/metadata.d.ts +32 -0
- package/build/code/metadata.d.ts.map +1 -0
- package/build/code/metadata.js +128 -0
- package/build/code/metadata.js.map +1 -0
- package/build/code/scanner.d.ts +35 -0
- package/build/code/scanner.d.ts.map +1 -0
- package/build/code/scanner.js +108 -0
- package/build/code/scanner.js.map +1 -0
- package/build/code/sync/merkle.d.ts +45 -0
- package/build/code/sync/merkle.d.ts.map +1 -0
- package/build/code/sync/merkle.js +116 -0
- package/build/code/sync/merkle.js.map +1 -0
- package/build/code/sync/snapshot.d.ts +41 -0
- package/build/code/sync/snapshot.d.ts.map +1 -0
- package/build/code/sync/snapshot.js +91 -0
- package/build/code/sync/snapshot.js.map +1 -0
- package/build/code/sync/synchronizer.d.ts +53 -0
- package/build/code/sync/synchronizer.d.ts.map +1 -0
- package/build/code/sync/synchronizer.js +132 -0
- package/build/code/sync/synchronizer.js.map +1 -0
- package/build/code/types.d.ts +98 -0
- package/build/code/types.d.ts.map +1 -0
- package/build/code/types.js +5 -0
- package/build/code/types.js.map +1 -0
- package/build/index.js +250 -0
- package/build/index.js.map +1 -1
- package/examples/code-search/README.md +271 -0
- package/package.json +13 -1
- package/src/code/chunker/base.ts +22 -0
- package/src/code/chunker/character-chunker.ts +131 -0
- package/src/code/chunker/tree-sitter-chunker.ts +250 -0
- package/src/code/config.ts +156 -0
- package/src/code/indexer.ts +613 -0
- package/src/code/metadata.ts +153 -0
- package/src/code/scanner.ts +124 -0
- package/src/code/sync/merkle.ts +136 -0
- package/src/code/sync/snapshot.ts +110 -0
- package/src/code/sync/synchronizer.ts +154 -0
- package/src/code/types.ts +117 -0
- package/src/index.ts +296 -0
- package/tests/code/chunker/character-chunker.test.ts +141 -0
- package/tests/code/chunker/tree-sitter-chunker.test.ts +275 -0
- package/tests/code/fixtures/sample-py/calculator.py +32 -0
- package/tests/code/fixtures/sample-ts/async-operations.ts +120 -0
- package/tests/code/fixtures/sample-ts/auth.ts +31 -0
- package/tests/code/fixtures/sample-ts/config.ts +52 -0
- package/tests/code/fixtures/sample-ts/database.ts +50 -0
- package/tests/code/fixtures/sample-ts/index.ts +39 -0
- package/tests/code/fixtures/sample-ts/types-advanced.ts +132 -0
- package/tests/code/fixtures/sample-ts/utils.ts +105 -0
- package/tests/code/fixtures/sample-ts/validator.ts +169 -0
- package/tests/code/indexer.test.ts +828 -0
- package/tests/code/integration.test.ts +708 -0
- package/tests/code/metadata.test.ts +457 -0
- package/tests/code/scanner.test.ts +131 -0
- package/tests/code/sync/merkle.test.ts +406 -0
- package/tests/code/sync/snapshot.test.ts +360 -0
- package/tests/code/sync/synchronizer.test.ts +501 -0
- package/vitest.config.ts +1 -0
|
@@ -0,0 +1,613 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CodeIndexer - Main orchestrator for code vectorization
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { createHash } from "node:crypto";
|
|
6
|
+
import { promises as fs } from "node:fs";
|
|
7
|
+
import { extname, join, relative, resolve } from "node:path";
|
|
8
|
+
import type { EmbeddingProvider } from "../embeddings/base.js";
|
|
9
|
+
import { BM25SparseVectorGenerator } from "../embeddings/sparse.js";
|
|
10
|
+
import type { QdrantManager } from "../qdrant/client.js";
|
|
11
|
+
import { TreeSitterChunker } from "./chunker/tree-sitter-chunker.js";
|
|
12
|
+
import { MetadataExtractor } from "./metadata.js";
|
|
13
|
+
import { FileScanner } from "./scanner.js";
|
|
14
|
+
import { FileSynchronizer } from "./sync/synchronizer.js";
|
|
15
|
+
import type {
|
|
16
|
+
ChangeStats,
|
|
17
|
+
CodeChunk,
|
|
18
|
+
CodeConfig,
|
|
19
|
+
CodeSearchResult,
|
|
20
|
+
IndexOptions,
|
|
21
|
+
IndexStats,
|
|
22
|
+
IndexStatus,
|
|
23
|
+
ProgressCallback,
|
|
24
|
+
SearchOptions,
|
|
25
|
+
} from "./types.js";
|
|
26
|
+
|
|
27
|
+
/**
 * Orchestrates codebase vectorization end to end: file scanning, chunking,
 * embedding, storage in Qdrant, incremental re-indexing and semantic search.
 */
export class CodeIndexer {
  constructor(
    // Vector-store client used for collection management, upserts and search.
    private qdrant: QdrantManager,
    // Dense-embedding backend (embed / embedBatch / getDimensions).
    private embeddings: EmbeddingProvider,
    // Chunking, batching, ignore-pattern, limit and hybrid-search settings.
    private config: CodeConfig
  ) {}
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Validate that a path doesn't attempt directory traversal
|
|
36
|
+
* @throws Error if path traversal is detected
|
|
37
|
+
*/
|
|
38
|
+
private async validatePath(path: string): Promise<string> {
|
|
39
|
+
const absolutePath = resolve(path);
|
|
40
|
+
|
|
41
|
+
try {
|
|
42
|
+
// Resolve the real path (follows symlinks)
|
|
43
|
+
const realPath = await fs.realpath(absolutePath);
|
|
44
|
+
|
|
45
|
+
// For now, we just ensure the path exists and is resolved
|
|
46
|
+
// In a more restrictive environment, you could check against an allowlist
|
|
47
|
+
return realPath;
|
|
48
|
+
} catch (error) {
|
|
49
|
+
// If realpath fails, the path doesn't exist yet or is invalid
|
|
50
|
+
// For operations like indexing, we still need to accept non-existent paths
|
|
51
|
+
// so we just return the resolved absolute path
|
|
52
|
+
return absolutePath;
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Index a codebase from scratch or force re-index
|
|
58
|
+
*/
|
|
59
|
+
async indexCodebase(
|
|
60
|
+
path: string,
|
|
61
|
+
options?: IndexOptions,
|
|
62
|
+
progressCallback?: ProgressCallback
|
|
63
|
+
): Promise<IndexStats> {
|
|
64
|
+
const startTime = Date.now();
|
|
65
|
+
const stats: IndexStats = {
|
|
66
|
+
filesScanned: 0,
|
|
67
|
+
filesIndexed: 0,
|
|
68
|
+
chunksCreated: 0,
|
|
69
|
+
durationMs: 0,
|
|
70
|
+
status: "completed",
|
|
71
|
+
errors: [],
|
|
72
|
+
};
|
|
73
|
+
|
|
74
|
+
try {
|
|
75
|
+
const absolutePath = await this.validatePath(path);
|
|
76
|
+
|
|
77
|
+
// 1. Scan files
|
|
78
|
+
progressCallback?.({
|
|
79
|
+
phase: "scanning",
|
|
80
|
+
current: 0,
|
|
81
|
+
total: 100,
|
|
82
|
+
percentage: 0,
|
|
83
|
+
message: "Scanning files...",
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
const scanner = new FileScanner({
|
|
87
|
+
supportedExtensions: options?.extensions || this.config.supportedExtensions,
|
|
88
|
+
ignorePatterns: this.config.ignorePatterns,
|
|
89
|
+
customIgnorePatterns: options?.ignorePatterns || this.config.customIgnorePatterns,
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
await scanner.loadIgnorePatterns(absolutePath);
|
|
93
|
+
const files = await scanner.scanDirectory(absolutePath);
|
|
94
|
+
|
|
95
|
+
stats.filesScanned = files.length;
|
|
96
|
+
|
|
97
|
+
if (files.length === 0) {
|
|
98
|
+
stats.status = "completed";
|
|
99
|
+
stats.durationMs = Date.now() - startTime;
|
|
100
|
+
return stats;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// 2. Create or verify collection
|
|
104
|
+
const collectionName = this.getCollectionName(absolutePath);
|
|
105
|
+
const collectionExists = await this.qdrant.collectionExists(collectionName);
|
|
106
|
+
|
|
107
|
+
if (options?.forceReindex && collectionExists) {
|
|
108
|
+
await this.qdrant.deleteCollection(collectionName);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
if (!collectionExists || options?.forceReindex) {
|
|
112
|
+
const vectorSize = this.embeddings.getDimensions();
|
|
113
|
+
await this.qdrant.createCollection(
|
|
114
|
+
collectionName,
|
|
115
|
+
vectorSize,
|
|
116
|
+
"Cosine",
|
|
117
|
+
this.config.enableHybridSearch
|
|
118
|
+
);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// 3. Process files and create chunks
|
|
122
|
+
const chunker = new TreeSitterChunker({
|
|
123
|
+
chunkSize: this.config.chunkSize,
|
|
124
|
+
chunkOverlap: this.config.chunkOverlap,
|
|
125
|
+
maxChunkSize: this.config.chunkSize * 2,
|
|
126
|
+
});
|
|
127
|
+
const metadataExtractor = new MetadataExtractor();
|
|
128
|
+
const allChunks: Array<{ chunk: CodeChunk; id: string }> = [];
|
|
129
|
+
|
|
130
|
+
for (const [index, filePath] of files.entries()) {
|
|
131
|
+
try {
|
|
132
|
+
progressCallback?.({
|
|
133
|
+
phase: "chunking",
|
|
134
|
+
current: index + 1,
|
|
135
|
+
total: files.length,
|
|
136
|
+
percentage: Math.round(((index + 1) / files.length) * 40), // 0-40%
|
|
137
|
+
message: `Chunking file ${index + 1}/${files.length}`,
|
|
138
|
+
});
|
|
139
|
+
|
|
140
|
+
const code = await fs.readFile(filePath, "utf-8");
|
|
141
|
+
|
|
142
|
+
// Check for secrets (basic detection)
|
|
143
|
+
if (metadataExtractor.containsSecrets(code)) {
|
|
144
|
+
stats.errors?.push(`Skipped ${filePath}: potential secrets detected`);
|
|
145
|
+
continue;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
const language = metadataExtractor.extractLanguage(filePath);
|
|
149
|
+
const chunks = await chunker.chunk(code, filePath, language);
|
|
150
|
+
|
|
151
|
+
// Apply chunk limits if configured
|
|
152
|
+
const chunksToAdd = this.config.maxChunksPerFile
|
|
153
|
+
? chunks.slice(0, this.config.maxChunksPerFile)
|
|
154
|
+
: chunks;
|
|
155
|
+
|
|
156
|
+
for (const chunk of chunksToAdd) {
|
|
157
|
+
const id = metadataExtractor.generateChunkId(chunk);
|
|
158
|
+
allChunks.push({ chunk, id });
|
|
159
|
+
|
|
160
|
+
// Check total chunk limit
|
|
161
|
+
if (this.config.maxTotalChunks && allChunks.length >= this.config.maxTotalChunks) {
|
|
162
|
+
break;
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
stats.filesIndexed++;
|
|
167
|
+
|
|
168
|
+
// Check total chunk limit
|
|
169
|
+
if (this.config.maxTotalChunks && allChunks.length >= this.config.maxTotalChunks) {
|
|
170
|
+
break;
|
|
171
|
+
}
|
|
172
|
+
} catch (error) {
|
|
173
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
174
|
+
stats.errors?.push(`Failed to process ${filePath}: ${errorMessage}`);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
stats.chunksCreated = allChunks.length;
|
|
179
|
+
|
|
180
|
+
// Save snapshot for incremental updates (even if no chunks were created)
|
|
181
|
+
try {
|
|
182
|
+
const synchronizer = new FileSynchronizer(absolutePath, collectionName);
|
|
183
|
+
await synchronizer.updateSnapshot(files);
|
|
184
|
+
} catch (error) {
|
|
185
|
+
// Snapshot failure shouldn't fail the entire indexing
|
|
186
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
187
|
+
console.error("Failed to save snapshot:", errorMessage);
|
|
188
|
+
stats.errors?.push(`Snapshot save failed: ${errorMessage}`);
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
if (allChunks.length === 0) {
|
|
192
|
+
stats.status = "completed";
|
|
193
|
+
stats.durationMs = Date.now() - startTime;
|
|
194
|
+
return stats;
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
// 4. Generate embeddings and store in batches
|
|
198
|
+
const batchSize = this.config.batchSize;
|
|
199
|
+
for (let i = 0; i < allChunks.length; i += batchSize) {
|
|
200
|
+
const batch = allChunks.slice(i, i + batchSize);
|
|
201
|
+
|
|
202
|
+
progressCallback?.({
|
|
203
|
+
phase: "embedding",
|
|
204
|
+
current: i + batch.length,
|
|
205
|
+
total: allChunks.length,
|
|
206
|
+
percentage: 40 + Math.round(((i + batch.length) / allChunks.length) * 30), // 40-70%
|
|
207
|
+
message: `Generating embeddings ${i + batch.length}/${allChunks.length}`,
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
try {
|
|
211
|
+
const texts = batch.map((b) => b.chunk.content);
|
|
212
|
+
const embeddings = await this.embeddings.embedBatch(texts);
|
|
213
|
+
|
|
214
|
+
// 5. Store to Qdrant
|
|
215
|
+
const points = batch.map((b, idx) => ({
|
|
216
|
+
id: b.id,
|
|
217
|
+
vector: embeddings[idx].embedding,
|
|
218
|
+
payload: {
|
|
219
|
+
content: b.chunk.content,
|
|
220
|
+
relativePath: relative(absolutePath, b.chunk.metadata.filePath),
|
|
221
|
+
startLine: b.chunk.startLine,
|
|
222
|
+
endLine: b.chunk.endLine,
|
|
223
|
+
fileExtension: extname(b.chunk.metadata.filePath),
|
|
224
|
+
language: b.chunk.metadata.language,
|
|
225
|
+
codebasePath: absolutePath,
|
|
226
|
+
chunkIndex: b.chunk.metadata.chunkIndex,
|
|
227
|
+
...(b.chunk.metadata.name && { name: b.chunk.metadata.name }),
|
|
228
|
+
...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType }),
|
|
229
|
+
},
|
|
230
|
+
}));
|
|
231
|
+
|
|
232
|
+
progressCallback?.({
|
|
233
|
+
phase: "storing",
|
|
234
|
+
current: i + batch.length,
|
|
235
|
+
total: allChunks.length,
|
|
236
|
+
percentage: 70 + Math.round(((i + batch.length) / allChunks.length) * 30), // 70-100%
|
|
237
|
+
message: `Storing chunks ${i + batch.length}/${allChunks.length}`,
|
|
238
|
+
});
|
|
239
|
+
|
|
240
|
+
if (this.config.enableHybridSearch) {
|
|
241
|
+
// Generate sparse vectors for hybrid search
|
|
242
|
+
const sparseGenerator = new BM25SparseVectorGenerator();
|
|
243
|
+
const hybridPoints = batch.map((b, idx) => ({
|
|
244
|
+
id: b.id,
|
|
245
|
+
vector: embeddings[idx].embedding,
|
|
246
|
+
sparseVector: sparseGenerator.generate(b.chunk.content),
|
|
247
|
+
payload: {
|
|
248
|
+
content: b.chunk.content,
|
|
249
|
+
relativePath: relative(absolutePath, b.chunk.metadata.filePath),
|
|
250
|
+
startLine: b.chunk.startLine,
|
|
251
|
+
endLine: b.chunk.endLine,
|
|
252
|
+
fileExtension: extname(b.chunk.metadata.filePath),
|
|
253
|
+
language: b.chunk.metadata.language,
|
|
254
|
+
codebasePath: absolutePath,
|
|
255
|
+
chunkIndex: b.chunk.metadata.chunkIndex,
|
|
256
|
+
...(b.chunk.metadata.name && { name: b.chunk.metadata.name }),
|
|
257
|
+
...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType }),
|
|
258
|
+
},
|
|
259
|
+
}));
|
|
260
|
+
|
|
261
|
+
await this.qdrant.addPointsWithSparse(collectionName, hybridPoints);
|
|
262
|
+
} else {
|
|
263
|
+
await this.qdrant.addPoints(collectionName, points);
|
|
264
|
+
}
|
|
265
|
+
} catch (error) {
|
|
266
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
267
|
+
stats.errors?.push(`Failed to process batch at index ${i}: ${errorMessage}`);
|
|
268
|
+
stats.status = "partial";
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
stats.durationMs = Date.now() - startTime;
|
|
273
|
+
return stats;
|
|
274
|
+
} catch (error) {
|
|
275
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
276
|
+
stats.status = "failed";
|
|
277
|
+
stats.errors?.push(`Indexing failed: ${errorMessage}`);
|
|
278
|
+
stats.durationMs = Date.now() - startTime;
|
|
279
|
+
return stats;
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
/**
|
|
284
|
+
* Search code semantically
|
|
285
|
+
*/
|
|
286
|
+
async searchCode(
|
|
287
|
+
path: string,
|
|
288
|
+
query: string,
|
|
289
|
+
options?: SearchOptions
|
|
290
|
+
): Promise<CodeSearchResult[]> {
|
|
291
|
+
const absolutePath = await this.validatePath(path);
|
|
292
|
+
const collectionName = this.getCollectionName(absolutePath);
|
|
293
|
+
|
|
294
|
+
// Check if collection exists
|
|
295
|
+
const exists = await this.qdrant.collectionExists(collectionName);
|
|
296
|
+
if (!exists) {
|
|
297
|
+
throw new Error(`Codebase not indexed: ${path}`);
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
// Check if collection has hybrid search enabled
|
|
301
|
+
const collectionInfo = await this.qdrant.getCollectionInfo(collectionName);
|
|
302
|
+
const useHybrid =
|
|
303
|
+
(options?.useHybrid ?? this.config.enableHybridSearch) && collectionInfo.hybridEnabled;
|
|
304
|
+
|
|
305
|
+
// Generate query embedding
|
|
306
|
+
const { embedding } = await this.embeddings.embed(query);
|
|
307
|
+
|
|
308
|
+
// Build filter
|
|
309
|
+
let filter: any;
|
|
310
|
+
if (options?.fileTypes || options?.pathPattern) {
|
|
311
|
+
filter = { must: [] };
|
|
312
|
+
|
|
313
|
+
if (options.fileTypes && options.fileTypes.length > 0) {
|
|
314
|
+
filter.must.push({
|
|
315
|
+
key: "fileExtension",
|
|
316
|
+
match: { any: options.fileTypes },
|
|
317
|
+
});
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
if (options.pathPattern) {
|
|
321
|
+
// Convert glob pattern to regex (simplified)
|
|
322
|
+
const regex = options.pathPattern
|
|
323
|
+
.replace(/\./g, "\\.")
|
|
324
|
+
.replace(/\*\*/g, ".*")
|
|
325
|
+
.replace(/\*/g, "[^/]*")
|
|
326
|
+
.replace(/\?/g, ".");
|
|
327
|
+
|
|
328
|
+
filter.must.push({
|
|
329
|
+
key: "relativePath",
|
|
330
|
+
match: { text: regex },
|
|
331
|
+
});
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
// Search with hybrid or standard search
|
|
336
|
+
let results;
|
|
337
|
+
if (useHybrid) {
|
|
338
|
+
const sparseGenerator = new BM25SparseVectorGenerator();
|
|
339
|
+
const sparseVector = sparseGenerator.generate(query);
|
|
340
|
+
results = await this.qdrant.hybridSearch(
|
|
341
|
+
collectionName,
|
|
342
|
+
embedding,
|
|
343
|
+
sparseVector,
|
|
344
|
+
options?.limit || this.config.defaultSearchLimit,
|
|
345
|
+
filter
|
|
346
|
+
);
|
|
347
|
+
} else {
|
|
348
|
+
results = await this.qdrant.search(
|
|
349
|
+
collectionName,
|
|
350
|
+
embedding,
|
|
351
|
+
options?.limit || this.config.defaultSearchLimit,
|
|
352
|
+
filter
|
|
353
|
+
);
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
// Apply score threshold if specified
|
|
357
|
+
const filteredResults = options?.scoreThreshold
|
|
358
|
+
? results.filter((r) => r.score >= (options.scoreThreshold || 0))
|
|
359
|
+
: results;
|
|
360
|
+
|
|
361
|
+
// Format results
|
|
362
|
+
return filteredResults.map((r) => ({
|
|
363
|
+
content: r.payload?.content || "",
|
|
364
|
+
filePath: r.payload?.relativePath || "",
|
|
365
|
+
startLine: r.payload?.startLine || 0,
|
|
366
|
+
endLine: r.payload?.endLine || 0,
|
|
367
|
+
language: r.payload?.language || "unknown",
|
|
368
|
+
score: r.score,
|
|
369
|
+
fileExtension: r.payload?.fileExtension || "",
|
|
370
|
+
}));
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
/**
|
|
374
|
+
* Get indexing status for a codebase
|
|
375
|
+
*/
|
|
376
|
+
async getIndexStatus(path: string): Promise<IndexStatus> {
|
|
377
|
+
const absolutePath = await this.validatePath(path);
|
|
378
|
+
const collectionName = this.getCollectionName(absolutePath);
|
|
379
|
+
const exists = await this.qdrant.collectionExists(collectionName);
|
|
380
|
+
|
|
381
|
+
if (!exists) {
|
|
382
|
+
return { isIndexed: false };
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
const info = await this.qdrant.getCollectionInfo(collectionName);
|
|
386
|
+
|
|
387
|
+
return {
|
|
388
|
+
isIndexed: true,
|
|
389
|
+
collectionName,
|
|
390
|
+
chunksCount: info.pointsCount,
|
|
391
|
+
// TODO: Extract unique languages and file count from collection
|
|
392
|
+
// This would require scrolling through points or maintaining separate metadata
|
|
393
|
+
};
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
/**
|
|
397
|
+
* Incrementally re-index only changed files
|
|
398
|
+
*/
|
|
399
|
+
async reindexChanges(path: string, progressCallback?: ProgressCallback): Promise<ChangeStats> {
|
|
400
|
+
const startTime = Date.now();
|
|
401
|
+
const stats: ChangeStats = {
|
|
402
|
+
filesAdded: 0,
|
|
403
|
+
filesModified: 0,
|
|
404
|
+
filesDeleted: 0,
|
|
405
|
+
chunksAdded: 0,
|
|
406
|
+
chunksDeleted: 0,
|
|
407
|
+
durationMs: 0,
|
|
408
|
+
};
|
|
409
|
+
|
|
410
|
+
try {
|
|
411
|
+
const absolutePath = await this.validatePath(path);
|
|
412
|
+
const collectionName = this.getCollectionName(absolutePath);
|
|
413
|
+
|
|
414
|
+
// Check if collection exists
|
|
415
|
+
const exists = await this.qdrant.collectionExists(collectionName);
|
|
416
|
+
if (!exists) {
|
|
417
|
+
throw new Error(`Codebase not indexed: ${path}`);
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
// Initialize synchronizer
|
|
421
|
+
const synchronizer = new FileSynchronizer(absolutePath, collectionName);
|
|
422
|
+
const hasSnapshot = await synchronizer.initialize();
|
|
423
|
+
|
|
424
|
+
if (!hasSnapshot) {
|
|
425
|
+
throw new Error("No previous snapshot found. Use index_codebase for initial indexing.");
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
// Scan current files
|
|
429
|
+
progressCallback?.({
|
|
430
|
+
phase: "scanning",
|
|
431
|
+
current: 0,
|
|
432
|
+
total: 100,
|
|
433
|
+
percentage: 0,
|
|
434
|
+
message: "Scanning for changes...",
|
|
435
|
+
});
|
|
436
|
+
|
|
437
|
+
const scanner = new FileScanner({
|
|
438
|
+
supportedExtensions: this.config.supportedExtensions,
|
|
439
|
+
ignorePatterns: this.config.ignorePatterns,
|
|
440
|
+
customIgnorePatterns: this.config.customIgnorePatterns,
|
|
441
|
+
});
|
|
442
|
+
|
|
443
|
+
await scanner.loadIgnorePatterns(absolutePath);
|
|
444
|
+
const currentFiles = await scanner.scanDirectory(absolutePath);
|
|
445
|
+
|
|
446
|
+
// Detect changes
|
|
447
|
+
const changes = await synchronizer.detectChanges(currentFiles);
|
|
448
|
+
stats.filesAdded = changes.added.length;
|
|
449
|
+
stats.filesModified = changes.modified.length;
|
|
450
|
+
stats.filesDeleted = changes.deleted.length;
|
|
451
|
+
|
|
452
|
+
if (stats.filesAdded === 0 && stats.filesModified === 0 && stats.filesDeleted === 0) {
|
|
453
|
+
stats.durationMs = Date.now() - startTime;
|
|
454
|
+
return stats;
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
const chunker = new TreeSitterChunker({
|
|
458
|
+
chunkSize: this.config.chunkSize,
|
|
459
|
+
chunkOverlap: this.config.chunkOverlap,
|
|
460
|
+
maxChunkSize: this.config.chunkSize * 2,
|
|
461
|
+
});
|
|
462
|
+
const metadataExtractor = new MetadataExtractor();
|
|
463
|
+
|
|
464
|
+
// Process deleted and modified files - collect chunk IDs to delete
|
|
465
|
+
const _chunkIdsToDelete: string[] = [];
|
|
466
|
+
const filesToReprocess = [...changes.modified, ...changes.deleted];
|
|
467
|
+
|
|
468
|
+
for (const _filePath of filesToReprocess) {
|
|
469
|
+
try {
|
|
470
|
+
// Read old file content to generate chunk IDs for deletion
|
|
471
|
+
// We need to regenerate the chunks to get their IDs
|
|
472
|
+
// For now, we'll use a simpler approach: delete based on file path
|
|
473
|
+
// This requires keeping track of chunk IDs per file
|
|
474
|
+
// Since we don't have a direct way to query by file path,
|
|
475
|
+
// we'll mark these as needing deletion by filename pattern
|
|
476
|
+
// For simplicity in Phase 2, we'll re-index everything
|
|
477
|
+
// A future enhancement would be to maintain a chunk ID mapping
|
|
478
|
+
} catch (_error) {
|
|
479
|
+
// File might be deleted, skip
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
// For Phase 2 MVP: Simply re-process all changed files
|
|
484
|
+
// TODO Phase 3: Implement proper chunk deletion by maintaining chunk ID mapping
|
|
485
|
+
const filesToIndex = [...changes.added, ...changes.modified];
|
|
486
|
+
const allChunks: Array<{ chunk: CodeChunk; id: string }> = [];
|
|
487
|
+
|
|
488
|
+
for (const [index, filePath] of filesToIndex.entries()) {
|
|
489
|
+
try {
|
|
490
|
+
progressCallback?.({
|
|
491
|
+
phase: "chunking",
|
|
492
|
+
current: index + 1,
|
|
493
|
+
total: filesToIndex.length,
|
|
494
|
+
percentage: Math.round(((index + 1) / filesToIndex.length) * 40),
|
|
495
|
+
message: `Processing file ${index + 1}/${filesToIndex.length}`,
|
|
496
|
+
});
|
|
497
|
+
|
|
498
|
+
const absoluteFilePath = join(absolutePath, filePath);
|
|
499
|
+
const code = await fs.readFile(absoluteFilePath, "utf-8");
|
|
500
|
+
|
|
501
|
+
// Check for secrets
|
|
502
|
+
if (metadataExtractor.containsSecrets(code)) {
|
|
503
|
+
continue;
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
const language = metadataExtractor.extractLanguage(absoluteFilePath);
|
|
507
|
+
const chunks = await chunker.chunk(code, absoluteFilePath, language);
|
|
508
|
+
|
|
509
|
+
for (const chunk of chunks) {
|
|
510
|
+
const id = metadataExtractor.generateChunkId(chunk);
|
|
511
|
+
allChunks.push({ chunk, id });
|
|
512
|
+
}
|
|
513
|
+
} catch (error) {
|
|
514
|
+
console.error(`Failed to process ${filePath}:`, error);
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
stats.chunksAdded = allChunks.length;
|
|
519
|
+
|
|
520
|
+
// Generate embeddings and store in batches
|
|
521
|
+
const batchSize = this.config.batchSize;
|
|
522
|
+
for (let i = 0; i < allChunks.length; i += batchSize) {
|
|
523
|
+
const batch = allChunks.slice(i, i + batchSize);
|
|
524
|
+
|
|
525
|
+
progressCallback?.({
|
|
526
|
+
phase: "embedding",
|
|
527
|
+
current: i + batch.length,
|
|
528
|
+
total: allChunks.length,
|
|
529
|
+
percentage: 40 + Math.round(((i + batch.length) / allChunks.length) * 30),
|
|
530
|
+
message: `Generating embeddings ${i + batch.length}/${allChunks.length}`,
|
|
531
|
+
});
|
|
532
|
+
|
|
533
|
+
const texts = batch.map((b) => b.chunk.content);
|
|
534
|
+
const embeddings = await this.embeddings.embedBatch(texts);
|
|
535
|
+
|
|
536
|
+
const points = batch.map((b, idx) => ({
|
|
537
|
+
id: b.id,
|
|
538
|
+
vector: embeddings[idx].embedding,
|
|
539
|
+
payload: {
|
|
540
|
+
content: b.chunk.content,
|
|
541
|
+
relativePath: relative(absolutePath, b.chunk.metadata.filePath),
|
|
542
|
+
startLine: b.chunk.startLine,
|
|
543
|
+
endLine: b.chunk.endLine,
|
|
544
|
+
fileExtension: extname(b.chunk.metadata.filePath),
|
|
545
|
+
language: b.chunk.metadata.language,
|
|
546
|
+
codebasePath: absolutePath,
|
|
547
|
+
chunkIndex: b.chunk.metadata.chunkIndex,
|
|
548
|
+
...(b.chunk.metadata.name && { name: b.chunk.metadata.name }),
|
|
549
|
+
...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType }),
|
|
550
|
+
},
|
|
551
|
+
}));
|
|
552
|
+
|
|
553
|
+
progressCallback?.({
|
|
554
|
+
phase: "storing",
|
|
555
|
+
current: i + batch.length,
|
|
556
|
+
total: allChunks.length,
|
|
557
|
+
percentage: 70 + Math.round(((i + batch.length) / allChunks.length) * 30),
|
|
558
|
+
message: `Storing chunks ${i + batch.length}/${allChunks.length}`,
|
|
559
|
+
});
|
|
560
|
+
|
|
561
|
+
if (this.config.enableHybridSearch) {
|
|
562
|
+
const sparseGenerator = new BM25SparseVectorGenerator();
|
|
563
|
+
const hybridPoints = points.map((point, idx) => ({
|
|
564
|
+
...point,
|
|
565
|
+
sparseVector: sparseGenerator.generate(allChunks[i + idx].chunk.content),
|
|
566
|
+
}));
|
|
567
|
+
await this.qdrant.addPointsWithSparse(collectionName, hybridPoints);
|
|
568
|
+
} else {
|
|
569
|
+
await this.qdrant.addPoints(collectionName, points);
|
|
570
|
+
}
|
|
571
|
+
}
|
|
572
|
+
|
|
573
|
+
// Update snapshot
|
|
574
|
+
await synchronizer.updateSnapshot(currentFiles);
|
|
575
|
+
|
|
576
|
+
stats.durationMs = Date.now() - startTime;
|
|
577
|
+
return stats;
|
|
578
|
+
} catch (error) {
|
|
579
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
580
|
+
throw new Error(`Incremental re-indexing failed: ${errorMessage}`);
|
|
581
|
+
}
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
/**
|
|
585
|
+
* Clear all indexed data for a codebase
|
|
586
|
+
*/
|
|
587
|
+
async clearIndex(path: string): Promise<void> {
|
|
588
|
+
const absolutePath = await this.validatePath(path);
|
|
589
|
+
const collectionName = this.getCollectionName(absolutePath);
|
|
590
|
+
const exists = await this.qdrant.collectionExists(collectionName);
|
|
591
|
+
|
|
592
|
+
if (exists) {
|
|
593
|
+
await this.qdrant.deleteCollection(collectionName);
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
// Also delete snapshot
|
|
597
|
+
try {
|
|
598
|
+
const synchronizer = new FileSynchronizer(absolutePath, collectionName);
|
|
599
|
+
await synchronizer.deleteSnapshot();
|
|
600
|
+
} catch (_error) {
|
|
601
|
+
// Ignore snapshot deletion errors
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
/**
|
|
606
|
+
* Generate deterministic collection name from codebase path
|
|
607
|
+
*/
|
|
608
|
+
private getCollectionName(path: string): string {
|
|
609
|
+
const absolutePath = resolve(path);
|
|
610
|
+
const hash = createHash("md5").update(absolutePath).digest("hex");
|
|
611
|
+
return `code_${hash.substring(0, 8)}`;
|
|
612
|
+
}
|
|
613
|
+
}
|