@mhalder/qdrant-mcp-server 1.4.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/.codecov.yml +16 -0
  2. package/.github/workflows/claude-code-review.yml +6 -5
  3. package/.releaserc.json +8 -1
  4. package/CHANGELOG.md +34 -0
  5. package/README.md +259 -9
  6. package/build/code/chunker/base.d.ts +19 -0
  7. package/build/code/chunker/base.d.ts.map +1 -0
  8. package/build/code/chunker/base.js +5 -0
  9. package/build/code/chunker/base.js.map +1 -0
  10. package/build/code/chunker/character-chunker.d.ts +22 -0
  11. package/build/code/chunker/character-chunker.d.ts.map +1 -0
  12. package/build/code/chunker/character-chunker.js +111 -0
  13. package/build/code/chunker/character-chunker.js.map +1 -0
  14. package/build/code/chunker/tree-sitter-chunker.d.ts +29 -0
  15. package/build/code/chunker/tree-sitter-chunker.d.ts.map +1 -0
  16. package/build/code/chunker/tree-sitter-chunker.js +213 -0
  17. package/build/code/chunker/tree-sitter-chunker.js.map +1 -0
  18. package/build/code/config.d.ts +11 -0
  19. package/build/code/config.d.ts.map +1 -0
  20. package/build/code/config.js +145 -0
  21. package/build/code/config.js.map +1 -0
  22. package/build/code/indexer.d.ts +42 -0
  23. package/build/code/indexer.d.ts.map +1 -0
  24. package/build/code/indexer.js +508 -0
  25. package/build/code/indexer.js.map +1 -0
  26. package/build/code/metadata.d.ts +32 -0
  27. package/build/code/metadata.d.ts.map +1 -0
  28. package/build/code/metadata.js +128 -0
  29. package/build/code/metadata.js.map +1 -0
  30. package/build/code/scanner.d.ts +35 -0
  31. package/build/code/scanner.d.ts.map +1 -0
  32. package/build/code/scanner.js +108 -0
  33. package/build/code/scanner.js.map +1 -0
  34. package/build/code/sync/merkle.d.ts +45 -0
  35. package/build/code/sync/merkle.d.ts.map +1 -0
  36. package/build/code/sync/merkle.js +116 -0
  37. package/build/code/sync/merkle.js.map +1 -0
  38. package/build/code/sync/snapshot.d.ts +41 -0
  39. package/build/code/sync/snapshot.d.ts.map +1 -0
  40. package/build/code/sync/snapshot.js +91 -0
  41. package/build/code/sync/snapshot.js.map +1 -0
  42. package/build/code/sync/synchronizer.d.ts +53 -0
  43. package/build/code/sync/synchronizer.d.ts.map +1 -0
  44. package/build/code/sync/synchronizer.js +132 -0
  45. package/build/code/sync/synchronizer.js.map +1 -0
  46. package/build/code/types.d.ts +98 -0
  47. package/build/code/types.d.ts.map +1 -0
  48. package/build/code/types.js +5 -0
  49. package/build/code/types.js.map +1 -0
  50. package/build/index.js +252 -1
  51. package/build/index.js.map +1 -1
  52. package/build/qdrant/client.d.ts +1 -1
  53. package/build/qdrant/client.d.ts.map +1 -1
  54. package/build/qdrant/client.js +2 -2
  55. package/build/qdrant/client.js.map +1 -1
  56. package/build/qdrant/client.test.js +16 -0
  57. package/build/qdrant/client.test.js.map +1 -1
  58. package/examples/code-search/README.md +271 -0
  59. package/package.json +15 -2
  60. package/src/code/chunker/base.ts +22 -0
  61. package/src/code/chunker/character-chunker.ts +131 -0
  62. package/src/code/chunker/tree-sitter-chunker.ts +250 -0
  63. package/src/code/config.ts +156 -0
  64. package/src/code/indexer.ts +613 -0
  65. package/src/code/metadata.ts +153 -0
  66. package/src/code/scanner.ts +124 -0
  67. package/src/code/sync/merkle.ts +136 -0
  68. package/src/code/sync/snapshot.ts +110 -0
  69. package/src/code/sync/synchronizer.ts +154 -0
  70. package/src/code/types.ts +117 -0
  71. package/src/index.ts +298 -1
  72. package/src/qdrant/client.test.ts +20 -0
  73. package/src/qdrant/client.ts +2 -2
  74. package/tests/code/chunker/character-chunker.test.ts +141 -0
  75. package/tests/code/chunker/tree-sitter-chunker.test.ts +275 -0
  76. package/tests/code/fixtures/sample-py/calculator.py +32 -0
  77. package/tests/code/fixtures/sample-ts/async-operations.ts +120 -0
  78. package/tests/code/fixtures/sample-ts/auth.ts +31 -0
  79. package/tests/code/fixtures/sample-ts/config.ts +52 -0
  80. package/tests/code/fixtures/sample-ts/database.ts +50 -0
  81. package/tests/code/fixtures/sample-ts/index.ts +39 -0
  82. package/tests/code/fixtures/sample-ts/types-advanced.ts +132 -0
  83. package/tests/code/fixtures/sample-ts/utils.ts +105 -0
  84. package/tests/code/fixtures/sample-ts/validator.ts +169 -0
  85. package/tests/code/indexer.test.ts +828 -0
  86. package/tests/code/integration.test.ts +708 -0
  87. package/tests/code/metadata.test.ts +457 -0
  88. package/tests/code/scanner.test.ts +131 -0
  89. package/tests/code/sync/merkle.test.ts +406 -0
  90. package/tests/code/sync/snapshot.test.ts +360 -0
  91. package/tests/code/sync/synchronizer.test.ts +501 -0
  92. package/vitest.config.ts +1 -0
@@ -0,0 +1,613 @@
1
+ /**
2
+ * CodeIndexer - Main orchestrator for code vectorization
3
+ */
4
+
5
+ import { createHash } from "node:crypto";
6
+ import { promises as fs } from "node:fs";
7
+ import { extname, join, relative, resolve } from "node:path";
8
+ import type { EmbeddingProvider } from "../embeddings/base.js";
9
+ import { BM25SparseVectorGenerator } from "../embeddings/sparse.js";
10
+ import type { QdrantManager } from "../qdrant/client.js";
11
+ import { TreeSitterChunker } from "./chunker/tree-sitter-chunker.js";
12
+ import { MetadataExtractor } from "./metadata.js";
13
+ import { FileScanner } from "./scanner.js";
14
+ import { FileSynchronizer } from "./sync/synchronizer.js";
15
+ import type {
16
+ ChangeStats,
17
+ CodeChunk,
18
+ CodeConfig,
19
+ CodeSearchResult,
20
+ IndexOptions,
21
+ IndexStats,
22
+ IndexStatus,
23
+ ProgressCallback,
24
+ SearchOptions,
25
+ } from "./types.js";
26
+
27
+ export class CodeIndexer {
28
+ constructor(
29
+ private qdrant: QdrantManager,
30
+ private embeddings: EmbeddingProvider,
31
+ private config: CodeConfig
32
+ ) {}
33
+
34
+ /**
35
+ * Validate that a path doesn't attempt directory traversal
36
+ * @throws Error if path traversal is detected
37
+ */
38
+ private async validatePath(path: string): Promise<string> {
39
+ const absolutePath = resolve(path);
40
+
41
+ try {
42
+ // Resolve the real path (follows symlinks)
43
+ const realPath = await fs.realpath(absolutePath);
44
+
45
+ // For now, we just ensure the path exists and is resolved
46
+ // In a more restrictive environment, you could check against an allowlist
47
+ return realPath;
48
+ } catch (error) {
49
+ // If realpath fails, the path doesn't exist yet or is invalid
50
+ // For operations like indexing, we still need to accept non-existent paths
51
+ // so we just return the resolved absolute path
52
+ return absolutePath;
53
+ }
54
+ }
55
+
56
+ /**
57
+ * Index a codebase from scratch or force re-index
58
+ */
59
+ async indexCodebase(
60
+ path: string,
61
+ options?: IndexOptions,
62
+ progressCallback?: ProgressCallback
63
+ ): Promise<IndexStats> {
64
+ const startTime = Date.now();
65
+ const stats: IndexStats = {
66
+ filesScanned: 0,
67
+ filesIndexed: 0,
68
+ chunksCreated: 0,
69
+ durationMs: 0,
70
+ status: "completed",
71
+ errors: [],
72
+ };
73
+
74
+ try {
75
+ const absolutePath = await this.validatePath(path);
76
+
77
+ // 1. Scan files
78
+ progressCallback?.({
79
+ phase: "scanning",
80
+ current: 0,
81
+ total: 100,
82
+ percentage: 0,
83
+ message: "Scanning files...",
84
+ });
85
+
86
+ const scanner = new FileScanner({
87
+ supportedExtensions: options?.extensions || this.config.supportedExtensions,
88
+ ignorePatterns: this.config.ignorePatterns,
89
+ customIgnorePatterns: options?.ignorePatterns || this.config.customIgnorePatterns,
90
+ });
91
+
92
+ await scanner.loadIgnorePatterns(absolutePath);
93
+ const files = await scanner.scanDirectory(absolutePath);
94
+
95
+ stats.filesScanned = files.length;
96
+
97
+ if (files.length === 0) {
98
+ stats.status = "completed";
99
+ stats.durationMs = Date.now() - startTime;
100
+ return stats;
101
+ }
102
+
103
+ // 2. Create or verify collection
104
+ const collectionName = this.getCollectionName(absolutePath);
105
+ const collectionExists = await this.qdrant.collectionExists(collectionName);
106
+
107
+ if (options?.forceReindex && collectionExists) {
108
+ await this.qdrant.deleteCollection(collectionName);
109
+ }
110
+
111
+ if (!collectionExists || options?.forceReindex) {
112
+ const vectorSize = this.embeddings.getDimensions();
113
+ await this.qdrant.createCollection(
114
+ collectionName,
115
+ vectorSize,
116
+ "Cosine",
117
+ this.config.enableHybridSearch
118
+ );
119
+ }
120
+
121
+ // 3. Process files and create chunks
122
+ const chunker = new TreeSitterChunker({
123
+ chunkSize: this.config.chunkSize,
124
+ chunkOverlap: this.config.chunkOverlap,
125
+ maxChunkSize: this.config.chunkSize * 2,
126
+ });
127
+ const metadataExtractor = new MetadataExtractor();
128
+ const allChunks: Array<{ chunk: CodeChunk; id: string }> = [];
129
+
130
+ for (const [index, filePath] of files.entries()) {
131
+ try {
132
+ progressCallback?.({
133
+ phase: "chunking",
134
+ current: index + 1,
135
+ total: files.length,
136
+ percentage: Math.round(((index + 1) / files.length) * 40), // 0-40%
137
+ message: `Chunking file ${index + 1}/${files.length}`,
138
+ });
139
+
140
+ const code = await fs.readFile(filePath, "utf-8");
141
+
142
+ // Check for secrets (basic detection)
143
+ if (metadataExtractor.containsSecrets(code)) {
144
+ stats.errors?.push(`Skipped ${filePath}: potential secrets detected`);
145
+ continue;
146
+ }
147
+
148
+ const language = metadataExtractor.extractLanguage(filePath);
149
+ const chunks = await chunker.chunk(code, filePath, language);
150
+
151
+ // Apply chunk limits if configured
152
+ const chunksToAdd = this.config.maxChunksPerFile
153
+ ? chunks.slice(0, this.config.maxChunksPerFile)
154
+ : chunks;
155
+
156
+ for (const chunk of chunksToAdd) {
157
+ const id = metadataExtractor.generateChunkId(chunk);
158
+ allChunks.push({ chunk, id });
159
+
160
+ // Check total chunk limit
161
+ if (this.config.maxTotalChunks && allChunks.length >= this.config.maxTotalChunks) {
162
+ break;
163
+ }
164
+ }
165
+
166
+ stats.filesIndexed++;
167
+
168
+ // Check total chunk limit
169
+ if (this.config.maxTotalChunks && allChunks.length >= this.config.maxTotalChunks) {
170
+ break;
171
+ }
172
+ } catch (error) {
173
+ const errorMessage = error instanceof Error ? error.message : String(error);
174
+ stats.errors?.push(`Failed to process ${filePath}: ${errorMessage}`);
175
+ }
176
+ }
177
+
178
+ stats.chunksCreated = allChunks.length;
179
+
180
+ // Save snapshot for incremental updates (even if no chunks were created)
181
+ try {
182
+ const synchronizer = new FileSynchronizer(absolutePath, collectionName);
183
+ await synchronizer.updateSnapshot(files);
184
+ } catch (error) {
185
+ // Snapshot failure shouldn't fail the entire indexing
186
+ const errorMessage = error instanceof Error ? error.message : String(error);
187
+ console.error("Failed to save snapshot:", errorMessage);
188
+ stats.errors?.push(`Snapshot save failed: ${errorMessage}`);
189
+ }
190
+
191
+ if (allChunks.length === 0) {
192
+ stats.status = "completed";
193
+ stats.durationMs = Date.now() - startTime;
194
+ return stats;
195
+ }
196
+
197
+ // 4. Generate embeddings and store in batches
198
+ const batchSize = this.config.batchSize;
199
+ for (let i = 0; i < allChunks.length; i += batchSize) {
200
+ const batch = allChunks.slice(i, i + batchSize);
201
+
202
+ progressCallback?.({
203
+ phase: "embedding",
204
+ current: i + batch.length,
205
+ total: allChunks.length,
206
+ percentage: 40 + Math.round(((i + batch.length) / allChunks.length) * 30), // 40-70%
207
+ message: `Generating embeddings ${i + batch.length}/${allChunks.length}`,
208
+ });
209
+
210
+ try {
211
+ const texts = batch.map((b) => b.chunk.content);
212
+ const embeddings = await this.embeddings.embedBatch(texts);
213
+
214
+ // 5. Store to Qdrant
215
+ const points = batch.map((b, idx) => ({
216
+ id: b.id,
217
+ vector: embeddings[idx].embedding,
218
+ payload: {
219
+ content: b.chunk.content,
220
+ relativePath: relative(absolutePath, b.chunk.metadata.filePath),
221
+ startLine: b.chunk.startLine,
222
+ endLine: b.chunk.endLine,
223
+ fileExtension: extname(b.chunk.metadata.filePath),
224
+ language: b.chunk.metadata.language,
225
+ codebasePath: absolutePath,
226
+ chunkIndex: b.chunk.metadata.chunkIndex,
227
+ ...(b.chunk.metadata.name && { name: b.chunk.metadata.name }),
228
+ ...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType }),
229
+ },
230
+ }));
231
+
232
+ progressCallback?.({
233
+ phase: "storing",
234
+ current: i + batch.length,
235
+ total: allChunks.length,
236
+ percentage: 70 + Math.round(((i + batch.length) / allChunks.length) * 30), // 70-100%
237
+ message: `Storing chunks ${i + batch.length}/${allChunks.length}`,
238
+ });
239
+
240
+ if (this.config.enableHybridSearch) {
241
+ // Generate sparse vectors for hybrid search
242
+ const sparseGenerator = new BM25SparseVectorGenerator();
243
+ const hybridPoints = batch.map((b, idx) => ({
244
+ id: b.id,
245
+ vector: embeddings[idx].embedding,
246
+ sparseVector: sparseGenerator.generate(b.chunk.content),
247
+ payload: {
248
+ content: b.chunk.content,
249
+ relativePath: relative(absolutePath, b.chunk.metadata.filePath),
250
+ startLine: b.chunk.startLine,
251
+ endLine: b.chunk.endLine,
252
+ fileExtension: extname(b.chunk.metadata.filePath),
253
+ language: b.chunk.metadata.language,
254
+ codebasePath: absolutePath,
255
+ chunkIndex: b.chunk.metadata.chunkIndex,
256
+ ...(b.chunk.metadata.name && { name: b.chunk.metadata.name }),
257
+ ...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType }),
258
+ },
259
+ }));
260
+
261
+ await this.qdrant.addPointsWithSparse(collectionName, hybridPoints);
262
+ } else {
263
+ await this.qdrant.addPoints(collectionName, points);
264
+ }
265
+ } catch (error) {
266
+ const errorMessage = error instanceof Error ? error.message : String(error);
267
+ stats.errors?.push(`Failed to process batch at index ${i}: ${errorMessage}`);
268
+ stats.status = "partial";
269
+ }
270
+ }
271
+
272
+ stats.durationMs = Date.now() - startTime;
273
+ return stats;
274
+ } catch (error) {
275
+ const errorMessage = error instanceof Error ? error.message : String(error);
276
+ stats.status = "failed";
277
+ stats.errors?.push(`Indexing failed: ${errorMessage}`);
278
+ stats.durationMs = Date.now() - startTime;
279
+ return stats;
280
+ }
281
+ }
282
+
283
+ /**
284
+ * Search code semantically
285
+ */
286
+ async searchCode(
287
+ path: string,
288
+ query: string,
289
+ options?: SearchOptions
290
+ ): Promise<CodeSearchResult[]> {
291
+ const absolutePath = await this.validatePath(path);
292
+ const collectionName = this.getCollectionName(absolutePath);
293
+
294
+ // Check if collection exists
295
+ const exists = await this.qdrant.collectionExists(collectionName);
296
+ if (!exists) {
297
+ throw new Error(`Codebase not indexed: ${path}`);
298
+ }
299
+
300
+ // Check if collection has hybrid search enabled
301
+ const collectionInfo = await this.qdrant.getCollectionInfo(collectionName);
302
+ const useHybrid =
303
+ (options?.useHybrid ?? this.config.enableHybridSearch) && collectionInfo.hybridEnabled;
304
+
305
+ // Generate query embedding
306
+ const { embedding } = await this.embeddings.embed(query);
307
+
308
+ // Build filter
309
+ let filter: any;
310
+ if (options?.fileTypes || options?.pathPattern) {
311
+ filter = { must: [] };
312
+
313
+ if (options.fileTypes && options.fileTypes.length > 0) {
314
+ filter.must.push({
315
+ key: "fileExtension",
316
+ match: { any: options.fileTypes },
317
+ });
318
+ }
319
+
320
+ if (options.pathPattern) {
321
+ // Convert glob pattern to regex (simplified)
322
+ const regex = options.pathPattern
323
+ .replace(/\./g, "\\.")
324
+ .replace(/\*\*/g, ".*")
325
+ .replace(/\*/g, "[^/]*")
326
+ .replace(/\?/g, ".");
327
+
328
+ filter.must.push({
329
+ key: "relativePath",
330
+ match: { text: regex },
331
+ });
332
+ }
333
+ }
334
+
335
+ // Search with hybrid or standard search
336
+ let results;
337
+ if (useHybrid) {
338
+ const sparseGenerator = new BM25SparseVectorGenerator();
339
+ const sparseVector = sparseGenerator.generate(query);
340
+ results = await this.qdrant.hybridSearch(
341
+ collectionName,
342
+ embedding,
343
+ sparseVector,
344
+ options?.limit || this.config.defaultSearchLimit,
345
+ filter
346
+ );
347
+ } else {
348
+ results = await this.qdrant.search(
349
+ collectionName,
350
+ embedding,
351
+ options?.limit || this.config.defaultSearchLimit,
352
+ filter
353
+ );
354
+ }
355
+
356
+ // Apply score threshold if specified
357
+ const filteredResults = options?.scoreThreshold
358
+ ? results.filter((r) => r.score >= (options.scoreThreshold || 0))
359
+ : results;
360
+
361
+ // Format results
362
+ return filteredResults.map((r) => ({
363
+ content: r.payload?.content || "",
364
+ filePath: r.payload?.relativePath || "",
365
+ startLine: r.payload?.startLine || 0,
366
+ endLine: r.payload?.endLine || 0,
367
+ language: r.payload?.language || "unknown",
368
+ score: r.score,
369
+ fileExtension: r.payload?.fileExtension || "",
370
+ }));
371
+ }
372
+
373
+ /**
374
+ * Get indexing status for a codebase
375
+ */
376
+ async getIndexStatus(path: string): Promise<IndexStatus> {
377
+ const absolutePath = await this.validatePath(path);
378
+ const collectionName = this.getCollectionName(absolutePath);
379
+ const exists = await this.qdrant.collectionExists(collectionName);
380
+
381
+ if (!exists) {
382
+ return { isIndexed: false };
383
+ }
384
+
385
+ const info = await this.qdrant.getCollectionInfo(collectionName);
386
+
387
+ return {
388
+ isIndexed: true,
389
+ collectionName,
390
+ chunksCount: info.pointsCount,
391
+ // TODO: Extract unique languages and file count from collection
392
+ // This would require scrolling through points or maintaining separate metadata
393
+ };
394
+ }
395
+
396
+ /**
397
+ * Incrementally re-index only changed files
398
+ */
399
+ async reindexChanges(path: string, progressCallback?: ProgressCallback): Promise<ChangeStats> {
400
+ const startTime = Date.now();
401
+ const stats: ChangeStats = {
402
+ filesAdded: 0,
403
+ filesModified: 0,
404
+ filesDeleted: 0,
405
+ chunksAdded: 0,
406
+ chunksDeleted: 0,
407
+ durationMs: 0,
408
+ };
409
+
410
+ try {
411
+ const absolutePath = await this.validatePath(path);
412
+ const collectionName = this.getCollectionName(absolutePath);
413
+
414
+ // Check if collection exists
415
+ const exists = await this.qdrant.collectionExists(collectionName);
416
+ if (!exists) {
417
+ throw new Error(`Codebase not indexed: ${path}`);
418
+ }
419
+
420
+ // Initialize synchronizer
421
+ const synchronizer = new FileSynchronizer(absolutePath, collectionName);
422
+ const hasSnapshot = await synchronizer.initialize();
423
+
424
+ if (!hasSnapshot) {
425
+ throw new Error("No previous snapshot found. Use index_codebase for initial indexing.");
426
+ }
427
+
428
+ // Scan current files
429
+ progressCallback?.({
430
+ phase: "scanning",
431
+ current: 0,
432
+ total: 100,
433
+ percentage: 0,
434
+ message: "Scanning for changes...",
435
+ });
436
+
437
+ const scanner = new FileScanner({
438
+ supportedExtensions: this.config.supportedExtensions,
439
+ ignorePatterns: this.config.ignorePatterns,
440
+ customIgnorePatterns: this.config.customIgnorePatterns,
441
+ });
442
+
443
+ await scanner.loadIgnorePatterns(absolutePath);
444
+ const currentFiles = await scanner.scanDirectory(absolutePath);
445
+
446
+ // Detect changes
447
+ const changes = await synchronizer.detectChanges(currentFiles);
448
+ stats.filesAdded = changes.added.length;
449
+ stats.filesModified = changes.modified.length;
450
+ stats.filesDeleted = changes.deleted.length;
451
+
452
+ if (stats.filesAdded === 0 && stats.filesModified === 0 && stats.filesDeleted === 0) {
453
+ stats.durationMs = Date.now() - startTime;
454
+ return stats;
455
+ }
456
+
457
+ const chunker = new TreeSitterChunker({
458
+ chunkSize: this.config.chunkSize,
459
+ chunkOverlap: this.config.chunkOverlap,
460
+ maxChunkSize: this.config.chunkSize * 2,
461
+ });
462
+ const metadataExtractor = new MetadataExtractor();
463
+
464
+ // Process deleted and modified files - collect chunk IDs to delete
465
+ const _chunkIdsToDelete: string[] = [];
466
+ const filesToReprocess = [...changes.modified, ...changes.deleted];
467
+
468
+ for (const _filePath of filesToReprocess) {
469
+ try {
470
+ // Read old file content to generate chunk IDs for deletion
471
+ // We need to regenerate the chunks to get their IDs
472
+ // For now, we'll use a simpler approach: delete based on file path
473
+ // This requires keeping track of chunk IDs per file
474
+ // Since we don't have a direct way to query by file path,
475
+ // we'll mark these as needing deletion by filename pattern
476
+ // For simplicity in Phase 2, we'll re-index everything
477
+ // A future enhancement would be to maintain a chunk ID mapping
478
+ } catch (_error) {
479
+ // File might be deleted, skip
480
+ }
481
+ }
482
+
483
+ // For Phase 2 MVP: Simply re-process all changed files
484
+ // TODO Phase 3: Implement proper chunk deletion by maintaining chunk ID mapping
485
+ const filesToIndex = [...changes.added, ...changes.modified];
486
+ const allChunks: Array<{ chunk: CodeChunk; id: string }> = [];
487
+
488
+ for (const [index, filePath] of filesToIndex.entries()) {
489
+ try {
490
+ progressCallback?.({
491
+ phase: "chunking",
492
+ current: index + 1,
493
+ total: filesToIndex.length,
494
+ percentage: Math.round(((index + 1) / filesToIndex.length) * 40),
495
+ message: `Processing file ${index + 1}/${filesToIndex.length}`,
496
+ });
497
+
498
+ const absoluteFilePath = join(absolutePath, filePath);
499
+ const code = await fs.readFile(absoluteFilePath, "utf-8");
500
+
501
+ // Check for secrets
502
+ if (metadataExtractor.containsSecrets(code)) {
503
+ continue;
504
+ }
505
+
506
+ const language = metadataExtractor.extractLanguage(absoluteFilePath);
507
+ const chunks = await chunker.chunk(code, absoluteFilePath, language);
508
+
509
+ for (const chunk of chunks) {
510
+ const id = metadataExtractor.generateChunkId(chunk);
511
+ allChunks.push({ chunk, id });
512
+ }
513
+ } catch (error) {
514
+ console.error(`Failed to process ${filePath}:`, error);
515
+ }
516
+ }
517
+
518
+ stats.chunksAdded = allChunks.length;
519
+
520
+ // Generate embeddings and store in batches
521
+ const batchSize = this.config.batchSize;
522
+ for (let i = 0; i < allChunks.length; i += batchSize) {
523
+ const batch = allChunks.slice(i, i + batchSize);
524
+
525
+ progressCallback?.({
526
+ phase: "embedding",
527
+ current: i + batch.length,
528
+ total: allChunks.length,
529
+ percentage: 40 + Math.round(((i + batch.length) / allChunks.length) * 30),
530
+ message: `Generating embeddings ${i + batch.length}/${allChunks.length}`,
531
+ });
532
+
533
+ const texts = batch.map((b) => b.chunk.content);
534
+ const embeddings = await this.embeddings.embedBatch(texts);
535
+
536
+ const points = batch.map((b, idx) => ({
537
+ id: b.id,
538
+ vector: embeddings[idx].embedding,
539
+ payload: {
540
+ content: b.chunk.content,
541
+ relativePath: relative(absolutePath, b.chunk.metadata.filePath),
542
+ startLine: b.chunk.startLine,
543
+ endLine: b.chunk.endLine,
544
+ fileExtension: extname(b.chunk.metadata.filePath),
545
+ language: b.chunk.metadata.language,
546
+ codebasePath: absolutePath,
547
+ chunkIndex: b.chunk.metadata.chunkIndex,
548
+ ...(b.chunk.metadata.name && { name: b.chunk.metadata.name }),
549
+ ...(b.chunk.metadata.chunkType && { chunkType: b.chunk.metadata.chunkType }),
550
+ },
551
+ }));
552
+
553
+ progressCallback?.({
554
+ phase: "storing",
555
+ current: i + batch.length,
556
+ total: allChunks.length,
557
+ percentage: 70 + Math.round(((i + batch.length) / allChunks.length) * 30),
558
+ message: `Storing chunks ${i + batch.length}/${allChunks.length}`,
559
+ });
560
+
561
+ if (this.config.enableHybridSearch) {
562
+ const sparseGenerator = new BM25SparseVectorGenerator();
563
+ const hybridPoints = points.map((point, idx) => ({
564
+ ...point,
565
+ sparseVector: sparseGenerator.generate(allChunks[i + idx].chunk.content),
566
+ }));
567
+ await this.qdrant.addPointsWithSparse(collectionName, hybridPoints);
568
+ } else {
569
+ await this.qdrant.addPoints(collectionName, points);
570
+ }
571
+ }
572
+
573
+ // Update snapshot
574
+ await synchronizer.updateSnapshot(currentFiles);
575
+
576
+ stats.durationMs = Date.now() - startTime;
577
+ return stats;
578
+ } catch (error) {
579
+ const errorMessage = error instanceof Error ? error.message : String(error);
580
+ throw new Error(`Incremental re-indexing failed: ${errorMessage}`);
581
+ }
582
+ }
583
+
584
+ /**
585
+ * Clear all indexed data for a codebase
586
+ */
587
+ async clearIndex(path: string): Promise<void> {
588
+ const absolutePath = await this.validatePath(path);
589
+ const collectionName = this.getCollectionName(absolutePath);
590
+ const exists = await this.qdrant.collectionExists(collectionName);
591
+
592
+ if (exists) {
593
+ await this.qdrant.deleteCollection(collectionName);
594
+ }
595
+
596
+ // Also delete snapshot
597
+ try {
598
+ const synchronizer = new FileSynchronizer(absolutePath, collectionName);
599
+ await synchronizer.deleteSnapshot();
600
+ } catch (_error) {
601
+ // Ignore snapshot deletion errors
602
+ }
603
+ }
604
+
605
+ /**
606
+ * Generate deterministic collection name from codebase path
607
+ */
608
+ private getCollectionName(path: string): string {
609
+ const absolutePath = resolve(path);
610
+ const hash = createHash("md5").update(absolutePath).digest("hex");
611
+ return `code_${hash.substring(0, 8)}`;
612
+ }
613
+ }