lance-context 0.1.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/README.md +232 -23
  2. package/dist/__tests__/ast-chunker.test.d.ts +2 -0
  3. package/dist/__tests__/ast-chunker.test.d.ts.map +1 -0
  4. package/dist/__tests__/ast-chunker.test.js +307 -0
  5. package/dist/__tests__/ast-chunker.test.js.map +1 -0
  6. package/dist/__tests__/config.test.d.ts +2 -0
  7. package/dist/__tests__/config.test.d.ts.map +1 -0
  8. package/dist/__tests__/config.test.js +242 -0
  9. package/dist/__tests__/config.test.js.map +1 -0
  10. package/dist/__tests__/dashboard/beads.test.d.ts +2 -0
  11. package/dist/__tests__/dashboard/beads.test.d.ts.map +1 -0
  12. package/dist/__tests__/dashboard/beads.test.js +151 -0
  13. package/dist/__tests__/dashboard/beads.test.js.map +1 -0
  14. package/dist/__tests__/dashboard/index.test.d.ts +2 -0
  15. package/dist/__tests__/dashboard/index.test.d.ts.map +1 -0
  16. package/dist/__tests__/dashboard/index.test.js +116 -0
  17. package/dist/__tests__/dashboard/index.test.js.map +1 -0
  18. package/dist/__tests__/dashboard/routes.test.d.ts +2 -0
  19. package/dist/__tests__/dashboard/routes.test.d.ts.map +1 -0
  20. package/dist/__tests__/dashboard/routes.test.js +125 -0
  21. package/dist/__tests__/dashboard/routes.test.js.map +1 -0
  22. package/dist/__tests__/dashboard/server.test.d.ts +2 -0
  23. package/dist/__tests__/dashboard/server.test.d.ts.map +1 -0
  24. package/dist/__tests__/dashboard/server.test.js +75 -0
  25. package/dist/__tests__/dashboard/server.test.js.map +1 -0
  26. package/dist/__tests__/dashboard/state.test.d.ts +2 -0
  27. package/dist/__tests__/dashboard/state.test.d.ts.map +1 -0
  28. package/dist/__tests__/dashboard/state.test.js +124 -0
  29. package/dist/__tests__/dashboard/state.test.js.map +1 -0
  30. package/dist/__tests__/embeddings/factory.test.d.ts +2 -0
  31. package/dist/__tests__/embeddings/factory.test.d.ts.map +1 -0
  32. package/dist/__tests__/embeddings/factory.test.js +100 -0
  33. package/dist/__tests__/embeddings/factory.test.js.map +1 -0
  34. package/dist/__tests__/embeddings/jina.test.d.ts +2 -0
  35. package/dist/__tests__/embeddings/jina.test.d.ts.map +1 -0
  36. package/dist/__tests__/embeddings/jina.test.js +156 -0
  37. package/dist/__tests__/embeddings/jina.test.js.map +1 -0
  38. package/dist/__tests__/embeddings/ollama.test.d.ts +2 -0
  39. package/dist/__tests__/embeddings/ollama.test.d.ts.map +1 -0
  40. package/dist/__tests__/embeddings/ollama.test.js +172 -0
  41. package/dist/__tests__/embeddings/ollama.test.js.map +1 -0
  42. package/dist/__tests__/embeddings/rate-limiter.test.d.ts +2 -0
  43. package/dist/__tests__/embeddings/rate-limiter.test.d.ts.map +1 -0
  44. package/dist/__tests__/embeddings/rate-limiter.test.js +163 -0
  45. package/dist/__tests__/embeddings/rate-limiter.test.js.map +1 -0
  46. package/dist/__tests__/embeddings/retry.test.d.ts +2 -0
  47. package/dist/__tests__/embeddings/retry.test.d.ts.map +1 -0
  48. package/dist/__tests__/embeddings/retry.test.js +260 -0
  49. package/dist/__tests__/embeddings/retry.test.js.map +1 -0
  50. package/dist/__tests__/embeddings/types.test.d.ts +2 -0
  51. package/dist/__tests__/embeddings/types.test.d.ts.map +1 -0
  52. package/dist/__tests__/embeddings/types.test.js +31 -0
  53. package/dist/__tests__/embeddings/types.test.js.map +1 -0
  54. package/dist/__tests__/mocks/embedding-backend.mock.d.ts +10 -0
  55. package/dist/__tests__/mocks/embedding-backend.mock.d.ts.map +1 -0
  56. package/dist/__tests__/mocks/embedding-backend.mock.js +39 -0
  57. package/dist/__tests__/mocks/embedding-backend.mock.js.map +1 -0
  58. package/dist/__tests__/mocks/fetch.mock.d.ts +38 -0
  59. package/dist/__tests__/mocks/fetch.mock.d.ts.map +1 -0
  60. package/dist/__tests__/mocks/fetch.mock.js +74 -0
  61. package/dist/__tests__/mocks/fetch.mock.js.map +1 -0
  62. package/dist/__tests__/mocks/lancedb.mock.d.ts +38 -0
  63. package/dist/__tests__/mocks/lancedb.mock.d.ts.map +1 -0
  64. package/dist/__tests__/mocks/lancedb.mock.js +63 -0
  65. package/dist/__tests__/mocks/lancedb.mock.js.map +1 -0
  66. package/dist/__tests__/search/clustering.test.d.ts +2 -0
  67. package/dist/__tests__/search/clustering.test.d.ts.map +1 -0
  68. package/dist/__tests__/search/clustering.test.js +230 -0
  69. package/dist/__tests__/search/clustering.test.js.map +1 -0
  70. package/dist/__tests__/search/hybrid-search.test.d.ts +2 -0
  71. package/dist/__tests__/search/hybrid-search.test.d.ts.map +1 -0
  72. package/dist/__tests__/search/hybrid-search.test.js +186 -0
  73. package/dist/__tests__/search/hybrid-search.test.js.map +1 -0
  74. package/dist/__tests__/search/indexer.test.d.ts +2 -0
  75. package/dist/__tests__/search/indexer.test.d.ts.map +1 -0
  76. package/dist/__tests__/search/indexer.test.js +878 -0
  77. package/dist/__tests__/search/indexer.test.js.map +1 -0
  78. package/dist/__tests__/search/tree-sitter-chunker.test.d.ts +2 -0
  79. package/dist/__tests__/search/tree-sitter-chunker.test.d.ts.map +1 -0
  80. package/dist/__tests__/search/tree-sitter-chunker.test.js +228 -0
  81. package/dist/__tests__/search/tree-sitter-chunker.test.js.map +1 -0
  82. package/dist/__tests__/setup.d.ts +2 -0
  83. package/dist/__tests__/setup.d.ts.map +1 -0
  84. package/dist/__tests__/setup.js +11 -0
  85. package/dist/__tests__/setup.js.map +1 -0
  86. package/dist/__tests__/utils/concurrency.test.d.ts +2 -0
  87. package/dist/__tests__/utils/concurrency.test.d.ts.map +1 -0
  88. package/dist/__tests__/utils/concurrency.test.js +83 -0
  89. package/dist/__tests__/utils/concurrency.test.js.map +1 -0
  90. package/dist/__tests__/utils/errors.test.d.ts +2 -0
  91. package/dist/__tests__/utils/errors.test.d.ts.map +1 -0
  92. package/dist/__tests__/utils/errors.test.js +136 -0
  93. package/dist/__tests__/utils/errors.test.js.map +1 -0
  94. package/dist/__tests__/utils/type-guards.test.d.ts +2 -0
  95. package/dist/__tests__/utils/type-guards.test.d.ts.map +1 -0
  96. package/dist/__tests__/utils/type-guards.test.js +80 -0
  97. package/dist/__tests__/utils/type-guards.test.js.map +1 -0
  98. package/dist/__tests__/worktree/worktree-manager.test.d.ts +2 -0
  99. package/dist/__tests__/worktree/worktree-manager.test.d.ts.map +1 -0
  100. package/dist/__tests__/worktree/worktree-manager.test.js +403 -0
  101. package/dist/__tests__/worktree/worktree-manager.test.js.map +1 -0
  102. package/dist/config.d.ts +122 -0
  103. package/dist/config.d.ts.map +1 -0
  104. package/dist/config.js +508 -0
  105. package/dist/config.js.map +1 -0
  106. package/dist/dashboard/beads.d.ts +35 -0
  107. package/dist/dashboard/beads.d.ts.map +1 -0
  108. package/dist/dashboard/beads.js +102 -0
  109. package/dist/dashboard/beads.js.map +1 -0
  110. package/dist/dashboard/events.d.ts +46 -0
  111. package/dist/dashboard/events.d.ts.map +1 -0
  112. package/dist/dashboard/events.js +141 -0
  113. package/dist/dashboard/events.js.map +1 -0
  114. package/dist/dashboard/index.d.ts +67 -0
  115. package/dist/dashboard/index.d.ts.map +1 -0
  116. package/dist/dashboard/index.js +90 -0
  117. package/dist/dashboard/index.js.map +1 -0
  118. package/dist/dashboard/routes.d.ts +6 -0
  119. package/dist/dashboard/routes.d.ts.map +1 -0
  120. package/dist/dashboard/routes.js +244 -0
  121. package/dist/dashboard/routes.js.map +1 -0
  122. package/dist/dashboard/server.d.ts +27 -0
  123. package/dist/dashboard/server.d.ts.map +1 -0
  124. package/dist/dashboard/server.js +72 -0
  125. package/dist/dashboard/server.js.map +1 -0
  126. package/dist/dashboard/state.d.ts +116 -0
  127. package/dist/dashboard/state.d.ts.map +1 -0
  128. package/dist/dashboard/state.js +251 -0
  129. package/dist/dashboard/state.js.map +1 -0
  130. package/dist/dashboard/ui.d.ts +6 -0
  131. package/dist/dashboard/ui.d.ts.map +1 -0
  132. package/dist/dashboard/ui.js +1407 -0
  133. package/dist/dashboard/ui.js.map +1 -0
  134. package/dist/embeddings/index.d.ts +20 -2
  135. package/dist/embeddings/index.d.ts.map +1 -1
  136. package/dist/embeddings/index.js +49 -6
  137. package/dist/embeddings/index.js.map +1 -1
  138. package/dist/embeddings/jina.d.ts +9 -0
  139. package/dist/embeddings/jina.d.ts.map +1 -1
  140. package/dist/embeddings/jina.js +42 -2
  141. package/dist/embeddings/jina.js.map +1 -1
  142. package/dist/embeddings/ollama.d.ts +2 -0
  143. package/dist/embeddings/ollama.d.ts.map +1 -1
  144. package/dist/embeddings/ollama.js +21 -5
  145. package/dist/embeddings/ollama.js.map +1 -1
  146. package/dist/embeddings/rate-limiter.d.ts +75 -0
  147. package/dist/embeddings/rate-limiter.d.ts.map +1 -0
  148. package/dist/embeddings/rate-limiter.js +145 -0
  149. package/dist/embeddings/rate-limiter.js.map +1 -0
  150. package/dist/embeddings/retry.d.ts +14 -0
  151. package/dist/embeddings/retry.d.ts.map +1 -0
  152. package/dist/embeddings/retry.js +89 -0
  153. package/dist/embeddings/retry.js.map +1 -0
  154. package/dist/embeddings/types.d.ts +56 -2
  155. package/dist/embeddings/types.d.ts.map +1 -1
  156. package/dist/embeddings/types.js +16 -0
  157. package/dist/embeddings/types.js.map +1 -1
  158. package/dist/index.js +1870 -44
  159. package/dist/index.js.map +1 -1
  160. package/dist/memory/index.d.ts +63 -0
  161. package/dist/memory/index.d.ts.map +1 -0
  162. package/dist/memory/index.js +168 -0
  163. package/dist/memory/index.js.map +1 -0
  164. package/dist/search/ast-chunker.d.ts +34 -0
  165. package/dist/search/ast-chunker.d.ts.map +1 -0
  166. package/dist/search/ast-chunker.js +261 -0
  167. package/dist/search/ast-chunker.js.map +1 -0
  168. package/dist/search/clustering.d.ts +77 -0
  169. package/dist/search/clustering.d.ts.map +1 -0
  170. package/dist/search/clustering.js +455 -0
  171. package/dist/search/clustering.js.map +1 -0
  172. package/dist/search/indexer.d.ts +239 -3
  173. package/dist/search/indexer.d.ts.map +1 -1
  174. package/dist/search/indexer.js +941 -45
  175. package/dist/search/indexer.js.map +1 -1
  176. package/dist/search/tree-sitter-chunker.d.ts +69 -0
  177. package/dist/search/tree-sitter-chunker.d.ts.map +1 -0
  178. package/dist/search/tree-sitter-chunker.js +436 -0
  179. package/dist/search/tree-sitter-chunker.js.map +1 -0
  180. package/dist/symbols/index.d.ts +14 -0
  181. package/dist/symbols/index.d.ts.map +1 -0
  182. package/dist/symbols/index.js +19 -0
  183. package/dist/symbols/index.js.map +1 -0
  184. package/dist/symbols/name-path.d.ts +113 -0
  185. package/dist/symbols/name-path.d.ts.map +1 -0
  186. package/dist/symbols/name-path.js +194 -0
  187. package/dist/symbols/name-path.js.map +1 -0
  188. package/dist/symbols/pattern-search.d.ts +14 -0
  189. package/dist/symbols/pattern-search.d.ts.map +1 -0
  190. package/dist/symbols/pattern-search.js +224 -0
  191. package/dist/symbols/pattern-search.js.map +1 -0
  192. package/dist/symbols/reference-finder.d.ts +38 -0
  193. package/dist/symbols/reference-finder.d.ts.map +1 -0
  194. package/dist/symbols/reference-finder.js +376 -0
  195. package/dist/symbols/reference-finder.js.map +1 -0
  196. package/dist/symbols/symbol-editor.d.ts +81 -0
  197. package/dist/symbols/symbol-editor.d.ts.map +1 -0
  198. package/dist/symbols/symbol-editor.js +257 -0
  199. package/dist/symbols/symbol-editor.js.map +1 -0
  200. package/dist/symbols/symbol-extractor.d.ts +49 -0
  201. package/dist/symbols/symbol-extractor.d.ts.map +1 -0
  202. package/dist/symbols/symbol-extractor.js +593 -0
  203. package/dist/symbols/symbol-extractor.js.map +1 -0
  204. package/dist/symbols/symbol-renamer.d.ts +81 -0
  205. package/dist/symbols/symbol-renamer.d.ts.map +1 -0
  206. package/dist/symbols/symbol-renamer.js +204 -0
  207. package/dist/symbols/symbol-renamer.js.map +1 -0
  208. package/dist/symbols/types.d.ts +234 -0
  209. package/dist/symbols/types.d.ts.map +1 -0
  210. package/dist/symbols/types.js +106 -0
  211. package/dist/symbols/types.js.map +1 -0
  212. package/dist/utils/concurrency.d.ts +32 -0
  213. package/dist/utils/concurrency.d.ts.map +1 -0
  214. package/dist/utils/concurrency.js +57 -0
  215. package/dist/utils/concurrency.js.map +1 -0
  216. package/dist/utils/errors.d.ts +36 -0
  217. package/dist/utils/errors.d.ts.map +1 -0
  218. package/dist/utils/errors.js +91 -0
  219. package/dist/utils/errors.js.map +1 -0
  220. package/dist/utils/type-guards.d.ts +17 -0
  221. package/dist/utils/type-guards.d.ts.map +1 -0
  222. package/dist/utils/type-guards.js +25 -0
  223. package/dist/utils/type-guards.js.map +1 -0
  224. package/dist/worktree/index.d.ts +6 -0
  225. package/dist/worktree/index.d.ts.map +1 -0
  226. package/dist/worktree/index.js +6 -0
  227. package/dist/worktree/index.js.map +1 -0
  228. package/dist/worktree/types.d.ts +101 -0
  229. package/dist/worktree/types.d.ts.map +1 -0
  230. package/dist/worktree/types.js +6 -0
  231. package/dist/worktree/types.js.map +1 -0
  232. package/dist/worktree/worktree-manager.d.ts +80 -0
  233. package/dist/worktree/worktree-manager.d.ts.map +1 -0
  234. package/dist/worktree/worktree-manager.js +407 -0
  235. package/dist/worktree/worktree-manager.js.map +1 -0
  236. package/package.json +39 -5
  237. package/scripts/postinstall.js +48 -0
@@ -1,17 +1,70 @@
1
1
  import * as lancedb from '@lancedb/lancedb';
2
+ import * as crypto from 'crypto';
2
3
  import * as fs from 'fs/promises';
3
4
  import * as path from 'path';
4
- const CHUNK_SIZE = 100; // lines per chunk
5
- const CHUNK_OVERLAP = 20; // overlap between chunks
5
+ import { ASTChunker } from './ast-chunker.js';
6
+ import { TreeSitterChunker } from './tree-sitter-chunker.js';
7
+ import { loadConfig, getDefaultPatterns, getDefaultExcludePatterns, getChunkingConfig, getSearchConfig, } from '../config.js';
8
+ import { minimatch } from 'minimatch';
9
+ import { mapInBatches } from '../utils/concurrency.js';
10
+ import { kMeansClustering, calculateSilhouetteScore, } from './clustering.js';
11
+ /** Default concurrency for parallel file processing */
12
+ const FILE_PROCESSING_CONCURRENCY = 10;
6
13
  /**
7
- * Code indexer using LanceDB for vector storage
14
+ * Compute a checksum for index integrity validation.
15
+ * Based on sorted file list and chunk count.
8
16
  */
17
+ function computeIndexChecksum(files, chunkCount) {
18
+ const sortedFiles = [...files].sort();
19
+ const data = JSON.stringify({ files: sortedFiles, chunkCount });
20
+ return crypto.createHash('sha256').update(data).digest('hex').slice(0, 16);
21
+ }
22
+ /**
23
+ * Sanitize a file path for use in LanceDB filter expressions.
24
+ * Prevents SQL injection by only allowing safe path characters.
25
+ */
26
+ function sanitizePathForFilter(filepath) {
27
+ // Only allow safe file path characters: alphanumeric, /, ., -, _, space
28
+ // This is more restrictive than escaping and prevents injection attacks
29
+ if (!/^[\w\s./-]+$/.test(filepath)) {
30
+ // If path contains unusual characters, escape single quotes and backslashes
31
+ return filepath.replace(/\\/g, '\\\\').replace(/'/g, "''");
32
+ }
33
+ return filepath.replace(/'/g, "''");
34
+ }
35
+ /**
36
+ * Code indexer that uses LanceDB for vector storage and semantic search.
37
+ *
38
+ * Provides functionality to:
39
+ * - Index a codebase by chunking files and generating embeddings
40
+ * - Perform hybrid semantic + keyword search
41
+ * - Support incremental indexing (only re-index changed files)
42
+ *
43
+ * @example
44
+ * ```typescript
45
+ * const backend = await createEmbeddingBackend();
46
+ * const indexer = new CodeIndexer('/path/to/project', backend);
47
+ * await indexer.initialize();
48
+ *
49
+ * // Index the codebase
50
+ * await indexer.indexCodebase();
51
+ *
52
+ * // Search for code
53
+ * const results = await indexer.search('authentication middleware');
54
+ * ```
55
+ */
56
+ /** Maximum number of query embeddings to cache */
57
+ const QUERY_CACHE_MAX_SIZE = 100;
9
58
  export class CodeIndexer {
10
59
  db = null;
11
60
  table = null;
61
+ metadataTable = null;
12
62
  embeddingBackend;
13
63
  indexPath;
14
64
  projectPath;
65
+ config = null;
66
+ /** LRU cache for query embeddings to avoid recomputing identical queries */
67
+ queryEmbeddingCache = new Map();
15
68
  constructor(projectPath, embeddingBackend) {
16
69
  this.projectPath = projectPath;
17
70
  this.embeddingBackend = embeddingBackend;
@@ -19,6 +72,118 @@ export class CodeIndexer {
19
72
  }
20
73
  async initialize() {
21
74
  this.db = await lancedb.connect(this.indexPath);
75
+ this.config = await loadConfig(this.projectPath);
76
+ console.error(`[lance-context] Loaded config with ${this.config.patterns?.length} patterns`);
77
+ }
78
+ get metadataPath() {
79
+ return path.join(this.indexPath, 'index-metadata.json');
80
+ }
81
+ /**
82
+ * Save index metadata to disk
83
+ */
84
+ async saveIndexMetadata(fileCount, chunkCount, indexedFiles) {
85
+ // Convert to relative paths for checksum
86
+ const relativePaths = indexedFiles.map((f) => path.isAbsolute(f) ? path.relative(this.projectPath, f) : f);
87
+ const metadata = {
88
+ lastUpdated: new Date().toISOString(),
89
+ fileCount,
90
+ chunkCount,
91
+ embeddingBackend: this.embeddingBackend.name,
92
+ embeddingModel: this.embeddingBackend.getModel(),
93
+ embeddingDimensions: this.embeddingBackend.getDimensions(),
94
+ version: '1.0.0',
95
+ checksum: computeIndexChecksum(relativePaths, chunkCount),
96
+ };
97
+ await fs.writeFile(this.metadataPath, JSON.stringify(metadata, null, 2));
98
+ }
99
+ /**
100
+ * Load index metadata from disk
101
+ */
102
+ async loadIndexMetadata() {
103
+ try {
104
+ const content = await fs.readFile(this.metadataPath, 'utf-8');
105
+ return JSON.parse(content);
106
+ }
107
+ catch {
108
+ return null;
109
+ }
110
+ }
111
+ /**
112
+ * Get the modification time of a file
113
+ */
114
+ async getFileMtime(filepath) {
115
+ const stats = await fs.stat(filepath);
116
+ return stats.mtimeMs;
117
+ }
118
+ /**
119
+ * Get stored metadata for all indexed files
120
+ */
121
+ async getStoredMetadata() {
122
+ const tableNames = await this.db.tableNames();
123
+ if (!tableNames.includes('file_metadata')) {
124
+ return new Map();
125
+ }
126
+ this.metadataTable = await this.db.openTable('file_metadata');
127
+ const rows = await this.metadataTable.query().toArray();
128
+ const metadata = new Map();
129
+ for (const row of rows) {
130
+ metadata.set(row.filepath, row.mtime);
131
+ }
132
+ return metadata;
133
+ }
134
+ /**
135
+ * Detect which files have been added, modified, or deleted
136
+ */
137
+ async detectFileChanges(currentFiles) {
138
+ const storedMetadata = await this.getStoredMetadata();
139
+ const changes = {
140
+ added: [],
141
+ modified: [],
142
+ deleted: [],
143
+ unchanged: [],
144
+ };
145
+ const currentFilesSet = new Set();
146
+ for (const filepath of currentFiles) {
147
+ const relativePath = path.relative(this.projectPath, filepath);
148
+ currentFilesSet.add(relativePath);
149
+ const currentMtime = await this.getFileMtime(filepath);
150
+ const storedMtime = storedMetadata.get(relativePath);
151
+ if (storedMtime === undefined) {
152
+ changes.added.push(filepath);
153
+ }
154
+ else if (currentMtime > storedMtime) {
155
+ changes.modified.push(filepath);
156
+ }
157
+ else {
158
+ changes.unchanged.push(filepath);
159
+ }
160
+ }
161
+ // Find deleted files
162
+ for (const [relativePath] of storedMetadata) {
163
+ if (!currentFilesSet.has(relativePath)) {
164
+ changes.deleted.push(relativePath);
165
+ }
166
+ }
167
+ return changes;
168
+ }
169
+ /**
170
+ * Save metadata for indexed files
171
+ */
172
+ async saveFileMetadata(files) {
173
+ const metadata = [];
174
+ for (const filepath of files) {
175
+ const relativePath = path.relative(this.projectPath, filepath);
176
+ const mtime = await this.getFileMtime(filepath);
177
+ metadata.push({ filepath: relativePath, mtime });
178
+ }
179
+ // Drop and recreate metadata table
180
+ const tableNames = await this.db.tableNames();
181
+ if (tableNames.includes('file_metadata')) {
182
+ await this.db.dropTable('file_metadata');
183
+ }
184
+ if (metadata.length > 0) {
185
+ this.metadataTable = await this.db.createTable('file_metadata', metadata);
186
+ }
22
187
  }
23
188
  async getStatus() {
24
189
  const tableNames = await this.db?.tableNames();
@@ -30,54 +195,199 @@ export class CodeIndexer {
30
195
  chunkCount: 0,
31
196
  lastUpdated: null,
32
197
  indexPath: this.indexPath,
198
+ embeddingBackend: this.embeddingBackend.name,
199
+ embeddingModel: this.embeddingBackend.getModel(),
33
200
  };
34
201
  }
35
202
  this.table = await this.db.openTable('code_chunks');
36
203
  const count = await this.table.countRows();
204
+ // Load persisted metadata
205
+ const metadata = await this.loadIndexMetadata();
206
+ // Validate index integrity
207
+ const corruptionCheck = await this.validateIndexIntegrity(metadata, count);
208
+ // Check for backend mismatch
209
+ const backendMismatch = this.checkBackendMismatch(metadata);
37
210
  return {
38
211
  indexed: true,
39
- fileCount: 0, // Would need to query distinct files
212
+ fileCount: metadata?.fileCount ?? 0,
40
213
  chunkCount: count,
41
- lastUpdated: new Date().toISOString(),
214
+ lastUpdated: metadata?.lastUpdated ?? null,
42
215
  indexPath: this.indexPath,
216
+ embeddingBackend: metadata?.embeddingBackend ?? this.embeddingBackend.name,
217
+ embeddingModel: metadata?.embeddingModel ?? this.embeddingBackend.getModel(),
218
+ corrupted: corruptionCheck.corrupted,
219
+ corruptionReason: corruptionCheck.reason,
220
+ backendMismatch: backendMismatch.mismatch,
221
+ backendMismatchReason: backendMismatch.reason,
43
222
  };
44
223
  }
45
- async indexCodebase(patterns = ['**/*.ts', '**/*.tsx', '**/*.js', '**/*.jsx', '**/*.py', '**/*.go', '**/*.rs'], excludePatterns = ['**/node_modules/**', '**/dist/**', '**/.git/**', '**/build/**']) {
224
+ /**
225
+ * Check if the current embedding backend differs from the one used to create the index.
226
+ * Returns mismatch status and reason if mismatched.
227
+ */
228
+ checkBackendMismatch(metadata) {
229
+ if (!metadata) {
230
+ return { mismatch: false };
231
+ }
232
+ const currentBackend = this.embeddingBackend.name;
233
+ const currentModel = this.embeddingBackend.getModel();
234
+ const currentDimensions = this.embeddingBackend.getDimensions();
235
+ // Check dimension mismatch (critical - will cause search failures)
236
+ if (metadata.embeddingDimensions && metadata.embeddingDimensions !== currentDimensions) {
237
+ return {
238
+ mismatch: true,
239
+ reason: `Embedding dimension mismatch: index has ${metadata.embeddingDimensions}-dim vectors, ` +
240
+ `current backend (${currentBackend}) produces ${currentDimensions}-dim vectors. Reindex required.`,
241
+ };
242
+ }
243
+ // Check model mismatch (different models produce incompatible embeddings)
244
+ if (metadata.embeddingModel && metadata.embeddingModel !== currentModel) {
245
+ return {
246
+ mismatch: true,
247
+ reason: `Embedding model mismatch: index uses '${metadata.embeddingModel}', ` +
248
+ `current backend uses '${currentModel}'. Reindex required.`,
249
+ };
250
+ }
251
+ // Check backend mismatch (even same dimensions may have different embedding spaces)
252
+ if (metadata.embeddingBackend && metadata.embeddingBackend !== currentBackend) {
253
+ return {
254
+ mismatch: true,
255
+ reason: `Embedding backend changed from '${metadata.embeddingBackend}' to '${currentBackend}'. Reindex required.`,
256
+ };
257
+ }
258
+ return { mismatch: false };
259
+ }
260
+ /**
261
+ * Validate index integrity by checking metadata consistency.
262
+ * Returns corruption status and reason if corrupted.
263
+ */
264
+ async validateIndexIntegrity(metadata, actualChunkCount) {
265
+ // No metadata file - possible incomplete indexing
266
+ if (!metadata) {
267
+ return {
268
+ corrupted: true,
269
+ reason: 'Missing index metadata file. Index may be incomplete. Run clear_index followed by index_codebase to rebuild.',
270
+ };
271
+ }
272
+ // Check if chunk count matches
273
+ if (metadata.chunkCount !== actualChunkCount) {
274
+ return {
275
+ corrupted: true,
276
+ reason: `Chunk count mismatch: metadata says ${metadata.chunkCount}, index has ${actualChunkCount}. Run clear_index followed by index_codebase to rebuild.`,
277
+ };
278
+ }
279
+ // Validate checksum if present
280
+ if (metadata.checksum) {
281
+ const storedFiles = await this.getStoredMetadata();
282
+ const fileList = Array.from(storedFiles.keys());
283
+ const computedChecksum = computeIndexChecksum(fileList, actualChunkCount);
284
+ if (computedChecksum !== metadata.checksum) {
285
+ return {
286
+ corrupted: true,
287
+ reason: `Checksum mismatch: file metadata does not match index. Run clear_index followed by index_codebase to rebuild.`,
288
+ };
289
+ }
290
+ }
291
+ return { corrupted: false };
292
+ }
293
+ async indexCodebase(patterns, excludePatterns, forceReindex = false, onProgress, autoRepair = false) {
46
294
  const { glob } = await import('glob');
295
+ const report = (progress) => {
296
+ console.error(`[lance-context] ${progress.message}`);
297
+ onProgress?.(progress);
298
+ };
299
+ // Check for corruption if autoRepair is enabled
300
+ if (autoRepair) {
301
+ const status = await this.getStatus();
302
+ if (status.corrupted) {
303
+ console.error(`[lance-context] Index corruption detected: ${status.corruptionReason}`);
304
+ console.error('[lance-context] Auto-repair enabled, clearing and rebuilding index...');
305
+ await this.clearIndex();
306
+ // Recursively call with forceReindex but without autoRepair to avoid loops
307
+ const result = await this.indexCodebase(patterns, excludePatterns, true, onProgress, false);
308
+ return { ...result, repaired: true };
309
+ }
310
+ }
311
+ // Use provided patterns or fall back to config/defaults
312
+ const effectivePatterns = patterns || this.config?.patterns || getDefaultPatterns();
313
+ const effectiveExcludePatterns = excludePatterns || this.config?.excludePatterns || getDefaultExcludePatterns();
314
+ report({ phase: 'scanning', current: 0, total: 0, message: 'Scanning for files...' });
47
315
  // Find all matching files
48
316
  const files = [];
49
- for (const pattern of patterns) {
317
+ for (const pattern of effectivePatterns) {
50
318
  const matches = await glob(pattern, {
51
319
  cwd: this.projectPath,
52
- ignore: excludePatterns,
320
+ ignore: effectiveExcludePatterns,
53
321
  absolute: true,
54
322
  });
55
323
  files.push(...matches);
56
324
  }
57
- console.error(`[lance-context] Found ${files.length} files to index`);
58
- // Process files into chunks
59
- const allChunks = [];
60
- for (const filePath of files) {
61
- const chunks = await this.chunkFile(filePath);
62
- allChunks.push(...chunks);
325
+ report({
326
+ phase: 'scanning',
327
+ current: files.length,
328
+ total: files.length,
329
+ message: `Found ${files.length} files to index`,
330
+ });
331
+ // Check if we can do incremental indexing
332
+ const tableNames = await this.db.tableNames();
333
+ const hasExistingIndex = tableNames.includes('code_chunks');
334
+ // Check for embedding dimension or model mismatch
335
+ let embeddingMismatch = false;
336
+ if (hasExistingIndex && !forceReindex) {
337
+ const metadata = await this.loadIndexMetadata();
338
+ const currentDimensions = this.embeddingBackend.getDimensions();
339
+ const currentModel = this.embeddingBackend.getModel();
340
+ // Check dimension mismatch
341
+ if (metadata?.embeddingDimensions && metadata.embeddingDimensions !== currentDimensions) {
342
+ console.error(`[lance-context] Embedding dimension mismatch: index has ${metadata.embeddingDimensions}, ` +
343
+ `current backend (${this.embeddingBackend.name}) uses ${currentDimensions}. Forcing full reindex.`);
344
+ embeddingMismatch = true;
345
+ }
346
+ // Check model mismatch (even if dimensions match, different models produce incompatible embeddings)
347
+ if (metadata?.embeddingModel && metadata.embeddingModel !== currentModel) {
348
+ console.error(`[lance-context] Embedding model mismatch: index uses '${metadata.embeddingModel}', ` +
349
+ `current backend uses '${currentModel}'. Forcing full reindex.`);
350
+ embeddingMismatch = true;
351
+ }
352
+ }
353
+ const canDoIncremental = hasExistingIndex && !forceReindex && !embeddingMismatch;
354
+ if (canDoIncremental) {
355
+ return this.indexIncremental(files, onProgress);
63
356
  }
64
- console.error(`[lance-context] Created ${allChunks.length} chunks`);
65
- // Generate embeddings in batches
66
- const batchSize = 32;
67
- for (let i = 0; i < allChunks.length; i += batchSize) {
68
- const batch = allChunks.slice(i, i + batchSize);
69
- const texts = batch.map((c) => c.content);
70
- const embeddings = await this.embeddingBackend.embedBatch(texts);
71
- batch.forEach((chunk, idx) => {
72
- chunk.embedding = embeddings[idx];
357
+ // Full reindex
358
+ return this.indexFull(files, onProgress);
359
+ }
360
+ /**
361
+ * Perform a full reindex of all files
362
+ */
363
+ async indexFull(files, onProgress) {
364
+ const report = (progress) => {
365
+ console.error(`[lance-context] ${progress.message}`);
366
+ onProgress?.(progress);
367
+ };
368
+ // Process files into chunks (parallelized for I/O efficiency)
369
+ report({ phase: 'chunking', current: 0, total: files.length, message: 'Chunking files...' });
370
+ const chunkResults = await mapInBatches(files, async (filepath) => this.chunkFile(filepath), FILE_PROCESSING_CONCURRENCY, (completed, total) => {
371
+ report({
372
+ phase: 'chunking',
373
+ current: completed,
374
+ total,
375
+ message: `Chunked ${completed}/${total} files`,
73
376
  });
74
- console.error(`[lance-context] Embedded ${i + batch.length}/${allChunks.length} chunks`);
75
- }
377
+ });
378
+ const allChunks = chunkResults.flat();
379
+ report({
380
+ phase: 'chunking',
381
+ current: files.length,
382
+ total: files.length,
383
+ message: `Created ${allChunks.length} chunks`,
384
+ });
385
+ // Generate embeddings in batches
386
+ await this.embedChunks(allChunks, onProgress);
76
387
  // Store in LanceDB
77
- const dimensions = this.embeddingBackend.getDimensions();
78
388
  const data = allChunks.map((chunk) => ({
79
389
  id: chunk.id,
80
- filePath: chunk.filePath,
390
+ filepath: chunk.filepath,
81
391
  content: chunk.content,
82
392
  startLine: chunk.startLine,
83
393
  endLine: chunk.endLine,
@@ -90,26 +400,215 @@ export class CodeIndexer {
90
400
  await this.db.dropTable('code_chunks');
91
401
  }
92
402
  this.table = await this.db.createTable('code_chunks', data);
403
+ // Save file metadata for future incremental indexing
404
+ await this.saveFileMetadata(files);
405
+ // Save index metadata with checksum
406
+ await this.saveIndexMetadata(files.length, allChunks.length, files);
93
407
  return {
94
408
  filesIndexed: files.length,
95
409
  chunksCreated: allChunks.length,
410
+ incremental: false,
96
411
  };
97
412
  }
98
- async chunkFile(filePath) {
99
- const content = await fs.readFile(filePath, 'utf-8');
100
- const lines = content.split('\n');
101
- const ext = path.extname(filePath).slice(1);
413
+ /**
414
+ * Perform incremental indexing - only process changed files
415
+ */
416
+ async indexIncremental(files, onProgress) {
417
+ const report = (progress) => {
418
+ console.error(`[lance-context] ${progress.message}`);
419
+ onProgress?.(progress);
420
+ };
421
+ const changes = await this.detectFileChanges(files);
422
+ const filesToProcess = [...changes.added, ...changes.modified];
423
+ const hasChanges = filesToProcess.length > 0 || changes.deleted.length > 0;
424
+ if (!hasChanges) {
425
+ report({
426
+ phase: 'complete',
427
+ current: 0,
428
+ total: 0,
429
+ message: 'No changes detected, index is up to date',
430
+ });
431
+ this.table = await this.db.openTable('code_chunks');
432
+ const count = await this.table.countRows();
433
+ return {
434
+ filesIndexed: 0,
435
+ chunksCreated: count,
436
+ incremental: true,
437
+ };
438
+ }
439
+ report({
440
+ phase: 'scanning',
441
+ current: 0,
442
+ total: filesToProcess.length,
443
+ message: `Incremental update: ${changes.added.length} added, ${changes.modified.length} modified, ${changes.deleted.length} deleted`,
444
+ });
445
+ // Open the existing table
446
+ this.table = await this.db.openTable('code_chunks');
447
+ // Delete chunks from modified and deleted files
448
+ const filesToRemove = [
449
+ ...changes.modified.map((f) => path.relative(this.projectPath, f)),
450
+ ...changes.deleted,
451
+ ];
452
+ if (filesToRemove.length > 0) {
453
+ for (const relativePath of filesToRemove) {
454
+ const sanitizedPath = sanitizePathForFilter(relativePath);
455
+ await this.table.delete(`filepath = '${sanitizedPath}'`);
456
+ }
457
+ report({
458
+ phase: 'chunking',
459
+ current: 0,
460
+ total: filesToProcess.length,
461
+ message: `Removed chunks from ${filesToRemove.length} files`,
462
+ });
463
+ }
464
+ // Process new and modified files (parallelized for I/O efficiency)
465
+ if (filesToProcess.length > 0) {
466
+ const chunkResults = await mapInBatches(filesToProcess, async (filepath) => this.chunkFile(filepath), FILE_PROCESSING_CONCURRENCY, (completed, total) => {
467
+ report({
468
+ phase: 'chunking',
469
+ current: completed,
470
+ total,
471
+ message: `Chunked ${completed}/${total} files`,
472
+ });
473
+ });
474
+ const newChunks = chunkResults.flat();
475
+ report({
476
+ phase: 'chunking',
477
+ current: filesToProcess.length,
478
+ total: filesToProcess.length,
479
+ message: `Created ${newChunks.length} new chunks`,
480
+ });
481
+ // Generate embeddings
482
+ await this.embedChunks(newChunks, onProgress);
483
+ // Add new chunks to the table
484
+ const data = newChunks.map((chunk) => ({
485
+ id: chunk.id,
486
+ filepath: chunk.filepath,
487
+ content: chunk.content,
488
+ startLine: chunk.startLine,
489
+ endLine: chunk.endLine,
490
+ language: chunk.language,
491
+ vector: chunk.embedding,
492
+ }));
493
+ if (data.length > 0) {
494
+ await this.table.add(data);
495
+ }
496
+ }
497
+ // Update file metadata
498
+ const allCurrentFiles = [...changes.unchanged, ...changes.added, ...changes.modified];
499
+ await this.saveFileMetadata(allCurrentFiles);
500
+ const totalChunks = await this.table.countRows();
501
+ // Save index metadata with checksum
502
+ await this.saveIndexMetadata(allCurrentFiles.length, totalChunks, allCurrentFiles);
503
+ return {
504
+ filesIndexed: filesToProcess.length,
505
+ chunksCreated: totalChunks,
506
+ incremental: true,
507
+ };
508
+ }
509
+ /**
510
+ * Generate embeddings for chunks in batches
511
+ */
512
+ async embedChunks(chunks, onProgress) {
513
+ const report = (progress) => {
514
+ console.error(`[lance-context] ${progress.message}`);
515
+ onProgress?.(progress);
516
+ };
517
+ const batchSize = 32;
518
+ for (let i = 0; i < chunks.length; i += batchSize) {
519
+ const batch = chunks.slice(i, i + batchSize);
520
+ const texts = batch.map((c) => c.content);
521
+ const embeddings = await this.embeddingBackend.embedBatch(texts);
522
+ batch.forEach((chunk, idx) => {
523
+ chunk.embedding = embeddings[idx];
524
+ });
525
+ report({
526
+ phase: 'embedding',
527
+ current: i + batch.length,
528
+ total: chunks.length,
529
+ message: `Embedded ${i + batch.length}/${chunks.length} chunks`,
530
+ });
531
+ }
532
+ }
533
+ async chunkFile(filepath) {
534
+ const ext = path.extname(filepath).slice(1);
102
535
  const language = this.getLanguage(ext);
103
- const relativePath = path.relative(this.projectPath, filePath);
536
+ const relativePath = path.relative(this.projectPath, filepath);
537
+ // Try AST-aware chunking for TypeScript/JavaScript
538
+ if (ASTChunker.canParse(filepath)) {
539
+ try {
540
+ return await this.chunkFileWithAST(filepath, relativePath, language);
541
+ }
542
+ catch {
543
+ // Fall back to line-based chunking if AST parsing fails
544
+ console.error(`[lance-context] AST parsing failed for ${relativePath}, falling back to line-based chunking`);
545
+ }
546
+ }
547
+ // Try tree-sitter chunking for other languages (Python, Go, Rust, Java, Kotlin)
548
+ if (TreeSitterChunker.canParse(filepath)) {
549
+ try {
550
+ return await this.chunkFileWithTreeSitter(filepath, relativePath, language);
551
+ }
552
+ catch (error) {
553
+ // Fall back to line-based chunking if tree-sitter parsing fails
554
+ console.error(`[lance-context] Tree-sitter parsing failed for ${relativePath}, falling back to line-based chunking:`, error);
555
+ }
556
+ }
557
+ // Line-based chunking for unsupported languages or as fallback
558
+ return this.chunkFileByLines(filepath, relativePath, language);
559
+ }
560
+ /**
561
+ * Chunk a file using AST-aware parsing
562
+ */
563
+ async chunkFileWithAST(filepath, relativePath, language) {
564
+ const astChunker = new ASTChunker();
565
+ const astChunks = await astChunker.chunkFile(filepath);
566
+ return astChunks.map((chunk) => ({
567
+ id: `${relativePath}:${chunk.startLine}-${chunk.endLine}${chunk.name ? `:${chunk.name}` : ''}`,
568
+ filepath: relativePath,
569
+ content: chunk.content,
570
+ startLine: chunk.startLine,
571
+ endLine: chunk.endLine,
572
+ language,
573
+ symbolType: chunk.type,
574
+ symbolName: chunk.name,
575
+ }));
576
+ }
577
+ /**
578
+ * Chunk a file using tree-sitter AST parsing (Python, Go, Rust, Java, Kotlin)
579
+ */
580
+ async chunkFileWithTreeSitter(filepath, relativePath, language) {
581
+ const treeSitterChunker = new TreeSitterChunker();
582
+ const treeSitterChunks = await treeSitterChunker.chunkFile(filepath);
583
+ return treeSitterChunks.map((chunk) => ({
584
+ id: `${relativePath}:${chunk.startLine}-${chunk.endLine}${chunk.name ? `:${chunk.name}` : ''}`,
585
+ filepath: relativePath,
586
+ content: chunk.content,
587
+ startLine: chunk.startLine,
588
+ endLine: chunk.endLine,
589
+ language,
590
+ symbolType: chunk.type,
591
+ symbolName: chunk.name,
592
+ }));
593
+ }
594
+ /**
595
+ * Chunk a file using line-based splitting (fallback)
596
+ */
597
+ async chunkFileByLines(filepath, relativePath, language) {
598
+ const content = await fs.readFile(filepath, 'utf-8');
599
+ const lines = content.split('\n');
600
+ const chunkingConfig = getChunkingConfig(this.config);
601
+ const chunkSize = chunkingConfig.maxLines;
602
+ const chunkOverlap = chunkingConfig.overlap;
104
603
  const chunks = [];
105
- for (let i = 0; i < lines.length; i += CHUNK_SIZE - CHUNK_OVERLAP) {
106
- const chunkLines = lines.slice(i, i + CHUNK_SIZE);
604
+ for (let i = 0; i < lines.length; i += chunkSize - chunkOverlap) {
605
+ const chunkLines = lines.slice(i, i + chunkSize);
107
606
  const chunkContent = chunkLines.join('\n');
108
607
  if (chunkContent.trim().length === 0)
109
608
  continue;
110
609
  chunks.push({
111
610
  id: `${relativePath}:${i + 1}-${i + chunkLines.length}`,
112
- filePath: relativePath,
611
+ filepath: relativePath,
113
612
  content: chunkContent,
114
613
  startLine: i + 1,
115
614
  endLine: i + chunkLines.length,
@@ -144,30 +643,427 @@ export class CodeIndexer {
144
643
  };
145
644
  return langMap[ext] || ext;
146
645
  }
147
- async search(query, limit = 10) {
646
+ /**
647
+ * Get query embedding from cache or compute it.
648
+ * Uses LRU eviction when cache is full.
649
+ */
650
+ async getQueryEmbedding(query) {
651
+ // Check cache first
652
+ const cached = this.queryEmbeddingCache.get(query);
653
+ if (cached) {
654
+ // Move to end for LRU (delete and re-insert)
655
+ this.queryEmbeddingCache.delete(query);
656
+ this.queryEmbeddingCache.set(query, cached);
657
+ return cached;
658
+ }
659
+ // Compute embedding
660
+ const embedding = await this.embeddingBackend.embed(query);
661
+ // Evict oldest entry if cache is full (first entry in Map)
662
+ if (this.queryEmbeddingCache.size >= QUERY_CACHE_MAX_SIZE) {
663
+ const oldestKey = this.queryEmbeddingCache.keys().next().value;
664
+ if (oldestKey !== undefined) {
665
+ this.queryEmbeddingCache.delete(oldestKey);
666
+ }
667
+ }
668
+ // Store in cache
669
+ this.queryEmbeddingCache.set(query, embedding);
670
+ return embedding;
671
+ }
672
+ /**
673
+ * Check if a filepath matches a glob pattern.
674
+ * Supports negation patterns starting with '!'.
675
+ */
676
+ matchesPathPattern(filepath, pattern) {
677
+ // Handle negation pattern
678
+ if (pattern.startsWith('!')) {
679
+ return !minimatch(filepath, pattern.slice(1));
680
+ }
681
+ return minimatch(filepath, pattern);
682
+ }
683
+ async search(queryOrOptions, limit) {
684
+ // Normalize arguments
685
+ const options = typeof queryOrOptions === 'string'
686
+ ? { query: queryOrOptions, limit: limit ?? 10 }
687
+ : queryOrOptions;
688
+ const { query, limit: resultLimit = 10, pathPattern, languages } = options;
148
689
  if (!this.table) {
149
690
  const status = await this.getStatus();
150
691
  if (!status.indexed) {
151
692
  throw new Error('Codebase not indexed. Run index_codebase first.');
152
693
  }
153
694
  }
154
- const queryEmbedding = await this.embeddingBackend.embed(query);
155
- const results = await this.table.search(queryEmbedding).limit(limit).toArray();
156
- return results.map((r) => ({
157
- id: r.id,
158
- filePath: r.filePath,
159
- content: r.content,
160
- startLine: r.startLine,
161
- endLine: r.endLine,
162
- language: r.language,
695
+ const queryEmbedding = await this.getQueryEmbedding(query);
696
+ const searchConfig = getSearchConfig(this.config);
697
+ // Fetch more results than needed for re-ranking and filtering
698
+ // If we have filters, fetch even more to account for filtered-out results
699
+ const hasFilters = pathPattern !== undefined || (languages && languages.length > 0);
700
+ const fetchMultiplier = hasFilters ? 5 : 3;
701
+ const fetchLimit = Math.min(resultLimit * fetchMultiplier, hasFilters ? 100 : 50);
702
+ const results = await this.table.search(queryEmbedding).limit(fetchLimit).toArray();
703
+ // Apply filters
704
+ let filteredResults = results;
705
+ if (pathPattern) {
706
+ filteredResults = filteredResults.filter((r) => this.matchesPathPattern(r.filepath, pathPattern));
707
+ }
708
+ if (languages && languages.length > 0) {
709
+ const normalizedLanguages = languages.map((l) => l.toLowerCase());
710
+ filteredResults = filteredResults.filter((r) => normalizedLanguages.includes(r.language.toLowerCase()));
711
+ }
712
+ // Hybrid scoring: combine semantic similarity with keyword matching
713
+ const scoredResults = filteredResults.map((r, index) => {
714
+ // Semantic score: inverse of rank (higher is better)
715
+ const semanticScore = 1 - index / fetchLimit;
716
+ // Keyword score: based on query term matches
717
+ const keywordScore = this.calculateKeywordScore(query, r.content, r.filepath);
718
+ // Combined score using configurable weights
719
+ const combinedScore = searchConfig.semanticWeight * semanticScore + searchConfig.keywordWeight * keywordScore;
720
+ return { result: r, score: combinedScore };
721
+ });
722
+ // Sort by combined score and take top results
723
+ scoredResults.sort((a, b) => b.score - a.score);
724
+ return scoredResults.slice(0, resultLimit).map((sr) => ({
725
+ id: sr.result.id,
726
+ filepath: sr.result.filepath,
727
+ content: sr.result.content,
728
+ startLine: sr.result.startLine,
729
+ endLine: sr.result.endLine,
730
+ language: sr.result.language,
731
+ symbolType: sr.result.symbolType,
732
+ symbolName: sr.result.symbolName,
163
733
  }));
164
734
  }
735
+ /**
736
+ * Calculate keyword match score for hybrid search
737
+ */
738
+ calculateKeywordScore(query, content, filepath) {
739
+ const queryTerms = query
740
+ .toLowerCase()
741
+ .split(/\s+/)
742
+ .filter((t) => t.length > 2);
743
+ if (queryTerms.length === 0)
744
+ return 0;
745
+ const contentLower = content.toLowerCase();
746
+ const filepathLower = filepath.toLowerCase();
747
+ let matchCount = 0;
748
+ let exactMatchBonus = 0;
749
+ for (const term of queryTerms) {
750
+ // Check content matches
751
+ if (contentLower.includes(term)) {
752
+ matchCount++;
753
+ // Bonus for exact word match (not just substring)
754
+ const wordBoundaryRegex = new RegExp(`\\b${term}\\b`, 'i');
755
+ if (wordBoundaryRegex.test(content)) {
756
+ exactMatchBonus += 0.5;
757
+ }
758
+ }
759
+ // Bonus for filename/path match
760
+ if (filepathLower.includes(term)) {
761
+ matchCount += 0.5;
762
+ }
763
+ }
764
+ // Normalize score to 0-1 range
765
+ const baseScore = matchCount / queryTerms.length;
766
+ const bonusScore = Math.min(exactMatchBonus / queryTerms.length, 0.5);
767
+ return Math.min(baseScore + bonusScore, 1);
768
+ }
769
+ /**
770
+ * Find code chunks semantically similar to a given code snippet or file location.
771
+ * This is useful for finding duplicate logic, similar implementations, or related code.
772
+ */
773
+ async searchSimilar(options) {
774
+ const { filepath, startLine, endLine, code, limit = 10, threshold = 0, excludeSelf = true, } = options;
775
+ // Validate input first - need either code or filepath
776
+ if (!code && !filepath) {
777
+ throw new Error('Either code or filepath must be provided');
778
+ }
779
+ if (!this.table) {
780
+ const status = await this.getStatus();
781
+ if (!status.indexed) {
782
+ throw new Error('Codebase not indexed. Run index_codebase first.');
783
+ }
784
+ }
785
+ // Get the source code to find similar chunks for
786
+ let sourceCode;
787
+ let sourceId = null;
788
+ if (code) {
789
+ sourceCode = code;
790
+ }
791
+ else {
792
+ // Read from file
793
+ const fullPath = path.join(this.projectPath, filepath);
794
+ const fileContent = await fs.readFile(fullPath, 'utf-8');
795
+ const lines = fileContent.split('\n');
796
+ const start = startLine ? startLine - 1 : 0;
797
+ const end = endLine ? endLine : lines.length;
798
+ sourceCode = lines.slice(start, end).join('\n');
799
+ // Build source ID for exclusion
800
+ if (startLine && endLine) {
801
+ sourceId = `${filepath}:${startLine}-${endLine}`;
802
+ }
803
+ }
804
+ if (!sourceCode.trim()) {
805
+ throw new Error('Source code is empty');
806
+ }
807
+ // Embed the source code
808
+ const sourceEmbedding = await this.embeddingBackend.embed(sourceCode);
809
+ // Search for similar chunks - fetch extra to account for filtering
810
+ const fetchLimit = Math.min((limit + 5) * 2, 100);
811
+ const results = await this.table.search(sourceEmbedding).limit(fetchLimit).toArray();
812
+ // LanceDB returns results sorted by distance (ascending)
813
+ // Convert distance to similarity score (1 - normalized_distance)
814
+ const maxDistance = results.length > 0 ? Math.max(...results.map((r) => r._distance || 0)) : 1;
815
+ const scoredResults = [];
816
+ for (const r of results) {
817
+ // Skip self if requested
818
+ if (excludeSelf && sourceId && r.id === sourceId) {
819
+ continue;
820
+ }
821
+ // Also skip if content is identical (for code-based search)
822
+ if (excludeSelf && code && r.content.trim() === code.trim()) {
823
+ continue;
824
+ }
825
+ // Convert distance to similarity (0 = far, 1 = identical)
826
+ const distance = r._distance || 0;
827
+ const similarity = maxDistance > 0 ? 1 - distance / maxDistance : 1;
828
+ // Apply threshold filter
829
+ if (similarity < threshold) {
830
+ continue;
831
+ }
832
+ scoredResults.push({
833
+ id: r.id,
834
+ filepath: r.filepath,
835
+ content: r.content,
836
+ startLine: r.startLine,
837
+ endLine: r.endLine,
838
+ language: r.language,
839
+ similarity,
840
+ symbolType: r.symbolType,
841
+ symbolName: r.symbolName,
842
+ });
843
+ if (scoredResults.length >= limit) {
844
+ break;
845
+ }
846
+ }
847
+ return scoredResults;
848
+ }
165
849
  async clearIndex() {
166
850
  const tableNames = await this.db.tableNames();
167
851
  if (tableNames.includes('code_chunks')) {
168
852
  await this.db.dropTable('code_chunks');
169
853
  }
170
854
  this.table = null;
855
+ // Clear query embedding cache to prevent stale embeddings
856
+ this.queryEmbeddingCache.clear();
857
+ // Clear clustering metadata
858
+ await this.clearClusteringMetadata();
859
+ }
860
+ get clusteringMetadataPath() {
861
+ return path.join(this.indexPath, 'clustering-metadata.json');
862
+ }
863
+ /**
864
+ * Clear clustering metadata file
865
+ */
866
+ async clearClusteringMetadata() {
867
+ try {
868
+ await fs.unlink(this.clusteringMetadataPath);
869
+ }
870
+ catch {
871
+ // Ignore errors if file doesn't exist
872
+ }
873
+ }
874
+ /**
875
+ * Save clustering result to metadata file
876
+ */
877
+ async saveClusteringMetadata(result) {
878
+ await fs.mkdir(this.indexPath, { recursive: true });
879
+ const data = {
880
+ clusterCount: result.clusterCount,
881
+ clusters: result.clusters,
882
+ // Convert Map to object for JSON serialization
883
+ assignments: Object.fromEntries(result.assignments),
884
+ generatedAt: new Date().toISOString(),
885
+ };
886
+ await fs.writeFile(this.clusteringMetadataPath, JSON.stringify(data, null, 2));
887
+ }
888
+ /**
889
+ * Load clustering result from metadata file
890
+ */
891
+ async loadClusteringMetadata() {
892
+ try {
893
+ const content = await fs.readFile(this.clusteringMetadataPath, 'utf-8');
894
+ const data = JSON.parse(content);
895
+ return {
896
+ clusterCount: data.clusterCount,
897
+ clusters: data.clusters,
898
+ // Convert object back to Map
899
+ assignments: new Map(Object.entries(data.assignments).map(([k, v]) => [k, v])),
900
+ };
901
+ }
902
+ catch {
903
+ return null;
904
+ }
905
+ }
906
+ /**
907
+ * Cluster the indexed codebase into semantic concept areas.
908
+ * Uses k-means clustering on embeddings to discover related code groups.
909
+ */
910
+ async clusterConcepts(options = {}) {
911
+ if (!this.table) {
912
+ const status = await this.getStatus();
913
+ if (!status.indexed) {
914
+ throw new Error('Codebase not indexed. Run index_codebase first.');
915
+ }
916
+ this.table = await this.db.openTable('code_chunks');
917
+ }
918
+ // Fetch all chunks with embeddings
919
+ const rows = await this.table.query().toArray();
920
+ const chunks = rows.map((row) => ({
921
+ id: row.id,
922
+ content: row.content,
923
+ filepath: row.filepath,
924
+ embedding: row.vector,
925
+ symbolName: row.symbolName,
926
+ symbolType: row.symbolType,
927
+ }));
928
+ // Perform clustering
929
+ const result = kMeansClustering(chunks, options);
930
+ // Save to metadata file
931
+ await this.saveClusteringMetadata(result);
932
+ return result;
933
+ }
934
+ /**
935
+ * List all discovered concept clusters.
936
+ * Returns cached clustering result if available, otherwise clusters first.
937
+ */
938
+ async listConcepts(forceRecluster = false) {
939
+ if (!forceRecluster) {
940
+ const cached = await this.loadClusteringMetadata();
941
+ if (cached) {
942
+ return cached.clusters;
943
+ }
944
+ }
945
+ const result = await this.clusterConcepts();
946
+ return result.clusters;
947
+ }
948
+ /**
949
+ * Search for code within a specific concept cluster.
950
+ * Returns chunks that belong to the specified cluster, optionally filtered by query.
951
+ */
952
+ async searchByConcept(conceptId, query, limit = 10) {
953
+ const clustering = await this.loadClusteringMetadata();
954
+ if (!clustering) {
955
+ throw new Error('No clustering data available. Run clusterConcepts first.');
956
+ }
957
+ // Get chunk IDs in this cluster
958
+ const chunkIds = new Set();
959
+ for (const [chunkId, clusterId] of clustering.assignments) {
960
+ if (clusterId === conceptId) {
961
+ chunkIds.add(chunkId);
962
+ }
963
+ }
964
+ if (chunkIds.size === 0) {
965
+ return [];
966
+ }
967
+ if (!this.table) {
968
+ this.table = await this.db.openTable('code_chunks');
969
+ }
970
+ // If query provided, use semantic search and filter to cluster
971
+ if (query) {
972
+ const queryEmbedding = await this.getQueryEmbedding(query);
973
+ const results = await this.table
974
+ .search(queryEmbedding)
975
+ .limit(limit * 3)
976
+ .toArray();
977
+ return results
978
+ .filter((r) => chunkIds.has(r.id))
979
+ .slice(0, limit)
980
+ .map((r) => ({
981
+ id: r.id,
982
+ filepath: r.filepath,
983
+ content: r.content,
984
+ startLine: r.startLine,
985
+ endLine: r.endLine,
986
+ language: r.language,
987
+ symbolType: r.symbolType,
988
+ symbolName: r.symbolName,
989
+ }));
990
+ }
991
+ // Without query, return representative chunks from the cluster
992
+ const cluster = clustering.clusters.find((c) => c.id === conceptId);
993
+ if (!cluster) {
994
+ return [];
995
+ }
996
+ const results = [];
997
+ for (const chunkId of cluster.representativeChunks.slice(0, limit)) {
998
+ // Fetch chunk by ID - LanceDB doesn't have direct ID lookup, so we filter
999
+ const rows = await this.table.query().where(`id = '${chunkId}'`).limit(1).toArray();
1000
+ if (rows.length > 0) {
1001
+ const r = rows[0];
1002
+ results.push({
1003
+ id: r.id,
1004
+ filepath: r.filepath,
1005
+ content: r.content,
1006
+ startLine: r.startLine,
1007
+ endLine: r.endLine,
1008
+ language: r.language,
1009
+ symbolType: r.symbolType,
1010
+ symbolName: r.symbolName,
1011
+ });
1012
+ }
1013
+ }
1014
+ return results;
1015
+ }
1016
+ /**
1017
+ * Generate a comprehensive summary of the codebase including concept areas.
1018
+ */
1019
+ async summarizeCodebase(clusteringOptions) {
1020
+ const status = await this.getStatus();
1021
+ if (!status.indexed) {
1022
+ throw new Error('Codebase not indexed. Run index_codebase first.');
1023
+ }
1024
+ if (!this.table) {
1025
+ this.table = await this.db.openTable('code_chunks');
1026
+ }
1027
+ // Gather language statistics
1028
+ const rows = await this.table.query().toArray();
1029
+ const languageStats = new Map();
1030
+ for (const row of rows) {
1031
+ const lang = row.language;
1032
+ if (!languageStats.has(lang)) {
1033
+ languageStats.set(lang, { fileCount: new Set(), chunkCount: 0 });
1034
+ }
1035
+ const stats = languageStats.get(lang);
1036
+ stats.fileCount.add(row.filepath);
1037
+ stats.chunkCount++;
1038
+ }
1039
+ const languages = Array.from(languageStats.entries())
1040
+ .map(([language, stats]) => ({
1041
+ language,
1042
+ fileCount: stats.fileCount.size,
1043
+ chunkCount: stats.chunkCount,
1044
+ }))
1045
+ .sort((a, b) => b.chunkCount - a.chunkCount);
1046
+ // Perform clustering
1047
+ const chunks = rows.map((row) => ({
1048
+ id: row.id,
1049
+ content: row.content,
1050
+ filepath: row.filepath,
1051
+ embedding: row.vector,
1052
+ symbolName: row.symbolName,
1053
+ symbolType: row.symbolType,
1054
+ }));
1055
+ const clusteringResult = kMeansClustering(chunks, clusteringOptions);
1056
+ await this.saveClusteringMetadata(clusteringResult);
1057
+ // Calculate clustering quality
1058
+ const silhouetteScore = calculateSilhouetteScore(chunks, clusteringResult.assignments, clusteringResult.clusters);
1059
+ return {
1060
+ totalFiles: status.fileCount,
1061
+ totalChunks: status.chunkCount,
1062
+ languages,
1063
+ concepts: clusteringResult.clusters,
1064
+ clusteringQuality: silhouetteScore,
1065
+ generatedAt: new Date().toISOString(),
1066
+ };
171
1067
  }
172
1068
  }
173
1069
  //# sourceMappingURL=indexer.js.map