@matperez/coderag 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/README.md +154 -0
  2. package/dist/.tsbuildinfo +1 -0
  3. package/dist/ast-chunking.d.ts +40 -0
  4. package/dist/ast-chunking.d.ts.map +1 -0
  5. package/dist/ast-chunking.js +88 -0
  6. package/dist/ast-chunking.js.map +1 -0
  7. package/dist/ast-chunking.test.d.ts +5 -0
  8. package/dist/ast-chunking.test.d.ts.map +1 -0
  9. package/dist/ast-chunking.test.js +173 -0
  10. package/dist/ast-chunking.test.js.map +1 -0
  11. package/dist/code-tokenizer.d.ts +62 -0
  12. package/dist/code-tokenizer.d.ts.map +1 -0
  13. package/dist/code-tokenizer.js +129 -0
  14. package/dist/code-tokenizer.js.map +1 -0
  15. package/dist/code-tokenizer.test.d.ts +5 -0
  16. package/dist/code-tokenizer.test.d.ts.map +1 -0
  17. package/dist/code-tokenizer.test.js +96 -0
  18. package/dist/code-tokenizer.test.js.map +1 -0
  19. package/dist/db/client-pg.d.ts +16 -0
  20. package/dist/db/client-pg.d.ts.map +1 -0
  21. package/dist/db/client-pg.js +38 -0
  22. package/dist/db/client-pg.js.map +1 -0
  23. package/dist/db/client.d.ts +36 -0
  24. package/dist/db/client.d.ts.map +1 -0
  25. package/dist/db/client.js +81 -0
  26. package/dist/db/client.js.map +1 -0
  27. package/dist/db/migrations-pg.d.ts +6 -0
  28. package/dist/db/migrations-pg.d.ts.map +1 -0
  29. package/dist/db/migrations-pg.js +88 -0
  30. package/dist/db/migrations-pg.js.map +1 -0
  31. package/dist/db/migrations.d.ts +9 -0
  32. package/dist/db/migrations.d.ts.map +1 -0
  33. package/dist/db/migrations.js +164 -0
  34. package/dist/db/migrations.js.map +1 -0
  35. package/dist/db/schema-pg.d.ts +611 -0
  36. package/dist/db/schema-pg.d.ts.map +1 -0
  37. package/dist/db/schema-pg.js +66 -0
  38. package/dist/db/schema-pg.js.map +1 -0
  39. package/dist/db/schema.d.ts +630 -0
  40. package/dist/db/schema.d.ts.map +1 -0
  41. package/dist/db/schema.js +85 -0
  42. package/dist/db/schema.js.map +1 -0
  43. package/dist/embeddings.d.ts +92 -0
  44. package/dist/embeddings.d.ts.map +1 -0
  45. package/dist/embeddings.js +275 -0
  46. package/dist/embeddings.js.map +1 -0
  47. package/dist/embeddings.test.d.ts +5 -0
  48. package/dist/embeddings.test.d.ts.map +1 -0
  49. package/dist/embeddings.test.js +255 -0
  50. package/dist/embeddings.test.js.map +1 -0
  51. package/dist/hybrid-search.d.ts +47 -0
  52. package/dist/hybrid-search.d.ts.map +1 -0
  53. package/dist/hybrid-search.js +215 -0
  54. package/dist/hybrid-search.js.map +1 -0
  55. package/dist/hybrid-search.test.d.ts +5 -0
  56. package/dist/hybrid-search.test.d.ts.map +1 -0
  57. package/dist/hybrid-search.test.js +252 -0
  58. package/dist/hybrid-search.test.js.map +1 -0
  59. package/dist/incremental-tfidf.d.ts +77 -0
  60. package/dist/incremental-tfidf.d.ts.map +1 -0
  61. package/dist/incremental-tfidf.js +248 -0
  62. package/dist/incremental-tfidf.js.map +1 -0
  63. package/dist/incremental-tfidf.test.d.ts +5 -0
  64. package/dist/incremental-tfidf.test.d.ts.map +1 -0
  65. package/dist/incremental-tfidf.test.js +276 -0
  66. package/dist/incremental-tfidf.test.js.map +1 -0
  67. package/dist/index.d.ts +18 -0
  68. package/dist/index.d.ts.map +1 -0
  69. package/dist/index.js +19 -0
  70. package/dist/index.js.map +1 -0
  71. package/dist/indexer.d.ts +205 -0
  72. package/dist/indexer.d.ts.map +1 -0
  73. package/dist/indexer.js +1331 -0
  74. package/dist/indexer.js.map +1 -0
  75. package/dist/indexer.test.d.ts +12 -0
  76. package/dist/indexer.test.d.ts.map +1 -0
  77. package/dist/indexer.test.js +471 -0
  78. package/dist/indexer.test.js.map +1 -0
  79. package/dist/language-config.d.ts +54 -0
  80. package/dist/language-config.d.ts.map +1 -0
  81. package/dist/language-config.js +75 -0
  82. package/dist/language-config.js.map +1 -0
  83. package/dist/search-cache.d.ts +63 -0
  84. package/dist/search-cache.d.ts.map +1 -0
  85. package/dist/search-cache.js +118 -0
  86. package/dist/search-cache.js.map +1 -0
  87. package/dist/search-cache.test.d.ts +5 -0
  88. package/dist/search-cache.test.d.ts.map +1 -0
  89. package/dist/search-cache.test.js +194 -0
  90. package/dist/search-cache.test.js.map +1 -0
  91. package/dist/storage-factory.d.ts +11 -0
  92. package/dist/storage-factory.d.ts.map +1 -0
  93. package/dist/storage-factory.js +17 -0
  94. package/dist/storage-factory.js.map +1 -0
  95. package/dist/storage-persistent-pg.d.ts +75 -0
  96. package/dist/storage-persistent-pg.d.ts.map +1 -0
  97. package/dist/storage-persistent-pg.js +579 -0
  98. package/dist/storage-persistent-pg.js.map +1 -0
  99. package/dist/storage-persistent-pg.test.d.ts +7 -0
  100. package/dist/storage-persistent-pg.test.d.ts.map +1 -0
  101. package/dist/storage-persistent-pg.test.js +90 -0
  102. package/dist/storage-persistent-pg.test.js.map +1 -0
  103. package/dist/storage-persistent-types.d.ts +110 -0
  104. package/dist/storage-persistent-types.d.ts.map +1 -0
  105. package/dist/storage-persistent-types.js +5 -0
  106. package/dist/storage-persistent-types.js.map +1 -0
  107. package/dist/storage-persistent.d.ts +231 -0
  108. package/dist/storage-persistent.d.ts.map +1 -0
  109. package/dist/storage-persistent.js +897 -0
  110. package/dist/storage-persistent.js.map +1 -0
  111. package/dist/storage-persistent.test.d.ts +5 -0
  112. package/dist/storage-persistent.test.d.ts.map +1 -0
  113. package/dist/storage-persistent.test.js +325 -0
  114. package/dist/storage-persistent.test.js.map +1 -0
  115. package/dist/storage.d.ts +63 -0
  116. package/dist/storage.d.ts.map +1 -0
  117. package/dist/storage.js +67 -0
  118. package/dist/storage.js.map +1 -0
  119. package/dist/storage.test.d.ts +5 -0
  120. package/dist/storage.test.d.ts.map +1 -0
  121. package/dist/storage.test.js +157 -0
  122. package/dist/storage.test.js.map +1 -0
  123. package/dist/tfidf.d.ts +97 -0
  124. package/dist/tfidf.d.ts.map +1 -0
  125. package/dist/tfidf.js +308 -0
  126. package/dist/tfidf.js.map +1 -0
  127. package/dist/tfidf.test.d.ts +5 -0
  128. package/dist/tfidf.test.d.ts.map +1 -0
  129. package/dist/tfidf.test.js +181 -0
  130. package/dist/tfidf.test.js.map +1 -0
  131. package/dist/utils.d.ts +61 -0
  132. package/dist/utils.d.ts.map +1 -0
  133. package/dist/utils.js +264 -0
  134. package/dist/utils.js.map +1 -0
  135. package/dist/utils.test.d.ts +5 -0
  136. package/dist/utils.test.d.ts.map +1 -0
  137. package/dist/utils.test.js +94 -0
  138. package/dist/utils.test.js.map +1 -0
  139. package/dist/vector-storage.d.ts +120 -0
  140. package/dist/vector-storage.d.ts.map +1 -0
  141. package/dist/vector-storage.js +264 -0
  142. package/dist/vector-storage.js.map +1 -0
  143. package/dist/vector-storage.test.d.ts +5 -0
  144. package/dist/vector-storage.test.d.ts.map +1 -0
  145. package/dist/vector-storage.test.js +345 -0
  146. package/dist/vector-storage.test.js.map +1 -0
  147. package/package.json +85 -0
package/dist/storage-persistent.js
@@ -0,0 +1,897 @@
+ /**
+  * Persistent storage implementation using SQLite + Drizzle ORM (LibSQL WASM-compatible)
+  * Now supports chunk-level indexing for better search granularity
+  */
+ import { eq, sql } from 'drizzle-orm';
+ import { createDb } from './db/client.js';
+ import { runMigrations } from './db/migrations.js';
+ import * as schema from './db/schema.js';
+ export class PersistentStorage {
+     dbInstance;
+     initPromise;
+     useBulkInsertChunks;
+     constructor(config = {}) {
+         this.useBulkInsertChunks = config.useBulkInsertChunks ?? false;
+         this.initPromise = this.initialize(config);
+     }
+     async initialize(config) {
+         this.dbInstance = await createDb(config);
+         await runMigrations(this.dbInstance.client);
+     }
+     /**
+      * Ensure database is initialized before operations
+      */
+     async ensureInit() {
+         await this.initPromise;
+     }
+     /**
+      * Get the LibSQL client for raw SQL operations
+      */
+     get client() {
+         return this.dbInstance.client;
+     }
+     /**
+      * Store a file
+      */
+     async storeFile(file) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const mtime = typeof file.mtime === 'number' ? file.mtime : file.mtime.getTime();
+         const values = {
+             path: file.path,
+             content: file.content,
+             hash: file.hash,
+             size: file.size,
+             mtime,
+             ...(file.language ? { language: file.language } : {}),
+             indexedAt: Date.now(),
+         };
+         await db
+             .insert(schema.files)
+             .values(values)
+             .onConflictDoUpdate({
+                 target: schema.files.path,
+                 set: {
+                     content: values.content,
+                     hash: values.hash,
+                     size: values.size,
+                     mtime: values.mtime,
+                     ...(values.language ? { language: values.language } : {}),
+                     indexedAt: values.indexedAt,
+                 },
+             });
+     }
+     /**
+      * Store multiple files in a single transaction (batch operation)
+      * Much faster than storing one by one for large datasets
+      */
+     async storeFiles(files) {
+         if (files.length === 0) {
+             return;
+         }
+         await this.ensureInit();
+         // LibSQL supports batch transactions
+         await this.client.batch(files.map((file) => {
+             const mtime = typeof file.mtime === 'number' ? file.mtime : file.mtime.getTime();
+             return {
+                 sql: `INSERT INTO files (path, content, hash, size, mtime, language, indexed_at)
+                       VALUES (?, ?, ?, ?, ?, ?, ?)
+                       ON CONFLICT(path) DO UPDATE SET
+                         content = excluded.content,
+                         hash = excluded.hash,
+                         size = excluded.size,
+                         mtime = excluded.mtime,
+                         language = excluded.language,
+                         indexed_at = excluded.indexed_at`,
+                 args: [
+                     file.path,
+                     file.content,
+                     file.hash,
+                     file.size,
+                     mtime,
+                     file.language || null,
+                     Date.now(),
+                 ],
+             };
+         }), 'write');
+     }
+     /**
+      * Get a file by path
+      */
+     async getFile(path) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const result = await db.select().from(schema.files).where(eq(schema.files.path, path)).get();
+         if (!result) {
+             return null;
+         }
+         return {
+             path: result.path,
+             content: result.content,
+             hash: result.hash,
+             size: result.size,
+             mtime: result.mtime,
+             language: result.language || undefined,
+         };
+     }
+     /**
+      * Get all files
+      */
+     async getAllFiles() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const results = await db.select().from(schema.files).all();
+         return results.map((file) => ({
+             path: file.path,
+             content: file.content,
+             hash: file.hash,
+             size: file.size,
+             mtime: file.mtime,
+             language: file.language || undefined,
+         }));
+     }
+     /**
+      * Get path -> hash for all files (for skip-unchanged during full index)
+      */
+     async getFileHashes() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const results = await db
+             .select({ path: schema.files.path, hash: schema.files.hash })
+             .from(schema.files)
+             .all();
+         const map = new Map();
+         for (const row of results)
+             map.set(row.path, row.hash);
+         return map;
+     }
+     /**
+      * Delete a file
+      */
+     async deleteFile(path) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         await db.delete(schema.files).where(eq(schema.files.path, path));
+     }
+     /**
+      * Clear all files
+      */
+     async clear() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         await db.delete(schema.chunks);
+         await db.delete(schema.files);
+         await db.delete(schema.documentVectors);
+         await db.delete(schema.idfScores);
+         await db.delete(schema.indexMetadata);
+     }
+     // ============ CHUNK METHODS ============
+     /**
+      * Store chunks for a file (replaces existing chunks)
+      */
+     async storeChunks(filePath, chunks) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         // Get file ID
+         const file = await db.select().from(schema.files).where(eq(schema.files.path, filePath)).get();
+         if (!file) {
+             throw new Error(`File not found: ${filePath}`);
+         }
+         // Delete existing chunks for this file
+         await db.delete(schema.chunks).where(eq(schema.chunks.fileId, file.id));
+         // Insert new chunks
+         const chunkIds = [];
+         for (const chunk of chunks) {
+             const insertValues = {
+                 fileId: file.id,
+                 content: chunk.content,
+                 type: chunk.type,
+                 startLine: chunk.startLine,
+                 endLine: chunk.endLine,
+             };
+             if (chunk.metadata) {
+                 insertValues.metadata = JSON.stringify(chunk.metadata);
+             }
+             const result = await db
+                 .insert(schema.chunks)
+                 .values(insertValues)
+                 .returning({ id: schema.chunks.id });
+             if (result[0]) {
+                 chunkIds.push(result[0].id);
+             }
+         }
+         return chunkIds;
+     }
+     /**
+      * Store chunks for multiple files in batch
+      */
+     async storeManyChunks(fileChunks) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const result = new Map();
+         if (fileChunks.length === 0) {
+             return result;
+         }
+         // Get file IDs
+         const filePaths = fileChunks.map((fc) => fc.filePath);
+         const files = await db
+             .select({ id: schema.files.id, path: schema.files.path })
+             .from(schema.files)
+             .where(sql `${schema.files.path} IN (${sql.join(filePaths.map((p) => sql `${p}`), sql `, `)})`)
+             .all();
+         const fileIdMap = new Map();
+         for (const file of files) {
+             fileIdMap.set(file.path, file.id);
+         }
+         // Delete existing chunks for these files
+         const fileIds = Array.from(fileIdMap.values());
+         if (fileIds.length > 0) {
+             await db.delete(schema.chunks).where(sql `${schema.chunks.fileId} IN (${sql.join(fileIds.map((id) => sql `${id}`), sql `, `)})`);
+         }
+         if (!this.useBulkInsertChunks) {
+             for (const fc of fileChunks) {
+                 const fileId = fileIdMap.get(fc.filePath);
+                 if (!fileId)
+                     continue;
+                 const ids = [];
+                 for (const chunk of fc.chunks) {
+                     const insertResult = await db
+                         .insert(schema.chunks)
+                         .values({
+                             fileId,
+                             content: chunk.content,
+                             type: chunk.type,
+                             startLine: chunk.startLine,
+                             endLine: chunk.endLine,
+                             metadata: chunk.metadata ? JSON.stringify(chunk.metadata) : null,
+                         })
+                         .returning({ id: schema.chunks.id });
+                     const row = Array.isArray(insertResult) ? insertResult[0] : insertResult;
+                     if (row?.id != null)
+                         ids.push(row.id);
+                 }
+                 result.set(fc.filePath, ids);
+             }
+             return result;
+         }
+         // Bulk insert (SQLite ~999 bind limit, 6 fields/row → batch 150)
+         const CHUNK_INSERT_BATCH_SIZE = 150;
+         const flatRows = [];
+         const countsPerFile = [];
+         for (const fc of fileChunks) {
+             const fileId = fileIdMap.get(fc.filePath);
+             if (!fileId) {
+                 countsPerFile.push(0);
+                 continue;
+             }
+             countsPerFile.push(fc.chunks.length);
+             for (const chunk of fc.chunks) {
+                 flatRows.push({
+                     fileId,
+                     content: chunk.content,
+                     type: chunk.type,
+                     startLine: chunk.startLine,
+                     endLine: chunk.endLine,
+                     metadata: chunk.metadata ? JSON.stringify(chunk.metadata) : null,
+                 });
+             }
+         }
+         const allIds = [];
+         for (let i = 0; i < flatRows.length; i += CHUNK_INSERT_BATCH_SIZE) {
+             const batch = flatRows.slice(i, i + CHUNK_INSERT_BATCH_SIZE);
+             const insertResult = await db
+                 .insert(schema.chunks)
+                 .values(batch)
+                 .returning({ id: schema.chunks.id });
+             const rows = Array.isArray(insertResult) ? insertResult : [insertResult];
+             for (const row of rows)
+                 if (row?.id != null)
+                     allIds.push(row.id);
+         }
+         let offset = 0;
+         for (let i = 0; i < fileChunks.length; i++) {
+             const count = countsPerFile[i];
+             if (count === 0)
+                 continue;
+             result.set(fileChunks[i].filePath, allIds.slice(offset, offset + count));
+             offset += count;
+         }
+         return result;
+     }
+     /**
+      * Get chunks for a file
+      */
+     async getChunksForFile(filePath) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const results = await db
+             .select({
+                 id: schema.chunks.id,
+                 fileId: schema.chunks.fileId,
+                 content: schema.chunks.content,
+                 type: schema.chunks.type,
+                 startLine: schema.chunks.startLine,
+                 endLine: schema.chunks.endLine,
+                 metadata: schema.chunks.metadata,
+                 filePath: schema.files.path,
+             })
+             .from(schema.chunks)
+             .innerJoin(schema.files, eq(schema.chunks.fileId, schema.files.id))
+             .where(eq(schema.files.path, filePath))
+             .all();
+         return results.map((r) => ({
+             id: r.id,
+             fileId: r.fileId,
+             filePath: r.filePath,
+             content: r.content,
+             type: r.type,
+             startLine: r.startLine,
+             endLine: r.endLine,
+             metadata: r.metadata ? JSON.parse(r.metadata) : undefined,
+         }));
+     }
+     /**
+      * Get total chunk count
+      */
+     async getChunkCount() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const result = await db.select({ count: sql `count(*)` }).from(schema.chunks).get();
+         return result?.count || 0;
+     }
+     /**
+      * Get file count
+      */
+     async count() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const result = await db.select({ count: sql `count(*)` }).from(schema.files).get();
+         return result?.count || 0;
+     }
+     /**
+      * Check if file exists
+      */
+     async exists(path) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const result = await db.select().from(schema.files).where(eq(schema.files.path, path)).get();
+         return result !== undefined;
+     }
+     /**
+      * Store document vectors (TF-IDF) for a CHUNK
+      */
+     async storeChunkVectors(chunkId, terms, tokenCount) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         // Delete existing vectors for this chunk
+         await db.delete(schema.documentVectors).where(eq(schema.documentVectors.chunkId, chunkId));
+         // Update token count if provided
+         if (tokenCount !== undefined) {
+             await this.client.execute({
+                 sql: 'UPDATE chunks SET token_count = ? WHERE id = ?',
+                 args: [tokenCount, chunkId],
+             });
+         }
+         // Insert new vectors in batches (SQLite has ~999 bind variable limit, 5 fields per row = 199 rows)
+         const BATCH_SIZE = 199;
+         const vectors = Array.from(terms.entries()).map(([term, scores]) => ({
+             chunkId,
+             term,
+             tf: scores.tf,
+             tfidf: scores.tfidf,
+             rawFreq: scores.rawFreq,
+         }));
+         for (let i = 0; i < vectors.length; i += BATCH_SIZE) {
+             const batch = vectors.slice(i, i + BATCH_SIZE);
+             await db.insert(schema.documentVectors).values(batch);
+         }
+     }
+     /**
+      * Store document vectors for multiple CHUNKS in a single transaction (batch operation)
+      * Much faster than storing one by one for large datasets
+      */
+     async storeManyChunkVectors(chunkVectors) {
+         if (chunkVectors.length === 0) {
+             return;
+         }
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         // Delete all existing vectors for these chunks
+         const chunkIds = chunkVectors.map((cv) => cv.chunkId);
+         if (chunkIds.length > 0) {
+             // Delete in batches to avoid SQLite variable limits
+             const deleteBatchSize = 500;
+             for (let i = 0; i < chunkIds.length; i += deleteBatchSize) {
+                 const batch = chunkIds.slice(i, i + deleteBatchSize);
+                 await db.delete(schema.documentVectors).where(sql `${schema.documentVectors.chunkId} IN (${sql.join(batch.map((id) => sql `${id}`), sql `, `)})`);
+             }
+         }
+         // Prepare all vectors for batch insert
+         const allVectors = [];
+         // Track token counts for BM25
+         const tokenCountUpdates = [];
+         for (const cv of chunkVectors) {
+             // Track token count for BM25 document length normalization
+             if (cv.tokenCount !== undefined) {
+                 tokenCountUpdates.push({ chunkId: cv.chunkId, tokenCount: cv.tokenCount });
+             }
+             for (const [term, scores] of cv.terms.entries()) {
+                 allVectors.push({
+                     chunkId: cv.chunkId,
+                     term,
+                     tf: scores.tf,
+                     tfidf: scores.tfidf,
+                     rawFreq: scores.rawFreq,
+                 });
+             }
+         }
+         // Update token counts for BM25 using batch
+         if (tokenCountUpdates.length > 0) {
+             await this.client.batch(tokenCountUpdates.map(({ chunkId, tokenCount }) => ({
+                 sql: 'UPDATE chunks SET token_count = ? WHERE id = ?',
+                 args: [tokenCount, chunkId],
+             })), 'write');
+         }
+         // Insert in batches to avoid SQLite variable limits (5 fields per row = 199 rows max)
+         const batchSize = 199;
+         for (let i = 0; i < allVectors.length; i += batchSize) {
+             const batch = allVectors.slice(i, i + batchSize);
+             if (batch.length > 0) {
+                 await db.insert(schema.documentVectors).values(batch);
+             }
+         }
+     }
+     /**
+      * Store IDF scores
+      */
+     async storeIdfScores(idf, docFreq) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         // Clear existing IDF scores
+         await db.delete(schema.idfScores);
+         // Insert new scores in batches (SQLite has ~999 bind variable limit, 3 fields per row = 300 rows)
+         const BATCH_SIZE = 300;
+         const scores = Array.from(idf.entries()).map(([term, idfScore]) => ({
+             term,
+             idf: idfScore,
+             documentFrequency: docFreq.get(term) || 0,
+         }));
+         for (let i = 0; i < scores.length; i += BATCH_SIZE) {
+             const batch = scores.slice(i, i + BATCH_SIZE);
+             await db.insert(schema.idfScores).values(batch);
+         }
+     }
+     /**
+      * Get IDF scores
+      */
+     async getIdfScores() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const scores = await db.select().from(schema.idfScores).all();
+         const idf = new Map();
+         for (const score of scores) {
+             idf.set(score.term, score.idf);
+         }
+         return idf;
+     }
+     /**
+      * Get document vectors for a chunk
+      */
+     async getChunkVectors(chunkId) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const vectors = await db
+             .select()
+             .from(schema.documentVectors)
+             .where(eq(schema.documentVectors.chunkId, chunkId))
+             .all();
+         if (vectors.length === 0) {
+             return null;
+         }
+         const terms = new Map();
+         for (const vector of vectors) {
+             terms.set(vector.term, {
+                 tf: vector.tf,
+                 tfidf: vector.tfidf,
+                 rawFreq: vector.rawFreq,
+             });
+         }
+         return terms;
+     }
+     /**
+      * Get all chunk vectors in a single batch query (CPU + Memory optimization)
+      * Avoids N+1 query pattern when loading index from storage
+      * Returns Map<chunkId, Map<term, {tf, tfidf, rawFreq}>>
+      */
+     async getAllChunkVectors() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         // Single query to get all vectors
+         const results = await db
+             .select({
+                 chunkId: schema.documentVectors.chunkId,
+                 term: schema.documentVectors.term,
+                 tf: schema.documentVectors.tf,
+                 tfidf: schema.documentVectors.tfidf,
+                 rawFreq: schema.documentVectors.rawFreq,
+             })
+             .from(schema.documentVectors)
+             .all();
+         // Group by chunk ID
+         const allVectors = new Map();
+         for (const row of results) {
+             let chunkVectors = allVectors.get(row.chunkId);
+             if (!chunkVectors) {
+                 chunkVectors = new Map();
+                 allVectors.set(row.chunkId, chunkVectors);
+             }
+             chunkVectors.set(row.term, {
+                 tf: row.tf,
+                 tfidf: row.tfidf,
+                 rawFreq: row.rawFreq,
+             });
+         }
+         return allVectors;
+     }
+     /**
+      * Search chunks by terms using SQL (Memory optimization)
+      * Returns matching chunks with their content for direct display
+      * Uses pre-computed magnitude from chunks table
+      */
+     async searchByTerms(queryTerms, options = {}) {
+         if (queryTerms.length === 0) {
+             return [];
+         }
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const { limit = 100 } = options;
+         // Step 1: Find chunk IDs that contain any query term, with pre-computed magnitude and token count
+         const matchingChunks = await db
+             .select({
+                 chunkId: schema.documentVectors.chunkId,
+                 filePath: schema.files.path,
+                 content: schema.chunks.content,
+                 type: schema.chunks.type,
+                 startLine: schema.chunks.startLine,
+                 endLine: schema.chunks.endLine,
+                 magnitude: schema.chunks.magnitude,
+                 tokenCount: schema.chunks.tokenCount,
+                 matchCount: sql `COUNT(DISTINCT ${schema.documentVectors.term})`,
+             })
+             .from(schema.documentVectors)
+             .innerJoin(schema.chunks, eq(schema.documentVectors.chunkId, schema.chunks.id))
+             .innerJoin(schema.files, eq(schema.chunks.fileId, schema.files.id))
+             .where(sql `${schema.documentVectors.term} IN (${sql.join(queryTerms.map((t) => sql `${t}`), sql `, `)})`)
+             .groupBy(schema.documentVectors.chunkId)
+             .orderBy(sql `COUNT(DISTINCT ${schema.documentVectors.term}) DESC`)
+             .limit(limit * 2) // Get more candidates for scoring
+             .all();
+         if (matchingChunks.length === 0) {
+             return [];
+         }
+         // Step 2: Get matched term vectors for these chunks (only query terms)
+         const chunkIds = matchingChunks.map((c) => c.chunkId);
+         const matchedVectors = await db
+             .select({
+                 chunkId: schema.documentVectors.chunkId,
+                 term: schema.documentVectors.term,
+                 tfidf: schema.documentVectors.tfidf,
+                 rawFreq: schema.documentVectors.rawFreq,
+             })
+             .from(schema.documentVectors)
+             .where(sql `${schema.documentVectors.chunkId} IN (${sql.join(chunkIds.map((id) => sql `${id}`), sql `, `)}) AND ${schema.documentVectors.term} IN (${sql.join(queryTerms.map((t) => sql `${t}`), sql `, `)})`)
+             .all();
+         // Build result map with pre-computed magnitude and token count
+         const resultMap = new Map();
+         // Initialize result entries with chunk data
+         for (const c of matchingChunks) {
+             resultMap.set(c.chunkId, {
+                 chunkId: c.chunkId,
+                 filePath: c.filePath,
+                 content: c.content,
+                 type: c.type,
+                 startLine: c.startLine,
+                 endLine: c.endLine,
+                 matchedTerms: new Map(),
+                 magnitude: c.magnitude ?? 0,
+                 tokenCount: c.tokenCount ?? 0,
+             });
+         }
+         // Populate matched terms
+         for (const v of matchedVectors) {
+             const entry = resultMap.get(v.chunkId);
+             if (entry) {
+                 entry.matchedTerms.set(v.term, { tfidf: v.tfidf, rawFreq: v.rawFreq });
+             }
+         }
+         return Array.from(resultMap.values());
+     }
+     /**
+      * Get IDF scores for specific terms only (Memory optimization)
+      */
+     async getIdfScoresForTerms(terms) {
+         if (terms.length === 0) {
+             return new Map();
+         }
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const scores = await db
+             .select()
+             .from(schema.idfScores)
+             .where(sql `${schema.idfScores.term} IN (${sql.join(terms.map((t) => sql `${t}`), sql `, `)})`)
+             .all();
+         const idf = new Map();
+         for (const score of scores) {
+             idf.set(score.term, score.idf);
+         }
+         return idf;
+     }
+     /**
+      * Get total chunk count (for IDF calculation)
+      * BM25/TF-IDF now operates at chunk level, not file level
+      */
+     async getTotalDocuments() {
+         return this.getChunkCount();
+     }
+     /**
+      * Get all file metadata (path, mtime, hash) without content
+      * Used for incremental diff detection
+      */
+     async getAllFileMetadata() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const results = await db
+             .select({
+                 path: schema.files.path,
+                 mtime: schema.files.mtime,
+                 hash: schema.files.hash,
+             })
+             .from(schema.files)
+             .all();
+         const metadata = new Map();
+         for (const row of results) {
+             metadata.set(row.path, { mtime: row.mtime, hash: row.hash });
+         }
+         return metadata;
+     }
+     /**
+      * Delete multiple files in a single transaction (batch operation)
+      */
+     async deleteFiles(paths) {
+         if (paths.length === 0) {
+             return;
+         }
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         // Delete in chunks to avoid SQLite variable limits
+         const chunkSize = 500;
+         for (let i = 0; i < paths.length; i += chunkSize) {
+             const chunk = paths.slice(i, i + chunkSize);
+             await db.delete(schema.files).where(sql `${schema.files.path} IN (${sql.join(chunk.map((p) => sql `${p}`), sql `, `)})`);
+         }
+     }
+     /**
+      * Store metadata
+      */
+     async setMetadata(key, value) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         await db
+             .insert(schema.indexMetadata)
+             .values({
+                 key,
+                 value,
+                 updatedAt: Date.now(),
+             })
+             .onConflictDoUpdate({
+                 target: schema.indexMetadata.key,
+                 set: {
+                     value,
+                     updatedAt: Date.now(),
+                 },
+             });
+     }
+     /**
+      * Get metadata
+      */
+     async getMetadata(key) {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const result = await db
+             .select()
+             .from(schema.indexMetadata)
+             .where(eq(schema.indexMetadata.key, key))
+             .get();
+         return result?.value || null;
+     }
+     /**
+      * Get average chunk length (token count) for BM25 scoring
+      * Returns cached value from metadata if available, otherwise calculates from chunks table
+      */
+     async getAverageDocLength() {
+         await this.ensureInit();
+         // Try to get cached value first
+         const cached = await this.getMetadata('avgDocLength');
+         if (cached) {
+             return parseFloat(cached);
+         }
+         // Calculate from chunks table
+         const { db } = this.dbInstance;
+         const result = await db
+             .select({
+                 avgLen: sql `AVG(COALESCE(${schema.chunks.tokenCount}, 0))`,
+             })
+             .from(schema.chunks)
+             .get();
+         const avgLen = result?.avgLen || 0;
+         // Cache the result
+         await this.setMetadata('avgDocLength', avgLen.toString());
+         return avgLen;
+     }
+     /**
+      * Update average chunk length in metadata (call after indexing)
+      */
+     async updateAverageDocLength() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const result = await db
+             .select({
+                 avgLen: sql `AVG(COALESCE(${schema.chunks.tokenCount}, 0))`,
+             })
+             .from(schema.chunks)
+             .get();
+         const avgLen = result?.avgLen || 0;
+         await this.setMetadata('avgDocLength', avgLen.toString());
+         return avgLen;
+     }
+     /**
+      * Rebuild IDF scores from document vectors using SQL (Memory optimization)
+      * Calculates document frequency for each term across CHUNKS and computes IDF
+      */
+     async rebuildIdfScoresFromVectors() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         // Get total chunk count (IDF is calculated per chunk, not per file)
+         const totalChunks = await this.getChunkCount();
+         if (totalChunks === 0) {
+             await db.delete(schema.idfScores);
+             return;
+         }
+         // Calculate document frequency for each term using SQL (counting chunks, not files)
+         const dfResults = await db
+             .select({
+                 term: schema.documentVectors.term,
+                 df: sql `COUNT(DISTINCT ${schema.documentVectors.chunkId})`,
+             })
+             .from(schema.documentVectors)
+             .groupBy(schema.documentVectors.term)
+             .all();
+         // Clear existing IDF scores
+         await db.delete(schema.idfScores);
+         // Insert in batches using smoothed IDF formula
+         // Smoothed IDF: log((N+1)/(df+1)) + 1 ensures no term gets IDF=0
+         const BATCH_SIZE = 300;
+         const scores = dfResults.map((row) => ({
+             term: row.term,
+             idf: Math.log((totalChunks + 1) / (row.df + 1)) + 1,
+             documentFrequency: row.df,
+         }));
+         for (let i = 0; i < scores.length; i += BATCH_SIZE) {
+             const batch = scores.slice(i, i + BATCH_SIZE);
+             if (batch.length > 0) {
+                 await db.insert(schema.idfScores).values(batch);
+             }
+         }
+     }
+     /**
+      * Recalculate TF-IDF scores for all documents using current IDF values (Memory optimization)
+      * Updates document_vectors.tfidf = document_vectors.tf * idf_scores.idf
+      */
+     async recalculateTfidfScores() {
+         await this.ensureInit();
+         // Use raw SQL for efficient batch update with JOIN
+         await this.client.execute(`
+             UPDATE document_vectors
+             SET tfidf = tf * COALESCE(
+                 (SELECT idf FROM idf_scores WHERE idf_scores.term = document_vectors.term),
+                 0
+             )
+         `);
+     }
+     /**
+      * Update pre-computed magnitude for all chunks (Memory optimization for search)
+      * magnitude = sqrt(sum(tfidf^2)) for each chunk
+      * Called after TF-IDF recalculation to keep magnitude in sync
+      */
+     async updateChunkMagnitudes() {
+         await this.ensureInit();
+         // Use raw SQL for efficient batch update with aggregate
+         await this.client.execute(`
+             UPDATE chunks
+             SET magnitude = COALESCE(
+                 (SELECT SQRT(SUM(tfidf * tfidf)) FROM document_vectors WHERE document_vectors.chunk_id = chunks.id),
+                 0
+             )
+         `);
+     }
+     /**
+      * Get terms for chunks of files (for tracking affected terms during incremental updates)
+      * When files are deleted, we need to know which terms were affected
+      */
+     async getTermsForFiles(paths) {
+         if (paths.length === 0) {
+             return new Set();
+         }
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const terms = new Set();
+         // Get file IDs
+         const files = await db
+             .select({ id: schema.files.id })
+             .from(schema.files)
+             .where(sql `${schema.files.path} IN (${sql.join(paths.map((p) => sql `${p}`), sql `, `)})`)
+             .all();
+         if (files.length === 0) {
+             return terms;
+         }
+         const fileIds = files.map((f) => f.id);
+         // Get chunk IDs for these files
+         const chunks = await db
+             .select({ id: schema.chunks.id })
+             .from(schema.chunks)
+             .where(sql `${schema.chunks.fileId} IN (${sql.join(fileIds.map((id) => sql `${id}`), sql `, `)})`)
+             .all();
+         if (chunks.length === 0) {
+             return terms;
+         }
+         const chunkIds = chunks.map((c) => c.id);
+         // Get terms for these chunks
+         const results = await db
+             .select({ term: schema.documentVectors.term })
+             .from(schema.documentVectors)
+             .where(sql `${schema.documentVectors.chunkId} IN (${sql.join(chunkIds.map((id) => sql `${id}`), sql `, `)})`)
+             .all();
+         for (const row of results) {
+             terms.add(row.term);
+         }
+         return terms;
+     }
+     /**
+      * Get all chunks with their file paths (for bulk operations)
+      */
+     async getAllChunks() {
+         await this.ensureInit();
+         const { db } = this.dbInstance;
+         const results = await db
+             .select({
+                 id: schema.chunks.id,
+                 fileId: schema.chunks.fileId,
+                 content: schema.chunks.content,
+                 type: schema.chunks.type,
+                 startLine: schema.chunks.startLine,
+                 endLine: schema.chunks.endLine,
+                 metadata: schema.chunks.metadata,
+                 filePath: schema.files.path,
+             })
+             .from(schema.chunks)
+             .innerJoin(schema.files, eq(schema.chunks.fileId, schema.files.id))
+             .all();
+         return results.map((r) => ({
+             id: r.id,
+             fileId: r.fileId,
+             filePath: r.filePath,
+             content: r.content,
+             type: r.type,
+             startLine: r.startLine,
+             endLine: r.endLine,
+             metadata: r.metadata ? JSON.parse(r.metadata) : undefined,
+         }));
+     }
+     /**
+      * Close database connection
+      */
+     close() {
+         this.dbInstance.client.close();
+     }
+ }
+ //# sourceMappingURL=storage-persistent.js.map
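
For orientation, below is a minimal, illustrative sketch of how the PersistentStorage methods added in this file fit together. It is not part of the published package content: the import path, the chunk "type" value, and the hand-written term statistics are assumptions made for the example, and in the package itself dist/indexer.js presumably drives these calls.

// Illustrative only: the import path and the literal field values below are
// assumptions; only the method names and argument shapes come from the diff above.
import { PersistentStorage } from '@matperez/coderag/dist/storage-persistent.js';

const storage = new PersistentStorage({ useBulkInsertChunks: true });

// 1. Store the file row first (storeChunks looks the file up by path).
await storage.storeFiles([
    { path: 'src/example.ts', content: 'export const answer = 42;', hash: 'sha-abc', size: 26, mtime: Date.now(), language: 'typescript' },
]);

// 2. Store chunks for that file; the generated chunk IDs are returned.
const [chunkId] = await storage.storeChunks('src/example.ts', [
    { content: 'export const answer = 42;', type: 'code', startLine: 1, endLine: 1 },
]);

// 3. Store per-chunk term statistics (tf/tfidf/rawFreq values are placeholders).
await storage.storeChunkVectors(chunkId, new Map([
    ['export', { tf: 0.25, tfidf: 0, rawFreq: 1 }],
    ['answer', { tf: 0.25, tfidf: 0, rawFreq: 1 }],
]), 4);

// 4. Keep the BM25/TF-IDF bookkeeping in sync after indexing.
await storage.rebuildIdfScoresFromVectors();
await storage.recalculateTfidfScores();
await storage.updateChunkMagnitudes();
await storage.updateAverageDocLength();

// 5. Query by pre-tokenized terms, then close the connection.
const hits = await storage.searchByTerms(['answer'], { limit: 10 });
console.log(hits.map((h) => `${h.filePath}:${h.startLine}`));
storage.close();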