@matperez/coderag 0.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +154 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/ast-chunking.d.ts +40 -0
- package/dist/ast-chunking.d.ts.map +1 -0
- package/dist/ast-chunking.js +88 -0
- package/dist/ast-chunking.js.map +1 -0
- package/dist/ast-chunking.test.d.ts +5 -0
- package/dist/ast-chunking.test.d.ts.map +1 -0
- package/dist/ast-chunking.test.js +173 -0
- package/dist/ast-chunking.test.js.map +1 -0
- package/dist/code-tokenizer.d.ts +62 -0
- package/dist/code-tokenizer.d.ts.map +1 -0
- package/dist/code-tokenizer.js +129 -0
- package/dist/code-tokenizer.js.map +1 -0
- package/dist/code-tokenizer.test.d.ts +5 -0
- package/dist/code-tokenizer.test.d.ts.map +1 -0
- package/dist/code-tokenizer.test.js +96 -0
- package/dist/code-tokenizer.test.js.map +1 -0
- package/dist/db/client-pg.d.ts +16 -0
- package/dist/db/client-pg.d.ts.map +1 -0
- package/dist/db/client-pg.js +38 -0
- package/dist/db/client-pg.js.map +1 -0
- package/dist/db/client.d.ts +36 -0
- package/dist/db/client.d.ts.map +1 -0
- package/dist/db/client.js +81 -0
- package/dist/db/client.js.map +1 -0
- package/dist/db/migrations-pg.d.ts +6 -0
- package/dist/db/migrations-pg.d.ts.map +1 -0
- package/dist/db/migrations-pg.js +88 -0
- package/dist/db/migrations-pg.js.map +1 -0
- package/dist/db/migrations.d.ts +9 -0
- package/dist/db/migrations.d.ts.map +1 -0
- package/dist/db/migrations.js +164 -0
- package/dist/db/migrations.js.map +1 -0
- package/dist/db/schema-pg.d.ts +611 -0
- package/dist/db/schema-pg.d.ts.map +1 -0
- package/dist/db/schema-pg.js +66 -0
- package/dist/db/schema-pg.js.map +1 -0
- package/dist/db/schema.d.ts +630 -0
- package/dist/db/schema.d.ts.map +1 -0
- package/dist/db/schema.js +85 -0
- package/dist/db/schema.js.map +1 -0
- package/dist/embeddings.d.ts +92 -0
- package/dist/embeddings.d.ts.map +1 -0
- package/dist/embeddings.js +275 -0
- package/dist/embeddings.js.map +1 -0
- package/dist/embeddings.test.d.ts +5 -0
- package/dist/embeddings.test.d.ts.map +1 -0
- package/dist/embeddings.test.js +255 -0
- package/dist/embeddings.test.js.map +1 -0
- package/dist/hybrid-search.d.ts +47 -0
- package/dist/hybrid-search.d.ts.map +1 -0
- package/dist/hybrid-search.js +215 -0
- package/dist/hybrid-search.js.map +1 -0
- package/dist/hybrid-search.test.d.ts +5 -0
- package/dist/hybrid-search.test.d.ts.map +1 -0
- package/dist/hybrid-search.test.js +252 -0
- package/dist/hybrid-search.test.js.map +1 -0
- package/dist/incremental-tfidf.d.ts +77 -0
- package/dist/incremental-tfidf.d.ts.map +1 -0
- package/dist/incremental-tfidf.js +248 -0
- package/dist/incremental-tfidf.js.map +1 -0
- package/dist/incremental-tfidf.test.d.ts +5 -0
- package/dist/incremental-tfidf.test.d.ts.map +1 -0
- package/dist/incremental-tfidf.test.js +276 -0
- package/dist/incremental-tfidf.test.js.map +1 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +19 -0
- package/dist/index.js.map +1 -0
- package/dist/indexer.d.ts +205 -0
- package/dist/indexer.d.ts.map +1 -0
- package/dist/indexer.js +1331 -0
- package/dist/indexer.js.map +1 -0
- package/dist/indexer.test.d.ts +12 -0
- package/dist/indexer.test.d.ts.map +1 -0
- package/dist/indexer.test.js +471 -0
- package/dist/indexer.test.js.map +1 -0
- package/dist/language-config.d.ts +54 -0
- package/dist/language-config.d.ts.map +1 -0
- package/dist/language-config.js +75 -0
- package/dist/language-config.js.map +1 -0
- package/dist/search-cache.d.ts +63 -0
- package/dist/search-cache.d.ts.map +1 -0
- package/dist/search-cache.js +118 -0
- package/dist/search-cache.js.map +1 -0
- package/dist/search-cache.test.d.ts +5 -0
- package/dist/search-cache.test.d.ts.map +1 -0
- package/dist/search-cache.test.js +194 -0
- package/dist/search-cache.test.js.map +1 -0
- package/dist/storage-factory.d.ts +11 -0
- package/dist/storage-factory.d.ts.map +1 -0
- package/dist/storage-factory.js +17 -0
- package/dist/storage-factory.js.map +1 -0
- package/dist/storage-persistent-pg.d.ts +75 -0
- package/dist/storage-persistent-pg.d.ts.map +1 -0
- package/dist/storage-persistent-pg.js +579 -0
- package/dist/storage-persistent-pg.js.map +1 -0
- package/dist/storage-persistent-pg.test.d.ts +7 -0
- package/dist/storage-persistent-pg.test.d.ts.map +1 -0
- package/dist/storage-persistent-pg.test.js +90 -0
- package/dist/storage-persistent-pg.test.js.map +1 -0
- package/dist/storage-persistent-types.d.ts +110 -0
- package/dist/storage-persistent-types.d.ts.map +1 -0
- package/dist/storage-persistent-types.js +5 -0
- package/dist/storage-persistent-types.js.map +1 -0
- package/dist/storage-persistent.d.ts +231 -0
- package/dist/storage-persistent.d.ts.map +1 -0
- package/dist/storage-persistent.js +897 -0
- package/dist/storage-persistent.js.map +1 -0
- package/dist/storage-persistent.test.d.ts +5 -0
- package/dist/storage-persistent.test.d.ts.map +1 -0
- package/dist/storage-persistent.test.js +325 -0
- package/dist/storage-persistent.test.js.map +1 -0
- package/dist/storage.d.ts +63 -0
- package/dist/storage.d.ts.map +1 -0
- package/dist/storage.js +67 -0
- package/dist/storage.js.map +1 -0
- package/dist/storage.test.d.ts +5 -0
- package/dist/storage.test.d.ts.map +1 -0
- package/dist/storage.test.js +157 -0
- package/dist/storage.test.js.map +1 -0
- package/dist/tfidf.d.ts +97 -0
- package/dist/tfidf.d.ts.map +1 -0
- package/dist/tfidf.js +308 -0
- package/dist/tfidf.js.map +1 -0
- package/dist/tfidf.test.d.ts +5 -0
- package/dist/tfidf.test.d.ts.map +1 -0
- package/dist/tfidf.test.js +181 -0
- package/dist/tfidf.test.js.map +1 -0
- package/dist/utils.d.ts +61 -0
- package/dist/utils.d.ts.map +1 -0
- package/dist/utils.js +264 -0
- package/dist/utils.js.map +1 -0
- package/dist/utils.test.d.ts +5 -0
- package/dist/utils.test.d.ts.map +1 -0
- package/dist/utils.test.js +94 -0
- package/dist/utils.test.js.map +1 -0
- package/dist/vector-storage.d.ts +120 -0
- package/dist/vector-storage.d.ts.map +1 -0
- package/dist/vector-storage.js +264 -0
- package/dist/vector-storage.js.map +1 -0
- package/dist/vector-storage.test.d.ts +5 -0
- package/dist/vector-storage.test.d.ts.map +1 -0
- package/dist/vector-storage.test.js +345 -0
- package/dist/vector-storage.test.js.map +1 -0
- package/package.json +85 -0
|
@@ -0,0 +1,897 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Persistent storage implementation using SQLite + Drizzle ORM (LibSQL WASM-compatible)
|
|
3
|
+
* Now supports chunk-level indexing for better search granularity
|
|
4
|
+
*/
|
|
5
|
+
import { eq, sql } from 'drizzle-orm';
|
|
6
|
+
import { createDb } from './db/client.js';
|
|
7
|
+
import { runMigrations } from './db/migrations.js';
|
|
8
|
+
import * as schema from './db/schema.js';
|
|
9
|
+
export class PersistentStorage {
  // { db, client } pair produced by createDb(); assigned asynchronously in initialize().
  dbInstance;
  // Promise that resolves once the DB handle exists and migrations have run.
  initPromise;
  // When true, storeManyChunks() uses multi-row INSERTs instead of one insert per chunk.
  useBulkInsertChunks;
  /**
   * @param {object} [config] - forwarded to createDb(); may carry useBulkInsertChunks.
   * NOTE: initialization is started here but deliberately not awaited — every
   * public method calls ensureInit() before touching the database. If no method
   * is ever called and initialize() rejects, the rejection is unhandled.
   */
  constructor(config = {}) {
    this.useBulkInsertChunks = config.useBulkInsertChunks ?? false;
    this.initPromise = this.initialize(config);
  }
|
|
17
|
+
async initialize(config) {
|
|
18
|
+
this.dbInstance = await createDb(config);
|
|
19
|
+
await runMigrations(this.dbInstance.client);
|
|
20
|
+
}
|
|
21
|
+
/**
 * Block until initialize() has completed (DB created + migrations applied).
 * Called at the top of every public method before any database access.
 */
async ensureInit() {
  await this.initPromise;
}
|
|
27
|
+
/**
|
|
28
|
+
* Get the LibSQL client for raw SQL operations
|
|
29
|
+
*/
|
|
30
|
+
get client() {
|
|
31
|
+
return this.dbInstance.client;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Store a file
|
|
35
|
+
*/
|
|
36
|
+
async storeFile(file) {
|
|
37
|
+
await this.ensureInit();
|
|
38
|
+
const { db } = this.dbInstance;
|
|
39
|
+
const mtime = typeof file.mtime === 'number' ? file.mtime : file.mtime.getTime();
|
|
40
|
+
const values = {
|
|
41
|
+
path: file.path,
|
|
42
|
+
content: file.content,
|
|
43
|
+
hash: file.hash,
|
|
44
|
+
size: file.size,
|
|
45
|
+
mtime,
|
|
46
|
+
...(file.language ? { language: file.language } : {}),
|
|
47
|
+
indexedAt: Date.now(),
|
|
48
|
+
};
|
|
49
|
+
await db
|
|
50
|
+
.insert(schema.files)
|
|
51
|
+
.values(values)
|
|
52
|
+
.onConflictDoUpdate({
|
|
53
|
+
target: schema.files.path,
|
|
54
|
+
set: {
|
|
55
|
+
content: values.content,
|
|
56
|
+
hash: values.hash,
|
|
57
|
+
size: values.size,
|
|
58
|
+
mtime: values.mtime,
|
|
59
|
+
...(values.language ? { language: values.language } : {}),
|
|
60
|
+
indexedAt: values.indexedAt,
|
|
61
|
+
},
|
|
62
|
+
});
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Store multiple files in a single transaction (batch operation)
|
|
66
|
+
* Much faster than storing one by one for large datasets
|
|
67
|
+
*/
|
|
68
|
+
async storeFiles(files) {
|
|
69
|
+
if (files.length === 0) {
|
|
70
|
+
return;
|
|
71
|
+
}
|
|
72
|
+
await this.ensureInit();
|
|
73
|
+
// LibSQL supports batch transactions
|
|
74
|
+
await this.client.batch(files.map((file) => {
|
|
75
|
+
const mtime = typeof file.mtime === 'number' ? file.mtime : file.mtime.getTime();
|
|
76
|
+
return {
|
|
77
|
+
sql: `INSERT INTO files (path, content, hash, size, mtime, language, indexed_at)
|
|
78
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
79
|
+
ON CONFLICT(path) DO UPDATE SET
|
|
80
|
+
content = excluded.content,
|
|
81
|
+
hash = excluded.hash,
|
|
82
|
+
size = excluded.size,
|
|
83
|
+
mtime = excluded.mtime,
|
|
84
|
+
language = excluded.language,
|
|
85
|
+
indexed_at = excluded.indexed_at`,
|
|
86
|
+
args: [
|
|
87
|
+
file.path,
|
|
88
|
+
file.content,
|
|
89
|
+
file.hash,
|
|
90
|
+
file.size,
|
|
91
|
+
mtime,
|
|
92
|
+
file.language || null,
|
|
93
|
+
Date.now(),
|
|
94
|
+
],
|
|
95
|
+
};
|
|
96
|
+
}), 'write');
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* Get a file by path
|
|
100
|
+
*/
|
|
101
|
+
async getFile(path) {
|
|
102
|
+
await this.ensureInit();
|
|
103
|
+
const { db } = this.dbInstance;
|
|
104
|
+
const result = await db.select().from(schema.files).where(eq(schema.files.path, path)).get();
|
|
105
|
+
if (!result) {
|
|
106
|
+
return null;
|
|
107
|
+
}
|
|
108
|
+
return {
|
|
109
|
+
path: result.path,
|
|
110
|
+
content: result.content,
|
|
111
|
+
hash: result.hash,
|
|
112
|
+
size: result.size,
|
|
113
|
+
mtime: result.mtime,
|
|
114
|
+
language: result.language || undefined,
|
|
115
|
+
};
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Get all files
|
|
119
|
+
*/
|
|
120
|
+
async getAllFiles() {
|
|
121
|
+
await this.ensureInit();
|
|
122
|
+
const { db } = this.dbInstance;
|
|
123
|
+
const results = await db.select().from(schema.files).all();
|
|
124
|
+
return results.map((file) => ({
|
|
125
|
+
path: file.path,
|
|
126
|
+
content: file.content,
|
|
127
|
+
hash: file.hash,
|
|
128
|
+
size: file.size,
|
|
129
|
+
mtime: file.mtime,
|
|
130
|
+
language: file.language || undefined,
|
|
131
|
+
}));
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* Get path -> hash for all files (for skip-unchanged during full index)
|
|
135
|
+
*/
|
|
136
|
+
async getFileHashes() {
|
|
137
|
+
await this.ensureInit();
|
|
138
|
+
const { db } = this.dbInstance;
|
|
139
|
+
const results = await db
|
|
140
|
+
.select({ path: schema.files.path, hash: schema.files.hash })
|
|
141
|
+
.from(schema.files)
|
|
142
|
+
.all();
|
|
143
|
+
const map = new Map();
|
|
144
|
+
for (const row of results)
|
|
145
|
+
map.set(row.path, row.hash);
|
|
146
|
+
return map;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Delete a file
|
|
150
|
+
*/
|
|
151
|
+
async deleteFile(path) {
|
|
152
|
+
await this.ensureInit();
|
|
153
|
+
const { db } = this.dbInstance;
|
|
154
|
+
await db.delete(schema.files).where(eq(schema.files.path, path));
|
|
155
|
+
}
|
|
156
|
+
/**
|
|
157
|
+
* Clear all files
|
|
158
|
+
*/
|
|
159
|
+
async clear() {
|
|
160
|
+
await this.ensureInit();
|
|
161
|
+
const { db } = this.dbInstance;
|
|
162
|
+
await db.delete(schema.chunks);
|
|
163
|
+
await db.delete(schema.files);
|
|
164
|
+
await db.delete(schema.documentVectors);
|
|
165
|
+
await db.delete(schema.idfScores);
|
|
166
|
+
await db.delete(schema.indexMetadata);
|
|
167
|
+
}
|
|
168
|
+
// ============ CHUNK METHODS ============
|
|
169
|
+
/**
|
|
170
|
+
* Store chunks for a file (replaces existing chunks)
|
|
171
|
+
*/
|
|
172
|
+
async storeChunks(filePath, chunks) {
|
|
173
|
+
await this.ensureInit();
|
|
174
|
+
const { db } = this.dbInstance;
|
|
175
|
+
// Get file ID
|
|
176
|
+
const file = await db.select().from(schema.files).where(eq(schema.files.path, filePath)).get();
|
|
177
|
+
if (!file) {
|
|
178
|
+
throw new Error(`File not found: ${filePath}`);
|
|
179
|
+
}
|
|
180
|
+
// Delete existing chunks for this file
|
|
181
|
+
await db.delete(schema.chunks).where(eq(schema.chunks.fileId, file.id));
|
|
182
|
+
// Insert new chunks
|
|
183
|
+
const chunkIds = [];
|
|
184
|
+
for (const chunk of chunks) {
|
|
185
|
+
const insertValues = {
|
|
186
|
+
fileId: file.id,
|
|
187
|
+
content: chunk.content,
|
|
188
|
+
type: chunk.type,
|
|
189
|
+
startLine: chunk.startLine,
|
|
190
|
+
endLine: chunk.endLine,
|
|
191
|
+
};
|
|
192
|
+
if (chunk.metadata) {
|
|
193
|
+
insertValues.metadata = JSON.stringify(chunk.metadata);
|
|
194
|
+
}
|
|
195
|
+
const result = await db
|
|
196
|
+
.insert(schema.chunks)
|
|
197
|
+
.values(insertValues)
|
|
198
|
+
.returning({ id: schema.chunks.id });
|
|
199
|
+
if (result[0]) {
|
|
200
|
+
chunkIds.push(result[0].id);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
return chunkIds;
|
|
204
|
+
}
|
|
205
|
+
/**
 * Store chunks for multiple files, replacing any existing chunks for those
 * files. Returns Map<filePath, insertedChunkIds[]>.
 *
 * Two insert strategies, selected by this.useBulkInsertChunks:
 *  - per-chunk (default): one INSERT ... RETURNING per chunk — slower, but each
 *    id is unambiguously tied to its chunk.
 *  - bulk: multi-row INSERTs; per-file ids are recovered by slicing the flat
 *    RETURNING list using per-file row counts, which assumes RETURNING yields
 *    rows in insert order — presumably true for SQLite/LibSQL; verify before
 *    enabling on other drivers.
 */
async storeManyChunks(fileChunks) {
  await this.ensureInit();
  const { db } = this.dbInstance;
  const result = new Map();
  if (fileChunks.length === 0) {
    return result;
  }
  // Resolve all file paths to row ids with one IN query.
  const filePaths = fileChunks.map((fc) => fc.filePath);
  const files = await db
    .select({ id: schema.files.id, path: schema.files.path })
    .from(schema.files)
    .where(sql`${schema.files.path} IN (${sql.join(filePaths.map((p) => sql`${p}`), sql`, `)})`)
    .all();
  const fileIdMap = new Map();
  for (const file of files) {
    fileIdMap.set(file.path, file.id);
  }
  // Drop existing chunks for every matched file before re-inserting.
  const fileIds = Array.from(fileIdMap.values());
  if (fileIds.length > 0) {
    await db.delete(schema.chunks).where(sql`${schema.chunks.fileId} IN (${sql.join(fileIds.map((id) => sql`${id}`), sql`, `)})`);
  }
  if (!this.useBulkInsertChunks) {
    // Per-chunk insert path.
    for (const fc of fileChunks) {
      const fileId = fileIdMap.get(fc.filePath);
      // Paths with no file row are silently skipped (no entry in the result map).
      if (!fileId)
        continue;
      const ids = [];
      for (const chunk of fc.chunks) {
        const insertResult = await db
          .insert(schema.chunks)
          .values({
            fileId,
            content: chunk.content,
            type: chunk.type,
            startLine: chunk.startLine,
            endLine: chunk.endLine,
            metadata: chunk.metadata ? JSON.stringify(chunk.metadata) : null,
          })
          .returning({ id: schema.chunks.id });
        const row = Array.isArray(insertResult) ? insertResult[0] : insertResult;
        if (row?.id != null)
          ids.push(row.id);
      }
      result.set(fc.filePath, ids);
    }
    return result;
  }
  // Bulk insert (SQLite ~999 bind limit, 6 fields/row → batch 150)
  const CHUNK_INSERT_BATCH_SIZE = 150;
  const flatRows = [];
  // countsPerFile[i] = number of rows contributed by fileChunks[i]; used below
  // to partition the flat RETURNING id list back into per-file slices.
  const countsPerFile = [];
  for (const fc of fileChunks) {
    const fileId = fileIdMap.get(fc.filePath);
    if (!fileId) {
      countsPerFile.push(0);
      continue;
    }
    countsPerFile.push(fc.chunks.length);
    for (const chunk of fc.chunks) {
      flatRows.push({
        fileId,
        content: chunk.content,
        type: chunk.type,
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        metadata: chunk.metadata ? JSON.stringify(chunk.metadata) : null,
      });
    }
  }
  const allIds = [];
  for (let i = 0; i < flatRows.length; i += CHUNK_INSERT_BATCH_SIZE) {
    const batch = flatRows.slice(i, i + CHUNK_INSERT_BATCH_SIZE);
    const insertResult = await db
      .insert(schema.chunks)
      .values(batch)
      .returning({ id: schema.chunks.id });
    const rows = Array.isArray(insertResult) ? insertResult : [insertResult];
    for (const row of rows)
      if (row?.id != null)
        allIds.push(row.id);
  }
  // Partition the flat id list back per file, in input order.
  let offset = 0;
  for (let i = 0; i < fileChunks.length; i++) {
    const count = countsPerFile[i];
    if (count === 0)
      continue;
    result.set(fileChunks[i].filePath, allIds.slice(offset, offset + count));
    offset += count;
  }
  return result;
}
|
|
301
|
+
/**
|
|
302
|
+
* Get chunks for a file
|
|
303
|
+
*/
|
|
304
|
+
async getChunksForFile(filePath) {
|
|
305
|
+
await this.ensureInit();
|
|
306
|
+
const { db } = this.dbInstance;
|
|
307
|
+
const results = await db
|
|
308
|
+
.select({
|
|
309
|
+
id: schema.chunks.id,
|
|
310
|
+
fileId: schema.chunks.fileId,
|
|
311
|
+
content: schema.chunks.content,
|
|
312
|
+
type: schema.chunks.type,
|
|
313
|
+
startLine: schema.chunks.startLine,
|
|
314
|
+
endLine: schema.chunks.endLine,
|
|
315
|
+
metadata: schema.chunks.metadata,
|
|
316
|
+
filePath: schema.files.path,
|
|
317
|
+
})
|
|
318
|
+
.from(schema.chunks)
|
|
319
|
+
.innerJoin(schema.files, eq(schema.chunks.fileId, schema.files.id))
|
|
320
|
+
.where(eq(schema.files.path, filePath))
|
|
321
|
+
.all();
|
|
322
|
+
return results.map((r) => ({
|
|
323
|
+
id: r.id,
|
|
324
|
+
fileId: r.fileId,
|
|
325
|
+
filePath: r.filePath,
|
|
326
|
+
content: r.content,
|
|
327
|
+
type: r.type,
|
|
328
|
+
startLine: r.startLine,
|
|
329
|
+
endLine: r.endLine,
|
|
330
|
+
metadata: r.metadata ? JSON.parse(r.metadata) : undefined,
|
|
331
|
+
}));
|
|
332
|
+
}
|
|
333
|
+
/**
|
|
334
|
+
* Get total chunk count
|
|
335
|
+
*/
|
|
336
|
+
async getChunkCount() {
|
|
337
|
+
await this.ensureInit();
|
|
338
|
+
const { db } = this.dbInstance;
|
|
339
|
+
const result = await db.select({ count: sql `count(*)` }).from(schema.chunks).get();
|
|
340
|
+
return result?.count || 0;
|
|
341
|
+
}
|
|
342
|
+
/**
|
|
343
|
+
* Get file count
|
|
344
|
+
*/
|
|
345
|
+
async count() {
|
|
346
|
+
await this.ensureInit();
|
|
347
|
+
const { db } = this.dbInstance;
|
|
348
|
+
const result = await db.select({ count: sql `count(*)` }).from(schema.files).get();
|
|
349
|
+
return result?.count || 0;
|
|
350
|
+
}
|
|
351
|
+
/**
|
|
352
|
+
* Check if file exists
|
|
353
|
+
*/
|
|
354
|
+
async exists(path) {
|
|
355
|
+
await this.ensureInit();
|
|
356
|
+
const { db } = this.dbInstance;
|
|
357
|
+
const result = await db.select().from(schema.files).where(eq(schema.files.path, path)).get();
|
|
358
|
+
return result !== undefined;
|
|
359
|
+
}
|
|
360
|
+
/**
|
|
361
|
+
* Store document vectors (TF-IDF) for a CHUNK
|
|
362
|
+
*/
|
|
363
|
+
async storeChunkVectors(chunkId, terms, tokenCount) {
|
|
364
|
+
await this.ensureInit();
|
|
365
|
+
const { db } = this.dbInstance;
|
|
366
|
+
// Delete existing vectors for this chunk
|
|
367
|
+
await db.delete(schema.documentVectors).where(eq(schema.documentVectors.chunkId, chunkId));
|
|
368
|
+
// Update token count if provided
|
|
369
|
+
if (tokenCount !== undefined) {
|
|
370
|
+
await this.client.execute({
|
|
371
|
+
sql: 'UPDATE chunks SET token_count = ? WHERE id = ?',
|
|
372
|
+
args: [tokenCount, chunkId],
|
|
373
|
+
});
|
|
374
|
+
}
|
|
375
|
+
// Insert new vectors in batches (SQLite has ~999 bind variable limit, 5 fields per row = 199 rows)
|
|
376
|
+
const BATCH_SIZE = 199;
|
|
377
|
+
const vectors = Array.from(terms.entries()).map(([term, scores]) => ({
|
|
378
|
+
chunkId,
|
|
379
|
+
term,
|
|
380
|
+
tf: scores.tf,
|
|
381
|
+
tfidf: scores.tfidf,
|
|
382
|
+
rawFreq: scores.rawFreq,
|
|
383
|
+
}));
|
|
384
|
+
for (let i = 0; i < vectors.length; i += BATCH_SIZE) {
|
|
385
|
+
const batch = vectors.slice(i, i + BATCH_SIZE);
|
|
386
|
+
await db.insert(schema.documentVectors).values(batch);
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
/**
|
|
390
|
+
* Store document vectors for multiple CHUNKS in a single transaction (batch operation)
|
|
391
|
+
* Much faster than storing one by one for large datasets
|
|
392
|
+
*/
|
|
393
|
+
async storeManyChunkVectors(chunkVectors) {
|
|
394
|
+
if (chunkVectors.length === 0) {
|
|
395
|
+
return;
|
|
396
|
+
}
|
|
397
|
+
await this.ensureInit();
|
|
398
|
+
const { db } = this.dbInstance;
|
|
399
|
+
// Delete all existing vectors for these chunks
|
|
400
|
+
const chunkIds = chunkVectors.map((cv) => cv.chunkId);
|
|
401
|
+
if (chunkIds.length > 0) {
|
|
402
|
+
// Delete in batches to avoid SQLite variable limits
|
|
403
|
+
const deleteBatchSize = 500;
|
|
404
|
+
for (let i = 0; i < chunkIds.length; i += deleteBatchSize) {
|
|
405
|
+
const batch = chunkIds.slice(i, i + deleteBatchSize);
|
|
406
|
+
await db.delete(schema.documentVectors).where(sql `${schema.documentVectors.chunkId} IN (${sql.join(batch.map((id) => sql `${id}`), sql `, `)})`);
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
// Prepare all vectors for batch insert
|
|
410
|
+
const allVectors = [];
|
|
411
|
+
// Track token counts for BM25
|
|
412
|
+
const tokenCountUpdates = [];
|
|
413
|
+
for (const cv of chunkVectors) {
|
|
414
|
+
// Track token count for BM25 document length normalization
|
|
415
|
+
if (cv.tokenCount !== undefined) {
|
|
416
|
+
tokenCountUpdates.push({ chunkId: cv.chunkId, tokenCount: cv.tokenCount });
|
|
417
|
+
}
|
|
418
|
+
for (const [term, scores] of cv.terms.entries()) {
|
|
419
|
+
allVectors.push({
|
|
420
|
+
chunkId: cv.chunkId,
|
|
421
|
+
term,
|
|
422
|
+
tf: scores.tf,
|
|
423
|
+
tfidf: scores.tfidf,
|
|
424
|
+
rawFreq: scores.rawFreq,
|
|
425
|
+
});
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
// Update token counts for BM25 using batch
|
|
429
|
+
if (tokenCountUpdates.length > 0) {
|
|
430
|
+
await this.client.batch(tokenCountUpdates.map(({ chunkId, tokenCount }) => ({
|
|
431
|
+
sql: 'UPDATE chunks SET token_count = ? WHERE id = ?',
|
|
432
|
+
args: [tokenCount, chunkId],
|
|
433
|
+
})), 'write');
|
|
434
|
+
}
|
|
435
|
+
// Insert in batches to avoid SQLite variable limits (5 fields per row = 199 rows max)
|
|
436
|
+
const batchSize = 199;
|
|
437
|
+
for (let i = 0; i < allVectors.length; i += batchSize) {
|
|
438
|
+
const batch = allVectors.slice(i, i + batchSize);
|
|
439
|
+
if (batch.length > 0) {
|
|
440
|
+
await db.insert(schema.documentVectors).values(batch);
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
/**
|
|
445
|
+
* Store IDF scores
|
|
446
|
+
*/
|
|
447
|
+
async storeIdfScores(idf, docFreq) {
|
|
448
|
+
await this.ensureInit();
|
|
449
|
+
const { db } = this.dbInstance;
|
|
450
|
+
// Clear existing IDF scores
|
|
451
|
+
await db.delete(schema.idfScores);
|
|
452
|
+
// Insert new scores in batches (SQLite has ~999 bind variable limit, 3 fields per row = 300 rows)
|
|
453
|
+
const BATCH_SIZE = 300;
|
|
454
|
+
const scores = Array.from(idf.entries()).map(([term, idfScore]) => ({
|
|
455
|
+
term,
|
|
456
|
+
idf: idfScore,
|
|
457
|
+
documentFrequency: docFreq.get(term) || 0,
|
|
458
|
+
}));
|
|
459
|
+
for (let i = 0; i < scores.length; i += BATCH_SIZE) {
|
|
460
|
+
const batch = scores.slice(i, i + BATCH_SIZE);
|
|
461
|
+
await db.insert(schema.idfScores).values(batch);
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
/**
|
|
465
|
+
* Get IDF scores
|
|
466
|
+
*/
|
|
467
|
+
async getIdfScores() {
|
|
468
|
+
await this.ensureInit();
|
|
469
|
+
const { db } = this.dbInstance;
|
|
470
|
+
const scores = await db.select().from(schema.idfScores).all();
|
|
471
|
+
const idf = new Map();
|
|
472
|
+
for (const score of scores) {
|
|
473
|
+
idf.set(score.term, score.idf);
|
|
474
|
+
}
|
|
475
|
+
return idf;
|
|
476
|
+
}
|
|
477
|
+
/**
|
|
478
|
+
* Get document vectors for a chunk
|
|
479
|
+
*/
|
|
480
|
+
async getChunkVectors(chunkId) {
|
|
481
|
+
await this.ensureInit();
|
|
482
|
+
const { db } = this.dbInstance;
|
|
483
|
+
const vectors = await db
|
|
484
|
+
.select()
|
|
485
|
+
.from(schema.documentVectors)
|
|
486
|
+
.where(eq(schema.documentVectors.chunkId, chunkId))
|
|
487
|
+
.all();
|
|
488
|
+
if (vectors.length === 0) {
|
|
489
|
+
return null;
|
|
490
|
+
}
|
|
491
|
+
const terms = new Map();
|
|
492
|
+
for (const vector of vectors) {
|
|
493
|
+
terms.set(vector.term, {
|
|
494
|
+
tf: vector.tf,
|
|
495
|
+
tfidf: vector.tfidf,
|
|
496
|
+
rawFreq: vector.rawFreq,
|
|
497
|
+
});
|
|
498
|
+
}
|
|
499
|
+
return terms;
|
|
500
|
+
}
|
|
501
|
+
/**
|
|
502
|
+
* Get all chunk vectors in a single batch query (CPU + Memory optimization)
|
|
503
|
+
* Avoids N+1 query pattern when loading index from storage
|
|
504
|
+
* Returns Map<chunkId, Map<term, {tf, tfidf, rawFreq}>>
|
|
505
|
+
*/
|
|
506
|
+
async getAllChunkVectors() {
|
|
507
|
+
await this.ensureInit();
|
|
508
|
+
const { db } = this.dbInstance;
|
|
509
|
+
// Single query to get all vectors
|
|
510
|
+
const results = await db
|
|
511
|
+
.select({
|
|
512
|
+
chunkId: schema.documentVectors.chunkId,
|
|
513
|
+
term: schema.documentVectors.term,
|
|
514
|
+
tf: schema.documentVectors.tf,
|
|
515
|
+
tfidf: schema.documentVectors.tfidf,
|
|
516
|
+
rawFreq: schema.documentVectors.rawFreq,
|
|
517
|
+
})
|
|
518
|
+
.from(schema.documentVectors)
|
|
519
|
+
.all();
|
|
520
|
+
// Group by chunk ID
|
|
521
|
+
const allVectors = new Map();
|
|
522
|
+
for (const row of results) {
|
|
523
|
+
let chunkVectors = allVectors.get(row.chunkId);
|
|
524
|
+
if (!chunkVectors) {
|
|
525
|
+
chunkVectors = new Map();
|
|
526
|
+
allVectors.set(row.chunkId, chunkVectors);
|
|
527
|
+
}
|
|
528
|
+
chunkVectors.set(row.term, {
|
|
529
|
+
tf: row.tf,
|
|
530
|
+
tfidf: row.tfidf,
|
|
531
|
+
rawFreq: row.rawFreq,
|
|
532
|
+
});
|
|
533
|
+
}
|
|
534
|
+
return allVectors;
|
|
535
|
+
}
|
|
536
|
+
/**
 * Search chunks by query terms in SQL, returning candidates with their content
 * and pre-computed magnitude/token count (stored on the chunks row) so the
 * caller can score without loading full vectors into memory.
 *
 * NOTE(review): the first query groups by chunkId while selecting bare,
 * non-aggregated columns — that relies on SQLite's permissive GROUP BY
 * (each bare column comes from the grouped row); not portable to stricter
 * engines.
 */
async searchByTerms(queryTerms, options = {}) {
  if (queryTerms.length === 0) {
    return [];
  }
  await this.ensureInit();
  const { db } = this.dbInstance;
  const { limit = 100 } = options;
  // Step 1: candidate chunks containing ANY query term, ranked by how many
  // distinct query terms they match; over-fetch (limit * 2) so the caller's
  // scoring pass has headroom.
  const matchingChunks = await db
    .select({
      chunkId: schema.documentVectors.chunkId,
      filePath: schema.files.path,
      content: schema.chunks.content,
      type: schema.chunks.type,
      startLine: schema.chunks.startLine,
      endLine: schema.chunks.endLine,
      magnitude: schema.chunks.magnitude,
      tokenCount: schema.chunks.tokenCount,
      matchCount: sql`COUNT(DISTINCT ${schema.documentVectors.term})`,
    })
    .from(schema.documentVectors)
    .innerJoin(schema.chunks, eq(schema.documentVectors.chunkId, schema.chunks.id))
    .innerJoin(schema.files, eq(schema.chunks.fileId, schema.files.id))
    .where(sql`${schema.documentVectors.term} IN (${sql.join(queryTerms.map((t) => sql`${t}`), sql`, `)})`)
    .groupBy(schema.documentVectors.chunkId)
    .orderBy(sql`COUNT(DISTINCT ${schema.documentVectors.term}) DESC`)
    .limit(limit * 2) // Get more candidates for scoring
    .all();
  if (matchingChunks.length === 0) {
    return [];
  }
  // Step 2: fetch only the (candidate chunk, query term) vector entries.
  const chunkIds = matchingChunks.map((c) => c.chunkId);
  const matchedVectors = await db
    .select({
      chunkId: schema.documentVectors.chunkId,
      term: schema.documentVectors.term,
      tfidf: schema.documentVectors.tfidf,
      rawFreq: schema.documentVectors.rawFreq,
    })
    .from(schema.documentVectors)
    .where(sql`${schema.documentVectors.chunkId} IN (${sql.join(chunkIds.map((id) => sql`${id}`), sql`, `)}) AND ${schema.documentVectors.term} IN (${sql.join(queryTerms.map((t) => sql`${t}`), sql`, `)})`)
    .all();
  // Assemble one result entry per candidate chunk; missing magnitude/token
  // count default to 0.
  const resultMap = new Map();
  for (const c of matchingChunks) {
    resultMap.set(c.chunkId, {
      chunkId: c.chunkId,
      filePath: c.filePath,
      content: c.content,
      type: c.type,
      startLine: c.startLine,
      endLine: c.endLine,
      matchedTerms: new Map(),
      magnitude: c.magnitude ?? 0,
      tokenCount: c.tokenCount ?? 0,
    });
  }
  // Attach the matched term vectors to their chunk entries.
  for (const v of matchedVectors) {
    const entry = resultMap.get(v.chunkId);
    if (entry) {
      entry.matchedTerms.set(v.term, { tfidf: v.tfidf, rawFreq: v.rawFreq });
    }
  }
  return Array.from(resultMap.values());
}
|
|
609
|
+
/**
|
|
610
|
+
* Get IDF scores for specific terms only (Memory optimization)
|
|
611
|
+
*/
|
|
612
|
+
async getIdfScoresForTerms(terms) {
|
|
613
|
+
if (terms.length === 0) {
|
|
614
|
+
return new Map();
|
|
615
|
+
}
|
|
616
|
+
await this.ensureInit();
|
|
617
|
+
const { db } = this.dbInstance;
|
|
618
|
+
const scores = await db
|
|
619
|
+
.select()
|
|
620
|
+
.from(schema.idfScores)
|
|
621
|
+
.where(sql `${schema.idfScores.term} IN (${sql.join(terms.map((t) => sql `${t}`), sql `, `)})`)
|
|
622
|
+
.all();
|
|
623
|
+
const idf = new Map();
|
|
624
|
+
for (const score of scores) {
|
|
625
|
+
idf.set(score.term, score.idf);
|
|
626
|
+
}
|
|
627
|
+
return idf;
|
|
628
|
+
}
|
|
629
|
+
/**
 * Total "document" count for IDF calculation. One chunk = one document:
 * TF-IDF/BM25 in this storage operates at chunk level, not file level.
 */
async getTotalDocuments() {
  return this.getChunkCount();
}
|
|
636
|
+
/**
|
|
637
|
+
* Get all file metadata (path, mtime, hash) without content
|
|
638
|
+
* Used for incremental diff detection
|
|
639
|
+
*/
|
|
640
|
+
async getAllFileMetadata() {
|
|
641
|
+
await this.ensureInit();
|
|
642
|
+
const { db } = this.dbInstance;
|
|
643
|
+
const results = await db
|
|
644
|
+
.select({
|
|
645
|
+
path: schema.files.path,
|
|
646
|
+
mtime: schema.files.mtime,
|
|
647
|
+
hash: schema.files.hash,
|
|
648
|
+
})
|
|
649
|
+
.from(schema.files)
|
|
650
|
+
.all();
|
|
651
|
+
const metadata = new Map();
|
|
652
|
+
for (const row of results) {
|
|
653
|
+
metadata.set(row.path, { mtime: row.mtime, hash: row.hash });
|
|
654
|
+
}
|
|
655
|
+
return metadata;
|
|
656
|
+
}
|
|
657
|
+
/**
|
|
658
|
+
* Delete multiple files in a single transaction (batch operation)
|
|
659
|
+
*/
|
|
660
|
+
async deleteFiles(paths) {
|
|
661
|
+
if (paths.length === 0) {
|
|
662
|
+
return;
|
|
663
|
+
}
|
|
664
|
+
await this.ensureInit();
|
|
665
|
+
const { db } = this.dbInstance;
|
|
666
|
+
// Delete in chunks to avoid SQLite variable limits
|
|
667
|
+
const chunkSize = 500;
|
|
668
|
+
for (let i = 0; i < paths.length; i += chunkSize) {
|
|
669
|
+
const chunk = paths.slice(i, i + chunkSize);
|
|
670
|
+
await db.delete(schema.files).where(sql `${schema.files.path} IN (${sql.join(chunk.map((p) => sql `${p}`), sql `, `)})`);
|
|
671
|
+
}
|
|
672
|
+
}
|
|
673
|
+
/**
|
|
674
|
+
* Store metadata
|
|
675
|
+
*/
|
|
676
|
+
async setMetadata(key, value) {
|
|
677
|
+
await this.ensureInit();
|
|
678
|
+
const { db } = this.dbInstance;
|
|
679
|
+
await db
|
|
680
|
+
.insert(schema.indexMetadata)
|
|
681
|
+
.values({
|
|
682
|
+
key,
|
|
683
|
+
value,
|
|
684
|
+
updatedAt: Date.now(),
|
|
685
|
+
})
|
|
686
|
+
.onConflictDoUpdate({
|
|
687
|
+
target: schema.indexMetadata.key,
|
|
688
|
+
set: {
|
|
689
|
+
value,
|
|
690
|
+
updatedAt: Date.now(),
|
|
691
|
+
},
|
|
692
|
+
});
|
|
693
|
+
}
|
|
694
|
+
/**
|
|
695
|
+
* Get metadata
|
|
696
|
+
*/
|
|
697
|
+
async getMetadata(key) {
|
|
698
|
+
await this.ensureInit();
|
|
699
|
+
const { db } = this.dbInstance;
|
|
700
|
+
const result = await db
|
|
701
|
+
.select()
|
|
702
|
+
.from(schema.indexMetadata)
|
|
703
|
+
.where(eq(schema.indexMetadata.key, key))
|
|
704
|
+
.get();
|
|
705
|
+
return result?.value || null;
|
|
706
|
+
}
|
|
707
|
+
/**
|
|
708
|
+
* Get average chunk length (token count) for BM25 scoring
|
|
709
|
+
* Returns cached value from metadata if available, otherwise calculates from chunks table
|
|
710
|
+
*/
|
|
711
|
+
async getAverageDocLength() {
|
|
712
|
+
await this.ensureInit();
|
|
713
|
+
// Try to get cached value first
|
|
714
|
+
const cached = await this.getMetadata('avgDocLength');
|
|
715
|
+
if (cached) {
|
|
716
|
+
return parseFloat(cached);
|
|
717
|
+
}
|
|
718
|
+
// Calculate from chunks table
|
|
719
|
+
const { db } = this.dbInstance;
|
|
720
|
+
const result = await db
|
|
721
|
+
.select({
|
|
722
|
+
avgLen: sql `AVG(COALESCE(${schema.chunks.tokenCount}, 0))`,
|
|
723
|
+
})
|
|
724
|
+
.from(schema.chunks)
|
|
725
|
+
.get();
|
|
726
|
+
const avgLen = result?.avgLen || 0;
|
|
727
|
+
// Cache the result
|
|
728
|
+
await this.setMetadata('avgDocLength', avgLen.toString());
|
|
729
|
+
return avgLen;
|
|
730
|
+
}
|
|
731
|
+
/**
|
|
732
|
+
* Update average chunk length in metadata (call after indexing)
|
|
733
|
+
*/
|
|
734
|
+
async updateAverageDocLength() {
|
|
735
|
+
await this.ensureInit();
|
|
736
|
+
const { db } = this.dbInstance;
|
|
737
|
+
const result = await db
|
|
738
|
+
.select({
|
|
739
|
+
avgLen: sql `AVG(COALESCE(${schema.chunks.tokenCount}, 0))`,
|
|
740
|
+
})
|
|
741
|
+
.from(schema.chunks)
|
|
742
|
+
.get();
|
|
743
|
+
const avgLen = result?.avgLen || 0;
|
|
744
|
+
await this.setMetadata('avgDocLength', avgLen.toString());
|
|
745
|
+
return avgLen;
|
|
746
|
+
}
|
|
747
|
+
/**
|
|
748
|
+
* Rebuild IDF scores from document vectors using SQL (Memory optimization)
|
|
749
|
+
* Calculates document frequency for each term across CHUNKS and computes IDF
|
|
750
|
+
*/
|
|
751
|
+
async rebuildIdfScoresFromVectors() {
|
|
752
|
+
await this.ensureInit();
|
|
753
|
+
const { db } = this.dbInstance;
|
|
754
|
+
// Get total chunk count (IDF is calculated per chunk, not per file)
|
|
755
|
+
const totalChunks = await this.getChunkCount();
|
|
756
|
+
if (totalChunks === 0) {
|
|
757
|
+
await db.delete(schema.idfScores);
|
|
758
|
+
return;
|
|
759
|
+
}
|
|
760
|
+
// Calculate document frequency for each term using SQL (counting chunks, not files)
|
|
761
|
+
const dfResults = await db
|
|
762
|
+
.select({
|
|
763
|
+
term: schema.documentVectors.term,
|
|
764
|
+
df: sql `COUNT(DISTINCT ${schema.documentVectors.chunkId})`,
|
|
765
|
+
})
|
|
766
|
+
.from(schema.documentVectors)
|
|
767
|
+
.groupBy(schema.documentVectors.term)
|
|
768
|
+
.all();
|
|
769
|
+
// Clear existing IDF scores
|
|
770
|
+
await db.delete(schema.idfScores);
|
|
771
|
+
// Insert in batches using smoothed IDF formula
|
|
772
|
+
// Smoothed IDF: log((N+1)/(df+1)) + 1 ensures no term gets IDF=0
|
|
773
|
+
const BATCH_SIZE = 300;
|
|
774
|
+
const scores = dfResults.map((row) => ({
|
|
775
|
+
term: row.term,
|
|
776
|
+
idf: Math.log((totalChunks + 1) / (row.df + 1)) + 1,
|
|
777
|
+
documentFrequency: row.df,
|
|
778
|
+
}));
|
|
779
|
+
for (let i = 0; i < scores.length; i += BATCH_SIZE) {
|
|
780
|
+
const batch = scores.slice(i, i + BATCH_SIZE);
|
|
781
|
+
if (batch.length > 0) {
|
|
782
|
+
await db.insert(schema.idfScores).values(batch);
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
}
|
|
786
|
+
/**
 * Recalculate TF-IDF scores for all documents using current IDF values (Memory optimization)
 * Updates document_vectors.tfidf = document_vectors.tf * idf_scores.idf
 * Call after rebuildIdfScoresFromVectors() so the IDF table is current.
 */
async recalculateTfidfScores() {
    await this.ensureInit();
    // Use raw SQL for efficient batch update with JOIN
    // COALESCE(..., 0): a term with no idf_scores row ends up with tfidf = 0
    await this.client.execute(`
      UPDATE document_vectors
      SET tfidf = tf * COALESCE(
        (SELECT idf FROM idf_scores WHERE idf_scores.term = document_vectors.term),
        0
      )
    `);
}
/**
 * Update pre-computed magnitude for all chunks (Memory optimization for search)
 * magnitude = sqrt(sum(tfidf^2)) for each chunk
 * Called after TF-IDF recalculation to keep magnitude in sync
 */
async updateChunkMagnitudes() {
    await this.ensureInit();
    // Use raw SQL for efficient batch update with aggregate
    // COALESCE(..., 0): chunks with no document_vectors rows get magnitude 0
    await this.client.execute(`
      UPDATE chunks
      SET magnitude = COALESCE(
        (SELECT SQRT(SUM(tfidf * tfidf)) FROM document_vectors WHERE document_vectors.chunk_id = chunks.id),
        0
      )
    `);
}
/**
|
|
818
|
+
* Get terms for chunks of files (for tracking affected terms during incremental updates)
|
|
819
|
+
* When files are deleted, we need to know which terms were affected
|
|
820
|
+
*/
|
|
821
|
+
async getTermsForFiles(paths) {
|
|
822
|
+
if (paths.length === 0) {
|
|
823
|
+
return new Set();
|
|
824
|
+
}
|
|
825
|
+
await this.ensureInit();
|
|
826
|
+
const { db } = this.dbInstance;
|
|
827
|
+
const terms = new Set();
|
|
828
|
+
// Get file IDs
|
|
829
|
+
const files = await db
|
|
830
|
+
.select({ id: schema.files.id })
|
|
831
|
+
.from(schema.files)
|
|
832
|
+
.where(sql `${schema.files.path} IN (${sql.join(paths.map((p) => sql `${p}`), sql `, `)})`)
|
|
833
|
+
.all();
|
|
834
|
+
if (files.length === 0) {
|
|
835
|
+
return terms;
|
|
836
|
+
}
|
|
837
|
+
const fileIds = files.map((f) => f.id);
|
|
838
|
+
// Get chunk IDs for these files
|
|
839
|
+
const chunks = await db
|
|
840
|
+
.select({ id: schema.chunks.id })
|
|
841
|
+
.from(schema.chunks)
|
|
842
|
+
.where(sql `${schema.chunks.fileId} IN (${sql.join(fileIds.map((id) => sql `${id}`), sql `, `)})`)
|
|
843
|
+
.all();
|
|
844
|
+
if (chunks.length === 0) {
|
|
845
|
+
return terms;
|
|
846
|
+
}
|
|
847
|
+
const chunkIds = chunks.map((c) => c.id);
|
|
848
|
+
// Get terms for these chunks
|
|
849
|
+
const results = await db
|
|
850
|
+
.select({ term: schema.documentVectors.term })
|
|
851
|
+
.from(schema.documentVectors)
|
|
852
|
+
.where(sql `${schema.documentVectors.chunkId} IN (${sql.join(chunkIds.map((id) => sql `${id}`), sql `, `)})`)
|
|
853
|
+
.all();
|
|
854
|
+
for (const row of results) {
|
|
855
|
+
terms.add(row.term);
|
|
856
|
+
}
|
|
857
|
+
return terms;
|
|
858
|
+
}
|
|
859
|
+
/**
|
|
860
|
+
* Get all chunks with their file paths (for bulk operations)
|
|
861
|
+
*/
|
|
862
|
+
async getAllChunks() {
|
|
863
|
+
await this.ensureInit();
|
|
864
|
+
const { db } = this.dbInstance;
|
|
865
|
+
const results = await db
|
|
866
|
+
.select({
|
|
867
|
+
id: schema.chunks.id,
|
|
868
|
+
fileId: schema.chunks.fileId,
|
|
869
|
+
content: schema.chunks.content,
|
|
870
|
+
type: schema.chunks.type,
|
|
871
|
+
startLine: schema.chunks.startLine,
|
|
872
|
+
endLine: schema.chunks.endLine,
|
|
873
|
+
metadata: schema.chunks.metadata,
|
|
874
|
+
filePath: schema.files.path,
|
|
875
|
+
})
|
|
876
|
+
.from(schema.chunks)
|
|
877
|
+
.innerJoin(schema.files, eq(schema.chunks.fileId, schema.files.id))
|
|
878
|
+
.all();
|
|
879
|
+
return results.map((r) => ({
|
|
880
|
+
id: r.id,
|
|
881
|
+
fileId: r.fileId,
|
|
882
|
+
filePath: r.filePath,
|
|
883
|
+
content: r.content,
|
|
884
|
+
type: r.type,
|
|
885
|
+
startLine: r.startLine,
|
|
886
|
+
endLine: r.endLine,
|
|
887
|
+
metadata: r.metadata ? JSON.parse(r.metadata) : undefined,
|
|
888
|
+
}));
|
|
889
|
+
}
|
|
890
|
+
/**
|
|
891
|
+
* Close database connection
|
|
892
|
+
*/
|
|
893
|
+
close() {
|
|
894
|
+
this.dbInstance.client.close();
|
|
895
|
+
}
|
|
896
|
+
}
|
|
897
|
+
//# sourceMappingURL=storage-persistent.js.map
|