@ruso-0/nreki 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +648 -0
- package/LICENSE +21 -0
- package/README.md +425 -0
- package/dist/ast-navigator.d.ts +29 -0
- package/dist/ast-navigator.d.ts.map +1 -0
- package/dist/ast-navigator.js +279 -0
- package/dist/ast-navigator.js.map +1 -0
- package/dist/ast-sandbox.d.ts +74 -0
- package/dist/ast-sandbox.d.ts.map +1 -0
- package/dist/ast-sandbox.js +242 -0
- package/dist/ast-sandbox.js.map +1 -0
- package/dist/chronos-memory.d.ts +69 -0
- package/dist/chronos-memory.d.ts.map +1 -0
- package/dist/chronos-memory.js +247 -0
- package/dist/chronos-memory.js.map +1 -0
- package/dist/circuit-breaker.d.ts +107 -0
- package/dist/circuit-breaker.d.ts.map +1 -0
- package/dist/circuit-breaker.js +330 -0
- package/dist/circuit-breaker.js.map +1 -0
- package/dist/compressor-advanced.d.ts +80 -0
- package/dist/compressor-advanced.d.ts.map +1 -0
- package/dist/compressor-advanced.js +555 -0
- package/dist/compressor-advanced.js.map +1 -0
- package/dist/compressor.d.ts +81 -0
- package/dist/compressor.d.ts.map +1 -0
- package/dist/compressor.js +227 -0
- package/dist/compressor.js.map +1 -0
- package/dist/database.d.ts +169 -0
- package/dist/database.d.ts.map +1 -0
- package/dist/database.js +1029 -0
- package/dist/database.js.map +1 -0
- package/dist/embedder.d.ts +73 -0
- package/dist/embedder.d.ts.map +1 -0
- package/dist/embedder.js +165 -0
- package/dist/embedder.js.map +1 -0
- package/dist/engine.d.ts +224 -0
- package/dist/engine.d.ts.map +1 -0
- package/dist/engine.js +582 -0
- package/dist/engine.js.map +1 -0
- package/dist/hologram/harvester.d.ts +41 -0
- package/dist/hologram/harvester.d.ts.map +1 -0
- package/dist/hologram/harvester.js +129 -0
- package/dist/hologram/harvester.js.map +1 -0
- package/dist/hologram/shadow-cache.d.ts +49 -0
- package/dist/hologram/shadow-cache.d.ts.map +1 -0
- package/dist/hologram/shadow-cache.js +165 -0
- package/dist/hologram/shadow-cache.js.map +1 -0
- package/dist/hologram/shadow-generator.d.ts +32 -0
- package/dist/hologram/shadow-generator.d.ts.map +1 -0
- package/dist/hologram/shadow-generator.js +828 -0
- package/dist/hologram/shadow-generator.js.map +1 -0
- package/dist/hooks/preToolUse.d.ts +63 -0
- package/dist/hooks/preToolUse.d.ts.map +1 -0
- package/dist/hooks/preToolUse.js +103 -0
- package/dist/hooks/preToolUse.js.map +1 -0
- package/dist/index.d.ts +19 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +367 -0
- package/dist/index.js.map +1 -0
- package/dist/kernel/kernel-manager.d.ts +52 -0
- package/dist/kernel/kernel-manager.d.ts.map +1 -0
- package/dist/kernel/kernel-manager.js +197 -0
- package/dist/kernel/kernel-manager.js.map +1 -0
- package/dist/kernel/kernel-worker.d.ts +9 -0
- package/dist/kernel/kernel-worker.d.ts.map +1 -0
- package/dist/kernel/kernel-worker.js +76 -0
- package/dist/kernel/kernel-worker.js.map +1 -0
- package/dist/kernel/nreki-kernel.d.ts +244 -0
- package/dist/kernel/nreki-kernel.d.ts.map +1 -0
- package/dist/kernel/nreki-kernel.js +1656 -0
- package/dist/kernel/nreki-kernel.js.map +1 -0
- package/dist/middleware/circuit-breaker.d.ts +32 -0
- package/dist/middleware/circuit-breaker.d.ts.map +1 -0
- package/dist/middleware/circuit-breaker.js +160 -0
- package/dist/middleware/circuit-breaker.js.map +1 -0
- package/dist/middleware/file-lock.d.ts +33 -0
- package/dist/middleware/file-lock.d.ts.map +1 -0
- package/dist/middleware/file-lock.js +55 -0
- package/dist/middleware/file-lock.js.map +1 -0
- package/dist/middleware/validator.d.ts +26 -0
- package/dist/middleware/validator.d.ts.map +1 -0
- package/dist/middleware/validator.js +39 -0
- package/dist/middleware/validator.js.map +1 -0
- package/dist/monitor.d.ts +94 -0
- package/dist/monitor.d.ts.map +1 -0
- package/dist/monitor.js +221 -0
- package/dist/monitor.js.map +1 -0
- package/dist/parser-pool.d.ts +28 -0
- package/dist/parser-pool.d.ts.map +1 -0
- package/dist/parser-pool.js +81 -0
- package/dist/parser-pool.js.map +1 -0
- package/dist/parser.d.ts +91 -0
- package/dist/parser.d.ts.map +1 -0
- package/dist/parser.js +311 -0
- package/dist/parser.js.map +1 -0
- package/dist/pin-memory.d.ts +35 -0
- package/dist/pin-memory.d.ts.map +1 -0
- package/dist/pin-memory.js +161 -0
- package/dist/pin-memory.js.map +1 -0
- package/dist/repo-map.d.ts +81 -0
- package/dist/repo-map.d.ts.map +1 -0
- package/dist/repo-map.js +550 -0
- package/dist/repo-map.js.map +1 -0
- package/dist/router.d.ts +102 -0
- package/dist/router.d.ts.map +1 -0
- package/dist/router.js +1989 -0
- package/dist/router.js.map +1 -0
- package/dist/semantic-edit.d.ts +82 -0
- package/dist/semantic-edit.d.ts.map +1 -0
- package/dist/semantic-edit.js +529 -0
- package/dist/semantic-edit.js.map +1 -0
- package/dist/terminal-filter.d.ts +27 -0
- package/dist/terminal-filter.d.ts.map +1 -0
- package/dist/terminal-filter.js +257 -0
- package/dist/terminal-filter.js.map +1 -0
- package/dist/undo.d.ts +21 -0
- package/dist/undo.d.ts.map +1 -0
- package/dist/undo.js +55 -0
- package/dist/undo.js.map +1 -0
- package/dist/utils/code-tokenizer.d.ts +25 -0
- package/dist/utils/code-tokenizer.d.ts.map +1 -0
- package/dist/utils/code-tokenizer.js +52 -0
- package/dist/utils/code-tokenizer.js.map +1 -0
- package/dist/utils/file-filter.d.ts +23 -0
- package/dist/utils/file-filter.d.ts.map +1 -0
- package/dist/utils/file-filter.js +48 -0
- package/dist/utils/file-filter.js.map +1 -0
- package/dist/utils/imports.d.ts +32 -0
- package/dist/utils/imports.d.ts.map +1 -0
- package/dist/utils/imports.js +155 -0
- package/dist/utils/imports.js.map +1 -0
- package/dist/utils/path-jail.d.ts +27 -0
- package/dist/utils/path-jail.d.ts.map +1 -0
- package/dist/utils/path-jail.js +95 -0
- package/dist/utils/path-jail.js.map +1 -0
- package/dist/utils/read-source.d.ts +18 -0
- package/dist/utils/read-source.d.ts.map +1 -0
- package/dist/utils/read-source.js +22 -0
- package/dist/utils/read-source.js.map +1 -0
- package/dist/utils/safe-parse.d.ts +20 -0
- package/dist/utils/safe-parse.d.ts.map +1 -0
- package/dist/utils/safe-parse.js +25 -0
- package/dist/utils/safe-parse.js.map +1 -0
- package/package.json +75 -0
- package/scripts/download-wasm.js +46 -0
- package/wasm/.gitkeep +0 -0
- package/wasm/tree-sitter-go.wasm +0 -0
- package/wasm/tree-sitter-javascript.wasm +0 -0
- package/wasm/tree-sitter-python.wasm +0 -0
- package/wasm/tree-sitter-typescript.wasm +0 -0
package/dist/database.js
ADDED
|
@@ -0,0 +1,1029 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* database.ts - SQLite persistence layer for NREKI.
|
|
3
|
+
*
|
|
4
|
+
* Uses sql.js (SQLite compiled to WASM) for zero-native-dependency
|
|
5
|
+
* operation. Vector search AND keyword search are both implemented
|
|
6
|
+
* in pure JavaScript:
|
|
7
|
+
*
|
|
8
|
+
* - VectorIndex: brute-force cosine similarity on Float32Array
|
|
9
|
+
* - KeywordIndex: inverted index with Porter-inspired BM25 scoring
|
|
10
|
+
*
|
|
11
|
+
* This eliminates the need for FTS5, sqlite-vec, better-sqlite3,
|
|
12
|
+
* node-gyp, and Visual Studio Build Tools - making NREKI
|
|
13
|
+
* portable to any platform without native compilation.
|
|
14
|
+
*/
|
|
15
|
+
import initSqlJs from "sql.js";
|
|
16
|
+
import crypto from "crypto";
|
|
17
|
+
import fs from "fs";
|
|
18
|
+
import path from "path";
|
|
19
|
+
import { codeTokenize } from "./utils/code-tokenizer.js";
|
|
20
|
+
// ─── In-Memory Vector Store ──────────────────────────────────────────
|
|
21
|
+
/**
|
|
22
|
+
* Fast dot-product similarity for L2-normalized vectors.
|
|
23
|
+
* Jina embeddings output L2-normalized vectors (magnitude = 1),
|
|
24
|
+
* so cosine_similarity = dot_product (no sqrt/division needed).
|
|
25
|
+
* This is ~3x faster than full cosine similarity.
|
|
26
|
+
*/
|
|
27
|
+
function fastSimilarity(a, b) {
|
|
28
|
+
let dot = 0;
|
|
29
|
+
for (let i = 0; i < a.length; i++) {
|
|
30
|
+
dot += a[i] * b[i];
|
|
31
|
+
}
|
|
32
|
+
return dot;
|
|
33
|
+
}
|
|
34
|
+
// Fallback cosine similarity for non-normalized models:
|
|
35
|
+
// function cosineSimilarity(a: Float32Array, aNorm: number, b: Float32Array, bNorm: number): number {
|
|
36
|
+
// let dot = 0;
|
|
37
|
+
// for (let i = 0; i < a.length; i++) dot += a[i] * b[i];
|
|
38
|
+
// return (aNorm > 0 && bNorm > 0) ? dot / (aNorm * bNorm) : 0;
|
|
39
|
+
// }
|
|
40
|
+
/**
 * Pure JavaScript vector index using brute-force dot-product similarity.
 * For L2-normalized embeddings (Jina), dot product = cosine similarity.
 * For codebases up to ~50K chunks, brute-force is fast enough (<10ms)
 * and avoids any native dependency.
 */
class VectorIndex {
    vectors = new Map();

    /** Register (or overwrite) the embedding stored under `rowid`. */
    insert(rowid, embedding) {
        this.vectors.set(rowid, embedding);
    }

    /** Drop one embedding; unknown ids are a no-op. */
    delete(rowid) {
        this.vectors.delete(rowid);
    }

    /** Drop several embeddings at once. */
    deleteBulk(rowids) {
        rowids.forEach((id) => this.vectors.delete(id));
    }

    /**
     * Rank every stored vector against `query`, returning up to `limit`
     * entries as { rowid, distance } with distance = 1 - dot product
     * (smaller = more similar), sorted ascending.
     */
    search(query, limit) {
        const ranked = Array.from(this.vectors, ([rowid, vec]) => ({
            rowid,
            distance: 1 - fastSimilarity(query, vec),
        }));
        ranked.sort((x, y) => x.distance - y.distance);
        return ranked.slice(0, limit);
    }

    /** Number of stored vectors. */
    get size() {
        return this.vectors.size;
    }

    /**
     * Serialize the vector index to a binary buffer.
     * Format: [count:u32] + ([rowid:u32][vec:f32×dim])×count
     * Note: rowid uses UInt32 - max 4,294,967,295. Sufficient for practical
     * codebases (would require billions of INSERT/DELETE cycles to overflow).
     */
    serialize() {
        const head = Buffer.alloc(4);
        head.writeUInt32LE(this.vectors.size);
        const parts = [head];
        for (const [rowid, vec] of this.vectors) {
            const idBuf = Buffer.alloc(4);
            idBuf.writeUInt32LE(rowid);
            parts.push(idBuf, Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength));
        }
        return Buffer.concat(parts);
    }

    /**
     * Rebuild an index from `serialize()` output. A truncated buffer is
     * tolerated: reading stops at the first incomplete record.
     */
    static deserialize(buf, dim) {
        const index = new VectorIndex();
        if (buf.length < 4)
            return index;
        const total = buf.readUInt32LE(0);
        const vecBytes = dim * 4;
        let pos = 4;
        for (let i = 0; i < total && pos + 4 + vecBytes <= buf.length; i += 1) {
            const rowid = buf.readUInt32LE(pos);
            pos += 4;
            // Copy (not view) the slice so the Float32Array owns aligned memory
            const raw = buf.buffer.slice(buf.byteOffset + pos, buf.byteOffset + pos + vecBytes);
            index.vectors.set(rowid, new Float32Array(raw));
            pos += vecBytes;
        }
        return index;
    }
}
|
|
109
|
+
// ─── Porter Stemmer ─────────────────────────────────────────────────
|
|
110
|
+
/**
 * Full Porter stemming algorithm in pure TypeScript.
 * Based on the original 1980 paper by Martin Porter.
 *
 * Words are modeled as alternating consonant/vowel runs; the "measure" m
 * counts VC pairs and gates most suffix removals. All methods are static
 * and side-effect free.
 */
class PorterStemmer {
    /**
     * True when word[i] acts as a consonant under Porter's rules:
     * a/e/i/o/u are vowels; "y" is a consonant only at position 0 or
     * when it follows a vowel. Out-of-range indices return false.
     */
    static isConsonant(word, i) {
        if (i < 0 || i >= word.length)
            return false;
        const c = word[i];
        if (/[aeiou]/.test(c))
            return false;
        if (c === "y")
            return i === 0 || !PorterStemmer.isConsonant(word, i - 1);
        return true;
    }
    /** Measure: count VC sequences in the stem. */
    static measure(stem) {
        let m = 0;
        let i = 0;
        const len = stem.length;
        // Skip leading consonants
        while (i < len && PorterStemmer.isConsonant(stem, i))
            i++;
        while (i < len) {
            // Count vowel sequence
            while (i < len && !PorterStemmer.isConsonant(stem, i))
                i++;
            if (i >= len)
                break;
            // Count consonant sequence
            while (i < len && PorterStemmer.isConsonant(stem, i))
                i++;
            m++;
        }
        return m;
    }
    /** True when the stem contains at least one vowel (Porter's *v* condition). */
    static containsVowel(stem) {
        for (let i = 0; i < stem.length; i++) {
            if (!PorterStemmer.isConsonant(stem, i))
                return true;
        }
        return false;
    }
    /** True when the word ends in a doubled consonant, e.g. "hopp" (*d). */
    static endsWithDouble(word) {
        if (word.length < 2)
            return false;
        return word[word.length - 1] === word[word.length - 2] &&
            PorterStemmer.isConsonant(word, word.length - 1);
    }
    /** Ends with consonant-vowel-consonant where last C is not w, x, or y. */
    static cvc(word) {
        const len = word.length;
        if (len < 3)
            return false;
        const last = word[len - 1];
        if (!PorterStemmer.isConsonant(word, len - 1))
            return false;
        if (PorterStemmer.isConsonant(word, len - 2))
            return false;
        if (!PorterStemmer.isConsonant(word, len - 3))
            return false;
        return last !== "w" && last !== "x" && last !== "y";
    }
    /**
     * Stem an English word through Porter's five steps.
     * Null/undefined and words of length <= 2 are returned unchanged
     * (empty string for null/undefined). Output is lowercase.
     */
    static stem(word) {
        if (!word || word.length <= 2)
            return word || "";
        let w = word.toLowerCase();
        // Step 1a: Plurals (sses -> ss, ies -> i, trailing s dropped unless ss)
        if (w.endsWith("sses"))
            w = w.slice(0, -2);
        else if (w.endsWith("ies"))
            w = w.slice(0, -2);
        else if (!w.endsWith("ss") && w.endsWith("s"))
            w = w.slice(0, -1);
        // Step 1b: Past participles / gerunds
        let step1bFlag = false;
        if (w.endsWith("eed")) {
            const stem = w.slice(0, -3);
            if (PorterStemmer.measure(stem) > 0)
                w = w.slice(0, -1); // eed -> ee
        }
        else if (w.endsWith("ed")) {
            const stem = w.slice(0, -2);
            // Only strip "ed" when the remainder still contains a vowel
            if (PorterStemmer.containsVowel(stem)) {
                w = stem;
                step1bFlag = true;
            }
        }
        else if (w.endsWith("ing")) {
            const stem = w.slice(0, -3);
            if (PorterStemmer.containsVowel(stem)) {
                w = stem;
                step1bFlag = true;
            }
        }
        // Step 1b cleanup: restore "e" (at/bl/iz or m=1 cvc) or undouble
        // a trailing consonant unless it is l, s, or z.
        if (step1bFlag) {
            if (w.endsWith("at") || w.endsWith("bl") || w.endsWith("iz")) {
                w += "e";
            }
            else if (PorterStemmer.endsWithDouble(w) &&
                !/[lsz]$/.test(w)) {
                w = w.slice(0, -1);
            }
            else if (PorterStemmer.measure(w) === 1 && PorterStemmer.cvc(w)) {
                w += "e";
            }
        }
        // Step 1c: y -> i when preceded by a vowel-containing stem
        if (w.endsWith("y") && PorterStemmer.containsVowel(w.slice(0, -1))) {
            w = w.slice(0, -1) + "i";
        }
        // Step 2: Double suffixes (applied when stem measure > 0);
        // first matching suffix wins, then the scan stops.
        const step2 = [
            ["ational", "ate"], ["tional", "tion"], ["enci", "ence"],
            ["anci", "ance"], ["izer", "ize"], ["abli", "able"],
            ["alli", "al"], ["entli", "ent"], ["eli", "e"],
            ["ousli", "ous"], ["ization", "ize"], ["ation", "ate"],
            ["ator", "ate"], ["alism", "al"], ["iveness", "ive"],
            ["fulness", "ful"], ["ousness", "ous"], ["aliti", "al"],
            ["iviti", "ive"], ["biliti", "ble"],
        ];
        for (const [suffix, replacement] of step2) {
            if (w.endsWith(suffix)) {
                const stem = w.slice(0, -suffix.length);
                if (PorterStemmer.measure(stem) > 0)
                    w = stem + replacement;
                break;
            }
        }
        // Step 3: further suffix simplification (measure > 0)
        const step3 = [
            ["icate", "ic"], ["ative", ""], ["alize", "al"],
            ["iciti", "ic"], ["ical", "ic"], ["ful", ""], ["ness", ""],
        ];
        for (const [suffix, replacement] of step3) {
            if (w.endsWith(suffix)) {
                const stem = w.slice(0, -suffix.length);
                if (PorterStemmer.measure(stem) > 0)
                    w = stem + replacement;
                break;
            }
        }
        // Step 4: Remove suffixes outright when stem measure > 1.
        // "ion" additionally requires the stem to end in s or t.
        const step4 = [
            "al", "ance", "ence", "er", "ic", "able", "ible", "ant",
            "ement", "ment", "ent", "ion", "ou", "ism", "ate", "iti",
            "ous", "ive", "ize",
        ];
        for (const suffix of step4) {
            if (w.endsWith(suffix)) {
                const stem = w.slice(0, -suffix.length);
                if (PorterStemmer.measure(stem) > 1) {
                    if (suffix === "ion") {
                        if (stem.endsWith("s") || stem.endsWith("t"))
                            w = stem;
                    }
                    else {
                        w = stem;
                    }
                }
                break;
            }
        }
        // Step 5a: Remove trailing e when m > 1, or m = 1 without cvc ending
        if (w.endsWith("e")) {
            const stem = w.slice(0, -1);
            const m = PorterStemmer.measure(stem);
            if (m > 1 || (m === 1 && !PorterStemmer.cvc(stem))) {
                w = stem;
            }
        }
        // Step 5b: Remove double l when measure > 1 (e.g. "controll" -> "control")
        if (w.endsWith("ll") && PorterStemmer.measure(w) > 1) {
            w = w.slice(0, -1);
        }
        return w;
    }
}
|
|
288
|
+
// ─── In-Memory Keyword Index ─────────────────────────────────────────
|
|
289
|
+
/**
 * Pure JavaScript inverted index for BM25-style keyword search.
 * Replaces FTS5 entirely - no native extensions needed.
 *
 * Tokenization: lowercases, splits on non-alphanumeric chars,
 * filters stopwords, applies code-aware identifier splitting with
 * Porter stemming as a fallback.
 *
 * avgDocLen is maintained from a running total of term counts so that
 * insert/delete are O(terms) instead of O(total documents) — the
 * previous per-insert rescan of docTerms made bulk indexing O(N²).
 */
class KeywordIndex {
    /** Map from term → Map<rowid, TF> - unified inverted index + term frequency */
    invertedIndex = new Map();
    /** Map from bigram → Set of document rowids (for phrase search) */
    bigramIndex = new Map();
    /** Map from rowid → tokenized terms (for delete and avgDocLen) */
    docTerms = new Map();
    /** Total number of documents */
    docCount = 0;
    /** Average document length in terms */
    avgDocLen = 0;
    /** Running sum of document lengths; keeps avgDocLen updates O(1). */
    totalTermLen = 0;
    static STOPWORDS = new Set([
        "a", "an", "the", "is", "are", "was", "were", "be", "been",
        "being", "have", "has", "had", "do", "does", "did", "will",
        "would", "could", "should", "may", "might", "shall", "can",
        "to", "of", "in", "for", "on", "with", "at", "by", "from",
        "as", "into", "through", "during", "before", "after", "above",
        "below", "and", "but", "or", "not", "no", "if", "then",
        "else", "this", "that", "it", "its", "new", "old",
    ]);
    /**
     * Tokenize text into normalized terms with code-aware splitting.
     * Identifiers are split by codeTokenize (camelCase/snake_case etc.);
     * tokens it cannot split fall back to lowercase + Porter stemming.
     */
    tokenize(text) {
        const rawTokens = text
            .replace(/[^a-zA-Z0-9_.]/g, " ")
            .split(/\s+/)
            .filter((t) => t.length > 1);
        const allTerms = [];
        for (const raw of rawTokens) {
            // Code-aware tokenization: split identifiers
            const subTokens = codeTokenize(raw);
            if (subTokens.length > 0) {
                for (const sub of subTokens) {
                    if (sub.length > 1 && !KeywordIndex.STOPWORDS.has(sub)) {
                        allTerms.push(sub);
                    }
                }
            }
            else {
                const lower = raw.toLowerCase();
                if (!KeywordIndex.STOPWORDS.has(lower)) {
                    allTerms.push(this.stem(lower));
                }
            }
        }
        return allTerms;
    }
    /**
     * Porter stemmer - full implementation of the Porter stemming algorithm.
     * 5 steps with consonant-vowel pattern analysis for accurate English stemming.
     */
    stem(word) {
        if (word.length <= 2)
            return word;
        return PorterStemmer.stem(word);
    }
    /**
     * Add a document to the index.
     * Assumes `rowid` is not already present — re-inserting the same id
     * without deleting first will skew docCount/avgDocLen (same as before).
     */
    insert(rowid, text) {
        const terms = this.tokenize(text);
        this.docTerms.set(rowid, terms);
        // Compute local TF
        const tfMap = new Map();
        for (const term of terms) {
            tfMap.set(term, (tfMap.get(term) || 0) + 1);
        }
        // Store TF directly in inverted index for O(1) lookup
        for (const [term, tf] of tfMap) {
            let docMap = this.invertedIndex.get(term);
            if (!docMap) {
                docMap = new Map();
                this.invertedIndex.set(term, docMap);
            }
            docMap.set(rowid, tf);
        }
        // Generate bigrams for phrase search
        for (let i = 0; i < terms.length - 1; i++) {
            const bigram = terms[i] + "_" + terms[i + 1];
            if (!this.bigramIndex.has(bigram)) {
                this.bigramIndex.set(bigram, new Set());
            }
            this.bigramIndex.get(bigram).add(rowid);
        }
        this.docCount++;
        this.totalTermLen += terms.length;
        this.updateAvgDocLen();
    }
    /** Remove a document from the index. No-op for unknown rowids. */
    delete(rowid) {
        const terms = this.docTerms.get(rowid);
        if (!terms)
            return;
        for (const term of terms) {
            const docMap = this.invertedIndex.get(term);
            if (docMap) {
                docMap.delete(rowid);
                if (docMap.size === 0) {
                    this.invertedIndex.delete(term);
                }
            }
        }
        // Clean up bigram entries
        for (let i = 0; i < terms.length - 1; i++) {
            const bigram = terms[i] + "_" + terms[i + 1];
            const docs = this.bigramIndex.get(bigram);
            if (docs) {
                docs.delete(rowid);
                if (docs.size === 0) {
                    this.bigramIndex.delete(bigram);
                }
            }
        }
        this.docTerms.delete(rowid);
        this.docCount = Math.max(0, this.docCount - 1);
        this.totalTermLen = Math.max(0, this.totalTermLen - terms.length);
        this.updateAvgDocLen();
    }
    /** Remove several documents. */
    deleteBulk(rowids) {
        for (const id of rowids) {
            this.delete(id);
        }
    }
    /** Refresh avgDocLen from the running totals — O(1). */
    updateAvgDocLen() {
        this.avgDocLen = this.docCount === 0
            ? 0
            : this.totalTermLen / this.docCount;
    }
    /**
     * BM25 search with bigram phrase boosting.
     * Code-tuned parameters: k1 = 1.8, b = 0.35
     * Multi-word queries get a 0.3 weight bigram boost.
     * Returns up to `limit` { rowid, score } entries, best first.
     */
    search(queryText, limit) {
        const queryTerms = this.tokenize(queryText);
        if (queryTerms.length === 0)
            return [];
        const k1 = 1.8;
        const b = 0.35;
        const scores = new Map();
        for (const term of queryTerms) {
            const docMap = this.invertedIndex.get(term);
            if (!docMap)
                continue;
            // IDF = log((N - df + 0.5) / (df + 0.5) + 1)
            const df = docMap.size;
            const idf = Math.log((this.docCount - df + 0.5) / (df + 0.5) + 1);
            // TF read directly from inverted index - O(1)
            for (const [rowid, tf] of docMap) {
                // Guard against index drift: a missing docTerms entry
                // counts as length 0 instead of crashing.
                const docLen = (this.docTerms.get(rowid) ?? []).length;
                // BM25 formula
                const tfNorm = (tf * (k1 + 1)) /
                    (tf + k1 * (1 - b + b * (docLen / (this.avgDocLen || 1))));
                const score = idf * tfNorm;
                scores.set(rowid, (scores.get(rowid) || 0) + score);
            }
        }
        // Bigram phrase boost for multi-word queries
        if (queryTerms.length >= 2) {
            const bigramWeight = 0.3;
            for (let i = 0; i < queryTerms.length - 1; i++) {
                const bigram = queryTerms[i] + "_" + queryTerms[i + 1];
                const docs = this.bigramIndex.get(bigram);
                if (!docs)
                    continue;
                for (const rowid of docs) {
                    const existing = scores.get(rowid) || 0;
                    scores.set(rowid, existing + bigramWeight);
                }
            }
        }
        return Array.from(scores.entries())
            .map(([rowid, score]) => ({ rowid, score }))
            .sort((a, b) => b.score - a.score)
            .slice(0, limit);
    }
}
|
|
475
|
+
// ─── Database Manager ────────────────────────────────────────────────
|
|
476
|
+
export class NrekiDB {
|
|
477
|
+
// sql.js Database handle; assigned in _init()
db;
// In-memory vector index, loaded from the .vec sidecar in _init()
vecIndex = new VectorIndex();
// In-memory BM25 keyword index, rebuilt from the chunks table in _init()
kwIndex = new KeywordIndex();
// On-disk path of the SQLite image
dbPath;
// On-disk path of the serialized vector index (sidecar file)
vecPath;
// Memoized _init() promise so concurrent initialize() calls share one run
initPromise = null;
// Set true once _init() completes successfully
_ready = false;
/**
 * @param {string} [dbPath=".nreki.db"] - SQLite database file path; the
 *   vector sidecar replaces a trailing ".db" with ".vec".
 */
constructor(dbPath = ".nreki.db") {
    this.dbPath = dbPath;
    // NOTE(review): a dbPath without a ".db" suffix leaves vecPath equal
    // to dbPath (the regex does not match) — confirm intended.
    this.vecPath = dbPath.replace(/\.db$/, ".vec");
}
|
|
488
|
+
/**
 * Async initialization - must be called before any DB operation.
 * Idempotent and safe under concurrent callers: the first call starts
 * _init() and every caller awaits the same memoized promise.
 */
async initialize() {
    if (this._ready)
        return;
    if (!this.initPromise) {
        this.initPromise = this._init();
    }
    await this.initPromise;
}
|
|
497
|
+
/**
 * One-shot setup: boot sql.js, load (or create) the SQLite image,
 * apply the schema, restore the vector sidecar with the persisted
 * embedding dimension, and rebuild the keyword index from chunks.
 * Sets _ready on completion.
 */
async _init() {
    const SQL = await initSqlJs();
    // Load existing database if it exists
    if (fs.existsSync(this.dbPath)) {
        const fileBuffer = fs.readFileSync(this.dbPath);
        this.db = new SQL.Database(fileBuffer);
    }
    else {
        this.db = new SQL.Database();
    }
    // Setup schema first (creates metadata table needed for dimension lookup)
    this.setupSchema();
    // Load vector index using stored dimension (default 512)
    const storedDim = parseInt(this.getMetadata("embedding_dim") ?? "512", 10);
    if (fs.existsSync(this.vecPath)) {
        const vecBuffer = fs.readFileSync(this.vecPath);
        this.vecIndex = VectorIndex.deserialize(vecBuffer, storedDim);
    }
    // Rebuild keyword index from existing data
    this.rebuildKeywordIndex();
    this._ready = true;
}
|
|
519
|
+
/** True once _init() has completed. */
get ready() {
    return this._ready;
}
|
|
522
|
+
// ─── Schema ──────────────────────────────────────────────────
|
|
523
|
+
/**
 * Create all tables and indexes (idempotent via IF NOT EXISTS), then
 * attempt additive column migrations for databases created by older
 * versions; ALTER failures mean the column already exists and are ignored.
 */
setupSchema() {
    this.db.run(`
      -- Indexed files with content hashes for Merkle-style diffing
      CREATE TABLE IF NOT EXISTS files (
        path TEXT PRIMARY KEY,
        hash TEXT NOT NULL,
        indexed_at TEXT DEFAULT (datetime('now'))
      );

      -- AST chunks extracted from source files
      CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        path TEXT NOT NULL,
        shorthand TEXT NOT NULL,
        raw_code TEXT NOT NULL,
        node_type TEXT NOT NULL DEFAULT 'unknown',
        start_line INTEGER NOT NULL DEFAULT 0,
        end_line INTEGER NOT NULL DEFAULT 0,
        start_index INTEGER NOT NULL DEFAULT 0,
        end_index INTEGER NOT NULL DEFAULT 0,
        symbol_name TEXT NOT NULL DEFAULT ''
      );

      -- Token usage tracking
      CREATE TABLE IF NOT EXISTS usage_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT DEFAULT (datetime('now')),
        tool_name TEXT NOT NULL,
        input_tokens INTEGER NOT NULL DEFAULT 0,
        output_tokens INTEGER NOT NULL DEFAULT 0,
        saved_tokens INTEGER NOT NULL DEFAULT 0
      );

      -- Indexes for common queries
      CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
      CREATE INDEX IF NOT EXISTS idx_usage_timestamp ON usage_log(timestamp);

      -- Metadata key-value store (embedding dimension, model name, etc.)
      CREATE TABLE IF NOT EXISTS metadata (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      );
    `);
    // Migration: add columns for existing DBs that lack them
    const migrationColumns = [
        "ALTER TABLE chunks ADD COLUMN start_index INTEGER NOT NULL DEFAULT 0",
        "ALTER TABLE chunks ADD COLUMN end_index INTEGER NOT NULL DEFAULT 0",
        "ALTER TABLE chunks ADD COLUMN symbol_name TEXT NOT NULL DEFAULT ''",
    ];
    for (const sql of migrationColumns) {
        try {
            this.db.run(sql);
        }
        catch { /* column already exists */ }
    }
}
|
|
579
|
+
/**
 * Rebuild the in-memory keyword index from all existing chunks.
 * db.exec returns an array of result sets; a single SELECT yields at
 * most one, so an empty array means the chunks table is empty.
 */
rebuildKeywordIndex() {
    const rows = this.db.exec("SELECT id, shorthand FROM chunks");
    if (rows.length === 0)
        return;
    for (const row of rows[0].values) {
        const [id, shorthand] = row;
        this.kwIndex.insert(id, shorthand);
    }
}
|
|
589
|
+
// ─── Metadata ────────────────────────────────────────────────
|
|
590
|
+
/**
 * Read a metadata value by key, or null if not set.
 * @param {string} key - metadata key to look up
 * @returns {string|null} the stored value, or null when absent
 */
getMetadata(key) {
    const stmt = this.db.prepare("SELECT value FROM metadata WHERE key = ?");
    stmt.bind([key]);
    let result = null;
    if (stmt.step()) {
        result = stmt.getAsObject().value;
    }
    // Release the sql.js statement's WASM-side resources
    stmt.free();
    return result;
}
|
|
601
|
+
/**
 * Write a metadata key-value pair (upsert).
 * @param {string} key
 * @param {string} value
 */
setMetadata(key, value) {
    this.db.run("INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)", [key, value]);
}
|
|
605
|
+
/**
 * Check if the active embedding dimension matches what was stored.
 * If they differ, clear all vectors and update the stored dimension.
 * Returns true if a re-index is needed.
 * @param {number} activeDim - dimension of the currently configured embedder
 * @returns {boolean} true when the index was wiped and must be rebuilt
 */
checkEmbeddingDimension(activeDim) {
    const storedDim = this.getMetadata("embedding_dim");
    if (storedDim && parseInt(storedDim, 10) !== activeDim) {
        console.error(`[NREKI] Embedding dimension changed (${storedDim} -> ${activeDim}). Clearing index.`);
        // Clear all vectors
        this.vecIndex = new VectorIndex();
        // Clear all chunks and files so they get re-indexed
        this.db.run("DELETE FROM chunks");
        this.db.run("DELETE FROM files");
        this.kwIndex = new KeywordIndex();
        this.setMetadata("embedding_dim", String(activeDim));
        return true;
    }
    // First run: record the active dimension for future comparisons
    if (!storedDim) {
        this.setMetadata("embedding_dim", String(activeDim));
    }
    return false;
}
|
|
628
|
+
// ─── Persistence ─────────────────────────────────────────────
|
|
629
|
+
/**
 * Persist database and vector index to disk.
 * Creates the parent directory of dbPath if missing.
 * NOTE(review): the two writes are not atomic — a crash between them
 * can leave the .db and .vec files out of sync; confirm acceptable.
 */
save() {
    // Save SQLite database
    const data = this.db.export();
    const buffer = Buffer.from(data);
    const dir = path.dirname(this.dbPath);
    if (dir && !fs.existsSync(dir)) {
        fs.mkdirSync(dir, { recursive: true });
    }
    fs.writeFileSync(this.dbPath, buffer);
    // Save vector index
    const vecData = this.vecIndex.serialize();
    fs.writeFileSync(this.vecPath, vecData);
}
|
|
643
|
+
// ─── File Operations ─────────────────────────────────────────
|
|
644
|
+
fileNeedsUpdate(filePath, content) {
|
|
645
|
+
const newHash = crypto.createHash("sha256").update(content).digest("hex");
|
|
646
|
+
const stmt = this.db.prepare("SELECT hash FROM files WHERE path = ?");
|
|
647
|
+
stmt.bind([filePath]);
|
|
648
|
+
if (stmt.step()) {
|
|
649
|
+
const row = stmt.getAsObject();
|
|
650
|
+
stmt.free();
|
|
651
|
+
return row.hash !== newHash;
|
|
652
|
+
}
|
|
653
|
+
stmt.free();
|
|
654
|
+
return true;
|
|
655
|
+
}
|
|
656
|
+
hashContent(content) {
|
|
657
|
+
return crypto.createHash("sha256").update(content).digest("hex");
|
|
658
|
+
}
|
|
659
|
+
upsertFile(filePath, hash) {
|
|
660
|
+
this.db.run("INSERT OR REPLACE INTO files (path, hash, indexed_at) VALUES (?, ?, datetime('now'))", [filePath, hash]);
|
|
661
|
+
}
|
|
662
|
+
clearChunks(filePath) {
|
|
663
|
+
const stmt = this.db.prepare("SELECT id FROM chunks WHERE path = ?");
|
|
664
|
+
stmt.bind([filePath]);
|
|
665
|
+
const ids = [];
|
|
666
|
+
while (stmt.step()) {
|
|
667
|
+
const row = stmt.getAsObject();
|
|
668
|
+
ids.push(row.id);
|
|
669
|
+
}
|
|
670
|
+
stmt.free();
|
|
671
|
+
if (ids.length > 0) {
|
|
672
|
+
this.vecIndex.deleteBulk(ids);
|
|
673
|
+
this.kwIndex.deleteBulk(ids);
|
|
674
|
+
this.db.run("DELETE FROM chunks WHERE path = ?", [filePath]);
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
// ─── Chunk Operations ────────────────────────────────────────
|
|
678
|
+
insertChunk(filePath, shorthand, rawCode, nodeType, startLine, endLine, embedding, startIndex = 0, endIndex = 0, symbolName = "") {
|
|
679
|
+
this.db.run(`INSERT INTO chunks (path, shorthand, raw_code, node_type, start_line, end_line, start_index, end_index, symbol_name)
|
|
680
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, [filePath, shorthand, rawCode, nodeType, startLine, endLine, startIndex, endIndex, symbolName]);
|
|
681
|
+
const rowid = this.db.exec("SELECT last_insert_rowid() AS id")[0]
|
|
682
|
+
.values[0][0];
|
|
683
|
+
// A-04: Only insert non-empty vectors (Lite mode uses Float32Array(0))
|
|
684
|
+
if (embedding.length > 0) {
|
|
685
|
+
this.vecIndex.insert(rowid, embedding);
|
|
686
|
+
}
|
|
687
|
+
this.kwIndex.insert(rowid, shorthand);
|
|
688
|
+
return rowid;
|
|
689
|
+
}
|
|
690
|
+
/**
 * Insert many chunks inside a single SQLite transaction.
 * On any failure the SQL inserts are rolled back and the error rethrown.
 * NOTE(review): entries added to the in-memory vector/keyword indexes by
 * insertChunk are NOT undone by ROLLBACK - confirm callers rebuild or
 * discard the indexes after a failed batch.
 * @param {Array<object>} chunks - Objects with path, shorthand, rawCode,
 *   nodeType, startLine, endLine, embedding, and optional
 *   startIndex/endIndex/symbolName fields.
 */
insertChunksBatch(chunks) {
    this.db.run("BEGIN TRANSACTION");
    try {
        for (const chunk of chunks) {
            this.insertChunk(chunk.path, chunk.shorthand, chunk.rawCode, chunk.nodeType, chunk.startLine, chunk.endLine, chunk.embedding, chunk.startIndex ?? 0, chunk.endIndex ?? 0, chunk.symbolName ?? "");
        }
        this.db.run("COMMIT");
    }
    catch (err) {
        this.db.run("ROLLBACK");
        throw err;
    }
}
|
|
703
|
+
// ─── Path Boosting ────────────────────────────────────────────
|
|
704
|
+
/** Apply path-based weighting: boost src/, penalize tests/node_modules/. */
|
|
705
|
+
getPathBoost(filePath) {
|
|
706
|
+
const normalized = filePath.replace(/\\/g, "/").toLowerCase();
|
|
707
|
+
if (normalized.includes("/node_modules/"))
|
|
708
|
+
return 0.3;
|
|
709
|
+
if (normalized.includes("/dist/") || normalized.includes("/build/"))
|
|
710
|
+
return 0.5;
|
|
711
|
+
if (normalized.includes("/test") || normalized.includes("/__test"))
|
|
712
|
+
return 0.7;
|
|
713
|
+
if (normalized.includes("/src/"))
|
|
714
|
+
return 1.2;
|
|
715
|
+
if (normalized.includes("/lib/") || normalized.includes("/core/"))
|
|
716
|
+
return 1.1;
|
|
717
|
+
return 1.0;
|
|
718
|
+
}
|
|
719
|
+
// ─── Batch Helpers ─────────────────────────────────────────────
|
|
720
|
+
/**
|
|
721
|
+
* Batch-fetch paths for an array of chunk IDs. Single SQL query.
|
|
722
|
+
* Used by RRF fusion to apply path boosting without N+1 queries.
|
|
723
|
+
*/
|
|
724
|
+
fetchPathsBatch(ids) {
|
|
725
|
+
const result = new Map();
|
|
726
|
+
if (ids.length === 0)
|
|
727
|
+
return result;
|
|
728
|
+
const placeholders = ids.map(() => "?").join(",");
|
|
729
|
+
const stmt = this.db.prepare(`SELECT id, path FROM chunks WHERE id IN (${placeholders})`);
|
|
730
|
+
stmt.bind(ids);
|
|
731
|
+
while (stmt.step()) {
|
|
732
|
+
const row = stmt.getAsObject();
|
|
733
|
+
result.set(row.id, row.path);
|
|
734
|
+
}
|
|
735
|
+
stmt.free();
|
|
736
|
+
return result;
|
|
737
|
+
}
|
|
738
|
+
/**
|
|
739
|
+
* Batch-fetch full chunk data for an array of chunk IDs. Single SQL query.
|
|
740
|
+
* Used by all search methods to hydrate final results without N+1 queries.
|
|
741
|
+
*/
|
|
742
|
+
fetchChunksBatch(ids) {
|
|
743
|
+
const result = new Map();
|
|
744
|
+
if (ids.length === 0)
|
|
745
|
+
return result;
|
|
746
|
+
const placeholders = ids.map(() => "?").join(",");
|
|
747
|
+
const stmt = this.db.prepare(`SELECT id, path, shorthand, raw_code, node_type, start_line, end_line, start_index, end_index, symbol_name
|
|
748
|
+
FROM chunks WHERE id IN (${placeholders})`);
|
|
749
|
+
stmt.bind(ids);
|
|
750
|
+
while (stmt.step()) {
|
|
751
|
+
const row = stmt.getAsObject();
|
|
752
|
+
result.set(row.id, {
|
|
753
|
+
id: row.id,
|
|
754
|
+
path: row.path,
|
|
755
|
+
shorthand: row.shorthand,
|
|
756
|
+
raw_code: row.raw_code,
|
|
757
|
+
node_type: row.node_type,
|
|
758
|
+
start_line: row.start_line,
|
|
759
|
+
end_line: row.end_line,
|
|
760
|
+
start_index: row.start_index ?? 0,
|
|
761
|
+
end_index: row.end_index ?? 0,
|
|
762
|
+
symbol_name: row.symbol_name ?? "",
|
|
763
|
+
});
|
|
764
|
+
}
|
|
765
|
+
stmt.free();
|
|
766
|
+
return result;
|
|
767
|
+
}
|
|
768
|
+
// ─── Search Operations ───────────────────────────────────────
|
|
769
|
+
/**
|
|
770
|
+
* Hybrid search using Reciprocal Rank Fusion (RRF).
|
|
771
|
+
* Combines:
|
|
772
|
+
* - Vector similarity (semantic, cosine distance)
|
|
773
|
+
* - BM25 keyword matching (in pure JS inverted index)
|
|
774
|
+
*
|
|
775
|
+
* RRF formula: score = Σ 1/(k + rank_i) where k=10
|
|
776
|
+
*/
|
|
777
|
+
searchHybrid(queryEmbedding, queryText, limit = 10) {
|
|
778
|
+
// 1. Vector search - top 60 by cosine similarity
|
|
779
|
+
const vecResults = this.vecIndex.search(queryEmbedding, 60);
|
|
780
|
+
const vecRanks = new Map();
|
|
781
|
+
vecResults.forEach((r, i) => vecRanks.set(r.rowid, i + 1));
|
|
782
|
+
// 2. BM25 keyword search - top 60 by term relevance
|
|
783
|
+
const kwResults = this.kwIndex.search(queryText, 60);
|
|
784
|
+
const kwRanks = new Map();
|
|
785
|
+
kwResults.forEach((r, i) => kwRanks.set(r.rowid, i + 1));
|
|
786
|
+
// 3. RRF fusion with path boosting (batch query)
|
|
787
|
+
const allIds = new Set([...vecRanks.keys(), ...kwRanks.keys()]);
|
|
788
|
+
const pathMap = this.fetchPathsBatch([...allIds]);
|
|
789
|
+
const scored = [];
|
|
790
|
+
for (const id of allIds) {
|
|
791
|
+
const vecRank = vecRanks.get(id);
|
|
792
|
+
const kwRank = kwRanks.get(id);
|
|
793
|
+
let rrf = (vecRank ? 1.0 / (10 + vecRank) : 0) +
|
|
794
|
+
(kwRank ? 1.0 / (10 + kwRank) : 0);
|
|
795
|
+
const filePath = pathMap.get(id);
|
|
796
|
+
if (filePath) {
|
|
797
|
+
rrf *= this.getPathBoost(filePath);
|
|
798
|
+
}
|
|
799
|
+
scored.push({ id, rrf });
|
|
800
|
+
}
|
|
801
|
+
scored.sort((a, b) => b.rrf - a.rrf);
|
|
802
|
+
const topIds = scored.slice(0, limit);
|
|
803
|
+
// 4. Fetch full chunk data (batch query)
|
|
804
|
+
const chunkMap = this.fetchChunksBatch(topIds.map(t => t.id));
|
|
805
|
+
const results = [];
|
|
806
|
+
for (const { id, rrf } of topIds) {
|
|
807
|
+
const row = chunkMap.get(id);
|
|
808
|
+
if (row) {
|
|
809
|
+
results.push({
|
|
810
|
+
id: row.id, path: row.path, shorthand: row.shorthand,
|
|
811
|
+
raw_code: row.raw_code, node_type: row.node_type,
|
|
812
|
+
start_line: row.start_line, end_line: row.end_line,
|
|
813
|
+
start_index: row.start_index, end_index: row.end_index,
|
|
814
|
+
symbol_name: row.symbol_name,
|
|
815
|
+
rrf_score: rrf,
|
|
816
|
+
});
|
|
817
|
+
}
|
|
818
|
+
}
|
|
819
|
+
return results;
|
|
820
|
+
}
|
|
821
|
+
/**
|
|
822
|
+
* Keyword-only search using BM25 (for Lite mode - no embeddings needed).
|
|
823
|
+
* Uses the in-memory KeywordIndex with path boosting.
|
|
824
|
+
*/
|
|
825
|
+
searchKeywordOnly(queryText, limit = 10) {
|
|
826
|
+
const kwResults = this.kwIndex.search(queryText, limit * 2);
|
|
827
|
+
if (kwResults.length === 0)
|
|
828
|
+
return [];
|
|
829
|
+
const chunkMap = this.fetchChunksBatch(kwResults.map(r => r.rowid));
|
|
830
|
+
const results = [];
|
|
831
|
+
for (const { rowid, score } of kwResults) {
|
|
832
|
+
const row = chunkMap.get(rowid);
|
|
833
|
+
if (row) {
|
|
834
|
+
const boostedScore = score * this.getPathBoost(row.path);
|
|
835
|
+
results.push({
|
|
836
|
+
id: row.id, path: row.path, shorthand: row.shorthand,
|
|
837
|
+
raw_code: row.raw_code, node_type: row.node_type,
|
|
838
|
+
start_line: row.start_line, end_line: row.end_line,
|
|
839
|
+
start_index: row.start_index, end_index: row.end_index,
|
|
840
|
+
symbol_name: row.symbol_name,
|
|
841
|
+
rrf_score: boostedScore,
|
|
842
|
+
});
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
results.sort((a, b) => b.rrf_score - a.rrf_score);
|
|
846
|
+
return results.slice(0, limit);
|
|
847
|
+
}
|
|
848
|
+
/**
|
|
849
|
+
* BM25-powered fast resolution for import-anchored auto-context.
|
|
850
|
+
* Searches "symbol pathHint" together to defeat homonyms.
|
|
851
|
+
* Enforces a 150ms hard timeout to prevent event loop blocking.
|
|
852
|
+
*/
|
|
853
|
+
resolveImportSignatures(deps, maxTimeMs = 150) {
|
|
854
|
+
if (!this._ready || deps.length === 0)
|
|
855
|
+
return [];
|
|
856
|
+
const start = performance.now();
|
|
857
|
+
const results = [];
|
|
858
|
+
const seenSymbols = new Set();
|
|
859
|
+
const escapeRegex = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
860
|
+
for (const dep of deps) {
|
|
861
|
+
if (seenSymbols.has(dep.symbol))
|
|
862
|
+
continue;
|
|
863
|
+
seenSymbols.add(dep.symbol);
|
|
864
|
+
if (performance.now() - start > maxTimeMs)
|
|
865
|
+
break;
|
|
866
|
+
// BM25 with two terms: symbol + path hint defeats homonyms
|
|
867
|
+
const cleanHint = dep.pathHint.replace(/['"%_]/g, " ").trim();
|
|
868
|
+
const queryText = cleanHint
|
|
869
|
+
? `${dep.symbol} ${cleanHint}`
|
|
870
|
+
: dep.symbol;
|
|
871
|
+
const hits = this.searchKeywordOnly(queryText, 3);
|
|
872
|
+
if (hits.length > 0) {
|
|
873
|
+
// Final validation: symbol must appear textually in the shorthand
|
|
874
|
+
// Uses safe boundaries (not \b) to handle $store etc.
|
|
875
|
+
const safeSym = escapeRegex(dep.symbol);
|
|
876
|
+
const exactRegex = new RegExp(`(^|[^a-zA-Z0-9_$])${safeSym}(?=[^a-zA-Z0-9_$]|$)`);
|
|
877
|
+
for (const hit of hits) {
|
|
878
|
+
if (exactRegex.test(hit.shorthand)) {
|
|
879
|
+
results.push({ raw: hit.shorthand, path: hit.path });
|
|
880
|
+
break;
|
|
881
|
+
}
|
|
882
|
+
}
|
|
883
|
+
}
|
|
884
|
+
}
|
|
885
|
+
return results;
|
|
886
|
+
}
|
|
887
|
+
searchVector(queryEmbedding, limit = 10) {
|
|
888
|
+
const vecResults = this.vecIndex.search(queryEmbedding, limit);
|
|
889
|
+
if (vecResults.length === 0)
|
|
890
|
+
return [];
|
|
891
|
+
const chunkMap = this.fetchChunksBatch(vecResults.map(r => r.rowid));
|
|
892
|
+
const results = [];
|
|
893
|
+
for (const { rowid, distance } of vecResults) {
|
|
894
|
+
const row = chunkMap.get(rowid);
|
|
895
|
+
if (row) {
|
|
896
|
+
results.push({
|
|
897
|
+
id: row.id, path: row.path, shorthand: row.shorthand,
|
|
898
|
+
raw_code: row.raw_code, node_type: row.node_type,
|
|
899
|
+
start_line: row.start_line, end_line: row.end_line,
|
|
900
|
+
start_index: row.start_index, end_index: row.end_index,
|
|
901
|
+
symbol_name: row.symbol_name,
|
|
902
|
+
rrf_score: 1 - distance,
|
|
903
|
+
});
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
return results;
|
|
907
|
+
}
|
|
908
|
+
// ─── Usage Tracking ──────────────────────────────────────────
|
|
909
|
+
logUsage(toolName, inputTokens, outputTokens, savedTokens) {
|
|
910
|
+
this.db.run(`INSERT INTO usage_log (tool_name, input_tokens, output_tokens, saved_tokens)
|
|
911
|
+
VALUES (?, ?, ?, ?)`, [toolName, inputTokens, outputTokens, savedTokens]);
|
|
912
|
+
}
|
|
913
|
+
getUsageStats(since) {
|
|
914
|
+
const whereClause = since ? "WHERE timestamp >= ?" : "";
|
|
915
|
+
const params = since ? [since] : [];
|
|
916
|
+
const stmt = this.db.prepare(`SELECT
|
|
917
|
+
COALESCE(SUM(input_tokens), 0) AS total_input,
|
|
918
|
+
COALESCE(SUM(output_tokens), 0) AS total_output,
|
|
919
|
+
COALESCE(SUM(saved_tokens), 0) AS total_saved,
|
|
920
|
+
COUNT(*) AS tool_calls
|
|
921
|
+
FROM usage_log ${whereClause}`);
|
|
922
|
+
if (params.length > 0)
|
|
923
|
+
stmt.bind(params);
|
|
924
|
+
let result = { total_input: 0, total_output: 0, total_saved: 0, tool_calls: 0 };
|
|
925
|
+
if (stmt.step()) {
|
|
926
|
+
const row = stmt.getAsObject();
|
|
927
|
+
result = {
|
|
928
|
+
total_input: row.total_input ?? 0,
|
|
929
|
+
total_output: row.total_output ?? 0,
|
|
930
|
+
total_saved: row.total_saved ?? 0,
|
|
931
|
+
tool_calls: row.tool_calls ?? 0,
|
|
932
|
+
};
|
|
933
|
+
}
|
|
934
|
+
stmt.free();
|
|
935
|
+
return result;
|
|
936
|
+
}
|
|
937
|
+
/**
|
|
938
|
+
* Find the heaviest files by total raw code size.
|
|
939
|
+
* Zero disk I/O - queries indexed data in SQLite.
|
|
940
|
+
*/
|
|
941
|
+
getTopHeavyFiles(limit = 5) {
|
|
942
|
+
if (!this._ready)
|
|
943
|
+
return [];
|
|
944
|
+
const stmt = this.db.prepare(`
|
|
945
|
+
SELECT path, SUM(LENGTH(raw_code)) as total_chars
|
|
946
|
+
FROM chunks
|
|
947
|
+
GROUP BY path
|
|
948
|
+
ORDER BY total_chars DESC
|
|
949
|
+
LIMIT ?
|
|
950
|
+
`);
|
|
951
|
+
stmt.bind([limit]);
|
|
952
|
+
const results = [];
|
|
953
|
+
while (stmt.step()) {
|
|
954
|
+
const row = stmt.getAsObject();
|
|
955
|
+
results.push({
|
|
956
|
+
path: row.path,
|
|
957
|
+
estimated_tokens: Math.ceil(row.total_chars / 3.5),
|
|
958
|
+
});
|
|
959
|
+
}
|
|
960
|
+
stmt.free();
|
|
961
|
+
return results;
|
|
962
|
+
}
|
|
963
|
+
// ─── Statistics ──────────────────────────────────────────────
|
|
964
|
+
getStats() {
|
|
965
|
+
const rows = this.db.exec(`
|
|
966
|
+
SELECT
|
|
967
|
+
COUNT(*) AS total_chunks,
|
|
968
|
+
COUNT(DISTINCT path) AS total_files,
|
|
969
|
+
COALESCE(SUM(LENGTH(raw_code)), 0) AS total_raw_tokens,
|
|
970
|
+
COALESCE(SUM(LENGTH(shorthand)), 0) AS total_shorthand_tokens
|
|
971
|
+
FROM chunks
|
|
972
|
+
`);
|
|
973
|
+
if (rows.length === 0 || rows[0].values.length === 0) {
|
|
974
|
+
return {
|
|
975
|
+
total_chunks: 0,
|
|
976
|
+
total_files: 0,
|
|
977
|
+
total_raw_tokens: 0,
|
|
978
|
+
total_shorthand_tokens: 0,
|
|
979
|
+
compression_ratio: 0,
|
|
980
|
+
};
|
|
981
|
+
}
|
|
982
|
+
const [total_chunks, total_files, total_raw_tokens, total_shorthand_tokens] = rows[0].values[0];
|
|
983
|
+
return {
|
|
984
|
+
total_chunks,
|
|
985
|
+
total_files,
|
|
986
|
+
total_raw_tokens,
|
|
987
|
+
total_shorthand_tokens,
|
|
988
|
+
compression_ratio: total_raw_tokens > 0
|
|
989
|
+
? 1 - total_shorthand_tokens / total_raw_tokens
|
|
990
|
+
: 0,
|
|
991
|
+
};
|
|
992
|
+
}
|
|
993
|
+
getFileCount() {
|
|
994
|
+
const rows = this.db.exec("SELECT COUNT(*) AS count FROM files");
|
|
995
|
+
if (rows.length === 0)
|
|
996
|
+
return 0;
|
|
997
|
+
return rows[0].values[0][0];
|
|
998
|
+
}
|
|
999
|
+
/** Number of vectors currently held by the in-memory vector index. */
getVectorCount() {
    return this.vecIndex.size;
}
|
|
1002
|
+
/**
|
|
1003
|
+
* Scan ALL chunks whose raw_code contains the given symbol name.
|
|
1004
|
+
* Returns distinct file paths. Used by prepare_refactor for 100% coverage.
|
|
1005
|
+
*/
|
|
1006
|
+
searchRawCode(symbolName) {
|
|
1007
|
+
if (!this._ready)
|
|
1008
|
+
return [];
|
|
1009
|
+
// C-04 + A-07: Escape backslashes first, then LIKE wildcards
|
|
1010
|
+
const escaped = symbolName.replace(/\\/g, '\\\\').replace(/[%_]/g, '\\$&');
|
|
1011
|
+
const stmt = this.db.prepare(`SELECT DISTINCT path FROM chunks WHERE raw_code LIKE ? ESCAPE '\\'`);
|
|
1012
|
+
stmt.bind([`%${escaped}%`]);
|
|
1013
|
+
const paths = [];
|
|
1014
|
+
while (stmt.step()) {
|
|
1015
|
+
paths.push(stmt.getAsObject().path);
|
|
1016
|
+
}
|
|
1017
|
+
stmt.free();
|
|
1018
|
+
return paths;
|
|
1019
|
+
}
|
|
1020
|
+
/** Persist everything to disk, then close the SQLite handle. */
close() {
    this.save();
    this.db.close();
}
|
|
1024
|
+
}
|
|
1025
|
+
// Re-export the similarity helper so tests can exercise it directly.
export { fastSimilarity };
// Backward-compat alias for callers still importing the old class name.
export { NrekiDB as TokenGuardDB };
|
|
1029
|
+
//# sourceMappingURL=database.js.map
|