@pi-unipi/compactor 0.2.3 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/package.json +2 -2
- package/src/commands/index.ts +79 -169
- package/src/compaction/content.ts +2 -2
- package/src/compaction/cut.ts +10 -6
- package/src/compaction/hooks.ts +82 -52
- package/src/compaction/recall-scope.ts +1 -1
- package/src/config/manager.ts +0 -0
- package/src/config/presets.ts +10 -10
- package/src/executor/executor.ts +4 -4
- package/src/index.ts +34 -45
- package/src/info-screen.ts +97 -40
- package/src/session/db.ts +40 -11
- package/src/session/extract.ts +37 -0
- package/src/tools/ctx-batch-execute.ts +5 -16
- package/src/tools/ctx-doctor.ts +0 -18
- package/src/tools/ctx-stats.ts +43 -10
- package/src/tools/register.ts +30 -122
- package/src/tui/settings-overlay.ts +12 -21
- package/src/types.ts +8 -26
- package/src/store/chunking.ts +0 -126
- package/src/store/db-base.ts +0 -87
- package/src/store/index.ts +0 -513
- package/src/store/unified.ts +0 -109
- package/src/tools/ctx-fetch-and-index.ts +0 -32
- package/src/tools/ctx-index.ts +0 -36
- package/src/tools/ctx-search.ts +0 -19
package/src/store/db-base.ts
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* SQLite backend abstraction with auto-detection
|
|
3
|
-
*/
|
|
4
|
-
|
|
5
|
-
import { existsSync, mkdirSync } from "node:fs";
|
|
6
|
-
import { homedir } from "node:os";
|
|
7
|
-
import { dirname, join } from "node:path";
|
|
8
|
-
|
|
9
|
-
/**
 * Resolve the on-disk location of a named compactor database.
 *
 * The path is `<home>/.unipi/db/compactor/<name>.db`. The parent directory of
 * that path is created (recursively) when it does not exist yet; the database
 * file itself is not created here.
 *
 * @param name - Base name of the database file, without the `.db` extension.
 * @returns The full path to the database file.
 */
export function defaultDBPath(name: string): string {
  const dbFile = join(homedir(), ".unipi", "db", "compactor", `${name}.db`);
  const parent = dirname(dbFile);
  if (existsSync(parent)) {
    return dbFile;
  }
  mkdirSync(parent, { recursive: true });
  return dbFile;
}
|
|
15
|
-
|
|
16
|
-
/** Cached driver module; stays null until the first `loadSQLite()` call settles. */
let sqliteLib: any = null;
/** Flavor label that goes with `sqliteLib`. */
let sqliteFlavor: "bun" | "better-sqlite3" | null = null;

/**
 * Load a SQLite driver module and cache it for the lifetime of the process.
 *
 * Drivers are tried in order: `bun:sqlite` first, then `better-sqlite3`.
 * `node:sqlite` is deliberately skipped because its `DatabaseSync` API is
 * incompatible with better-sqlite3's `Database` constructor pattern.
 *
 * When neither import succeeds the function does NOT throw: it caches and
 * returns an empty object as `lib`, labelled `"better-sqlite3"`, so callers
 * only fail once they try to construct a database.
 *
 * @returns The driver module (`lib`) and which driver it is (`flavor`).
 */
export async function loadSQLite() {
  if (sqliteLib) return { lib: sqliteLib, flavor: sqliteFlavor! };

  // Import specifiers stay literal so bundlers can still see them.
  const attempts = [
    { flavor: "bun" as const, load: () => import("bun:sqlite" as any) },
    { flavor: "better-sqlite3" as const, load: () => import("better-sqlite3") },
  ];

  for (const attempt of attempts) {
    try {
      sqliteLib = await attempt.load();
    } catch {
      continue; // driver not available in this runtime; try the next one
    }
    sqliteFlavor = attempt.flavor;
    return { lib: sqliteLib, flavor: sqliteFlavor };
  }

  // Nothing could be imported: cache an empty placeholder module.
  sqliteLib = {};
  sqliteFlavor = "better-sqlite3";
  return { lib: sqliteLib, flavor: sqliteFlavor };
}
|
|
41
|
-
|
|
42
|
-
/**
 * Put a freshly opened database handle into WAL mode and tune it.
 *
 * Always runs `journal_mode = WAL` and `synchronous = NORMAL`. A 256MB
 * `mmap_size` (memory-mapping for read-heavy FTS5 search workloads) is applied
 * unless the loaded config sets `pipeline.mmapPragma` to `false`.
 *
 * If anything in the config-driven step throws (the config module cannot be
 * required or loaded, or the pragma itself fails), the mmap pragma is
 * attempted once more unconditionally and any error from that attempt is
 * ignored.
 *
 * @param db - Database handle exposing a synchronous `exec(sql)` method.
 */
export function applyWALPragmas(db: any): void {
  db.exec("PRAGMA journal_mode = WAL;");
  db.exec("PRAGMA synchronous = NORMAL;");

  let configStepFailed = false;
  try {
    const { loadConfig } = require("../config/manager.js");
    const settings = loadConfig();
    const mmapDisabled = settings.pipeline?.mmapPragma === false;
    if (!mmapDisabled) {
      db.exec("PRAGMA mmap_size = 268435456;"); // 256MB
    }
  } catch {
    configStepFailed = true;
  }

  if (configStepFailed) {
    // Fallback: always apply mmap if the config-driven step failed.
    try {
      db.exec("PRAGMA mmap_size = 268435456;");
    } catch {
      /* unsupported runtime */
    }
  }
}
|
|
57
|
-
|
|
58
|
-
/**
 * Run a synchronous database operation, retrying while SQLite reports BUSY.
 *
 * Only errors whose `code` is exactly `"SQLITE_BUSY"` are retried; any other
 * error is rethrown immediately. Between attempts the calling thread is
 * blocked (via `Atomics.wait`) for an exponentially growing delay with jitter:
 * roughly 10-20ms, 20-30ms, 40-50ms, ...
 *
 * `fn` is always invoked at least once: a `maxRetries` below 1 (or NaN) is
 * treated as a single attempt instead of skipping the call and throwing
 * `undefined`.
 *
 * @param fn - The operation to run.
 * @param maxRetries - Total number of attempts (not additional retries).
 * @returns Whatever `fn` returns on its first successful attempt.
 * @throws The last error raised by `fn` once attempts are exhausted, or the
 *         first non-BUSY error.
 */
export function withRetry<T>(fn: () => T, maxRetries = 3): T {
  const attempts = maxRetries >= 1 ? Math.floor(maxRetries) : 1;

  for (let attempt = 0; ; attempt++) {
    try {
      return fn();
    } catch (err: any) {
      const exhausted = attempt >= attempts - 1;
      if (err?.code !== "SQLITE_BUSY" || exhausted) throw err;

      // Synchronous sleep: callers are synchronous, so a timer cannot be used.
      const delay = Math.pow(2, attempt) * 10 + Math.random() * 10;
      Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, Math.floor(delay));
    }
  }
}
|
|
75
|
-
|
|
76
|
-
/**
 * Decide whether an error indicates a corrupt or non-SQLite database file.
 *
 * Matches the messages "database disk image is malformed" and "file is not a
 * database" (case-insensitive), as well as error objects whose `code` starts
 * with `SQLITE_CORRUPT` or equals `SQLITE_NOTADB`.
 *
 * "database is locked" is intentionally NOT treated as corruption: it is a
 * transient contention condition (SQLITE_BUSY), and classifying it as
 * corruption could lead a caller to discard a healthy database.
 *
 * @param err - Any thrown value; `null`/`undefined` and non-Error values are tolerated.
 * @returns `true` when the error looks like file corruption.
 */
export function isSQLiteCorruptionError(err: any): boolean {
  const code = String(err?.code ?? "");
  if (code.startsWith("SQLITE_CORRUPT") || code === "SQLITE_NOTADB") return true;

  const msg = String(err?.message ?? "").toLowerCase();
  return (
    msg.includes("database disk image is malformed") ||
    msg.includes("file is not a database")
  );
}
|
|
82
|
-
|
|
83
|
-
/**
 * Minimal shape of a prepared statement as used by the stores in this package.
 *
 * NOTE(review): presumably mirrors the statement objects returned by
 * `db.prepare()` in bun:sqlite / better-sqlite3 — confirm against the drivers.
 */
export interface PreparedStatement {
  /** Execute with the given bound parameters and return a single row (used for lookups and counts). */
  get(...params: any[]): any;
  /** Execute with the given bound parameters and return every row. */
  all(...params: any[]): any[];
  /** Execute a write with the given bound parameters; `changes` reports the affected row count. */
  run(...params: any[]): { changes: number };
}
|
package/src/store/index.ts
DELETED
|
@@ -1,513 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* ContentStore — FTS5 BM25-based knowledge base with trigram/fuzzy/RRF
|
|
3
|
-
*/
|
|
4
|
-
|
|
5
|
-
import { readFileSync, statSync, existsSync } from "node:fs";
|
|
6
|
-
import { createHash } from "node:crypto";
|
|
7
|
-
import { loadSQLite, applyWALPragmas, withRetry, isSQLiteCorruptionError, defaultDBPath } from "./db-base.js";
|
|
8
|
-
import type { PreparedStatement } from "./db-base.js";
|
|
9
|
-
import { autoChunk } from "./chunking.js";
|
|
10
|
-
import type { IndexResult, SearchResult, StoreStats } from "../types.js";
|
|
11
|
-
import { loadConfig } from "../config/manager.js";
|
|
12
|
-
|
|
13
|
-
// --- Fuzzy correction ---
|
|
14
|
-
|
|
15
|
-
/**
 * Collect the set of distinct words found in the given rows, for use as the
 * candidate pool of fuzzy suggestions.
 *
 * A "word" is a run of three or more characters from `a-z` and `_`, taken
 * from the lower-cased content; digits and other characters act as separators.
 *
 * @param rows - Rows whose `content` should contribute words.
 * @returns Every distinct word seen across all rows.
 */
function buildVocabulary(rows: Array<{ content: string }>): Set<string> {
  return rows.reduce((collected, { content }) => {
    const tokens = content.toLowerCase().match(/[a-z_]{3,}/g);
    if (tokens !== null) {
      tokens.forEach((token) => collected.add(token));
    }
    return collected;
  }, new Set<string>());
}
|
|
24
|
-
|
|
25
|
-
/**
 * Levenshtein edit distance between two strings.
 *
 * Counts the minimum number of single-character insertions, deletions and
 * substitutions needed to turn `a` into `b`. Comparison is by UTF-16 code
 * unit and is case-sensitive.
 *
 * Only two rows of the dynamic-programming table are kept, so memory is
 * proportional to `b.length` rather than `a.length * b.length`. This matters
 * because the function runs once per vocabulary word during fuzzy correction.
 *
 * @param a - Source string.
 * @param b - Target string.
 * @returns The edit distance (0 when the strings are equal).
 */
function levenshtein(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  const width = b.length + 1;
  // previous[j] = distance between the first i-1 chars of a and first j chars of b
  let previous: number[] = Array.from({ length: width }, (_, j) => j);
  let current: number[] = new Array<number>(width).fill(0);

  for (let i = 1; i <= a.length; i++) {
    current[0] = i;
    for (let j = 1; j <= b.length; j++) {
      const substitution = previous[j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1);
      current[j] = Math.min(previous[j] + 1, current[j - 1] + 1, substitution);
    }
    [previous, current] = [current, previous];
  }

  // After the final swap the last computed row lives in `previous`.
  return previous[b.length];
}
|
|
43
|
-
|
|
44
|
-
/**
 * Pick the vocabulary word closest to `word` by Levenshtein distance.
 *
 * The input is lower-cased before comparison. Candidates whose length differs
 * from the input by more than `maxDistance` are skipped without computing a
 * distance. When several candidates share the best distance, the one that
 * comes first in the set's iteration order wins.
 *
 * @param word - The (possibly misspelled) word to correct.
 * @param vocab - Candidate words.
 * @param maxDistance - Largest edit distance still accepted as a suggestion.
 * @returns The closest candidate, or `undefined` when none is within range.
 */
function fuzzySuggest(word: string, vocab: Set<string>, maxDistance = 2): string | undefined {
  const needle = word.toLowerCase();
  let closest: string | undefined;
  let closestDistance = maxDistance + 1;

  for (const candidate of vocab) {
    const lengthGap = Math.abs(candidate.length - needle.length);
    if (lengthGap <= maxDistance) {
      const distance = levenshtein(needle, candidate);
      if (distance <= maxDistance && distance < closestDistance) {
        closest = candidate;
        closestDistance = distance;
      }
    }
  }

  return closest;
}
|
|
59
|
-
|
|
60
|
-
/**
 * Jaccard similarity between the character-trigram sets of two strings.
 *
 * Both strings are lower-cased and padded with one space on each side before
 * being cut into overlapping 3-character windows. The score is
 * `|A ∩ B| / |A ∪ B|`, in the range 0..1.
 *
 * When neither string produces any trigram (both are empty) the function
 * returns 0 instead of the NaN that 0/0 would give, so callers can safely
 * compare the score against a threshold or sort by it.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns Similarity in [0, 1]; 1 means identical trigram sets.
 */
function trigramSimilarity(a: string, b: string): number {
  const trigramsOf = (s: string): Set<string> => {
    const grams = new Set<string>();
    const padded = ` ${s} `;
    for (let i = 0; i < padded.length - 2; i++) grams.add(padded.slice(i, i + 3));
    return grams;
  };

  const aTri = trigramsOf(a.toLowerCase());
  const bTri = trigramsOf(b.toLowerCase());

  let intersection = 0;
  for (const gram of aTri) if (bTri.has(gram)) intersection++;

  const union = aTri.size + bTri.size - intersection;
  return union === 0 ? 0 : intersection / union;
}
|
|
74
|
-
|
|
75
|
-
/**
 * Rank rows by trigram similarity to a free-text query.
 *
 * Each row is scored by the better of two comparisons: query vs. title, and
 * query vs. the first 200 characters of the content. Rows scoring 0.1 or less
 * are dropped; the rest are returned best-first, capped at `limit`.
 *
 * In the returned results `rank` holds the trigram score (higher is better),
 * `matchLayer` is `"trigram"`, and `contentType` is `"prose"` for rows stored
 * as `markdown` or `json` and `"code"` for everything else.
 *
 * @param rows - Candidate rows as read from the FTS table.
 * @param query - Raw user query.
 * @param limit - Maximum number of results to return.
 * @returns Matching rows converted to search results, best first.
 */
function trigramSearch(rows: Array<{ title: string; content: string; content_type: string; label: string; source: string; rank: number }>, query: string, limit: number): SearchResult[] {
  const needle = query.toLowerCase();
  const hits: SearchResult[] = [];

  for (const row of rows) {
    const titleScore = trigramSimilarity(needle, row.title.toLowerCase());
    const leadScore = trigramSimilarity(needle, row.content.toLowerCase().slice(0, 200));
    const score = Math.max(titleScore, leadScore);
    if (!(score > 0.1)) continue;

    const isProse = row.content_type === "markdown" || row.content_type === "json";
    hits.push({
      title: row.title,
      content: row.content,
      source: row.source,
      rank: score,
      contentType: isProse ? "prose" : "code",
      matchLayer: "trigram",
    });
  }

  hits.sort((x, y) => y.rank - x.rank);
  return hits.slice(0, limit);
}
|
|
99
|
-
|
|
100
|
-
/**
 * Reciprocal Rank Fusion: merge two ranked result lists into one.
 *
 * Every result contributes `1 / (k + position + 1)` for each list it appears
 * in (position is 0-based), and contributions for the same document are
 * summed. Two results are the same document when both `title` and `source`
 * match. The pair is joined with a NUL separator so that, for example,
 * title "ab" + source "c" can no longer collide with title "a" + source "bc".
 *
 * The payload of the first occurrence is kept. In the output every result has
 * `matchLayer` set to `"rrf"` and `rank` replaced by the fused score, so
 * higher `rank` means more relevant.
 *
 * @param porterResults - Results from the porter/FTS layer, best first.
 * @param trigramResults - Results from the trigram layer, best first.
 * @param k - RRF damping constant; larger values flatten the rank differences.
 * @returns Fused results sorted by descending fused score.
 */
function rrfMerge(
  porterResults: SearchResult[],
  trigramResults: SearchResult[],
  k = 60,
): SearchResult[] {
  const fused = new Map<string, { result: SearchResult; score: number }>();

  // Fold one ranked layer into the shared score table.
  const accumulate = (layer: SearchResult[]): void => {
    layer.forEach((hit, position) => {
      const key = `${hit.title}\u0000${hit.source}`;
      const contribution = 1 / (k + position + 1);
      const entry = fused.get(key);
      if (entry) {
        entry.score += contribution;
      } else {
        fused.set(key, { result: { ...hit, matchLayer: "rrf" }, score: contribution });
      }
    });
  };

  accumulate(porterResults);
  accumulate(trigramResults);

  return Array.from(fused.values())
    .sort((x, y) => y.score - x.score)
    .map((entry) => ({ ...entry.result, rank: entry.score }));
}
|
|
134
|
-
|
|
135
|
-
// ── Proximity Reranking (from context-mode) ──────────────────
|
|
136
|
-
|
|
137
|
-
/**
 * Find every character offset at which `term` occurs in `text`.
 *
 * Matching is exact (case-sensitive) and overlapping occurrences are all
 * reported, e.g. "aa" occurs in "aaa" at offsets 0 and 1.
 *
 * An empty `term` yields no positions. Without that guard the scan would
 * never terminate, because `indexOf("", from)` keeps returning an offset
 * (clamped to `text.length`) instead of -1.
 *
 * @param text - Text to scan.
 * @param term - Substring to look for.
 * @returns Offsets in ascending order; empty when there is no match.
 */
function findAllPositions(text: string, term: string): number[] {
  const positions: number[] = [];
  if (term.length === 0) return positions;

  for (let at = text.indexOf(term); at !== -1; at = text.indexOf(term, at + 1)) {
    positions.push(at);
  }
  return positions;
}
|
|
147
|
-
|
|
148
|
-
/**
 * Smallest character span that contains one occurrence of every term.
 *
 * Each inner array holds the offsets of one term. The function keeps one
 * cursor per term, measures the distance between the smallest and largest
 * offsets currently pointed at, then advances the cursor that points at the
 * smallest offset. It stops as soon as that cursor runs off its list.
 *
 * If any term has no occurrence at all, no span can cover every term, so the
 * result is `Infinity` (rather than a span computed from the remaining terms).
 *
 * @param positionLists - One list of offsets per term; lists need not be sorted.
 * @returns The minimal span in characters, 0 for a single term, or `Infinity`
 *          when there are no terms or some term never occurs.
 */
function findMinSpan(positionLists: number[][]): number {
  if (positionLists.length === 0) return Infinity;
  if (positionLists.some((list) => list.length === 0)) return Infinity;
  if (positionLists.length === 1) return 0;

  // Copy before sorting so the caller's arrays are left untouched.
  const sorted = positionLists.map((list) => [...list].sort((x, y) => x - y));
  const cursors = new Array<number>(sorted.length).fill(0);
  let best = Infinity;

  for (;;) {
    let lowest = Infinity;
    let highest = -Infinity;
    let lowestList = 0;

    for (let i = 0; i < sorted.length; i++) {
      const position = sorted[i][cursors[i]];
      if (position < lowest) {
        lowest = position;
        lowestList = i;
      }
      if (position > highest) highest = position;
    }

    best = Math.min(best, highest - lowest);

    // Only moving the smallest offset forward can shrink the window.
    cursors[lowestList]++;
    if (cursors[lowestList] >= sorted[lowestList].length) return best;
  }
}
|
|
177
|
-
|
|
178
|
-
/**
 * Count how often consecutive query terms appear right after one another.
 *
 * For each neighbouring pair of terms (term i, term i+1), every occurrence of
 * term i is matched with at most one occurrence of term i+1 that starts at or
 * after the end of term i and no more than `gap` characters beyond it. An
 * occurrence of term i+1 is used for at most one match.
 *
 * The two-cursor scan assumes each position list is in ascending order.
 *
 * @param positionLists - Offsets of each term, index-aligned with `terms`.
 * @param terms - The query terms, in query order.
 * @param gap - Maximum characters allowed between the end of one term and the
 *              start of the next.
 * @returns Total number of matched pairs across all neighbouring terms.
 */
function countAdjacentPairs(
  positionLists: number[][],
  terms: string[],
  gap: number = 30,
): number {
  if (positionLists.length < 2 || terms.length < 2) return 0;

  const usable = Math.min(positionLists.length, terms.length);
  let matches = 0;

  for (let t = 0; t + 1 < usable; t++) {
    const starts = positionLists[t];
    const followers = positionLists[t + 1];
    const termLength = terms[t].length;
    let cursor = 0;

    for (let s = 0; s < starts.length; s++) {
      const earliest = starts[s] + termLength;
      const latest = earliest + gap;

      // Skip follower occurrences that begin before this occurrence ends.
      while (cursor < followers.length && followers[cursor] < earliest) {
        cursor += 1;
      }
      if (cursor >= followers.length) continue;
      if (followers[cursor] > latest) continue;

      matches += 1;
      cursor += 1;
    }
  }

  return matches;
}
|
|
204
|
-
|
|
205
|
-
/**
 * Re-order fused (RRF) search results by how well the query terms cluster
 * together in each result.
 *
 * The query is lower-cased and split on whitespace; terms shorter than two
 * characters are dropped, and stopwords are dropped unless that would leave
 * nothing. Queries that end up with fewer than two terms are returned
 * unchanged.
 *
 * Each result gets a boost made of three parts:
 *  - title boost: fraction of terms found in the title, weighted 0.6 for
 *    `"code"` results and 0.3 otherwise;
 *  - proximity boost: `1 / (1 + minSpan / contentLength)`, only when every
 *    term occurs in the content;
 *  - phrase boost: up to 0.5, growing with the number of adjacent term pairs
 *    (saturating at four pairs), under the same condition.
 *
 * Results are sorted by descending boost. Ties are broken by descending
 * `rank`: the inputs come from RRF fusion, where `rank` is the fused score and
 * higher is better, so equal-boost results keep their relevance order instead
 * of being reversed.
 *
 * @param results - Fused results whose `rank` is a higher-is-better score.
 * @param query - Raw user query.
 * @returns A new array with the same results in reranked order (or `results`
 *          itself when the query has fewer than two usable terms).
 */
function applyProximityReranking(
  results: SearchResult[],
  query: string,
): SearchResult[] {
  const allTerms = query
    .toLowerCase()
    .split(/\s+/)
    .filter((w) => w.length >= 2);
  const filtered = allTerms.filter((w) => !STOPWORDS.has(w));
  const terms = filtered.length > 0 ? filtered : allTerms;

  if (terms.length < 2) return results; // Single-term queries skip proximity

  const scored = results.map((r) => {
    const titleLower = r.title.toLowerCase();
    const titleHits = terms.filter((t) => titleLower.includes(t)).length;
    const titleWeight = r.contentType === "code" ? 0.6 : 0.3;
    const titleBoost = titleHits > 0 ? titleWeight * (titleHits / terms.length) : 0;

    let proximityBoost = 0;
    let phraseBoost = 0;

    const content = r.content.toLowerCase();
    const positions = terms.map((t) => findAllPositions(content, t));

    // Proximity only makes sense when every term is present in the content.
    if (positions.every((p) => p.length > 0)) {
      const minSpan = findMinSpan(positions);
      proximityBoost = 1 / (1 + minSpan / Math.max(content.length, 1));

      const adjacentPairs = countAdjacentPairs(positions, terms);
      phraseBoost = 0.5 * Math.min(1, adjacentPairs / 4);
    }

    return { result: r, boost: titleBoost + proximityBoost + phraseBoost };
  });

  return scored
    .sort((a, b) => b.boost - a.boost || b.result.rank - a.result.rank)
    .map((s) => s.result);
}
|
|
246
|
-
|
|
247
|
-
/**
 * Lower-case words that carry too little meaning to be useful as search
 * terms. They are removed from queries unless the query consists of nothing
 * else. The list mixes common English function words with words that are
 * noise in commit/changelog-style text.
 */
const STOPWORDS = new Set([
  // Common English words, three letters
  "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
  "her", "was", "one", "our", "out", "has", "his", "how", "its", "may",
  "new", "now", "old", "see", "way", "who", "did", "get", "got", "let",
  "say", "she", "too", "use",
  // Common English words, four letters and longer
  "will", "with", "this", "that", "from",
  "they", "been", "have", "many", "some", "them", "than", "each", "make",
  "like", "just", "over", "such", "take", "into", "year", "your", "good",
  "could", "would", "about", "which", "their", "there", "other", "after",
  "should", "through", "also", "more", "most", "only", "very", "when",
  "what", "then", "these", "those", "being", "does", "done", "both",
  "same", "still", "while", "where", "here", "were", "much",
  // Development / changelog noise
  "update", "updates", "updated", "deps", "dev", "tests", "test",
  "add", "added", "fix", "fixed", "run", "running", "using",
]);
|
|
261
|
-
|
|
262
|
-
/**
 * Remove case-insensitive duplicates from a token list.
 *
 * The first spelling of each token is kept, and the relative order of the
 * surviving tokens is preserved.
 *
 * @param tokens - Tokens in their original order.
 * @returns A new array without repeated tokens.
 */
function dedupeTokens(tokens: string[]): string[] {
  const seenKeys = new Set<string>();
  return tokens.filter((token) => {
    const folded = token.toLowerCase();
    if (seenKeys.has(folded)) return false;
    seenKeys.add(folded);
    return true;
  });
}
|
|
274
|
-
|
|
275
|
-
/**
 * Turn free-form user input into a safe FTS5 MATCH expression.
 *
 * Steps:
 *  1. Replace FTS5 syntax characters (quotes, brackets, `*`, `:`, `^`, `~`)
 *     with spaces and split on whitespace.
 *  2. Drop the operator keywords AND / OR / NOT / NEAR (any casing).
 *  3. Remove case-insensitive duplicates, keeping first occurrences.
 *  4. Remove stopwords, unless that would leave no terms at all.
 *  5. Wrap each remaining term in double quotes and join them: with a space
 *     in `"AND"` mode, or with ` OR ` in `"OR"` mode.
 *
 * @param query - Raw user query.
 * @param mode - How the terms are combined.
 * @returns The MATCH expression, or `""` (an empty quoted phrase) when no
 *          usable term remains.
 */
export function sanitizeQuery(query: string, mode: "AND" | "OR" = "AND"): string {
  const reservedOperators = ["AND", "OR", "NOT", "NEAR"];

  const rawTokens = query.replace(/['"(){}[\]*:^~]/g, " ").split(/\s+/);
  const candidates = rawTokens.filter(
    (token) => token.length > 0 && !reservedOperators.includes(token.toUpperCase()),
  );
  const unique = dedupeTokens(candidates);
  if (unique.length === 0) {
    return '""';
  }

  const withoutStopwords = unique.filter((token) => !STOPWORDS.has(token.toLowerCase()));
  const chosen = withoutStopwords.length > 0 ? withoutStopwords : unique;

  const separator = mode === "OR" ? " OR " : " ";
  return chosen.map((token) => `"${token}"`).join(separator);
}
|
|
286
|
-
|
|
287
|
-
/**
 * SQLite-backed full-text knowledge base.
 *
 * Content is split into chunks and stored in an FTS5 table (`content_fts`,
 * porter + unicode61 tokenizer); per-source bookkeeping lives in
 * `content_sources`. Searching combines the FTS5 match with an in-process
 * trigram similarity pass, fuses both with Reciprocal Rank Fusion and can
 * optionally rerank by term proximity and retry with fuzzy-corrected terms.
 *
 * The store opens its database lazily: the async methods call `init()` on
 * first use.
 */
export class ContentStore {
  private db: any;
  private stmts: Map<string, PreparedStatement> = new Map();
  private dbPath: string;
  private ready = false;
  private writeCount = 0;
  /** In-flight initialisation, shared by concurrent first callers. */
  private initializing: Promise<void> | undefined;

  /**
   * @param opts - `dbPath` overrides the database location; the default is
   *               the "content" database under the compactor data directory.
   */
  constructor(opts?: { dbPath?: string }) {
    this.dbPath = opts?.dbPath ?? defaultDBPath("content");
  }

  /**
   * Open the database, apply pragmas, create the schema and prepare all
   * statements. Safe to call repeatedly and concurrently: once the store is
   * ready the call is a no-op, and overlapping calls share one initialisation
   * so only a single database handle is opened.
   *
   * @throws Error when no SQLite driver could be loaded, or when opening the
   *         database / creating the schema fails.
   */
  async init(): Promise<void> {
    if (this.ready) return;
    if (!this.initializing) {
      this.initializing = this.open().finally(() => {
        this.initializing = undefined;
      });
    }
    return this.initializing;
  }

  /** Perform the actual open/schema/prepare sequence behind `init()`. */
  private async open(): Promise<void> {
    const { lib } = await loadSQLite();
    // Handle different SQLite API shapes:
    // - bun:sqlite exports Database as a named export
    // - better-sqlite3 (CJS) exports the constructor as default when imported via ESM
    const Database = lib.Database ?? lib.default?.Database ?? lib.default ?? lib;
    if (typeof Database !== "function") {
      // loadSQLite() returns an empty module when no driver is installed.
      throw new Error("ContentStore: no SQLite driver available (bun:sqlite or better-sqlite3 is required)");
    }
    this.db = new Database(this.dbPath);
    applyWALPragmas(this.db);
    this.initSchema();
    this.prepareStatements();
    this.ready = true;
  }

  /** Create the FTS table, the source bookkeeping table and its index if missing. */
  private initSchema(): void {
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS content_fts USING fts5(
        title, content, content_type, label, source,
        tokenize='porter unicode61'
      );

      CREATE TABLE IF NOT EXISTS content_sources (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        label TEXT NOT NULL UNIQUE,
        source TEXT NOT NULL,
        content_type TEXT NOT NULL DEFAULT 'plain',
        mtime INTEGER,
        sha256 TEXT,
        chunk_count INTEGER NOT NULL DEFAULT 0,
        indexed_at TEXT NOT NULL DEFAULT (datetime('now'))
      );

      CREATE INDEX IF NOT EXISTS idx_sources_label ON content_sources(label);
    `);
  }

  /** Compile every SQL statement once and cache it under a short key. */
  private prepareStatements(): void {
    const p = (key: string, sql: string) => {
      this.stmts.set(key, this.db.prepare(sql) as PreparedStatement);
    };

    p("insertFTS", `INSERT INTO content_fts (title, content, content_type, label, source) VALUES (?, ?, ?, ?, ?)`);
    p("searchFTS", `SELECT title, content, content_type, label, source, rank FROM content_fts WHERE content_fts MATCH ? ORDER BY rank LIMIT ?`);
    p("searchFTSAll", `SELECT title, content, content_type, label, source, rank FROM content_fts ORDER BY rank LIMIT ?`);
    p("deleteByLabel", `DELETE FROM content_fts WHERE label = ?`);
    p("insertSource", `INSERT INTO content_sources (label, source, content_type, mtime, sha256, chunk_count) VALUES (?, ?, ?, ?, ?, ?) ON CONFLICT(label) DO UPDATE SET source=excluded.source, content_type=excluded.content_type, mtime=excluded.mtime, sha256=excluded.sha256, chunk_count=excluded.chunk_count, indexed_at=datetime('now')`);
    p("getSource", `SELECT label, source, content_type, mtime, sha256, chunk_count, indexed_at FROM content_sources WHERE label = ?`);
    p("getSourceMeta", `SELECT label, chunk_count, indexed_at FROM content_sources WHERE label = ?`);
    p("deleteSource", `DELETE FROM content_sources WHERE label = ?`);
    p("countSources", `SELECT COUNT(*) AS cnt FROM content_sources`);
    p("countFTS", `SELECT COUNT(*) AS cnt FROM content_fts`);
  }

  /**
   * Look up a prepared statement by key.
   *
   * @throws Error when the key is unknown or the store is not initialised,
   *         instead of failing later with an opaque "undefined" error.
   */
  private stmt(key: string): PreparedStatement {
    const statement = this.stmts.get(key);
    if (!statement) {
      throw new Error(`ContentStore: statement "${key}" is not prepared (store not initialised?)`);
    }
    return statement;
  }

  /**
   * Index (or re-index) a piece of text under `label`.
   *
   * When `source` is a path that exists on disk and its modification time
   * equals the one recorded for this label, nothing is written and the
   * recorded chunk count is returned with `sourceId: -1`.
   *
   * Otherwise the old chunks for the label are deleted and the new chunks
   * plus the source record are written inside ONE transaction, so a failure
   * part-way through cannot leave the label without any chunks.
   *
   * @param label - Unique identifier of the source; re-indexing replaces it.
   * @param text - Full text to chunk and index.
   * @param opts - `contentType` (default "plain"), `source` (default: the
   *               label) and `chunkSize` (default 4096, passed to the chunker).
   * @returns Chunk statistics for the label.
   */
  async index(label: string, text: string, opts?: { contentType?: "markdown" | "json" | "plain"; source?: string; chunkSize?: number }): Promise<IndexResult> {
    if (!this.ready) await this.init();

    const contentType = opts?.contentType ?? "plain";
    const source = opts?.source ?? label;
    const chunkSize = opts?.chunkSize ?? 4096;

    // Check staleness for file-backed sources
    let mtime: number | undefined;
    let sha256: string | undefined;
    if (existsSync(source)) {
      mtime = statSync(source).mtimeMs;
      const existing = this.stmt("getSource").get(label) as { mtime?: number | null; chunk_count?: number } | undefined;
      if (existing && existing.mtime === mtime) {
        // Unchanged file: report the chunk count recorded at index time.
        return { sourceId: -1, label, totalChunks: existing.chunk_count ?? 0, codeChunks: 0 };
      }
      sha256 = createHash("sha256").update(text).digest("hex");
    }

    const chunks = autoChunk(text, contentType, chunkSize);
    let codeChunks = 0;

    const transaction = this.db.transaction(() => {
      // Reset per attempt: withRetry may run this body more than once.
      codeChunks = 0;
      this.stmt("deleteByLabel").run(label);
      for (const chunk of chunks) {
        this.stmt("insertFTS").run(chunk.title, chunk.content, contentType, label, source);
        if (chunk.hasCode) codeChunks++;
      }
      this.stmt("insertSource").run(label, source, contentType, mtime ?? null, sha256 ?? null, chunks.length);
    });

    withRetry(() => transaction());
    this.afterWrite();

    // NOTE(review): sourceId is a fixed placeholder, not the row id in content_sources.
    return { sourceId: 1, label, totalChunks: chunks.length, codeChunks };
  }

  /**
   * Search the indexed content.
   *
   * Modes:
   *  - "porter": FTS5 match only.
   *  - "trigram": in-process trigram similarity over a sample of rows.
   *  - "rrf" (default): both of the above fused with Reciprocal Rank Fusion,
   *    then proximity-reranked when `pipeline.proximityReranking` is enabled.
   *  - "fuzzy": like "rrf", but when fewer than three fused results were found
   *    and some query word has a close vocabulary match, the query is re-run
   *    with corrected words and merged in.
   *
   * @param query - Raw user query.
   * @param opts - `limit` (default 10), `offset` (default 0, number of leading
   *               results to skip) and `mode`.
   * @returns At most `limit` results, best first.
   */
  async search(query: string, opts?: { limit?: number; offset?: number; mode?: "porter" | "trigram" | "rrf" | "fuzzy" }): Promise<SearchResult[]> {
    if (!this.ready) await this.init();

    const limit = opts?.limit ?? 10;
    const offset = Math.max(0, opts?.offset ?? 0);
    const mode = opts?.mode ?? "rrf";
    const sanitized = sanitizeQuery(query);

    // Fetch enough candidates to fill the requested page after skipping `offset`.
    const window = offset + limit;
    const page = (list: SearchResult[]): SearchResult[] => list.slice(offset, offset + limit);
    const layerOf = (contentType: string) =>
      contentType === "markdown" || contentType === "json" ? ("prose" as const) : ("code" as const);

    // Porter stemmer search (FTS5 default)
    const porterRows = this.stmt("searchFTS").all(sanitized, window * 2) as Array<{
      title: string;
      content: string;
      content_type: string;
      label: string;
      source: string;
      rank: number;
    }>;

    const porterResults: SearchResult[] = porterRows.map((r) => ({
      title: r.title,
      content: r.content,
      source: r.source,
      rank: r.rank,
      contentType: layerOf(r.content_type),
      matchLayer: "porter" as const,
    }));

    if (mode === "porter") return page(porterResults);

    // Trigram search
    // NOTE(review): without a MATCH clause the FTS rank column presumably carries no
    // ordering, so this is a sample of up to `window * 3` rows rather than the best rows — confirm.
    const allRows = this.stmt("searchFTSAll").all(window * 3) as typeof porterRows;
    const trigramResults = trigramSearch(allRows, query, window * 2);

    if (mode === "trigram") return page(trigramResults);

    // RRF fusion
    const rrfResults = rrfMerge(porterResults, trigramResults);

    // Apply proximity reranking to all RRF results (if enabled)
    const config = loadConfig();
    const rerank = (list: SearchResult[]): SearchResult[] =>
      config.pipeline.proximityReranking ? applyProximityReranking(list, query) : list;
    const rerankedResults = rerank(rrfResults);

    if (mode === "rrf") return page(rerankedResults);

    // Fuzzy mode: apply fuzzy correction to query terms
    const vocab = buildVocabulary(allRows);
    const queryWords = query.toLowerCase().split(/\s+/).filter((w) => w.length > 2);
    const correctedWords = queryWords.map((w) => fuzzySuggest(w, vocab) ?? w);
    const hasCorrection = correctedWords.some((w, i) => w !== queryWords[i]);

    if (hasCorrection && rrfResults.length < 3) {
      // Re-search with corrected terms
      const correctedSanitized = sanitizeQuery(correctedWords.join(" "));
      const correctedRows = this.stmt("searchFTS").all(correctedSanitized, window * 2) as typeof porterRows;
      const correctedResults: SearchResult[] = correctedRows.map((r) => ({
        ...r,
        contentType: layerOf(r.content_type),
        matchLayer: "fuzzy" as const,
        rank: r.rank * 0.9, // slightly lower confidence
      }));
      const merged = rrfMerge(rerankedResults, correctedResults);
      // Honour the same config switch as the non-fuzzy path.
      return page(rerank(merged));
    }

    return page(rerankedResults);
  }

  /**
   * Count indexed sources and chunks.
   *
   * @returns `sources` and `chunks` row counts; `codeChunks` is always 0
   *          because it is not tracked in the database.
   */
  async getStats(): Promise<StoreStats> {
    if (!this.ready) await this.init();
    const sourcesRow = this.stmt("countSources").get() as { cnt: number };
    const chunksRow = this.stmt("countFTS").get() as { cnt: number };
    return {
      sources: sourcesRow.cnt,
      chunks: chunksRow.cnt,
      codeChunks: 0,
    };
  }

  /**
   * Get source metadata for TTL cache check.
   *
   * This method is synchronous and therefore cannot initialise the store; it
   * returns `null` (same as "unknown label") when the store is not ready.
   *
   * @param label - Source label to look up.
   * @returns Label, chunk count and indexing timestamp, or `null` when unknown.
   */
  getSourceMeta(label: string): { label: string; chunkCount: number; indexedAt: string } | null {
    if (!this.ready) return null;
    const row = this.stmt("getSourceMeta").get(label) as { label: string; chunk_count: number; indexed_at: string } | undefined;
    if (!row) return null;
    return { label: row.label, chunkCount: row.chunk_count, indexedAt: row.indexed_at };
  }

  /**
   * Delete all indexed content and all source records.
   *
   * @returns The number of sources that were removed (counted before the
   *          delete; counting afterwards would always yield 0).
   */
  async purge(): Promise<number> {
    if (!this.ready) await this.init();
    const before = this.stmt("countSources").get() as { cnt: number };
    this.db.exec(`DELETE FROM content_fts; DELETE FROM content_sources;`);
    this.afterWrite();
    return before.cnt;
  }

  /** Run WAL checkpoint to prevent unbounded WAL file growth. Errors are ignored. */
  checkpointWAL(mode: "PASSIVE" | "TRUNCATE" = "PASSIVE"): void {
    if (!this.db) return;
    try {
      this.db.exec(`PRAGMA wal_checkpoint(${mode});`);
    } catch { /* ignore */ }
  }

  /** Increment write counter and trigger PASSIVE checkpoint every 10th write. */
  private afterWrite(): void {
    this.writeCount++;
    if (this.writeCount % 10 === 0) {
      this.checkpointWAL("PASSIVE");
    }
  }

  /**
   * Close the database handle (errors are ignored) and reset the store so a
   * later call to an async method re-opens it instead of using a dead handle.
   */
  close(): void {
    if (this.db) {
      try { this.db.close(); } catch { /* ignore */ }
    }
    this.db = undefined;
    this.stmts.clear();
    this.ready = false;
  }
}
|