@dex-ai/context 0.7.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +204 -0
- package/package.json +36 -0
- package/src/event-log.ts +246 -0
- package/src/extension.ts +1271 -0
- package/src/formatter.ts +127 -0
- package/src/index.ts +45 -0
- package/src/pressure.ts +61 -0
- package/src/search-tool.ts +230 -0
- package/src/snapshot.ts +240 -0
- package/src/store.ts +678 -0
- package/src/summarize.ts +206 -0
- package/src/tokenizer.ts +20 -0
- package/src/tracker.ts +159 -0
- package/src/types.ts +100 -0
package/src/store.ts
ADDED
|
@@ -0,0 +1,678 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ContentStore — In-process FTS5 knowledge base for context preservation.
|
|
3
|
+
*
|
|
4
|
+
* Inspired by context-mode's approach: instead of truncating/forgetting
|
|
5
|
+
* large tool results, we index them into a searchable FTS5 database.
|
|
6
|
+
* The original content is replaced with a compact pointer in context,
|
|
7
|
+
* and the agent can retrieve specific chunks on demand via search.
|
|
8
|
+
*
|
|
9
|
+
* Architecture:
|
|
10
|
+
* - Uses bun:sqlite for FTS5 full-text search
|
|
11
|
+
* - Chunks content by structure (markdown headings, paragraphs, JSON keys)
|
|
12
|
+
* - BM25-ranked search with fuzzy fallback
|
|
13
|
+
* - In-memory per-session (no persistence across sessions needed)
|
|
14
|
+
*
|
|
15
|
+
* Key principle: ZERO INFORMATION LOSS. Content leaves the context window
|
|
16
|
+
* but never leaves the process — it's always retrievable via search.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { Database } from "bun:sqlite";
|
|
20
|
+
|
|
21
|
+
/* ── Types ─────────────────────────────────────────────── */
|
|
22
|
+
|
|
23
|
+
/** One indexed unit of content produced by the chunkers in this file. */
export interface Chunk {
  /** Human-readable locator: heading breadcrumb, JSON key path, or first line of the chunk. */
  title: string;
  /** The chunk text as it is stored in the `chunks` table. */
  content: string;
  /** Set by the chunker when the chunk looks like code (e.g. fenced block or serialized JSON). */
  hasCode: boolean;
}
|
|
28
|
+
|
|
29
|
+
/** Metadata returned by `ContentStore.index()` describing what was stored. */
export interface IndexResult {
  /** Row id of the newly inserted `sources` row. */
  sourceId: number;
  /** The `source` label that was passed to `index()`. */
  label: string;
  /** Number of chunks inserted for this source. */
  totalChunks: number;
  /** Number of those chunks whose `hasCode` flag is set. */
  codeChunks: number;
}
|
|
35
|
+
|
|
36
|
+
/** A single search hit returned by `ContentStore.search()`. */
export interface SearchResult {
  /** Title of the matching chunk. */
  title: string;
  /** Full content of the matching chunk. */
  content: string;
  /** Label of the source the chunk belongs to. */
  source: string;
  /** FTS5 `rank` value for full-text hits; the substring fallback always reports 0. */
  rank: number;
  /** Optional highlighted excerpt. NOTE(review): not populated by the queries in this file. */
  highlighted?: string;
}
|
|
43
|
+
|
|
44
|
+
/** Summary of one indexed source, as returned by `ContentStore.listSources()`. */
export interface SourceInfo {
  /** Row id in the `sources` table. */
  id: number;
  /** Label the source was indexed under. */
  label: string;
  /** Number of chunks currently stored for this source. */
  chunkCount: number;
  /** Value of the `indexed_at` column (defaults to SQLite `datetime('now')`). */
  indexedAt: string;
}
|
|
50
|
+
|
|
51
|
+
/** Aggregate counters returned by `ContentStore.getStats()`. */
export interface StoreStats {
  /** Number of rows in `sources`. */
  sources: number;
  /** Number of rows in `chunks`. */
  chunks: number;
  /** Sum of `sources.total_bytes`, i.e. the byte length of each source's content when it was indexed. */
  totalBytes: number;
}
|
|
56
|
+
|
|
57
|
+
/* ── Constants ─────────────────────────────────────────── */
|
|
58
|
+
|
|
59
|
+
/**
 * Maximum chunk size in bytes. Oversized sections are split at paragraph boundaries.
 * A single paragraph (or JSON array element) that is larger than this on its own
 * is still emitted whole rather than being cut mid-way.
 */
const MAX_CHUNK_BYTES = 4096;

/** Lines per chunk for plain-text (non-markdown) content when paragraph splitting is not used. */
const LINES_PER_CHUNK = 50;

/**
 * Stop words filtered from search queries for better BM25 ranking.
 * Besides common English function words, the list also contains generic
 * development vocabulary ("update", "deps", "test", "fix", ...).
 * Compared against lower-cased query words by `sanitizeQuery`.
 */
const STOPWORDS = new Set([
  "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
  "her", "was", "one", "our", "out", "has", "his", "how", "its", "may",
  "new", "now", "old", "see", "way", "who", "did", "get", "got", "let",
  "say", "she", "too", "use", "will", "with", "this", "that", "from",
  "they", "been", "have", "many", "some", "them", "than", "each", "make",
  "like", "just", "over", "such", "take", "into", "year", "your", "good",
  "could", "would", "about", "which", "their", "there", "other", "after",
  "should", "through", "also", "more", "most", "only", "very", "when",
  "what", "then", "these", "those", "being", "does", "done", "both",
  "same", "still", "while", "where", "here", "were", "much",
  "update", "updates", "updated", "deps", "dev", "tests", "test",
  "add", "added", "fix", "fixed", "run", "running", "using",
]);
|
|
80
|
+
|
|
81
|
+
/* ── ContentStore ──────────────────────────────────────── */
|
|
82
|
+
|
|
83
|
+
/**
 * In-memory, per-session full-text index for content that has been moved out
 * of the context window.
 *
 * Content is split into chunks (markdown sections, JSON sub-trees, or line
 * groups), stored in a `chunks` table and mirrored into an FTS5 virtual table
 * by triggers. `search()` queries the FTS5 index first and falls back to a
 * substring scan.
 */
export class ContentStore {
  /** Private SQLite handle; closed by `dispose()`. */
  #db: Database;
  /** Chunks inserted since the last FTS5 `optimize` pass. */
  #insertCount = 0;

  constructor() {
    // In-memory database — lives for the session only
    this.#db = new Database(":memory:");
    // NOTE(review): SQLite documents that in-memory databases only support the
    // MEMORY/OFF journal modes, so the WAL request is presumably ignored here.
    this.#db.exec("PRAGMA journal_mode = WAL");
    this.#db.exec("PRAGMA synchronous = OFF");
    this.#initSchema();
  }

  /**
   * Create the `sources` and `chunks` tables, the external-content FTS5 index
   * over `chunks`, and the insert/delete triggers that keep the index in sync.
   */
  #initSchema(): void {
    this.#db.exec(`
      CREATE TABLE IF NOT EXISTS sources (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        label TEXT NOT NULL,
        content_type TEXT DEFAULT 'prose',
        indexed_at TEXT DEFAULT (datetime('now')),
        total_bytes INTEGER DEFAULT 0
      );

      CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_id INTEGER NOT NULL REFERENCES sources(id),
        title TEXT NOT NULL,
        content TEXT NOT NULL,
        has_code INTEGER DEFAULT 0,
        FOREIGN KEY (source_id) REFERENCES sources(id)
      );

      CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
        title,
        content,
        content='chunks',
        content_rowid='id',
        tokenize='porter unicode61'
      );

      CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
        INSERT INTO chunks_fts(rowid, title, content) VALUES (new.id, new.title, new.content);
      END;

      CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
        INSERT INTO chunks_fts(chunks_fts, rowid, title, content) VALUES('delete', old.id, old.title, old.content);
      END;
    `);
  }

  /* ── Indexing ──────────────────────────────────────── */

  /**
   * Index content into the store. Content is chunked by structure and
   * stored in FTS5 for BM25 search. A source previously indexed under the
   * same label is replaced.
   *
   * Chunking happens before any database write, and the delete of the old
   * source plus the insert of the new source and its chunks run in a single
   * transaction, so a failure cannot leave the label without content or with
   * a chunk-less source row.
   *
   * @param opts.content     Raw text to index.
   * @param opts.source      Label identifying the source; also the re-index key.
   * @param opts.contentType Optional override; detected from the content when omitted.
   * @returns Metadata about what was indexed — this becomes the "pointer"
   *          that replaces the raw content in the context window.
   */
  index(opts: {
    content: string;
    source: string;
    contentType?: "code" | "prose" | "json";
  }): IndexResult {
    const { content, source, contentType } = opts;
    const detectedType = contentType ?? detectContentType(content);

    // Chunk first: nothing is written if chunking throws.
    let chunks: Chunk[];
    switch (detectedType) {
      case "code":
      case "prose":
        chunks = this.#chunkMarkdown(content);
        break;
      case "json":
        chunks = this.#chunkJSON(content);
        break;
      default:
        chunks = this.#chunkPlainText(content);
    }

    // Fallback: if no chunks produced, treat as single chunk
    if (chunks.length === 0) {
      chunks = [{ title: source, content: content.slice(0, MAX_CHUNK_BYTES * 4), hasCode: false }];
    }

    const insertSource = this.#db.prepare(
      "INSERT INTO sources (label, content_type, total_bytes) VALUES (?, ?, ?)",
    );
    const insertChunk = this.#db.prepare(
      "INSERT INTO chunks (source_id, title, content, has_code) VALUES (?, ?, ?, ?)",
    );

    // Replace-by-label as one atomic unit.
    let sourceId = 0;
    const replaceSource = this.#db.transaction((chs: Chunk[]) => {
      this.#deleteByLabel(source);
      const result = insertSource.run(source, detectedType, Buffer.byteLength(content));
      sourceId = Number(result.lastInsertRowid);
      for (const chunk of chs) {
        insertChunk.run(sourceId, chunk.title, chunk.content, chunk.hasCode ? 1 : 0);
      }
    });
    replaceSource(chunks);

    // Periodically ask FTS5 to merge its index segments.
    this.#insertCount += chunks.length;
    if (this.#insertCount > 50) {
      this.#optimize();
      this.#insertCount = 0;
    }

    const codeChunks = chunks.filter((c) => c.hasCode).length;
    return { sourceId, label: source, totalChunks: chunks.length, codeChunks };
  }

  /* ── Search ───────────────────────────────────────── */

  /**
   * Search indexed content using BM25 ranking.
   *
   * Strategy, stopping at the first step that returns rows:
   *   1. FTS5 with all terms required,
   *   2. FTS5 with terms OR-ed,
   *   3. literal substring match on chunk title/content.
   *
   * @param query  Free-text query.
   * @param limit  Maximum number of results (default 5).
   * @param source Optional substring that the source label must contain.
   */
  search(query: string, limit = 5, source?: string): SearchResult[] {
    // Try FTS5 first
    const ftsQuery = sanitizeQuery(query);
    let results = this.#searchFTS(ftsQuery, limit, source);

    // Fallback: try OR mode if AND gave nothing
    if (results.length === 0) {
      const orQuery = sanitizeQuery(query, "OR");
      results = this.#searchFTS(orQuery, limit, source);
    }

    // Final fallback: LIKE-based substring search
    if (results.length === 0) {
      results = this.#searchSubstring(query, limit, source);
    }

    return results;
  }

  /**
   * Build a `%…%` LIKE pattern that matches `text` literally.
   * Escapes the wildcards `%` and `_` and the escape character `\` itself;
   * must be used together with `ESCAPE '\'` in the SQL.
   */
  #likeContains(text: string): string {
    return `%${text.replace(/[\\%_]/g, "\\$&")}%`;
  }

  /**
   * Run one FTS5 MATCH query, best rank first.
   * Any error (typically an FTS5 syntax error) yields an empty result so the
   * caller can move on to its next fallback.
   */
  #searchFTS(ftsQuery: string, limit: number, source?: string): SearchResult[] {
    try {
      // The label filter is a literal "contains" match, hence the ESCAPE clause.
      const labelFilter = source ? "AND s.label LIKE ? ESCAPE '\\'" : "";
      const sql = `SELECT c.title, c.content, s.label as source, rank
        FROM chunks_fts f
        JOIN chunks c ON c.id = f.rowid
        JOIN sources s ON s.id = c.source_id
        WHERE chunks_fts MATCH ?
        ${labelFilter}
        ORDER BY rank
        LIMIT ?`;

      const params = source
        ? [ftsQuery, this.#likeContains(source), limit]
        : [ftsQuery, limit];

      const rows = this.#db.prepare(sql).all(...params) as SearchResult[];
      return rows;
    } catch {
      // FTS5 syntax error — fall through to substring
      return [];
    }
  }

  /**
   * Literal substring search over chunk content and title.
   * Results are unranked: `rank` is always 0.
   */
  #searchSubstring(query: string, limit: number, source?: string): SearchResult[] {
    const pattern = this.#likeContains(query);
    const labelFilter = source ? "AND s.label LIKE ? ESCAPE '\\'" : "";
    const sql = `SELECT c.title, c.content, s.label as source, 0 as rank
      FROM chunks c
      JOIN sources s ON s.id = c.source_id
      WHERE (c.content LIKE ? ESCAPE '\\' OR c.title LIKE ? ESCAPE '\\')
      ${labelFilter}
      LIMIT ?`;

    const params = source
      ? [pattern, pattern, this.#likeContains(source), limit]
      : [pattern, pattern, limit];

    return this.#db.prepare(sql).all(...params) as SearchResult[];
  }

  /* ── Metadata ─────────────────────────────────────── */

  /** List every indexed source with its chunk count, most recently indexed first. */
  listSources(): SourceInfo[] {
    return this.#db.prepare(`
      SELECT s.id, s.label, s.indexed_at as indexedAt,
             COUNT(c.id) as chunkCount
      FROM sources s
      LEFT JOIN chunks c ON c.source_id = s.id
      GROUP BY s.id
      ORDER BY s.indexed_at DESC
    `).all() as SourceInfo[];
  }

  /** Return the number of sources and chunks, and the total indexed byte count. */
  getStats(): StoreStats {
    const row = this.#db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM sources) as sources,
        (SELECT COUNT(*) FROM chunks) as chunks,
        (SELECT COALESCE(SUM(total_bytes), 0) FROM sources) as totalBytes
    `).get() as StoreStats;
    return row;
  }

  /** True when no chunk has been stored. */
  isEmpty(): boolean {
    const row = this.#db.prepare("SELECT COUNT(*) as c FROM chunks").get() as { c: number };
    return row.c === 0;
  }

  /* ── Cleanup ──────────────────────────────────────── */

  /**
   * Remove the source with exactly this label together with its chunks.
   * The `chunks_ad` trigger removes the matching FTS5 entries.
   */
  #deleteByLabel(label: string): void {
    const source = this.#db.prepare("SELECT id FROM sources WHERE label = ?").get(label) as { id: number } | undefined;
    if (!source) return;

    this.#db.prepare("DELETE FROM chunks WHERE source_id = ?").run(source.id);
    this.#db.prepare("DELETE FROM sources WHERE id = ?").run(source.id);
  }

  /** Best-effort FTS5 `optimize`; failures are deliberately ignored. */
  #optimize(): void {
    try {
      this.#db.exec("INSERT INTO chunks_fts(chunks_fts) VALUES('optimize')");
    } catch {
      // Ignore FTS5 optimize errors
    }
  }

  /** Close the database. Safe to call more than once. */
  dispose(): void {
    try {
      this.#db.close();
    } catch {
      // Already closed
    }
  }

  /* ── Chunking: Markdown ───────────────────────────── */

  /**
   * Split markdown-like text into chunks at headings (levels 1-4) and
   * horizontal rules. Fenced code blocks are consumed as a unit so that
   * heading-like lines inside them do not start a new chunk. Sections larger
   * than MAX_CHUNK_BYTES are further split at blank-line paragraph boundaries.
   */
  #chunkMarkdown(text: string): Chunk[] {
    const chunks: Chunk[] = [];
    const lines = text.split("\n");
    const headingStack: Array<{ level: number; text: string }> = [];
    let currentContent: string[] = [];
    let currentHeading = "";

    // Emit the section collected so far (if non-blank) and reset the buffer.
    const flush = () => {
      const joined = currentContent.join("\n").trim();
      if (joined.length === 0) return;

      const title = buildTitle(headingStack, currentHeading);
      const hasCode = currentContent.some((l) => /^`{3,}/.test(l));

      // Under cap: emit as single chunk
      if (Buffer.byteLength(joined) <= MAX_CHUNK_BYTES) {
        chunks.push({ title, content: joined, hasCode });
        currentContent = [];
        return;
      }

      // Split oversized chunk at paragraph boundaries
      const paragraphs = joined.split(/\n\n+/);
      let accumulator: string[] = [];
      let partIndex = 1;

      const flushAccumulator = () => {
        if (accumulator.length === 0) return;
        const part = accumulator.join("\n\n").trim();
        if (!part) return;
        // Number the parts only when the section really was split.
        const partTitle = paragraphs.length > 1 ? `${title} (${partIndex})` : title;
        partIndex++;
        chunks.push({
          title: partTitle,
          content: part,
          hasCode: part.includes("```"),
        });
        accumulator = [];
      };

      for (const para of paragraphs) {
        accumulator.push(para);
        const candidate = accumulator.join("\n\n");
        // A lone paragraph over the cap is kept whole; only multi-paragraph
        // accumulations are cut back.
        if (Buffer.byteLength(candidate) > MAX_CHUNK_BYTES && accumulator.length > 1) {
          accumulator.pop();
          flushAccumulator();
          accumulator = [para];
        }
      }
      flushAccumulator();
      currentContent = [];
    };

    let i = 0;
    while (i < lines.length) {
      const line = lines[i]!;

      // Horizontal rule
      if (/^[-_*]{3,}\s*$/.test(line)) {
        flush();
        i++;
        continue;
      }

      // Heading
      const headingMatch = line.match(/^(#{1,4})\s+(.+)$/);
      if (headingMatch) {
        // Flush before touching the stack so the previous section keeps its own title.
        flush();
        const level = headingMatch[1]!.length;
        const heading = headingMatch[2]!.trim();

        // Drop headings at the same or deeper level to keep a proper breadcrumb.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1]!.level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, text: heading });
        currentHeading = heading;
        currentContent.push(line);
        i++;
        continue;
      }

      // Code block — collect entire block
      const codeMatch = line.match(/^(`{3,})(.*)?$/);
      if (codeMatch) {
        const fence = codeMatch[1]!;
        const codeLines: string[] = [line];
        i++;
        // Consume up to and including the closing fence (or end of input).
        while (i < lines.length) {
          codeLines.push(lines[i]!);
          if (lines[i]!.startsWith(fence) && lines[i]!.trim() === fence) {
            i++;
            break;
          }
          i++;
        }
        currentContent.push(...codeLines);
        continue;
      }

      // Regular line
      currentContent.push(line);
      i++;
    }

    flush();
    return chunks;
  }

  /* ── Chunking: Plain Text ─────────────────────────── */

  /**
   * Chunk unstructured text. Uses blank-line separated sections when there
   * are between 3 and 200 of them and each is under 5000 bytes; otherwise
   * falls back to groups of LINES_PER_CHUNK lines with a 2-line overlap.
   */
  #chunkPlainText(text: string): Chunk[] {
    // Try paragraph splitting first
    const sections = text.split(/\n\s*\n/);
    if (
      sections.length >= 3 &&
      sections.length <= 200 &&
      sections.every((s) => Buffer.byteLength(s) < 5000)
    ) {
      return sections
        .map((section, i) => {
          const trimmed = section.trim();
          const firstLine = trimmed.split("\n")[0]!.slice(0, 80);
          return {
            title: firstLine || `Section ${i + 1}`,
            content: trimmed,
            hasCode: trimmed.includes("```") || /^\s*(function|class|const|let|var|import|export|def|fn)\s/.test(trimmed),
          };
        })
        .filter((s) => s.content.length > 0);
    }

    // Fall back to line-group chunks
    const lines = text.split("\n");
    if (lines.length <= LINES_PER_CHUNK) {
      return [{ title: "Output", content: text, hasCode: false }];
    }

    const chunks: Chunk[] = [];
    const overlap = 2;
    const step = Math.max(LINES_PER_CHUNK - overlap, 1);

    for (let i = 0; i < lines.length; i += step) {
      const slice = lines.slice(i, i + LINES_PER_CHUNK);
      if (slice.length === 0) break;
      const firstLine = slice[0]?.trim().slice(0, 80);
      chunks.push({
        title: firstLine || `Lines ${i + 1}-${i + slice.length}`,
        content: slice.join("\n"),
        hasCode: false,
      });
    }

    return chunks;
  }

  /* ── Chunking: JSON ───────────────────────────────── */

  /**
   * Chunk JSON text by walking the parsed value. If parsing or walking
   * throws, the text is chunked as plain text instead.
   */
  #chunkJSON(text: string): Chunk[] {
    try {
      const parsed = JSON.parse(text);
      const chunks: Chunk[] = [];
      this.#walkJSON(parsed, [], chunks);
      return chunks;
    } catch {
      // Not valid JSON — treat as plain text
      return this.#chunkPlainText(text);
    }
  }

  /**
   * Recursively emit chunks for a JSON value.
   *
   * @param value  Current JSON value.
   * @param path   Key path from the root; joined with " > " to form the title.
   * @param chunks Output array, appended to in place.
   */
  #walkJSON(value: unknown, path: string[], chunks: Chunk[]): void {
    const title = path.length > 0 ? path.join(" > ") : "(root)";
    const serialized = JSON.stringify(value, null, 2);

    // Small enough — emit as single chunk
    if (Buffer.byteLength(serialized) <= MAX_CHUNK_BYTES) {
      // Recurse into nested objects for better searchability
      const shouldRecurse =
        typeof value === "object" &&
        value !== null &&
        !Array.isArray(value) &&
        Object.values(value).some((v) => typeof v === "object" && v !== null);

      if (!shouldRecurse) {
        chunks.push({ title, content: serialized, hasCode: true });
        return;
      }
    }

    // Object — recurse
    if (typeof value === "object" && value !== null && !Array.isArray(value)) {
      const entries = Object.entries(value);
      if (entries.length > 0) {
        for (const [key, val] of entries) {
          this.#walkJSON(val, [...path, key], chunks);
        }
        return;
      }
      chunks.push({ title, content: serialized, hasCode: true });
      return;
    }

    // Array — batch by size
    if (Array.isArray(value)) {
      this.#chunkJSONArray(value, path, chunks);
      return;
    }

    // Primitive
    chunks.push({ title, content: serialized, hasCode: false });
  }

  /**
   * Split an array into consecutive batches whose pretty-printed JSON stays
   * within MAX_CHUNK_BYTES. A single element larger than the cap forms its
   * own batch.
   */
  #chunkJSONArray(arr: unknown[], path: string[], chunks: Chunk[]): void {
    const prefix = path.length > 0 ? path.join(" > ") : "(root)";

    // Find identity field for better titles
    const identityField = findIdentityField(arr);

    let batch: unknown[] = [];
    let batchStart = 0;

    // Emit the current batch; `batchEnd` is the array index of its last element.
    const flushBatch = (batchEnd: number) => {
      if (batch.length === 0) return;
      const title = jsonBatchTitle(prefix, batchStart, batchEnd, batch, identityField);
      chunks.push({
        title,
        content: JSON.stringify(batch, null, 2),
        hasCode: true,
      });
    };

    for (let i = 0; i < arr.length; i++) {
      batch.push(arr[i]);
      const candidate = JSON.stringify(batch, null, 2);

      if (Buffer.byteLength(candidate) > MAX_CHUNK_BYTES && batch.length > 1) {
        // Adding arr[i] overflowed: emit what we had and start a new batch with it.
        batch.pop();
        flushBatch(i - 1);
        batch = [arr[i]];
        batchStart = i;
      }
    }

    flushBatch(batchStart + batch.length - 1);
  }
}
|
|
577
|
+
|
|
578
|
+
/* ── Helpers ───────────────────────────────────────────── */
|
|
579
|
+
|
|
580
|
+
/**
 * Produce a chunk title from the current heading context.
 *
 * With at least one heading on the stack the title is the breadcrumb of all
 * stacked heading texts joined by " > ". With an empty stack it is
 * `currentHeading`, or "Untitled" when that is empty.
 */
function buildTitle(
  headingStack: Array<{ level: number; text: string }>,
  currentHeading: string,
): string {
  const breadcrumb = headingStack.map(({ text }) => text);
  return breadcrumb.length > 0
    ? breadcrumb.join(" > ")
    : currentHeading || "Untitled";
}
|
|
589
|
+
|
|
590
|
+
/**
 * Guess how content should be chunked.
 *
 * - "json": the text starts (after leading whitespace) with `{` or `[` and parses as JSON.
 * - "code": within the first 2000 characters there are more than 3 code
 *   indicators (code fences or lines starting with a common declaration
 *   keyword) or more than 2 markdown headings of level 1-4.
 * - "prose": everything else.
 */
function detectContentType(content: string): "code" | "prose" | "json" {
  const firstChar = content.trimStart().charAt(0);
  if (firstChar === "{" || firstChar === "[") {
    try {
      JSON.parse(content);
      return "json";
    } catch {
      // Looked like JSON but did not parse — fall through to the heuristics.
    }
  }

  // Only a prefix is inspected, so the heuristic stays cheap on large inputs.
  const sample = content.slice(0, 2000);
  const occurrences = (pattern: RegExp): number => sample.match(pattern)?.length ?? 0;

  const codeIndicators = occurrences(/```|^(import|export|function|class|const|let|var|def|fn|pub|use)\s/gm);
  const headingIndicators = occurrences(/^#{1,4}\s/gm);

  return codeIndicators > 3 || headingIndicators > 2 ? "code" : "prose";
}
|
|
605
|
+
|
|
606
|
+
/**
 * Turn a free-text query into a safe FTS5 MATCH expression.
 *
 * Characters with special meaning to FTS5 are replaced by spaces, the bare
 * operator words AND/OR/NOT/NEAR are dropped, stop words are removed (unless
 * that would leave nothing), and duplicates are removed case-insensitively.
 * Every surviving word is double-quoted.
 *
 * @param query Raw user query.
 * @param mode  "AND" joins terms with a space (all terms required);
 *              "OR" joins them with " OR ".
 * @returns The MATCH expression, or `""` (an empty quoted phrase) when no word survives.
 */
export function sanitizeQuery(query: string, mode: "AND" | "OR" = "AND"): string {
  const reserved = ["AND", "OR", "NOT", "NEAR"];

  const tokens: string[] = [];
  for (const raw of query.replace(/['"(){}[\]*:^~]/g, " ").split(/\s+/)) {
    if (raw.length === 0) continue;
    if (reserved.includes(raw.toUpperCase())) continue;
    tokens.push(raw);
  }

  if (tokens.length === 0) return '""';

  // Prefer the query without stop words, but never reduce it to nothing.
  const withoutStopwords = tokens.filter((t) => !STOPWORDS.has(t.toLowerCase()));
  const pool = withoutStopwords.length > 0 ? withoutStopwords : tokens;

  // Keep the first spelling of each word, compared case-insensitively.
  const firstByLower = new Map<string, string>();
  for (const token of pool) {
    const lowered = token.toLowerCase();
    if (!firstByLower.has(lowered)) {
      firstByLower.set(lowered, token);
    }
  }

  const joiner = mode === "OR" ? " OR " : " ";
  return Array.from(firstByLower.values(), (t) => `"${t}"`).join(joiner);
}
|
|
637
|
+
|
|
638
|
+
/**
 * Pick a field that can label the elements of a JSON array.
 *
 * Only the first element is inspected. It must be a plain (non-array,
 * non-null) object; the first of id, name, title, path, slug, key, label
 * that it holds as a string or number is returned.
 *
 * @returns The field name, or null when the array is empty or no candidate fits.
 */
function findIdentityField(arr: unknown[]): string | null {
  const head = arr.length > 0 ? arr[0] : undefined;
  const isPlainObject = typeof head === "object" && head !== null && !Array.isArray(head);
  if (!isPlainObject) return null;

  const record = head as Record<string, unknown>;
  const preferred = ["id", "name", "title", "path", "slug", "key", "label"];

  const match = preferred.find((name) => {
    if (!(name in record)) return false;
    const kind = typeof record[name];
    return kind === "string" || kind === "number";
  });

  return match ?? null;
}
|
|
652
|
+
|
|
653
|
+
/**
 * Build the title for one batch of JSON array elements.
 *
 * Without an identity field the title is the index range: `[i]` or `[i-j]`.
 * With one, it is made of the elements' identity values: the single value,
 * a comma-separated list for up to three elements, or `first…last` beyond that.
 *
 * The identity field is chosen from the first array element only, so later
 * elements may be null, primitives, or objects lacking the field. Such
 * elements are labelled by their array index (`[i]`) instead of throwing or
 * rendering as "undefined".
 *
 * @param prefix        Key path of the array; prepended as `prefix > ` when non-empty.
 * @param startIdx      Array index of the first element in the batch.
 * @param endIdx        Array index of the last element in the batch.
 * @param batch         The elements in the batch.
 * @param identityField Field used to label elements, or null for index titles.
 */
function jsonBatchTitle(
  prefix: string,
  startIdx: number,
  endIdx: number,
  batch: unknown[],
  identityField: string | null,
): string {
  const sep = prefix ? `${prefix} > ` : "";

  if (!identityField) {
    return startIdx === endIdx
      ? `${sep}[${startIdx}]`
      : `${sep}[${startIdx}-${endIdx}]`;
  }

  // Label for the element at `offset` within the batch.
  const labelFor = (item: unknown, offset: number): string => {
    if (typeof item === "object" && item !== null) {
      const id = (item as Record<string, unknown>)[identityField];
      if (id !== undefined && id !== null && typeof id !== "object") {
        return String(id);
      }
    }
    return `[${startIdx + offset}]`;
  };

  if (batch.length === 1) {
    return `${sep}${labelFor(batch[0], 0)}`;
  }
  if (batch.length <= 3) {
    return sep + batch.map((item, offset) => labelFor(item, offset)).join(", ");
  }
  const lastOffset = batch.length - 1;
  return `${sep}${labelFor(batch[0], 0)}…${labelFor(batch[lastOffset], lastOffset)}`;
}
|