@context-vault/core 3.1.4 → 3.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/capture.d.ts +1 -1
- package/dist/capture.d.ts.map +1 -1
- package/dist/capture.js +20 -9
- package/dist/capture.js.map +1 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js.map +1 -1
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js.map +1 -1
- package/dist/db.d.ts.map +1 -1
- package/dist/db.js +10 -0
- package/dist/db.js.map +1 -1
- package/dist/embed.d.ts.map +1 -1
- package/dist/embed.js.map +1 -1
- package/dist/formatters.d.ts.map +1 -1
- package/dist/formatters.js.map +1 -1
- package/dist/frontmatter.d.ts.map +1 -1
- package/dist/frontmatter.js.map +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +27 -6
- package/dist/index.js.map +1 -1
- package/dist/ingest-url.d.ts.map +1 -1
- package/dist/ingest-url.js +32 -9
- package/dist/ingest-url.js.map +1 -1
- package/dist/main.d.ts +3 -3
- package/dist/main.d.ts.map +1 -1
- package/dist/main.js +3 -3
- package/dist/main.js.map +1 -1
- package/dist/search.d.ts +1 -1
- package/dist/search.d.ts.map +1 -1
- package/dist/search.js +3 -2
- package/dist/search.js.map +1 -1
- package/package.json +1 -1
- package/src/capture.ts +63 -19
- package/src/config.ts +4 -8
- package/src/constants.ts +5 -7
- package/src/db.ts +11 -7
- package/src/embed.ts +3 -1
- package/src/formatters.ts +1 -4
- package/src/frontmatter.ts +6 -4
- package/src/index.ts +155 -46
- package/src/ingest-url.ts +153 -40
- package/src/main.ts +8 -11
- package/src/search.ts +29 -10
package/src/index.ts
CHANGED
|
@@ -12,12 +12,27 @@ const EMBED_BATCH_SIZE = 32;
|
|
|
12
12
|
|
|
13
13
|
export async function indexEntry(
|
|
14
14
|
ctx: BaseCtx,
|
|
15
|
-
entry: IndexEntryInput & {
|
|
15
|
+
entry: IndexEntryInput & {
|
|
16
|
+
supersedes?: string[] | null;
|
|
17
|
+
related_to?: string[] | null;
|
|
18
|
+
},
|
|
16
19
|
precomputedEmbedding?: Float32Array | null,
|
|
17
20
|
): Promise<void> {
|
|
18
21
|
const {
|
|
19
|
-
id,
|
|
20
|
-
|
|
22
|
+
id,
|
|
23
|
+
kind,
|
|
24
|
+
category,
|
|
25
|
+
title,
|
|
26
|
+
body,
|
|
27
|
+
meta,
|
|
28
|
+
tags,
|
|
29
|
+
source,
|
|
30
|
+
filePath,
|
|
31
|
+
createdAt,
|
|
32
|
+
identity_key,
|
|
33
|
+
expires_at,
|
|
34
|
+
source_files,
|
|
35
|
+
tier,
|
|
21
36
|
} = entry;
|
|
22
37
|
|
|
23
38
|
if (expires_at && new Date(expires_at) <= new Date()) return;
|
|
@@ -31,13 +46,22 @@ export async function indexEntry(
|
|
|
31
46
|
let wasUpdate = false;
|
|
32
47
|
|
|
33
48
|
if (cat === "entity" && identity_key) {
|
|
34
|
-
const existing = ctx.stmts.getByIdentityKey.get(kind, identity_key) as
|
|
49
|
+
const existing = ctx.stmts.getByIdentityKey.get(kind, identity_key) as
|
|
50
|
+
| Record<string, unknown>
|
|
51
|
+
| undefined;
|
|
35
52
|
if (existing) {
|
|
36
53
|
ctx.stmts.upsertByIdentityKey.run(
|
|
37
|
-
title || null,
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
54
|
+
title || null,
|
|
55
|
+
body,
|
|
56
|
+
metaJson,
|
|
57
|
+
tagsJson,
|
|
58
|
+
source || "claude-code",
|
|
59
|
+
cat,
|
|
60
|
+
filePath,
|
|
61
|
+
expires_at || null,
|
|
62
|
+
sourceFilesJson,
|
|
63
|
+
kind,
|
|
64
|
+
identity_key,
|
|
41
65
|
);
|
|
42
66
|
wasUpdate = true;
|
|
43
67
|
}
|
|
@@ -46,20 +70,39 @@ export async function indexEntry(
|
|
|
46
70
|
if (!wasUpdate) {
|
|
47
71
|
try {
|
|
48
72
|
ctx.stmts.insertEntry.run(
|
|
49
|
-
id,
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
73
|
+
id,
|
|
74
|
+
kind,
|
|
75
|
+
cat,
|
|
76
|
+
title || null,
|
|
77
|
+
body,
|
|
78
|
+
metaJson,
|
|
79
|
+
tagsJson,
|
|
80
|
+
source || "claude-code",
|
|
81
|
+
filePath,
|
|
82
|
+
identity_key || null,
|
|
83
|
+
expires_at || null,
|
|
84
|
+
createdAt,
|
|
85
|
+
createdAt,
|
|
86
|
+
sourceFilesJson,
|
|
87
|
+
effectiveTier,
|
|
53
88
|
);
|
|
54
89
|
} catch (e) {
|
|
55
90
|
if ((e as Error).message.includes("UNIQUE constraint")) {
|
|
56
91
|
ctx.stmts.updateEntry.run(
|
|
57
|
-
title || null,
|
|
58
|
-
|
|
59
|
-
|
|
92
|
+
title || null,
|
|
93
|
+
body,
|
|
94
|
+
metaJson,
|
|
95
|
+
tagsJson,
|
|
96
|
+
source || "claude-code",
|
|
97
|
+
cat,
|
|
98
|
+
identity_key || null,
|
|
99
|
+
expires_at || null,
|
|
100
|
+
filePath,
|
|
60
101
|
);
|
|
61
102
|
if (sourceFilesJson !== null && ctx.stmts.updateSourceFiles) {
|
|
62
|
-
const entryRow = ctx.stmts.getRowidByPath.get(filePath) as
|
|
103
|
+
const entryRow = ctx.stmts.getRowidByPath.get(filePath) as
|
|
104
|
+
| { rowid: number }
|
|
105
|
+
| undefined;
|
|
63
106
|
if (entryRow) {
|
|
64
107
|
const idRow = ctx.db
|
|
65
108
|
.prepare("SELECT id FROM vault WHERE file_path = ?")
|
|
@@ -76,8 +119,8 @@ export async function indexEntry(
|
|
|
76
119
|
}
|
|
77
120
|
|
|
78
121
|
const rowidResult = wasUpdate
|
|
79
|
-
? ctx.stmts.getRowidByPath.get(filePath) as { rowid: number } | undefined
|
|
80
|
-
: ctx.stmts.getRowid.get(id) as { rowid: number } | undefined;
|
|
122
|
+
? (ctx.stmts.getRowidByPath.get(filePath) as { rowid: number } | undefined)
|
|
123
|
+
: (ctx.stmts.getRowid.get(id) as { rowid: number } | undefined);
|
|
81
124
|
|
|
82
125
|
if (!rowidResult || rowidResult.rowid == null) {
|
|
83
126
|
throw new Error(
|
|
@@ -100,12 +143,18 @@ export async function indexEntry(
|
|
|
100
143
|
try {
|
|
101
144
|
embedding = await ctx.embed([title, body].filter(Boolean).join(" "));
|
|
102
145
|
} catch (embedErr) {
|
|
103
|
-
console.warn(
|
|
146
|
+
console.warn(
|
|
147
|
+
`[context-vault] embed() failed for entry ${id} — skipping vec insert: ${(embedErr as Error).message}`,
|
|
148
|
+
);
|
|
104
149
|
}
|
|
105
150
|
}
|
|
106
151
|
|
|
107
152
|
if (embedding) {
|
|
108
|
-
try {
|
|
153
|
+
try {
|
|
154
|
+
ctx.deleteVec(rowid);
|
|
155
|
+
} catch {
|
|
156
|
+
/* no-op */
|
|
157
|
+
}
|
|
109
158
|
ctx.insertVec(rowid, embedding);
|
|
110
159
|
}
|
|
111
160
|
}
|
|
@@ -120,11 +169,17 @@ export async function pruneExpired(ctx: BaseCtx): Promise<number> {
|
|
|
120
169
|
|
|
121
170
|
for (const row of expired) {
|
|
122
171
|
if (row.file_path) {
|
|
123
|
-
try {
|
|
172
|
+
try {
|
|
173
|
+
unlinkSync(row.file_path);
|
|
174
|
+
} catch {}
|
|
124
175
|
}
|
|
125
|
-
const vRowid = (
|
|
176
|
+
const vRowid = (
|
|
177
|
+
ctx.stmts.getRowid.get(row.id) as { rowid: number } | undefined
|
|
178
|
+
)?.rowid;
|
|
126
179
|
if (vRowid) {
|
|
127
|
-
try {
|
|
180
|
+
try {
|
|
181
|
+
ctx.deleteVec(Number(vRowid));
|
|
182
|
+
} catch {}
|
|
128
183
|
}
|
|
129
184
|
ctx.stmts.deleteEntry.run(row.id);
|
|
130
185
|
}
|
|
@@ -137,7 +192,12 @@ export async function reindex(
|
|
|
137
192
|
opts: { fullSync?: boolean } = {},
|
|
138
193
|
): Promise<ReindexStats> {
|
|
139
194
|
const { fullSync = true } = opts;
|
|
140
|
-
const stats: ReindexStats = {
|
|
195
|
+
const stats: ReindexStats = {
|
|
196
|
+
added: 0,
|
|
197
|
+
updated: 0,
|
|
198
|
+
removed: 0,
|
|
199
|
+
unchanged: 0,
|
|
200
|
+
};
|
|
141
201
|
|
|
142
202
|
if (!existsSync(ctx.config.vaultDir)) return stats;
|
|
143
203
|
|
|
@@ -224,20 +284,32 @@ export async function reindex(
|
|
|
224
284
|
if (!existing) {
|
|
225
285
|
const id = (fmMeta.id as string) || ulid();
|
|
226
286
|
const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
|
|
227
|
-
const created =
|
|
287
|
+
const created =
|
|
288
|
+
(fmMeta.created as string) || new Date().toISOString();
|
|
228
289
|
|
|
229
290
|
const result = upsertEntry.run(
|
|
230
|
-
id,
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
291
|
+
id,
|
|
292
|
+
kind,
|
|
293
|
+
category,
|
|
294
|
+
parsed.title || null,
|
|
295
|
+
parsed.body,
|
|
296
|
+
metaJson,
|
|
297
|
+
tagsJson,
|
|
298
|
+
(fmMeta.source as string) || "file",
|
|
299
|
+
filePath,
|
|
300
|
+
identity_key,
|
|
301
|
+
expires_at,
|
|
302
|
+
created,
|
|
303
|
+
(fmMeta.updated as string) || created,
|
|
234
304
|
);
|
|
235
305
|
if ((result as { changes: number }).changes > 0) {
|
|
236
306
|
if (relatedToJson && ctx.stmts.updateRelatedTo) {
|
|
237
307
|
ctx.stmts.updateRelatedTo.run(relatedToJson, id);
|
|
238
308
|
}
|
|
239
309
|
if (category !== "event") {
|
|
240
|
-
const rowidResult = ctx.stmts.getRowid.get(id) as
|
|
310
|
+
const rowidResult = ctx.stmts.getRowid.get(id) as
|
|
311
|
+
| { rowid: number }
|
|
312
|
+
| undefined;
|
|
241
313
|
if (rowidResult?.rowid) {
|
|
242
314
|
const embeddingText = [parsed.title, parsed.body]
|
|
243
315
|
.filter(Boolean)
|
|
@@ -254,24 +326,45 @@ export async function reindex(
|
|
|
254
326
|
}
|
|
255
327
|
} else if (fullSync) {
|
|
256
328
|
const tagsJson = fmMeta.tags ? JSON.stringify(fmMeta.tags) : null;
|
|
257
|
-
const titleChanged =
|
|
329
|
+
const titleChanged =
|
|
330
|
+
(parsed.title || null) !== ((existing.title as string) || null);
|
|
258
331
|
const bodyChanged = (existing.body as string) !== parsed.body;
|
|
259
332
|
const tagsChanged = tagsJson !== ((existing.tags as string) || null);
|
|
260
333
|
const metaChanged = metaJson !== ((existing.meta as string) || null);
|
|
261
|
-
const relatedToChanged =
|
|
262
|
-
|
|
263
|
-
|
|
334
|
+
const relatedToChanged =
|
|
335
|
+
relatedToJson !== ((existing.related_to as string) || null);
|
|
336
|
+
|
|
337
|
+
if (
|
|
338
|
+
bodyChanged ||
|
|
339
|
+
titleChanged ||
|
|
340
|
+
tagsChanged ||
|
|
341
|
+
metaChanged ||
|
|
342
|
+
relatedToChanged
|
|
343
|
+
) {
|
|
264
344
|
ctx.stmts.updateEntry.run(
|
|
265
|
-
parsed.title || null,
|
|
266
|
-
|
|
267
|
-
|
|
345
|
+
parsed.title || null,
|
|
346
|
+
parsed.body,
|
|
347
|
+
metaJson,
|
|
348
|
+
tagsJson,
|
|
349
|
+
(fmMeta.source as string) || "file",
|
|
350
|
+
category,
|
|
351
|
+
identity_key,
|
|
352
|
+
expires_at,
|
|
353
|
+
filePath,
|
|
268
354
|
);
|
|
269
355
|
if (relatedToChanged && ctx.stmts.updateRelatedTo) {
|
|
270
|
-
ctx.stmts.updateRelatedTo.run(
|
|
356
|
+
ctx.stmts.updateRelatedTo.run(
|
|
357
|
+
relatedToJson,
|
|
358
|
+
existing.id as string,
|
|
359
|
+
);
|
|
271
360
|
}
|
|
272
361
|
|
|
273
362
|
if ((bodyChanged || titleChanged) && category !== "event") {
|
|
274
|
-
const rowid = (
|
|
363
|
+
const rowid = (
|
|
364
|
+
ctx.stmts.getRowid.get(existing.id as string) as
|
|
365
|
+
| { rowid: number }
|
|
366
|
+
| undefined
|
|
367
|
+
)?.rowid;
|
|
275
368
|
if (rowid) {
|
|
276
369
|
const embeddingText = [parsed.title, parsed.body]
|
|
277
370
|
.filter(Boolean)
|
|
@@ -291,9 +384,15 @@ export async function reindex(
|
|
|
291
384
|
if (fullSync) {
|
|
292
385
|
for (const [dbPath, row] of dbByPath) {
|
|
293
386
|
if (!diskPaths.has(dbPath)) {
|
|
294
|
-
const vRowid = (
|
|
387
|
+
const vRowid = (
|
|
388
|
+
ctx.stmts.getRowid.get(row.id as string) as
|
|
389
|
+
| { rowid: number }
|
|
390
|
+
| undefined
|
|
391
|
+
)?.rowid;
|
|
295
392
|
if (vRowid) {
|
|
296
|
-
try {
|
|
393
|
+
try {
|
|
394
|
+
ctx.deleteVec(vRowid);
|
|
395
|
+
} catch {}
|
|
297
396
|
}
|
|
298
397
|
ctx.stmts.deleteEntry.run(row.id as string);
|
|
299
398
|
stats.removed++;
|
|
@@ -313,7 +412,9 @@ export async function reindex(
|
|
|
313
412
|
.prepare("SELECT id, rowid FROM vault WHERE kind = ?")
|
|
314
413
|
.all(kind) as { id: string; rowid: number }[];
|
|
315
414
|
for (const row of orphaned) {
|
|
316
|
-
try {
|
|
415
|
+
try {
|
|
416
|
+
ctx.deleteVec(row.rowid);
|
|
417
|
+
} catch {}
|
|
317
418
|
ctx.stmts.deleteEntry.run(row.id);
|
|
318
419
|
stats.removed++;
|
|
319
420
|
}
|
|
@@ -329,11 +430,17 @@ export async function reindex(
|
|
|
329
430
|
|
|
330
431
|
for (const row of expired) {
|
|
331
432
|
if (row.file_path) {
|
|
332
|
-
try {
|
|
433
|
+
try {
|
|
434
|
+
unlinkSync(row.file_path);
|
|
435
|
+
} catch {}
|
|
333
436
|
}
|
|
334
|
-
const vRowid = (
|
|
437
|
+
const vRowid = (
|
|
438
|
+
ctx.stmts.getRowid.get(row.id) as { rowid: number } | undefined
|
|
439
|
+
)?.rowid;
|
|
335
440
|
if (vRowid) {
|
|
336
|
-
try {
|
|
441
|
+
try {
|
|
442
|
+
ctx.deleteVec(Number(vRowid));
|
|
443
|
+
} catch {}
|
|
337
444
|
}
|
|
338
445
|
ctx.stmts.deleteEntry.run(row.id);
|
|
339
446
|
stats.removed++;
|
|
@@ -350,7 +457,9 @@ export async function reindex(
|
|
|
350
457
|
const embeddings = await embedBatch(batch.map((e) => e.text));
|
|
351
458
|
for (let j = 0; j < batch.length; j++) {
|
|
352
459
|
if (embeddings[j]) {
|
|
353
|
-
try {
|
|
460
|
+
try {
|
|
461
|
+
ctx.deleteVec(batch[j].rowid);
|
|
462
|
+
} catch {}
|
|
354
463
|
ctx.insertVec(batch[j].rowid, embeddings[j]!);
|
|
355
464
|
}
|
|
356
465
|
}
|
package/src/ingest-url.ts
CHANGED
|
@@ -6,27 +6,81 @@ export function htmlToMarkdown(html: string): string {
|
|
|
6
6
|
md = md.replace(/<header[\s\S]*?<\/header>/gi, "");
|
|
7
7
|
md = md.replace(/<footer[\s\S]*?<\/footer>/gi, "");
|
|
8
8
|
md = md.replace(/<aside[\s\S]*?<\/aside>/gi, "");
|
|
9
|
-
md = md.replace(
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
md = md.replace(
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
md = md.replace(
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
9
|
+
md = md.replace(
|
|
10
|
+
/<h1[^>]*>([\s\S]*?)<\/h1>/gi,
|
|
11
|
+
(_, c: string) => `\n# ${stripTags(c).trim()}\n`,
|
|
12
|
+
);
|
|
13
|
+
md = md.replace(
|
|
14
|
+
/<h2[^>]*>([\s\S]*?)<\/h2>/gi,
|
|
15
|
+
(_, c: string) => `\n## ${stripTags(c).trim()}\n`,
|
|
16
|
+
);
|
|
17
|
+
md = md.replace(
|
|
18
|
+
/<h3[^>]*>([\s\S]*?)<\/h3>/gi,
|
|
19
|
+
(_, c: string) => `\n### ${stripTags(c).trim()}\n`,
|
|
20
|
+
);
|
|
21
|
+
md = md.replace(
|
|
22
|
+
/<h4[^>]*>([\s\S]*?)<\/h4>/gi,
|
|
23
|
+
(_, c: string) => `\n#### ${stripTags(c).trim()}\n`,
|
|
24
|
+
);
|
|
25
|
+
md = md.replace(
|
|
26
|
+
/<h5[^>]*>([\s\S]*?)<\/h5>/gi,
|
|
27
|
+
(_, c: string) => `\n##### ${stripTags(c).trim()}\n`,
|
|
28
|
+
);
|
|
29
|
+
md = md.replace(
|
|
30
|
+
/<h6[^>]*>([\s\S]*?)<\/h6>/gi,
|
|
31
|
+
(_, c: string) => `\n###### ${stripTags(c).trim()}\n`,
|
|
32
|
+
);
|
|
33
|
+
md = md.replace(
|
|
34
|
+
/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi,
|
|
35
|
+
(_, href: string, text: string) => {
|
|
36
|
+
const cleanText = stripTags(text).trim();
|
|
37
|
+
return cleanText ? `[${cleanText}](${href})` : "";
|
|
38
|
+
},
|
|
39
|
+
);
|
|
40
|
+
md = md.replace(
|
|
41
|
+
/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi,
|
|
42
|
+
(_, c: string) => `\n\`\`\`\n${decodeEntities(c).trim()}\n\`\`\`\n`,
|
|
43
|
+
);
|
|
44
|
+
md = md.replace(
|
|
45
|
+
/<pre[^>]*>([\s\S]*?)<\/pre>/gi,
|
|
46
|
+
(_, c: string) =>
|
|
47
|
+
`\n\`\`\`\n${decodeEntities(stripTags(c)).trim()}\n\`\`\`\n`,
|
|
48
|
+
);
|
|
49
|
+
md = md.replace(
|
|
50
|
+
/<code[^>]*>([\s\S]*?)<\/code>/gi,
|
|
51
|
+
(_, c: string) => `\`${decodeEntities(c).trim()}\``,
|
|
52
|
+
);
|
|
53
|
+
md = md.replace(
|
|
54
|
+
/<(strong|b)[^>]*>([\s\S]*?)<\/\1>/gi,
|
|
55
|
+
(_, __: string, c: string) => `**${stripTags(c).trim()}**`,
|
|
56
|
+
);
|
|
57
|
+
md = md.replace(
|
|
58
|
+
/<(em|i)[^>]*>([\s\S]*?)<\/\1>/gi,
|
|
59
|
+
(_, __: string, c: string) => `*${stripTags(c).trim()}*`,
|
|
60
|
+
);
|
|
61
|
+
md = md.replace(
|
|
62
|
+
/<li[^>]*>([\s\S]*?)<\/li>/gi,
|
|
63
|
+
(_, c: string) => `- ${stripTags(c).trim()}\n`,
|
|
64
|
+
);
|
|
25
65
|
md = md.replace(/<br\s*\/?>/gi, "\n");
|
|
26
|
-
md = md.replace(
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
66
|
+
md = md.replace(
|
|
67
|
+
/<p[^>]*>([\s\S]*?)<\/p>/gi,
|
|
68
|
+
(_, c: string) => `\n${stripTags(c).trim()}\n`,
|
|
69
|
+
);
|
|
70
|
+
md = md.replace(
|
|
71
|
+
/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi,
|
|
72
|
+
(_, c: string) => {
|
|
73
|
+
return (
|
|
74
|
+
"\n" +
|
|
75
|
+
stripTags(c)
|
|
76
|
+
.trim()
|
|
77
|
+
.split("\n")
|
|
78
|
+
.map((l: string) => `> ${l}`)
|
|
79
|
+
.join("\n") +
|
|
80
|
+
"\n"
|
|
81
|
+
);
|
|
82
|
+
},
|
|
83
|
+
);
|
|
30
84
|
md = stripTags(md);
|
|
31
85
|
md = decodeEntities(md);
|
|
32
86
|
md = md.replace(/\n{3,}/g, "\n\n").trim();
|
|
@@ -39,15 +93,28 @@ function stripTags(html: string): string {
|
|
|
39
93
|
|
|
40
94
|
function decodeEntities(text: string): string {
|
|
41
95
|
return text
|
|
42
|
-
.replace(/&amp;/g, "&")
|
|
43
|
-
.replace(/&
|
|
44
|
-
.replace(
|
|
45
|
-
.replace(
|
|
96
|
+
.replace(/&amp;/g, "&")
|
|
97
|
+
.replace(/&lt;/g, "<")
|
|
98
|
+
.replace(/&gt;/g, ">")
|
|
99
|
+
.replace(/&quot;/g, '"')
|
|
100
|
+
.replace(/&#39;/g, "'")
|
|
101
|
+
.replace(/&nbsp;/g, " ")
|
|
102
|
+
.replace(/&#(\d+);/g, (_, n: string) =>
|
|
103
|
+
String.fromCharCode(parseInt(n, 10)),
|
|
104
|
+
)
|
|
105
|
+
.replace(/&#x([0-9a-f]+);/gi, (_, n: string) =>
|
|
106
|
+
String.fromCharCode(parseInt(n, 16)),
|
|
107
|
+
);
|
|
46
108
|
}
|
|
47
109
|
|
|
48
|
-
export function extractHtmlContent(
|
|
110
|
+
export function extractHtmlContent(
|
|
111
|
+
html: string,
|
|
112
|
+
_url: string,
|
|
113
|
+
): { title: string; body: string } {
|
|
49
114
|
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
|
|
50
|
-
const title = titleMatch
|
|
115
|
+
const title = titleMatch
|
|
116
|
+
? stripTags(decodeEntities(titleMatch[1])).trim()
|
|
117
|
+
: "";
|
|
51
118
|
let contentHtml = "";
|
|
52
119
|
const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
|
|
53
120
|
const mainMatch = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
|
|
@@ -63,37 +130,83 @@ export function extractHtmlContent(html: string, _url: string): { title: string;
|
|
|
63
130
|
|
|
64
131
|
export async function ingestUrl(
|
|
65
132
|
url: string,
|
|
66
|
-
opts: {
|
|
67
|
-
|
|
68
|
-
|
|
133
|
+
opts: {
|
|
134
|
+
kind?: string;
|
|
135
|
+
tags?: string[];
|
|
136
|
+
source?: string;
|
|
137
|
+
maxBodyLength?: number;
|
|
138
|
+
timeoutMs?: number;
|
|
139
|
+
} = {},
|
|
140
|
+
): Promise<{
|
|
141
|
+
kind: string;
|
|
142
|
+
title: string;
|
|
143
|
+
body: string;
|
|
144
|
+
tags: string[];
|
|
145
|
+
meta: Record<string, unknown>;
|
|
146
|
+
source: string;
|
|
147
|
+
}> {
|
|
148
|
+
const {
|
|
149
|
+
kind = "reference",
|
|
150
|
+
tags = [],
|
|
151
|
+
source,
|
|
152
|
+
maxBodyLength = 50000,
|
|
153
|
+
timeoutMs = 15000,
|
|
154
|
+
} = opts;
|
|
69
155
|
let domain: string;
|
|
70
|
-
try {
|
|
156
|
+
try {
|
|
157
|
+
domain = new URL(url).hostname;
|
|
158
|
+
} catch {
|
|
159
|
+
throw new Error(`Invalid URL: ${url}`);
|
|
160
|
+
}
|
|
71
161
|
const controller = new AbortController();
|
|
72
162
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
|
73
163
|
let response: Response;
|
|
74
164
|
try {
|
|
75
165
|
response = await fetch(url, {
|
|
76
166
|
signal: controller.signal,
|
|
77
|
-
headers: {
|
|
167
|
+
headers: {
|
|
168
|
+
"User-Agent":
|
|
169
|
+
"ContextVault/1.0 (+https://github.com/fellanH/context-vault)",
|
|
170
|
+
Accept: "text/html,application/xhtml+xml,text/plain,*/*",
|
|
171
|
+
},
|
|
78
172
|
});
|
|
79
173
|
} catch (err) {
|
|
80
|
-
if ((err as Error).name === "AbortError")
|
|
174
|
+
if ((err as Error).name === "AbortError")
|
|
175
|
+
throw new Error(`Request timed out after ${timeoutMs}ms`);
|
|
81
176
|
throw new Error(`Fetch failed: ${(err as Error).message}`);
|
|
82
|
-
} finally {
|
|
83
|
-
|
|
177
|
+
} finally {
|
|
178
|
+
clearTimeout(timeout);
|
|
179
|
+
}
|
|
180
|
+
if (!response.ok)
|
|
181
|
+
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
84
182
|
const contentType = response.headers.get("content-type") || "";
|
|
85
183
|
const html = await response.text();
|
|
86
184
|
let title: string, body: string;
|
|
87
|
-
if (
|
|
185
|
+
if (
|
|
186
|
+
contentType.includes("text/html") ||
|
|
187
|
+
contentType.includes("application/xhtml")
|
|
188
|
+
) {
|
|
88
189
|
const extracted = extractHtmlContent(html, url);
|
|
89
|
-
title = extracted.title;
|
|
90
|
-
|
|
91
|
-
|
|
190
|
+
title = extracted.title;
|
|
191
|
+
body = extracted.body;
|
|
192
|
+
} else {
|
|
193
|
+
title = domain;
|
|
194
|
+
body = html;
|
|
195
|
+
}
|
|
196
|
+
if (body.length > maxBodyLength)
|
|
197
|
+
body = body.slice(0, maxBodyLength) + "\n\n[Content truncated]";
|
|
92
198
|
if (!body.trim()) throw new Error("No readable content extracted from URL");
|
|
93
199
|
return {
|
|
94
|
-
kind,
|
|
200
|
+
kind,
|
|
201
|
+
title: title || domain,
|
|
202
|
+
body,
|
|
95
203
|
tags: [...tags, "web-import"],
|
|
96
|
-
meta: {
|
|
204
|
+
meta: {
|
|
205
|
+
url,
|
|
206
|
+
domain,
|
|
207
|
+
fetched_at: new Date().toISOString(),
|
|
208
|
+
content_type: contentType.split(";")[0].trim() || "text/html",
|
|
209
|
+
},
|
|
97
210
|
source: source || domain,
|
|
98
211
|
};
|
|
99
212
|
}
|
package/src/main.ts
CHANGED
|
@@ -80,7 +80,12 @@ export {
|
|
|
80
80
|
} from "./db.js";
|
|
81
81
|
|
|
82
82
|
// Embeddings
|
|
83
|
-
export {
|
|
83
|
+
export {
|
|
84
|
+
embed,
|
|
85
|
+
embedBatch,
|
|
86
|
+
resetEmbedPipeline,
|
|
87
|
+
isEmbedAvailable,
|
|
88
|
+
} from "./embed.js";
|
|
84
89
|
|
|
85
90
|
// Index (reindex + indexEntry)
|
|
86
91
|
export { indexEntry, reindex, pruneExpired } from "./index.js";
|
|
@@ -97,15 +102,7 @@ export {
|
|
|
97
102
|
} from "./search.js";
|
|
98
103
|
|
|
99
104
|
// Capture
|
|
100
|
-
export {
|
|
101
|
-
writeEntry,
|
|
102
|
-
updateEntryFile,
|
|
103
|
-
captureAndIndex,
|
|
104
|
-
} from "./capture.js";
|
|
105
|
+
export { writeEntry, updateEntryFile, captureAndIndex } from "./capture.js";
|
|
105
106
|
|
|
106
107
|
// Ingest URL
|
|
107
|
-
export {
|
|
108
|
-
htmlToMarkdown,
|
|
109
|
-
extractHtmlContent,
|
|
110
|
-
ingestUrl,
|
|
111
|
-
} from "./ingest-url.js";
|
|
108
|
+
export { htmlToMarkdown, extractHtmlContent, ingestUrl } from "./ingest-url.js";
|
package/src/search.ts
CHANGED
|
@@ -1,9 +1,17 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type {
|
|
2
|
+
BaseCtx,
|
|
3
|
+
SearchResult,
|
|
4
|
+
SearchOptions,
|
|
5
|
+
VaultEntry,
|
|
6
|
+
} from "./types.js";
|
|
2
7
|
|
|
3
8
|
const NEAR_DUP_THRESHOLD = 0.92;
|
|
4
9
|
const RRF_K = 60;
|
|
5
10
|
|
|
6
|
-
export function recencyDecayScore(
|
|
11
|
+
export function recencyDecayScore(
|
|
12
|
+
updatedAt: string | null | undefined,
|
|
13
|
+
decayRate = 0.05,
|
|
14
|
+
): number {
|
|
7
15
|
if (updatedAt == null) return 0.5;
|
|
8
16
|
const ageDays = (Date.now() - new Date(updatedAt).getTime()) / 86400000;
|
|
9
17
|
return Math.exp(-decayRate * ageDays);
|
|
@@ -28,7 +36,11 @@ export function buildFtsQuery(query: string): string | null {
|
|
|
28
36
|
return `${phrase} OR ${near} OR ${and}`;
|
|
29
37
|
}
|
|
30
38
|
|
|
31
|
-
export function recencyBoost(
|
|
39
|
+
export function recencyBoost(
|
|
40
|
+
createdAt: string,
|
|
41
|
+
category: string,
|
|
42
|
+
decayDays = 30,
|
|
43
|
+
): number {
|
|
32
44
|
if (category !== "event") return 1.0;
|
|
33
45
|
const ageDays = (Date.now() - new Date(createdAt).getTime()) / 86400000;
|
|
34
46
|
return 1 / (1 + ageDays / decayDays);
|
|
@@ -46,9 +58,9 @@ export function buildFilterClauses({
|
|
|
46
58
|
since?: string | null;
|
|
47
59
|
until?: string | null;
|
|
48
60
|
includeSuperseeded?: boolean;
|
|
49
|
-
}): { clauses: string[]; params:
|
|
61
|
+
}): { clauses: string[]; params: (string | number | null)[] } {
|
|
50
62
|
const clauses: string[] = [];
|
|
51
|
-
const params:
|
|
63
|
+
const params: (string | number | null)[] = [];
|
|
52
64
|
if (categoryFilter) {
|
|
53
65
|
clauses.push("e.category = ?");
|
|
54
66
|
params.push(categoryFilter);
|
|
@@ -120,7 +132,7 @@ export async function hybridSearch(
|
|
|
120
132
|
if (ftsQuery) {
|
|
121
133
|
try {
|
|
122
134
|
const whereParts = ["vault_fts MATCH ?"];
|
|
123
|
-
const ftsParams:
|
|
135
|
+
const ftsParams: (string | number | null)[] = [ftsQuery];
|
|
124
136
|
|
|
125
137
|
if (kindFilter) {
|
|
126
138
|
whereParts.push("e.kind = ?");
|
|
@@ -130,8 +142,9 @@ export async function hybridSearch(
|
|
|
130
142
|
ftsParams.push(...extraFilters.params);
|
|
131
143
|
|
|
132
144
|
const ftsSQL = `SELECT e.*, rank FROM vault_fts f JOIN vault e ON f.rowid = e.rowid WHERE ${whereParts.join(" AND ")} ORDER BY rank LIMIT 15`;
|
|
133
|
-
|
|
134
|
-
|
|
145
|
+
const rows = ctx.db
|
|
146
|
+
.prepare(ftsSQL)
|
|
147
|
+
.all(...ftsParams) as unknown as (VaultEntry & { rank: number })[];
|
|
135
148
|
|
|
136
149
|
for (const { rank: _rank, ...row } of rows) {
|
|
137
150
|
ftsRankedIds.push(row.id);
|
|
@@ -148,7 +161,11 @@ export async function hybridSearch(
|
|
|
148
161
|
const vecSimMap = new Map<string, number>();
|
|
149
162
|
|
|
150
163
|
try {
|
|
151
|
-
const vecCount = (
|
|
164
|
+
const vecCount = (
|
|
165
|
+
ctx.db.prepare("SELECT COUNT(*) as c FROM vault_vec").get() as {
|
|
166
|
+
c: number;
|
|
167
|
+
}
|
|
168
|
+
).c;
|
|
152
169
|
if (vecCount > 0) {
|
|
153
170
|
queryVec = await ctx.embed(query);
|
|
154
171
|
if (queryVec) {
|
|
@@ -196,7 +213,9 @@ export async function hybridSearch(
|
|
|
196
213
|
}
|
|
197
214
|
} catch (err) {
|
|
198
215
|
if (!(err as Error).message?.includes("no such table")) {
|
|
199
|
-
console.error(
|
|
216
|
+
console.error(
|
|
217
|
+
`[retrieve] Vector search error: ${(err as Error).message}`,
|
|
218
|
+
);
|
|
200
219
|
}
|
|
201
220
|
}
|
|
202
221
|
|