@context-vault/core 3.1.6 → 3.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/capture.d.ts +1 -1
- package/dist/capture.d.ts.map +1 -1
- package/dist/capture.js +34 -47
- package/dist/capture.js.map +1 -1
- package/dist/categories.js +30 -30
- package/dist/config.d.ts +1 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +37 -43
- package/dist/config.js.map +1 -1
- package/dist/constants.d.ts +1 -1
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +4 -4
- package/dist/constants.js.map +1 -1
- package/dist/db.d.ts +2 -2
- package/dist/db.d.ts.map +1 -1
- package/dist/db.js +21 -20
- package/dist/db.js.map +1 -1
- package/dist/embed.d.ts.map +1 -1
- package/dist/embed.js +11 -11
- package/dist/embed.js.map +1 -1
- package/dist/files.d.ts.map +1 -1
- package/dist/files.js +12 -13
- package/dist/files.js.map +1 -1
- package/dist/formatters.js +5 -5
- package/dist/frontmatter.d.ts.map +1 -1
- package/dist/frontmatter.js +23 -23
- package/dist/frontmatter.js.map +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +58 -46
- package/dist/index.js.map +1 -1
- package/dist/ingest-url.d.ts.map +1 -1
- package/dist/ingest-url.js +30 -33
- package/dist/ingest-url.js.map +1 -1
- package/dist/main.d.ts +13 -13
- package/dist/main.d.ts.map +1 -1
- package/dist/main.js +12 -12
- package/dist/main.js.map +1 -1
- package/dist/search.d.ts +1 -1
- package/dist/search.d.ts.map +1 -1
- package/dist/search.js +20 -22
- package/dist/search.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/package.json +1 -1
- package/src/capture.ts +44 -81
- package/src/categories.ts +30 -30
- package/src/config.ts +45 -60
- package/src/constants.ts +8 -10
- package/src/db.ts +37 -56
- package/src/embed.ts +15 -26
- package/src/files.ts +13 -16
- package/src/formatters.ts +5 -5
- package/src/frontmatter.ts +26 -30
- package/src/index.ts +94 -100
- package/src/ingest-url.ts +56 -93
- package/src/main.ts +13 -18
- package/src/search.ts +34 -56
- package/src/types.ts +1 -1
package/src/ingest-url.ts
CHANGED
|
@@ -1,121 +1,96 @@
|
|
|
1
1
|
export function htmlToMarkdown(html: string): string {
|
|
2
2
|
let md = html;
|
|
3
|
-
md = md.replace(/<script[\s\S]*?<\/script>/gi,
|
|
4
|
-
md = md.replace(/<style[\s\S]*?<\/style>/gi,
|
|
5
|
-
md = md.replace(/<nav[\s\S]*?<\/nav>/gi,
|
|
6
|
-
md = md.replace(/<header[\s\S]*?<\/header>/gi,
|
|
7
|
-
md = md.replace(/<footer[\s\S]*?<\/footer>/gi,
|
|
8
|
-
md = md.replace(/<aside[\s\S]*?<\/aside>/gi,
|
|
9
|
-
md = md.replace(
|
|
10
|
-
|
|
11
|
-
(_, c: string) => `\n# ${stripTags(c).trim()}\n`,
|
|
12
|
-
);
|
|
13
|
-
md = md.replace(
|
|
14
|
-
/<h2[^>]*>([\s\S]*?)<\/h2>/gi,
|
|
15
|
-
(_, c: string) => `\n## ${stripTags(c).trim()}\n`,
|
|
16
|
-
);
|
|
3
|
+
md = md.replace(/<script[\s\S]*?<\/script>/gi, '');
|
|
4
|
+
md = md.replace(/<style[\s\S]*?<\/style>/gi, '');
|
|
5
|
+
md = md.replace(/<nav[\s\S]*?<\/nav>/gi, '');
|
|
6
|
+
md = md.replace(/<header[\s\S]*?<\/header>/gi, '');
|
|
7
|
+
md = md.replace(/<footer[\s\S]*?<\/footer>/gi, '');
|
|
8
|
+
md = md.replace(/<aside[\s\S]*?<\/aside>/gi, '');
|
|
9
|
+
md = md.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, (_, c: string) => `\n# ${stripTags(c).trim()}\n`);
|
|
10
|
+
md = md.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, (_, c: string) => `\n## ${stripTags(c).trim()}\n`);
|
|
17
11
|
md = md.replace(
|
|
18
12
|
/<h3[^>]*>([\s\S]*?)<\/h3>/gi,
|
|
19
|
-
(_, c: string) => `\n### ${stripTags(c).trim()}\n
|
|
13
|
+
(_, c: string) => `\n### ${stripTags(c).trim()}\n`
|
|
20
14
|
);
|
|
21
15
|
md = md.replace(
|
|
22
16
|
/<h4[^>]*>([\s\S]*?)<\/h4>/gi,
|
|
23
|
-
(_, c: string) => `\n#### ${stripTags(c).trim()}\n
|
|
17
|
+
(_, c: string) => `\n#### ${stripTags(c).trim()}\n`
|
|
24
18
|
);
|
|
25
19
|
md = md.replace(
|
|
26
20
|
/<h5[^>]*>([\s\S]*?)<\/h5>/gi,
|
|
27
|
-
(_, c: string) => `\n##### ${stripTags(c).trim()}\n
|
|
21
|
+
(_, c: string) => `\n##### ${stripTags(c).trim()}\n`
|
|
28
22
|
);
|
|
29
23
|
md = md.replace(
|
|
30
24
|
/<h6[^>]*>([\s\S]*?)<\/h6>/gi,
|
|
31
|
-
(_, c: string) => `\n###### ${stripTags(c).trim()}\n
|
|
25
|
+
(_, c: string) => `\n###### ${stripTags(c).trim()}\n`
|
|
32
26
|
);
|
|
33
27
|
md = md.replace(
|
|
34
28
|
/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi,
|
|
35
29
|
(_, href: string, text: string) => {
|
|
36
30
|
const cleanText = stripTags(text).trim();
|
|
37
|
-
return cleanText ? `[${cleanText}](${href})` :
|
|
38
|
-
}
|
|
31
|
+
return cleanText ? `[${cleanText}](${href})` : '';
|
|
32
|
+
}
|
|
39
33
|
);
|
|
40
34
|
md = md.replace(
|
|
41
35
|
/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi,
|
|
42
|
-
(_, c: string) => `\n\`\`\`\n${decodeEntities(c).trim()}\n\`\`\`\n
|
|
36
|
+
(_, c: string) => `\n\`\`\`\n${decodeEntities(c).trim()}\n\`\`\`\n`
|
|
43
37
|
);
|
|
44
38
|
md = md.replace(
|
|
45
39
|
/<pre[^>]*>([\s\S]*?)<\/pre>/gi,
|
|
46
|
-
(_, c: string) =>
|
|
47
|
-
`\n\`\`\`\n${decodeEntities(stripTags(c)).trim()}\n\`\`\`\n`,
|
|
40
|
+
(_, c: string) => `\n\`\`\`\n${decodeEntities(stripTags(c)).trim()}\n\`\`\`\n`
|
|
48
41
|
);
|
|
49
42
|
md = md.replace(
|
|
50
43
|
/<code[^>]*>([\s\S]*?)<\/code>/gi,
|
|
51
|
-
(_, c: string) => `\`${decodeEntities(c).trim()}
|
|
44
|
+
(_, c: string) => `\`${decodeEntities(c).trim()}\``
|
|
52
45
|
);
|
|
53
46
|
md = md.replace(
|
|
54
47
|
/<(strong|b)[^>]*>([\s\S]*?)<\/\1>/gi,
|
|
55
|
-
(_, __: string, c: string) => `**${stripTags(c).trim()}
|
|
48
|
+
(_, __: string, c: string) => `**${stripTags(c).trim()}**`
|
|
56
49
|
);
|
|
57
50
|
md = md.replace(
|
|
58
51
|
/<(em|i)[^>]*>([\s\S]*?)<\/\1>/gi,
|
|
59
|
-
(_, __: string, c: string) => `*${stripTags(c).trim()}
|
|
60
|
-
);
|
|
61
|
-
md = md.replace(
|
|
62
|
-
/<li[^>]*>([\s\S]*?)<\/li>/gi,
|
|
63
|
-
(_, c: string) => `- ${stripTags(c).trim()}\n`,
|
|
64
|
-
);
|
|
65
|
-
md = md.replace(/<br\s*\/?>/gi, "\n");
|
|
66
|
-
md = md.replace(
|
|
67
|
-
/<p[^>]*>([\s\S]*?)<\/p>/gi,
|
|
68
|
-
(_, c: string) => `\n${stripTags(c).trim()}\n`,
|
|
69
|
-
);
|
|
70
|
-
md = md.replace(
|
|
71
|
-
/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi,
|
|
72
|
-
(_, c: string) => {
|
|
73
|
-
return (
|
|
74
|
-
"\n" +
|
|
75
|
-
stripTags(c)
|
|
76
|
-
.trim()
|
|
77
|
-
.split("\n")
|
|
78
|
-
.map((l: string) => `> ${l}`)
|
|
79
|
-
.join("\n") +
|
|
80
|
-
"\n"
|
|
81
|
-
);
|
|
82
|
-
},
|
|
52
|
+
(_, __: string, c: string) => `*${stripTags(c).trim()}*`
|
|
83
53
|
);
|
|
54
|
+
md = md.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_, c: string) => `- ${stripTags(c).trim()}\n`);
|
|
55
|
+
md = md.replace(/<br\s*\/?>/gi, '\n');
|
|
56
|
+
md = md.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, (_, c: string) => `\n${stripTags(c).trim()}\n`);
|
|
57
|
+
md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_, c: string) => {
|
|
58
|
+
return (
|
|
59
|
+
'\n' +
|
|
60
|
+
stripTags(c)
|
|
61
|
+
.trim()
|
|
62
|
+
.split('\n')
|
|
63
|
+
.map((l: string) => `> ${l}`)
|
|
64
|
+
.join('\n') +
|
|
65
|
+
'\n'
|
|
66
|
+
);
|
|
67
|
+
});
|
|
84
68
|
md = stripTags(md);
|
|
85
69
|
md = decodeEntities(md);
|
|
86
|
-
md = md.replace(/\n{3,}/g,
|
|
70
|
+
md = md.replace(/\n{3,}/g, '\n\n').trim();
|
|
87
71
|
return md;
|
|
88
72
|
}
|
|
89
73
|
|
|
90
74
|
function stripTags(html: string): string {
|
|
91
|
-
return html.replace(/<[^>]+>/g,
|
|
75
|
+
return html.replace(/<[^>]+>/g, '');
|
|
92
76
|
}
|
|
93
77
|
|
|
94
78
|
function decodeEntities(text: string): string {
|
|
95
79
|
return text
|
|
96
|
-
.replace(/&amp;/g,
|
|
97
|
-
.replace(/&lt;/g,
|
|
98
|
-
.replace(/&gt;/g,
|
|
80
|
+
.replace(/&amp;/g, '&')
|
|
81
|
+
.replace(/&lt;/g, '<')
|
|
82
|
+
.replace(/&gt;/g, '>')
|
|
99
83
|
.replace(/&quot;/g, '"')
|
|
100
84
|
.replace(/&#39;/g, "'")
|
|
101
|
-
.replace(/&nbsp;/g,
|
|
102
|
-
.replace(/&#(\d+);/g, (_, n: string) =>
|
|
103
|
-
|
|
104
|
-
)
|
|
105
|
-
.replace(/&#x([0-9a-f]+);/gi, (_, n: string) =>
|
|
106
|
-
String.fromCharCode(parseInt(n, 16)),
|
|
107
|
-
);
|
|
85
|
+
.replace(/&nbsp;/g, ' ')
|
|
86
|
+
.replace(/&#(\d+);/g, (_, n: string) => String.fromCharCode(parseInt(n, 10)))
|
|
87
|
+
.replace(/&#x([0-9a-f]+);/gi, (_, n: string) => String.fromCharCode(parseInt(n, 16)));
|
|
108
88
|
}
|
|
109
89
|
|
|
110
|
-
export function extractHtmlContent(
|
|
111
|
-
html: string,
|
|
112
|
-
_url: string,
|
|
113
|
-
): { title: string; body: string } {
|
|
90
|
+
export function extractHtmlContent(html: string, _url: string): { title: string; body: string } {
|
|
114
91
|
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
|
|
115
|
-
const title = titleMatch
|
|
116
|
-
|
|
117
|
-
: "";
|
|
118
|
-
let contentHtml = "";
|
|
92
|
+
const title = titleMatch ? stripTags(decodeEntities(titleMatch[1])).trim() : '';
|
|
93
|
+
let contentHtml = '';
|
|
119
94
|
const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
|
|
120
95
|
const mainMatch = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
|
|
121
96
|
if (articleMatch) contentHtml = articleMatch[1];
|
|
@@ -136,7 +111,7 @@ export async function ingestUrl(
|
|
|
136
111
|
source?: string;
|
|
137
112
|
maxBodyLength?: number;
|
|
138
113
|
timeoutMs?: number;
|
|
139
|
-
} = {}
|
|
114
|
+
} = {}
|
|
140
115
|
): Promise<{
|
|
141
116
|
kind: string;
|
|
142
117
|
title: string;
|
|
@@ -145,13 +120,7 @@ export async function ingestUrl(
|
|
|
145
120
|
meta: Record<string, unknown>;
|
|
146
121
|
source: string;
|
|
147
122
|
}> {
|
|
148
|
-
const {
|
|
149
|
-
kind = "reference",
|
|
150
|
-
tags = [],
|
|
151
|
-
source,
|
|
152
|
-
maxBodyLength = 50000,
|
|
153
|
-
timeoutMs = 15000,
|
|
154
|
-
} = opts;
|
|
123
|
+
const { kind = 'reference', tags = [], source, maxBodyLength = 50000, timeoutMs = 15000 } = opts;
|
|
155
124
|
let domain: string;
|
|
156
125
|
try {
|
|
157
126
|
domain = new URL(url).hostname;
|
|
@@ -165,27 +134,22 @@ export async function ingestUrl(
|
|
|
165
134
|
response = await fetch(url, {
|
|
166
135
|
signal: controller.signal,
|
|
167
136
|
headers: {
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
Accept: "text/html,application/xhtml+xml,text/plain,*/*",
|
|
137
|
+
'User-Agent': 'ContextVault/1.0 (+https://github.com/fellanH/context-vault)',
|
|
138
|
+
Accept: 'text/html,application/xhtml+xml,text/plain,*/*',
|
|
171
139
|
},
|
|
172
140
|
});
|
|
173
141
|
} catch (err) {
|
|
174
|
-
if ((err as Error).name ===
|
|
142
|
+
if ((err as Error).name === 'AbortError')
|
|
175
143
|
throw new Error(`Request timed out after ${timeoutMs}ms`);
|
|
176
144
|
throw new Error(`Fetch failed: ${(err as Error).message}`);
|
|
177
145
|
} finally {
|
|
178
146
|
clearTimeout(timeout);
|
|
179
147
|
}
|
|
180
|
-
if (!response.ok)
|
|
181
|
-
|
|
182
|
-
const contentType = response.headers.get("content-type") || "";
|
|
148
|
+
if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
149
|
+
const contentType = response.headers.get('content-type') || '';
|
|
183
150
|
const html = await response.text();
|
|
184
151
|
let title: string, body: string;
|
|
185
|
-
if (
|
|
186
|
-
contentType.includes("text/html") ||
|
|
187
|
-
contentType.includes("application/xhtml")
|
|
188
|
-
) {
|
|
152
|
+
if (contentType.includes('text/html') || contentType.includes('application/xhtml')) {
|
|
189
153
|
const extracted = extractHtmlContent(html, url);
|
|
190
154
|
title = extracted.title;
|
|
191
155
|
body = extracted.body;
|
|
@@ -193,19 +157,18 @@ export async function ingestUrl(
|
|
|
193
157
|
title = domain;
|
|
194
158
|
body = html;
|
|
195
159
|
}
|
|
196
|
-
if (body.length > maxBodyLength)
|
|
197
|
-
|
|
198
|
-
if (!body.trim()) throw new Error("No readable content extracted from URL");
|
|
160
|
+
if (body.length > maxBodyLength) body = body.slice(0, maxBodyLength) + '\n\n[Content truncated]';
|
|
161
|
+
if (!body.trim()) throw new Error('No readable content extracted from URL');
|
|
199
162
|
return {
|
|
200
163
|
kind,
|
|
201
164
|
title: title || domain,
|
|
202
165
|
body,
|
|
203
|
-
tags: [...tags,
|
|
166
|
+
tags: [...tags, 'web-import'],
|
|
204
167
|
meta: {
|
|
205
168
|
url,
|
|
206
169
|
domain,
|
|
207
170
|
fetched_at: new Date().toISOString(),
|
|
208
|
-
content_type: contentType.split(
|
|
171
|
+
content_type: contentType.split(';')[0].trim() || 'text/html',
|
|
209
172
|
},
|
|
210
173
|
source: source || domain,
|
|
211
174
|
};
|
package/src/main.ts
CHANGED
|
@@ -13,7 +13,7 @@ export type {
|
|
|
13
13
|
ReindexStats,
|
|
14
14
|
BaseCtx,
|
|
15
15
|
SearchOptions,
|
|
16
|
-
} from
|
|
16
|
+
} from './types.js';
|
|
17
17
|
|
|
18
18
|
// Constants
|
|
19
19
|
export {
|
|
@@ -31,7 +31,7 @@ export {
|
|
|
31
31
|
MAX_IDENTITY_KEY_LENGTH,
|
|
32
32
|
DEFAULT_GROWTH_THRESHOLDS,
|
|
33
33
|
DEFAULT_LIFECYCLE,
|
|
34
|
-
} from
|
|
34
|
+
} from './constants.js';
|
|
35
35
|
|
|
36
36
|
// Categories
|
|
37
37
|
export {
|
|
@@ -40,10 +40,10 @@ export {
|
|
|
40
40
|
defaultTierFor,
|
|
41
41
|
CATEGORY_DIRS,
|
|
42
42
|
KIND_STALENESS_DAYS,
|
|
43
|
-
} from
|
|
43
|
+
} from './categories.js';
|
|
44
44
|
|
|
45
45
|
// Config
|
|
46
|
-
export { parseArgs, resolveConfig } from
|
|
46
|
+
export { parseArgs, resolveConfig } from './config.js';
|
|
47
47
|
|
|
48
48
|
// Files
|
|
49
49
|
export {
|
|
@@ -55,7 +55,7 @@ export {
|
|
|
55
55
|
kindToPath,
|
|
56
56
|
safeJoin,
|
|
57
57
|
walkDir,
|
|
58
|
-
} from
|
|
58
|
+
} from './files.js';
|
|
59
59
|
|
|
60
60
|
// Frontmatter
|
|
61
61
|
export {
|
|
@@ -63,10 +63,10 @@ export {
|
|
|
63
63
|
parseFrontmatter,
|
|
64
64
|
extractCustomMeta,
|
|
65
65
|
parseEntryFromMarkdown,
|
|
66
|
-
} from
|
|
66
|
+
} from './frontmatter.js';
|
|
67
67
|
|
|
68
68
|
// Formatters
|
|
69
|
-
export { formatBody } from
|
|
69
|
+
export { formatBody } from './formatters.js';
|
|
70
70
|
|
|
71
71
|
// Database
|
|
72
72
|
export {
|
|
@@ -77,18 +77,13 @@ export {
|
|
|
77
77
|
insertVec,
|
|
78
78
|
deleteVec,
|
|
79
79
|
testConnection,
|
|
80
|
-
} from
|
|
80
|
+
} from './db.js';
|
|
81
81
|
|
|
82
82
|
// Embeddings
|
|
83
|
-
export {
|
|
84
|
-
embed,
|
|
85
|
-
embedBatch,
|
|
86
|
-
resetEmbedPipeline,
|
|
87
|
-
isEmbedAvailable,
|
|
88
|
-
} from "./embed.js";
|
|
83
|
+
export { embed, embedBatch, resetEmbedPipeline, isEmbedAvailable } from './embed.js';
|
|
89
84
|
|
|
90
85
|
// Index (reindex + indexEntry)
|
|
91
|
-
export { indexEntry, reindex, pruneExpired } from
|
|
86
|
+
export { indexEntry, reindex, pruneExpired } from './index.js';
|
|
92
87
|
|
|
93
88
|
// Search (retrieve)
|
|
94
89
|
export {
|
|
@@ -99,10 +94,10 @@ export {
|
|
|
99
94
|
recencyDecayScore,
|
|
100
95
|
dotProduct,
|
|
101
96
|
reciprocalRankFusion,
|
|
102
|
-
} from
|
|
97
|
+
} from './search.js';
|
|
103
98
|
|
|
104
99
|
// Capture
|
|
105
|
-
export { writeEntry, updateEntryFile, captureAndIndex } from
|
|
100
|
+
export { writeEntry, updateEntryFile, captureAndIndex } from './capture.js';
|
|
106
101
|
|
|
107
102
|
// Ingest URL
|
|
108
|
-
export { htmlToMarkdown, extractHtmlContent, ingestUrl } from
|
|
103
|
+
export { htmlToMarkdown, extractHtmlContent, ingestUrl } from './ingest-url.js';
|
package/src/search.ts
CHANGED
|
@@ -1,17 +1,9 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
BaseCtx,
|
|
3
|
-
SearchResult,
|
|
4
|
-
SearchOptions,
|
|
5
|
-
VaultEntry,
|
|
6
|
-
} from "./types.js";
|
|
1
|
+
import type { BaseCtx, SearchResult, SearchOptions, VaultEntry } from './types.js';
|
|
7
2
|
|
|
8
3
|
const NEAR_DUP_THRESHOLD = 0.92;
|
|
9
4
|
const RRF_K = 60;
|
|
10
5
|
|
|
11
|
-
export function recencyDecayScore(
|
|
12
|
-
updatedAt: string | null | undefined,
|
|
13
|
-
decayRate = 0.05,
|
|
14
|
-
): number {
|
|
6
|
+
export function recencyDecayScore(updatedAt: string | null | undefined, decayRate = 0.05): number {
|
|
15
7
|
if (updatedAt == null) return 0.5;
|
|
16
8
|
const ageDays = (Date.now() - new Date(updatedAt).getTime()) / 86400000;
|
|
17
9
|
return Math.exp(-decayRate * ageDays);
|
|
@@ -26,22 +18,18 @@ export function dotProduct(a: Float32Array, b: Float32Array): number {
|
|
|
26
18
|
export function buildFtsQuery(query: string): string | null {
|
|
27
19
|
const words = query
|
|
28
20
|
.split(/[\s-]+/)
|
|
29
|
-
.map((w) => w.replace(/[*"():^~{}]/g,
|
|
21
|
+
.map((w) => w.replace(/[*"():^~{}]/g, ''))
|
|
30
22
|
.filter((w) => w.length > 0);
|
|
31
23
|
if (!words.length) return null;
|
|
32
24
|
if (words.length === 1) return `"${words[0]}"`;
|
|
33
|
-
const phrase = `"${words.join(
|
|
34
|
-
const near = `NEAR(${words.map((w) => `"${w}"`).join(
|
|
35
|
-
const and = words.map((w) => `"${w}"`).join(
|
|
25
|
+
const phrase = `"${words.join(' ')}"`;
|
|
26
|
+
const near = `NEAR(${words.map((w) => `"${w}"`).join(' ')}, 10)`;
|
|
27
|
+
const and = words.map((w) => `"${w}"`).join(' AND ');
|
|
36
28
|
return `${phrase} OR ${near} OR ${and}`;
|
|
37
29
|
}
|
|
38
30
|
|
|
39
|
-
export function recencyBoost(
|
|
40
|
-
|
|
41
|
-
category: string,
|
|
42
|
-
decayDays = 30,
|
|
43
|
-
): number {
|
|
44
|
-
if (category !== "event") return 1.0;
|
|
31
|
+
export function recencyBoost(createdAt: string, category: string, decayDays = 30): number {
|
|
32
|
+
if (category !== 'event') return 1.0;
|
|
45
33
|
const ageDays = (Date.now() - new Date(createdAt).getTime()) / 86400000;
|
|
46
34
|
return 1 / (1 + ageDays / decayDays);
|
|
47
35
|
}
|
|
@@ -62,30 +50,30 @@ export function buildFilterClauses({
|
|
|
62
50
|
const clauses: string[] = [];
|
|
63
51
|
const params: (string | number | null)[] = [];
|
|
64
52
|
if (categoryFilter) {
|
|
65
|
-
clauses.push(
|
|
53
|
+
clauses.push('e.category = ?');
|
|
66
54
|
params.push(categoryFilter);
|
|
67
55
|
}
|
|
68
56
|
if (excludeEvents && !categoryFilter) {
|
|
69
57
|
clauses.push("e.category != 'event'");
|
|
70
58
|
}
|
|
71
59
|
if (since) {
|
|
72
|
-
clauses.push(
|
|
60
|
+
clauses.push('e.created_at >= ?');
|
|
73
61
|
params.push(since);
|
|
74
62
|
}
|
|
75
63
|
if (until) {
|
|
76
|
-
clauses.push(
|
|
64
|
+
clauses.push('e.created_at <= ?');
|
|
77
65
|
params.push(until);
|
|
78
66
|
}
|
|
79
67
|
clauses.push("(e.expires_at IS NULL OR e.expires_at > datetime('now'))");
|
|
80
68
|
if (!includeSuperseeded) {
|
|
81
|
-
clauses.push(
|
|
69
|
+
clauses.push('e.superseded_by IS NULL');
|
|
82
70
|
}
|
|
83
71
|
return { clauses, params };
|
|
84
72
|
}
|
|
85
73
|
|
|
86
74
|
export function reciprocalRankFusion(
|
|
87
75
|
rankedLists: string[][],
|
|
88
|
-
k: number = RRF_K
|
|
76
|
+
k: number = RRF_K
|
|
89
77
|
): Map<string, number> {
|
|
90
78
|
const scores = new Map<string, number>();
|
|
91
79
|
for (const list of rankedLists) {
|
|
@@ -100,7 +88,7 @@ export function reciprocalRankFusion(
|
|
|
100
88
|
export async function hybridSearch(
|
|
101
89
|
ctx: BaseCtx,
|
|
102
90
|
query: string,
|
|
103
|
-
opts: SearchOptions = {}
|
|
91
|
+
opts: SearchOptions = {}
|
|
104
92
|
): Promise<SearchResult[]> {
|
|
105
93
|
const {
|
|
106
94
|
kindFilter = null,
|
|
@@ -131,27 +119,27 @@ export async function hybridSearch(
|
|
|
131
119
|
const ftsQuery = buildFtsQuery(query);
|
|
132
120
|
if (ftsQuery) {
|
|
133
121
|
try {
|
|
134
|
-
const whereParts = [
|
|
122
|
+
const whereParts = ['vault_fts MATCH ?'];
|
|
135
123
|
const ftsParams: (string | number | null)[] = [ftsQuery];
|
|
136
124
|
|
|
137
125
|
if (kindFilter) {
|
|
138
|
-
whereParts.push(
|
|
126
|
+
whereParts.push('e.kind = ?');
|
|
139
127
|
ftsParams.push(kindFilter);
|
|
140
128
|
}
|
|
141
129
|
whereParts.push(...extraFilters.clauses);
|
|
142
130
|
ftsParams.push(...extraFilters.params);
|
|
143
131
|
|
|
144
|
-
const ftsSQL = `SELECT e.*, rank FROM vault_fts f JOIN vault e ON f.rowid = e.rowid WHERE ${whereParts.join(
|
|
145
|
-
const rows = ctx.db
|
|
146
|
-
|
|
147
|
-
|
|
132
|
+
const ftsSQL = `SELECT e.*, rank FROM vault_fts f JOIN vault e ON f.rowid = e.rowid WHERE ${whereParts.join(' AND ')} ORDER BY rank LIMIT 15`;
|
|
133
|
+
const rows = ctx.db.prepare(ftsSQL).all(...ftsParams) as unknown as (VaultEntry & {
|
|
134
|
+
rank: number;
|
|
135
|
+
})[];
|
|
148
136
|
|
|
149
137
|
for (const { rank: _rank, ...row } of rows) {
|
|
150
138
|
ftsRankedIds.push(row.id);
|
|
151
139
|
if (!rowMap.has(row.id)) rowMap.set(row.id, row);
|
|
152
140
|
}
|
|
153
141
|
} catch (err) {
|
|
154
|
-
if (!(err as Error).message?.includes(
|
|
142
|
+
if (!(err as Error).message?.includes('fts5: syntax error')) {
|
|
155
143
|
console.error(`[retrieve] FTS search error: ${(err as Error).message}`);
|
|
156
144
|
}
|
|
157
145
|
}
|
|
@@ -162,7 +150,7 @@ export async function hybridSearch(
|
|
|
162
150
|
|
|
163
151
|
try {
|
|
164
152
|
const vecCount = (
|
|
165
|
-
ctx.db.prepare(
|
|
153
|
+
ctx.db.prepare('SELECT COUNT(*) as c FROM vault_vec').get() as {
|
|
166
154
|
c: number;
|
|
167
155
|
}
|
|
168
156
|
).c;
|
|
@@ -172,17 +160,15 @@ export async function hybridSearch(
|
|
|
172
160
|
const vecLimit = kindFilter ? 30 : 15;
|
|
173
161
|
const vecRows = ctx.db
|
|
174
162
|
.prepare(
|
|
175
|
-
`SELECT v.rowid, v.distance FROM vault_vec v WHERE embedding MATCH ? ORDER BY distance LIMIT
|
|
163
|
+
`SELECT v.rowid, v.distance FROM vault_vec v WHERE embedding MATCH ? ORDER BY distance LIMIT ?`
|
|
176
164
|
)
|
|
177
165
|
.all(queryVec, vecLimit) as { rowid: number; distance: number }[];
|
|
178
166
|
|
|
179
167
|
if (vecRows.length) {
|
|
180
168
|
const rowids = vecRows.map((vr) => vr.rowid);
|
|
181
|
-
const placeholders = rowids.map(() =>
|
|
169
|
+
const placeholders = rowids.map(() => '?').join(',');
|
|
182
170
|
const hydrated = ctx.db
|
|
183
|
-
.prepare(
|
|
184
|
-
`SELECT rowid, * FROM vault WHERE rowid IN (${placeholders})`,
|
|
185
|
-
)
|
|
171
|
+
.prepare(`SELECT rowid, * FROM vault WHERE rowid IN (${placeholders})`)
|
|
186
172
|
.all(...rowids) as unknown as (VaultEntry & { rowid: number })[];
|
|
187
173
|
|
|
188
174
|
const byRowid = new Map<number, VaultEntry & { rowid: number }>();
|
|
@@ -193,11 +179,10 @@ export async function hybridSearch(
|
|
|
193
179
|
if (!row) continue;
|
|
194
180
|
if (kindFilter && row.kind !== kindFilter) continue;
|
|
195
181
|
if (categoryFilter && row.category !== categoryFilter) continue;
|
|
196
|
-
if (excludeEvents && row.category ===
|
|
182
|
+
if (excludeEvents && row.category === 'event') continue;
|
|
197
183
|
if (since && row.created_at < since) continue;
|
|
198
184
|
if (until && row.created_at > until) continue;
|
|
199
|
-
if (row.expires_at && new Date(row.expires_at) <= new Date())
|
|
200
|
-
continue;
|
|
185
|
+
if (row.expires_at && new Date(row.expires_at) <= new Date()) continue;
|
|
201
186
|
|
|
202
187
|
const { rowid: _rowid, ...cleanRow } = row;
|
|
203
188
|
idToRowid.set(cleanRow.id, Number(row.rowid));
|
|
@@ -212,10 +197,8 @@ export async function hybridSearch(
|
|
|
212
197
|
}
|
|
213
198
|
}
|
|
214
199
|
} catch (err) {
|
|
215
|
-
if (!(err as Error).message?.includes(
|
|
216
|
-
console.error(
|
|
217
|
-
`[retrieve] Vector search error: ${(err as Error).message}`,
|
|
218
|
-
);
|
|
200
|
+
if (!(err as Error).message?.includes('no such table')) {
|
|
201
|
+
console.error(`[retrieve] Vector search error: ${(err as Error).message}`);
|
|
219
202
|
}
|
|
220
203
|
}
|
|
221
204
|
|
|
@@ -241,20 +224,15 @@ export async function hybridSearch(
|
|
|
241
224
|
|
|
242
225
|
const rowidsToFetch = [...idToRowid.values()];
|
|
243
226
|
try {
|
|
244
|
-
const placeholders = rowidsToFetch.map(() =>
|
|
227
|
+
const placeholders = rowidsToFetch.map(() => '?').join(',');
|
|
245
228
|
const vecData = ctx.db
|
|
246
|
-
.prepare(
|
|
247
|
-
`SELECT rowid, embedding FROM vault_vec WHERE rowid IN (${placeholders})`,
|
|
248
|
-
)
|
|
229
|
+
.prepare(`SELECT rowid, embedding FROM vault_vec WHERE rowid IN (${placeholders})`)
|
|
249
230
|
.all(...rowidsToFetch) as { rowid: number; embedding: Buffer }[];
|
|
250
231
|
for (const row of vecData) {
|
|
251
232
|
const id = rowidToId.get(Number(row.rowid));
|
|
252
233
|
const buf = row.embedding;
|
|
253
234
|
if (id && buf) {
|
|
254
|
-
embeddingMap.set(
|
|
255
|
-
id,
|
|
256
|
-
new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4),
|
|
257
|
-
);
|
|
235
|
+
embeddingMap.set(id, new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4));
|
|
258
236
|
}
|
|
259
237
|
}
|
|
260
238
|
} catch {
|
|
@@ -292,10 +270,10 @@ export async function hybridSearch(
|
|
|
292
270
|
function trackAccess(ctx: BaseCtx, entries: SearchResult[]): void {
|
|
293
271
|
if (!entries.length) return;
|
|
294
272
|
try {
|
|
295
|
-
const placeholders = entries.map(() =>
|
|
273
|
+
const placeholders = entries.map(() => '?').join(',');
|
|
296
274
|
ctx.db
|
|
297
275
|
.prepare(
|
|
298
|
-
`UPDATE vault SET hit_count = hit_count + 1, last_accessed_at = datetime('now') WHERE id IN (${placeholders})
|
|
276
|
+
`UPDATE vault SET hit_count = hit_count + 1, last_accessed_at = datetime('now') WHERE id IN (${placeholders})`
|
|
299
277
|
)
|
|
300
278
|
.run(...entries.map((e) => e.id));
|
|
301
279
|
} catch {
|
package/src/types.ts
CHANGED