mcp-astgl-knowledge 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +146 -38
- package/data/knowledge.db +0 -0
- package/dist/alerts.d.ts +22 -0
- package/dist/alerts.js +433 -0
- package/dist/alerts.js.map +1 -0
- package/dist/citation-test.d.ts +14 -0
- package/dist/citation-test.js +298 -0
- package/dist/citation-test.js.map +1 -0
- package/dist/daily-report.d.ts +15 -0
- package/dist/daily-report.js +441 -0
- package/dist/daily-report.js.map +1 -0
- package/dist/discover.js +3 -1
- package/dist/discover.js.map +1 -1
- package/dist/freshness.d.ts +20 -0
- package/dist/freshness.js +508 -0
- package/dist/freshness.js.map +1 -0
- package/dist/index.d.ts +6 -1
- package/dist/index.js +253 -14
- package/dist/index.js.map +1 -1
- package/dist/ingest-projects.d.ts +16 -0
- package/dist/ingest-projects.js +196 -0
- package/dist/ingest-projects.js.map +1 -0
- package/dist/knowledge-db.d.ts +13 -0
- package/dist/knowledge-db.js +156 -0
- package/dist/knowledge-db.js.map +1 -0
- package/dist/pipeline.d.ts +12 -0
- package/dist/pipeline.js +83 -0
- package/dist/pipeline.js.map +1 -0
- package/dist/query-log.d.ts +15 -0
- package/dist/query-log.js +93 -0
- package/dist/query-log.js.map +1 -0
- package/dist/rate-limit.d.ts +34 -0
- package/dist/rate-limit.js +206 -0
- package/dist/rate-limit.js.map +1 -0
- package/dist/related-articles.d.ts +15 -0
- package/dist/related-articles.js +217 -0
- package/dist/related-articles.js.map +1 -0
- package/dist/search.d.ts +13 -4
- package/dist/search.js +274 -39
- package/dist/search.js.map +1 -1
- package/dist/structure.d.ts +11 -0
- package/dist/structure.js +451 -0
- package/dist/structure.js.map +1 -0
- package/dist/types.d.ts +65 -0
- package/dist/types.js.map +1 -1
- package/package.json +10 -2
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
#!/usr/bin/env tsx
|
|
2
|
+
/**
|
|
3
|
+
* Internal linking automation via vector similarity.
|
|
4
|
+
*
|
|
5
|
+
* WHAT: Computes pairwise article similarity and injects related article links
|
|
6
|
+
* WHY: Cross-referencing boosts SEO, AI discoverability, and reader engagement
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* npm run related Compute + print JSON map
|
|
10
|
+
* npm run related -- --inject Also inject into Astro markdown frontmatter
|
|
11
|
+
* npm run related -- --top 3 Number of related articles per article (default: 3)
|
|
12
|
+
*
|
|
13
|
+
* Requires: Ollama running with nomic-embed-text
|
|
14
|
+
*/
|
|
15
|
+
import { join } from "path";
|
|
16
|
+
import { existsSync, readFileSync, writeFileSync, readdirSync } from "fs";
|
|
17
|
+
import Database from "better-sqlite3";
|
|
18
|
+
const DATA_DIR = join(import.meta.dirname, "..", "data");
|
|
19
|
+
const KNOWLEDGE_PATH = join(DATA_DIR, "knowledge.db");
|
|
20
|
+
const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
|
|
21
|
+
const EMBED_MODEL = process.env.EMBED_MODEL || "nomic-embed-text";
|
|
22
|
+
const ASTRO_ANSWERS_DIR = process.env.ASTRO_ANSWERS_DIR ||
|
|
23
|
+
join(import.meta.dirname, "..", "..", "astgl-site", "src", "content", "answers");
|
|
24
|
+
const BASE_URL = "https://astgl.ai/answers";
|
|
25
|
+
// --- Embedding ---
|
|
26
|
+
/**
 * Embed a batch of texts with a single Ollama `/api/embed` request.
 *
 * WHAT: Sends every text in one call and returns the embeddings Ollama reports.
 * WHY: One API call for all articles is faster than one call per article.
 *
 * @param {string[]} texts - Texts to embed.
 * @returns {Promise<number[][]>} One embedding per input text (expected to be in input order).
 * @throws {Error} If Ollama answers with a non-2xx status, or if the payload does
 *   not contain exactly one embedding per input.
 */
async function embedTexts(texts) {
    // Nothing to embed: skip the network round trip entirely.
    if (texts.length === 0) {
        return [];
    }
    const resp = await fetch(`${OLLAMA_URL}/api/embed`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ model: EMBED_MODEL, input: texts }),
    });
    if (!resp.ok) {
        throw new Error(`Ollama embed failed: ${resp.status} ${await resp.text()}`);
    }
    const data = await resp.json();
    const embeddings = data?.embeddings;
    // Callers index the result by article position, so a short or missing
    // array must fail loudly here instead of surfacing later as NaN scores.
    if (!Array.isArray(embeddings) || embeddings.length !== texts.length) {
        const received = Array.isArray(embeddings) ? embeddings.length : 0;
        throw new Error(`Ollama embed returned ${received} embeddings for ${texts.length} inputs`);
    }
    return embeddings;
}
|
|
40
|
+
// WHAT: Cosine similarity between two vectors
|
|
41
|
+
// WHY: Standard similarity metric for embedding comparison (same as sqlite-vec uses)
|
|
42
|
+
/**
 * Cosine similarity between two numeric vectors.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector; must have the same length as `a`.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero magnitude.
 * @throws {RangeError} If the vectors have different lengths.
 */
function cosineSimilarity(a, b) {
    // A length mismatch would read `undefined` components and yield NaN,
    // which silently corrupts any sort that uses the score.
    if (a.length !== b.length) {
        throw new RangeError(`Cannot compare vectors of different lengths (${a.length} vs ${b.length})`);
    }
    let dot = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    // A zero vector has no direction; report "no similarity" instead of 0/0 = NaN.
    return denominator === 0 ? 0 : dot / denominator;
}
|
|
53
|
+
// --- Knowledge DB: Store related articles ---
|
|
54
|
+
/**
 * Persist the related-article map into the `article_related` table.
 *
 * WHAT: Ensures the table and its lookup index exist, then replaces every row
 *       with the links in `relatedMap` inside a single transaction.
 * WHY: The MCP server can serve related articles alongside search results.
 *
 * @param {object} db - Database handle exposing `exec`, `prepare` and `transaction`.
 * @param {Object<string, {url: string, related: Array<{slug: string, title: string, score: number}>}>} relatedMap
 *   Related links keyed by article slug.
 */
function storeRelatedInDb(db, relatedMap) {
    db.exec(`
    CREATE TABLE IF NOT EXISTS article_related (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      article_url TEXT NOT NULL,
      related_url TEXT NOT NULL,
      related_title TEXT NOT NULL,
      similarity_score REAL NOT NULL,
      rank INTEGER NOT NULL,
      UNIQUE(article_url, related_url)
    )
  `);
    db.exec("CREATE INDEX IF NOT EXISTS idx_related_article ON article_related(article_url)");
    const insertLink = db.prepare(`INSERT OR REPLACE INTO article_related (article_url, related_url, related_title, similarity_score, rank)
     VALUES (?, ?, ?, ?, ?)`);
    const replaceAllLinks = db.transaction(() => {
        // Start from a clean slate so links to removed articles do not linger.
        db.prepare("DELETE FROM article_related").run();
        Object.values(relatedMap).forEach((entry) => {
            entry.related.forEach((rel, position) => {
                // rank is 1-based: the most similar article gets rank 1.
                insertLink.run(entry.url, `${BASE_URL}/${rel.slug}`, rel.title, rel.score, position + 1);
            });
        });
    });
    replaceAllLinks();
}
|
|
84
|
+
// --- Astro Frontmatter Injection ---
|
|
85
|
+
// WHAT: Parse YAML frontmatter from a markdown file
|
|
86
|
+
// WHY: Need to read existing frontmatter, add/update `related` field, write back
|
|
87
|
+
/**
 * Rewrite a markdown file so its YAML frontmatter carries a fresh `related:` list.
 *
 * WHAT: Drops any existing top-level `related:` key (and the list lines under it),
 *       then appends a new `related:` block at the end of the frontmatter.
 * WHY: The previous regex-based removal used `\z` (not an anchor in JavaScript, where
 *      it matches a literal "z") and required a trailing newline on every list line,
 *      so re-running the tool could leave fragments of the old block behind.
 *
 * @param {string} filePath - Path of the markdown file to update in place.
 * @param {Array<{slug: string, title: string}>} related - Links to write, in rank order.
 * @returns {boolean} true when the file was rewritten, false when it has no frontmatter.
 */
function injectRelatedFrontmatter(filePath, related) {
    const content = readFileSync(filePath, "utf-8");
    // Keep whichever line-ending convention the file already uses.
    const eol = content.includes("\r\n") ? "\r\n" : "\n";
    // Split frontmatter from body
    const fmMatch = content.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/);
    if (!fmMatch) {
        console.error(` Skipping ${filePath}: no frontmatter found`);
        return false;
    }
    const body = fmMatch[2];
    // Walk the frontmatter line by line and drop the old `related:` block.
    const keptLines = [];
    let insideRelated = false;
    for (const line of fmMatch[1].split(/\r?\n/)) {
        // Anchored at line start so keys such as `unrelated:` are left alone.
        if (/^related:/.test(line)) {
            insideRelated = true;
            continue;
        }
        if (insideRelated) {
            // The block continues over list items, indented lines and blank lines;
            // the first line starting with anything else is the next top-level key.
            if (line === "" || /^[\s-]/.test(line)) {
                continue;
            }
            insideRelated = false;
        }
        keptLines.push(line);
    }
    const head = keptLines.join(eol).trimEnd();
    // JSON string syntax is valid YAML double-quoted syntax and escapes quotes,
    // backslashes and control characters. The two-space indent aligns `title`
    // under `slug` so each list item is a single mapping.
    const relatedBlock = related.length === 0
        ? "related: []"
        : `related:${eol}${related
            .map((r) => `- slug: ${r.slug}${eol}  title: ${JSON.stringify(r.title)}`)
            .join(eol)}`;
    const frontmatter = head === "" ? relatedBlock : `${head}${eol}${relatedBlock}`;
    const updated = `---${eol}${frontmatter}${eol}---${eol}${body}`;
    writeFileSync(filePath, updated, "utf-8");
    return true;
}
|
|
112
|
+
// --- CLI ---
|
|
113
|
+
/**
 * Parse the command-line flags of the related-articles tool.
 *
 * Recognised flags:
 *   --inject   also write the related links into the Astro frontmatter
 *   --top N    number of related articles per article (default: 3)
 *
 * @returns {{inject: boolean, top: number}} `top` is always a positive integer.
 */
function parseArgs() {
    const DEFAULT_TOP = 3;
    const args = process.argv.slice(2);
    let top = DEFAULT_TOP;
    const topIdx = args.indexOf("--top");
    if (topIdx >= 0 && args[topIdx + 1] !== undefined) {
        const parsed = Number.parseInt(args[topIdx + 1], 10);
        // NaN, 0 and negative values would make `slice(0, top)` return nothing
        // (or everything but the tail), so fall back to the default instead.
        if (Number.isInteger(parsed) && parsed > 0) {
            top = parsed;
        }
        else {
            console.error(`Ignoring invalid --top value "${args[topIdx + 1]}"; using ${DEFAULT_TOP}`);
        }
    }
    return {
        inject: args.includes("--inject"),
        top,
    };
}
|
|
123
|
+
/**
 * CLI entry point: compute related articles and publish them.
 *
 * Steps:
 *   1. Load the astgl.ai articles from knowledge.db.
 *   2. Embed "title. description" for every article in one batch.
 *   3. Rank every other article by cosine similarity and keep the top N.
 *   4. Store the links in knowledge.db.
 *   5. With --inject, write the links into the Astro markdown frontmatter.
 *   6. Print the full map as JSON on stdout (progress goes to stderr).
 *
 * Exits the process with code 1 when knowledge.db is missing and with code 0
 * when there are no articles to process.
 *
 * @returns {Promise<void>}
 * @throws {Error} If embedding fails or returns a different number of vectors
 *   than there are articles.
 */
async function main() {
    const { inject, top } = parseArgs();
    console.error("=== Related Articles Generator ===\n");
    if (!existsSync(KNOWLEDGE_PATH)) {
        console.error("knowledge.db not found. Run 'npm run ingest' first.");
        process.exit(1);
    }
    const db = new Database(KNOWLEDGE_PATH);
    const relatedMap = {};
    try {
        // WHAT: Only process astgl.ai canonical articles (not Substack mirrors)
        // WHY: These are the articles we control and can inject links into
        const articles = db
            .prepare(`SELECT title, description, url, slug
         FROM articles
         WHERE url LIKE 'https://astgl.ai/answers/%'
         ORDER BY rowid`)
            .all();
        console.error(`Found ${articles.length} astgl.ai articles to process\n`);
        if (articles.length === 0) {
            console.error("No articles found.");
            db.close();
            process.exit(0);
        }
        // Step 1: Embed all articles (title + description)
        console.error("Embedding articles...");
        // Skip missing parts so a NULL description is not embedded as the text "null".
        const texts = articles.map((a) => [a.title, a.description].filter(Boolean).join(". "));
        const embeddings = await embedTexts(texts);
        if (!Array.isArray(embeddings) || embeddings.length !== articles.length) {
            const received = Array.isArray(embeddings) ? embeddings.length : 0;
            throw new Error(`Expected ${articles.length} embeddings but received ${received}`);
        }
        console.error(` ${embeddings.length} embeddings generated\n`);
        // Step 2: Compute pairwise similarity
        console.error("Computing pairwise similarity...");
        let relationshipCount = 0;
        for (let i = 0; i < articles.length; i++) {
            const article = articles[i];
            const similarities = [];
            for (let j = 0; j < articles.length; j++) {
                if (i === j) {
                    continue;
                }
                // Round to 3 decimals to keep the stored and printed scores compact.
                const score = Math.round(cosineSimilarity(embeddings[i], embeddings[j]) * 1000) / 1000;
                similarities.push({ index: j, score });
            }
            // Sort by similarity descending, take top N
            similarities.sort((a, b) => b.score - a.score);
            const topRelated = similarities.slice(0, top).map((s) => ({
                slug: articles[s.index].slug,
                title: articles[s.index].title,
                score: s.score,
            }));
            relatedMap[article.slug] = {
                title: article.title,
                url: article.url,
                related: topRelated,
            };
            // Count what is actually linked: with few articles there are fewer
            // than `top` candidates per article.
            relationshipCount += topRelated.length;
            console.error(` ${article.slug}: ${topRelated.map((r) => `${r.slug} (${r.score})`).join(", ")}`);
        }
        // Step 3: Store in knowledge.db
        console.error("\nStoring in knowledge.db...");
        storeRelatedInDb(db, relatedMap);
        console.error(` ${relationshipCount} relationships stored`);
    }
    finally {
        // Release the database handle even when embedding or storing throws.
        db.close();
    }
    // Step 4: Inject into Astro frontmatter (optional)
    if (inject) {
        console.error("\nInjecting into Astro frontmatter...");
        if (!existsSync(ASTRO_ANSWERS_DIR)) {
            console.error(` Astro answers dir not found: ${ASTRO_ANSWERS_DIR}`);
            console.error(" Set ASTRO_ANSWERS_DIR env var to override.");
        }
        else {
            const mdFiles = readdirSync(ASTRO_ANSWERS_DIR).filter((f) => f.endsWith(".md"));
            let injected = 0;
            for (const file of mdFiles) {
                // Markdown files are matched to articles by file name = slug.
                const slug = file.replace(/\.md$/, "");
                const entry = relatedMap[slug];
                if (!entry) {
                    console.error(` ${slug}: not in knowledge base, skipping`);
                    continue;
                }
                const filePath = join(ASTRO_ANSWERS_DIR, file);
                if (injectRelatedFrontmatter(filePath, entry.related)) {
                    console.error(` ${slug}: injected ${entry.related.length} related links`);
                    injected++;
                }
            }
            console.error(`\n ${injected} files updated`);
        }
    }
    // Output JSON map to stdout
    console.log(JSON.stringify(relatedMap, null, 2));
    console.error("\n=== Done ===");
}
|
|
211
|
+
// Run the CLI and turn its outcome into an explicit exit code:
// 0 when main() resolves, 1 (after logging the error) when it rejects.
(async () => {
    try {
        await main();
        process.exit(0);
    }
    catch (err) {
        console.error("Related articles failed:", err);
        process.exit(1);
    }
})();
|
|
217
|
+
//# sourceMappingURL=related-articles.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"related-articles.js","sourceRoot":"","sources":["../src/related-articles.ts"],"names":[],"mappings":";AACA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,IAAI,CAAC;AAC1E,OAAO,QAAQ,MAAM,gBAAgB,CAAC;AAEtC,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,MAAM,CAAC,CAAC;AACzD,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;AACtD,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,wBAAwB,CAAC;AACtE,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,kBAAkB,CAAC;AAClE,MAAM,iBAAiB,GACrB,OAAO,CAAC,GAAG,CAAC,iBAAiB;IAC7B,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,YAAY,EAAE,KAAK,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;AACnF,MAAM,QAAQ,GAAG,0BAA0B,CAAC;AAuB5C,oBAAoB;AACpB,KAAK,UAAU,UAAU,CAAC,KAAe;IACvC,yEAAyE;IACzE,oEAAoE;IACpE,MAAM,IAAI,GAAG,MAAM,KAAK,CAAC,GAAG,UAAU,YAAY,EAAE;QAClD,MAAM,EAAE,MAAM;QACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;QAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,WAAW,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;KAC3D,CAAC,CAAC;IAEH,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC;QACb,MAAM,IAAI,KAAK,CAAC,wBAAwB,IAAI,CAAC,MAAM,IAAI,MAAM,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;IAC9E,CAAC;IAED,MAAM,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,EAAE,CAA+B,CAAC;IAC/D,OAAO,IAAI,CAAC,UAAU,CAAC;AACzB,CAAC;AAED,8CAA8C;AAC9C,qFAAqF;AACrF,SAAS,gBAAgB,CAAC,CAAW,EAAE,CAAW;IAChD,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACnB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IACD,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;AACrD,CAAC;AAED,+CAA+C;AAC/C,SAAS,gBAAgB,CACvB,EAAiC,EACjC,UAAsB;IAEtB,oDAAoD;IACpD,sEAAsE;IACtE,EAAE,CAAC,IAAI,CAAC;;;;;;;;;;GAUP,CAAC,CAAC;IACH,EAAE,CAAC,IAAI,CA
CL,gFAAgF,CACjF,CAAC;IAEF,MAAM,MAAM,GAAG,EAAE,CAAC,OAAO,CACvB;4BACwB,CACzB,CAAC;IAEF,MAAM,SAAS,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,EAAE;QACpC,sBAAsB;QACtB,EAAE,CAAC,OAAO,CAAC,6BAA6B,CAAC,CAAC,GAAG,EAAE,CAAC;QAEhD,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;YACvD,MAAM,UAAU,GAAG,KAAK,CAAC,GAAG,CAAC;YAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC9C,MAAM,GAAG,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;gBAC7B,MAAM,CAAC,GAAG,CACR,UAAU,EACV,GAAG,QAAQ,IAAI,GAAG,CAAC,IAAI,EAAE,EACzB,GAAG,CAAC,KAAK,EACT,GAAG,CAAC,KAAK,EACT,CAAC,GAAG,CAAC,CACN,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,SAAS,EAAE,CAAC;AACd,CAAC;AAED,sCAAsC;AACtC,oDAAoD;AACpD,iFAAiF;AACjF,SAAS,wBAAwB,CAC/B,QAAgB,EAChB,OAAuB;IAEvB,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAEhD,8BAA8B;IAC9B,MAAM,OAAO,GAAG,OAAO,CAAC,KAAK,CAAC,mCAAmC,CAAC,CAAC;IACnE,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CAAC,cAAc,QAAQ,wBAAwB,CAAC,CAAC;QAC9D,OAAO,KAAK,CAAC;IACf,CAAC;IAED,IAAI,WAAW,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;IAExB,8CAA8C;IAC9C,WAAW,GAAG,WAAW,CAAC,OAAO,CAC/B,iDAAiD,EACjD,EAAE,CACH,CAAC;IACF,mDAAmD;IACnD,WAAW,GAAG,WAAW,CAAC,OAAO,CAC/B,kDAAkD,EAClD,EAAE,CACH,CAAC;IACF,6BAA6B;IAC7B,WAAW,GAAG,WAAW,CAAC,OAAO,EAAE,CAAC;IAEpC,+BAA+B;IAC/B,MAAM,WAAW,GAAG,OAAO;SACxB,GAAG,CACF,CAAC,CAAC,EAAE,EAAE,CACJ,WAAW,CAAC,CAAC,IAAI,eAAe,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,GAAG,CAClE;SACA,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,WAAW,IAAI,eAAe,WAAW,EAAE,CAAC;IAE5C,MAAM,OAAO,GAAG,QAAQ,WAAW,UAAU,IAAI,EAAE,CAAC;IACpD,aAAa,CAAC,QAAQ,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;IAC1C,OAAO,IAAI,CAAC;AACd,CAAC;AAED,cAAc;AACd,SAAS,SAAS;IAChB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACnC,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC;QACjC,GAAG,EAAE,CAAC,GAAG,EAAE;YACT,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAClC,OAAO,GAAG,IAAI,CAAC,IAAI,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,IAA
I,CAAC,GAAG,GAAG,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACrE,CAAC,CAAC,EAAE;KACL,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,IAAI;IACjB,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,SAAS,EAAE,CAAC;IAEpC,OAAO,CAAC,KAAK,CAAC,sCAAsC,CAAC,CAAC;IAEtD,IAAI,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;QAChC,OAAO,CAAC,KAAK,CAAC,qDAAqD,CAAC,CAAC;QACrE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,EAAE,GAAG,IAAI,QAAQ,CAAC,cAAc,CAAC,CAAC;IAExC,wEAAwE;IACxE,mEAAmE;IACnE,MAAM,QAAQ,GAAG,EAAE;SAChB,OAAO,CACN;;;sBAGgB,CACjB;SACA,GAAG,EAAmB,CAAC;IAE1B,OAAO,CAAC,KAAK,CAAC,SAAS,QAAQ,CAAC,MAAM,iCAAiC,CAAC,CAAC;IAEzE,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;QACpC,EAAE,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,mDAAmD;IACnD,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;IACvC,MAAM,KAAK,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;IAClE,MAAM,UAAU,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,CAAC;IAC3C,OAAO,CAAC,KAAK,CAAC,KAAK,UAAU,CAAC,MAAM,yBAAyB,CAAC,CAAC;IAE/D,sCAAsC;IACtC,OAAO,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;IAClD,MAAM,UAAU,GAAe,EAAE,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACzC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,YAAY,GAA4C,EAAE,CAAC;QAEjE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACzC,IAAI,CAAC,KAAK,CAAC;gBAAE,SAAS;YACtB,MAAM,KAAK,GACT,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;YAC3E,YAAY,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QACzC,CAAC;QAED,4CAA4C;QAC5C,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAC/C,MAAM,UAAU,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACxD,IAAI,EAAE,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI;YAC5B,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,KAAK;YAC9B,KAAK,EAAE,CAAC,CAAC,KAA
K;SACf,CAAC,CAAC,CAAC;QAEJ,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG;YACzB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,OAAO,EAAE,UAAU;SACpB,CAAC;QAEF,OAAO,CAAC,KAAK,CACX,KAAK,OAAO,CAAC,IAAI,KAAK,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACnF,CAAC;IACJ,CAAC;IAED,gCAAgC;IAChC,OAAO,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAC9C,gBAAgB,CAAC,EAAE,EAAE,UAAU,CAAC,CAAC;IACjC,OAAO,CAAC,KAAK,CAAC,KAAK,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,GAAG,GAAG,uBAAuB,CAAC,CAAC;IAEhF,EAAE,CAAC,KAAK,EAAE,CAAC;IAEX,mDAAmD;IACnD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,CAAC,KAAK,CAAC,uCAAuC,CAAC,CAAC;QAEvD,IAAI,CAAC,UAAU,CAAC,iBAAiB,CAAC,EAAE,CAAC;YACnC,OAAO,CAAC,KAAK,CAAC,kCAAkC,iBAAiB,EAAE,CAAC,CAAC;YACrE,OAAO,CAAC,KAAK,CAAC,8CAA8C,CAAC,CAAC;QAChE,CAAC;aAAM,CAAC;YACN,MAAM,OAAO,GAAG,WAAW,CAAC,iBAAiB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAC1D,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAClB,CAAC;YAEF,IAAI,QAAQ,GAAG,CAAC,CAAC;YACjB,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;gBAC3B,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;gBACvC,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;gBAC/B,IAAI,CAAC,KAAK,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CAAC,KAAK,IAAI,mCAAmC,CAAC,CAAC;oBAC5D,SAAS;gBACX,CAAC;gBAED,MAAM,QAAQ,GAAG,IAAI,CAAC,iBAAiB,EAAE,IAAI,CAAC,CAAC;gBAC/C,IAAI,wBAAwB,CAAC,QAAQ,EAAE,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;oBACtD,OAAO,CAAC,KAAK,CAAC,KAAK,IAAI,cAAc,KAAK,CAAC,OAAO,CAAC,MAAM,gBAAgB,CAAC,CAAC;oBAC3E,QAAQ,EAAE,CAAC;gBACb,CAAC;YACH,CAAC;YACD,OAAO,CAAC,KAAK,CAAC,OAAO,QAAQ,gBAAgB,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IAED,4BAA4B;IAC5B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEjD,OAAO,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;AAClC,CAAC;AAED,IAAI,EAAE;KACH,IAAI,CAAC,GAAG,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;KAC3B,KAAK,CAAC,CAAC,GAAG,EAAE,EAAE;IACb,OAAO,CAAC,KAAK,CAAC,0BAA0B,EAAE,GAAG,CAAC,CAAC;IAC/C,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
|
package/dist/search.d.ts
CHANGED
|
@@ -2,10 +2,19 @@
|
|
|
2
2
|
* Vector search logic against the pre-built knowledge.db.
|
|
3
3
|
* At runtime, queries are embedded via Ollama and matched against stored vectors.
|
|
4
4
|
*
|
|
5
|
-
* WHAT: Provides search_articles, get_answer, and
|
|
5
|
+
* WHAT: Provides search_articles, get_answer, list_topics, and related functions
|
|
6
6
|
* WHY: Separating search logic from MCP server keeps concerns clean
|
|
7
|
+
*
|
|
8
|
+
* Performance:
|
|
9
|
+
* - LRU cache on embeddings (200 entries) avoids redundant Ollama calls
|
|
10
|
+
* - Ollama calls have 10s timeout + 1 retry with 500ms delay
|
|
11
|
+
* - Prepared statements cached at module level
|
|
12
|
+
* - listTopics uses GROUP_CONCAT instead of in-memory join
|
|
7
13
|
*/
|
|
8
|
-
import { SearchResult, AnswerResult, TopicEntry } from "./types.js";
|
|
9
|
-
export declare function searchArticles(query: string, limit?: number): Promise<SearchResult[]>;
|
|
10
|
-
export declare function getAnswer(question: string): Promise<AnswerResult>;
|
|
14
|
+
import { SearchResult, AnswerResult, TopicEntry, TutorialResult, ComparisonResult, LatestArticle } from "./types.js";
|
|
15
|
+
export declare function searchArticles(query: string, limit?: number, contentType?: string): Promise<SearchResult[]>;
|
|
16
|
+
export declare function getAnswer(question: string, contentType?: string): Promise<AnswerResult>;
|
|
17
|
+
export declare function getTutorial(query: string): Promise<TutorialResult>;
|
|
18
|
+
export declare function compareTopics(topicA: string, topicB: string): Promise<ComparisonResult>;
|
|
19
|
+
export declare function getLatest(limit?: number): LatestArticle[];
|
|
11
20
|
export declare function listTopics(): TopicEntry[];
|
package/dist/search.js
CHANGED
|
@@ -2,8 +2,14 @@
|
|
|
2
2
|
* Vector search logic against the pre-built knowledge.db.
|
|
3
3
|
* At runtime, queries are embedded via Ollama and matched against stored vectors.
|
|
4
4
|
*
|
|
5
|
-
* WHAT: Provides search_articles, get_answer, and
|
|
5
|
+
* WHAT: Provides search_articles, get_answer, list_topics, and related functions
|
|
6
6
|
* WHY: Separating search logic from MCP server keeps concerns clean
|
|
7
|
+
*
|
|
8
|
+
* Performance:
|
|
9
|
+
* - LRU cache on embeddings (200 entries) avoids redundant Ollama calls
|
|
10
|
+
* - Ollama calls have 10s timeout + 1 retry with 500ms delay
|
|
11
|
+
* - Prepared statements cached at module level
|
|
12
|
+
* - listTopics uses GROUP_CONCAT instead of in-memory join
|
|
7
13
|
*/
|
|
8
14
|
import { join } from "path";
|
|
9
15
|
import { existsSync } from "fs";
|
|
@@ -12,6 +18,7 @@ import * as sqliteVec from "sqlite-vec";
|
|
|
12
18
|
const DB_PATH = join(import.meta.dirname, "..", "data", "knowledge.db");
|
|
13
19
|
const OLLAMA_URL = process.env.OLLAMA_URL || "http://localhost:11434";
|
|
14
20
|
const EMBED_MODEL = process.env.EMBED_MODEL || "nomic-embed-text";
|
|
21
|
+
// --- Database Connection (lazy singleton) ---
|
|
15
22
|
let db = null;
|
|
16
23
|
function getDb() {
|
|
17
24
|
if (!db) {
|
|
@@ -23,21 +30,100 @@ function getDb() {
|
|
|
23
30
|
}
|
|
24
31
|
return db;
|
|
25
32
|
}
|
|
33
|
+
// --- Embedding Cache (LRU) ---
|
|
34
|
+
// WHAT: In-memory cache for embedding vectors, keyed by normalized query text
|
|
35
|
+
// WHY: Same query → same embedding. At 500 queries/day, even 20% repeats saves 100 Ollama calls
|
|
36
|
+
const EMBED_CACHE_MAX = 200;
|
|
37
|
+
const embedCache = new Map();
|
|
38
|
+
/**
 * Normalise query text into its embedding-cache key.
 *
 * @param {string} text - Raw query text.
 * @returns {string} The text with surrounding whitespace removed, lower-cased.
 */
function cacheKey(text) {
    const trimmed = text.trim();
    return trimmed.toLowerCase();
}
|
|
41
|
+
/**
 * Evict the least recently touched entry once the embedding cache exceeds
 * EMBED_CACHE_MAX entries. Does nothing while the cache is within its limit.
 *
 * The victim is tracked with `undefined` rather than an empty-string sentinel:
 * "" is a legitimate cache key (a whitespace-only query normalises to it), and
 * a truthiness check on the key would make that entry impossible to evict.
 */
function evictOldest() {
    if (embedCache.size <= EMBED_CACHE_MAX) {
        return;
    }
    let oldestKey;
    let oldestTs = Infinity;
    for (const [key, entry] of embedCache) {
        if (entry.ts < oldestTs) {
            oldestTs = entry.ts;
            oldestKey = key;
        }
    }
    if (oldestKey !== undefined) {
        embedCache.delete(oldestKey);
    }
}
|
|
55
|
+
// --- Embedding with Resilience ---
|
|
56
|
+
// WHAT: Embed query text via Ollama with timeout, retry, and caching
|
|
57
|
+
// WHY: Ollama is the bottleneck (~100-500ms). Cache eliminates redundant calls,
|
|
58
|
+
// timeout prevents hanging, retry handles transient failures
|
|
26
59
|
/**
 * Return the embedding for a query, served from the in-memory cache when possible.
 *
 * On a miss the text is embedded via embedWithRetry, stored under its
 * normalised key, and the cache is trimmed. On a hit the entry's timestamp
 * is refreshed so it counts as recently used.
 *
 * @param {string} text - Query text to embed.
 * @returns {Promise<Float32Array>} The embedding vector (shared with the cache entry).
 */
async function embedQuery(text) {
    const normalized = cacheKey(text);
    const entry = embedCache.get(normalized);
    if (!entry) {
        // Miss: compute, remember, then let the cache shed its oldest entry.
        const fresh = await embedWithRetry(text);
        embedCache.set(normalized, { vec: fresh, ts: Date.now() });
        evictOldest();
        return fresh;
    }
    // Hit: touch for LRU
    entry.ts = Date.now();
    return entry.vec;
}
|
|
71
|
+
/**
 * Embed one text via Ollama `/api/embed`, with a 10 s timeout per attempt and
 * a 500 ms pause before each retry.
 *
 * @param {string} text - Text to embed.
 * @param {number} [retries=1] - Extra attempts after the first one fails. Values
 *   that are not positive integers are treated as 0, so one attempt is always made.
 * @returns {Promise<Float32Array>} The embedding vector.
 * @throws {Error} After the final failed attempt; the message names the configured
 *   model and `cause` carries the last underlying error.
 */
async function embedWithRetry(text, retries = 1) {
    const maxRetries = Number.isInteger(retries) && retries > 0 ? retries : 0;
    let lastError;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
        if (attempt > 0) {
            // Wait before retry
            await new Promise((resolve) => setTimeout(resolve, 500));
        }
        try {
            const resp = await fetch(`${OLLAMA_URL}/api/embed`, {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ model: EMBED_MODEL, input: text }),
                signal: AbortSignal.timeout(10_000),
            });
            if (!resp.ok) {
                throw new Error(`Ollama embed failed: ${resp.status} ${await resp.text()}`);
            }
            const data = await resp.json();
            const embedding = data?.embeddings?.[0];
            // `new Float32Array(undefined)` is an empty vector, so a missing
            // embedding must be rejected here instead of being passed on.
            if (!Array.isArray(embedding) || embedding.length === 0) {
                throw new Error("Ollama embed returned no embedding vector");
            }
            return new Float32Array(embedding);
        }
        catch (err) {
            lastError = err;
        }
    }
    const message = lastError instanceof Error ? lastError.message : String(lastError);
    throw new Error(`Embedding unavailable after ${maxRetries + 1} attempt(s): ${message}. ` +
        `Ensure Ollama is running with the ${EMBED_MODEL} model loaded.`, { cause: lastError });
}
|
|
100
|
+
// --- Prepared Statement Cache ---
|
|
101
|
+
// WHAT: Cache the content_type lookup statement used in post-filtering
|
|
102
|
+
// WHY: Avoid re-preparing the same statement on every search call
|
|
103
|
+
let contentTypeStmt = null;
|
|
104
|
+
/**
 * Return the module-level cached statement that looks up an article's
 * content_type by URL, preparing it on first use.
 *
 * @param {object} database - Database handle used to prepare the statement
 *   the first time this function is called.
 * @returns {object} The cached prepared statement.
 */
function getContentTypeStmt(database) {
    // Prepared once; later calls reuse the same statement object.
    contentTypeStmt ||= database.prepare("SELECT content_type FROM articles WHERE url = ?");
    return contentTypeStmt;
}
|
|
38
|
-
|
|
110
|
+
// --- Helper: Post-filter by content type ---
|
|
111
|
+
/**
 * Keep only the rows whose article has the requested content type.
 *
 * @param {Array<{article_url: string}>} rows - Search rows to filter.
 * @param {string} [contentType] - Required content type; when falsy, `rows`
 *   is returned unchanged.
 * @param {object} database - Database handle passed to getContentTypeStmt.
 * @returns {Array} The matching rows, in their original order.
 */
function filterByContentType(rows, contentType, database) {
    if (!contentType) {
        return rows;
    }
    const stmt = getContentTypeStmt(database);
    // Several chunks usually belong to the same article, so remember each
    // URL's content type and query the database once per distinct URL.
    const typeByUrl = new Map();
    return rows.filter((row) => {
        const url = row.article_url;
        if (!typeByUrl.has(url)) {
            typeByUrl.set(url, stmt.get(url)?.content_type);
        }
        return typeByUrl.get(url) === contentType;
    });
}
|
|
120
|
+
// --- search_articles ---
|
|
121
|
+
export async function searchArticles(query, limit = 5, contentType) {
|
|
39
122
|
const database = getDb();
|
|
40
123
|
const queryVec = await embedQuery(query);
|
|
124
|
+
// WHAT: Over-fetch when filtering by content_type, then post-filter
|
|
125
|
+
// WHY: sqlite-vec applies k before joins, so we can't pre-filter on content_type
|
|
126
|
+
const fetchK = contentType ? limit * 3 : limit;
|
|
41
127
|
const rows = database
|
|
42
128
|
.prepare(`
|
|
43
129
|
SELECT
|
|
@@ -52,8 +138,9 @@ export async function searchArticles(query, limit = 5) {
|
|
|
52
138
|
AND k = ?
|
|
53
139
|
ORDER BY distance
|
|
54
140
|
`)
|
|
55
|
-
.all(queryVec,
|
|
56
|
-
|
|
141
|
+
.all(queryVec, fetchK);
|
|
142
|
+
const filtered = filterByContentType(rows, contentType, database);
|
|
143
|
+
return filtered.slice(0, limit).map((row) => ({
|
|
57
144
|
title: row.article_title,
|
|
58
145
|
section: row.section_heading,
|
|
59
146
|
content: row.content,
|
|
@@ -62,9 +149,13 @@ export async function searchArticles(query, limit = 5) {
|
|
|
62
149
|
relevance_score: Math.round((1 - row.distance / 2) * 1000) / 1000,
|
|
63
150
|
}));
|
|
64
151
|
}
|
|
65
|
-
|
|
152
|
+
// --- get_answer ---
|
|
153
|
+
export async function getAnswer(question, contentType) {
|
|
66
154
|
const database = getDb();
|
|
67
155
|
const queryVec = await embedQuery(question);
|
|
156
|
+
// WHAT: Over-fetch when filtering by content_type
|
|
157
|
+
// WHY: Same sqlite-vec k-before-join constraint as searchArticles
|
|
158
|
+
const fetchK = contentType ? 30 : 10;
|
|
68
159
|
// Get the best matching chunk, preferring FAQ entries
|
|
69
160
|
const rows = database
|
|
70
161
|
.prepare(`
|
|
@@ -78,27 +169,29 @@ export async function getAnswer(question) {
|
|
|
78
169
|
FROM vec_chunks
|
|
79
170
|
LEFT JOIN chunks ON chunks.id = vec_chunks.chunk_id
|
|
80
171
|
WHERE embedding MATCH ?
|
|
81
|
-
AND k =
|
|
172
|
+
AND k = ?
|
|
82
173
|
ORDER BY distance
|
|
83
174
|
`)
|
|
84
|
-
.all(queryVec);
|
|
85
|
-
|
|
175
|
+
.all(queryVec, fetchK);
|
|
176
|
+
const filteredRows = filterByContentType(rows, contentType, database);
|
|
177
|
+
if (filteredRows.length === 0) {
|
|
86
178
|
return {
|
|
87
179
|
answer: "No relevant information found in the ASTGL knowledge base.",
|
|
88
180
|
source_title: "",
|
|
89
181
|
source_url: "",
|
|
90
182
|
related_articles: [],
|
|
183
|
+
confidence_score: 0,
|
|
91
184
|
};
|
|
92
185
|
}
|
|
93
186
|
// Prefer FAQ entries if they're among the top results (within 20% of best distance)
|
|
94
|
-
const bestDistance =
|
|
95
|
-
const faqCandidate =
|
|
96
|
-
const best = faqCandidate ||
|
|
187
|
+
const bestDistance = filteredRows[0].distance;
|
|
188
|
+
const faqCandidate = filteredRows.find((r) => r.chunk_type === "faq" && r.distance <= bestDistance * 1.2);
|
|
189
|
+
const best = faqCandidate || filteredRows[0];
|
|
97
190
|
// Collect related articles (unique, excluding the source)
|
|
98
191
|
const seen = new Set();
|
|
99
192
|
seen.add(best.article_url);
|
|
100
193
|
const related = [];
|
|
101
|
-
for (const row of
|
|
194
|
+
for (const row of filteredRows) {
|
|
102
195
|
if (!seen.has(row.article_url)) {
|
|
103
196
|
seen.add(row.article_url);
|
|
104
197
|
related.push({ title: row.article_title, url: row.article_url });
|
|
@@ -113,34 +206,176 @@ export async function getAnswer(question) {
|
|
|
113
206
|
if (aMatch)
|
|
114
207
|
answer = aMatch[1];
|
|
115
208
|
}
|
|
209
|
+
// WHAT: Cosine distance 0-2 → confidence 0-1
|
|
210
|
+
// WHY: Gives consumers a normalized quality signal for the answer
|
|
211
|
+
const confidence = Math.round((1 - best.distance / 2) * 1000) / 1000;
|
|
116
212
|
return {
|
|
117
213
|
answer,
|
|
118
214
|
source_title: best.article_title,
|
|
119
215
|
source_url: best.article_url,
|
|
120
216
|
related_articles: related,
|
|
217
|
+
confidence_score: confidence,
|
|
121
218
|
};
|
|
122
219
|
}
|
|
220
|
+
// --- get_tutorial ---
|
|
221
|
+
// WHAT: Find tutorial/guide content and return ordered steps from section headings
|
|
222
|
+
// WHY: AI assistants asking "how do I X" benefit from structured step-by-step answers
|
|
223
|
+
/**
 * Build a step-by-step answer from the article that best matches a query.
 *
 * WHAT: Finds the nearest chunk by vector search, then returns every
 *       `section` chunk of that chunk's article, ordered by article_order.
 * WHY: Tutorial steps come from the sequential sections of a single article.
 *
 * @param {string} query - Natural-language "how do I ..." question.
 * @returns {Promise<{title: string, url: string, description: string, steps: string[], confidence_score: number}>}
 *   `confidence_score` maps the best chunk's distance d to 1 - d/2, rounded to
 *   3 decimals. When nothing matches, the fields are empty, the description is
 *   "No matching tutorials found." and the score is 0.
 */
export async function getTutorial(query) {
    const database = getDb();
    const queryVec = await embedQuery(query);
    // Only the columns used below are selected; no articles column is needed
    // here, so the articles table is not joined.
    const rows = database
        .prepare(`
      SELECT
        chunks.article_title,
        chunks.article_url,
        vec_chunks.distance
      FROM vec_chunks
      LEFT JOIN chunks ON chunks.id = vec_chunks.chunk_id
      WHERE embedding MATCH ?
        AND k = 30
      ORDER BY distance
    `)
        .all(queryVec);
    // The LEFT JOIN yields NULL article fields for a vector whose chunk row is
    // missing; take the nearest row that actually maps to an article.
    const best = rows.find((row) => row.article_url != null);
    if (!best) {
        return {
            title: "",
            url: "",
            description: "No matching tutorials found.",
            steps: [],
            confidence_score: 0,
        };
    }
    const bestUrl = best.article_url;
    const articleChunks = database
        .prepare(`SELECT section_heading, content
         FROM chunks
         WHERE article_url = ? AND chunk_type = 'section'
         ORDER BY article_order`)
        .all(bestUrl);
    const description = database
        .prepare("SELECT description FROM articles WHERE url = ?")
        .get(bestUrl);
    // A section without a heading becomes a bare step instead of "**null**".
    const steps = articleChunks.map((c) => (c.section_heading ? `**${c.section_heading}**\n${c.content}` : c.content));
    return {
        title: best.article_title,
        url: bestUrl,
        description: description?.description || "",
        steps,
        confidence_score: Math.round((1 - best.distance / 2) * 1000) / 1000,
    };
}
|
|
275
|
+
// --- compare_topics ---
|
|
276
|
+
// WHAT: Side-by-side comparison of two topics using vector search
|
|
277
|
+
// WHY: "X vs Y" queries are common in AI-assisted research
|
|
278
|
+
/**
 * Build a side-by-side comparison of two topics from independent vector searches.
 *
 * @param {string} topicA - First topic; embedded via `embedQuery`.
 * @param {string} topicB - Second topic; embedded via `embedQuery`.
 * @returns {Promise<{topic_a: {title: string, url: string, key_points: string[]},
 *   topic_b: {title: string, url: string, key_points: string[]}, confidence_score: number}>}
 *   Up to three key points per side, plus a confidence derived from the weaker
 *   (more distant) of the two best matches.
 */
export async function compareTopics(topicA, topicB) {
    const database = getDb();
    const [vecA, vecB] = await Promise.all([
        embedQuery(topicA),
        embedQuery(topicB),
    ]);
    // Prepare the KNN statement once and reuse it for both sides.
    const knnStatement = database.prepare(`
      SELECT
        chunks.article_title,
        chunks.article_url,
        chunks.section_heading,
        chunks.content,
        vec_chunks.distance
      FROM vec_chunks
      LEFT JOIN chunks ON chunks.id = vec_chunks.chunk_id
      WHERE embedding MATCH ?
        AND k = 5
      ORDER BY distance
    `);
    // The LEFT JOIN yields NULL chunk columns when a vector has no chunk row; such hits
    // carry no content to compare (and would crash the length check below), so drop them.
    const getTopChunks = (vec) => knnStatement
        .all(vec)
        .filter((row) => row.article_url != null && row.content != null);
    const rowsA = getTopChunks(vecA);
    const rowsB = getTopChunks(vecB);
    const buildSide = (rows) => {
        if (rows.length === 0) {
            return { title: "No matching content", url: "", key_points: [] };
        }
        // Prefer distinct sections: repeated article/section pairs add nothing to a comparison.
        const seen = new Set();
        const points = [];
        for (const row of rows) {
            const key = `${row.article_url}:${row.section_heading}`;
            if (seen.has(key)) {
                continue;
            }
            seen.add(key);
            // Keep each key point short; long chunks are cut at 300 characters.
            const point = row.content.length > 300
                ? row.content.slice(0, 300) + "..."
                : row.content;
            points.push(`**${row.section_heading}:** ${point}`);
            if (points.length >= 3) {
                break;
            }
        }
        return {
            title: rows[0].article_title,
            url: rows[0].article_url,
            key_points: points,
        };
    };
    // A side without matches counts as the maximum distance (2), i.e. confidence 0.
    const worstDistance = Math.max(rowsA[0]?.distance ?? 2, rowsB[0]?.distance ?? 2);
    // Clamp so an out-of-range distance can never produce a score outside [0, 1].
    const confidence = Math.min(1, Math.max(0, 1 - worstDistance / 2));
    return {
        topic_a: buildSide(rowsA),
        topic_b: buildSide(rowsB),
        confidence_score: Math.round(confidence * 1000) / 1000,
    };
}
|
|
336
|
+
// --- get_latest ---
|
|
337
|
+
// WHAT: Return most recently added articles
|
|
338
|
+
// WHY: "What's new" queries help users discover fresh content
|
|
339
|
+
/**
 * Return the most recently added articles.
 *
 * @param {number} [limit=5] - Maximum number of articles. Fractions are truncated,
 *   negative values are treated as 0, and non-numeric values fall back to 5.
 * @returns {Array<{title: string, description: string, url: string, content_type: string, added_at: (string|null)}>}
 *   Articles ordered newest first; `added_at` mirrors `processed_at` and may be null.
 */
export function getLatest(limit = 5) {
    const database = getDb();
    // Sanitize before binding: SQLite treats a negative LIMIT as "no upper bound", which
    // would silently return the whole table, and NaN/fractions are not valid row counts.
    const requested = limit == null ? Number.NaN : Math.trunc(Number(limit));
    const rowLimit = Number.isFinite(requested) ? Math.max(0, requested) : 5;
    // Order by processed_at (structuring pipeline) then rowid (ingest order): originally
    // ingested articles lack processed_at, so rowid acts as the fallback.
    const rows = database
        .prepare(`SELECT title, description, url, content_type, processed_at
         FROM articles
         ORDER BY COALESCE(processed_at, '') DESC, rowid DESC
         LIMIT ?`)
        .all(rowLimit);
    return rows.map((r) => ({
        title: r.title,
        description: r.description,
        url: r.url,
        content_type: r.content_type || "article",
        added_at: r.processed_at,
    }));
}
|
|
357
|
+
// --- list_topics ---
|
|
358
|
+
// WHAT: List all topics with their content types and section headings
|
|
359
|
+
// WHY: Gives AI assistants an overview of available coverage
|
|
123
360
|
/**
 * List every article together with its content type and distinct section headings.
 *
 * @returns {Array<{title: string, description: string, url: string, content_type: string, topics: string[]}>}
 *   One entry per article in rowid order; `topics` is empty when the article has no
 *   `section` chunks.
 */
export function listTopics() {
    const database = getDb();
    // Single query instead of two queries + in-memory join. Headings are aggregated as a
    // JSON array rather than a comma-joined string: headings may themselves contain
    // commas, and splitting a GROUP_CONCAT result on "," would break them apart.
    const rows = database
        .prepare(`SELECT a.title, a.description, a.url,
              COALESCE(a.content_type, 'article') as content_type,
              json_group_array(DISTINCT c.section_heading) as sections
       FROM articles a
       LEFT JOIN chunks c ON c.article_url = a.url AND c.chunk_type = 'section'
       GROUP BY a.url
       ORDER BY a.rowid`)
        .all();
    // An article without section chunks aggregates the LEFT JOIN's NULL into `[null]`;
    // drop nulls (and blank headings) so such articles report no topics.
    const parseSections = (json) => (json
        ? JSON.parse(json).filter((heading) => heading != null && heading !== "")
        : []);
    return rows.map((r) => ({
        title: r.title,
        description: r.description,
        url: r.url,
        content_type: r.content_type,
        topics: parseSections(r.sections),
    }));
}
|
|
146
381
|
//# sourceMappingURL=search.js.map
|