pi-research 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +90 -0
- package/extensions/pi-research.ts +1 -0
- package/index.js +144 -0
- package/lib/planner.js +36 -0
- package/lib/research-memory.js +87 -0
- package/lib/research-profiles.json +72 -0
- package/lib/research.js +693 -0
- package/lib/types.js +49 -0
- package/lib/web-research.js +718 -0
- package/package.json +33 -0
package/lib/research.js
ADDED
|
@@ -0,0 +1,693 @@
|
|
|
1
|
+
/**
 * Decode the small set of HTML entities handled by this module
 * (`&amp;`, `&quot;`, `&apos;`, `&#39;`, `&#x27;`, `&lt;`, `&gt;`).
 *
 * Decoding happens in a single pass so that an escaped entity such as
 * `&amp;lt;` becomes the literal text `&lt;` instead of being decoded twice.
 *
 * @param {*} text - Value to decode; falsy values are treated as "".
 * @returns {string} The text with the supported entities replaced.
 */
function decodeHtmlEntities(text) {
  const replacements = {
    amp: "&",
    quot: '"',
    apos: "'",
    "#39": "'",
    "#x27": "'",
    lt: "<",
    gt: ">",
  };
  return String(text || "").replace(
    /&(amp|quot|apos|#39|#x27|lt|gt);/g,
    (_entity, name) => replacements[name],
  );
}
|
|
9
|
+
|
|
10
|
+
/**
 * Reduce an HTML fragment to plain text: every tag becomes a space, runs of
 * whitespace collapse to one space, the result is trimmed, and finally the
 * supported HTML entities are decoded.
 *
 * @param {*} text - HTML fragment; falsy values are treated as "".
 * @returns {string} Plain-text rendering of the fragment.
 */
function stripTags(text) {
  const withoutMarkup = String(text || "").replace(/<[^>]+>/g, " ");
  const collapsed = withoutMarkup.replace(/\s+/g, " ").trim();
  return decodeHtmlEntities(collapsed);
}
|
|
13
|
+
|
|
14
|
+
/**
 * Resolve a DuckDuckGo redirect link to its real target.
 *
 * Protocol-relative links (`//host/...`) are given an `https:` scheme. If the
 * link carries a non-empty `uddg` query parameter, that parameter's value is
 * returned; otherwise the (normalized) link itself is returned.
 *
 * `URLSearchParams.get` already percent-decodes the value, so it must not be
 * passed through `decodeURIComponent` again: doing so corrupts escapes that
 * belong to the target URL (`%2520` -> space) and throws `URIError` when the
 * decoded target contains a literal `%`.
 *
 * @param {string} href - Link taken from a search result.
 * @returns {string} The redirect target, or the normalized link.
 * @throws {TypeError} If the link cannot be parsed as an absolute URL.
 */
function decodeDuckDuckGoUrl(href) {
  const normalized = href.startsWith("//") ? `https:${href}` : href;
  const target = new URL(normalized).searchParams.get("uddg");
  return target || normalized;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Produce a canonical form of a URL: the fragment is dropped and trailing
 * slashes are removed from any path longer than "/".
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Serialized, normalized URL.
 * @throws {TypeError} If `url` cannot be parsed.
 */
export function normalizeUrl(url) {
  const parsed = new URL(url);
  parsed.hash = "";
  const { pathname } = parsed;
  if (pathname.length > 1) {
    parsed.pathname = pathname.replace(/\/+$/, "");
  }
  return parsed.toString();
}
|
|
27
|
+
|
|
28
|
+
/**
 * Split a query into lower-case ASCII search terms.
 *
 * The query is lower-cased, NFKD-normalized with combining marks removed
 * (so "é" becomes "e"), split on anything that is not a-z or 0-9, and terms
 * of two characters or fewer or in the English/German stop-word list are
 * dropped.
 *
 * @param {*} query - Raw query; falsy values are treated as "".
 * @returns {string[]} Terms in their original order (duplicates kept).
 */
function queryTerms(query) {
  const stopWords = ["was", "ist", "the", "and", "oder", "und", "for", "von", "der", "die", "das"];
  const folded = String(query || "")
    .toLowerCase()
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "");

  const terms = [];
  for (const candidate of folded.split(/[^a-z0-9]+/)) {
    if (candidate.length > 2 && !stopWords.includes(candidate)) {
      terms.push(candidate);
    }
  }
  return terms;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Count how many of the given terms occur (as substrings) in the text.
 * The text is lower-cased first; terms are compared as given.
 *
 * @param {*} text - Text to search; falsy values are treated as "".
 * @param {string[]} terms - Terms to look for.
 * @returns {number} Number of terms found at least once.
 */
function countTermMatches(text, terms) {
  const haystack = String(text || "").toLowerCase();
  let hits = 0;
  for (const term of terms) {
    if (haystack.includes(term)) hits += 1;
  }
  return hits;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Compute a score bonus for a result that matches caller-supplied source hints.
 *
 * Each hint is lower-cased and scored independently:
 *   +8 if it equals the classified source type, equals the URL, or occurs in the title;
 *   +6 if it is "docs" and the result looks like documentation;
 *   +6 if it is "github" and the result is on GitHub;
 *   +6 if it is "paper" and the result is classified as a paper.
 *
 * @param {{url?: string, title?: string}} result - Search result or page.
 * @param {string[]} [allowedSources=[]] - Source hints; a non-array or empty list yields 0.
 * @returns {number} Total boost over all hints.
 */
function allowedSourceBoost(result, allowedSources = []) {
  if (!Array.isArray(allowedSources) || allowedSources.length === 0) return 0;

  const url = String(result?.url || "").toLowerCase();
  const title = String(result?.title || "").toLowerCase();
  const sourceType = classifySourceType(url, title);

  const isDocsLike = sourceType === "official_doc" || /\/docs?\b|documentation|developer|reference|official/.test(url);
  const isGithubLike = /github\.com/.test(url) || sourceType.startsWith("github_");
  const isPaper = sourceType === "paper";

  return allowedSources.reduce((boost, value) => {
    const hint = String(value).toLowerCase();
    if (!hint) return boost;

    let gained = 0;
    if (hint === sourceType || hint === url || title.includes(hint)) gained += 8;
    if (hint === "docs" && isDocsLike) gained += 6;
    if (hint === "github" && isGithubLike) gained += 6;
    if (hint === "paper" && isPaper) gained += 6;
    return boost + gained;
  }, 0);
}
|
|
59
|
+
|
|
60
|
+
/**
 * Translate a publish date into a recency bonus, using 30-day months:
 * up to 6 months -> 8, up to 18 -> 4, up to 36 -> 1, older -> -4.
 * Unparseable or missing dates score 0.
 *
 * @param {*} value - Date string (anything `new Date(String(value))` accepts).
 * @returns {number} Bonus or penalty.
 */
function freshnessBonus(value) {
  const timestamp = new Date(String(value || "")).getTime();
  if (Number.isNaN(timestamp)) return 0;

  const msPerMonth = 30 * 24 * 60 * 60 * 1000;
  const months = (Date.now() - timestamp) / msPerMonth;

  if (months > 36) return -4;
  if (months > 18) return 1;
  if (months > 6) return 4;
  return 8;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Strip a leading "what is / who is / was ist / wer ist" phrase and any
 * trailing `?`, `!` or `.` characters from a definition-style query.
 *
 * @param {*} query - Raw query; falsy values are treated as "".
 * @returns {string} The bare subject of the question, trimmed.
 */
function cleanDefinitionQuery(query) {
  const withoutLeadIn = String(query || "").replace(/^\s*(was ist|what is|wer ist|who is)\s+/i, "");
  const withoutPunctuation = withoutLeadIn.replace(/[?!.]+$/g, "");
  return withoutPunctuation.trim();
}
|
|
77
|
+
|
|
78
|
+
/**
 * Clean up a paper title: remove a leading label such as "Title:",
 * "Paper -" or "Article of", collapse whitespace runs and trim.
 *
 * @param {*} title - Raw title; falsy values are treated as "".
 * @returns {string} Normalized title.
 */
export function normalizePaperTitle(title) {
  const withoutLabel = String(title || "").replace(/^\s*(?:title|paper|article|preprint)\s*[:\-–—]\s*/i, "");
  const withoutOfPrefix = withoutLabel.replace(/^\s*(?:title|paper|article|preprint)\s+of\s+/i, "");
  return withoutOfPrefix.replace(/\s+/g, " ").trim();
}
|
|
85
|
+
|
|
86
|
+
/**
 * Reduce a query to its base form: trimmed, without trailing `?`, `!` or `.`
 * characters, and with whitespace runs collapsed to a single space.
 *
 * The final trim matters when punctuation is preceded by a space
 * ("node status ?"): removing the punctuation would otherwise leave a
 * trailing space that leaks into every derived query string.
 *
 * @param {*} query - Raw query; falsy values are treated as "".
 * @returns {string} Base query.
 */
function queryBase(query) {
  return String(query || "")
    .trim()
    .replace(/[?!.]+$/g, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
92
|
+
|
|
93
|
+
/**
 * Split a comparison query ("A vs B", "A vs. B", "A versus B",
 * "A gegenüber B", "A compared to B") into its two subjects.
 *
 * The optional dot after "vs" is matched after the word boundary. With the
 * boundary placed after the dot, the dot could never be consumed when followed
 * by a space, and it ended up at the start of the second subject.
 *
 * @param {*} query - Raw query; falsy values are treated as "".
 * @returns {string[]|null} The first two non-empty subjects, or null when
 *   fewer than two are present.
 */
function splitComparisonQuery(query) {
  const parts = String(query || "")
    .split(/\bvs\b\.?|\bversus\b|\bgegenüber\b|\bcompared to\b/i)
    .map((part) => part.trim())
    .filter(Boolean);
  return parts.length >= 2 ? parts.slice(0, 2) : null;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Number of calendar months between a date and now, computed from the
 * local year and month only (the day of month is ignored).
 *
 * @param {*} dateText - Anything `new Date()` accepts.
 * @returns {number|null} Month difference, or null for a missing or invalid date.
 */
function monthsSince(dateText) {
  if (!dateText) return null;

  const then = new Date(dateText);
  if (Number.isNaN(then.getTime())) return null;

  const today = new Date();
  const yearDelta = today.getFullYear() - then.getFullYear();
  const monthDelta = today.getMonth() - then.getMonth();
  return yearDelta * 12 + monthDelta;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Bucket a date by age: "today" (up to 24h), "this_week" (up to 7 days),
 * "this_year" (up to 365 days), otherwise "older". Missing or invalid dates
 * yield "unknown".
 *
 * @param {*} dateText - Anything `new Date()` accepts.
 * @returns {"unknown"|"today"|"this_week"|"this_year"|"older"} Freshness label.
 */
function summarizeFreshness(dateText) {
  if (!dateText) return "unknown";

  const timestamp = new Date(dateText).getTime();
  if (Number.isNaN(timestamp)) return "unknown";

  const dayMs = 24 * 60 * 60 * 1000;
  const ageMs = Date.now() - timestamp;
  const buckets = [
    [dayMs, "today"],
    [7 * dayMs, "this_week"],
    [365 * dayMs, "this_year"],
  ];
  for (const [maxAgeMs, label] of buckets) {
    if (ageMs <= maxAgeMs) return label;
  }
  return "older";
}
|
|
116
|
+
|
|
117
|
+
/**
 * Classify a query into one of: "comparison", "temporal", "best_practice",
 * "comparative", "definition", "academic" or "general".
 *
 * Rules are evaluated in that order and the first match wins, so e.g.
 * "best practices" is recognised before the broader "best" rule.
 *
 * @param {*} query - Raw query.
 * @returns {string} Intent label.
 */
export function classifyQueryIntent(query) {
  const text = String(query || "").toLowerCase();
  // [pattern, subject it is tested against, resulting intent]
  const rules = [
    [/\b(vs\.?|versus|gegenüber|compared to)\b/i, query, "comparison"],
    [/\b(aktuell|aktueller|current|status|latest|neueste|heute|202\d)\b/i, query, "temporal"],
    [/\b(best practices?|bester weg|beste methode|recommended|empfohlen|guide)\b/i, query, "best_practice"],
    [/\b(best|besser|beste|compare|vergleich|alternative|alternativen)\b/i, query, "comparative"],
    [/^\s*(was ist|what is|wer ist|who is)\b/i, query, "definition"],
    [/\b(paper|papers|study|studies|arxiv|doi|publication|research)\b/i, text, "academic"],
  ];

  for (const [pattern, subject, intent] of rules) {
    if (pattern.test(subject)) return intent;
  }
  return "general";
}
|
|
127
|
+
|
|
128
|
+
/**
 * Map a query to the official documentation site of a known technology.
 * Keywords are matched as lower-case substrings in a fixed order; the first
 * hit wins.
 *
 * @param {*} query - Raw query; falsy values are treated as "".
 * @returns {string|null} Host/path suitable for a `site:` filter, or null.
 */
export function inferOfficialDocsSite(query) {
  const lower = String(query || "").toLowerCase();
  const knownSites = [
    ["playwright", "playwright.dev/docs"],
    ["react", "react.dev"],
    ["node", "nodejs.org/api"],
    ["selenium", "selenium.dev/documentation"],
    ["pandas", "pandas.pydata.org"],
    ["polars", "docs.pola.rs"],
  ];
  const hit = knownSites.find(([keyword]) => lower.includes(keyword));
  return hit ? hit[1] : null;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Suggest extra search queries for well-known academic topics.
 *
 * "rag" is matched as a whole word: a plain substring test also fires on
 * unrelated words such as "storage", "fragment" or "drag" and would inject
 * retrieval-augmented-generation queries into them. The spelled-out form is
 * accepted with either a space or a hyphen.
 *
 * @param {*} query - Raw query; falsy values are treated as "".
 * @returns {string[]} De-duplicated hint queries (possibly empty).
 */
function academicHints(query) {
  const lower = String(query || "").toLowerCase();
  const hints = [];
  if (lower.includes("transformer") || lower.includes("attention")) {
    hints.push("Attention is All You Need arxiv", "transformer self-attention original paper arxiv");
  }
  if (/\brag\b/.test(lower) || /retrieval[- ]augmented/.test(lower)) {
    hints.push("retrieval augmented generation arxiv", "rag paper arxiv");
  }
  return [...new Set(hints)];
}
|
|
146
|
+
|
|
147
|
+
/**
 * Build a short list of search queries for the quick research path,
 * tailored to the classified intent of the query.
 *
 * For comparisons, "vs." is normalized to "vs" before the entity-only query
 * is derived. The dot is matched after the word boundary; otherwise it is
 * never consumed and the entity query contains a stray "." ("React . Vue").
 *
 * @param {*} query - Raw user query; falsy values are treated as "".
 * @param {number} [limit=2] - Maximum number of queries returned.
 * @returns {string[]} Trimmed, de-duplicated, non-empty queries.
 */
export function buildFastQueries(query, limit = 2) {
  const trimmed = String(query || "").trim();
  const year = new Date().getFullYear();
  const intent = classifyQueryIntent(trimmed);
  let queries;

  if (intent === "definition") {
    // Fall back to the full text when stripping the lead-in leaves nothing.
    queries = [cleanDefinitionQuery(trimmed) || trimmed];
  } else if (intent === "comparison") {
    const compact = trimmed.replace(/\bvs\b\.?/i, "vs").replace(/[?!.]+$/g, "").trim();
    const entities = compact.replace(/\bvs\b/i, " ").replace(/\s+/g, " ").trim();
    queries = [`${compact} comparison`, entities];
  } else if (intent === "temporal") {
    const withoutPunctuation = trimmed.replace(/[?!.]+$/g, "");
    queries = [`${withoutPunctuation} ${year}`, `${withoutPunctuation} official ${year}`];
  } else if (intent === "academic") {
    queries = [`${trimmed} site:arxiv.org`, `${trimmed} site:semanticscholar.org`, ...academicHints(trimmed)];
  } else if (intent === "best_practice" || intent === "comparative") {
    queries = [trimmed, `${trimmed} official docs`];
  } else {
    queries = [trimmed, `${trimmed} overview`];
  }

  return [...new Set(queries.map((item) => item.trim()).filter(Boolean))].slice(0, limit);
}
|
|
172
|
+
|
|
173
|
+
/**
 * Build the query plan for the deep research path. The base query always
 * comes first, followed by intent-specific variants (official docs,
 * benchmarks, GitHub READMEs, academic sites, current-year variants).
 *
 * @param {*} query - Raw user query.
 * @param {number} [limit=4] - Maximum number of queries returned.
 * @returns {string[]} De-duplicated queries, base query first.
 */
export function buildDeepQueries(query, limit = 4) {
  const base = queryBase(query);
  const intent = classifyQueryIntent(base);
  const docsSite = inferOfficialDocsSite(base);
  const readmeQuery = (subject) => `${subject} GitHub README filetype:md`;

  let extras = [];
  switch (intent) {
    case "comparison": {
      const parts = splitComparisonQuery(base);
      if (parts) {
        const pair = `${parts[0]} ${parts[1]}`;
        extras = [`${pair} official docs`, `${pair} benchmark`, readmeQuery(pair)];
      }
      break;
    }
    case "academic":
      extras = [
        ...academicHints(base),
        `${base} site:arxiv.org`,
        `${base} site:semanticscholar.org`,
        `${base} site:doi.org`,
      ];
      break;
    case "temporal": {
      const year = new Date().getFullYear();
      extras = [`${base} official ${year}`, `${base} docs ${year}`, readmeQuery(base)];
      break;
    }
    default:
      extras = [
        `${base} official docs`,
        docsSite ? `${base} site:${docsSite}` : `${base} documentation`,
        readmeQuery(base),
      ];
  }

  return [...new Set([base, ...extras])].slice(0, limit);
}
|
|
204
|
+
|
|
205
|
+
/**
 * Extract a query plan from model output of the form `{"queries": [...]}`,
 * optionally wrapped in a Markdown code fence. When the output cannot be
 * parsed or contains no usable queries, the heuristic plan from
 * `buildDeepQueries` is returned instead.
 *
 * @param {*} text - Raw model output.
 * @param {*} query - Original user query (used for the fallback plan).
 * @param {number} [limit=4] - Maximum number of queries returned.
 * @returns {string[]} Trimmed, de-duplicated queries.
 */
export function parseDeepQueryPlan(text, query, limit = 4) {
  let planned = [];
  try {
    const raw = String(text || "").trim();
    const fence = /```(?:json)?\s*([\s\S]*?)```/i.exec(raw);
    const parsed = JSON.parse(fence ? fence[1].trim() : raw);
    if (parsed && Array.isArray(parsed.queries)) {
      planned = parsed.queries.map((item) => String(item).trim()).filter(Boolean);
    }
  } catch {
    // Unparseable plan: use the heuristic plan below.
    planned = [];
  }

  if (planned.length > 0) return [...new Set(planned)].slice(0, limit);
  return buildDeepQueries(query, limit);
}
|
|
220
|
+
|
|
221
|
+
/**
 * Build the r.jina.ai reader URL for a page by prefixing the page URL.
 *
 * @param {string} url - Target page URL (appended verbatim, not encoded).
 * @returns {string} Reader URL.
 */
export function buildJinaReaderUrl(url) {
  const readerPrefix = "https://r.jina.ai/";
  return `${readerPrefix}${url}`;
}
|
|
224
|
+
|
|
225
|
+
/**
 * Produce fallback query variants: the query itself, an "overview" variant,
 * and either a "guide" variant (when the query already mentions best
 * practices) or a "best practices" variant.
 *
 * @param {string} query - User query.
 * @returns {string[]} De-duplicated variants, original first.
 */
export function buildFallbackQueries(query) {
  const suffix = /best practices/i.test(query) ? "guide" : "best practices";
  const variants = [query, `${query} overview`, `${query} ${suffix}`];
  return [...new Set(variants)];
}
|
|
231
|
+
|
|
232
|
+
/**
 * Parse results out of DuckDuckGo HTML result markup.
 *
 * Each `<a class="result__a" href="...">` anchor yields one result. The
 * snippet is the first `result__snippet` element found within the 3000
 * characters starting at the anchor.
 *
 * @param {*} html - Page HTML; falsy values are treated as "".
 * @returns {{title: string, url: string, snippet: string}[]} Results in page order.
 */
export function extractDuckDuckGoResults(html) {
  const page = String(html || "");
  const anchorPattern = /<a[^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/g;

  return Array.from(page.matchAll(anchorPattern), (match) => {
    const context = page.slice(match.index, match.index + 3000);
    const snippet = /class="result__snippet"[^>]*>([\s\S]*?)<\/a>/.exec(context);
    return {
      title: stripTags(match[2]),
      url: decodeDuckDuckGoUrl(decodeHtmlEntities(match[1])),
      snippet: snippet ? stripTags(snippet[1]) : "",
    };
  });
}
|
|
248
|
+
|
|
249
|
+
/**
 * Parse results out of DuckDuckGo "lite" result markup.
 *
 * Anchors whose `class` attribute mentions `result-link` (quoted or not) are
 * treated as results; anchors without an `href` are skipped. The snippet is
 * the first `result-snippet` cell found within the 1200 characters starting
 * at the anchor.
 *
 * @param {*} html - Page HTML; falsy values are treated as "".
 * @returns {{title: string, url: string, snippet: string}[]} Results in page order.
 */
export function extractDuckDuckGoLiteResults(html) {
  const page = String(html || "");
  const linkPattern = /<a\b([^>]*class=["']result-link["'][^>]*)>([\s\S]*?)<\/a>|<a\b([^>]*class=[^>]*result-link[^>]*)>([\s\S]*?)<\/a>/g;
  const results = [];

  for (const match of page.matchAll(linkPattern)) {
    const [, quotedAttrs, quotedTitle, looseAttrs, looseTitle] = match;
    const attrs = quotedAttrs || looseAttrs || "";
    const href = /href=["']([^"']+)["']/i.exec(attrs);
    if (href === null) continue;

    const context = page.slice(match.index, match.index + 1200);
    const snippet = /class=["']result-snippet["'][^>]*>([\s\S]*?)<\/td>/.exec(context);
    results.push({
      title: stripTags(quotedTitle || looseTitle || ""),
      url: decodeDuckDuckGoUrl(decodeHtmlEntities(href[1])),
      snippet: snippet ? stripTags(snippet[1]) : "",
    });
  }

  return results;
}
|
|
269
|
+
|
|
270
|
+
/**
 * Parse results out of Jina search Markdown. A result is a level-2 heading
 * of the form `## [title](url)`; the line directly below it (if it is not
 * empty and does not start with `#`) becomes the snippet.
 *
 * @param {*} markdown - Markdown text; falsy values are treated as "".
 * @returns {{title: string, url: string, snippet: string}[]} Results in document order.
 */
export function extractJinaSearchResults(markdown) {
  const headingPattern = /^## \[([^\]]+)\]\(([^)]+)\)\s*\n([^#\n][^\n]*)?/gm;

  return Array.from(String(markdown || "").matchAll(headingPattern), (match) => {
    const [, rawTitle, rawUrl, rawSnippet] = match;
    const url = decodeDuckDuckGoUrl(decodeHtmlEntities(rawUrl));
    return {
      title: decodeHtmlEntities(rawTitle).trim(),
      url,
      snippet: decodeHtmlEntities(rawSnippet || "").trim(),
    };
  });
}
|
|
285
|
+
|
|
286
|
+
/**
 * Read a publish date from a page's `<meta>` tags
 * (`article:published_time`, `datePublished` or `publish-date`).
 *
 * Only the first ten characters of the content are kept, and they must have
 * the shape YYYY-MM-DD.
 *
 * @param {*} html - Page HTML; falsy values are treated as "".
 * @returns {string|null} Date as YYYY-MM-DD, or null when absent or malformed.
 */
export function extractPublishDate(html) {
  const metaPattern = /<meta[^>]+(?:property|name)=["'](?:article:published_time|datePublished|publish-date)["'][^>]+content=["']([^"']+)/i;
  const found = metaPattern.exec(String(html || ""));
  if (found === null) return null;

  const day = found[1].slice(0, 10);
  if (!/^\d{4}-\d{2}-\d{2}$/.test(day)) return null;
  return day;
}
|
|
292
|
+
|
|
293
|
+
/**
 * Pick the paragraphs of a text that mention the most query terms.
 * Paragraphs are separated by blank lines; ties keep document order.
 *
 * @param {*} text - Source text; falsy values are treated as "".
 * @param {*} query - Query whose terms are counted in each paragraph.
 * @param {number} [limit=3] - Maximum number of paragraphs returned.
 * @returns {string[]} Trimmed paragraphs, best match first.
 */
export function selectRelevantChunks(text, query, limit = 3) {
  const scored = [];
  for (const piece of String(text || "").split(/\n\s*\n/)) {
    const chunk = piece.trim();
    if (chunk) {
      scored.push({ chunk, score: countTermMatches(chunk, queryTerms(query)) });
    }
  }

  scored.sort((left, right) => right.score - left.score);
  return scored.slice(0, limit).map(({ chunk }) => chunk);
}
|
|
303
|
+
|
|
304
|
+
/**
 * Classify a source by its URL (and, for documentation, its title).
 *
 * Checks run in a fixed order and the first match wins: "file",
 * "github_readme", "github_repo", "paper", "forum", "blog", "official_doc",
 * and finally "other".
 *
 * @param {*} url - Source URL; falsy values are treated as "".
 * @param {string} [title=""] - Source title, used only for the documentation check.
 * @returns {string} Source type label.
 */
export function classifySourceType(url, title = "") {
  const lower = String(url || "").toLowerCase();
  if (lower.startsWith("file://")) return "file";

  const urlRules = [
    ["github_readme", /github\.com\/[^/]+\/[^/]+#readme|github\.com\/[^/]+\/[^/]+\/blob\//],
    ["github_repo", /github\.com\/[^/]+\/[^/]+/],
    ["paper", /arxiv\.org|ieee\.org|springer\.com|pubmed\.ncbi\.nlm\.nih\.gov|doi\.org|semanticscholar\.org|acm\.org|nature\.com|science\.org/],
    ["forum", /reddit\.com|stackoverflow\.com|forum/],
    ["blog", /blog\.|medium\.com|dev\.to|substack\.com/],
  ];
  for (const [type, pattern] of urlRules) {
    if (pattern.test(lower)) return type;
  }

  const looksOfficial =
    /\/docs?\b|documentation|developer|reference|official/.test(lower) ||
    /official|documentation|reference|guide/i.test(title) ||
    /\.edu\/|\.ac\.uk\//.test(lower);
  return looksOfficial ? "official_doc" : "other";
}
|
|
315
|
+
|
|
316
|
+
/**
 * Decide from the URL alone whether a source counts as authoritative:
 * documentation-like paths, GitHub repositories, npm package pages, paper
 * indexes, and .edu / .ac.uk sites.
 *
 * @param {*} url - Source URL; falsy values are treated as "".
 * @returns {boolean} True when the lower-cased URL matches a known pattern.
 */
export function isAuthoritativeUrl(url) {
  const authoritativePattern = /\/docs?\b|documentation|developer|reference|official|github\.com\/[^/]+\/[^/]+(#readme|\/tree\/[^/]+\/docs)?|npmjs\.com\/package\/|arxiv\.org|pubmed\.ncbi\.nlm\.nih\.gov|semanticscholar\.org|doi\.org|\.edu\/|\.ac\.uk\//;
  const lower = String(url || "").toLowerCase();
  return authoritativePattern.test(lower);
}
|
|
320
|
+
|
|
321
|
+
/**
 * Score a search result for a query.
 *
 * Term hits in the title count 3 and in the snippet 2. Documentation-like
 * URLs (+5) and GitHub README/docs URLs (+4) are rewarded; GitHub issue/PR
 * URLs (-4), login/checkout-style URLs (-8) and missing snippets (-2) are
 * penalised. Source-hint and (optionally) freshness bonuses are added last.
 *
 * @param {{url?: string, title?: string, snippet?: string, publishDate?: string, freshness?: string}} result
 * @param {*} query - User query.
 * @param {{allowedSources?: string[], preferRecent?: boolean}} [config={}]
 * @returns {number} Relevance score (higher is better).
 */
export function scoreSearchResult(result, query, config = {}) {
  const terms = queryTerms(query);
  const url = String(result.url || "").toLowerCase();
  const onGithub = /github\.com/.test(url);

  const titleScore = countTermMatches(result.title, terms) * 3;
  const snippetScore = countTermMatches(result.snippet, terms) * 2;
  let score = titleScore + snippetScore;

  if (/\/docs?\b|documentation|developer|reference|official/.test(url)) score += 5;
  if (onGithub) {
    if (/(readme|\/docs?\/|#readme)/.test(url)) score += 4;
    if (/\/(issues|pull|pulls)\//.test(url)) score -= 4;
  }
  if (/\/login|\/signin|\/sign-in|\/account|\/subscribe|\/checkout/.test(url)) score -= 8;
  if (!result.snippet) score -= 2;

  score += allowedSourceBoost(result, config.allowedSources);
  if (config.preferRecent) {
    score += freshnessBonus(result.publishDate || result.freshness);
  }
  return score;
}
|
|
336
|
+
|
|
337
|
+
/**
 * Keep the first occurrence of each normalized URL, preserving input order,
 * and stop once `limit` results have been collected.
 *
 * @param {{url: string}[]} results - Results, already in the desired order.
 * @param {number} [limit=5] - Stop after this many unique results.
 * @returns {object[]} Unique results in input order.
 * @throws {TypeError} If a result's URL cannot be parsed (from `normalizeUrl`).
 */
export function rankAndDeduplicateResults(results, limit = 5) {
  const seenKeys = new Set();
  const unique = [];

  for (const result of results) {
    const key = normalizeUrl(result.url);
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      unique.push(result);
      // The limit is checked after adding, as in the original loop.
      if (unique.length >= limit) break;
    }
  }

  return unique;
}
|
|
351
|
+
|
|
352
|
+
/**
 * Sort results by descending `scoreSearchResult` score (on a copy), then
 * de-duplicate by normalized URL and cap the list at `limit`.
 *
 * @param {object[]} results - Search results.
 * @param {*} query - User query.
 * @param {number} [limit=5] - Maximum number of results.
 * @param {object} [config={}] - Scoring options passed to `scoreSearchResult`.
 * @returns {object[]} Best unique results first.
 */
export function rankSearchResults(results, query, limit = 5, config = {}) {
  const byScoreDescending = (left, right) =>
    scoreSearchResult(right, query, config) - scoreSearchResult(left, query, config);
  const ordered = [...results].sort(byScoreDescending);
  return rankAndDeduplicateResults(ordered, limit);
}
|
|
355
|
+
|
|
356
|
+
/**
 * Tell whether a query mentions a topic whose answer changes over time
 * (packages, versions, releases, support or deprecation status, ...).
 *
 * @param {*} query - User query.
 * @returns {boolean} True when a volatile keyword appears as a whole word.
 */
function isVolatileQuery(query) {
  const volatileKeywords = /\b(npm|package|deprecated|deprecation|support|supported|status|latest|current|compatibility|compatible|version|release)\b/i;
  return volatileKeywords.test(query);
}
|
|
359
|
+
|
|
360
|
+
/**
 * Score a fetched page for a query.
 *
 * Term hits anywhere in the text count 1 each, hits within the first 500
 * characters count 3 more each, and any hit in that lead adds a flat +5.
 * Documentation-like URLs (+6) and GitHub README/docs URLs (+4) are
 * rewarded; Q&A/forum hosts are penalised (-2). Age adjustments apply when a
 * publish date is known, and source-hint bonuses are added last.
 *
 * @param {{url?: string, text?: string, title?: string, publishDate?: string}} page
 * @param {*} query - User query.
 * @param {{allowedSources?: string[], preferRecent?: boolean}} [config={}]
 * @returns {number} Relevance score (higher is better).
 */
export function scoreFetchedPage(page, query, config = {}) {
  const terms = queryTerms(query);
  const text = String(page?.text || "");
  const lead = text.slice(0, 500);
  const url = String(page?.url || "").toLowerCase();

  const leadHits = countTermMatches(lead, terms);
  let score = countTermMatches(text, terms) + leadHits * 3;

  if (/\/docs?\b|documentation|developer|reference|official/.test(url)) score += 6;
  if (/github\.com/.test(url) && /(readme|\/docs?\/|#readme)/.test(url)) score += 4;
  if (/stackoverflow\.com|reddit\.com|quora\.com/.test(url)) score -= 2;
  if (leadHits > 0) score += 5;

  const ageInMonths = monthsSince(page?.publishDate);
  if (ageInMonths !== null) {
    if (config.preferRecent) {
      if (ageInMonths <= 6) score += 8;
      else if (ageInMonths <= 18) score += 4;
      else if (ageInMonths > 36) score -= 4;
    } else if (ageInMonths > 18 && isVolatileQuery(query)) {
      // Stale pages are penalised for time-sensitive topics even without preferRecent.
      score -= 3;
    }
  }

  return score + allowedSourceBoost(page, config.allowedSources);
}
|
|
384
|
+
|
|
385
|
+
/**
 * Return a copy of the pages sorted by descending `scoreFetchedPage` score,
 * truncated to `limit` entries.
 *
 * @param {object[]} pages - Fetched pages.
 * @param {*} query - User query.
 * @param {number} [limit=pages.length] - Maximum number of pages.
 * @param {object} [config={}] - Scoring options passed to `scoreFetchedPage`.
 * @returns {object[]} Best pages first.
 */
export function rankFetchedPages(pages, query, limit = pages.length, config = {}) {
  const byScoreDescending = (left, right) =>
    scoreFetchedPage(right, query, config) - scoreFetchedPage(left, query, config);
  const ordered = [...pages];
  ordered.sort(byScoreDescending);
  return ordered.slice(0, limit);
}
|
|
388
|
+
|
|
389
|
+
/**
 * Heuristically detect contradicting claims between retrieved pages.
 *
 * A conflict is reported when the pages span at least two domains and one
 * page contains a positive keyword ("supported", "works", ...) while a
 * different page contains a negative one ("deprecated", "unsupported", ...).
 * Only the first such pair is reported, as indexes into `pages`.
 *
 * @param {{url?: string, title?: string, text?: string}[]} pages - Retrieved pages.
 * @returns {{detected: boolean, reason: string|null, conflictSummary: string, conflictingSourcePairs: number[][]}}
 */
export function detectConflictSignals(pages) {
  const noConflict = () => ({ detected: false, reason: null, conflictSummary: "", conflictingSourcePairs: [] });
  if (!Array.isArray(pages) || pages.length < 2) return noConflict();

  const positivePattern = /\b(works?|supported|recommended|available|yes|stable|compatible)\b/i;
  const negativePattern = /\b(does not|not supported|unsupported|deprecated|no support|broken|incompatible|removed)\b/i;

  const entries = pages.map((page, index) => {
    let domain = "";
    try {
      domain = new URL(page.url).hostname.replace(/^www\./, "");
    } catch {
      // Unparseable URL: keep the page, but without a domain.
      domain = "";
    }
    return { page, index, domain };
  });

  const distinctDomains = new Set(entries.map((entry) => entry.domain).filter(Boolean));
  if (distinctDomains.size < 2) return noConflict();

  const positives = entries.filter(({ page }) => positivePattern.test(page.text || ""));
  const negatives = entries.filter(({ page }) => negativePattern.test(page.text || ""));
  const isOtherPage = (candidate, reference) =>
    candidate.domain !== reference.domain || candidate.index !== reference.index;

  for (const positive of positives) {
    const opposite = negatives.find((negative) => isOtherPage(negative, positive));
    if (opposite) {
      return {
        detected: true,
        reason: "Some retrieved pages contain opposing support or recommendation claims.",
        conflictSummary: `Sources disagree on ${pages[0]?.title || "the topic"}.`,
        conflictingSourcePairs: [[positive.index, opposite.index]],
      };
    }
  }

  return noConflict();
}
|
|
422
|
+
|
|
423
|
+
/**
 * Report a research gap when none of the pages is authoritative, either by
 * its own `authoritative` flag or according to `scoreSourceEntry`.
 *
 * @param {*} query - User query.
 * @param {object[]} pages - Retrieved pages.
 * @returns {{detected: boolean, reason: string|null, followupQuery: string|null, missingAspects: string[]}}
 */
export function detectResearchGaps(query, pages) {
  const isAuthoritative = (page) => {
    const scored = scoreSourceEntry(page, query || "");
    return Boolean(page.authoritative || scored.authoritative);
  };

  if (pages.some(isAuthoritative)) {
    return { detected: false, reason: null, followupQuery: null, missingAspects: [] };
  }

  return {
    detected: true,
    reason: "Retrieved pages lack an authoritative docs or README source.",
    followupQuery: `${queryBase(query)} official docs`,
    missingAspects: ["authoritative sources"],
  };
}
|
|
439
|
+
|
|
440
|
+
/**
 * Choose the follow-up search query for another research round.
 * Conflicting sources take priority, then missing authoritative sources;
 * otherwise a generic clarification query is used.
 *
 * @param {*} query - User query.
 * @param {object[]} pages - Pages retrieved so far.
 * @returns {string} Follow-up query.
 */
export function buildFollowUpQuery(query, pages) {
  if (detectConflictSignals(pages).detected) {
    return `${queryBase(query)} official docs support status`;
  }

  const gaps = detectResearchGaps(query, pages);
  return gaps.detected ? gaps.followupQuery : `${queryBase(query)} clarification official docs`;
}
|
|
447
|
+
|
|
448
|
+
/**
 * Extract the significant words of a sentence for fact checking:
 * lower-cased, non-alphanumerics treated as separators, words of three
 * characters or fewer and common English filler words removed.
 *
 * @param {*} text - Sentence; falsy values are treated as "".
 * @returns {string[]} Significant terms in order (duplicates kept).
 */
function queryTermsForFactCheck(text) {
  const fillerWords = ["that", "this", "with", "from", "have", "has", "are", "was", "were", "the", "and", "for", "not", "you", "your", "about", "into"];
  const words = String(text || "")
    .toLowerCase()
    .replace(/[^a-z0-9\s]+/g, " ")
    .split(/\s+/);

  const significant = [];
  for (const word of words) {
    if (word.length > 3 && !fillerWords.includes(word)) significant.push(word);
  }
  return significant;
}
|
|
455
|
+
|
|
456
|
+
// Phrases produced by the research summary itself; sentences matching any of
// these are not factual claims and are skipped by the fact checker.
const BOILERPLATE_FACT_CHECK_PATTERNS = [
  /^i found \d+ sources?/i,
  /\bstrongest sources?\b/i,
  /\bsummar(?:y|ized|ised) below\b/i,
  /\bbased on \d+ readable sources?\b/i,
  /\bi could not find enough reliable sources?\b/i,
];

/**
 * Tell whether a sentence is summary boilerplate rather than a claim.
 *
 * @param {string} sentence - Sentence from a generated answer.
 * @returns {boolean} True when any boilerplate pattern matches.
 */
function isBoilerplateClaim(sentence) {
  for (const pattern of BOILERPLATE_FACT_CHECK_PATTERNS) {
    if (pattern.test(sentence)) return true;
  }
  return false;
}
|
|
467
|
+
|
|
468
|
+
/**
 * Split an answer into sentences and sort them into verified and unverified
 * claims.
 *
 * A sentence counts as verified when at least half of its significant terms
 * (rounded up, minimum one) appear in the title plus snippet (or text) of
 * some source. Boilerplate sentences and sentences without significant
 * terms are ignored.
 *
 * @param {*} answer - Generated answer; falsy values are treated as "".
 * @param {{title?: string, snippet?: string, text?: string}[]} [sources=[]]
 * @returns {{verifiedClaims: string[], unverifiedClaims: string[]}}
 */
export function factCheckAnswer(answer, sources = []) {
  const verifiedClaims = [];
  const unverifiedClaims = [];

  const isSupported = (terms) => {
    const needed = Math.max(1, Math.ceil(terms.length / 2));
    return sources.some((source) => {
      const haystack = `${source.title || ""} ${source.snippet || source.text || ""}`.toLowerCase();
      return terms.filter((term) => haystack.includes(term)).length >= needed;
    });
  };

  for (const fragment of String(answer || "").split(/[.!?]+/)) {
    const sentence = fragment.trim();
    if (!sentence || isBoilerplateClaim(sentence)) continue;

    const terms = queryTermsForFactCheck(sentence);
    if (terms.length === 0) continue;

    (isSupported(terms) ? verifiedClaims : unverifiedClaims).push(sentence);
  }

  return { verifiedClaims, unverifiedClaims };
}
|
|
490
|
+
|
|
491
|
+
/**
 * Build the human-readable confidence note that accompanies an answer:
 * source and domain counts, whether an authoritative source was found,
 * whether a follow-up round ran, and whether conflicts were detected.
 *
 * @param {{url?: string, authoritative?: boolean}[]} pages - Pages used for the answer.
 * @param {{followupRounds?: number, conflictDetected?: boolean}} [meta={}]
 * @returns {string} Newline-separated summary lines.
 */
export function buildConfidenceSummary(pages, meta = {}) {
  if (!pages.length) return "Based on 0 readable sources.";

  const domains = new Set();
  for (const page of pages) {
    try {
      domains.add(new URL(page.url).hostname.replace(/^www\./, ""));
    } catch {
      // Pages without a parseable URL do not contribute a domain.
    }
  }

  const hasAuthoritativeSource = pages.some((page) => {
    const scored = scoreSourceEntry(page, "");
    return Boolean(page.authoritative || scored.authoritative);
  });

  const lines = [];
  lines.push(`Based on ${pages.length} readable sources from ${domains.size || 1} independent domains.`);
  lines.push(
    hasAuthoritativeSource
      ? "Authoritative docs, papers, or README sources were found."
      : "No authoritative docs, papers, or README source was found.",
  );
  if (meta.followupRounds > 0) {
    lines.push("One follow-up round was used to resolve uncertainty.");
  }
  lines.push(
    meta.conflictDetected
      ? "Conflict scan found opposing claims in the retrieved pages."
      : "No clear source conflicts detected in the retrieved pages.",
  );
  return lines.join("\n");
}
|
|
516
|
+
|
|
517
|
+
/**
 * Rate a source by type, freshness and domain.
 *
 * - typeScore: +10 for authoritative sources plus a per-type adjustment.
 * - freshnessScore: 4 / 3 / 2 for "today" / "this_week" / "this_year".
 * - domainScore: bonus when query terms appear in the hostname (larger for
 *   a site's root page) and for academic hosts; penalty for a few
 *   aggregator hosts and for `blog.` hosts that are not official docs.
 *
 * A source also becomes authoritative when its total reaches 10.
 *
 * @param {{url?: string, title?: string, publishDate?: string, freshness?: string}} source
 * @param {string} [query=""] - User query, used for hostname matching.
 * @returns {{sourceType: string, authoritative: boolean, freshness: string, typeScore: number, freshnessScore: number, domainScore: number, total: number, score: number}}
 */
export function scoreSourceEntry(source, query = "") {
  const url = String(source?.url || "");
  const title = String(source?.title || "");
  const sourceType = classifySourceType(url, title);
  const freshness = summarizeFreshness(source?.publishDate || source?.freshness);

  const typeAdjustments = {
    official_doc: 8,
    github_readme: 7,
    paper: 8,
    github_repo: 4,
    file: 6,
    forum: -1,
    blog: -2,
  };
  const freshnessPoints = { today: 4, this_week: 3, this_year: 2 };

  const authoritativeByType =
    isAuthoritativeUrl(url) || sourceType === "official_doc" || sourceType === "paper" || sourceType === "file";

  const typeScore = (authoritativeByType ? 10 : 0) + (typeAdjustments[sourceType] ?? 0);
  const freshnessScore = freshnessPoints[freshness] ?? 0;

  let domainScore = 0;
  try {
    const parsed = new URL(url);
    const hostname = parsed.hostname.replace(/^www\./, "").toLowerCase();
    const path = parsed.pathname.toLowerCase();
    const hostMatches = queryTerms(query).filter((term) => hostname.includes(term)).length;

    if (hostMatches > 0) {
      const isRootPage = path === "/" || path === "";
      domainScore += (isRootPage ? 6 : 3) + hostMatches;
    }
    if (/arxiv\.org|semanticscholar\.org|doi\.org|pubmed\.ncbi\.nlm\.nih\.gov|\.edu$|\.ac\.uk$/.test(hostname)) domainScore += 5;
    if (/linkedin\.com|newreleases\.io|releasealert\.|pacgie\.|versio\./.test(hostname)) domainScore -= 6;
    if (/blog\./.test(hostname) && sourceType !== "official_doc") domainScore -= 2;
  } catch {
    // Unparseable URL: no domain-based adjustment.
  }

  const total = typeScore + freshnessScore + domainScore;
  const authoritative = authoritativeByType || total >= 10;
  return { sourceType, authoritative, freshness, typeScore, freshnessScore, domainScore, total, score: total };
}
|
|
562
|
+
|
|
563
|
+
/**
 * Annotate sources with type, authority, freshness and score, then order
 * them by descending score. Values already present on a source win over the
 * computed ones (`authoritative` only if boolean, `score` only if numeric).
 *
 * @param {object[]} sources - Source entries.
 * @param {string} [query=""] - User query passed to `scoreSourceEntry`.
 * @returns {object[]} New, annotated source objects, best first.
 */
export function prioritizeSourceEntries(sources, query = "") {
  const annotated = Array.from(sources, (source) => {
    const scored = scoreSourceEntry(source, query);
    const hasOwnAuthority = typeof source.authoritative === "boolean";
    const hasOwnScore = typeof source.score === "number";
    return {
      ...source,
      sourceType: source.sourceType || scored.sourceType,
      authoritative: hasOwnAuthority ? source.authoritative : scored.authoritative,
      freshness: source.freshness || scored.freshness,
      score: hasOwnScore ? source.score : scored.total,
    };
  });

  annotated.sort((left, right) => (right.score || 0) - (left.score || 0));
  return annotated;
}
|
|
577
|
+
|
|
578
|
+
/**
 * Keep at most the first `maxLines` lines of a code block and trim
 * surrounding whitespace.
 *
 * @param {*} block - Code text; falsy values are treated as "".
 * @param {number} [maxLines=20] - Maximum number of lines kept.
 * @returns {string} Truncated, trimmed code.
 */
function trimCodeBlock(block, maxLines = 20) {
  const allLines = String(block || "").split("\n");
  const kept = allLines.slice(0, maxLines);
  return kept.join("\n").trim();
}
|
|
582
|
+
|
|
583
|
+
/**
 * Collect code samples from text: fenced Markdown blocks first, then
 * `<pre>` elements, then `<code>` elements. Each sample is truncated by
 * `trimCodeBlock`; empty samples and duplicates are dropped.
 *
 * @param {*} text - Markdown or HTML text; falsy values are treated as "".
 * @returns {string[]} Unique code samples in discovery order.
 */
export function extractCodeBlocks(text) {
  const value = String(text || "");
  const fenced = Array.from(value.matchAll(/```[a-z0-9_-]*\n([\s\S]*?)```/gi), (match) => match[1].trim());
  const preformatted = Array.from(value.matchAll(/<pre[^>]*>([\s\S]*?)<\/pre>/gi), (match) => stripTags(match[1]));
  const inline = Array.from(value.matchAll(/<code[^>]*>([\s\S]*?)<\/code>/gi), (match) => stripTags(match[1]));

  const trimmed = [...fenced, ...preformatted, ...inline]
    .map((block) => trimCodeBlock(block))
    .filter(Boolean);
  return [...new Set(trimmed)];
}
|
|
591
|
+
|
|
592
|
+
/**
 * Decide whether the collected sources are sufficient to answer a query.
 *
 * Accepts either an options object (`{ query, sources, conflictDetected,
 * confidence, minSources }`) or the legacy positional form
 * `(query, pages, conflictDetected)`.
 *
 * Unless a numeric `confidence` is supplied, confidence is estimated from
 * the number of sources, authoritative sources and distinct domains, minus
 * a penalty for conflicts, and clamped to [0.1, 0.95]. The result is
 * sufficient when there are at least `minSources` sources, confidence is at
 * least 0.85, and any conflict is backed by an authoritative source.
 *
 * @param {string|object} input - Query string (legacy) or options object.
 * @param {object[]} [legacyPages] - Sources for the legacy form.
 * @param {boolean} [legacyConflictDetected=false] - Conflict flag for the legacy form.
 * @returns {{sufficient: boolean, confidence: number, confidenceScore: number, missingAspects: string[], openSubQuestions: string[], authoritativeSourcesFound: boolean, conflictSummary: string, conflictingSourcePairs: number[][]}}
 */
export function evaluateSufficiency(input, legacyPages, legacyConflictDetected = false) {
  let payload;
  if (typeof input === "string") {
    payload = { query: input, sources: legacyPages || [], conflictDetected: legacyConflictDetected };
  } else {
    payload = {
      query: input?.query || "",
      sources: input?.sources || [],
      conflictDetected: Boolean(input?.conflictDetected),
      confidence: input?.confidence,
      minSources: input?.minSources,
    };
  }

  const scoredSources = payload.sources.map((page) => scoreSourceEntry(page, payload.query || ""));
  const authoritativeCount = scoredSources.reduce(
    (count, scored) => (scored.authoritative ? count + 1 : count),
    0,
  );
  const authoritativeSourcesFound = authoritativeCount > 0;

  const conflict = detectConflictSignals(payload.sources);
  const conflictDetected = payload.conflictDetected || conflict.detected;
  const sourceCount = payload.sources.length;

  const missingAspects = [];
  const openSubQuestions = [];
  if (!authoritativeSourcesFound) {
    missingAspects.push("authoritative sources");
    openSubQuestions.push(`${queryBase(payload.query)} official docs`);
  }
  if (conflictDetected) {
    missingAspects.push("conflict resolution");
    openSubQuestions.push(`Which authoritative source resolves the conflicting claims about ${queryBase(payload.query)}?`);
  }
  if (sourceCount === 0) missingAspects.push("readable sources");
  if (openSubQuestions.length === 0) openSubQuestions.push(`${queryBase(payload.query)} follow-up`);

  const hostnames = new Set();
  for (const page of payload.sources) {
    try {
      const hostname = new URL(page.url).hostname.replace(/^www\./, "");
      if (hostname) hostnames.add(hostname);
    } catch {
      // Sources without a parseable URL do not count towards domain diversity.
    }
  }
  const domainCount = hostnames.size;

  let confidenceScore;
  if (typeof payload.confidence === "number") {
    confidenceScore = payload.confidence;
  } else {
    const estimate =
      0.35 +
      Math.min(sourceCount, 4) * 0.08 +
      Math.min(authoritativeCount, 3) * 0.12 +
      Math.min(domainCount, 3) * 0.04 -
      (conflictDetected ? 0.18 : 0);
    confidenceScore = Math.max(0.1, Math.min(0.95, estimate));
  }

  const minSources = payload.minSources || 1;
  const sufficient =
    sourceCount >= minSources &&
    confidenceScore >= 0.85 &&
    (!conflictDetected || authoritativeSourcesFound);

  return {
    sufficient,
    confidence: confidenceScore,
    confidenceScore,
    missingAspects: [...new Set(missingAspects)],
    openSubQuestions: [...new Set(openSubQuestions)],
    authoritativeSourcesFound,
    conflictSummary: conflictDetected ? (conflict.conflictSummary || `Sources disagree on ${queryBase(payload.query)}.`) : "",
    conflictingSourcePairs: conflict.conflictingSourcePairs || [],
  };
}
|
|
638
|
+
|
|
639
|
+
/**
 * Produce a size-bounded copy of a research payload.
 *
 * List fields are truncated to fixed maximums (non-arrays become `[]`), each
 * source is reduced to `title`/`url` plus a few optional descriptive fields,
 * and scalar fields are normalised to predictable types.
 *
 * @param {object} payload - Full research payload.
 * @returns {object} Compacted payload with the same top-level keys.
 */
export function compactResearchPayload(payload) {
  // Leading `limit` items of `value`, or [] when `value` is not an array.
  const firstN = (value, limit) => (Array.isArray(value) ? value.slice(0, limit) : []);

  // Keep title/url unconditionally; copy optional fields only when they carry a usable value.
  const compactSource = (source) => {
    const entry = { title: source.title, url: source.url };
    if (source.freshness) entry.freshness = source.freshness;
    if (source.sourceType) entry.sourceType = source.sourceType;
    if (typeof source.score === "number") entry.score = source.score;
    if (typeof source.authoritative === "boolean") entry.authoritative = source.authoritative;
    if (typeof source.local === "boolean") entry.local = source.local;
    return entry;
  };

  const result = {};
  result.answer = payload.answer;
  result.bullets = firstN(payload.bullets, 5);
  result.confidence = typeof payload.confidence === "number" ? payload.confidence : "";
  result.citations = firstN(payload.citations, 8);
  result.codeBlocks = firstN(payload.codeBlocks, 3).map((block) => trimCodeBlock(block));
  result.sources = firstN(payload.sources, 5).map(compactSource);
  result.sourceTypes = firstN(payload.sourceTypes, 8);
  result.unverifiedClaims = firstN(payload.unverifiedClaims, 8);
  // `meta` is passed through untouched, but only when it is a non-null object.
  result.meta = payload.meta && typeof payload.meta === "object" ? payload.meta : undefined;
  result.sufficient = Boolean(payload.sufficient);
  result.authoritativeSourcesFound = Boolean(payload.authoritativeSourcesFound);
  result.openSubQuestions = firstN(payload.openSubQuestions, 5);
  result.missingAspects = firstN(payload.missingAspects, 5);
  result.conflictSummary = payload.conflictSummary || "";
  return result;
}
|
|
667
|
+
|
|
668
|
+
/**
 * Build a plain-text snapshot of an HTML page.
 *
 * @param {string} html - Raw page markup; falsy values are treated as an empty document.
 * @param {string} url - Page URL; echoed back and used as the title when the page has no usable one.
 * @returns {{ title: string, url: string, text: string, codeBlocks: Array }}
 *   `title` is the tag-stripped `<title>` content (or `url`), `text` is the markup with
 *   `<script>`/`<style>` elements removed and remaining tags stripped, and `codeBlocks`
 *   is whatever `extractCodeBlocks` returns for the original markup.
 */
export function extractPageSnapshot(html, url) {
  const markup = String(html || "");

  const titleMatch = markup.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  // An empty or whitespace-only <title> is as unhelpful as a missing one, so both fall back to the URL.
  const title = (titleMatch && stripTags(titleMatch[1])) || url;

  // Script and style contents are not readable text; drop the whole elements before stripping tags.
  const body = markup
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ");

  return { title, url, text: stripTags(body), codeBlocks: extractCodeBlocks(html) };
}
|
|
677
|
+
|
|
678
|
+
/**
 * Render a research result as a Markdown document with "Answer", "Key points",
 * optional "Confidence", and "Sources" sections.
 *
 * @param {object} [result] - Result to render; omitted fields render as empty sections.
 * @param {string} [result.answer] - Answer text; surrounding whitespace is trimmed.
 * @param {Array<string>} [result.bullets] - Key points, one list item each.
 * @param {Array<object>} [result.sources] - Sources with `title`, `url` and optional
 *   `freshness`, `sourceType`, `score`, `authoritative`.
 * @param {string|number} [result.confidence] - Confidence label or score. The section is
 *   emitted for any non-NaN number (including 0) and for any other truthy value.
 * @returns {string} The assembled Markdown, trimmed.
 */
export function formatResearchResponse({ answer, bullets, sources, confidence } = {}) {
  const parts = ["## Answer", "", String(answer || "").trim(), "", "## Key points"];
  for (const bullet of bullets || []) parts.push(`- ${bullet}`);

  // A numeric confidence of 0 is a real value and must not be dropped by a truthiness check.
  const hasConfidence = typeof confidence === "number" ? !Number.isNaN(confidence) : Boolean(confidence);
  if (hasConfidence) parts.push("", "## Confidence", "", confidence);

  parts.push("", "## Sources");
  (sources || []).forEach((source, index) => {
    const freshness = source.freshness ? ` (${source.freshness})` : "";
    const meta = [];
    if (source.sourceType) meta.push(source.sourceType);
    if (typeof source.score === "number") meta.push(`score:${source.score}`);
    if (typeof source.authoritative === "boolean") meta.push(source.authoritative ? "authoritative" : "non-authoritative");
    const metaText = meta.length ? ` [${meta.join(", ")}]` : "";
    parts.push(`${index + 1}. ${source.title} — ${source.url}${metaText}${freshness}`);
  });

  return parts.join("\n").trim();
}
|