pi-research 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +90 -0
- package/extensions/pi-research.ts +1 -0
- package/index.js +144 -0
- package/lib/planner.js +36 -0
- package/lib/research-memory.js +87 -0
- package/lib/research-profiles.json +72 -0
- package/lib/research.js +693 -0
- package/lib/types.js +49 -0
- package/lib/web-research.js +718 -0
- package/package.json +33 -0
|
@@ -0,0 +1,718 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
|
|
4
|
+
import { complete } from "@mariozechner/pi-ai";
|
|
5
|
+
|
|
6
|
+
import profiles from "./research-profiles.json" with { type: "json" };
|
|
7
|
+
import { createResearchResult } from "./types.js";
|
|
8
|
+
import {
|
|
9
|
+
buildConfidenceSummary,
|
|
10
|
+
buildDeepQueries,
|
|
11
|
+
buildFallbackQueries,
|
|
12
|
+
buildFastQueries,
|
|
13
|
+
buildFollowUpQuery,
|
|
14
|
+
buildJinaReaderUrl,
|
|
15
|
+
classifySourceType,
|
|
16
|
+
compactResearchPayload,
|
|
17
|
+
normalizePaperTitle,
|
|
18
|
+
detectConflictSignals,
|
|
19
|
+
evaluateSufficiency,
|
|
20
|
+
extractCodeBlocks,
|
|
21
|
+
extractDuckDuckGoLiteResults,
|
|
22
|
+
extractDuckDuckGoResults,
|
|
23
|
+
extractJinaSearchResults,
|
|
24
|
+
extractPageSnapshot,
|
|
25
|
+
extractPublishDate,
|
|
26
|
+
factCheckAnswer,
|
|
27
|
+
formatResearchResponse,
|
|
28
|
+
normalizeUrl,
|
|
29
|
+
parseDeepQueryPlan,
|
|
30
|
+
prioritizeSourceEntries,
|
|
31
|
+
rankFetchedPages,
|
|
32
|
+
rankSearchResults,
|
|
33
|
+
scoreSourceEntry,
|
|
34
|
+
selectRelevantChunks,
|
|
35
|
+
} from "./research.js";
|
|
36
|
+
import { planResearch } from "./planner.js";
|
|
37
|
+
import {
|
|
38
|
+
clearResearchMemory,
|
|
39
|
+
getResearchMemory,
|
|
40
|
+
hashResearchQuery,
|
|
41
|
+
readCachedResult,
|
|
42
|
+
setResearchMemory,
|
|
43
|
+
writeCachedResult,
|
|
44
|
+
} from "./research-memory.js";
|
|
45
|
+
|
|
46
|
+
// Browser-like User-Agent used as a default request header by the fetch helpers in this module.
const USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";

// Last-resort minimum page length, used when neither options nor the profile set `minPageText`.
const MIN_PAGE_TEXT = 300;

// Lifetimes (milliseconds) of the in-process caches below: 5 minutes and 30 minutes.
const SEARCH_CACHE_TTL_MS = 300_000;
const PAGE_CACHE_TTL_MS = 1_800_000;

// In-process TTL caches; entries have the shape { value, expiresAt }.
const searchCache = new Map();
const pageCache = new Map();
|
|
52
|
+
|
|
53
|
+
/**
 * Read a value from a TTL cache.
 *
 * @param {Map<string, {value: unknown, expiresAt: number}>} cache - Cache to read from.
 * @param {string} key - Entry key.
 * @returns {unknown|null} The stored value, or null when the key is absent or
 *   its entry has expired (an expired entry is evicted as a side effect).
 */
function getCacheValue(cache, key) {
  const hit = cache.get(key);
  if (!hit) return null;

  const expired = hit.expiresAt <= Date.now();
  if (expired) cache.delete(key);
  return expired ? null : hit.value;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Store a value in a TTL cache and hand the same value back to the caller.
 *
 * @param {Map<string, {value: unknown, expiresAt: number}>} cache - Cache to write to.
 * @param {string} key - Entry key.
 * @param {unknown} value - Value to store.
 * @param {number} ttlMs - Lifetime in milliseconds, counted from now.
 * @returns {unknown} `value`, unchanged, so the call can be used in a return statement.
 */
function setCacheValue(cache, key, value, ttlMs) {
  const expiresAt = Date.now() + ttlMs;
  cache.set(key, { value, expiresAt });
  return value;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Compute the hex-encoded SHA-1 digest of a piece of text.
 *
 * @param {unknown} text - Input; falsy values are hashed as the empty string.
 * @returns {string} 40-character lowercase hex digest.
 */
function hashText(text) {
  const input = String(text || "");
  const hasher = createHash("sha1");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
71
|
+
|
|
72
|
+
/**
 * Coerce the loosely typed research input into an options object.
 *
 * @param {string|object} [input="fast"] - A mode name or an options object.
 * @returns {object} `{ mode: input }` for strings, the object itself for
 *   non-null objects, and `{ mode: "fast" }` for anything else.
 */
function normalizeResearchOptions(input = "fast") {
  switch (typeof input) {
    case "string":
      return { mode: input };
    case "object":
      // typeof null is "object"; null falls back to the default mode.
      return input ?? { mode: "fast" };
    default:
      return { mode: "fast" };
  }
}
|
|
77
|
+
|
|
78
|
+
/**
 * Build the effective research configuration from a mode name or options object.
 *
 * The profile for `options.mode` (or the `fast` profile when the mode is
 * unknown) is the base; caller options are layered on top, and the fields
 * listed explicitly below are then resolved with their own precedence rules.
 *
 * @param {string|object} [input="fast"] - Mode name or options object.
 * @returns {object} Resolved configuration.
 */
export function resolveResearchConfig(input = "fast") {
  const options = normalizeResearchOptions(input);
  const profile = profiles[options.mode] || profiles.fast;
  const deep = options.deepResearchConfig || {};

  // Option value if set, otherwise the profile's value.
  const fromProfile = (key) => options[key] ?? profile[key];

  // Turn and query budgets grow with the requested deep-research depth/breadth,
  // but never shrink below what the profile already allows.
  const profileTurns = profile.maxTurns || 1;
  const profileQueries = profile.maxQueries || 2;
  const derivedTurns = deep.depth ? Math.max(profileTurns, deep.depth) : profileTurns;
  const derivedQueries = deep.breadth
    ? Math.max(profileQueries, deep.breadth * (deep.depth || 1))
    : profileQueries;

  return {
    ...profile,
    ...options,
    // The mode always comes from the profile that was actually selected.
    mode: profile.mode,
    maxTurns: options.maxTurns ?? derivedTurns,
    maxQueries: options.maxQueries ?? derivedQueries,
    // `maxSites` takes precedence over `maxPages` when both are given.
    maxPages: options.maxSites ?? fromProfile("maxPages"),
    allowedSourceTypes: fromProfile("allowedSourceTypes"),
    searchProvider: fromProfile("searchProvider"),
    concurrentQueries: deep.concurrency ?? options.concurrentQueries ?? 3,
    depth: deep.depth ?? 1,
    breadth: deep.breadth ?? 2,
    pageTextLimit: fromProfile("pageTextLimit"),
    minPageText: fromProfile("minPageText") ?? MIN_PAGE_TEXT,
    preferRecent: fromProfile("preferRecent") ?? false,
    minYear: fromProfile("minYear"),
    maxYear: fromProfile("maxYear"),
    cacheTtlMs: fromProfile("cacheTtlMs") ?? 24 * 60 * 60 * 1000,
    files: Array.isArray(options.files) ? options.files : [],
    // Isolation can also be forced through the RESEARCH_ISOLATE environment variable.
    isolate: Boolean(options.isolate || process.env.RESEARCH_ISOLATE === "1"),
    force: Boolean(options.force),
    format: options.format ?? "markdown",
  };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Alias for {@link resolveResearchConfig}.
 *
 * @param {string|object} [mode="fast"] - Mode name or options object.
 * @returns {object} Resolved configuration.
 */
export function getResearchConfig(mode = "fast") {
  const resolved = resolveResearchConfig(mode);
  return resolved;
}
|
|
111
|
+
|
|
112
|
+
/**
 * Pick the model used for research completions.
 *
 * @param {{model?: unknown}} [ctx] - Host context; may be omitted.
 * @returns {unknown|null} The WEB_RESEARCH_MODEL environment value when set,
 *   otherwise `ctx.model`, otherwise null.
 */
export function resolveResearchModel(ctx) {
  const override = process.env.WEB_RESEARCH_MODEL;
  if (override) return override;
  return ctx?.model || null;
}
|
|
115
|
+
|
|
116
|
+
/**
 * Concatenate the text parts of a completion response.
 *
 * @param {{content: Array<{type: string, text?: string}>}} response - Completion response.
 * @returns {string} Text parts joined with newlines and trimmed; non-text parts are ignored.
 */
function textFromCompletion(response) {
  const textParts = response.content.flatMap((part) => (part.type === "text" ? [part.text] : []));
  return textParts.join("\n").trim();
}
|
|
119
|
+
|
|
120
|
+
/**
 * Parse a JSON value out of model output.
 *
 * Accepts bare JSON, JSON inside a Markdown code fence (with or without a
 * `json` tag) and, as a last resort, JSON surrounded by prose
 * ("Here is the plan: {...}").
 *
 * @param {unknown} text - Raw model output.
 * @returns {unknown} The parsed JSON value.
 * @throws {SyntaxError} When no parseable JSON is found. The error is the one
 *   raised by the first parse attempt.
 */
function parseJsonBlock(text) {
  const trimmed = String(text || "").trim();
  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/i);
  const candidate = fenced ? fenced[1].trim() : trimmed;

  try {
    return JSON.parse(candidate);
  } catch (error) {
    // Models often wrap the JSON in a sentence; retry on the outermost
    // {...} or [...] span before giving up.
    const start = candidate.search(/[{[]/);
    if (start === -1) throw error;
    const closer = candidate[start] === "{" ? "}" : "]";
    const end = candidate.lastIndexOf(closer);
    if (end <= start) throw error;
    try {
      return JSON.parse(candidate.slice(start, end + 1));
    } catch {
      throw error;
    }
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Run a single-prompt completion with the research model.
 *
 * When the host context provides its own `completeResearch` function, that is
 * used directly. Otherwise the model from `resolveResearchModel` is called
 * through `complete`, authenticated via `ctx.modelRegistry`.
 *
 * @param {object} [ctx] - Host context (`completeResearch`, `model`, `modelRegistry`).
 * @param {AbortSignal} [signal] - Cancellation signal forwarded to the completion.
 * @param {string} prompt - User prompt text.
 * @param {string} [reasoningEffort="low"] - Reasoning effort forwarded to the completion.
 * @returns {Promise<unknown>} Whatever `ctx.completeResearch` returns, or the
 *   trimmed completion text, or null when no model, registry or credentials are
 *   available or the completion was aborted.
 */
async function completeWithResearchModel(ctx, signal, prompt, reasoningEffort = "low") {
  if (typeof ctx?.completeResearch === "function") {
    return ctx.completeResearch(prompt, { signal, reasoningEffort });
  }

  const model = resolveResearchModel(ctx);
  if (!model) return null;

  // A model can come from the environment even when no context (or no
  // registry) was supplied; treat that like missing credentials instead of
  // crashing on a property access.
  const registry = ctx?.modelRegistry;
  if (typeof registry?.getApiKeyAndHeaders !== "function") return null;

  const auth = await registry.getApiKeyAndHeaders(model);
  if (!auth?.ok || !auth.apiKey) return null;

  const request = {
    messages: [{ role: "user", content: [{ type: "text", text: prompt }], timestamp: Date.now() }],
  };
  const response = await complete(model, request, {
    apiKey: auth.apiKey,
    headers: auth.headers,
    signal,
    reasoningEffort,
  });

  if (response.stopReason === "aborted") return null;
  return textFromCompletion(response);
}
|
|
150
|
+
|
|
151
|
+
/**
 * Produce the search queries for a research question.
 *
 * - `code` mode: subqueries from `planResearch`, capped at `maxQueries`.
 * - `deep` / `academic` mode: queries planned by the research model; when the
 *   model is unavailable, returns nothing, or fails, `buildDeepQueries` is used.
 * - any other mode: `buildFastQueries`.
 *
 * @param {string} query - The research question.
 * @param {string|object} [mode="fast"] - Mode name or options object.
 * @param {object} [ctx] - Host context used for the model call.
 * @param {AbortSignal} [signal] - Cancellation signal for the model call.
 * @returns {Promise<string[]>} Search queries.
 */
export async function buildQueries(query, mode = "fast", ctx, signal) {
  const config = getResearchConfig(mode);
  const limit = config.maxQueries;

  if (config.mode === "code") {
    const plan = planResearch(query, "code");
    return plan.subqueries.slice(0, limit);
  }

  const isAcademic = config.mode === "academic";
  if (config.mode !== "deep" && !isAcademic) {
    return buildFastQueries(query, limit);
  }

  const guidance = isAcademic
    ? "Use 3-5 focused paper-search queries covering arXiv, DOI, Semantic Scholar, benchmarks, and official references."
    : "Use 3-5 focused queries covering official docs, examples, source/readme, and recent status when relevant.";
  const promptLines = [
    "Generate web research search queries as JSON only.",
    'Return shape: {"queries":["..."]}',
    guidance,
    `Question: ${query}`,
  ];

  try {
    const planText = await completeWithResearchModel(ctx, signal, promptLines.join("\n"), "low");
    if (planText) return parseDeepQueryPlan(planText, query, limit);
  } catch {
    // Best effort: any model or parsing failure falls back to the static plan below.
  }

  return buildDeepQueries(query, limit);
}
|
|
178
|
+
|
|
179
|
+
/**
 * Combine an optional caller signal with an optional timeout.
 *
 * @param {AbortSignal} [signal] - Caller-supplied cancellation signal.
 * @param {number} [timeoutMs] - Timeout in milliseconds; falsy disables the timeout.
 * @returns {AbortSignal|undefined} `signal` unchanged when there is no timeout,
 *   a timeout signal when there is no caller signal, otherwise a signal that
 *   aborts when either of the two does.
 */
function withTimeoutSignal(signal, timeoutMs) {
  if (timeoutMs) {
    const deadline = AbortSignal.timeout(timeoutMs);
    if (!signal) return deadline;
    return AbortSignal.any([signal, deadline]);
  }
  return signal;
}
|
|
184
|
+
|
|
185
|
+
/**
 * Fetch a URL, retrying when the request itself fails (network error, timeout).
 *
 * Despite the name, this resolves to the `Response` object, not to its text;
 * callers read the body themselves. HTTP error statuses are NOT treated as
 * failures here and are not retried.
 *
 * @param {string} url - URL to fetch.
 * @param {AbortSignal} [signal] - Caller cancellation signal.
 * @param {number} [attempts=2] - Maximum number of attempts.
 * @param {object} [headers] - Request headers; defaults to a browser-like
 *   user agent and an English accept-language header.
 * @param {number} [timeoutMs] - Per-attempt timeout; falsy means no timeout.
 * @returns {Promise<Response>} The first response obtained.
 * @throws The last fetch error once attempts are exhausted or the caller's
 *   signal is aborted; an Error when `attempts` allows no attempt at all.
 */
async function fetchTextWithRetry(url, signal, attempts = 2, headers = {
  "user-agent": USER_AGENT,
  "accept-language": "en-US,en;q=0.9",
}, timeoutMs) {
  let lastError;
  for (let attempt = 0; attempt < attempts; attempt++) {
    try {
      return await fetch(url, { headers, redirect: "follow", signal: withTimeoutSignal(signal, timeoutMs) });
    } catch (error) {
      lastError = error;
      // The caller cancelled: another attempt would fail the same way, so do
      // not sleep and retry.
      if (signal?.aborted) break;
      // Linear backoff between attempts: 100ms, 200ms, ...
      if (attempt + 1 < attempts) await new Promise((resolve) => setTimeout(resolve, 100 * (attempt + 1)));
    }
  }
  // With attempts <= 0 nothing was tried; never reject with `undefined`.
  throw lastError ?? new Error(`fetchTextWithRetry: no attempts were made for ${url}`);
}
|
|
201
|
+
|
|
202
|
+
/**
 * Normalize `config.allowedSources` into a lookup set.
 *
 * @param {{allowedSources?: unknown[]}} config - Research configuration.
 * @returns {Set<string>|null} Lower-cased entries, or null when the option is
 *   absent, not an array, or empty (meaning "no restriction").
 */
function inferAllowedSources(config) {
  const { allowedSources } = config;
  const hasEntries = Array.isArray(allowedSources) && allowedSources.length !== 0;
  if (!hasEntries) return null;

  const lowered = allowedSources.map((value) => String(value).toLowerCase());
  return new Set(lowered);
}
|
|
206
|
+
|
|
207
|
+
/**
 * Decide whether a search result passes the `allowedSources` option.
 *
 * A result passes when no restriction is configured, when its classified
 * source type is listed (with "official_docs" accepted as an alias for the
 * "official_doc" type), or when its lower-cased hostname is listed.
 *
 * @param {{url: string, title?: string}} result - Search result.
 * @param {object} config - Research configuration.
 * @returns {boolean} True when the result is allowed.
 */
function filterBySourceOptions(result, config) {
  const allowed = inferAllowedSources(config);
  if (!allowed) return true;

  const type = classifySourceType(result.url, result.title);
  const typeListed = allowed.has(type) || (type === "official_doc" && allowed.has("official_docs"));
  if (typeListed) return true;

  let hostname;
  try {
    hostname = new URL(result.url).hostname.toLowerCase();
  } catch {
    // Unparsable URL: it cannot match a hostname entry.
    return false;
  }
  return allowed.has(hostname);
}
|
|
222
|
+
|
|
223
|
+
/**
 * Drop search results that are unusable or of a disallowed source type.
 *
 * A result is kept only when its URL parses, its hostname does not contain
 * "duckduckgo.com", it has a snippet, and - when `config.allowedSourceTypes`
 * is an array - its classified source type is in that array. Any error while
 * checking a result discards that result.
 *
 * @param {Array<{url: string, title?: string, snippet?: string}>} results - Raw results.
 * @param {object} [config] - Research configuration; defaults to the default config.
 * @returns {Array<object>} The results that passed.
 */
function filterSearchResults(results, config = getResearchConfig()) {
  const isUsable = (result) => {
    try {
      const { hostname } = new URL(result.url);
      if (!result.snippet || hostname.includes("duckduckgo.com")) return false;

      const sourceType = classifySourceType(result.url, result.title);
      const typeFilter = config.allowedSourceTypes;
      return !Array.isArray(typeFilter) || typeFilter.includes(sourceType);
    } catch {
      return false;
    }
  };
  return results.filter(isUsable);
}
|
|
236
|
+
|
|
237
|
+
/**
 * Build a search-result entry for an academic paper.
 *
 * @param {string} title - Paper title; passed through `normalizePaperTitle`.
 * @param {string} url - Link to the paper.
 * @param {string} snippet - Abstract or summary text.
 * @param {string|null|undefined} publishDate - Publication date, if known.
 * @returns {{title: string, url: string, snippet: string, publishDate: *, sourceType: "paper"}}
 */
function sourceFromPaper(title, url, snippet, publishDate) {
  const cleanTitle = normalizePaperTitle(title);
  return { title: cleanTitle, url, snippet, publishDate, sourceType: "paper" };
}
|
|
240
|
+
|
|
241
|
+
/**
 * Search the arXiv export API and convert the Atom entries into paper sources.
 *
 * Entries are extracted from the feed with regular expressions; whitespace in
 * titles and summaries is collapsed, and the publication date is reduced to
 * its first 10 characters. Entries without a URL or title are dropped.
 *
 * @param {string} query - Search query.
 * @param {AbortSignal} [signal] - Cancellation signal.
 * @param {object} config - Uses `resultsPerQuery` and `pageTimeoutMs`.
 * @returns {Promise<Array<object>>} Paper sources; an empty array on any failure.
 */
async function searchArxiv(query, signal, config) {
  const capture = (source, pattern) => source.match(pattern)?.[1];
  const collapse = (value) => (value || "").replace(/\s+/g, " ").trim();

  const toPaper = ([, entry]) => {
    const url = capture(entry, /<id>([^<]+)<\/id>/) || "";
    const title = normalizePaperTitle(collapse(capture(entry, /<title>([\s\S]*?)<\/title>/)));
    const summary = collapse(capture(entry, /<summary>([\s\S]*?)<\/summary>/));
    const published = capture(entry, /<published>([^<]+)<\/published>/)?.slice(0, 10);
    return sourceFromPaper(title, url, summary, published);
  };

  try {
    const endpoint = `https://export.arxiv.org/api/query?search_query=all:${encodeURIComponent(query)}&start=0&max_results=${config.resultsPerQuery}`;
    const response = await fetchTextWithRetry(endpoint, signal, 2, {}, config.pageTimeoutMs);
    const feed = await response.text();
    const entries = [...feed.matchAll(/<entry>([\s\S]*?)<\/entry>/g)];
    return entries.map(toPaper).filter((paper) => paper.url && paper.title);
  } catch {
    // Best effort: a failed lookup contributes no results.
    return [];
  }
}
|
|
257
|
+
|
|
258
|
+
/**
 * Search the Semantic Scholar Graph API and convert hits into paper sources.
 *
 * A hit without a URL links to a Semantic Scholar search for its title; a hit
 * with a year gets January 1st of that year as its publication date. Hits
 * whose normalized title is empty are dropped.
 *
 * @param {string} query - Search query.
 * @param {AbortSignal} [signal] - Cancellation signal.
 * @param {object} config - Uses `resultsPerQuery` and `pageTimeoutMs`.
 * @returns {Promise<Array<object>>} Paper sources; an empty array on any failure.
 */
async function searchSemanticScholar(query, signal, config) {
  const toPaper = (item) => {
    const link = item.url || `https://www.semanticscholar.org/search?q=${encodeURIComponent(item.title)}`;
    const publishDate = item.year ? `${item.year}-01-01` : null;
    return sourceFromPaper(item.title, link, item.abstract || "", publishDate);
  };

  try {
    const endpoint = `https://api.semanticscholar.org/graph/v1/paper/search?query=${encodeURIComponent(query)}&limit=${config.resultsPerQuery}&fields=title,abstract,url,year`;
    const response = await fetchTextWithRetry(endpoint, signal, 2, {}, config.pageTimeoutMs);
    const payload = await response.json();
    const hits = payload?.data || [];
    return hits.map(toPaper).filter((paper) => paper.title);
  } catch {
    // Best effort: a failed lookup contributes no results.
    return [];
  }
}
|
|
267
|
+
|
|
268
|
+
/**
 * Search Crossref by title and convert the works into paper sources.
 *
 * The DOI becomes a doi.org link, markup tags in the abstract are replaced
 * with spaces, and the first `published` date-parts entry is rendered as
 * YYYY-MM-DD (a missing month or day defaults to 1). Works without a DOI or
 * title are dropped.
 *
 * @param {string} query - Title query.
 * @param {AbortSignal} [signal] - Cancellation signal.
 * @param {object} config - Uses `resultsPerQuery` and `pageTimeoutMs`.
 * @returns {Promise<Array<object>>} Paper sources; an empty array on any failure.
 */
async function searchCrossref(query, signal, config) {
  const pad = (value, width) => String(value).padStart(width, "0");

  const toPaper = (item) => {
    const doi = item.DOI ? `https://doi.org/${item.DOI}` : "";
    const dateParts = item.published?.["date-parts"]?.[0] || [];
    const publishDate = dateParts.length
      ? `${pad(dateParts[0], 4)}-${pad(dateParts[1] || 1, 2)}-${pad(dateParts[2] || 1, 2)}`
      : null;
    const abstract = String(item.abstract || "").replace(/<[^>]+>/g, " ");
    return sourceFromPaper(item.title?.[0] || "", doi, abstract, publishDate);
  };

  try {
    const endpoint = `https://api.crossref.org/works?query.title=${encodeURIComponent(query)}&rows=${config.resultsPerQuery}`;
    const response = await fetchTextWithRetry(endpoint, signal, 2, { "user-agent": USER_AGENT }, config.pageTimeoutMs);
    const payload = await response.json();
    const works = payload?.message?.items || [];
    return works.map(toPaper).filter((paper) => paper.url && paper.title);
  } catch {
    // Best effort: a failed lookup contributes no results.
    return [];
  }
}
|
|
282
|
+
|
|
283
|
+
/**
 * Run one web search and return ranked results.
 *
 * Three providers are tried in an order decided by `config.searchProvider`
 * (DuckDuckGo HTML, DuckDuckGo Lite, DuckDuckGo via the Jina reader); the first
 * provider yielding at least one filtered result wins. In `academic` mode the
 * results of arXiv, Semantic Scholar and Crossref are appended. Results are
 * ranked with `rankSearchResults` and cached in-process unless `config.isolate`
 * is set.
 *
 * @param {string} query - Search query.
 * @param {AbortSignal} [signal] - Cancellation signal.
 * @param {object} [config] - Research configuration; defaults to the default config.
 * @returns {Promise<Array<object>>} Ranked search results.
 */
export async function searchDuckDuckGo(query, signal, config = getResearchConfig()) {
  const cacheKey = `${query}::${config.resultsPerQuery}::${config.searchProvider || "ddg_html"}::${JSON.stringify({
    allowedSourceTypes: config.allowedSourceTypes || [],
    allowedSources: config.allowedSources || [],
    preferRecent: config.preferRecent || false,
    minYear: config.minYear || "",
    maxYear: config.maxYear || "",
  })}`;
  const cached = config.isolate ? null : getCacheValue(searchCache, cacheKey);
  if (cached) return cached;

  let results = [];
  const providerOrder = config.searchProvider === "lite"
    ? ["lite", "ddg_html", "jina"]
    : config.searchProvider === "jina"
      ? ["jina", "ddg_html", "lite"]
      : ["ddg_html", "lite", "jina"];

  for (const provider of providerOrder) {
    // Once the caller has cancelled, further providers would only fail too.
    if (signal?.aborted) break;
    try {
      if (provider === "ddg_html") {
        const htmlResponse = await fetchTextWithRetry(`https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`, signal, 1);
        results = filterSearchResults(extractDuckDuckGoResults(await htmlResponse.text()), config);
      } else if (provider === "lite") {
        const liteResponse = await fetchTextWithRetry(`https://lite.duckduckgo.com/lite/?q=${encodeURIComponent(query)}`, signal);
        results = filterSearchResults(extractDuckDuckGoLiteResults(await liteResponse.text()), config);
      } else {
        const jinaResponse = await fetchTextWithRetry(`https://r.jina.ai/http://duckduckgo.com/html/?q=${encodeURIComponent(query)}`, signal, 2, {});
        results = filterSearchResults(extractJinaSearchResults(await jinaResponse.text()), config);
      }
    } catch {
      // Provider failed: treat as "no results" and move on to the next one.
      results = [];
    }
    if (results.length > 0) break;
  }

  if (config.mode === "academic") {
    // The three lookups are independent and each resolves to [] on failure,
    // so run them concurrently; Promise.all keeps the original result order.
    const academicGroups = await Promise.all([
      searchArxiv(query, signal, config),
      searchSemanticScholar(query, signal, config),
      searchCrossref(query, signal, config),
    ]);
    results = [...results, ...academicGroups.flat()];
  }

  const ranked = rankSearchResults(results, query, config.resultsPerQuery, config);

  // Do not cache an empty result set: it usually means every provider failed
  // or was rate limited, and an empty array is truthy, so caching it would
  // serve "no results" for the whole TTL.
  const hasResults = Array.isArray(ranked) ? ranked.length > 0 : Boolean(ranked);
  if (config.isolate || !hasResults) return ranked;
  return setCacheValue(searchCache, cacheKey, ranked, SEARCH_CACHE_TTL_MS);
}
|
|
331
|
+
|
|
332
|
+
/**
 * Tell whether a URL points at a login, account, subscription or checkout
 * page, which is not worth fetching as a research source.
 *
 * Only the URL path is inspected, and a keyword must be a whole path-segment
 * prefix (`/login`, `/login/`, `/login.php`). This avoids discarding ordinary
 * pages such as `/accounting-basics` or `/blog/login-flow-guide`, and URLs
 * that merely mention a login path in their query string.
 *
 * @param {string} url - Absolute URL, or a raw path when it cannot be parsed.
 * @returns {boolean} True when the URL should be skipped.
 */
function shouldSkipUrl(url) {
  const raw = String(url ?? "");
  let path;
  try {
    path = new URL(raw).pathname;
  } catch {
    // Not an absolute URL: inspect the text before any query string or fragment.
    path = raw.split(/[?#]/)[0];
  }
  return /\/(?:login|signin|sign-in|account|subscribe|checkout)(?![a-z0-9-])/i.test(path);
}
|
|
335
|
+
|
|
336
|
+
/**
 * Tell whether a page should be read through the Jina reader before trying a
 * direct fetch. This is the case for medium.com, dev.to and substack.com and
 * any of their subdomains.
 *
 * @param {string} url - Page URL.
 * @returns {boolean} True for the hosts above; false otherwise, including
 *   when `url` cannot be parsed (a malformed URL must not make the caller
 *   throw).
 */
function shouldUseJinaFirst(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return false;
  }
  return /(?:^|\.)(?:medium\.com|dev\.to|substack\.com)$/i.test(hostname);
}
|
|
339
|
+
|
|
340
|
+
/**
 * Build a page record from extracted text, or reject it when too short.
 *
 * The text is cut to `config.pageTextLimit` characters and trimmed; code
 * blocks are extracted from the full, uncut text. Properties in `extra`
 * override the computed ones.
 *
 * @param {string} title - Page title.
 * @param {string} url - Page URL.
 * @param {unknown} text - Extracted page text.
 * @param {{pageTextLimit?: number, minPageText?: number}} config - Length limits.
 * @param {object} [extra={}] - Additional properties merged into the record.
 * @returns {object|null} The page record, or null when the trimmed text is
 *   shorter than `config.minPageText`.
 */
function pageFromText(title, url, text, config, extra = {}) {
  const clipped = String(text || "").slice(0, config.pageTextLimit).trim();
  const tooShort = clipped.length < config.minPageText;
  if (tooShort) return null;

  const codeBlocks = extractCodeBlocks(text);
  return { title, url, text: clipped, codeBlocks, ...extra };
}
|
|
345
|
+
|
|
346
|
+
/**
 * Read a page through the Jina reader proxy.
 *
 * The first non-empty line of the reader output, stripped of leading Markdown
 * heading markers, is used as the title (falling back to the URL).
 *
 * @param {string} url - Page URL.
 * @param {AbortSignal} [signal] - Cancellation signal.
 * @param {object} config - Uses `useJinaFallback`, `pageTimeoutMs` and the
 *   page length limits.
 * @returns {Promise<object|null>} A page record, or null when the reader is
 *   disabled, the URL is skipped, the request fails or is answered with an
 *   HTTP error, or the text is too short.
 */
async function fetchJinaPageSource(url, signal, config) {
  if (!config.useJinaFallback || shouldSkipUrl(url)) return null;

  const stripHeading = (line) => line.trim().replace(/^#+\s*/, "");

  try {
    const response = await fetchTextWithRetry(buildJinaReaderUrl(url), signal, 2, {}, config.pageTimeoutMs);
    // The body of a failed reader request is an error message, not the page.
    // Compared against `false` so response-like objects without `ok` still pass.
    if (response.ok === false) return null;

    const body = await response.text();
    const title = body.split("\n").map(stripHeading).find(Boolean) || url;
    return pageFromText(title, url, body, config, { sourceType: classifySourceType(url, title) });
  } catch {
    // Best effort: the reader is an optional source.
    return null;
  }
}
|
|
358
|
+
|
|
359
|
+
/**
 * Check a page's publication year against the configured year bounds.
 *
 * Pages without a usable publication year always pass; only a known year
 * outside `minYear`..`maxYear` is rejected.
 *
 * @param {{publishDate?: unknown}} page - Page record; the year is read from
 *   the first four characters of `publishDate`.
 * @param {{minYear?: number, maxYear?: number, preferRecent?: boolean}} config - Bounds.
 * @returns {boolean} False only when the year is known and out of bounds.
 */
function withinTimeframe(page, config) {
  const { minYear, maxYear, preferRecent } = config;
  if (!(minYear || maxYear || preferRecent)) return true;

  const year = page.publishDate ? Number(String(page.publishDate).slice(0, 4)) : null;
  // null, 0 and NaN all count as "year unknown".
  if (!year) return true;

  const tooOld = Boolean(minYear) && year < minYear;
  const tooNew = Boolean(maxYear) && year > maxYear;
  return !(tooOld || tooNew);
}
|
|
366
|
+
|
|
367
|
+
/**
 * Fetch a web page and turn it into a page record.
 *
 * Strategy:
 *  1. URLs rejected by `shouldSkipUrl` yield null.
 *  2. A cached page is returned when caching is enabled.
 *  3. Hosts flagged by `shouldUseJinaFirst` are read through the Jina reader
 *     first; when that yields nothing, the direct fetch is still attempted.
 *  4. Otherwise the page is fetched directly; the Jina reader is the fallback
 *     when the request fails, answers with an HTTP error, or yields too little
 *     text.
 *  5. A response that is neither HTML nor plain text yields null.
 *
 * The final page is checked with `withinTimeframe` and stored in the page
 * cache unless `config.isolate` is set.
 *
 * @param {string} url - Page URL.
 * @param {AbortSignal} [signal] - Cancellation signal.
 * @param {object} [config] - Research configuration; defaults to the default config.
 * @returns {Promise<object|null>} The page record, or null when nothing usable was retrieved.
 */
export async function fetchPageSource(url, signal, config = getResearchConfig()) {
  if (shouldSkipUrl(url)) return null;

  const cacheKey = `${normalizeUrl(url)}::${config.pageTextLimit}::${JSON.stringify({
    preferRecent: config.preferRecent || false,
    minYear: config.minYear || "",
    maxYear: config.maxYear || "",
    useJinaFallback: Boolean(config.useJinaFallback),
  })}`;
  const cached = config.isolate ? null : getCacheValue(pageCache, cacheKey);
  if (cached) return cached;

  // Apply the timeframe filter and the cache write uniformly on every path.
  const settle = (candidate) => {
    const finalPage = candidate && withinTimeframe(candidate, config) ? candidate : null;
    return config.isolate ? finalPage : setCacheValue(pageCache, cacheKey, finalPage, PAGE_CACHE_TTL_MS);
  };

  // A malformed URL must not reject the whole fetch; just skip the reader-first path.
  let readerFirst = false;
  try {
    readerFirst = shouldUseJinaFirst(url);
  } catch {
    readerFirst = false;
  }

  if (readerFirst) {
    const viaReader = await fetchJinaPageSource(url, signal, config);
    if (viaReader) return settle(viaReader);
    // The reader produced nothing (or is disabled): fall through to a direct fetch.
  }

  let page = null;
  try {
    const response = await fetchTextWithRetry(url, signal, 2, {
      "user-agent": USER_AGENT,
      "accept-language": "en-US,en;q=0.9",
    }, config.pageTimeoutMs);

    // An error page (404, 403, 5xx) is not a source; take the fallback path instead.
    if (response.ok === false) throw new Error(`Unexpected HTTP status ${response.status}`);

    const contentType = response.headers.get("content-type") || "";
    if (!contentType.includes("text/html") && !contentType.includes("text/plain")) return null;

    const body = await response.text();
    const snapshot = extractPageSnapshot(body, response.url || url);
    page = pageFromText(snapshot.title, snapshot.url, snapshot.text, config, {
      publishDate: extractPublishDate(body),
      sourceType: classifySourceType(snapshot.url, snapshot.title),
      codeBlocks: snapshot.codeBlocks,
    });
  } catch {
    page = null;
  }

  // The reader is the fallback unless it was already tried above.
  if (!page && !readerFirst) page = await fetchJinaPageSource(url, signal, config);
  return settle(page);
}
|
|
407
|
+
|
|
408
|
+
/**
 * Load local files as research pages.
 *
 * Each file is read as UTF-8 and converted with `pageFromText`, using the last
 * "/"-separated path segment as the title and `file://<path>` as the URL.
 * Files that cannot be read, or whose text is rejected by `pageFromText`, are
 * left out. The reads are independent, so they run concurrently; the result
 * keeps the order of `paths`.
 *
 * @param {Iterable<string>} paths - File paths to read.
 * @param {object} config - Research configuration (page length limits).
 * @returns {Promise<Array<object>>} Page records marked `sourceType: "file"` and `local: true`.
 */
async function readLocalFiles(paths, config) {
  const loadOne = async (path) => {
    try {
      const text = await readFile(path, "utf8");
      return pageFromText(path.split("/").pop() || path, `file://${path}`, text, config, {
        sourceType: "file",
        publishDate: null,
        local: true,
      });
    } catch {
      // Unreadable files are ignored on purpose: local files are optional extras.
      return null;
    }
  };

  const loaded = await Promise.all(Array.from(paths, loadOne));
  return loaded.filter(Boolean);
}
|
|
425
|
+
|
|
426
|
+
/**
 * Build a synthesis without the model, straight from the fetched pages.
 *
 * Uses at most the first five pages: each becomes a numbered source and a
 * bullet made of the first 180 characters of its whitespace-collapsed text.
 *
 * @param {string} query - The research question.
 * @param {Array<object>} pages - Fetched pages, best first.
 * @returns {{answer: string, bullets: string[], sources: object[], citations: object[]}}
 */
function fallbackSynthesis(query, pages) {
  const topPages = pages.slice(0, Math.min(5, pages.length));

  const entries = topPages.map((page, index) => ({
    number: index + 1,
    title: page.title,
    url: page.url,
    freshness: page.publishDate ? page.publishDate.slice(0, 10) : undefined,
    sourceType: page.sourceType,
    score: page.score,
    authoritative: page.authoritative,
  }));
  const sources = prioritizeSourceEntries(entries, query);

  const bullets = topPages.map((page, index) => {
    const excerpt = page.text.replace(/\s+/g, " ").slice(0, 180).trim();
    return `${excerpt} [${index + 1}]`;
  });

  let answer;
  if (pages.length) {
    answer = `I found ${pages.length} relevant sources for “${query}” [1]. The strongest sources are summarized below.`;
  } else {
    answer = `I could not find enough reliable sources for “${query}”.`;
  }

  const citations = sources.map((source) => ({ text: source.title, sourceIndex: source.number || 0 }));
  return { answer, bullets, sources, citations };
}
|
|
444
|
+
|
|
445
|
+
/**
 * Ask the research model to synthesize an answer from the fetched pages.
 *
 * The model is prompted for JSON (`answer`, `bullets`, `sourceIds`,
 * `citations`). Source ids are coerced to numbers, de-duplicated and limited
 * to valid 1-based page indexes. When the model is unavailable, the reply is
 * not valid JSON of the expected shape, no valid source id remains, or any
 * step throws, `fallbackSynthesis` is used instead.
 *
 * @param {string} query - The research question.
 * @param {Array<object>} pages - Fetched pages; their order defines the source numbers.
 * @param {object} [ctx] - Host context used for the model call.
 * @param {AbortSignal} [signal] - Cancellation signal for the model call.
 * @returns {Promise<{answer: string, bullets: string[], sources: object[], citations: object[]}>}
 */
export async function synthesizeResearch(query, pages, ctx, signal) {
  const typeOf = (page) => page.sourceType || classifySourceType(page.url, page.title);
  const scoreOf = (page) => (typeof page.score === "number" ? page.score : scoreSourceEntry(page, query).total);

  const describeSource = (page, index) => [
    `[${index + 1}] ${page.title}`,
    `URL: ${page.url}`,
    `Type: ${typeOf(page)}`,
    `Score: ${scoreOf(page)}`,
    `Text: ${page.text}`,
  ].join("\n");

  const instructions = [
    "You are a concise research synthesizer.",
    "Answer only from the provided sources.",
    "Return only JSON with this exact shape:",
    '{"answer":"...","bullets":["..."],"sourceIds":[1,2],"citations":[{"text":"...","sourceIndex":1}]}',
    "Rules:",
    "- answer: one short paragraph with inline citations like [1] [2]",
    "- bullets: 3-5 short bullet strings, each with inline citations",
    `Question: ${query}`,
    "Sources:",
  ];
  const prompt = [...instructions, ...pages.map((page, index) => describeSource(page, index))].join("\n\n");

  // Source entry for a 1-based page number.
  const toSourceEntry = (id) => {
    const page = pages[id - 1];
    return {
      number: id,
      title: page.title,
      url: page.url,
      freshness: page.publishDate ? page.publishDate.slice(0, 10) : undefined,
      sourceType: typeOf(page),
      score: scoreOf(page),
      authoritative: typeof page.authoritative === "boolean" ? page.authoritative : scoreSourceEntry(page, query).authoritative,
    };
  };
  const isValidId = (id) => Number.isInteger(id) && id >= 1 && id <= pages.length;

  try {
    const text = await completeWithResearchModel(ctx, signal, prompt, "medium");
    const parsed = text ? parseJsonBlock(text) : null;
    const wellFormed = parsed
      && typeof parsed.answer === "string"
      && Array.isArray(parsed.bullets)
      && Array.isArray(parsed.sourceIds);

    if (wellFormed) {
      const sourceIds = [...new Set(parsed.sourceIds.map((id) => Number(id)).filter(isValidId))];
      if (sourceIds.length > 0) {
        const sources = prioritizeSourceEntries(sourceIds.map(toSourceEntry), query);
        const bullets = parsed.bullets.map((item) => String(item).trim()).filter(Boolean).slice(0, 5);
        const citations = Array.isArray(parsed.citations)
          ? parsed.citations.slice(0, 8)
          : sources.map((source) => ({ text: source.title, sourceIndex: source.number || 0 }));
        return { answer: parsed.answer.trim(), bullets, sources, citations };
      }
    }
  } catch {
    // Best effort: any model or parsing failure falls back to the extractive summary.
  }

  return fallbackSynthesis(query, pages);
}
|
|
494
|
+
|
|
495
|
+
/**
 * Choose the queries for the next follow-up round.
 *
 * Open sub-questions from the sufficiency check are used when present;
 * otherwise a single follow-up query is derived from the current (or root)
 * query. The list is stripped of empty values, de-duplicated and capped at
 * `config.breadth` (default 2, minimum 1).
 *
 * @param {string} rootQuery - The original research question.
 * @param {string} currentQuery - The current follow-up query, if any.
 * @param {{breadth?: number}} config - Research configuration.
 * @param {{openSubQuestions?: string[]}} [sufficiency] - Latest sufficiency evaluation.
 * @returns {string[]} Queries for the next round.
 */
function planSubqueries(rootQuery, currentQuery, config, sufficiency) {
  const open = sufficiency?.openSubQuestions;
  const candidates = open?.length
    ? [...open]
    : [buildFollowUpQuery(currentQuery || rootQuery, [])];

  const unique = [...new Set(candidates.filter(Boolean))];
  const cap = Math.max(1, config.breadth || 2);
  return unique.slice(0, cap);
}
|
|
501
|
+
|
|
502
|
+
/**
 * Render a research result as text via `formatResearchResponse`.
 *
 * @param {{answer: *, bullets: *, sources: *, confidence: *}} result - Research result;
 *   only these four fields are forwarded.
 * @returns {*} Whatever `formatResearchResponse` returns.
 */
function formatResultText(result) {
  const { answer, bullets, sources, confidence } = result;
  return formatResearchResponse({ answer, bullets, sources, confidence });
}
|
|
505
|
+
|
|
506
|
+
/**
 * Build the cache key for a whole research run.
 *
 * The key has three colon-separated parts: the mode, the hash of the query,
 * and a hash of the options that influence the outcome. The property order of
 * the hashed object is part of the key and must stay stable.
 *
 * @param {string} query - The research question.
 * @param {object} config - Resolved research configuration.
 * @returns {string} Cache key of the form `<mode>:<queryHash>:<optionsHash>`.
 */
function modeCacheKey(query, config) {
  const fingerprint = {
    files: config.files || [],
    allowedSources: config.allowedSources || [],
    allowedSourceTypes: config.allowedSourceTypes || [],
    maxPages: config.maxPages,
    maxTurns: config.maxTurns,
    maxQueries: config.maxQueries,
    resultsPerQuery: config.resultsPerQuery,
    preferRecent: config.preferRecent,
    minYear: config.minYear || "",
    maxYear: config.maxYear || "",
    searchProvider: config.searchProvider || "",
  };
  const parts = [config.mode, hashResearchQuery(query), hashText(JSON.stringify(fingerprint))];
  return parts.join(":");
}
|
|
521
|
+
|
|
522
|
+
export async function runWebResearch(query, ctx, signal, onUpdate, mode = "fast") {
|
|
523
|
+
const config = getResearchConfig(mode);
|
|
524
|
+
const cacheKey = modeCacheKey(query, config);
|
|
525
|
+
|
|
526
|
+
if (!config.isolate && !config.force) {
|
|
527
|
+
const memoryHit = getResearchMemory(cacheKey);
|
|
528
|
+
if (memoryHit) return memoryHit;
|
|
529
|
+
const persistentHit = readCachedResult(cacheKey);
|
|
530
|
+
if (persistentHit) {
|
|
531
|
+
setResearchMemory(cacheKey, persistentHit);
|
|
532
|
+
return persistentHit;
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
const emit = (stage, text) => onUpdate?.({ content: [{ type: "text", text: `[pipeline:${stage}] ${text}` }] });
|
|
537
|
+
const startedAt = Date.now();
|
|
538
|
+
const seenUrls = new Set();
|
|
539
|
+
const seenContentHashes = new Set();
|
|
540
|
+
const mergedPages = [];
|
|
541
|
+
const allCodeBlocks = [];
|
|
542
|
+
let subqueries = [];
|
|
543
|
+
let followupRounds = 0;
|
|
544
|
+
let followupQuery = null;
|
|
545
|
+
let conflictDetected = false;
|
|
546
|
+
let conflictSummary = "";
|
|
547
|
+
let conflictingSourcePairs = [];
|
|
548
|
+
let sufficiency = { sufficient: false, confidenceScore: 0.1, missingAspects: [], openSubQuestions: [] };
|
|
549
|
+
let currentQueries = await buildQueries(query, config.mode, ctx, signal);
|
|
550
|
+
subqueries = [...currentQueries];
|
|
551
|
+
|
|
552
|
+
const localPages = await readLocalFiles(config.files || [], config);
|
|
553
|
+
for (const page of localPages) {
|
|
554
|
+
const scored = scoreSourceEntry(page, query);
|
|
555
|
+
const contentHash = hashText(page.text);
|
|
556
|
+
if (seenContentHashes.has(contentHash)) continue;
|
|
557
|
+
seenContentHashes.add(contentHash);
|
|
558
|
+
mergedPages.push({
|
|
559
|
+
...page,
|
|
560
|
+
score: scored.total,
|
|
561
|
+
authoritative: scored.authoritative,
|
|
562
|
+
freshness: scored.freshness,
|
|
563
|
+
sourceType: page.sourceType || scored.sourceType,
|
|
564
|
+
local: true,
|
|
565
|
+
});
|
|
566
|
+
if (Array.isArray(page.codeBlocks)) allCodeBlocks.push(...page.codeBlocks);
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
for (let turn = 0; turn < Math.max(1, config.maxTurns || 1); turn++) {
|
|
570
|
+
emit(turn === 0 ? "plan" : "followup", `Planning ${config.mode} research... turn=${turn + 1}/${config.maxTurns}`);
|
|
571
|
+
const queriesThisTurn = currentQueries.slice(0, config.maxQueries);
|
|
572
|
+
emit("search", `Searching ${queriesThisTurn.length} queries...`);
|
|
573
|
+
|
|
574
|
+
const searchGroups = await Promise.all(queriesThisTurn.map((subquery) => searchDuckDuckGo(subquery, signal, config)));
|
|
575
|
+
const results = rankSearchResults(searchGroups.flat(), query, config.maxPages * 2, config)
|
|
576
|
+
.filter((result) => {
|
|
577
|
+
const key = normalizeUrl(result.url);
|
|
578
|
+
if (seenUrls.has(key)) return false;
|
|
579
|
+
seenUrls.add(key);
|
|
580
|
+
return true;
|
|
581
|
+
})
|
|
582
|
+
.slice(0, config.maxPages);
|
|
583
|
+
|
|
584
|
+
emit("fetch", `Reading ${results.length} sources...`);
|
|
585
|
+
const pageCandidates = await Promise.all(results.map((result) => fetchPageSource(result.url, signal, config)));
|
|
586
|
+
const rankedPages = rankFetchedPages(pageCandidates.filter(Boolean).map((page) => {
|
|
587
|
+
const scored = scoreSourceEntry(page, query);
|
|
588
|
+
return {
|
|
589
|
+
...page,
|
|
590
|
+
score: typeof page.score === "number" ? page.score : scored.total,
|
|
591
|
+
authoritative: typeof page.authoritative === "boolean" ? page.authoritative : scored.authoritative,
|
|
592
|
+
freshness: page.freshness || scored.freshness,
|
|
593
|
+
sourceType: page.sourceType || scored.sourceType,
|
|
594
|
+
text: selectRelevantChunks(page.text, query, config.maxChunksPerPage).join("\n\n") || page.text,
|
|
595
|
+
};
|
|
596
|
+
}).filter((page) => withinTimeframe(page, config)), query, config.maxPages, config);
|
|
597
|
+
|
|
598
|
+
for (const page of prioritizeSourceEntries(rankedPages, query)) {
|
|
599
|
+
const key = normalizeUrl(page.url);
|
|
600
|
+
const contentHash = hashText(page.text);
|
|
601
|
+
if (mergedPages.some((existing) => normalizeUrl(existing.url) === key)) continue;
|
|
602
|
+
if (seenContentHashes.has(contentHash)) continue;
|
|
603
|
+
seenContentHashes.add(contentHash);
|
|
604
|
+
mergedPages.push(page);
|
|
605
|
+
if (Array.isArray(page.codeBlocks)) allCodeBlocks.push(...page.codeBlocks);
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
const conflict = detectConflictSignals(mergedPages);
|
|
609
|
+
conflictDetected = conflict.detected;
|
|
610
|
+
conflictSummary = conflict.conflictSummary || "";
|
|
611
|
+
conflictingSourcePairs = conflict.conflictingSourcePairs || [];
|
|
612
|
+
|
|
613
|
+
const minSources = config.mode === "fast"
|
|
614
|
+
? (mergedPages.some((page) => page.authoritative) ? 1 : Math.max(3, config.minSources || 3))
|
|
615
|
+
: (config.minSources || 3);
|
|
616
|
+
|
|
617
|
+
sufficiency = evaluateSufficiency({
|
|
618
|
+
query,
|
|
619
|
+
sources: mergedPages,
|
|
620
|
+
conflictDetected,
|
|
621
|
+
minSources,
|
|
622
|
+
});
|
|
623
|
+
|
|
624
|
+
if (mergedPages.length >= minSources && sufficiency.confidenceScore >= 0.85 && (!conflictDetected || mergedPages.some((page) => page.authoritative))) {
|
|
625
|
+
sufficiency = { ...sufficiency, sufficient: true };
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
if (sufficiency.sufficient || turn === (config.maxTurns - 1)) break;
|
|
629
|
+
|
|
630
|
+
followupRounds += 1;
|
|
631
|
+
followupQuery = buildFollowUpQuery(query, mergedPages);
|
|
632
|
+
currentQueries = planSubqueries(query, followupQuery, config, sufficiency);
|
|
633
|
+
subqueries = [...new Set([...subqueries, ...currentQueries])];
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
if (mergedPages.length === 0) {
|
|
637
|
+
return {
|
|
638
|
+
ok: false,
|
|
639
|
+
action: "web_research",
|
|
640
|
+
query,
|
|
641
|
+
mode: config.mode,
|
|
642
|
+
subqueries,
|
|
643
|
+
reason: "No readable web sources were retrieved.",
|
|
644
|
+
openSubQuestions: buildFallbackQueries(query),
|
|
645
|
+
error: "No readable web sources were retrieved.",
|
|
646
|
+
};
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
emit("synthesis", `Synthesizing ${mergedPages.length} sources...`);
|
|
650
|
+
const synthesis = await synthesizeResearch(query, mergedPages, ctx, signal);
|
|
651
|
+
const sources = prioritizeSourceEntries(synthesis.sources.map((source) => ({
|
|
652
|
+
...source,
|
|
653
|
+
...(source.number ? {} : { number: undefined }),
|
|
654
|
+
})), query);
|
|
655
|
+
const confidence = buildConfidenceSummary(mergedPages, { conflictDetected, followupRounds });
|
|
656
|
+
const codeBlocks = [...new Set(allCodeBlocks)].slice(0, 5);
|
|
657
|
+
const sourceTypes = [...new Set(sources.map((source) => source.sourceType).filter(Boolean))];
|
|
658
|
+
const openSubQuestions = sufficiency.openSubQuestions.length ? sufficiency.openSubQuestions : (sufficiency.sufficient ? subqueries.slice(0, Math.min(3, subqueries.length)) : []);
|
|
659
|
+
|
|
660
|
+
const factCheck = factCheckAnswer(synthesis.answer, mergedPages);
|
|
661
|
+
const unverifiedRatio = synthesis.answer ? factCheck.unverifiedClaims.length / Math.max(1, factCheck.verifiedClaims.length + factCheck.unverifiedClaims.length) : 0;
|
|
662
|
+
const normalizedResult = createResearchResult({
|
|
663
|
+
answer: synthesis.answer,
|
|
664
|
+
bullets: synthesis.bullets,
|
|
665
|
+
citations: synthesis.citations || [],
|
|
666
|
+
sources,
|
|
667
|
+
codeBlocks,
|
|
668
|
+
sufficient: sufficiency.sufficient && unverifiedRatio <= 0.2,
|
|
669
|
+
missingAspects: sufficiency.missingAspects,
|
|
670
|
+
openSubQuestions,
|
|
671
|
+
conflictSummary: conflictSummary || sufficiency.conflictSummary || "",
|
|
672
|
+
confidence: sufficiency.confidenceScore,
|
|
673
|
+
sourceTypes,
|
|
674
|
+
unverifiedClaims: factCheck.unverifiedClaims,
|
|
675
|
+
meta: {
|
|
676
|
+
turns: followupRounds + 1,
|
|
677
|
+
sitesVisited: mergedPages.length,
|
|
678
|
+
totalFetchTimeMs: Date.now() - startedAt,
|
|
679
|
+
cacheHit: false,
|
|
680
|
+
},
|
|
681
|
+
});
|
|
682
|
+
|
|
683
|
+
const result = {
|
|
684
|
+
ok: true,
|
|
685
|
+
action: "web_research",
|
|
686
|
+
query,
|
|
687
|
+
mode: config.mode,
|
|
688
|
+
subqueries,
|
|
689
|
+
followupRounds,
|
|
690
|
+
followupQuery,
|
|
691
|
+
conflictDetected,
|
|
692
|
+
conflictSummary: normalizedResult.conflictSummary,
|
|
693
|
+
conflictingSourcePairs,
|
|
694
|
+
pagesRead: mergedPages.length,
|
|
695
|
+
answer: normalizedResult.answer,
|
|
696
|
+
bullets: normalizedResult.bullets,
|
|
697
|
+
citations: normalizedResult.citations,
|
|
698
|
+
sources: normalizedResult.sources,
|
|
699
|
+
sourceTypes,
|
|
700
|
+
codeBlocks: normalizedResult.codeBlocks,
|
|
701
|
+
confidence,
|
|
702
|
+
meta: normalizedResult.meta,
|
|
703
|
+
confidenceScore: sufficiency.confidenceScore,
|
|
704
|
+
authoritativeSourcesFound: sufficiency.authoritativeSourcesFound,
|
|
705
|
+
sufficient: normalizedResult.sufficient,
|
|
706
|
+
followupRecommended: !normalizedResult.sufficient,
|
|
707
|
+
openSubQuestions: normalizedResult.openSubQuestions,
|
|
708
|
+
missingAspects: normalizedResult.missingAspects,
|
|
709
|
+
unverifiedClaims: normalizedResult.unverifiedClaims,
|
|
710
|
+
contentText: formatResultText({ answer: normalizedResult.answer, bullets: normalizedResult.bullets, sources: normalizedResult.sources, confidence }),
|
|
711
|
+
};
|
|
712
|
+
|
|
713
|
+
setResearchMemory(cacheKey, result);
|
|
714
|
+
writeCachedResult(cacheKey, result, config.cacheTtlMs);
|
|
715
|
+
return result;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
// Re-export helpers as part of this module's public surface:
// - compactResearchPayload is imported from ./research.js at the top of the file.
// - clearResearchMemory is brought into scope elsewhere in this file
//   (NOTE(review): presumably from the research-memory module — confirm).
export { compactResearchPayload, clearResearchMemory };
|