webpeel 0.21.3 → 0.21.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/fetch.js +48 -4
- package/dist/cli/utils.js +4 -1
- package/dist/cli.js +5 -0
- package/dist/core/deep-research.d.ts +53 -1
- package/dist/core/deep-research.js +219 -22
- package/dist/core/domain-extractors.js +20 -10
- package/dist/core/llm-provider.d.ts +5 -2
- package/dist/core/llm-provider.js +80 -2
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +276 -0
- package/dist/server/app.js +2 -0
- package/dist/server/routes/ask.js +61 -26
- package/dist/server/routes/deep-research.js +1 -0
- package/dist/server/routes/extract.d.ts +9 -3
- package/dist/server/routes/extract.js +159 -81
- package/package.json +1 -1
|
package/dist/cli/commands/fetch.js
CHANGED
|
@@ -44,6 +44,10 @@ async function runStdin(options) {
|
|
|
44
44
|
// ─── runFetch ─────────────────────────────────────────────────────────────────
|
|
45
45
|
// Main fetch handler — shared with the `pipe` and `ask` subcommands
|
|
46
46
|
export async function runFetch(url, options) {
|
|
47
|
+
// --silent: suppress all log output (set env var before any logger fires)
|
|
48
|
+
if (options.silent && !process.env.WEBPEEL_LOG_LEVEL) {
|
|
49
|
+
process.env.WEBPEEL_LOG_LEVEL = 'silent';
|
|
50
|
+
}
|
|
47
51
|
// --content-only: override all output flags — we just want raw content
|
|
48
52
|
if (options.contentOnly) {
|
|
49
53
|
options.silent = true;
|
|
@@ -452,12 +456,25 @@ export async function runFetch(url, options) {
|
|
|
452
456
|
// Do NOT set extract here — peel runs normally, LLM extraction happens below.
|
|
453
457
|
}
|
|
454
458
|
else if (options.extract) {
|
|
455
|
-
// CSS
|
|
459
|
+
// Smart extract: detect schema format vs CSS selectors
|
|
460
|
+
let extractJson;
|
|
456
461
|
try {
|
|
457
|
-
|
|
462
|
+
extractJson = JSON.parse(options.extract);
|
|
458
463
|
}
|
|
459
464
|
catch {
|
|
460
|
-
throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')'), { _code: 'FETCH_FAILED' });
|
|
465
|
+
throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\' or \'{"company": "string"}\')'), { _code: 'FETCH_FAILED' });
|
|
466
|
+
}
|
|
467
|
+
// If all values are type names (string/boolean/number/array/object),
|
|
468
|
+
// treat as structured schema extraction (routed to extractStructured after fetch).
|
|
469
|
+
// Otherwise treat as CSS selector map.
|
|
470
|
+
const { isTypeSchema } = await import('../../core/structured-extract.js');
|
|
471
|
+
if (isTypeSchema(extractJson)) {
|
|
472
|
+
// Mark for post-fetch structured extraction (handled below)
|
|
473
|
+
options._structuredSchema = extractJson;
|
|
474
|
+
}
|
|
475
|
+
else {
|
|
476
|
+
// CSS-based extraction
|
|
477
|
+
extract = { selectors: extractJson };
|
|
461
478
|
}
|
|
462
479
|
}
|
|
463
480
|
// Validate maxTokens
|
|
@@ -786,6 +803,32 @@ export async function runFetch(url, options) {
|
|
|
786
803
|
console.error(`⚠ ${warningMsg}`);
|
|
787
804
|
}
|
|
788
805
|
}
|
|
806
|
+
// --- Structured schema extraction (--extract with type schema or --extract-prompt) ---
|
|
807
|
+
if (options._structuredSchema || options.extractPrompt) {
|
|
808
|
+
const { extractStructured, simpleToExtractionSchema } = await import('../../core/structured-extract.js');
|
|
809
|
+
const rawSchema = options._structuredSchema;
|
|
810
|
+
const schema = rawSchema
|
|
811
|
+
? simpleToExtractionSchema(rawSchema)
|
|
812
|
+
: { type: 'object', properties: { result: { type: 'string', description: options.extractPrompt } } };
|
|
813
|
+
const strResult = await extractStructured(result.content, schema, undefined, // No LLM config — use heuristic (no key needed)
|
|
814
|
+
options.extractPrompt);
|
|
815
|
+
if (isJson) {
|
|
816
|
+
await writeStdout(JSON.stringify({
|
|
817
|
+
success: true,
|
|
818
|
+
data: strResult.data,
|
|
819
|
+
confidence: strResult.confidence,
|
|
820
|
+
method: 'heuristic',
|
|
821
|
+
}, null, 2) + '\n');
|
|
822
|
+
}
|
|
823
|
+
else {
|
|
824
|
+
await writeStdout(JSON.stringify(strResult.data, null, 2) + '\n');
|
|
825
|
+
if (!options.silent) {
|
|
826
|
+
console.error(`\n📊 Structured extraction: confidence=${(strResult.confidence * 100).toFixed(0)}% (heuristic)`);
|
|
827
|
+
}
|
|
828
|
+
}
|
|
829
|
+
await cleanup();
|
|
830
|
+
process.exit(0);
|
|
831
|
+
}
|
|
789
832
|
// --- LLM-based extraction (post-peel) ---
|
|
790
833
|
if (options.llmExtract || options.extractSchema) {
|
|
791
834
|
const { extractWithLLM } = await import('../../core/llm-extract.js');
|
|
@@ -1091,7 +1134,8 @@ export function registerFetchCommands(program) {
|
|
|
1091
1134
|
.option('--full', 'Alias for --raw — full page content, no budget')
|
|
1092
1135
|
.option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
|
|
1093
1136
|
.option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
|
|
1094
|
-
.option('--extract <json>', 'Extract structured data using CSS selectors (
|
|
1137
|
+
.option('--extract <json>', 'Extract structured data using CSS selectors or type schema (e.g., \'{"title": "h1"}\' for CSS, \'{"name": "string"}\' for schema)')
|
|
1138
|
+
.option('--extract-prompt <prompt>', 'Natural language prompt for structured extraction (no LLM key needed — uses heuristics)')
|
|
1095
1139
|
.option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
|
|
1096
1140
|
.option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
|
|
1097
1141
|
.option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
|
package/dist/cli/utils.js
CHANGED
|
@@ -33,7 +33,10 @@ export async function checkForUpdates() {
|
|
|
33
33
|
const data = await res.json();
|
|
34
34
|
const latest = data.version;
|
|
35
35
|
if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
|
|
36
|
-
|
|
36
|
+
// Skip update notice in silent mode
|
|
37
|
+
if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
|
|
38
|
+
console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
|
|
39
|
+
}
|
|
37
40
|
}
|
|
38
41
|
}
|
|
39
42
|
catch { /* silently ignore — don't slow down the user */ }
|
package/dist/cli.js
CHANGED
|
@@ -22,6 +22,11 @@ import { registerInteractCommands } from './cli/commands/interact.js';
|
|
|
22
22
|
import { registerAuthCommands } from './cli/commands/auth.js';
|
|
23
23
|
import { registerScreenshotCommands } from './cli/commands/screenshot.js';
|
|
24
24
|
import { registerJobsCommands } from './cli/commands/jobs.js';
|
|
25
|
+
// ── Early silent/log-level detection (must happen before any async module code) ──
|
|
26
|
+
// Set WEBPEEL_LOG_LEVEL early so logger checks see it when async IIFEs fire.
|
|
27
|
+
if (!process.env.WEBPEEL_LOG_LEVEL && process.argv.includes('--silent')) {
|
|
28
|
+
process.env.WEBPEEL_LOG_LEVEL = 'silent';
|
|
29
|
+
}
|
|
25
30
|
// ── Verb alias intercept (before Commander parses) ────────────────────────────
|
|
26
31
|
// "webpeel fetch <url>" → "webpeel <url>"
|
|
27
32
|
// Note: 'read' is intentionally excluded — it's a registered subcommand.
|
|
package/dist/core/deep-research.d.ts
CHANGED
|
@@ -12,8 +12,9 @@
|
|
|
12
12
|
* 6. Re-Search Loop — Generate new queries if gaps found (max N rounds)
|
|
13
13
|
* 7. Synthesis — LLM generates final cited report
|
|
14
14
|
*/
|
|
15
|
+
import { type WebSearchResult } from './search-provider.js';
|
|
15
16
|
import { type LLMConfig } from './llm-provider.js';
|
|
16
|
-
export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'synthesizing' | 'done' | 'error';
|
|
17
|
+
export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'verification' | 'synthesizing' | 'done' | 'error';
|
|
17
18
|
export interface DeepResearchProgressEvent {
|
|
18
19
|
type: ProgressEventType;
|
|
19
20
|
message: string;
|
|
@@ -54,6 +55,56 @@ export interface DeepResearchResponse {
|
|
|
54
55
|
};
|
|
55
56
|
elapsed: number;
|
|
56
57
|
}
|
|
58
|
+
/** Source credibility assessment */
|
|
59
|
+
export interface SourceCredibility {
|
|
60
|
+
/** Credibility tier */
|
|
61
|
+
tier: 'official' | 'verified' | 'general';
|
|
62
|
+
/** Star rating (1–3) */
|
|
63
|
+
stars: number;
|
|
64
|
+
/** Human-readable label */
|
|
65
|
+
label: string;
|
|
66
|
+
}
|
|
67
|
+
interface FetchedSource {
|
|
68
|
+
result: WebSearchResult;
|
|
69
|
+
content: string;
|
|
70
|
+
relevanceScore: number;
|
|
71
|
+
subQuery: string;
|
|
72
|
+
/** Credibility assessment (populated after fetchSources) */
|
|
73
|
+
credibility?: SourceCredibility;
|
|
74
|
+
}
|
|
75
|
+
/**
|
|
76
|
+
* Assess the credibility of a source URL.
|
|
77
|
+
*
|
|
78
|
+
* Returns:
|
|
79
|
+
* - tier: 'official' | 'verified' | 'general'
|
|
80
|
+
* - stars: 3 / 2 / 1
|
|
81
|
+
* - label: human-readable string for the synthesis prompt
|
|
82
|
+
*/
|
|
83
|
+
export declare function getSourceCredibility(url: string): SourceCredibility;
|
|
84
|
+
/** Render stars string for a credibility tier */
|
|
85
|
+
export declare function starsString(stars: number): string;
|
|
86
|
+
interface GapDetectionResult {
|
|
87
|
+
hasEnoughInfo: boolean;
|
|
88
|
+
gaps: string[];
|
|
89
|
+
additionalQueries: string[];
|
|
90
|
+
/** Detected source conflicts (optional, from LLM analysis) */
|
|
91
|
+
conflicts?: string[];
|
|
92
|
+
/** Overall confidence level based on source quality */
|
|
93
|
+
confidence?: 'high' | 'medium' | 'low';
|
|
94
|
+
}
|
|
95
|
+
interface VerificationSummary {
|
|
96
|
+
conflicts: string[];
|
|
97
|
+
confidence: 'high' | 'medium' | 'low';
|
|
98
|
+
sourceDiversity: boolean;
|
|
99
|
+
officialCount: number;
|
|
100
|
+
verifiedCount: number;
|
|
101
|
+
generalCount: number;
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* Compute a verification summary from fetched sources and optional gap detection result.
|
|
105
|
+
* Used to emit the 'verification' progress event before synthesis.
|
|
106
|
+
*/
|
|
107
|
+
export declare function computeVerificationSummary(sources: FetchedSource[], gapResult?: GapDetectionResult): VerificationSummary;
|
|
57
108
|
/**
|
|
58
109
|
* Run a deep research session.
|
|
59
110
|
*
|
|
@@ -61,3 +112,4 @@ export interface DeepResearchResponse {
|
|
|
61
112
|
* relevance scoring → gap detection → re-search loop → synthesis.
|
|
62
113
|
*/
|
|
63
114
|
export declare function runDeepResearch(req: DeepResearchRequest): Promise<DeepResearchResponse>;
|
|
115
|
+
export {};
|
|
package/dist/core/deep-research.js
CHANGED
|
@@ -39,6 +39,85 @@ function normalizeUrl(url) {
|
|
|
39
39
|
return url.toLowerCase().replace(/^https?:\/\/(www\.)?/, '').replace(/\/$/, '');
|
|
40
40
|
}
|
|
41
41
|
}
|
|
42
|
+
/** Extract bare hostname (no www) from a URL, or return empty string on failure */
|
|
43
|
+
function extractDomain(url) {
|
|
44
|
+
try {
|
|
45
|
+
return new URL(url).hostname.toLowerCase().replace(/^www\./, '');
|
|
46
|
+
}
|
|
47
|
+
catch {
|
|
48
|
+
return url.toLowerCase().replace(/^https?:\/\/(www\.)?/, '').split('/')[0] ?? '';
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
// Source Credibility
|
|
53
|
+
// ---------------------------------------------------------------------------
|
|
54
|
+
/** Official TLDs and hostnames that indicate high-authority sources */
|
|
55
|
+
const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
|
|
56
|
+
const OFFICIAL_HOSTNAMES = new Set([
|
|
57
|
+
// Academic / research
|
|
58
|
+
'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
|
|
59
|
+
'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
|
|
60
|
+
'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
|
|
61
|
+
// International organisations
|
|
62
|
+
'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
|
|
63
|
+
// Official tech documentation
|
|
64
|
+
'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
|
|
65
|
+
'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
|
|
66
|
+
'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
|
|
67
|
+
]);
|
|
68
|
+
const VERIFIED_HOSTNAMES = new Set([
|
|
69
|
+
// Encyclopaedia / reference
|
|
70
|
+
'wikipedia.org', 'en.wikipedia.org',
|
|
71
|
+
// Reputable news agencies
|
|
72
|
+
'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
|
|
73
|
+
'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
|
|
74
|
+
// Developer resources
|
|
75
|
+
'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
|
|
76
|
+
'crates.io', 'docs.rs', 'packagist.org',
|
|
77
|
+
// Official cloud / vendor docs
|
|
78
|
+
'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
|
|
79
|
+
'azure.microsoft.com', 'registry.terraform.io',
|
|
80
|
+
]);
|
|
81
|
+
/**
|
|
82
|
+
* Assess the credibility of a source URL.
|
|
83
|
+
*
|
|
84
|
+
* Returns:
|
|
85
|
+
* - tier: 'official' | 'verified' | 'general'
|
|
86
|
+
* - stars: 3 / 2 / 1
|
|
87
|
+
* - label: human-readable string for the synthesis prompt
|
|
88
|
+
*/
|
|
89
|
+
export function getSourceCredibility(url) {
|
|
90
|
+
try {
|
|
91
|
+
const hostname = new URL(url).hostname.toLowerCase().replace(/^www\./, '');
|
|
92
|
+
// Check official TLDs
|
|
93
|
+
for (const tld of OFFICIAL_TLDS) {
|
|
94
|
+
if (hostname.endsWith(tld)) {
|
|
95
|
+
return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
// Check known official hostnames
|
|
99
|
+
if (OFFICIAL_HOSTNAMES.has(hostname)) {
|
|
100
|
+
return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
|
|
101
|
+
}
|
|
102
|
+
// Check known verified hostnames
|
|
103
|
+
if (VERIFIED_HOSTNAMES.has(hostname)) {
|
|
104
|
+
return { tier: 'verified', stars: 2, label: 'VERIFIED' };
|
|
105
|
+
}
|
|
106
|
+
// Everything else
|
|
107
|
+
return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
|
|
108
|
+
}
|
|
109
|
+
catch {
|
|
110
|
+
return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
/** Render stars string for a credibility tier */
|
|
114
|
+
export function starsString(stars) {
|
|
115
|
+
if (stars >= 3)
|
|
116
|
+
return '★★★';
|
|
117
|
+
if (stars >= 2)
|
|
118
|
+
return '★★☆';
|
|
119
|
+
return '★☆☆';
|
|
120
|
+
}
|
|
42
121
|
// ---------------------------------------------------------------------------
|
|
43
122
|
// LLM call with merged token tracking
|
|
44
123
|
// ---------------------------------------------------------------------------
|
|
@@ -182,9 +261,11 @@ async function fetchSources(searchResults, maxSources, signal) {
|
|
|
182
261
|
}));
|
|
183
262
|
for (const outcome of settled) {
|
|
184
263
|
if (outcome.status === 'fulfilled') {
|
|
264
|
+
const src = outcome.value;
|
|
185
265
|
fetched.push({
|
|
186
|
-
...
|
|
266
|
+
...src,
|
|
187
267
|
relevanceScore: 0, // filled in step 4
|
|
268
|
+
credibility: getSourceCredibility(src.result.url),
|
|
188
269
|
});
|
|
189
270
|
}
|
|
190
271
|
}
|
|
@@ -224,7 +305,43 @@ function scoreSources(sources, question) {
|
|
|
224
305
|
});
|
|
225
306
|
}
|
|
226
307
|
async function detectGaps(question, sources, config, tokens, signal) {
|
|
227
|
-
//
|
|
308
|
+
// ── Heuristic pre-checks (no LLM call needed) ──────────────────────────
|
|
309
|
+
if (sources.length >= 3) {
|
|
310
|
+
// Heuristic 1: All sources from the same domain → need diversity
|
|
311
|
+
const domains = sources.map((s) => extractDomain(s.result.url));
|
|
312
|
+
const uniqueDomains = new Set(domains.filter((d) => d.length > 0));
|
|
313
|
+
if (uniqueDomains.size === 1) {
|
|
314
|
+
const soloDomain = [...uniqueDomains][0];
|
|
315
|
+
return {
|
|
316
|
+
hasEnoughInfo: false,
|
|
317
|
+
gaps: [
|
|
318
|
+
`All ${sources.length} sources are from the same domain (${soloDomain}). Diverse sources needed for reliable research.`,
|
|
319
|
+
],
|
|
320
|
+
additionalQueries: [
|
|
321
|
+
`${question} alternative perspectives`,
|
|
322
|
+
`${question} overview explanation`,
|
|
323
|
+
],
|
|
324
|
+
conflicts: [],
|
|
325
|
+
confidence: 'low',
|
|
326
|
+
};
|
|
327
|
+
}
|
|
328
|
+
// Heuristic 2: Question implies need for official docs but no official sources found
|
|
329
|
+
const hasOfficialSource = sources.some((s) => (s.credibility || getSourceCredibility(s.result.url)).tier === 'official');
|
|
330
|
+
const questionWantsOfficial = /\b(official|documentation|docs|policy|government|authority|academic|standards?|specification|rfc)\b/i.test(question);
|
|
331
|
+
if (!hasOfficialSource && questionWantsOfficial) {
|
|
332
|
+
return {
|
|
333
|
+
hasEnoughInfo: false,
|
|
334
|
+
gaps: ['No official or academic sources found. The question requires authoritative documentation.'],
|
|
335
|
+
additionalQueries: [
|
|
336
|
+
`${question} site:.gov OR site:.edu`,
|
|
337
|
+
`${question} official documentation`,
|
|
338
|
+
],
|
|
339
|
+
conflicts: [],
|
|
340
|
+
confidence: 'low',
|
|
341
|
+
};
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
// ── LLM-based gap + conflict detection ─────────────────────────────────
|
|
228
345
|
const topSources = sources
|
|
229
346
|
.sort((a, b) => b.relevanceScore - a.relevanceScore)
|
|
230
347
|
.slice(0, 8);
|
|
@@ -240,40 +357,44 @@ async function detectGaps(question, sources, config, tokens, signal) {
|
|
|
240
357
|
content: [
|
|
241
358
|
'You are a research quality assessor. Given a question and the sources collected so far,',
|
|
242
359
|
'determine if there is sufficient information to write a comprehensive answer.',
|
|
360
|
+
'Also detect any factual conflicts between sources.',
|
|
243
361
|
'',
|
|
244
362
|
'Respond in this EXACT JSON format (no markdown, no code blocks):',
|
|
245
363
|
'{',
|
|
246
364
|
' "hasEnoughInfo": boolean,',
|
|
247
365
|
' "gaps": ["gap1", "gap2"],',
|
|
248
|
-
' "additionalQueries": ["query1", "query2"]',
|
|
366
|
+
' "additionalQueries": ["query1", "query2"],',
|
|
367
|
+
' "conflicts": ["Source A says X while Source B says Y"],',
|
|
368
|
+
' "confidence": "high" | "medium" | "low"',
|
|
249
369
|
'}',
|
|
250
370
|
'',
|
|
251
371
|
'"gaps" should be 0-3 specific aspects not covered by the sources.',
|
|
252
372
|
'"additionalQueries" should be 0-3 new search queries to fill those gaps.',
|
|
373
|
+
'"conflicts" should be 0-3 factual disagreements found between sources.',
|
|
374
|
+
'"confidence": high = consistent official sources, medium = mixed, low = conflicting or poor sources.',
|
|
253
375
|
'If hasEnoughInfo is true, set gaps and additionalQueries to empty arrays.',
|
|
254
376
|
].join('\n'),
|
|
255
377
|
},
|
|
256
378
|
{
|
|
257
379
|
role: 'user',
|
|
258
|
-
content: `Question: "${question}"\n\nSources collected:\n\n${contextSummary}\n\nAnalyze coverage and
|
|
380
|
+
content: `Question: "${question}"\n\nSources collected:\n\n${contextSummary}\n\nAnalyze coverage, gaps, and conflicts:`,
|
|
259
381
|
},
|
|
260
382
|
];
|
|
261
383
|
let text;
|
|
262
384
|
try {
|
|
263
385
|
text = await callWithTracking(config, messages, tokens, {
|
|
264
386
|
signal,
|
|
265
|
-
maxTokens:
|
|
387
|
+
maxTokens: 700,
|
|
266
388
|
});
|
|
267
389
|
}
|
|
268
390
|
catch (err) {
|
|
269
391
|
if (isFreeTierLimitError(err))
|
|
270
392
|
throw err;
|
|
271
393
|
// On LLM failure, assume we have enough info
|
|
272
|
-
return { hasEnoughInfo: true, gaps: [], additionalQueries: [] };
|
|
394
|
+
return { hasEnoughInfo: true, gaps: [], additionalQueries: [], conflicts: [], confidence: 'medium' };
|
|
273
395
|
}
|
|
274
396
|
// Parse JSON response
|
|
275
397
|
try {
|
|
276
|
-
// Strip markdown code fences if present
|
|
277
398
|
const cleaned = text
|
|
278
399
|
.replace(/```json\s*/gi, '')
|
|
279
400
|
.replace(/```\s*/g, '')
|
|
@@ -285,28 +406,80 @@ async function detectGaps(question, sources, config, tokens, signal) {
|
|
|
285
406
|
additionalQueries: Array.isArray(json.additionalQueries)
|
|
286
407
|
? json.additionalQueries.slice(0, 3)
|
|
287
408
|
: [],
|
|
409
|
+
conflicts: Array.isArray(json.conflicts) ? json.conflicts.slice(0, 3) : [],
|
|
410
|
+
confidence: ['high', 'medium', 'low'].includes(String(json.confidence))
|
|
411
|
+
? json.confidence
|
|
412
|
+
: 'medium',
|
|
288
413
|
};
|
|
289
414
|
}
|
|
290
415
|
catch {
|
|
291
|
-
|
|
292
|
-
|
|
416
|
+
return { hasEnoughInfo: true, gaps: [], additionalQueries: [], conflicts: [], confidence: 'medium' };
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
/**
|
|
420
|
+
* Compute a verification summary from fetched sources and optional gap detection result.
|
|
421
|
+
* Used to emit the 'verification' progress event before synthesis.
|
|
422
|
+
*/
|
|
423
|
+
export function computeVerificationSummary(sources, gapResult) {
|
|
424
|
+
const credibilities = sources.map((s) => s.credibility || getSourceCredibility(s.result.url));
|
|
425
|
+
const officialCount = credibilities.filter((c) => c.tier === 'official').length;
|
|
426
|
+
const verifiedCount = credibilities.filter((c) => c.tier === 'verified').length;
|
|
427
|
+
const generalCount = credibilities.filter((c) => c.tier === 'general').length;
|
|
428
|
+
const total = sources.length || 1;
|
|
429
|
+
// Source diversity: at least 3 unique domains (or all are diverse if < 3 sources)
|
|
430
|
+
const domains = new Set(sources.map((s) => extractDomain(s.result.url)).filter((d) => d.length > 0));
|
|
431
|
+
const sourceDiversity = domains.size >= Math.min(3, total);
|
|
432
|
+
// Compute confidence from source quality
|
|
433
|
+
let confidence;
|
|
434
|
+
if (gapResult?.confidence) {
|
|
435
|
+
confidence = gapResult.confidence;
|
|
436
|
+
}
|
|
437
|
+
else {
|
|
438
|
+
const highQualityRatio = (officialCount + verifiedCount) / total;
|
|
439
|
+
if (officialCount >= 2 || highQualityRatio >= 0.5) {
|
|
440
|
+
confidence = 'high';
|
|
441
|
+
}
|
|
442
|
+
else if (verifiedCount >= 1 || highQualityRatio >= 0.25) {
|
|
443
|
+
confidence = 'medium';
|
|
444
|
+
}
|
|
445
|
+
else {
|
|
446
|
+
confidence = 'low';
|
|
447
|
+
}
|
|
293
448
|
}
|
|
449
|
+
const conflicts = gapResult?.conflicts ?? [];
|
|
450
|
+
return { conflicts, confidence, sourceDiversity, officialCount, verifiedCount, generalCount };
|
|
294
451
|
}
|
|
295
452
|
// ---------------------------------------------------------------------------
|
|
296
453
|
// Step 7: Synthesis
|
|
297
454
|
// ---------------------------------------------------------------------------
|
|
298
455
|
async function synthesizeReport(question, sources, config, tokens, opts) {
|
|
299
|
-
// Sort by
|
|
456
|
+
// Sort by credibility tier first (official > verified > general), then by relevance
|
|
457
|
+
const tierOrder = { official: 0, verified: 1, general: 2 };
|
|
300
458
|
const topSources = sources
|
|
301
|
-
.
|
|
459
|
+
.map((s) => ({ ...s, credibility: s.credibility || getSourceCredibility(s.result.url) }))
|
|
460
|
+
.sort((a, b) => {
|
|
461
|
+
const tierDiff = (tierOrder[a.credibility.tier] ?? 2) - (tierOrder[b.credibility.tier] ?? 2);
|
|
462
|
+
if (tierDiff !== 0)
|
|
463
|
+
return tierDiff;
|
|
464
|
+
return b.relevanceScore - a.relevanceScore;
|
|
465
|
+
})
|
|
302
466
|
.slice(0, 15);
|
|
303
|
-
// Build context
|
|
467
|
+
// Build context with credibility labels
|
|
304
468
|
const contextParts = [];
|
|
305
469
|
const citations = [];
|
|
306
470
|
topSources.forEach((source, i) => {
|
|
307
471
|
const idx = i + 1;
|
|
472
|
+
const cred = source.credibility;
|
|
473
|
+
const stars = starsString(cred.stars);
|
|
308
474
|
const sanitized = sanitizeForLLM(truncate(source.content || source.result.snippet || '', 3000));
|
|
309
|
-
contextParts.push(
|
|
475
|
+
contextParts.push([
|
|
476
|
+
`SOURCE [${idx}] ${stars}`,
|
|
477
|
+
`Title: ${source.result.title}`,
|
|
478
|
+
`URL: ${source.result.url}`,
|
|
479
|
+
`Credibility: ${cred.label}`,
|
|
480
|
+
'',
|
|
481
|
+
sanitized.content,
|
|
482
|
+
].join('\n'));
|
|
310
483
|
citations.push({
|
|
311
484
|
index: idx,
|
|
312
485
|
title: source.result.title,
|
|
@@ -321,19 +494,27 @@ async function synthesizeReport(question, sources, config, tokens, opts) {
|
|
|
321
494
|
role: 'system',
|
|
322
495
|
content: [
|
|
323
496
|
'You are a research analyst that writes comprehensive, well-cited reports.',
|
|
324
|
-
'
|
|
325
|
-
'
|
|
326
|
-
'
|
|
327
|
-
'
|
|
328
|
-
'
|
|
329
|
-
'
|
|
330
|
-
' -
|
|
331
|
-
'
|
|
497
|
+
'Each source is rated by credibility:',
|
|
498
|
+
' ★★★ = OFFICIAL SOURCE (government, academic, official docs) — highest authority',
|
|
499
|
+
' ★★☆ = VERIFIED (reputable news, Wikipedia, major developer platforms)',
|
|
500
|
+
' ★☆☆ = UNVERIFIED (blogs, forums, unknown sites) — use with caution',
|
|
501
|
+
'',
|
|
502
|
+
'Rules:',
|
|
503
|
+
' - Prioritize official sources [★★★] over unverified ones [★☆☆]',
|
|
504
|
+
' - If sources disagree, note the conflict and trust the higher-credibility source',
|
|
505
|
+
' - Cite every factual claim with [1], [2], etc.',
|
|
506
|
+
' - Use ONLY the provided sources — do not fabricate information or citations',
|
|
507
|
+
' - Structure your report with:',
|
|
508
|
+
' • Executive Summary',
|
|
509
|
+
' • Key Findings (with citations)',
|
|
510
|
+
' • Detailed Analysis',
|
|
511
|
+
' • Conclusion',
|
|
512
|
+
' - End with: **Confidence: HIGH/MEDIUM/LOW** based on source quality and agreement',
|
|
332
513
|
].join('\n'),
|
|
333
514
|
},
|
|
334
515
|
{
|
|
335
516
|
role: 'user',
|
|
336
|
-
content: `Research question: "${question}"\n\nSources:\n\n${context}\n\nWrite a comprehensive research report with citations:`,
|
|
517
|
+
content: `Research question: "${question}"\n\nSources (ranked by credibility):\n\n${context}\n\nWrite a comprehensive research report with citations:`,
|
|
337
518
|
},
|
|
338
519
|
];
|
|
339
520
|
const report = await callWithTracking(config, messages, tokens, {
|
|
@@ -374,6 +555,7 @@ export async function runDeepResearch(req) {
|
|
|
374
555
|
const allSources = [];
|
|
375
556
|
const seenUrls = new Set();
|
|
376
557
|
let usedQueries = new Set();
|
|
558
|
+
let lastGapResult;
|
|
377
559
|
// ── Round 0..maxRounds ────────────────────────────────────────────────────
|
|
378
560
|
let currentQueries = [];
|
|
379
561
|
for (let round = 0; round < maxRounds; round++) {
|
|
@@ -447,6 +629,7 @@ export async function runDeepResearch(req) {
|
|
|
447
629
|
throw err;
|
|
448
630
|
break;
|
|
449
631
|
}
|
|
632
|
+
lastGapResult = gapResult;
|
|
450
633
|
if (gapResult.hasEnoughInfo || gapResult.additionalQueries.length === 0) {
|
|
451
634
|
break;
|
|
452
635
|
}
|
|
@@ -459,6 +642,20 @@ export async function runDeepResearch(req) {
|
|
|
459
642
|
});
|
|
460
643
|
currentQueries = gapResult.additionalQueries;
|
|
461
644
|
}
|
|
645
|
+
// Verification summary (emitted before synthesis so streaming clients can show status)
|
|
646
|
+
const verifySummary = computeVerificationSummary(allSources, lastGapResult);
|
|
647
|
+
progress({
|
|
648
|
+
type: 'verification',
|
|
649
|
+
message: `Verification complete — confidence: ${verifySummary.confidence.toUpperCase()}`,
|
|
650
|
+
data: {
|
|
651
|
+
conflicts: verifySummary.conflicts,
|
|
652
|
+
confidence: verifySummary.confidence,
|
|
653
|
+
sourceDiversity: verifySummary.sourceDiversity,
|
|
654
|
+
officialCount: verifySummary.officialCount,
|
|
655
|
+
verifiedCount: verifySummary.verifiedCount,
|
|
656
|
+
generalCount: verifySummary.generalCount,
|
|
657
|
+
},
|
|
658
|
+
});
|
|
462
659
|
// Step 7: Synthesis
|
|
463
660
|
progress({ type: 'synthesizing', message: 'Synthesizing research report…' });
|
|
464
661
|
// Sort all sources by relevance for synthesis
|
|
package/dist/core/domain-extractors.js
CHANGED
|
@@ -728,7 +728,7 @@ ${commentsMd || '*No comments found.*'}`;
|
|
|
728
728
|
};
|
|
729
729
|
});
|
|
730
730
|
const subredditName = posts[0]?.url?.match(/\/r\/([^/]+)\//)?.[1] || path.match(/\/r\/([^/]+)/)?.[1] || '';
|
|
731
|
-
const structured = { subreddit: `r/${subredditName}`, posts };
|
|
731
|
+
const structured = { title: `r/${subredditName} — Top Posts`, subreddit: `r/${subredditName}`, posts };
|
|
732
732
|
const cleanContent = `## 📋 r/${subredditName} — Hot Posts
|
|
733
733
|
|
|
734
734
|
${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ↑ ${p.score} | 💬 ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n ${p.url}`).join('\n\n')}`;
|
|
@@ -756,7 +756,7 @@ ${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ↑ ${p.score}
|
|
|
756
756
|
flair: d.link_flair_text || null,
|
|
757
757
|
};
|
|
758
758
|
});
|
|
759
|
-
const structured = { sortType, posts, postCount: posts.length };
|
|
759
|
+
const structured = { title: `Reddit — ${sortType.charAt(0).toUpperCase() + sortType.slice(1)} Posts`, sortType, posts, postCount: posts.length };
|
|
760
760
|
const listMd = posts.map((p, i) => {
|
|
761
761
|
const flairTag = p.flair ? ` | ${p.flair}` : '';
|
|
762
762
|
return `${i + 1}. **${p.title}**\n ${p.author} in ${p.subreddit} | ↑ ${p.score} | 💬 ${p.commentCount}${flairTag}\n ${p.url}`;
|
|
@@ -918,6 +918,7 @@ ${commentsMd || '*No comments.*'}`;
|
|
|
918
918
|
catch { /* ignore */ }
|
|
919
919
|
}
|
|
920
920
|
const structured = {
|
|
921
|
+
title: `${owner}/${repo}`,
|
|
921
922
|
name: `${owner}/${repo}`,
|
|
922
923
|
description: repoData.description || '',
|
|
923
924
|
stars: repoData.stargazers_count ?? 0,
|
|
@@ -1039,7 +1040,7 @@ ${commentsMd || '*No comments found.*'}`;
|
|
|
1039
1040
|
url: s.url || `https://news.ycombinator.com/item?id=${s.id}`,
|
|
1040
1041
|
hnUrl: `https://news.ycombinator.com/item?id=${s.id}`,
|
|
1041
1042
|
}));
|
|
1042
|
-
const structured = { stories };
|
|
1043
|
+
const structured = { title: 'Hacker News — Front Page', stories };
|
|
1043
1044
|
const cleanContent = `## 🟠 Hacker News — Front Page
|
|
1044
1045
|
|
|
1045
1046
|
${stories.map((s, i) => `${i + 1}. **${s.title}**\n ↑ ${s.score} | 💬 ${s.commentCount} | by ${s.author}\n ${s.url}`).join('\n\n')}`;
|
|
@@ -1346,15 +1347,24 @@ async function arxivExtractor(_html, url) {
|
|
|
1346
1347
|
const match = xml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
|
|
1347
1348
|
return match ? stripHtml(match[1]).trim() : '';
|
|
1348
1349
|
};
|
|
1349
|
-
|
|
1350
|
-
|
|
1350
|
+
// getAllTags removed — unused
|
|
1351
|
+
// ArXiv Atom feed: <feed><title>query URL</title> ... <entry><title>Paper Title</title>...
|
|
1352
|
+
// We must grab the entry title, not the feed title.
|
|
1353
|
+
const entryMatch = xml.match(/<entry[\s\S]*?<\/entry>/);
|
|
1354
|
+
const entryXml = entryMatch ? entryMatch[0] : xml;
|
|
1355
|
+
const getEntryTag = (tag) => {
|
|
1356
|
+
const match = entryXml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
|
|
1357
|
+
return match ? stripHtml(match[1]).trim() : '';
|
|
1358
|
+
};
|
|
1359
|
+
const getAllEntryTags = (tag) => {
|
|
1360
|
+
const matches = [...entryXml.matchAll(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'g'))];
|
|
1351
1361
|
return matches.map(m => stripHtml(m[1]).trim()).filter(Boolean);
|
|
1352
1362
|
};
|
|
1353
|
-
const title = getTag('title');
|
|
1354
|
-
const summary = getTag('summary');
|
|
1355
|
-
const published = getTag('published');
|
|
1356
|
-
const updated = getTag('updated');
|
|
1357
|
-
const authors =
|
|
1363
|
+
const title = getEntryTag('title') || getTag('title');
|
|
1364
|
+
const summary = getEntryTag('summary') || getTag('summary');
|
|
1365
|
+
const published = getEntryTag('published') || getTag('published');
|
|
1366
|
+
const updated = getEntryTag('updated') || getTag('updated');
|
|
1367
|
+
const authors = getAllEntryTags('name');
|
|
1358
1368
|
// Extract categories
|
|
1359
1369
|
const categories = [...xml.matchAll(/category[^>]*term="([^"]+)"/g)].map(m => m[1]);
|
|
1360
1370
|
// Extract DOI and journal ref if available
|
|
package/dist/core/llm-provider.d.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* 4. Google Gemini (BYOK)
|
|
9
9
|
* 5. Ollama (local, OpenAI-compatible)
|
|
10
10
|
*/
|
|
11
|
-
export type DeepResearchLLMProvider = 'cloudflare' | 'openai' | 'anthropic' | 'google' | 'ollama';
|
|
11
|
+
export type DeepResearchLLMProvider = 'cloudflare' | 'openai' | 'anthropic' | 'google' | 'ollama' | 'cerebras';
|
|
12
12
|
export interface LLMConfig {
|
|
13
13
|
provider: DeepResearchLLMProvider;
|
|
14
14
|
apiKey?: string;
|
|
@@ -64,7 +64,10 @@ export declare function resetNeuronUsage(): void;
|
|
|
64
64
|
export declare function callLLM(config: LLMConfig, options: LLMCallOptions): Promise<LLMCallResult>;
|
|
65
65
|
/**
|
|
66
66
|
* Get the default LLM config based on available environment variables.
|
|
67
|
-
*
|
|
67
|
+
*
|
|
68
|
+
* Priority order: Anthropic → OpenAI → Google → Cerebras → Cloudflare (free tier fallback).
|
|
69
|
+
* If no BYOK key and no Cloudflare credentials are configured, returns a cloudflare config
|
|
70
|
+
* that will throw a clear error when callLLM is invoked (CLOUDFLARE_ACCOUNT_ID missing).
|
|
68
71
|
*/
|
|
69
72
|
export declare function getDefaultLLMConfig(): LLMConfig;
|
|
70
73
|
/** Type guard: check if a thrown value is a FreeTierLimitError */
|