npm - webpeel - Versions diffs - 0.21.3 → 0.21.5 - Mend

webpeel 0.21.3 → 0.21.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/cli/commands/fetch.js +48 -4
package/dist/cli/utils.js +4 -1
package/dist/cli.js +5 -0
package/dist/core/deep-research.d.ts +53 -1
package/dist/core/deep-research.js +219 -22
package/dist/core/domain-extractors.js +20 -10
package/dist/core/llm-provider.d.ts +5 -2
package/dist/core/llm-provider.js +80 -2
package/dist/core/source-scoring.d.ts +166 -0
package/dist/core/source-scoring.js +396 -0
package/dist/core/structured-extract.d.ts +43 -0
package/dist/core/structured-extract.js +276 -0
package/dist/server/app.js +2 -0
package/dist/server/routes/ask.js +61 -26
package/dist/server/routes/deep-research.js +1 -0
package/dist/server/routes/extract.d.ts +9 -3
package/dist/server/routes/extract.js +159 -81
package/package.json +1 -1

package/dist/cli/commands/fetch.js CHANGED Viewed

@@ -44,6 +44,10 @@ async function runStdin(options) {
 // ─── runFetch ─────────────────────────────────────────────────────────────────
 // Main fetch handler — shared with the `pipe` and `ask` subcommands
 export async function runFetch(url, options) {
+    // --silent: suppress all log output (set env var before any logger fires)
+    if (options.silent && !process.env.WEBPEEL_LOG_LEVEL) {
+        process.env.WEBPEEL_LOG_LEVEL = 'silent';
+    }
     // --content-only: override all output flags — we just want raw content
     if (options.contentOnly) {
         options.silent = true;
@@ -452,12 +456,25 @@ export async function runFetch(url, options) {
             // Do NOT set extract here — peel runs normally, LLM extraction happens below.
         }
         else if (options.extract) {
-            // CSS-based extraction
+            // Smart extract: detect schema format vs CSS selectors
+            let extractJson;
             try {
-                extract = { selectors: JSON.parse(options.extract) };
+                extractJson = JSON.parse(options.extract);
             }
             catch {
-                throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')'), { _code: 'FETCH_FAILED' });
+                throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\' or \'{"company": "string"}\')'), { _code: 'FETCH_FAILED' });
+            }
+            // If all values are type names (string/boolean/number/array/object),
+            // treat as structured schema extraction (routed to extractStructured after fetch).
+            // Otherwise treat as CSS selector map.
+            const { isTypeSchema } = await import('../../core/structured-extract.js');
+            if (isTypeSchema(extractJson)) {
+                // Mark for post-fetch structured extraction (handled below)
+                options._structuredSchema = extractJson;
+            }
+            else {
+                // CSS-based extraction
+                extract = { selectors: extractJson };
             }
         }
         // Validate maxTokens
@@ -786,6 +803,32 @@ export async function runFetch(url, options) {
                 console.error(`⚠ ${warningMsg}`);
             }
         }
+        // --- Structured schema extraction (--extract with type schema or --extract-prompt) ---
+        if (options._structuredSchema || options.extractPrompt) {
+            const { extractStructured, simpleToExtractionSchema } = await import('../../core/structured-extract.js');
+            const rawSchema = options._structuredSchema;
+            const schema = rawSchema
+                ? simpleToExtractionSchema(rawSchema)
+                : { type: 'object', properties: { result: { type: 'string', description: options.extractPrompt } } };
+            const strResult = await extractStructured(result.content, schema, undefined, // No LLM config — use heuristic (no key needed)
+            options.extractPrompt);
+            if (isJson) {
+                await writeStdout(JSON.stringify({
+                    success: true,
+                    data: strResult.data,
+                    confidence: strResult.confidence,
+                    method: 'heuristic',
+                }, null, 2) + '\n');
+            }
+            else {
+                await writeStdout(JSON.stringify(strResult.data, null, 2) + '\n');
+                if (!options.silent) {
+                    console.error(`\n📊 Structured extraction: confidence=${(strResult.confidence * 100).toFixed(0)}% (heuristic)`);
+                }
+            }
+            await cleanup();
+            process.exit(0);
+        }
         // --- LLM-based extraction (post-peel) ---
         if (options.llmExtract || options.extractSchema) {
             const { extractWithLLM } = await import('../../core/llm-extract.js');
@@ -1091,7 +1134,8 @@ export function registerFetchCommands(program) {
         .option('--full', 'Alias for --raw — full page content, no budget')
         .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
         .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
-        .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
+        .option('--extract <json>', 'Extract structured data using CSS selectors or type schema (e.g., \'{"title": "h1"}\' for CSS, \'{"name": "string"}\' for schema)')
+        .option('--extract-prompt <prompt>', 'Natural language prompt for structured extraction (no LLM key needed — uses heuristics)')
         .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
         .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
         .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')

package/dist/cli/utils.js CHANGED Viewed

@@ -33,7 +33,10 @@ export async function checkForUpdates() {
         const data = await res.json();
         const latest = data.version;
         if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
-            console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
+            // Skip update notice in silent mode
+            if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
+                console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
+            }
         }
     }
     catch { /* silently ignore — don't slow down the user */ }

package/dist/cli.js CHANGED Viewed

@@ -22,6 +22,11 @@ import { registerInteractCommands } from './cli/commands/interact.js';
 import { registerAuthCommands } from './cli/commands/auth.js';
 import { registerScreenshotCommands } from './cli/commands/screenshot.js';
 import { registerJobsCommands } from './cli/commands/jobs.js';
+// ── Early silent/log-level detection (must happen before any async module code) ──
+// Set WEBPEEL_LOG_LEVEL early so logger checks see it when async IIFEs fire.
+if (!process.env.WEBPEEL_LOG_LEVEL && process.argv.includes('--silent')) {
+    process.env.WEBPEEL_LOG_LEVEL = 'silent';
+}
 // ── Verb alias intercept (before Commander parses) ────────────────────────────
 // "webpeel fetch <url>" → "webpeel <url>"
 // Note: 'read' is intentionally excluded — it's a registered subcommand.

package/dist/core/deep-research.d.ts CHANGED Viewed

@@ -12,8 +12,9 @@
  *   6. Re-Search Loop        — Generate new queries if gaps found (max N rounds)
  *   7. Synthesis             — LLM generates final cited report
  */
+import { type WebSearchResult } from './search-provider.js';
 import { type LLMConfig } from './llm-provider.js';
-export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'synthesizing' | 'done' | 'error';
+export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'verification' | 'synthesizing' | 'done' | 'error';
 export interface DeepResearchProgressEvent {
     type: ProgressEventType;
     message: string;
@@ -54,6 +55,56 @@ export interface DeepResearchResponse {
     };
     elapsed: number;
 }
+/** Source credibility assessment */
+export interface SourceCredibility {
+    /** Credibility tier */
+    tier: 'official' | 'verified' | 'general';
+    /** Star rating (1–3) */
+    stars: number;
+    /** Human-readable label */
+    label: string;
+}
+interface FetchedSource {
+    result: WebSearchResult;
+    content: string;
+    relevanceScore: number;
+    subQuery: string;
+    /** Credibility assessment (populated after fetchSources) */
+    credibility?: SourceCredibility;
+}
+/**
+ * Assess the credibility of a source URL.
+ *
+ * Returns:
+ *   - tier: 'official' | 'verified' | 'general'
+ *   - stars: 3 / 2 / 1
+ *   - label: human-readable string for the synthesis prompt
+ */
+export declare function getSourceCredibility(url: string): SourceCredibility;
+/** Render a numeric star rating (as in SourceCredibility.stars) as a string of filled and empty stars */
+export declare function starsString(stars: number): string;
+interface GapDetectionResult {
+    hasEnoughInfo: boolean;
+    gaps: string[];
+    additionalQueries: string[];
+    /** Detected source conflicts (optional, from LLM analysis) */
+    conflicts?: string[];
+    /** Overall confidence level based on source quality */
+    confidence?: 'high' | 'medium' | 'low';
+}
+interface VerificationSummary {
+    conflicts: string[];
+    confidence: 'high' | 'medium' | 'low';
+    sourceDiversity: boolean;
+    officialCount: number;
+    verifiedCount: number;
+    generalCount: number;
+}
+/**
+ * Compute a verification summary from fetched sources and optional gap detection result.
+ * Used to emit the 'verification' progress event before synthesis.
+ */
+export declare function computeVerificationSummary(sources: FetchedSource[], gapResult?: GapDetectionResult): VerificationSummary;
 /**
  * Run a deep research session.
  *
@@ -61,3 +112,4 @@ export interface DeepResearchResponse {
  * relevance scoring → gap detection → re-search loop → synthesis.
  */
 export declare function runDeepResearch(req: DeepResearchRequest): Promise<DeepResearchResponse>;
+export {};

package/dist/core/deep-research.js CHANGED Viewed

@@ -39,6 +39,85 @@ function normalizeUrl(url) {
         return url.toLowerCase().replace(/^https?:\/\/(www\.)?/, '').replace(/\/$/, '');
     }
 }
+/**
+ * Extract the bare hostname from a URL for domain comparisons.
+ *
+ * Parses the input with `new URL`, lowercases the hostname and drops a leading `www.`.
+ * If parsing throws, falls back to string handling: lowercase the input, strip a
+ * leading `http(s)://` and `www.`, and keep the text before the first `/`.
+ * The fallback does not remove a port, query string or fragment, and yields an
+ * empty string only when nothing is left before the first `/`.
+ */
+function extractDomain(url) {
+    try {
+        return new URL(url).hostname.toLowerCase().replace(/^www\./, '');
+    }
+    catch {
+        return url.toLowerCase().replace(/^https?:\/\/(www\.)?/, '').split('/')[0] ?? '';
+    }
+}
+// ---------------------------------------------------------------------------
+// Source Credibility
+// ---------------------------------------------------------------------------
+/**
+ * Lookup tables used by getSourceCredibility. Hostnames are listed without a leading `www.`.
+ *
+ * OFFICIAL_TLDS and OFFICIAL_HOSTNAMES mark high-authority sources (tier 'official');
+ * VERIFIED_HOSTNAMES marks reputable reference, news, developer and vendor sites (tier 'verified').
+ */
+const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
+const OFFICIAL_HOSTNAMES = new Set([
+    // Academic / research
+    'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
+    'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
+    'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
+    // International organisations
+    'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
+    // Official tech documentation
+    'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
+    'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
+    'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
+]);
+const VERIFIED_HOSTNAMES = new Set([
+    // Encyclopaedia / reference
+    'wikipedia.org', 'en.wikipedia.org',
+    // Reputable news agencies
+    'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
+    'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
+    // Developer resources
+    'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
+    'crates.io', 'docs.rs', 'packagist.org',
+    // Official cloud / vendor docs
+    'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
+    'azure.microsoft.com', 'registry.terraform.io',
+]);
+/**
+ * Assess the credibility of a source URL.
+ *
+ * Returns:
+ *   - tier: 'official' | 'verified' | 'general'
+ *   - stars: 3 / 2 / 1
+ *   - label: human-readable string for the synthesis prompt
+ */
+export function getSourceCredibility(url) {
+    try {
+        const hostname = new URL(url).hostname.toLowerCase().replace(/^www\./, '');
+        // Check official TLDs
+        for (const tld of OFFICIAL_TLDS) {
+            if (hostname.endsWith(tld)) {
+                return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
+            }
+        }
+        // Check known official hostnames
+        if (OFFICIAL_HOSTNAMES.has(hostname)) {
+            return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
+        }
+        // Check known verified hostnames
+        if (VERIFIED_HOSTNAMES.has(hostname)) {
+            return { tier: 'verified', stars: 2, label: 'VERIFIED' };
+        }
+        // Everything else
+        return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
+    }
+    catch {
+        return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
+    }
+}
+/**
+ * Render a numeric star rating as a three-character star string.
+ *
+ * @param stars - numeric rating; 3 or more gives '★★★', at least 2 (but below 3)
+ *   gives '★★☆', and any other value gives '★☆☆'.
+ * @returns the string of filled and empty stars
+ */
+export function starsString(stars) {
+    if (stars >= 3)
+        return '★★★';
+    if (stars >= 2)
+        return '★★☆';
+    return '★☆☆';
+}
 // ---------------------------------------------------------------------------
 // LLM call with merged token tracking
 // ---------------------------------------------------------------------------
@@ -182,9 +261,11 @@ async function fetchSources(searchResults, maxSources, signal) {
         }));
         for (const outcome of settled) {
             if (outcome.status === 'fulfilled') {
+                const src = outcome.value;
                 fetched.push({
-                    ...outcome.value,
+                    ...src,
                     relevanceScore: 0, // filled in step 4
+                    credibility: getSourceCredibility(src.result.url),
                 });
             }
         }
@@ -224,7 +305,43 @@ function scoreSources(sources, question) {
     });
 }
 async function detectGaps(question, sources, config, tokens, signal) {
-    // Build summary of what we have
+    // ── Heuristic pre-checks (no LLM call needed) ──────────────────────────
+    if (sources.length >= 3) {
+        // Heuristic 1: All sources from the same domain → need diversity
+        const domains = sources.map((s) => extractDomain(s.result.url));
+        const uniqueDomains = new Set(domains.filter((d) => d.length > 0));
+        if (uniqueDomains.size === 1) {
+            const soloDomain = [...uniqueDomains][0];
+            return {
+                hasEnoughInfo: false,
+                gaps: [
+                    `All ${sources.length} sources are from the same domain (${soloDomain}). Diverse sources needed for reliable research.`,
+                ],
+                additionalQueries: [
+                    `${question} alternative perspectives`,
+                    `${question} overview explanation`,
+                ],
+                conflicts: [],
+                confidence: 'low',
+            };
+        }
+        // Heuristic 2: Question implies need for official docs but no official sources found
+        const hasOfficialSource = sources.some((s) => (s.credibility || getSourceCredibility(s.result.url)).tier === 'official');
+        const questionWantsOfficial = /\b(official|documentation|docs|policy|government|authority|academic|standards?|specification|rfc)\b/i.test(question);
+        if (!hasOfficialSource && questionWantsOfficial) {
+            return {
+                hasEnoughInfo: false,
+                gaps: ['No official or academic sources found. The question requires authoritative documentation.'],
+                additionalQueries: [
+                    `${question} site:.gov OR site:.edu`,
+                    `${question} official documentation`,
+                ],
+                conflicts: [],
+                confidence: 'low',
+            };
+        }
+    }
+    // ── LLM-based gap + conflict detection ─────────────────────────────────
     const topSources = sources
         .sort((a, b) => b.relevanceScore - a.relevanceScore)
         .slice(0, 8);
@@ -240,40 +357,44 @@ async function detectGaps(question, sources, config, tokens, signal) {
             content: [
                 'You are a research quality assessor. Given a question and the sources collected so far,',
                 'determine if there is sufficient information to write a comprehensive answer.',
+                'Also detect any factual conflicts between sources.',
                 '',
                 'Respond in this EXACT JSON format (no markdown, no code blocks):',
                 '{',
                 '  "hasEnoughInfo": boolean,',
                 '  "gaps": ["gap1", "gap2"],',
-                '  "additionalQueries": ["query1", "query2"]',
+                '  "additionalQueries": ["query1", "query2"],',
+                '  "conflicts": ["Source A says X while Source B says Y"],',
+                '  "confidence": "high" | "medium" | "low"',
                 '}',
                 '',
                 '"gaps" should be 0-3 specific aspects not covered by the sources.',
                 '"additionalQueries" should be 0-3 new search queries to fill those gaps.',
+                '"conflicts" should be 0-3 factual disagreements found between sources.',
+                '"confidence": high = consistent official sources, medium = mixed, low = conflicting or poor sources.',
                 'If hasEnoughInfo is true, set gaps and additionalQueries to empty arrays.',
             ].join('\n'),
         },
         {
             role: 'user',
-            content: `Question: "${question}"\n\nSources collected:\n\n${contextSummary}\n\nAnalyze coverage and gaps:`,
+            content: `Question: "${question}"\n\nSources collected:\n\n${contextSummary}\n\nAnalyze coverage, gaps, and conflicts:`,
         },
     ];
     let text;
     try {
         text = await callWithTracking(config, messages, tokens, {
             signal,
-            maxTokens: 600,
+            maxTokens: 700,
         });
     }
     catch (err) {
         if (isFreeTierLimitError(err))
             throw err;
         // On LLM failure, assume we have enough info
-        return { hasEnoughInfo: true, gaps: [], additionalQueries: [] };
+        return { hasEnoughInfo: true, gaps: [], additionalQueries: [], conflicts: [], confidence: 'medium' };
     }
     // Parse JSON response
     try {
-        // Strip markdown code fences if present
         const cleaned = text
             .replace(/```json\s*/gi, '')
             .replace(/```\s*/g, '')
@@ -285,28 +406,80 @@ async function detectGaps(question, sources, config, tokens, signal) {
             additionalQueries: Array.isArray(json.additionalQueries)
                 ? json.additionalQueries.slice(0, 3)
                 : [],
+            conflicts: Array.isArray(json.conflicts) ? json.conflicts.slice(0, 3) : [],
+            confidence: ['high', 'medium', 'low'].includes(String(json.confidence))
+                ? json.confidence
+                : 'medium',
         };
     }
     catch {
-        // Couldn't parse JSON — assume enough info
-        return { hasEnoughInfo: true, gaps: [], additionalQueries: [] };
+        return { hasEnoughInfo: true, gaps: [], additionalQueries: [], conflicts: [], confidence: 'medium' };
+    }
+}
+/**
+ * Compute a verification summary from fetched sources and optional gap detection result.
+ * Used to emit the 'verification' progress event before synthesis.
+ *
+ * - Tier counts use each source's stored `credibility`, falling back to
+ *   `getSourceCredibility(url)` when it is missing.
+ * - `sourceDiversity` is true when the number of distinct non-empty domains is
+ *   at least min(3, number of sources); with no sources it is false.
+ * - `confidence` is `gapResult.confidence` when that is set. Otherwise it is
+ *   'high' with 2+ official sources or when official + verified sources make up
+ *   at least half of all sources, 'medium' with 1+ verified source or a ratio of
+ *   at least a quarter, and 'low' in all other cases.
+ * - `conflicts` is taken from `gapResult`, defaulting to an empty array.
+ *
+ * @param sources - fetched sources to summarise
+ * @param gapResult - gap detection result, if one is available
+ * @returns conflicts, confidence, source diversity and the count of sources per tier
+ */
+export function computeVerificationSummary(sources, gapResult) {
+    const credibilities = sources.map((s) => s.credibility || getSourceCredibility(s.result.url));
+    const officialCount = credibilities.filter((c) => c.tier === 'official').length;
+    const verifiedCount = credibilities.filter((c) => c.tier === 'verified').length;
+    const generalCount = credibilities.filter((c) => c.tier === 'general').length;
+    const total = sources.length || 1; // use 1 for an empty list so the ratio below never divides by zero
+    // Source diversity: at least 3 unique domains (or every source on its own domain if < 3 sources)
+    const domains = new Set(sources.map((s) => extractDomain(s.result.url)).filter((d) => d.length > 0));
+    const sourceDiversity = domains.size >= Math.min(3, total);
+    // Confidence: prefer the gap-detection verdict; otherwise derive it from source quality
+    let confidence;
+    if (gapResult?.confidence) {
+        confidence = gapResult.confidence;
+    }
+    else {
+        const highQualityRatio = (officialCount + verifiedCount) / total;
+        if (officialCount >= 2 || highQualityRatio >= 0.5) {
+            confidence = 'high';
+        }
+        else if (verifiedCount >= 1 || highQualityRatio >= 0.25) {
+            confidence = 'medium';
+        }
+        else {
+            confidence = 'low';
+        }
     }
+    const conflicts = gapResult?.conflicts ?? [];
+    return { conflicts, confidence, sourceDiversity, officialCount, verifiedCount, generalCount };
 }
 // ---------------------------------------------------------------------------
 // Step 7: Synthesis
 // ---------------------------------------------------------------------------
 async function synthesizeReport(question, sources, config, tokens, opts) {
-    // Sort by relevance, take best sources (max 15 for context)
+    // Sort by credibility tier first (official > verified > general), then by relevance
+    const tierOrder = { official: 0, verified: 1, general: 2 };
     const topSources = sources
-        .sort((a, b) => b.relevanceScore - a.relevanceScore)
+        .map((s) => ({ ...s, credibility: s.credibility || getSourceCredibility(s.result.url) }))
+        .sort((a, b) => {
+        const tierDiff = (tierOrder[a.credibility.tier] ?? 2) - (tierOrder[b.credibility.tier] ?? 2);
+        if (tierDiff !== 0)
+            return tierDiff;
+        return b.relevanceScore - a.relevanceScore;
+    })
         .slice(0, 15);
-    // Build context
+    // Build context with credibility labels
     const contextParts = [];
     const citations = [];
     topSources.forEach((source, i) => {
         const idx = i + 1;
+        const cred = source.credibility;
+        const stars = starsString(cred.stars);
         const sanitized = sanitizeForLLM(truncate(source.content || source.result.snippet || '', 3000));
-        contextParts.push(`SOURCE [${idx}]\nTitle: ${source.result.title}\nURL: ${source.result.url}\n\n${sanitized.content}`);
+        contextParts.push([
+            `SOURCE [${idx}] ${stars}`,
+            `Title: ${source.result.title}`,
+            `URL: ${source.result.url}`,
+            `Credibility: ${cred.label}`,
+            '',
+            sanitized.content,
+        ].join('\n'));
         citations.push({
             index: idx,
             title: source.result.title,
@@ -321,19 +494,27 @@ async function synthesizeReport(question, sources, config, tokens, opts) {
             role: 'system',
             content: [
                 'You are a research analyst that writes comprehensive, well-cited reports.',
-                'Use ONLY the provided sources to answer the question.',
-                'Cite sources using bracketed numbers like [1], [2], [3].',
-                'Structure your report with:',
-                '  - A brief executive summary',
-                '  - Key findings (with citations)',
-                '  - Detailed analysis',
-                '  - Conclusion',
-                'Do not fabricate URLs or citations. Do not include information not found in the sources.',
+                'Each source is rated by credibility:',
+                '  ★★★ = OFFICIAL SOURCE (government, academic, official docs) — highest authority',
+                '  ★★☆ = VERIFIED (reputable news, Wikipedia, major developer platforms)',
+                '  ★☆☆ = UNVERIFIED (blogs, forums, unknown sites) — use with caution',
+                '',
+                'Rules:',
+                '  - Prioritize official sources [★★★] over unverified ones [★☆☆]',
+                '  - If sources disagree, note the conflict and trust the higher-credibility source',
+                '  - Cite every factual claim with [1], [2], etc.',
+                '  - Use ONLY the provided sources — do not fabricate information or citations',
+                '  - Structure your report with:',
+                '      • Executive Summary',
+                '      • Key Findings (with citations)',
+                '      • Detailed Analysis',
+                '      • Conclusion',
+                '  - End with: **Confidence: HIGH/MEDIUM/LOW** based on source quality and agreement',
             ].join('\n'),
         },
         {
             role: 'user',
-            content: `Research question: "${question}"\n\nSources:\n\n${context}\n\nWrite a comprehensive research report with citations:`,
+            content: `Research question: "${question}"\n\nSources (ranked by credibility):\n\n${context}\n\nWrite a comprehensive research report with citations:`,
         },
     ];
     const report = await callWithTracking(config, messages, tokens, {
@@ -374,6 +555,7 @@ export async function runDeepResearch(req) {
     const allSources = [];
     const seenUrls = new Set();
     let usedQueries = new Set();
+    let lastGapResult;
     // ── Round 0..maxRounds ────────────────────────────────────────────────────
     let currentQueries = [];
     for (let round = 0; round < maxRounds; round++) {
@@ -447,6 +629,7 @@ export async function runDeepResearch(req) {
                 throw err;
             break;
         }
+        lastGapResult = gapResult;
         if (gapResult.hasEnoughInfo || gapResult.additionalQueries.length === 0) {
             break;
         }
@@ -459,6 +642,20 @@ export async function runDeepResearch(req) {
         });
         currentQueries = gapResult.additionalQueries;
     }
+    // Verification summary (emitted before synthesis so streaming clients can show status)
+    const verifySummary = computeVerificationSummary(allSources, lastGapResult);
+    progress({
+        type: 'verification',
+        message: `Verification complete — confidence: ${verifySummary.confidence.toUpperCase()}`,
+        data: {
+            conflicts: verifySummary.conflicts,
+            confidence: verifySummary.confidence,
+            sourceDiversity: verifySummary.sourceDiversity,
+            officialCount: verifySummary.officialCount,
+            verifiedCount: verifySummary.verifiedCount,
+            generalCount: verifySummary.generalCount,
+        },
+    });
     // Step 7: Synthesis
     progress({ type: 'synthesizing', message: 'Synthesizing research report…' });
     // Sort all sources by relevance for synthesis

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -728,7 +728,7 @@ ${commentsMd || '*No comments found.*'}`;
             };
         });
         const subredditName = posts[0]?.url?.match(/\/r\/([^/]+)\//)?.[1] || path.match(/\/r\/([^/]+)/)?.[1] || '';
-        const structured = { subreddit: `r/${subredditName}`, posts };
+        const structured = { title: `r/${subredditName} — Top Posts`, subreddit: `r/${subredditName}`, posts };
         const cleanContent = `## 📋 r/${subredditName} — Hot Posts
 ${posts.map((p, i) => `${i + 1}. **${p.title}**\n   ${p.author} | ↑ ${p.score} | 💬 ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n   ${p.url}`).join('\n\n')}`;
@@ -756,7 +756,7 @@ ${posts.map((p, i) => `${i + 1}. **${p.title}**\n   ${p.author} | ↑ ${p.score}
                 flair: d.link_flair_text || null,
             };
         });
-        const structured = { sortType, posts, postCount: posts.length };
+        const structured = { title: `Reddit — ${sortType.charAt(0).toUpperCase() + sortType.slice(1)} Posts`, sortType, posts, postCount: posts.length };
         const listMd = posts.map((p, i) => {
             const flairTag = p.flair ? ` | ${p.flair}` : '';
             return `${i + 1}. **${p.title}**\n   ${p.author} in ${p.subreddit} | ↑ ${p.score} | 💬 ${p.commentCount}${flairTag}\n   ${p.url}`;
@@ -918,6 +918,7 @@ ${commentsMd || '*No comments.*'}`;
             catch { /* ignore */ }
         }
         const structured = {
+            title: `${owner}/${repo}`,
             name: `${owner}/${repo}`,
             description: repoData.description || '',
             stars: repoData.stargazers_count ?? 0,
@@ -1039,7 +1040,7 @@ ${commentsMd || '*No comments found.*'}`;
             url: s.url || `https://news.ycombinator.com/item?id=${s.id}`,
             hnUrl: `https://news.ycombinator.com/item?id=${s.id}`,
         }));
-        const structured = { stories };
+        const structured = { title: 'Hacker News — Front Page', stories };
         const cleanContent = `## 🟠 Hacker News — Front Page
 ${stories.map((s, i) => `${i + 1}. **${s.title}**\n   ↑ ${s.score} | 💬 ${s.commentCount} | by ${s.author}\n   ${s.url}`).join('\n\n')}`;
@@ -1346,15 +1347,24 @@ async function arxivExtractor(_html, url) {
             const match = xml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
             return match ? stripHtml(match[1]).trim() : '';
         };
-        const getAllTags = (tag) => {
-            const matches = [...xml.matchAll(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'g'))];
+        // getAllTags removed — unused
+        // ArXiv Atom feed: <feed><title>query URL</title> ... <entry><title>Paper Title</title>...
+        // We must grab the entry title, not the feed title.
+        const entryMatch = xml.match(/<entry[\s\S]*?<\/entry>/);
+        const entryXml = entryMatch ? entryMatch[0] : xml;
+        const getEntryTag = (tag) => {
+            const match = entryXml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
+            return match ? stripHtml(match[1]).trim() : '';
+        };
+        const getAllEntryTags = (tag) => {
+            const matches = [...entryXml.matchAll(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'g'))];
             return matches.map(m => stripHtml(m[1]).trim()).filter(Boolean);
         };
-        const title = getTag('title');
-        const summary = getTag('summary');
-        const published = getTag('published');
-        const updated = getTag('updated');
-        const authors = getAllTags('name');
+        const title = getEntryTag('title') || getTag('title');
+        const summary = getEntryTag('summary') || getTag('summary');
+        const published = getEntryTag('published') || getTag('published');
+        const updated = getEntryTag('updated') || getTag('updated');
+        const authors = getAllEntryTags('name');
         // Extract categories
         const categories = [...xml.matchAll(/category[^>]*term="([^"]+)"/g)].map(m => m[1]);
         // Extract DOI and journal ref if available

package/dist/core/llm-provider.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@
  *   4. Google Gemini (BYOK)
  *   5. Ollama (local, OpenAI-compatible)
  */
-export type DeepResearchLLMProvider = 'cloudflare' | 'openai' | 'anthropic' | 'google' | 'ollama';
+export type DeepResearchLLMProvider = 'cloudflare' | 'openai' | 'anthropic' | 'google' | 'ollama' | 'cerebras';
 export interface LLMConfig {
     provider: DeepResearchLLMProvider;
     apiKey?: string;
@@ -64,7 +64,10 @@ export declare function resetNeuronUsage(): void;
 export declare function callLLM(config: LLMConfig, options: LLMCallOptions): Promise<LLMCallResult>;
 /**
  * Get the default LLM config based on available environment variables.
- * Falls back to Cloudflare if nothing else is configured.
+ *
+ * Priority order: Anthropic → OpenAI → Google → Cerebras → Cloudflare (free tier fallback).
+ * If no BYOK key and no Cloudflare credentials are configured, returns a cloudflare config
+ * that will throw a clear error when callLLM is invoked (CLOUDFLARE_ACCOUNT_ID missing).
  */
 export declare function getDefaultLLMConfig(): LLMConfig;
 /** Type guard: check if a thrown value is a FreeTierLimitError */