webpeel 0.21.3 → 0.21.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,6 +44,10 @@ async function runStdin(options) {
44
44
  // ─── runFetch ─────────────────────────────────────────────────────────────────
45
45
  // Main fetch handler — shared with the `pipe` and `ask` subcommands
46
46
  export async function runFetch(url, options) {
47
+ // --silent: suppress all log output (set env var before any logger fires)
48
+ if (options.silent && !process.env.WEBPEEL_LOG_LEVEL) {
49
+ process.env.WEBPEEL_LOG_LEVEL = 'silent';
50
+ }
47
51
  // --content-only: override all output flags — we just want raw content
48
52
  if (options.contentOnly) {
49
53
  options.silent = true;
@@ -452,12 +456,25 @@ export async function runFetch(url, options) {
452
456
  // Do NOT set extract here — peel runs normally, LLM extraction happens below.
453
457
  }
454
458
  else if (options.extract) {
455
- // CSS-based extraction
459
+ // Smart extract: detect schema format vs CSS selectors
460
+ let extractJson;
456
461
  try {
457
- extract = { selectors: JSON.parse(options.extract) };
462
+ extractJson = JSON.parse(options.extract);
458
463
  }
459
464
  catch {
460
- throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')'), { _code: 'FETCH_FAILED' });
465
+ throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\' or \'{"company": "string"}\')'), { _code: 'FETCH_FAILED' });
466
+ }
467
+ // If all values are type names (string/boolean/number/array/object),
468
+ // treat as structured schema extraction (routed to extractStructured after fetch).
469
+ // Otherwise treat as CSS selector map.
470
+ const { isTypeSchema } = await import('../../core/structured-extract.js');
471
+ if (isTypeSchema(extractJson)) {
472
+ // Mark for post-fetch structured extraction (handled below)
473
+ options._structuredSchema = extractJson;
474
+ }
475
+ else {
476
+ // CSS-based extraction
477
+ extract = { selectors: extractJson };
461
478
  }
462
479
  }
463
480
  // Validate maxTokens
@@ -786,6 +803,32 @@ export async function runFetch(url, options) {
786
803
  console.error(`⚠ ${warningMsg}`);
787
804
  }
788
805
  }
806
+ // --- Structured schema extraction (--extract with type schema or --extract-prompt) ---
807
+ if (options._structuredSchema || options.extractPrompt) {
808
+ const { extractStructured, simpleToExtractionSchema } = await import('../../core/structured-extract.js');
809
+ const rawSchema = options._structuredSchema;
810
+ const schema = rawSchema
811
+ ? simpleToExtractionSchema(rawSchema)
812
+ : { type: 'object', properties: { result: { type: 'string', description: options.extractPrompt } } };
813
+ const strResult = await extractStructured(result.content, schema, undefined, // No LLM config — use heuristic (no key needed)
814
+ options.extractPrompt);
815
+ if (isJson) {
816
+ await writeStdout(JSON.stringify({
817
+ success: true,
818
+ data: strResult.data,
819
+ confidence: strResult.confidence,
820
+ method: 'heuristic',
821
+ }, null, 2) + '\n');
822
+ }
823
+ else {
824
+ await writeStdout(JSON.stringify(strResult.data, null, 2) + '\n');
825
+ if (!options.silent) {
826
+ console.error(`\n📊 Structured extraction: confidence=${(strResult.confidence * 100).toFixed(0)}% (heuristic)`);
827
+ }
828
+ }
829
+ await cleanup();
830
+ process.exit(0);
831
+ }
789
832
  // --- LLM-based extraction (post-peel) ---
790
833
  if (options.llmExtract || options.extractSchema) {
791
834
  const { extractWithLLM } = await import('../../core/llm-extract.js');
@@ -1091,7 +1134,8 @@ export function registerFetchCommands(program) {
1091
1134
  .option('--full', 'Alias for --raw — full page content, no budget')
1092
1135
  .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
1093
1136
  .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
1094
- .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
1137
+ .option('--extract <json>', 'Extract structured data using CSS selectors or type schema (e.g., \'{"title": "h1"}\' for CSS, \'{"name": "string"}\' for schema)')
1138
+ .option('--extract-prompt <prompt>', 'Natural language prompt for structured extraction (no LLM key needed — uses heuristics)')
1095
1139
  .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
1096
1140
  .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
1097
1141
  .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
package/dist/cli/utils.js CHANGED
@@ -33,7 +33,10 @@ export async function checkForUpdates() {
33
33
  const data = await res.json();
34
34
  const latest = data.version;
35
35
  if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
36
- console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
36
+ // Skip update notice in silent mode
37
+ if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
38
+ console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
39
+ }
37
40
  }
38
41
  }
39
42
  catch { /* silently ignore — don't slow down the user */ }
package/dist/cli.js CHANGED
@@ -22,6 +22,11 @@ import { registerInteractCommands } from './cli/commands/interact.js';
22
22
  import { registerAuthCommands } from './cli/commands/auth.js';
23
23
  import { registerScreenshotCommands } from './cli/commands/screenshot.js';
24
24
  import { registerJobsCommands } from './cli/commands/jobs.js';
25
+ // ── Early silent/log-level detection (must happen before any async module code) ──
26
+ // Set WEBPEEL_LOG_LEVEL early so logger checks see it when async IIFEs fire.
27
+ if (!process.env.WEBPEEL_LOG_LEVEL && process.argv.includes('--silent')) {
28
+ process.env.WEBPEEL_LOG_LEVEL = 'silent';
29
+ }
25
30
  // ── Verb alias intercept (before Commander parses) ────────────────────────────
26
31
  // "webpeel fetch <url>" → "webpeel <url>"
27
32
  // Note: 'read' is intentionally excluded — it's a registered subcommand.
@@ -12,8 +12,9 @@
12
12
  * 6. Re-Search Loop — Generate new queries if gaps found (max N rounds)
13
13
  * 7. Synthesis — LLM generates final cited report
14
14
  */
15
+ import { type WebSearchResult } from './search-provider.js';
15
16
  import { type LLMConfig } from './llm-provider.js';
16
- export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'synthesizing' | 'done' | 'error';
17
+ export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'verification' | 'synthesizing' | 'done' | 'error';
17
18
  export interface DeepResearchProgressEvent {
18
19
  type: ProgressEventType;
19
20
  message: string;
@@ -54,6 +55,56 @@ export interface DeepResearchResponse {
54
55
  };
55
56
  elapsed: number;
56
57
  }
58
/** Authority bucket a source URL is classified into. */
type CredibilityTier = 'official' | 'verified' | 'general';
/** Qualitative confidence level attached to gap detection and verification results. */
type ConfidenceLevel = 'high' | 'medium' | 'low';
/** Credibility assessment of a single source. */
export interface SourceCredibility {
    /** Credibility tier */
    tier: CredibilityTier;
    /** Star rating (1–3) */
    stars: number;
    /** Human-readable label */
    label: string;
}
/** A search result together with its fetched content and per-source scores. */
interface FetchedSource {
    /** The search hit; its `url`, `title` and `snippet` are read by the research pipeline. */
    result: WebSearchResult;
    /** Fetched page content; synthesis falls back to `result.snippet` when this is empty. */
    content: string;
    /** Relevance score; initialised to 0 when the source is fetched and filled in by the scoring step. */
    relevanceScore: number;
    /** NOTE(review): presumably the sub-query that produced this result — confirm against the fetch step. */
    subQuery: string;
    /** Credibility assessment (populated after fetchSources) */
    credibility?: SourceCredibility;
}
/**
 * Assess the credibility of a source URL.
 *
 * The result carries:
 *   - tier:  'official' | 'verified' | 'general'
 *   - stars: 3 / 2 / 1
 *   - label: human-readable string for the synthesis prompt
 */
export declare function getSourceCredibility(url: string): SourceCredibility;
/** Render a star rating as a three-glyph string (e.g. 2 → "★★☆"). */
export declare function starsString(stars: number): string;
/** Outcome of a gap-detection pass over the sources collected so far. */
interface GapDetectionResult {
    /** True when the collected sources are judged sufficient to answer the question. */
    hasEnoughInfo: boolean;
    /** Specific aspects of the question not covered by the sources. */
    gaps: string[];
    /** Follow-up search queries intended to fill the gaps. */
    additionalQueries: string[];
    /** Detected source conflicts (optional, from LLM analysis) */
    conflicts?: string[];
    /** Overall confidence level based on source quality */
    confidence?: ConfidenceLevel;
}
/** Aggregate source-quality figures reported in the 'verification' progress event. */
interface VerificationSummary {
    /** Conflicts carried over from gap detection; empty when none were reported. */
    conflicts: string[];
    /** Gap-detection confidence when available, otherwise derived from the tier counts. */
    confidence: ConfidenceLevel;
    /** Whether the sources span enough distinct domains (three, or fewer when there are fewer sources). */
    sourceDiversity: boolean;
    /** Number of sources in the 'official' tier. */
    officialCount: number;
    /** Number of sources in the 'verified' tier. */
    verifiedCount: number;
    /** Number of sources in the 'general' tier. */
    generalCount: number;
}
/**
 * Compute a verification summary from fetched sources and optional gap detection result.
 * Used to emit the 'verification' progress event before synthesis.
 */
export declare function computeVerificationSummary(sources: FetchedSource[], gapResult?: GapDetectionResult): VerificationSummary;
57
108
  /**
58
109
  * Run a deep research session.
59
110
  *
@@ -61,3 +112,4 @@ export interface DeepResearchResponse {
61
112
  * relevance scoring → gap detection → re-search loop → synthesis.
62
113
  */
63
114
  export declare function runDeepResearch(req: DeepResearchRequest): Promise<DeepResearchResponse>;
115
+ export {};
@@ -39,6 +39,85 @@ function normalizeUrl(url) {
39
39
  return url.toLowerCase().replace(/^https?:\/\/(www\.)?/, '').replace(/\/$/, '');
40
40
  }
41
41
  }
42
/**
 * Extract the bare hostname (lower-cased, without a leading `www.`) from a URL.
 *
 * Absolute URLs are parsed with the WHATWG `URL` class. Strings that `URL`
 * rejects — most commonly scheme-less input such as `www.example.com/path` —
 * go through a best-effort string fallback so that the same site yields the
 * same domain whether or not the scheme was present.
 *
 * @param url - URL (or URL-like string) to inspect.
 * @returns The hostname without `www.`, or an empty string when nothing usable
 *          can be extracted (including non-string input).
 */
function extractDomain(url) {
    // Honour the "empty string on failure" contract instead of throwing below.
    if (typeof url !== 'string') {
        return '';
    }
    try {
        return new URL(url).hostname.toLowerCase().replace(/^www\./, '');
    }
    catch {
        // Fallback: drop the scheme, then cut at the first path/query/fragment
        // delimiter so "example.com?x=1" does not keep its query string.
        const authority = url
            .toLowerCase()
            .replace(/^https?:\/\//, '')
            .split(/[\/?#]/)[0] ?? '';
        // Strip an explicit port and the "www." prefix (the prefix must be
        // removed even when no scheme preceded it).
        return authority.replace(/:\d+$/, '').replace(/^www\./, '');
    }
}
51
+ // ---------------------------------------------------------------------------
52
+ // Source Credibility
53
+ // ---------------------------------------------------------------------------
54
/** Official TLDs and hostnames that indicate high-authority sources */
const OFFICIAL_TLDS = new Set(['.gov', '.edu', '.mil']);
const OFFICIAL_HOSTNAMES = new Set([
    // Academic / research
    'arxiv.org', 'scholar.google.com', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
    'jstor.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org', 'bmj.com',
    'thelancet.com', 'plos.org', 'springer.com', 'elsevier.com',
    // International organisations
    'who.int', 'un.org', 'worldbank.org', 'imf.org', 'oecd.org', 'europa.eu',
    // Official tech documentation
    'docs.python.org', 'developer.mozilla.org', 'nodejs.org', 'rust-lang.org',
    'docs.microsoft.com', 'learn.microsoft.com', 'developer.apple.com',
    'developer.android.com', 'php.net', 'ruby-lang.org', 'golang.org', 'go.dev',
]);
const VERIFIED_HOSTNAMES = new Set([
    // Encyclopaedia / reference
    'wikipedia.org', 'en.wikipedia.org',
    // Reputable news agencies
    'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'nytimes.com',
    'washingtonpost.com', 'theguardian.com', 'economist.com', 'ft.com',
    // Developer resources
    'github.com', 'stackoverflow.com', 'npmjs.com', 'pypi.org',
    'crates.io', 'docs.rs', 'packagist.org',
    // Official cloud / vendor docs
    'docs.aws.amazon.com', 'cloud.google.com', 'docs.github.com',
    'azure.microsoft.com', 'registry.terraform.io',
]);
/**
 * List a hostname followed by each of its parent domains, most specific first,
 * stopping before the bare top-level label
 * (e.g. "de.wikipedia.org" → ["de.wikipedia.org", "wikipedia.org"]).
 * Matching is label-based, so "fun.org" never matches an entry for "un.org".
 */
function hostnameWithParents(hostname) {
    const labels = hostname.split('.');
    const chain = [];
    for (let i = 0; i < labels.length - 1; i++) {
        chain.push(labels.slice(i).join('.'));
    }
    // A single-label host (e.g. "localhost") has no parents; check it as-is.
    return chain.length > 0 ? chain : [hostname];
}
/**
 * Assess the credibility of a source URL.
 *
 * The hostname is lower-cased and stripped of a leading "www." and a trailing
 * root dot, then classified in this order:
 *   1. it ends with an official TLD (.gov / .edu / .mil)      → official
 *   2. it, or one of its parent domains, is a known official host → official
 *   3. it, or one of its parent domains, is a known verified host → verified
 *   4. anything else, including unparseable URLs               → general
 *
 * Parent domains are consulted so that subdomains of a listed site
 * (de.wikipedia.org, link.springer.com, news.bbc.co.uk) receive the same
 * rating as the site itself.
 *
 * @param url - Absolute URL of the source.
 * @returns A fresh object with:
 *   - tier: 'official' | 'verified' | 'general'
 *   - stars: 3 / 2 / 1
 *   - label: human-readable string for the synthesis prompt
 */
export function getSourceCredibility(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname
            .toLowerCase()
            .replace(/^www\./, '')
            // "example.gov." (fully-qualified form) must still match ".gov".
            .replace(/\.$/, '');
    }
    catch {
        return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
    }
    // Check official TLDs
    for (const tld of OFFICIAL_TLDS) {
        if (hostname.endsWith(tld)) {
            return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
        }
    }
    const candidates = hostnameWithParents(hostname);
    // Check known official hostnames (exact host or any parent domain)
    if (candidates.some((domain) => OFFICIAL_HOSTNAMES.has(domain))) {
        return { tier: 'official', stars: 3, label: 'OFFICIAL SOURCE' };
    }
    // Check known verified hostnames (exact host or any parent domain)
    if (candidates.some((domain) => VERIFIED_HOSTNAMES.has(domain))) {
        return { tier: 'verified', stars: 2, label: 'VERIFIED' };
    }
    // Everything else
    return { tier: 'general', stars: 1, label: 'UNVERIFIED' };
}
113
/**
 * Render a numeric star rating as a three-glyph string.
 *
 * Ratings of 3 or more give three filled stars, ratings of 2 or more give
 * two, and every other value (including NaN) gives one.
 *
 * @param stars - Numeric star rating.
 * @returns One of '★★★', '★★☆' or '★☆☆'.
 */
export function starsString(stars) {
    return stars >= 3 ? '★★★' : stars >= 2 ? '★★☆' : '★☆☆';
}
42
121
  // ---------------------------------------------------------------------------
43
122
  // LLM call with merged token tracking
44
123
  // ---------------------------------------------------------------------------
@@ -182,9 +261,11 @@ async function fetchSources(searchResults, maxSources, signal) {
182
261
  }));
183
262
  for (const outcome of settled) {
184
263
  if (outcome.status === 'fulfilled') {
264
+ const src = outcome.value;
185
265
  fetched.push({
186
- ...outcome.value,
266
+ ...src,
187
267
  relevanceScore: 0, // filled in step 4
268
+ credibility: getSourceCredibility(src.result.url),
188
269
  });
189
270
  }
190
271
  }
@@ -224,7 +305,43 @@ function scoreSources(sources, question) {
224
305
  });
225
306
  }
226
307
  async function detectGaps(question, sources, config, tokens, signal) {
227
- // Build summary of what we have
308
+ // ── Heuristic pre-checks (no LLM call needed) ──────────────────────────
309
+ if (sources.length >= 3) {
310
+ // Heuristic 1: All sources from the same domain → need diversity
311
+ const domains = sources.map((s) => extractDomain(s.result.url));
312
+ const uniqueDomains = new Set(domains.filter((d) => d.length > 0));
313
+ if (uniqueDomains.size === 1) {
314
+ const soloDomain = [...uniqueDomains][0];
315
+ return {
316
+ hasEnoughInfo: false,
317
+ gaps: [
318
+ `All ${sources.length} sources are from the same domain (${soloDomain}). Diverse sources needed for reliable research.`,
319
+ ],
320
+ additionalQueries: [
321
+ `${question} alternative perspectives`,
322
+ `${question} overview explanation`,
323
+ ],
324
+ conflicts: [],
325
+ confidence: 'low',
326
+ };
327
+ }
328
+ // Heuristic 2: Question implies need for official docs but no official sources found
329
+ const hasOfficialSource = sources.some((s) => (s.credibility || getSourceCredibility(s.result.url)).tier === 'official');
330
+ const questionWantsOfficial = /\b(official|documentation|docs|policy|government|authority|academic|standards?|specification|rfc)\b/i.test(question);
331
+ if (!hasOfficialSource && questionWantsOfficial) {
332
+ return {
333
+ hasEnoughInfo: false,
334
+ gaps: ['No official or academic sources found. The question requires authoritative documentation.'],
335
+ additionalQueries: [
336
+ `${question} site:.gov OR site:.edu`,
337
+ `${question} official documentation`,
338
+ ],
339
+ conflicts: [],
340
+ confidence: 'low',
341
+ };
342
+ }
343
+ }
344
+ // ── LLM-based gap + conflict detection ─────────────────────────────────
228
345
  const topSources = sources
229
346
  .sort((a, b) => b.relevanceScore - a.relevanceScore)
230
347
  .slice(0, 8);
@@ -240,40 +357,44 @@ async function detectGaps(question, sources, config, tokens, signal) {
240
357
  content: [
241
358
  'You are a research quality assessor. Given a question and the sources collected so far,',
242
359
  'determine if there is sufficient information to write a comprehensive answer.',
360
+ 'Also detect any factual conflicts between sources.',
243
361
  '',
244
362
  'Respond in this EXACT JSON format (no markdown, no code blocks):',
245
363
  '{',
246
364
  ' "hasEnoughInfo": boolean,',
247
365
  ' "gaps": ["gap1", "gap2"],',
248
- ' "additionalQueries": ["query1", "query2"]',
366
+ ' "additionalQueries": ["query1", "query2"],',
367
+ ' "conflicts": ["Source A says X while Source B says Y"],',
368
+ ' "confidence": "high" | "medium" | "low"',
249
369
  '}',
250
370
  '',
251
371
  '"gaps" should be 0-3 specific aspects not covered by the sources.',
252
372
  '"additionalQueries" should be 0-3 new search queries to fill those gaps.',
373
+ '"conflicts" should be 0-3 factual disagreements found between sources.',
374
+ '"confidence": high = consistent official sources, medium = mixed, low = conflicting or poor sources.',
253
375
  'If hasEnoughInfo is true, set gaps and additionalQueries to empty arrays.',
254
376
  ].join('\n'),
255
377
  },
256
378
  {
257
379
  role: 'user',
258
- content: `Question: "${question}"\n\nSources collected:\n\n${contextSummary}\n\nAnalyze coverage and gaps:`,
380
+ content: `Question: "${question}"\n\nSources collected:\n\n${contextSummary}\n\nAnalyze coverage, gaps, and conflicts:`,
259
381
  },
260
382
  ];
261
383
  let text;
262
384
  try {
263
385
  text = await callWithTracking(config, messages, tokens, {
264
386
  signal,
265
- maxTokens: 600,
387
+ maxTokens: 700,
266
388
  });
267
389
  }
268
390
  catch (err) {
269
391
  if (isFreeTierLimitError(err))
270
392
  throw err;
271
393
  // On LLM failure, assume we have enough info
272
- return { hasEnoughInfo: true, gaps: [], additionalQueries: [] };
394
+ return { hasEnoughInfo: true, gaps: [], additionalQueries: [], conflicts: [], confidence: 'medium' };
273
395
  }
274
396
  // Parse JSON response
275
397
  try {
276
- // Strip markdown code fences if present
277
398
  const cleaned = text
278
399
  .replace(/```json\s*/gi, '')
279
400
  .replace(/```\s*/g, '')
@@ -285,28 +406,80 @@ async function detectGaps(question, sources, config, tokens, signal) {
285
406
  additionalQueries: Array.isArray(json.additionalQueries)
286
407
  ? json.additionalQueries.slice(0, 3)
287
408
  : [],
409
+ conflicts: Array.isArray(json.conflicts) ? json.conflicts.slice(0, 3) : [],
410
+ confidence: ['high', 'medium', 'low'].includes(String(json.confidence))
411
+ ? json.confidence
412
+ : 'medium',
288
413
  };
289
414
  }
290
415
  catch {
291
- // Couldn't parse JSON assume enough info
292
- return { hasEnoughInfo: true, gaps: [], additionalQueries: [] };
416
+ return { hasEnoughInfo: true, gaps: [], additionalQueries: [], conflicts: [], confidence: 'medium' };
417
+ }
418
+ }
419
/**
 * Summarise source quality ahead of synthesis; the result feeds the
 * 'verification' progress event.
 *
 * @param sources   - Fetched sources. Each source's stored `credibility` is
 *                    used when present, otherwise it is computed from
 *                    `result.url`.
 * @param gapResult - Optional gap-detection result. Its `confidence`, when
 *                    set, overrides the tier-based estimate, and its
 *                    `conflicts` are passed through.
 * @returns Conflicts, confidence, the domain-diversity flag and per-tier counts.
 */
export function computeVerificationSummary(sources, gapResult) {
    // Tally sources per credibility tier.
    let officialCount = 0;
    let verifiedCount = 0;
    let generalCount = 0;
    for (const source of sources) {
        const { tier } = source.credibility || getSourceCredibility(source.result.url);
        if (tier === 'official') {
            officialCount += 1;
        }
        else if (tier === 'verified') {
            verifiedCount += 1;
        }
        else if (tier === 'general') {
            generalCount += 1;
        }
    }
    // Treat an empty list as size 1 so the ratio below never divides by zero.
    const total = sources.length || 1;
    // Diversity: three distinct domains, or fewer when there are fewer sources.
    const distinctDomains = new Set();
    for (const source of sources) {
        const domain = extractDomain(source.result.url);
        if (domain.length > 0) {
            distinctDomains.add(domain);
        }
    }
    const sourceDiversity = distinctDomains.size >= Math.min(3, total);
    // Prefer the confidence reported by gap detection; otherwise derive it
    // from the share of official/verified sources.
    let confidence = gapResult?.confidence;
    if (!confidence) {
        const highQualityRatio = (officialCount + verifiedCount) / total;
        confidence = officialCount >= 2 || highQualityRatio >= 0.5
            ? 'high'
            : verifiedCount >= 1 || highQualityRatio >= 0.25
                ? 'medium'
                : 'low';
    }
    return {
        conflicts: gapResult?.conflicts ?? [],
        confidence,
        sourceDiversity,
        officialCount,
        verifiedCount,
        generalCount,
    };
}
295
452
  // ---------------------------------------------------------------------------
296
453
  // Step 7: Synthesis
297
454
  // ---------------------------------------------------------------------------
298
455
  async function synthesizeReport(question, sources, config, tokens, opts) {
299
- // Sort by relevance, take best sources (max 15 for context)
456
+ // Sort by credibility tier first (official > verified > general), then by relevance
457
+ const tierOrder = { official: 0, verified: 1, general: 2 };
300
458
  const topSources = sources
301
- .sort((a, b) => b.relevanceScore - a.relevanceScore)
459
+ .map((s) => ({ ...s, credibility: s.credibility || getSourceCredibility(s.result.url) }))
460
+ .sort((a, b) => {
461
+ const tierDiff = (tierOrder[a.credibility.tier] ?? 2) - (tierOrder[b.credibility.tier] ?? 2);
462
+ if (tierDiff !== 0)
463
+ return tierDiff;
464
+ return b.relevanceScore - a.relevanceScore;
465
+ })
302
466
  .slice(0, 15);
303
- // Build context
467
+ // Build context with credibility labels
304
468
  const contextParts = [];
305
469
  const citations = [];
306
470
  topSources.forEach((source, i) => {
307
471
  const idx = i + 1;
472
+ const cred = source.credibility;
473
+ const stars = starsString(cred.stars);
308
474
  const sanitized = sanitizeForLLM(truncate(source.content || source.result.snippet || '', 3000));
309
- contextParts.push(`SOURCE [${idx}]\nTitle: ${source.result.title}\nURL: ${source.result.url}\n\n${sanitized.content}`);
475
+ contextParts.push([
476
+ `SOURCE [${idx}] ${stars}`,
477
+ `Title: ${source.result.title}`,
478
+ `URL: ${source.result.url}`,
479
+ `Credibility: ${cred.label}`,
480
+ '',
481
+ sanitized.content,
482
+ ].join('\n'));
310
483
  citations.push({
311
484
  index: idx,
312
485
  title: source.result.title,
@@ -321,19 +494,27 @@ async function synthesizeReport(question, sources, config, tokens, opts) {
321
494
  role: 'system',
322
495
  content: [
323
496
  'You are a research analyst that writes comprehensive, well-cited reports.',
324
- 'Use ONLY the provided sources to answer the question.',
325
- 'Cite sources using bracketed numbers like [1], [2], [3].',
326
- 'Structure your report with:',
327
- ' - A brief executive summary',
328
- ' - Key findings (with citations)',
329
- ' - Detailed analysis',
330
- ' - Conclusion',
331
- 'Do not fabricate URLs or citations. Do not include information not found in the sources.',
497
+ 'Each source is rated by credibility:',
498
+ ' ★★★ = OFFICIAL SOURCE (government, academic, official docs) — highest authority',
499
+ ' ★★☆ = VERIFIED (reputable news, Wikipedia, major developer platforms)',
500
+ ' ★☆☆ = UNVERIFIED (blogs, forums, unknown sites) — use with caution',
501
+ '',
502
+ 'Rules:',
503
+ ' - Prioritize official sources [★★★] over unverified ones [★☆☆]',
504
+ ' - If sources disagree, note the conflict and trust the higher-credibility source',
505
+ ' - Cite every factual claim with [1], [2], etc.',
506
+ ' - Use ONLY the provided sources — do not fabricate information or citations',
507
+ ' - Structure your report with:',
508
+ ' • Executive Summary',
509
+ ' • Key Findings (with citations)',
510
+ ' • Detailed Analysis',
511
+ ' • Conclusion',
512
+ ' - End with: **Confidence: HIGH/MEDIUM/LOW** based on source quality and agreement',
332
513
  ].join('\n'),
333
514
  },
334
515
  {
335
516
  role: 'user',
336
- content: `Research question: "${question}"\n\nSources:\n\n${context}\n\nWrite a comprehensive research report with citations:`,
517
+ content: `Research question: "${question}"\n\nSources (ranked by credibility):\n\n${context}\n\nWrite a comprehensive research report with citations:`,
337
518
  },
338
519
  ];
339
520
  const report = await callWithTracking(config, messages, tokens, {
@@ -374,6 +555,7 @@ export async function runDeepResearch(req) {
374
555
  const allSources = [];
375
556
  const seenUrls = new Set();
376
557
  let usedQueries = new Set();
558
+ let lastGapResult;
377
559
  // ── Round 0..maxRounds ────────────────────────────────────────────────────
378
560
  let currentQueries = [];
379
561
  for (let round = 0; round < maxRounds; round++) {
@@ -447,6 +629,7 @@ export async function runDeepResearch(req) {
447
629
  throw err;
448
630
  break;
449
631
  }
632
+ lastGapResult = gapResult;
450
633
  if (gapResult.hasEnoughInfo || gapResult.additionalQueries.length === 0) {
451
634
  break;
452
635
  }
@@ -459,6 +642,20 @@ export async function runDeepResearch(req) {
459
642
  });
460
643
  currentQueries = gapResult.additionalQueries;
461
644
  }
645
+ // Verification summary (emitted before synthesis so streaming clients can show status)
646
+ const verifySummary = computeVerificationSummary(allSources, lastGapResult);
647
+ progress({
648
+ type: 'verification',
649
+ message: `Verification complete — confidence: ${verifySummary.confidence.toUpperCase()}`,
650
+ data: {
651
+ conflicts: verifySummary.conflicts,
652
+ confidence: verifySummary.confidence,
653
+ sourceDiversity: verifySummary.sourceDiversity,
654
+ officialCount: verifySummary.officialCount,
655
+ verifiedCount: verifySummary.verifiedCount,
656
+ generalCount: verifySummary.generalCount,
657
+ },
658
+ });
462
659
  // Step 7: Synthesis
463
660
  progress({ type: 'synthesizing', message: 'Synthesizing research report…' });
464
661
  // Sort all sources by relevance for synthesis
@@ -728,7 +728,7 @@ ${commentsMd || '*No comments found.*'}`;
728
728
  };
729
729
  });
730
730
  const subredditName = posts[0]?.url?.match(/\/r\/([^/]+)\//)?.[1] || path.match(/\/r\/([^/]+)/)?.[1] || '';
731
- const structured = { subreddit: `r/${subredditName}`, posts };
731
+ const structured = { title: `r/${subredditName} — Top Posts`, subreddit: `r/${subredditName}`, posts };
732
732
  const cleanContent = `## 📋 r/${subredditName} — Hot Posts
733
733
 
734
734
  ${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ↑ ${p.score} | 💬 ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n ${p.url}`).join('\n\n')}`;
@@ -756,7 +756,7 @@ ${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ↑ ${p.score}
756
756
  flair: d.link_flair_text || null,
757
757
  };
758
758
  });
759
- const structured = { sortType, posts, postCount: posts.length };
759
+ const structured = { title: `Reddit — ${sortType.charAt(0).toUpperCase() + sortType.slice(1)} Posts`, sortType, posts, postCount: posts.length };
760
760
  const listMd = posts.map((p, i) => {
761
761
  const flairTag = p.flair ? ` | ${p.flair}` : '';
762
762
  return `${i + 1}. **${p.title}**\n ${p.author} in ${p.subreddit} | ↑ ${p.score} | 💬 ${p.commentCount}${flairTag}\n ${p.url}`;
@@ -918,6 +918,7 @@ ${commentsMd || '*No comments.*'}`;
918
918
  catch { /* ignore */ }
919
919
  }
920
920
  const structured = {
921
+ title: `${owner}/${repo}`,
921
922
  name: `${owner}/${repo}`,
922
923
  description: repoData.description || '',
923
924
  stars: repoData.stargazers_count ?? 0,
@@ -1039,7 +1040,7 @@ ${commentsMd || '*No comments found.*'}`;
1039
1040
  url: s.url || `https://news.ycombinator.com/item?id=${s.id}`,
1040
1041
  hnUrl: `https://news.ycombinator.com/item?id=${s.id}`,
1041
1042
  }));
1042
- const structured = { stories };
1043
+ const structured = { title: 'Hacker News — Front Page', stories };
1043
1044
  const cleanContent = `## 🟠 Hacker News — Front Page
1044
1045
 
1045
1046
  ${stories.map((s, i) => `${i + 1}. **${s.title}**\n ↑ ${s.score} | 💬 ${s.commentCount} | by ${s.author}\n ${s.url}`).join('\n\n')}`;
@@ -1346,15 +1347,24 @@ async function arxivExtractor(_html, url) {
1346
1347
  const match = xml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
1347
1348
  return match ? stripHtml(match[1]).trim() : '';
1348
1349
  };
1349
- const getAllTags = (tag) => {
1350
- const matches = [...xml.matchAll(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'g'))];
1350
+ // getAllTags was removed because it was unused.
1351
+ // ArXiv Atom feed: <feed><title>query URL</title> ... <entry><title>Paper Title</title>...
1352
+ // We must grab the entry title, not the feed title.
1353
+ const entryMatch = xml.match(/<entry[\s\S]*?<\/entry>/);
1354
+ const entryXml = entryMatch ? entryMatch[0] : xml;
1355
+ const getEntryTag = (tag) => {
1356
+ const match = entryXml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
1357
+ return match ? stripHtml(match[1]).trim() : '';
1358
+ };
1359
+ const getAllEntryTags = (tag) => {
1360
+ const matches = [...entryXml.matchAll(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'g'))];
1351
1361
  return matches.map(m => stripHtml(m[1]).trim()).filter(Boolean);
1352
1362
  };
1353
- const title = getTag('title');
1354
- const summary = getTag('summary');
1355
- const published = getTag('published');
1356
- const updated = getTag('updated');
1357
- const authors = getAllTags('name');
1363
+ const title = getEntryTag('title') || getTag('title');
1364
+ const summary = getEntryTag('summary') || getTag('summary');
1365
+ const published = getEntryTag('published') || getTag('published');
1366
+ const updated = getEntryTag('updated') || getTag('updated');
1367
+ const authors = getAllEntryTags('name');
1358
1368
  // Extract categories
1359
1369
  const categories = [...xml.matchAll(/category[^>]*term="([^"]+)"/g)].map(m => m[1]);
1360
1370
  // Extract DOI and journal ref if available
@@ -8,7 +8,7 @@
8
8
  * 4. Google Gemini (BYOK)
9
9
  * 5. Ollama (local, OpenAI-compatible)
10
10
  */
11
- export type DeepResearchLLMProvider = 'cloudflare' | 'openai' | 'anthropic' | 'google' | 'ollama';
11
+ export type DeepResearchLLMProvider = 'cloudflare' | 'openai' | 'anthropic' | 'google' | 'ollama' | 'cerebras';
12
12
  export interface LLMConfig {
13
13
  provider: DeepResearchLLMProvider;
14
14
  apiKey?: string;
@@ -64,7 +64,10 @@ export declare function resetNeuronUsage(): void;
64
64
  export declare function callLLM(config: LLMConfig, options: LLMCallOptions): Promise<LLMCallResult>;
65
65
  /**
66
66
  * Get the default LLM config based on available environment variables.
67
- * Falls back to Cloudflare if nothing else is configured.
67
+ *
68
+ * Priority order: Anthropic → OpenAI → Google → Cerebras → Cloudflare (free tier fallback).
69
+ * If no BYOK key and no Cloudflare credentials are configured, returns a cloudflare config
70
+ * that will throw a clear error when callLLM is invoked (CLOUDFLARE_ACCOUNT_ID missing).
68
71
  */
69
72
  export declare function getDefaultLLMConfig(): LLMConfig;
70
73
  /** Type guard: check if a thrown value is a FreeTierLimitError */