webpeel 0.21.2 → 0.21.5

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
@@ -44,6 +44,10 @@ async function runStdin(options) {
44
44
  // ─── runFetch ─────────────────────────────────────────────────────────────────
45
45
  // Main fetch handler — shared with the `pipe` and `ask` subcommands
46
46
  export async function runFetch(url, options) {
47
+ // --silent: suppress all log output (set env var before any logger fires)
48
+ if (options.silent && !process.env.WEBPEEL_LOG_LEVEL) {
49
+ process.env.WEBPEEL_LOG_LEVEL = 'silent';
50
+ }
47
51
  // --content-only: override all output flags — we just want raw content
48
52
  if (options.contentOnly) {
49
53
  options.silent = true;
@@ -452,12 +456,25 @@ export async function runFetch(url, options) {
452
456
  // Do NOT set extract here — peel runs normally, LLM extraction happens below.
453
457
  }
454
458
  else if (options.extract) {
455
- // CSS-based extraction
459
+ // Smart extract: detect schema format vs CSS selectors
460
+ let extractJson;
456
461
  try {
457
- extract = { selectors: JSON.parse(options.extract) };
462
+ extractJson = JSON.parse(options.extract);
458
463
  }
459
464
  catch {
460
- throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')'), { _code: 'FETCH_FAILED' });
465
+ throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\' or \'{"company": "string"}\')'), { _code: 'FETCH_FAILED' });
466
+ }
467
+ // If all values are type names (string/boolean/number/array/object),
468
+ // treat as structured schema extraction (routed to extractStructured after fetch).
469
+ // Otherwise treat as CSS selector map.
470
+ const { isTypeSchema } = await import('../../core/structured-extract.js');
471
+ if (isTypeSchema(extractJson)) {
472
+ // Mark for post-fetch structured extraction (handled below)
473
+ options._structuredSchema = extractJson;
474
+ }
475
+ else {
476
+ // CSS-based extraction
477
+ extract = { selectors: extractJson };
461
478
  }
462
479
  }
463
480
  // Validate maxTokens
@@ -786,6 +803,32 @@ export async function runFetch(url, options) {
786
803
  console.error(`⚠ ${warningMsg}`);
787
804
  }
788
805
  }
806
+ // --- Structured schema extraction (--extract with type schema or --extract-prompt) ---
807
+ if (options._structuredSchema || options.extractPrompt) {
808
+ const { extractStructured, simpleToExtractionSchema } = await import('../../core/structured-extract.js');
809
+ const rawSchema = options._structuredSchema;
810
+ const schema = rawSchema
811
+ ? simpleToExtractionSchema(rawSchema)
812
+ : { type: 'object', properties: { result: { type: 'string', description: options.extractPrompt } } };
813
+ const strResult = await extractStructured(result.content, schema, undefined, // No LLM config — use heuristic (no key needed)
814
+ options.extractPrompt);
815
+ if (isJson) {
816
+ await writeStdout(JSON.stringify({
817
+ success: true,
818
+ data: strResult.data,
819
+ confidence: strResult.confidence,
820
+ method: 'heuristic',
821
+ }, null, 2) + '\n');
822
+ }
823
+ else {
824
+ await writeStdout(JSON.stringify(strResult.data, null, 2) + '\n');
825
+ if (!options.silent) {
826
+ console.error(`\n📊 Structured extraction: confidence=${(strResult.confidence * 100).toFixed(0)}% (heuristic)`);
827
+ }
828
+ }
829
+ await cleanup();
830
+ process.exit(0);
831
+ }
789
832
  // --- LLM-based extraction (post-peel) ---
790
833
  if (options.llmExtract || options.extractSchema) {
791
834
  const { extractWithLLM } = await import('../../core/llm-extract.js');
@@ -1091,7 +1134,8 @@ export function registerFetchCommands(program) {
1091
1134
  .option('--full', 'Alias for --raw — full page content, no budget')
1092
1135
  .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
1093
1136
  .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
1094
- .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
1137
+ .option('--extract <json>', 'Extract structured data using CSS selectors or type schema (e.g., \'{"title": "h1"}\' for CSS, \'{"name": "string"}\' for schema)')
1138
+ .option('--extract-prompt <prompt>', 'Natural language prompt for structured extraction (no LLM key needed — uses heuristics)')
1095
1139
  .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
1096
1140
  .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
1097
1141
  .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
package/dist/cli/utils.js CHANGED
@@ -33,7 +33,10 @@ export async function checkForUpdates() {
33
33
  const data = await res.json();
34
34
  const latest = data.version;
35
35
  if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
36
- console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
36
+ // Skip update notice in silent mode
37
+ if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
38
+ console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
39
+ }
37
40
  }
38
41
  }
39
42
  catch { /* silently ignore — don't slow down the user */ }
package/dist/cli.js CHANGED
@@ -22,6 +22,11 @@ import { registerInteractCommands } from './cli/commands/interact.js';
22
22
  import { registerAuthCommands } from './cli/commands/auth.js';
23
23
  import { registerScreenshotCommands } from './cli/commands/screenshot.js';
24
24
  import { registerJobsCommands } from './cli/commands/jobs.js';
25
+ // ── Early silent/log-level detection (must happen before any async module code) ──
26
+ // Set WEBPEEL_LOG_LEVEL early so logger checks see it when async IIFEs fire.
27
+ if (!process.env.WEBPEEL_LOG_LEVEL && process.argv.includes('--silent')) {
28
+ process.env.WEBPEEL_LOG_LEVEL = 'silent';
29
+ }
25
30
  // ── Verb alias intercept (before Commander parses) ────────────────────────────
26
31
  // "webpeel fetch <url>" → "webpeel <url>"
27
32
  // Note: 'read' is intentionally excluded — it's a registered subcommand.
@@ -0,0 +1,115 @@
1
+ /**
2
+ * WebPeel Deep Research
3
+ *
4
+ * Multi-step search agent that turns one question into a comprehensive,
5
+ * cited research report. Orchestrates:
6
+ *
7
+ * 1. Query Decomposition — LLM breaks question into 3-5 sub-queries
8
+ * 2. Parallel Multi-Search — All sub-queries across DDG + Stealth
9
+ * 3. Source Fetching — peel() on top results per sub-query
10
+ * 4. Relevance Scoring — BM25 against the original question
11
+ * 5. Gap Detection — LLM: "Is there enough info? What's missing?"
12
+ * 6. Re-Search Loop — Generate new queries if gaps found (max N rounds)
13
+ * 7. Synthesis — LLM generates final cited report
14
+ */
15
+ import { type WebSearchResult } from './search-provider.js';
16
+ import { type LLMConfig } from './llm-provider.js';
17
+ export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'verification' | 'synthesizing' | 'done' | 'error';
18
+ export interface DeepResearchProgressEvent {
19
+ type: ProgressEventType;
20
+ message: string;
21
+ round?: number;
22
+ data?: Record<string, unknown>;
23
+ }
24
+ export interface Citation {
25
+ index: number;
26
+ title: string;
27
+ url: string;
28
+ snippet: string;
29
+ relevanceScore: number;
30
+ }
31
+ export interface DeepResearchRequest {
32
+ question: string;
33
+ llm?: LLMConfig;
34
+ /** Maximum research rounds (default: 3) */
35
+ maxRounds?: number;
36
+ /** Maximum sources to consider (default: 20) */
37
+ maxSources?: number;
38
+ stream?: boolean;
39
+ /** Called with incremental report text when stream=true */
40
+ onChunk?: (text: string) => void;
41
+ /** Called with progress updates */
42
+ onProgress?: (event: DeepResearchProgressEvent) => void;
43
+ signal?: AbortSignal;
44
+ }
45
+ export interface DeepResearchResponse {
46
+ report: string;
47
+ citations: Citation[];
48
+ sourcesUsed: number;
49
+ roundsCompleted: number;
50
+ totalSearchQueries: number;
51
+ llmProvider: string;
52
+ tokensUsed: {
53
+ input: number;
54
+ output: number;
55
+ };
56
+ elapsed: number;
57
+ }
58
+ /** Source credibility assessment */
59
+ export interface SourceCredibility {
60
+ /** Credibility tier */
61
+ tier: 'official' | 'verified' | 'general';
62
+ /** Star rating (1–3) */
63
+ stars: number;
64
+ /** Human-readable label */
65
+ label: string;
66
+ }
67
+ interface FetchedSource {
68
+ result: WebSearchResult;
69
+ content: string;
70
+ relevanceScore: number;
71
+ subQuery: string;
72
+ /** Credibility assessment (populated after fetchSources) */
73
+ credibility?: SourceCredibility;
74
+ }
75
+ /**
76
+ * Assess the credibility of a source URL.
77
+ *
78
+ * Returns:
79
+ * - tier: 'official' | 'verified' | 'general'
80
+ * - stars: 3 / 2 / 1
81
+ * - label: human-readable string for the synthesis prompt
82
+ */
83
+ export declare function getSourceCredibility(url: string): SourceCredibility;
84
+ /** Render the stars string for a given star count */
85
+ export declare function starsString(stars: number): string;
86
+ interface GapDetectionResult {
87
+ hasEnoughInfo: boolean;
88
+ gaps: string[];
89
+ additionalQueries: string[];
90
+ /** Detected source conflicts (optional, from LLM analysis) */
91
+ conflicts?: string[];
92
+ /** Overall confidence level based on source quality */
93
+ confidence?: 'high' | 'medium' | 'low';
94
+ }
95
+ interface VerificationSummary {
96
+ conflicts: string[];
97
+ confidence: 'high' | 'medium' | 'low';
98
+ sourceDiversity: boolean;
99
+ officialCount: number;
100
+ verifiedCount: number;
101
+ generalCount: number;
102
+ }
103
+ /**
104
+ * Compute a verification summary from fetched sources and optional gap detection result.
105
+ * Used to emit the 'verification' progress event before synthesis.
106
+ */
107
+ export declare function computeVerificationSummary(sources: FetchedSource[], gapResult?: GapDetectionResult): VerificationSummary;
108
+ /**
109
+ * Run a deep research session.
110
+ *
111
+ * Orchestrates query decomposition → multi-search → source fetching →
112
+ * relevance scoring → gap detection → re-search loop → synthesis.
113
+ */
114
+ export declare function runDeepResearch(req: DeepResearchRequest): Promise<DeepResearchResponse>;
115
+ export {};