npm - webpeel - Versions diffs - 0.21.2 → 0.21.5 - Mend

webpeel 0.21.2 → 0.21.5

This diff shows the differences between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/dist/cli/commands/fetch.js +48 -4
package/dist/cli/utils.js +4 -1
package/dist/cli.js +5 -0
package/dist/core/deep-research.d.ts +115 -0
package/dist/core/deep-research.js +684 -0
package/dist/core/domain-extractors.js +20 -10
package/dist/core/llm-provider.d.ts +74 -0
package/dist/core/llm-provider.js +570 -0
package/dist/core/source-scoring.d.ts +166 -0
package/dist/core/source-scoring.js +396 -0
package/dist/core/structured-extract.d.ts +43 -0
package/dist/core/structured-extract.js +276 -0
package/dist/server/app.js +6 -0
package/dist/server/routes/ask.js +61 -26
package/dist/server/routes/deep-research.d.ts +11 -0
package/dist/server/routes/deep-research.js +232 -0
package/dist/server/routes/extract.d.ts +9 -3
package/dist/server/routes/extract.js +159 -81
package/package.json +1 -1

package/dist/cli/commands/fetch.js CHANGED Viewed

@@ -44,6 +44,10 @@ async function runStdin(options) {
 // ─── runFetch ─────────────────────────────────────────────────────────────────
 // Main fetch handler — shared with the `pipe` and `ask` subcommands
 export async function runFetch(url, options) {
+    // --silent: suppress all log output (set env var before any logger fires)
+    if (options.silent && !process.env.WEBPEEL_LOG_LEVEL) {
+        process.env.WEBPEEL_LOG_LEVEL = 'silent';
+    }
     // --content-only: override all output flags — we just want raw content
     if (options.contentOnly) {
         options.silent = true;
@@ -452,12 +456,25 @@ export async function runFetch(url, options) {
             // Do NOT set extract here — peel runs normally, LLM extraction happens below.
         }
         else if (options.extract) {
-            // CSS-based extraction
+            // Smart extract: detect schema format vs CSS selectors
+            let extractJson;
             try {
-                extract = { selectors: JSON.parse(options.extract) };
+                extractJson = JSON.parse(options.extract);
             }
             catch {
-                throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')'), { _code: 'FETCH_FAILED' });
+                throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\' or \'{"company": "string"}\')'), { _code: 'FETCH_FAILED' });
+            }
+            // If all values are type names (string/boolean/number/array/object),
+            // treat as structured schema extraction (routed to extractStructured after fetch).
+            // Otherwise treat as CSS selector map.
+            const { isTypeSchema } = await import('../../core/structured-extract.js');
+            if (isTypeSchema(extractJson)) {
+                // Mark for post-fetch structured extraction (handled below)
+                options._structuredSchema = extractJson;
+            }
+            else {
+                // CSS-based extraction
+                extract = { selectors: extractJson };
             }
         }
         // Validate maxTokens
@@ -786,6 +803,32 @@ export async function runFetch(url, options) {
                 console.error(`⚠ ${warningMsg}`);
             }
         }
+        // --- Structured schema extraction (--extract with type schema or --extract-prompt) ---
+        if (options._structuredSchema || options.extractPrompt) {
+            const { extractStructured, simpleToExtractionSchema } = await import('../../core/structured-extract.js');
+            const rawSchema = options._structuredSchema;
+            const schema = rawSchema
+                ? simpleToExtractionSchema(rawSchema)
+                : { type: 'object', properties: { result: { type: 'string', description: options.extractPrompt } } };
+            const strResult = await extractStructured(result.content, schema, undefined, // No LLM config — use heuristic (no key needed)
+            options.extractPrompt);
+            if (isJson) {
+                await writeStdout(JSON.stringify({
+                    success: true,
+                    data: strResult.data,
+                    confidence: strResult.confidence,
+                    method: 'heuristic',
+                }, null, 2) + '\n');
+            }
+            else {
+                await writeStdout(JSON.stringify(strResult.data, null, 2) + '\n');
+                if (!options.silent) {
+                    console.error(`\n📊 Structured extraction: confidence=${(strResult.confidence * 100).toFixed(0)}% (heuristic)`);
+                }
+            }
+            await cleanup();
+            process.exit(0);
+        }
         // --- LLM-based extraction (post-peel) ---
         if (options.llmExtract || options.extractSchema) {
             const { extractWithLLM } = await import('../../core/llm-extract.js');
@@ -1091,7 +1134,8 @@ export function registerFetchCommands(program) {
         .option('--full', 'Alias for --raw — full page content, no budget')
         .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
         .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
-        .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
+        .option('--extract <json>', 'Extract structured data using CSS selectors or type schema (e.g., \'{"title": "h1"}\' for CSS, \'{"name": "string"}\' for schema)')
+        .option('--extract-prompt <prompt>', 'Natural language prompt for structured extraction (no LLM key needed — uses heuristics)')
         .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
         .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
         .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')

package/dist/cli/utils.js CHANGED Viewed

@@ -33,7 +33,10 @@ export async function checkForUpdates() {
         const data = await res.json();
         const latest = data.version;
         if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
-            console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
+            // Skip update notice in silent mode
+            if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
+                console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
+            }
         }
     }
     catch { /* silently ignore — don't slow down the user */ }

package/dist/cli.js CHANGED Viewed

@@ -22,6 +22,11 @@ import { registerInteractCommands } from './cli/commands/interact.js';
 import { registerAuthCommands } from './cli/commands/auth.js';
 import { registerScreenshotCommands } from './cli/commands/screenshot.js';
 import { registerJobsCommands } from './cli/commands/jobs.js';
+// ── Early silent/log-level detection (must happen before any async module code) ──
+// Set WEBPEEL_LOG_LEVEL early so logger checks see it when async IIFEs fire.
+if (!process.env.WEBPEEL_LOG_LEVEL && process.argv.includes('--silent')) {
+    process.env.WEBPEEL_LOG_LEVEL = 'silent';
+}
 // ── Verb alias intercept (before Commander parses) ────────────────────────────
 // "webpeel fetch <url>" → "webpeel <url>"
 // Note: 'read' is intentionally excluded — it's a registered subcommand.

package/dist/core/deep-research.d.ts ADDED Viewed

@@ -0,0 +1,115 @@
+/**
+ * WebPeel Deep Research
+ *
+ * Multi-step search agent that turns one question into a comprehensive,
+ * cited research report. Orchestrates:
+ *
+ *   1. Query Decomposition  — LLM breaks question into 3-5 sub-queries
+ *   2. Parallel Multi-Search — All sub-queries across DDG + Stealth
+ *   3. Source Fetching       — peel() on top results per sub-query
+ *   4. Relevance Scoring     — BM25 against the original question
+ *   5. Gap Detection         — LLM: "Is there enough info? What's missing?"
+ *   6. Re-Search Loop        — Generate new queries if gaps found (max N rounds)
+ *   7. Synthesis             — LLM generates final cited report
+ */
+import { type WebSearchResult } from './search-provider.js';
+import { type LLMConfig } from './llm-provider.js';
+export type ProgressEventType = 'decomposing' | 'searching' | 'fetching' | 'scoring' | 'gap_check' | 'researching' | 'verification' | 'synthesizing' | 'done' | 'error';
+export interface DeepResearchProgressEvent {
+    type: ProgressEventType;
+    message: string;
+    round?: number;
+    data?: Record<string, unknown>;
+}
+export interface Citation {
+    index: number;
+    title: string;
+    url: string;
+    snippet: string;
+    relevanceScore: number;
+}
+export interface DeepResearchRequest {
+    question: string;
+    llm?: LLMConfig;
+    /** Maximum research rounds (default: 3) */
+    maxRounds?: number;
+    /** Maximum sources to consider (default: 20) */
+    maxSources?: number;
+    stream?: boolean;
+    /** Called with incremental report text when stream=true */
+    onChunk?: (text: string) => void;
+    /** Called with progress updates */
+    onProgress?: (event: DeepResearchProgressEvent) => void;
+    signal?: AbortSignal;
+}
+export interface DeepResearchResponse {
+    report: string;
+    citations: Citation[];
+    sourcesUsed: number;
+    roundsCompleted: number;
+    totalSearchQueries: number;
+    llmProvider: string;
+    tokensUsed: {
+        input: number;
+        output: number;
+    };
+    elapsed: number;
+}
+/** Source credibility assessment */
+export interface SourceCredibility {
+    /** Credibility tier */
+    tier: 'official' | 'verified' | 'general';
+    /** Star rating (1–3) */
+    stars: number;
+    /** Human-readable label */
+    label: string;
+}
+interface FetchedSource {
+    result: WebSearchResult;
+    content: string;
+    relevanceScore: number;
+    subQuery: string;
+    /** Credibility assessment (populated after fetchSources) */
+    credibility?: SourceCredibility;
+}
+/**
+ * Assess the credibility of a source URL.
+ *
+ * Returns:
+ *   - tier: 'official' | 'verified' | 'general'
+ *   - stars: 3 / 2 / 1
+ *   - label: human-readable string for the synthesis prompt
+ */
+export declare function getSourceCredibility(url: string): SourceCredibility;
+/** Render the stars string for a given star rating (see SourceCredibility.stars) */
+export declare function starsString(stars: number): string;
+interface GapDetectionResult {
+    hasEnoughInfo: boolean;
+    gaps: string[];
+    additionalQueries: string[];
+    /** Detected source conflicts (optional, from LLM analysis) */
+    conflicts?: string[];
+    /** Overall confidence level based on source quality */
+    confidence?: 'high' | 'medium' | 'low';
+}
+interface VerificationSummary {
+    conflicts: string[];
+    confidence: 'high' | 'medium' | 'low';
+    sourceDiversity: boolean;
+    officialCount: number;
+    verifiedCount: number;
+    generalCount: number;
+}
+/**
+ * Compute a verification summary from fetched sources and optional gap detection result.
+ * Used to emit the 'verification' progress event before synthesis.
+ */
+export declare function computeVerificationSummary(sources: FetchedSource[], gapResult?: GapDetectionResult): VerificationSummary;
+/**
+ * Run a deep research session.
+ *
+ * Orchestrates query decomposition → multi-search → source fetching →
+ * relevance scoring → gap detection → re-search loop → synthesis.
+ */
+export declare function runDeepResearch(req: DeepResearchRequest): Promise<DeepResearchResponse>;
+export {};