webpeel 0.21.28 → 0.21.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,35 @@
4
4
  */
5
5
  export declare function closePool(): Promise<void>;
6
6
  export declare function createAbortError(): Error;
7
+ /**
8
+ * Domains known to aggressively block datacenter IPs.
9
+ * Requests to these domains automatically route through the Webshare residential
10
+ * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
11
+ */
12
+ export declare const PROXY_PREFERRED_DOMAINS: readonly string[];
13
+ /**
14
+ * Returns true if the URL's domain is on the proxy-preferred blocklist.
15
+ * Matches exact hostname (sans www.) and all subdomains.
16
+ *
17
+ * @example
18
+ * shouldUseProxy('https://www.reddit.com/r/news') // true
19
+ * shouldUseProxy('https://example.com') // false
20
+ */
21
+ export declare function shouldUseProxy(url: string): boolean;
22
+ /**
23
+ * Generate browser-like request headers tailored to the User-Agent type.
24
+ *
25
+ * - Chrome/Edge: full Sec-CH-UA + Sec-Fetch-* header set
26
+ * - Firefox: adjusted Accept, TE header, partial Sec-Fetch-* (no Sec-CH-UA)
27
+ * - Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
28
+ * - Other: basic headers only
29
+ *
30
+ * Automatically adds a Google referer for domains where it helps bypass blocks.
31
+ *
32
+ * @param url - Target URL (used for domain-specific header additions)
33
+ * @param userAgent - User-Agent string (determines which header set is applied)
34
+ */
35
+ export declare function getStealthHeaders(url: string, userAgent: string): Record<string, string>;
7
36
  /**
8
37
  * SECURITY: Validate URL to prevent SSRF attacks
9
38
  * Blocks localhost, private IPs, link-local, and various bypass techniques
@@ -8,7 +8,8 @@
8
8
  // Must run before any network library is used.
9
9
  import dns from 'dns';
10
10
  dns.setDefaultResultOrder('ipv4first');
11
- import { getRealisticUserAgent, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
11
+ import { getHttpUA, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
12
+ import { getWebshareProxyUrl } from './proxy-config.js';
12
13
  import { fetch as undiciFetch, Agent, ProxyAgent } from 'undici';
13
14
  import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
14
15
  import { getCached } from './cache.js';
@@ -145,6 +146,149 @@ export function createAbortError() {
145
146
  error.name = 'AbortError';
146
147
  return error;
147
148
  }
149
+ // ── Stealth headers & proxy routing ──────────────────────────────────────────
150
+ /**
151
+ * Domains known to aggressively block datacenter IPs.
152
+ * Requests to these domains automatically route through the Webshare residential
153
+ * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
154
+ */
155
+ export const PROXY_PREFERRED_DOMAINS = [
156
+ 'reddit.com',
157
+ 'old.reddit.com',
158
+ 'forbes.com',
159
+ 'fortune.com',
160
+ 'cargurus.com',
161
+ 'edmunds.com',
162
+ 'cars.com',
163
+ 'truecar.com',
164
+ 'autotrader.com',
165
+ 'carfax.com',
166
+ 'tesla.com',
167
+ 'nerdwallet.com',
168
+ 'bankrate.com',
169
+ 'homeadvisor.com',
170
+ 'angi.com',
171
+ 'insideevs.com',
172
+ 'electrek.co',
173
+ 'motortrend.com',
174
+ 'jdpower.com',
175
+ ];
176
+ /**
177
+ * Returns true if the URL's domain is on the proxy-preferred blocklist.
178
+ * Matches exact hostname (sans www.) and all subdomains.
179
+ *
180
+ * @example
181
+ * shouldUseProxy('https://www.reddit.com/r/news') // true
182
+ * shouldUseProxy('https://example.com') // false
183
+ */
184
+ export function shouldUseProxy(url) {
185
+ try {
186
+ const host = new URL(url).hostname.replace(/^www\./, '');
187
+ return PROXY_PREFERRED_DOMAINS.some(d => host === d || host.endsWith('.' + d));
188
+ }
189
+ catch {
190
+ return false;
191
+ }
192
+ }
193
+ /**
194
+ * Generate browser-like request headers tailored to the User-Agent type.
195
+ *
196
+ * - Chrome/Edge: full Sec-CH-UA + Sec-Fetch-* header set
197
+ * - Firefox: adjusted Accept, TE header, partial Sec-Fetch-* (no Sec-CH-UA)
198
+ * - Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
199
+ * - Other: basic headers only
200
+ *
201
+ * Automatically adds a Google referer for domains where it helps bypass blocks.
202
+ *
203
+ * @param url - Target URL (used for domain-specific header additions)
204
+ * @param userAgent - User-Agent string (determines which header set is applied)
205
+ */
206
+ export function getStealthHeaders(url, userAgent) {
207
+ const isFirefox = userAgent.includes('Firefox');
208
+ const isSafari = userAgent.includes('Safari') && !userAgent.includes('Chrome');
209
+ const isChrome = !isFirefox && !isSafari && (userAgent.includes('Chrome') || userAgent.includes('Chromium'));
210
+ const isMobile = userAgent.includes('Mobile') || userAgent.includes('Android');
211
+ // Base headers all browsers send
212
+ const headers = {
213
+ 'User-Agent': userAgent,
214
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
215
+ 'Accept-Language': 'en-US,en;q=0.9',
216
+ 'Accept-Encoding': 'gzip, deflate, br',
217
+ 'Cache-Control': 'max-age=0',
218
+ 'DNT': '1',
219
+ 'Upgrade-Insecure-Requests': '1',
220
+ };
221
+ if (isFirefox) {
222
+ // Firefox: different Accept, TE, and partial Sec-Fetch (no Sec-CH-UA)
223
+ headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8';
224
+ headers['Accept-Language'] = 'en-US,en;q=0.5';
225
+ headers['TE'] = 'trailers';
226
+ headers['Sec-Fetch-Dest'] = 'document';
227
+ headers['Sec-Fetch-Mode'] = 'navigate';
228
+ headers['Sec-Fetch-Site'] = 'none';
229
+ // Firefox omits Sec-Fetch-User in many navigations
230
+ }
231
+ else if (isSafari) {
232
+ // Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
233
+ headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
234
+ // Safari does not send Sec-Fetch headers at all
235
+ }
236
+ else if (isChrome) {
237
+ // Chrome/Edge: full set of Sec-Fetch-* and Sec-CH-UA headers
238
+ headers['Sec-Fetch-Dest'] = 'document';
239
+ headers['Sec-Fetch-Mode'] = 'navigate';
240
+ headers['Sec-Fetch-Site'] = 'none';
241
+ headers['Sec-Fetch-User'] = '?1';
242
+ headers['Sec-CH-UA'] = getSecCHUA(userAgent);
243
+ headers['Sec-CH-UA-Mobile'] = isMobile ? '?1' : '?0';
244
+ headers['Sec-CH-UA-Platform'] = getSecCHUAPlatform(userAgent);
245
+ headers['Connection'] = 'keep-alive';
246
+ headers['Priority'] = 'u=0, i';
247
+ }
248
+ // else: custom/API UAs (e.g. "WebPeel/1.0") — basic headers only, no browser fingerprints
249
+ // Add Google Referer for domains where it's known to help bypass blocks
250
+ try {
251
+ const domain = new URL(url).hostname;
252
+ const referrerDomains = [
253
+ 'reddit.com', 'forbes.com', 'cargurus.com', 'edmunds.com',
254
+ 'cars.com', 'truecar.com', 'nerdwallet.com', 'homeadvisor.com',
255
+ 'angi.com', 'motortrend.com', 'jdpower.com', 'electrek.co', 'insideevs.com',
256
+ ];
257
+ if (referrerDomains.some(d => domain.includes(d))) {
258
+ headers['Referer'] = 'https://www.google.com/';
259
+ }
260
+ }
261
+ catch {
262
+ // Non-fatal: URL parsing failed, skip Referer
263
+ }
264
+ return headers;
265
+ }
266
+ /** Pick a different UA than the one currently in use (for 403/503 retries). */
267
+ function getDifferentUA(current) {
268
+ for (let i = 0; i < 10; i++) {
269
+ const ua = getHttpUA();
270
+ if (ua !== current)
271
+ return ua;
272
+ }
273
+ return getHttpUA();
274
+ }
275
+ /**
276
+ * Build the merged request headers: stealth defaults + caller custom headers.
277
+ * Throws WebPeelError if customHeaders attempts to override the Host header.
278
+ */
279
+ function buildMergedHeaders(url, userAgent, customHeaders) {
280
+ const merged = { ...getStealthHeaders(url, userAgent) };
281
+ if (customHeaders) {
282
+ for (const [key, value] of Object.entries(customHeaders)) {
283
+ // SECURITY: Block Host header override
284
+ if (key.toLowerCase() === 'host') {
285
+ throw new WebPeelError('Custom Host header is not allowed');
286
+ }
287
+ merged[key] = value;
288
+ }
289
+ }
290
+ return merged;
291
+ }
148
292
  // ── SSRF / URL validation ─────────────────────────────────────────────────────
149
293
  /**
150
294
  * SECURITY: Validate URL to prevent SSRF attacks
@@ -368,42 +512,19 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
368
512
  // SEC.gov requires a User-Agent with contact info (their documented automated access policy)
369
513
  const hostname = new URL(url).hostname.toLowerCase();
370
514
  const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
371
- const validatedUserAgent = isSecGov
515
+ let activeUserAgent = isSecGov
372
516
  ? 'WebPeel/1.0 (support@webpeel.dev)'
373
- : (userAgent ? validateUserAgent(userAgent) : getRealisticUserAgent());
374
- // SECURITY: Merge custom headers with defaults, block Host header override
375
- const defaultHeaders = {
376
- 'User-Agent': validatedUserAgent,
377
- 'Accept': 'text/markdown, text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
378
- 'Accept-Language': 'en-US,en;q=0.9',
379
- 'Accept-Encoding': 'br, gzip, deflate',
380
- 'DNT': '1',
381
- 'Connection': 'keep-alive',
382
- 'Upgrade-Insecure-Requests': '1',
383
- 'Sec-CH-UA': getSecCHUA(validatedUserAgent),
384
- 'Sec-CH-UA-Mobile': '?0',
385
- 'Sec-CH-UA-Platform': getSecCHUAPlatform(validatedUserAgent),
386
- 'Sec-Fetch-Dest': 'document',
387
- 'Sec-Fetch-Mode': 'navigate',
388
- 'Sec-Fetch-Site': 'none',
389
- 'Sec-Fetch-User': '?1',
390
- 'Cache-Control': 'max-age=0',
391
- 'Priority': 'u=0, i',
392
- };
393
- const mergedHeaders = { ...defaultHeaders };
394
- if (customHeaders) {
395
- for (const [key, value] of Object.entries(customHeaders)) {
396
- // SECURITY: Block Host header override
397
- if (key.toLowerCase() === 'host') {
398
- throw new WebPeelError('Custom Host header is not allowed');
399
- }
400
- mergedHeaders[key] = value;
401
- }
402
- }
517
+ : (userAgent ? validateUserAgent(userAgent) : getHttpUA());
518
+ // Build stealth headers merged with any caller-supplied custom headers
519
+ let mergedHeaders = buildMergedHeaders(url, activeUserAgent, customHeaders);
520
+ // Auto-route through residential proxy for sites known to block datacenter IPs.
521
+ // The explicit `proxy` param always wins; auto-proxy only kicks in when unset.
522
+ const effectiveProxy = proxy ?? (shouldUseProxy(url) ? (getWebshareProxyUrl() ?? undefined) : undefined);
403
523
  const MAX_REDIRECTS = 10;
404
524
  let redirectCount = 0;
405
525
  let currentUrl = url;
406
526
  const seenUrls = new Set();
527
+ let retried = false; // track whether we've already retried with a different UA
407
528
  try {
408
529
  const hostname = new URL(url).hostname;
409
530
  void resolveAndCache(hostname).catch(() => {
@@ -436,8 +557,8 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
436
557
  if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
437
558
  requestHeaders['If-Modified-Since'] = validators.lastModified;
438
559
  }
439
- // Use proxy if provided, otherwise use shared connection pool
440
- const dispatcher = proxy ? new ProxyAgent(proxy) : httpPool;
560
+ // Use proxy if provided or auto-selected, otherwise use shared connection pool
561
+ const dispatcher = effectiveProxy ? new ProxyAgent(effectiveProxy) : httpPool;
441
562
  const response = await undiciFetch(currentUrl, {
442
563
  headers: requestHeaders,
443
564
  signal,
@@ -475,6 +596,16 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
475
596
  }
476
597
  if (!response.ok) {
477
598
  if (response.status === 403 || response.status === 503) {
599
+ // Retry once with a different UA — cheap and catches UA-based blocks
600
+ if (!retried && !userAgent) {
601
+ retried = true;
602
+ activeUserAgent = getDifferentUA(activeUserAgent);
603
+ mergedHeaders = buildMergedHeaders(currentUrl, activeUserAgent, customHeaders);
604
+ // Allow the retry to re-visit the same URL (not a redirect loop)
605
+ seenUrls.delete(currentUrl);
606
+ log.debug(`HTTP ${response.status} on first attempt; retrying with different UA`);
607
+ continue;
608
+ }
478
609
  throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
479
610
  }
480
611
  const statusText = response.statusText || HTTP_STATUS_TEXT[response.status] || 'Unknown Error';
@@ -44,18 +44,21 @@ declare class ProviderStatsTracker {
44
44
  private readonly windowSize;
45
45
  private readonly failThreshold;
46
46
  private readonly minSamples;
47
- constructor(windowSize?: number, failThreshold?: number, minSamples?: number);
47
+ private readonly decayMs;
48
+ constructor(windowSize?: number, failThreshold?: number, minSamples?: number, decayMs?: number);
48
49
  /** Record the outcome of a single attempt for the given source. */
49
50
  record(sourceId: string, success: boolean): void;
50
51
  /**
51
52
  * Returns the failure rate (0–1) for the given source based on
52
53
  * the sliding window of recorded attempts. Returns 0 if fewer
53
- * than minSamples have been recorded.
54
+ * than minSamples have been recorded, or if all samples are older
55
+ * than decayMs (failures expire so cold-start blips don't permanently
56
+ * lock out a provider).
54
57
  */
55
58
  getFailureRate(sourceId: string): number;
56
59
  /**
57
60
  * Returns true when the source should be skipped (failure rate >=
58
- * failThreshold with at least minSamples recorded).
61
+ * failThreshold with at least minSamples recent recorded).
59
62
  */
60
63
  shouldSkip(sourceId: string): boolean;
61
64
  /** Debug snapshot for a source. */
@@ -73,6 +76,11 @@ declare class ProviderStatsTracker {
73
76
  * (e.g. in tests) and to log diagnostics.
74
77
  */
75
78
  export declare const providerStats: ProviderStatsTracker;
79
+ /**
80
+ * Merge results from multiple sources, deduplicating by normalized URL.
81
+ * Preserves original order (first occurrence wins) and limits to maxCount.
82
+ */
83
+ export declare function mergeSearchResults(results: WebSearchResult[], maxCount: number): WebSearchResult[];
76
84
  /**
77
85
  * Filter and rank results by relevance to the original query.
78
86
  *
@@ -128,6 +136,19 @@ export declare class DuckDuckGoProvider implements SearchProvider {
128
136
  * works when the main HTML endpoint is temporarily blocked on datacenter IPs.
129
137
  */
130
138
  private searchLite;
139
+ /**
140
+ * HTTP-only Bing scraping via undici + cheerio. No browser required.
141
+ * Routes through Webshare proxy (proxy first, direct fallback).
142
+ * Tracks stats via providerStats('bing-http').
143
+ */
144
+ private _searchBingHttp;
145
+ /**
146
+ * HTTP-only Google scraping via undici + cheerio. No browser required.
147
+ * Routes through Webshare proxy (proxy first, direct fallback).
148
+ * Sends CONSENT cookie to bypass Google consent page.
149
+ * Tracks stats via providerStats('google-http').
150
+ */
151
+ private _searchGoogleHttp;
131
152
  searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
132
153
  /**
133
154
  * Exposed for testing: score and filter a pre-fetched result list against a query.
@@ -114,15 +114,17 @@ class ProviderStatsTracker {
114
114
  windowSize;
115
115
  failThreshold;
116
116
  minSamples;
117
- constructor(windowSize = 10, failThreshold = 0.8, minSamples = 3) {
117
+ decayMs; // failures older than this are ignored
118
+ constructor(windowSize = 10, failThreshold = 0.8, minSamples = 5, decayMs = 5 * 60 * 1000) {
118
119
  this.windowSize = windowSize;
119
120
  this.failThreshold = failThreshold;
120
121
  this.minSamples = minSamples;
122
+ this.decayMs = decayMs; // default 5 minutes: old failures don't permanently lock a provider
121
123
  }
122
124
  /** Record the outcome of a single attempt for the given source. */
123
125
  record(sourceId, success) {
124
126
  const arr = this.history.get(sourceId) ?? [];
125
- arr.push({ success });
127
+ arr.push({ success, ts: Date.now() });
126
128
  if (arr.length > this.windowSize)
127
129
  arr.splice(0, arr.length - this.windowSize);
128
130
  this.history.set(sourceId, arr);
@@ -130,18 +132,24 @@ class ProviderStatsTracker {
130
132
  /**
131
133
  * Returns the failure rate (0–1) for the given source based on
132
134
  * the sliding window of recorded attempts. Returns 0 if fewer
133
- * than minSamples have been recorded.
135
+ * than minSamples have been recorded, or if all samples are older
136
+ * than decayMs (failures expire so cold-start blips don't permanently
137
+ * lock out a provider).
134
138
  */
135
139
  getFailureRate(sourceId) {
136
140
  const arr = this.history.get(sourceId);
137
141
  if (!arr || arr.length < this.minSamples)
138
142
  return 0;
139
- const failures = arr.filter(a => !a.success).length;
140
- return failures / arr.length;
143
+ const cutoff = Date.now() - this.decayMs;
144
+ const recent = arr.filter(a => a.ts >= cutoff);
145
+ if (recent.length < this.minSamples)
146
+ return 0; // not enough recent samples
147
+ const failures = recent.filter(a => !a.success).length;
148
+ return failures / recent.length;
141
149
  }
142
150
  /**
143
151
  * Returns true when the source should be skipped (failure rate >=
144
- * failThreshold with at least minSamples recorded).
152
+ * failThreshold with at least minSamples recent recorded).
145
153
  */
146
154
  shouldSkip(sourceId) {
147
155
  return this.getFailureRate(sourceId) >= this.failThreshold;
@@ -195,6 +203,24 @@ function normalizeUrlForDedupe(rawUrl) {
195
203
  .replace(/\/+$/g, '');
196
204
  }
197
205
  }
206
+ /**
207
+ * Merge results from multiple sources, deduplicating by normalized URL.
208
+ * Preserves original order (first occurrence wins) and limits to maxCount.
209
+ */
210
+ export function mergeSearchResults(results, maxCount) {
211
+ const seen = new Set();
212
+ const merged = [];
213
+ for (const r of results) {
214
+ if (merged.length >= maxCount)
215
+ break;
216
+ const key = normalizeUrlForDedupe(r.url);
217
+ if (seen.has(key))
218
+ continue;
219
+ seen.add(key);
220
+ merged.push(r);
221
+ }
222
+ return merged;
223
+ }
198
224
  // ============================================================
199
225
  // Result Relevance Filtering
200
226
  // Lightweight keyword-overlap scoring — no external deps.
@@ -206,6 +232,9 @@ const STOP_WORDS = new Set([
206
232
  'of', 'with', 'how', 'what', 'where', 'when', 'why', 'best', 'top', 'most',
207
233
  'and', 'or', 'but', 'not', 'do', 'does', 'did', 'be', 'been', 'have', 'has',
208
234
  'buy', 'get', 'find', 'about', 'from', 'by', 'its', 'it', 'this', 'that',
235
+ 'much', 'very', 'can', 'will', 'would', 'could', 'should', 'per', 'than',
236
+ 'some', 'just', 'also', 'more', 'like', 'make', 'any', 'each', 'all', 'my',
237
+ 'your', 'our', 'their', 'me', 'us', 'them', 'so', 'if', 'then', 'here',
209
238
  ]);
210
239
  /**
211
240
  * Extract meaningful keywords from a search query by stripping stop words and
@@ -271,8 +300,10 @@ export function filterRelevantResults(results, query) {
271
300
  score: scoreResult(r, keywords),
272
301
  idx,
273
302
  }));
274
- // Drop results with zero overlap
275
- const relevant = scored.filter(s => s.score > 0);
303
+ // Drop results with insufficient overlap — require ≥15% keyword match
304
+ // to filter out dictionary/definition pages that match on a single common word
305
+ const minScore = keywords.length >= 3 ? 0.15 : 0.01;
306
+ const relevant = scored.filter(s => s.score >= minScore);
276
307
  // Sort by score descending, original order as tiebreaker
277
308
  relevant.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.idx - b.idx));
278
309
  return relevant.map(s => ({
@@ -571,9 +602,21 @@ export class DuckDuckGoProvider {
571
602
  const attempts = [];
572
603
  // Required retry strategy order:
573
604
  // 1) original query
574
- // 2) quoted query
575
- // 3) query site:*
605
+ // 2) keywords-only (strip question words, articles, prepositions)
606
+ // 3) quoted query
607
+ // 4) query site:*
576
608
  attempts.push(q);
609
+ // For long queries (>5 words), extract just the meaningful keywords
610
+ // "how much does a used 2023 Tesla Model 3 cost per month" → "2023 Tesla Model 3 cost month"
611
+ const words = q.split(/\s+/);
612
+ if (words.length > 5) {
613
+ const keywordsOnly = words
614
+ .filter(w => !STOP_WORDS.has(w.toLowerCase()) && w.length >= 2)
615
+ .join(' ');
616
+ if (keywordsOnly && keywordsOnly !== q) {
617
+ attempts.push(keywordsOnly);
618
+ }
619
+ }
577
620
  if (!/^".*"$/.test(q))
578
621
  attempts.push(`"${q}"`);
579
622
  attempts.push(`${q} site:*`);
@@ -776,6 +819,219 @@ export class DuckDuckGoProvider {
776
819
  });
777
820
  return results;
778
821
  }
822
+ /**
823
+ * HTTP-only Bing scraping via undici + cheerio. No browser required.
824
+ * Routes through Webshare proxy (proxy first, direct fallback).
825
+ * Tracks stats via providerStats('bing-http').
826
+ */
827
+ // @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
828
+ async _searchBingHttp(query, options) {
829
+ const { count, signal } = options;
830
+ const bingRate = providerStats.getFailureRate('bing-http');
831
+ const timeoutMs = bingRate > 0.5 ? 3_000 : 8_000;
832
+ const bingSignal = createTimeoutSignal(timeoutMs, signal);
833
+ const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=10`;
834
+ const headers = {
835
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
836
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
837
+ 'Accept-Language': 'en-US,en;q=0.9',
838
+ 'Sec-Fetch-Dest': 'document',
839
+ 'Sec-Fetch-Mode': 'navigate',
840
+ 'Sec-Fetch-Site': 'none',
841
+ 'Sec-Fetch-User': '?1',
842
+ 'Upgrade-Insecure-Requests': '1',
843
+ };
844
+ const proxyUrl = getWebshareProxyUrl();
845
+ let response;
846
+ try {
847
+ if (proxyUrl) {
848
+ try {
849
+ const dispatcher = new ProxyAgent(proxyUrl);
850
+ response = await undiciFetch(url, { headers, signal: bingSignal, dispatcher });
851
+ }
852
+ catch (proxyErr) {
853
+ log.debug('Bing HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
854
+ response = await undiciFetch(url, { headers, signal: bingSignal });
855
+ }
856
+ }
857
+ else {
858
+ response = await undiciFetch(url, { headers, signal: bingSignal });
859
+ }
860
+ if (!response.ok) {
861
+ providerStats.record('bing-http', false);
862
+ return [];
863
+ }
864
+ const html = await response.text();
865
+ const $ = load(html);
866
+ const results = [];
867
+ const seen = new Set();
868
+ // Parse Bing organic results; skip ad containers
869
+ $('li.b_algo').each((_i, elem) => {
870
+ if (results.length >= count)
871
+ return;
872
+ const $r = $(elem);
873
+ // Skip if inside a .b_ad block or is itself an ad container
874
+ if ($r.hasClass('b_ad') || $r.closest('.b_ad').length > 0)
875
+ return;
876
+ const $a = $r.find('h2 > a').first();
877
+ const title = cleanText($a.text(), { maxLen: 200 });
878
+ const rawUrl = $a.attr('href') || '';
879
+ if (!title || !rawUrl)
880
+ return;
881
+ // Decode Bing redirect URLs:
882
+ // Relative: /ck/a?!&&p=...&u=a1<base64url>&ntb=1
883
+ // Absolute: https://www.bing.com/ck/a?...&u=a1<base64url>&ntb=1
884
+ let finalUrl = rawUrl;
885
+ try {
886
+ const base = rawUrl.startsWith('/') ? `https://www.bing.com${rawUrl}` : rawUrl;
887
+ const ckUrl = new URL(base);
888
+ if (ckUrl.hostname.endsWith('bing.com') && ckUrl.pathname.startsWith('/ck/')) {
889
+ const u = ckUrl.searchParams.get('u');
890
+ if (u && u.startsWith('a1')) {
891
+ const decoded = Buffer.from(u.slice(2), 'base64url').toString('utf-8');
892
+ if (decoded.startsWith('http'))
893
+ finalUrl = decoded;
894
+ }
895
+ }
896
+ }
897
+ catch { /* use rawUrl as-is */ }
898
+ // Validate: HTTP/HTTPS only
899
+ try {
900
+ const parsed = new URL(finalUrl);
901
+ if (!['http:', 'https:'].includes(parsed.protocol))
902
+ return;
903
+ finalUrl = parsed.href;
904
+ }
905
+ catch {
906
+ return;
907
+ }
908
+ const key = normalizeUrlForDedupe(finalUrl);
909
+ if (seen.has(key))
910
+ return;
911
+ seen.add(key);
912
+ const snippetRaw = $r.find('.b_caption p').first().text() ||
913
+ $r.find('.b_caption').first().text();
914
+ const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
915
+ results.push({ title, url: finalUrl, snippet });
916
+ });
917
+ providerStats.record('bing-http', results.length > 0);
918
+ return results;
919
+ }
920
+ catch (e) {
921
+ log.debug('Bing HTTP search failed:', e instanceof Error ? e.message : e);
922
+ providerStats.record('bing-http', false);
923
+ return [];
924
+ }
925
+ }
926
+ /**
927
+ * HTTP-only Google scraping via undici + cheerio. No browser required.
928
+ * Routes through Webshare proxy (proxy first, direct fallback).
929
+ * Sends CONSENT cookie to bypass Google consent page.
930
+ * Tracks stats via providerStats('google-http').
931
+ */
932
+ // @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
933
+ async _searchGoogleHttp(query, options) {
934
+ const { count, signal } = options;
935
+ const googleRate = providerStats.getFailureRate('google-http');
936
+ const timeoutMs = googleRate > 0.5 ? 3_000 : 8_000;
937
+ const googleSignal = createTimeoutSignal(timeoutMs, signal);
938
+ const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=10&hl=en`;
939
+ const headers = {
940
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
941
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
942
+ 'Accept-Language': 'en-US,en;q=0.9',
943
+ // Skip Google consent/cookie wall
944
+ 'Cookie': 'CONSENT=YES+; SOCS=CAESEwgDEgk0OTg3ODQ2NzMaAmVuIAEaBgiA0LqmBg',
945
+ 'Sec-Fetch-Dest': 'document',
946
+ 'Sec-Fetch-Mode': 'navigate',
947
+ 'Sec-Fetch-Site': 'none',
948
+ 'Sec-Fetch-User': '?1',
949
+ 'Upgrade-Insecure-Requests': '1',
950
+ };
951
+ const proxyUrl = getWebshareProxyUrl();
952
+ let response;
953
+ try {
954
+ if (proxyUrl) {
955
+ try {
956
+ const dispatcher = new ProxyAgent(proxyUrl);
957
+ response = await undiciFetch(url, { headers, signal: googleSignal, dispatcher });
958
+ }
959
+ catch (proxyErr) {
960
+ log.debug('Google HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
961
+ response = await undiciFetch(url, { headers, signal: googleSignal });
962
+ }
963
+ }
964
+ else {
965
+ response = await undiciFetch(url, { headers, signal: googleSignal });
966
+ }
967
+ if (!response.ok) {
968
+ providerStats.record('google-http', false);
969
+ return [];
970
+ }
971
+ const html = await response.text();
972
+ const $ = load(html);
973
+ const results = [];
974
+ const seen = new Set();
975
+ // Google organic results live in div.g blocks.
976
+ // Skip ad blocks (data-text-ad attr), People Also Ask, and related searches.
977
+ $('div.g').each((_i, elem) => {
978
+ if (results.length >= count)
979
+ return;
980
+ const $r = $(elem);
981
+ // Skip ad containers (data-text-ad may be on div.g itself or on a descendant)
982
+ if ($r.attr('data-text-ad') !== undefined || $r.find('[data-text-ad]').length > 0)
983
+ return;
984
+ if ($r.closest('.commercial-unit-desktop-top, .ads-ad').length > 0)
985
+ return;
986
+ const $h3 = $r.find('h3').first();
987
+ if (!$h3.length)
988
+ return;
989
+ // Find a valid external link (starts with http, not a Google domain)
990
+ const $a = $r.find('a[href]').filter((_j, el) => {
991
+ const href = $(el).attr('href') || '';
992
+ return href.startsWith('http') && !href.includes('google.com/');
993
+ }).first();
994
+ if (!$a.length)
995
+ return;
996
+ const href = $a.attr('href') || '';
997
+ // Validate URL
998
+ let finalUrl;
999
+ try {
1000
+ const parsed = new URL(href);
1001
+ if (!['http:', 'https:'].includes(parsed.protocol))
1002
+ return;
1003
+ if (parsed.hostname.includes('google.com'))
1004
+ return;
1005
+ finalUrl = parsed.href;
1006
+ }
1007
+ catch {
1008
+ return;
1009
+ }
1010
+ const key = normalizeUrlForDedupe(finalUrl);
1011
+ if (seen.has(key))
1012
+ return;
1013
+ seen.add(key);
1014
+ const title = cleanText($h3.text(), { maxLen: 200 });
1015
+ if (!title)
1016
+ return;
1017
+ // Snippet: try multiple known Google snippet CSS classes/attrs
1018
+ const snippetRaw = $r.find('.VwiC3b').first().text() ||
1019
+ $r.find('[data-sncf]').first().text() ||
1020
+ $r.find('[style*="-webkit-line-clamp"]').first().text() ||
1021
+ $r.find('.st').first().text() ||
1022
+ '';
1023
+ const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
1024
+ results.push({ title, url: finalUrl, snippet });
1025
+ });
1026
+ providerStats.record('google-http', results.length > 0);
1027
+ return results;
1028
+ }
1029
+ catch (e) {
1030
+ log.debug('Google HTTP search failed:', e instanceof Error ? e.message : e);
1031
+ providerStats.record('google-http', false);
1032
+ return [];
1033
+ }
1034
+ }
779
1035
  async searchWeb(query, options) {
780
1036
  const attempts = this.buildQueryAttempts(query);
781
1037
  // -----------------------------------------------------------
@@ -867,6 +1123,17 @@ export class DuckDuckGoProvider {
867
1123
  }
868
1124
  }
869
1125
  // -----------------------------------------------------------
1126
+ // Stage 3.5: HTTP-based Bing + Google (no browser, no API key)
1127
+ // DISABLED: Both Bing and Google detect non-browser HTTP clients and
1128
+ // serve different/irrelevant content (dictionary pages, random sites).
1129
+ // The scrapers are built (searchBingHttp, searchGoogleHttp) but need
1130
+ // further work on request fingerprinting to get real results.
1131
+ // TODO: Re-enable when fingerprinting is improved.
1132
+ // -----------------------------------------------------------
1133
+ // const skipBingHttp = providerStats.shouldSkip('bing-http');
1134
+ // const skipGoogleHttp = providerStats.shouldSkip('google-http');
1135
+ // if (!skipBingHttp || !skipGoogleHttp) { ... }
1136
+ // -----------------------------------------------------------
870
1137
  // Stage 4: Stealth multi-engine (DDG + Bing + Ecosia in parallel)
871
1138
  // Bypasses bot-detection on datacenter IPs. This is the reliable
872
1139
  // last resort — but it spins up a browser so it takes a few seconds.
@@ -9,6 +9,11 @@
9
9
  * Also provides `getSecCHUA()` for generating correct Sec-CH-UA header values
10
10
  * that match the selected user agent (version-accurate brand hints).
11
11
  */
12
+ /**
13
+ * Full UA pool for HTTP-only requests (Chrome + Firefox + Safari + Edge + Mobile).
14
+ * NOT for browser contexts — use getRealisticUserAgent() there (Chrome-only).
15
+ */
16
+ export declare const HTTP_UAS: readonly string[];
12
17
  /**
13
18
  * Returns a realistic, recent Chrome user agent string.
14
19
  * Randomly picks from a curated list of real-world UAs (Chrome 132-136 range).
@@ -32,7 +37,27 @@ export declare function getRealisticUserAgent(platform?: 'windows' | 'mac' | 'li
32
37
  */
33
38
  export declare function getRandomUA(): string;
34
39
  /**
35
- * The full curated list of realistic user agents.
40
+ * Returns a realistic user agent for HTTP-only (non-browser) requests.
41
+ * Unlike `getRealisticUserAgent()` which is Chrome-only for browser contexts,
42
+ * this function returns from a wider pool: Chrome, Firefox, Safari, Edge, and Mobile.
43
+ *
44
+ * Weight distribution (approximate):
45
+ * - Chrome Windows: ~30%
46
+ * - Chrome macOS: ~25%
47
+ * - Chrome Linux: ~10%
48
+ * - Firefox: ~15%
49
+ * - Safari: ~10%
50
+ * - Edge: ~5%
51
+ * - Mobile Chrome: ~5%
52
+ *
53
+ * @example
54
+ * ```ts
55
+ * const ua = getHttpUA(); // e.g. "Mozilla/5.0 ... Firefox/133.0"
56
+ * ```
57
+ */
58
+ export declare function getHttpUA(): string;
59
+ /**
60
+ * The full curated list of realistic user agents (Chrome-only, all platforms).
36
61
  * Exported for inspection / testing.
37
62
  */
38
63
  export declare const REALISTIC_USER_AGENTS: readonly string[];
@@ -42,8 +42,44 @@ const LINUX_UAS = [
42
42
  // Chrome 136 Linux
43
43
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
44
44
  ];
45
- /** All UAs combined (fallback when no platform is specified) */
45
+ /** All Chrome UAs combined (fallback when no platform is specified) */
46
46
  const ALL_UAS = [...WINDOWS_UAS, ...MAC_UAS, ...LINUX_UAS];
47
// ── Extended pools for non-Chrome browsers (HTTP-only use) ───────────────────
// NOTE: these pools are deliberately kept out of ALL_UAS / REALISTIC_USER_AGENTS
// so that browser contexts (which must match a real Chrome fingerprint) never
// receive a Firefox/Safari/Edge/Mobile UA. They are only reachable via
// HTTP_UAS and getHttpUA().
/** Firefox UAs — Windows, Mac, Linux (Firefox 133/134 stable range) */
const FIREFOX_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0',
];
/** Safari UAs — macOS only (Safari 17.5–18.2) */
const SAFARI_UAS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 15_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
];
/** Microsoft Edge UAs — Windows (Chromium-based, Edg/ token) */
const EDGE_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0',
];
/** Mobile Chrome UAs — Android (Samsung, Pixel) and iOS (CriOS) */
const MOBILE_CHROME_UAS = [
    'Mozilla/5.0 (Linux; Android 14; SM-S928B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.6778.103 Mobile/15E148 Safari/604.1',
];
/**
 * Full UA pool for HTTP-only requests (Chrome + Firefox + Safari + Edge + Mobile).
 * NOT for browser contexts — use getRealisticUserAgent() there (Chrome-only).
 */
export const HTTP_UAS = [
    ...ALL_UAS,
    ...FIREFOX_UAS,
    ...SAFARI_UAS,
    ...EDGE_UAS,
    ...MOBILE_CHROME_UAS,
];
47
83
  // ── Public API ────────────────────────────────────────────────────────────────
48
84
  /**
49
85
  * Returns a realistic, recent Chrome user agent string.
@@ -97,7 +133,52 @@ export function getRandomUA() {
97
133
  return ALL_UAS[idx];
98
134
  }
99
135
/**
 * Returns a realistic user agent for HTTP-only (non-browser) requests.
 * Unlike `getRealisticUserAgent()` which is Chrome-only for browser contexts,
 * this function draws from the wider pool: Chrome, Firefox, Safari, Edge, Mobile.
 *
 * Weight distribution (approximate):
 * - Chrome Windows: ~30%
 * - Chrome macOS: ~25%
 * - Chrome Linux: ~10%
 * - Firefox: ~15%
 * - Safari: ~10%
 * - Edge: ~5%
 * - Mobile Chrome: ~5%
 *
 * @example
 * ```ts
 * const ua = getHttpUA(); // e.g. "Mozilla/5.0 ... Firefox/133.0"
 * ```
 */
export function getHttpUA() {
    // Cumulative-probability table: the first entry whose upper bound exceeds
    // the roll wins. Bounds encode the weights documented above.
    const weightedPools = [
        [0.30, WINDOWS_UAS],
        [0.55, MAC_UAS],
        [0.65, LINUX_UAS],
        [0.80, FIREFOX_UAS],
        [0.90, SAFARI_UAS],
        [0.95, EDGE_UAS],
        [Infinity, MOBILE_CHROME_UAS], // catch-all for the remaining ~5%
    ];
    const roll = Math.random();
    const [, chosenPool] = weightedPools.find(([upperBound]) => roll < upperBound);
    // Second roll picks uniformly within the chosen pool.
    return chosenPool[Math.floor(Math.random() * chosenPool.length)];
}
180
+ /**
181
+ * The full curated list of realistic user agents (Chrome-only, all platforms).
101
182
  * Exported for inspection / testing.
102
183
  */
103
184
  export const REALISTIC_USER_AGENTS = ALL_UAS;
@@ -28,6 +28,7 @@ import { createJobsRouter } from './routes/jobs.js';
28
28
  import { createBatchRouter } from './routes/batch.js';
29
29
  import { createAnswerRouter } from './routes/answer.js';
30
30
  import { createDeepResearchRouter } from './routes/deep-research.js';
31
+ import { createResearchRouter } from './routes/research.js';
31
32
  import { createAskRouter } from './routes/ask.js';
32
33
  import { createMcpRouter } from './routes/mcp.js';
33
34
  import { createDoRouter } from './routes/do.js';
@@ -291,6 +292,9 @@ export function createApp(config = {}) {
291
292
  app.use('/v1/screenshot', requireScope('full', 'read'));
292
293
  app.use(createScreenshotRouter(authStore));
293
294
  app.use(createSearchRouter(authStore));
295
+ // /v1/research — lightweight research (search → fetch → compile), BYOK LLM optional
296
+ app.use('/v1/research', requireScope('full', 'read'));
297
+ app.use(createResearchRouter());
294
298
  app.use(createBillingPortalRouter(pool));
295
299
  app.use(createUserRouter());
296
300
  app.use(createOAuthRouter());
@@ -0,0 +1,13 @@
1
/**
 * POST /v1/research
 *
 * Lightweight research endpoint that chains search → fetch → compile.
 * No LLM required for baseline results; optional BYOK LLM synthesis.
 *
 * Auth: API key required (full or read scope)
 * Body: ResearchRequest
 */
import { Router } from 'express';
/**
 * Expand a raw query into up to 3 search variations using simple heuristics
 * (append current year to time-sensitive queries; rephrase "how much" /
 * "how to" / "what is" prefixes). The original query is always first.
 */
export declare function expandQuery(query: string): string[];
/**
 * Extract up to `maxFacts` sentences from `content` most relevant to `query`,
 * scored by keyword overlap with a boost for sentences containing numeric data.
 * Returns [] when content/query is empty or there is no keyword overlap.
 */
export declare function extractKeyFacts(content: string, query: string, maxFacts?: number): string[];
/** Build the Express router that serves POST /v1/research. */
export declare function createResearchRouter(): Router;
@@ -0,0 +1,401 @@
1
+ /**
2
+ * POST /v1/research
3
+ *
4
+ * Lightweight research endpoint that chains search → fetch → compile.
5
+ * No LLM required for baseline results; optional BYOK LLM synthesis.
6
+ *
7
+ * Auth: API key required (full or read scope)
8
+ * Body: ResearchRequest
9
+ */
10
+ import { Router } from 'express';
11
+ import { peel } from '../../index.js';
12
+ import { getSearchProvider } from '../../core/search-provider.js';
13
+ import { callLLM, } from '../../core/llm-provider.js';
14
// ---------------------------------------------------------------------------
// Query expansion — simple heuristics, no LLM needed
// ---------------------------------------------------------------------------
const CURRENT_YEAR = new Date().getFullYear();
// Keywords that suggest the query is time-sensitive
const TIME_SENSITIVE_PATTERNS = /\b(price|cost|best|top|latest|current|now|today|new|salary|rate|speed|version|release|stock|review)\b/i;
// Prefixes that can be rephrased
const HOW_MUCH_RE = /^how much (?:does|do|is|are) (.+?)(?:\s+cost|\s+price|\s+charge)?[\s?]*$/i;
const HOW_TO_RE = /^how (?:to|do(?:es)?) (.+?)[\s?]*$/i;
const WHAT_IS_RE = /^(?:what (?:is|are)) (.+?)[\s?]*$/i;
/**
 * Expand a query into up to three search variations.
 *
 * The original (trimmed) query always comes first. A "<query> <year>" variant
 * is appended for time-sensitive queries lacking a year, then question-style
 * prefixes are rephrased into keyword form ("how much does X" → "X cost price",
 * "how to X" → "X guide", "what is X" → "X overview"). Duplicates are skipped.
 *
 * @param query - Raw user query.
 * @returns 1–3 query strings, original first.
 */
export function expandQuery(query) {
    const trimmed = query.trim();
    const variations = [trimmed];
    // Year variant only when the query is time-sensitive and not already dated.
    const alreadyDated = /\b(20\d{2}|19\d{2})\b/.test(trimmed);
    if (!alreadyDated && TIME_SENSITIVE_PATTERNS.test(trimmed)) {
        variations.push(`${trimmed} ${CURRENT_YEAR}`);
    }
    // Rephrase rules, applied in order: prefix pattern → suffix appended to the
    // captured subject.
    const rephraseRules = [
        [HOW_MUCH_RE, 'cost price'],
        [HOW_TO_RE, 'guide'],
        [WHAT_IS_RE, 'overview'],
    ];
    for (const [pattern, suffix] of rephraseRules) {
        const match = pattern.exec(trimmed);
        if (!match) {
            continue;
        }
        const rephrased = `${match[1].trim()} ${suffix}`;
        if (!variations.includes(rephrased)) {
            variations.push(rephrased);
        }
    }
    // Cap at 3 variations
    return variations.slice(0, 3);
}
62
// ---------------------------------------------------------------------------
// Key-fact extraction — score sentences by keyword overlap
// ---------------------------------------------------------------------------
/**
 * Lower-cases `text` and splits on non-word runs, keeping only tokens longer
 * than two characters (1–2 letter words carry almost no relevance signal).
 */
function tokenize(text) {
    return text
        .toLowerCase()
        .split(/\W+/)
        .filter(w => w.length > 2);
}
// Common English stop-words to skip when scoring
const STOP_WORDS = new Set([
    'the', 'and', 'for', 'are', 'was', 'were', 'but', 'not', 'you', 'all',
    'can', 'her', 'his', 'its', 'our', 'out', 'one', 'had', 'has', 'have',
    'this', 'that', 'with', 'they', 'from', 'your', 'what', 'when', 'how',
    'will', 'been', 'than', 'more', 'also', 'into', 'which', 'about',
]);
// Sentences containing prices/rates/percentages/durations likely carry real
// data. Hoisted to module scope so the regex object is not re-created for
// every scored sentence.
const DATA_VALUE_RE = /\$[\d,]+|[\d,]+\/mo|\d+%|\d+\s*year|\d+\s*month|\d+,\d{3}/;
/**
 * Extract up to `maxFacts` sentences from `content` that are most relevant to
 * `query`. Sentences are scored by unique query-keyword overlap (stop-words
 * excluded), with a 1.5x boost when the sentence contains numeric data.
 * Markdown headers, link-heavy navigation lines, image lines, and common
 * teaser/boilerplate openings are filtered out before scoring.
 *
 * @param content - Page text (markdown or plain text).
 * @param query - User query whose keywords drive the scoring.
 * @param maxFacts - Maximum number of sentences to return (default 5).
 * @returns Top-scoring sentences, deduplicated, best first; [] when content,
 *          query, or keyword overlap is empty.
 */
export function extractKeyFacts(content, query, maxFacts = 5) {
    // FIX: previously a non-positive maxFacts still returned one fact, because
    // the cap was only checked after the first push.
    if (!content || !query || maxFacts <= 0)
        return [];
    const queryKeywords = new Set(tokenize(query).filter(w => !STOP_WORDS.has(w)));
    if (queryKeywords.size === 0)
        return [];
    // Split into sentences on common terminators
    const sentences = content
        .replace(/\n{2,}/g, ' ')
        .split(/(?<=[.!?])\s+/)
        .map(s => s.trim())
        // Filter length
        .filter(s => s.length > 40 && s.length < 500)
        // Skip markdown headers (## Heading, # Title)
        .filter(s => !/^#{1,4}\s/.test(s))
        // Skip navigation/link-heavy lines (lots of []() markdown)
        .filter(s => (s.match(/\[.*?\]\(.*?\)/g) || []).length < 3)
        // Skip lines that are just questions or teasers with no data
        .filter(s => !/^(thinking about|wondering|let's|let me|in this article|we'll|here's|read on|click|sign up|subscribe|after diving|but the big question|for full data|source:|select make|select model)/i.test(s))
        // Skip lines that are just italicized markdown filler (_text_)
        .filter(s => !s.startsWith('_') || s.includes('$') || s.includes('%') || /\d/.test(s))
        // Skip markdown image lines (![...](...))
        .filter(s => !/^!\[/.test(s))
        // Skip "Read more about..." lines
        .filter(s => !/^\[read more|^\[learn more|\[read more|\[learn more/i.test(s));
    if (sentences.length === 0)
        return [];
    // Score each sentence by unique keyword overlap (0..1); sentences with
    // numbers (prices, percentages, years) are boosted, not required.
    const scored = sentences.map(sentence => {
        const words = tokenize(sentence);
        let hits = 0;
        const seen = new Set();
        for (const w of words) {
            if (queryKeywords.has(w) && !seen.has(w)) {
                hits++;
                seen.add(w);
            }
        }
        let score = hits / queryKeywords.size;
        if (DATA_VALUE_RE.test(sentence)) {
            score *= 1.5;
        }
        return { sentence, score };
    });
    scored.sort((a, b) => b.score - a.score);
    // Return top N, deduped by the first 80 chars (case-insensitive)
    const seen = new Set();
    const result = [];
    for (const { sentence, score } of scored) {
        if (score === 0)
            break; // sorted desc — everything after has no keyword overlap
        const normalized = sentence.toLowerCase().slice(0, 80);
        if (seen.has(normalized))
            continue;
        seen.add(normalized);
        result.push(sentence);
        if (result.length >= maxFacts)
            break;
    }
    return result;
}
142
// ---------------------------------------------------------------------------
// Route factory
// ---------------------------------------------------------------------------
// LLM providers accepted for optional BYOK synthesis.
const VALID_LLM_PROVIDERS = [
    'openai',
    'anthropic',
    'google',
    'ollama',
    'cerebras',
    'cloudflare',
];
// Hard ceiling on fetched sources, regardless of the client's maxSources.
const MAX_SOURCES_HARD_LIMIT = 8;
// Budget for fetching a single URL, and for the whole request.
const PER_URL_TIMEOUT_MS = 15_000;
const TOTAL_TIMEOUT_MS = 60_000;
/**
 * Build the Express router serving POST /v1/research.
 *
 * Pipeline: expand query → search → fetch top URLs (HTTP only, no browser
 * escalation) → extract key facts → optional BYOK LLM synthesis. Individual
 * search/fetch/LLM failures are non-fatal; the response contains whatever was
 * gathered before the TOTAL_TIMEOUT_MS deadline.
 */
export function createResearchRouter() {
    const router = Router();
    router.post('/v1/research', async (req, res) => {
        const startTime = Date.now();
        // ── Auth ─────────────────────────────────────────────────────────────────
        const authId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (!authId) {
            res.status(401).json({
                success: false,
                error: {
                    type: 'authentication_required',
                    message: 'API key required. Get one at https://app.webpeel.dev/keys',
                    hint: 'Get a free API key at https://app.webpeel.dev/keys',
                    docs: 'https://webpeel.dev/docs/errors#authentication_required',
                },
                requestId: req.requestId,
            });
            return;
        }
        // ── Parse & validate body ─────────────────────────────────────────────
        // FIX: req.body is undefined when no JSON body was sent (or the body
        // parser did not run). Fall back to {} so validation answers 400
        // instead of throwing an uncaught TypeError (this runs before the try).
        const body = req.body ?? {};
        if (!body.query || typeof body.query !== 'string' || body.query.trim().length === 0) {
            res.status(400).json({
                success: false,
                error: {
                    type: 'invalid_request',
                    message: 'Missing or empty "query" field.',
                    hint: 'Send JSON: { "query": "your research question" }',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                },
                requestId: req.requestId,
            });
            return;
        }
        const query = body.query.trim().slice(0, 500); // hard cap
        const depth = body.depth ?? 'quick';
        if (depth !== 'quick' && depth !== 'deep') {
            res.status(400).json({
                success: false,
                error: {
                    type: 'invalid_request',
                    message: 'Invalid "depth" value: must be "quick" or "deep".',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                },
                requestId: req.requestId,
            });
            return;
        }
        // Depth-based defaults: deep = more sources, more search results,
        // more query variations.
        const defaultMaxSources = depth === 'deep' ? 8 : 3;
        const defaultSearchCount = depth === 'deep' ? 10 : 5;
        const numSearchQueries = depth === 'deep' ? 3 : 1;
        const requestedMax = typeof body.maxSources === 'number' ? body.maxSources : defaultMaxSources;
        const maxSources = Math.min(Math.max(1, requestedMax), MAX_SOURCES_HARD_LIMIT);
        // ── Optional BYOK LLM config ──────────────────────────────────────────
        let llmConfig;
        if (body.llm) {
            const { provider, apiKey, model } = body.llm;
            if (!provider || typeof provider !== 'string') {
                res.status(400).json({
                    success: false,
                    error: {
                        type: 'invalid_request',
                        message: 'llm.provider is required when providing llm config.',
                        docs: 'https://webpeel.dev/docs/api-reference#research',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            if (!VALID_LLM_PROVIDERS.includes(provider)) {
                res.status(400).json({
                    success: false,
                    error: {
                        type: 'invalid_request',
                        message: `Invalid llm.provider. Must be one of: ${VALID_LLM_PROVIDERS.join(', ')}`,
                        docs: 'https://webpeel.dev/docs/api-reference#research',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            if (!apiKey || typeof apiKey !== 'string' || apiKey.trim().length === 0) {
                res.status(400).json({
                    success: false,
                    error: {
                        type: 'invalid_request',
                        message: 'llm.apiKey is required when providing llm config.',
                        docs: 'https://webpeel.dev/docs/api-reference#research',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            llmConfig = {
                provider: provider,
                apiKey: apiKey.trim(),
                model: model,
            };
        }
        // ── Set up total-timeout deadline ─────────────────────────────────────
        const overallDeadline = startTime + TOTAL_TIMEOUT_MS;
        try {
            // ── 1. Query expansion ────────────────────────────────────────────────
            const allQueries = expandQuery(query);
            const searchQueries = allQueries.slice(0, numSearchQueries);
            // ── 2. Search all query variations, collect unique URLs ───────────────
            const searchProvider = getSearchProvider('duckduckgo');
            const seenUrls = new Set();
            const urlQueue = [];
            for (const sq of searchQueries) {
                if (Date.now() > overallDeadline - 5_000)
                    break; // stop if < 5s left
                try {
                    const results = await searchProvider.searchWeb(sq, { count: defaultSearchCount });
                    for (const r of results) {
                        if (!r.url || seenUrls.has(r.url))
                            continue;
                        seenUrls.add(r.url);
                        urlQueue.push({ url: r.url, title: r.title, snippet: r.snippet });
                    }
                }
                catch {
                    // Search failure — continue with whatever URLs we have
                }
            }
            // ── 3. Fetch top N unique URLs sequentially ───────────────────────────
            const sources = [];
            const fetchedContents = [];
            for (const { url, title, snippet } of urlQueue) {
                if (sources.length >= maxSources)
                    break;
                if (Date.now() > overallDeadline - 2_000)
                    break;
                const timeLeft = overallDeadline - Date.now();
                const urlTimeout = Math.min(PER_URL_TIMEOUT_MS, timeLeft);
                if (urlTimeout < 1000)
                    break;
                const fetchStart = Date.now();
                // FIX: keep a handle on the race timer so it can be cleared.
                // Previously the setTimeout was never cancelled, leaving one
                // live timer (up to PER_URL_TIMEOUT_MS) per fetched URL that
                // kept the event loop busy after peel() resolved.
                let raceTimer;
                try {
                    const result = await Promise.race([
                        peel(url, {
                            format: 'markdown',
                            noEscalate: true, // NEVER launch browser — 512MB container
                            timeout: urlTimeout,
                            readable: true,
                            budget: 3000,
                        }),
                        new Promise((_, reject) => {
                            raceTimer = setTimeout(() => reject(new Error('per-url timeout')), urlTimeout);
                        }),
                    ]);
                    const fetchTime = Date.now() - fetchStart;
                    const content = result.content || '';
                    const wordCount = content.split(/\s+/).filter(Boolean).length;
                    // FIX: both result.title and the search-result title may be
                    // missing; fall back to the URL so .slice() cannot throw and
                    // a successfully fetched page is not silently dropped.
                    const pageTitle = result.title || title || url;
                    // Snippet: prefer the fetched content, else the search snippet.
                    const sourceSnippet = content.slice(0, 500).replace(/\s+/g, ' ').trim();
                    sources.push({
                        url,
                        title: pageTitle.slice(0, 200),
                        // FIX: snippet from the search result may be undefined.
                        snippet: sourceSnippet || (snippet ?? '').slice(0, 500),
                        wordCount,
                        fetchTime,
                    });
                    if (content.length > 0) {
                        fetchedContents.push({ url, content });
                    }
                }
                catch {
                    // Skip failed URLs, continue to next
                }
                finally {
                    clearTimeout(raceTimer);
                }
            }
            // ── 4. Extract key facts across all fetched pages ─────────────────────
            const allFacts = [];
            const seenFacts = new Set();
            for (const { content } of fetchedContents) {
                const pageFacts = extractKeyFacts(content, query, 5);
                for (const fact of pageFacts) {
                    const key = fact.toLowerCase().slice(0, 100);
                    if (!seenFacts.has(key)) {
                        seenFacts.add(key);
                        allFacts.push(fact);
                    }
                }
                if (allFacts.length >= 20)
                    break; // global cap
            }
            // ── 5. Optional LLM synthesis (non-fatal on failure) ──────────────────
            let summary;
            if (llmConfig && fetchedContents.length > 0 && Date.now() < overallDeadline - 3_000) {
                try {
                    const sourcesText = fetchedContents
                        .map((fc, i) => `[${i + 1}] ${fc.url}\n${fc.content.slice(0, 2000)}`)
                        .join('\n\n---\n\n');
                    const llmResult = await callLLM(llmConfig, {
                        messages: [
                            {
                                role: 'system',
                                content: 'You are a research assistant. Synthesize the following sources into a clear, ' +
                                    'comprehensive answer to the user\'s question. Cite sources by number [1], [2], etc. ' +
                                    'Be concise but thorough. Use plain text without excessive markdown.',
                            },
                            {
                                role: 'user',
                                content: `Question: ${query}\n\nSources:\n\n${sourcesText}`,
                            },
                        ],
                        maxTokens: 1000,
                    });
                    summary = llmResult.text;
                }
                catch {
                    // LLM synthesis failure is non-fatal — return results without summary
                }
            }
            const elapsed = Date.now() - startTime;
            res.json({
                success: true,
                data: {
                    query,
                    ...(summary !== undefined ? { summary } : {}),
                    sources,
                    keyFacts: allFacts,
                    totalSources: sources.length,
                    searchQueries,
                    elapsed,
                },
                requestId: req.requestId,
            });
        }
        catch (error) {
            console.error('[research] Unexpected error:', error);
            if (res.headersSent)
                return;
            res.status(500).json({
                success: false,
                error: {
                    type: 'research_failed',
                    message: 'Research request failed. Please try again.',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                },
                requestId: req.requestId,
            });
        }
    });
    return router;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.28",
3
+ "version": "0.21.30",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",