npm - webpeel - Versions diffs - 0.21.23 → 0.21.25 - Mend

webpeel 0.21.23 → 0.21.25

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (6)

package/dist/core/search-provider.js +18 -11
package/dist/core/structured-extract.d.ts +1 -1
package/dist/core/structured-extract.js +30 -2
package/dist/server/routes/extract.js +13 -1
package/dist/server/routes/fetch.js +4 -0
package/package.json +1 -1

package/dist/core/search-provider.js CHANGED Viewed

@@ -844,20 +844,27 @@ export class DuckDuckGoProvider {
         // Stage 4: Stealth multi-engine (DDG + Bing + Ecosia in parallel)
         // Bypasses bot-detection on datacenter IPs. This is the reliable
         // last resort — but it spins up a browser so it takes a few seconds.
+        // DISABLED on memory-constrained servers (512MB) — Playwright OOM kills.
+        // Set NO_BROWSER_SEARCH=1 to skip this stage entirely.
         // -----------------------------------------------------------
-        log.debug('Trying stealth browser search (DDG + Bing + Ecosia)...');
-        try {
-            const stealthProvider = new StealthSearchProvider();
-            // StealthSearchProvider already applies filterRelevantResults internally.
-            const stealthResults = await stealthProvider.searchWeb(query, options);
-            if (stealthResults.length > 0) {
-                log.debug(`source=stealth returned ${stealthResults.length} results`);
-                return stealthResults;
+        if (!process.env.NO_BROWSER_SEARCH) {
+            log.debug('Trying stealth browser search (DDG + Bing + Ecosia)...');
+            try {
+                const stealthProvider = new StealthSearchProvider();
+                // StealthSearchProvider already applies filterRelevantResults internally.
+                const stealthResults = await stealthProvider.searchWeb(query, options);
+                if (stealthResults.length > 0) {
+                    log.debug(`source=stealth returned ${stealthResults.length} results`);
+                    return stealthResults;
+                }
+                log.debug('Stealth search returned 0 results');
+            }
+            catch (e) {
+                log.debug('Stealth search failed:', e instanceof Error ? e.message : e);
             }
-            log.debug('Stealth search returned 0 results');
         }
-        catch (e) {
-            log.debug('Stealth search failed:', e instanceof Error ? e.message : e);
+        else {
+            log.debug('Stealth browser search skipped (NO_BROWSER_SEARCH=1)');
         }
         return [];
     }

package/dist/core/structured-extract.d.ts CHANGED Viewed

@@ -30,7 +30,7 @@ export interface ExtractionResult {
  * @param llmConfig Optional LLM config (if omitted, uses heuristic fallback)
  * @param prompt    Optional user guidance added to the LLM prompt
  */
-export declare function extractStructured(content: string, schema: ExtractionSchema, llmConfig?: LLMConfig, prompt?: string): Promise<ExtractionResult>;
+export declare function extractStructured(content: string, schema: ExtractionSchema, llmConfig?: LLMConfig, prompt?: string, domainHints?: Record<string, unknown>): Promise<ExtractionResult>;
 /**
  * Convert a shorthand schema `{ field: "string", active: "boolean" }` to a
  * full ExtractionSchema.  Useful for CLI --extract flag.

package/dist/core/structured-extract.js CHANGED Viewed

@@ -422,7 +422,7 @@ async function heuristicExtract(content, schema) {
  * @param llmConfig Optional LLM config (if omitted, uses heuristic fallback)
  * @param prompt    Optional user guidance added to the LLM prompt
  */
-export async function extractStructured(content, schema, llmConfig, prompt) {
+export async function extractStructured(content, schema, llmConfig, prompt, domainHints) {
     // Guard: empty content
     if (!content || content.trim().length === 0) {
         return { data: {}, confidence: 0, tokensUsed: 0 };
@@ -495,7 +495,35 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
         }
     }
     // ── Heuristic extraction ─────────────────────────────────────────────────
-    return heuristicExtract(content, schema);
+    const heuristic = await heuristicExtract(content, schema);
+    // ── Domain hints overlay ─────────────────────────────────────────────────
+    // If domain-api pre-extracted fields (e.g. GitHub stars/language), merge them
+    // into the result. Domain-api data is authoritative — prefer over heuristic.
+    if (domainHints && Object.keys(domainHints).length > 0) {
+        const props = schema.properties;
+        let hintMerged = 0;
+        for (const [field, hintValue] of Object.entries(domainHints)) {
+            if (field in props && hintValue !== null && hintValue !== undefined) {
+                const expected = props[field].type;
+                const actual = typeof hintValue;
+                // Only merge if type matches (or number vs string coercion)
+                if (actual === expected ||
+                    (expected === 'number' && actual === 'string' && !isNaN(Number(hintValue))) ||
+                    (expected === 'string' && actual !== 'object')) {
+                    heuristic.data[field] =
+                        expected === 'number' ? Number(hintValue) : hintValue;
+                    hintMerged++;
+                }
+            }
+        }
+        if (hintMerged > 0) {
+            // Boost confidence since we have authoritative domain-api data
+            const filled = Object.values(heuristic.data).filter(v => v !== null && v !== undefined).length;
+            const total = Object.keys(props).length;
+            heuristic.confidence = parseFloat(Math.min(0.90, 0.65 + (filled / total) * 0.25).toFixed(2));
+        }
+    }
+    return heuristic;
 }
 // ---------------------------------------------------------------------------
 // Helper: convert simple { field: "type" } map → ExtractionSchema

package/dist/server/routes/extract.js CHANGED Viewed

@@ -186,12 +186,24 @@ export function createExtractRouter() {
             const peelResult = await peel(url, {
                 format: 'markdown',
                 render: useRender,
+                noEscalate: !useRender, // prevent OOM: only browser when render=true explicitly
                 timeout: 30000,
                 readable: true,
             });
             const content = peelResult.content || '';
             // ── Extract structured data ─────────────────────────────────────────
-            const extractResult = await extractStructured(content, schema, llmConfig, typeof prompt === 'string' ? prompt : undefined);
+            // Seed hints from domain-api structured data (GitHub stars/language, etc.)
+            // This lets heuristic extraction use pre-parsed structured fields as ground truth.
+            const domainHints = {};
+            const rawDomainData = peelResult.domainData?.structured;
+            if (rawDomainData && typeof rawDomainData === 'object') {
+                for (const [k, v] of Object.entries(rawDomainData)) {
+                    if (v !== null && v !== undefined && v !== '') {
+                        domainHints[k] = v;
+                    }
+                }
+            }
+            const extractResult = await extractStructured(content, schema, llmConfig, typeof prompt === 'string' ? prompt : undefined, Object.keys(domainHints).length > 0 ? domainHints : undefined);
             const method = llmConfig ? 'llm' : 'heuristic';
             res.json({
                 success: true,

package/dist/server/routes/fetch.js CHANGED Viewed

@@ -341,6 +341,10 @@ export function createFetchRouter(authStore) {
                 lite: lite === 'true',
                 timeout: timeout ? parseInt(timeout, 10) : undefined,
                 captionImages: captionImages === 'true',
+                // Prevent auto-escalation to browser unless render=true is explicitly requested.
+                // On 512MB containers, surprise browser launches cause OOM kills.
+                // Domain extractors (GitHub, Wikipedia, npm etc.) use HTTP APIs, not the browser.
+                noEscalate: !shouldRender,
             };
             // Auto-budget: default to 4000 tokens for API requests when no budget specified
             // Opt-out: budget=0 explicitly disables. Lite mode disables auto-budget.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.23",
+  "version": "0.21.25",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",