npm - webpeel - Versions diffs - 0.21.42 → 0.21.44 - Mend

webpeel 0.21.42 → 0.21.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/core/domain-extractors.js +27 -10
package/dist/server/routes/research.js +4 -2
package/package.json +1 -1

package/dist/core/domain-extractors.js CHANGED

@@ -149,16 +149,33 @@ function unixToIso(sec) {
 }
 /** Fetch JSON from a URL using simpleFetch (reuses WebPeel's HTTP stack). */
 async function fetchJson(url, customHeaders) {
-    const result = await simpleFetch(url, 'webpeel/0.21 (https://webpeel.dev)', 15000, {
-        Accept: 'application/json',
-        ...customHeaders,
-    });
-    const parsed = tryParseJson(result.html);
-    if (parsed === null && result.html.length > 0) {
-        // Log when we get non-JSON back (likely an HTML error page)
-        console.warn(`[webpeel:fetchJson] Non-JSON response from ${url} (${result.html.length} bytes, status: ${result.statusCode}): ${result.html.slice(0, 120)}`);
-    }
-    return parsed;
+    // Use plain fetch (not simpleFetch) for JSON API calls.
+    // simpleFetch adds stealth browser headers (Sec-CH-UA, Sec-Fetch-*, etc.)
+    // which confuse API endpoints like api.github.com into returning HTML.
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), 15000);
+    try {
+        const resp = await fetch(url, {
+            headers: {
+                'User-Agent': 'webpeel/0.21 (https://webpeel.dev)',
+                'Accept': 'application/json',
+                ...customHeaders,
+            },
+            signal: controller.signal,
+            redirect: 'follow',
+        });
+        clearTimeout(timer);
+        const text = await resp.text();
+        const parsed = tryParseJson(text);
+        if (parsed === null && text.length > 0) {
+            console.warn(`[webpeel:fetchJson] Non-JSON response from ${url} (${text.length} bytes, status: ${resp.status}): ${text.slice(0, 120)}`);
+        }
+        return parsed;
+    }
+    catch (e) {
+        clearTimeout(timer);
+        throw e;
+    }
 }
 /** Fetch JSON with exponential backoff retry on 429 / rate-limit errors. */
 async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {

package/dist/server/routes/research.js CHANGED

@@ -153,7 +153,7 @@ const VALID_LLM_PROVIDERS = [
     'cerebras',
     'cloudflare',
 ];
-const MAX_SOURCES_HARD_LIMIT = 8;
+const MAX_SOURCES_HARD_LIMIT = 4; // 512MB container — never fetch more than 4 sources
 const PER_URL_TIMEOUT_MS = 8_000;
 const TOTAL_TIMEOUT_MS = 60_000;
 export function createResearchRouter() {
@@ -302,8 +302,10 @@ export function createResearchRouter() {
                         new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
                     ]);
                     const fetchTime = Date.now() - fetchStart;
+                    // Cap HTML at 100KB before parsing — huge pages (Reddit 500KB+) OOM 512MB container
+                    const rawHtml = (fetchResult.html || '').slice(0, 100_000);
                     // Extract clean text via cheerio (no Readability.js, no markdown pipeline)
-                    const $ = cheerioLoad(fetchResult.html || '');
+                    const $ = cheerioLoad(rawHtml);
                     $('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
                     const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
                     const rawText = $('main, article, [role=main], body').first().text()

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.42",
+  "version": "0.21.44",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",