npm - webpeel - Versions diffs - 0.21.40 → 0.21.41 - Mend

webpeel 0.21.40 → 0.21.41

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/server/routes/research.js +13 -12
package/package.json +1 -1

package/dist/server/routes/research.js CHANGED Viewed

@@ -9,7 +9,8 @@
  * Body: ResearchRequest
  */
 import { Router } from 'express';
-import { peel } from '../../index.js';
+import { simpleFetch } from '../../core/fetcher.js';
+import { load as cheerioLoad } from 'cheerio';
 import { getSearchProvider } from '../../core/search-provider.js';
 import { callLLM, } from '../../core/llm-provider.js';
 import { sanitizeForLLM, hardenSystemPrompt, validateOutput } from '../../core/prompt-guard.js';
@@ -295,21 +296,21 @@ export function createResearchRouter() {
                     break;
                 const fetchStart = Date.now();
                 try {
-                    const result = await Promise.race([
-                        peel(url, {
-                            format: 'markdown',
-                            noEscalate: true, // NEVER launch browser — 512MB container
-                            timeout: urlTimeout,
-                            readable: true,
-                            budget: 3000,
-                        }),
+                    // Use simpleFetch + cheerio (no peel/pipeline) — keeps memory under 512MB
+                    const fetchResult = await Promise.race([
+                        simpleFetch(url, undefined, urlTimeout),
                         new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
                     ]);
                     const fetchTime = Date.now() - fetchStart;
-                    const content = result.content || '';
+                    // Extract clean text via cheerio (no Readability.js, no markdown pipeline)
+                    const $ = cheerioLoad(fetchResult.html || '');
+                    $('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
+                    const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
+                    const rawText = $('main, article, [role=main], body').first().text()
+                        .replace(/\s+/g, ' ').trim();
+                    const content = rawText.slice(0, 4000); // ~3000 words max
                     const wordCount = content.split(/\s+/).filter(Boolean).length;
-                    const pageTitle = result.title || title;
-                    // Build snippet: prefer LLM-extracted summary, else first 500 chars of content
+                    // Build snippet: first 500 chars of content
                     const sourceSnippet = content.slice(0, 500).replace(/\s+/g, ' ').trim();
                     sources.push({
                         url,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.40",
+  "version": "0.21.41",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",