webpeel 0.21.40 → 0.21.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,8 @@
9
9
  * Body: ResearchRequest
10
10
  */
11
11
  import { Router } from 'express';
12
- import { peel } from '../../index.js';
12
+ import { simpleFetch } from '../../core/fetcher.js';
13
+ import { load as cheerioLoad } from 'cheerio';
13
14
  import { getSearchProvider } from '../../core/search-provider.js';
14
15
  import { callLLM, } from '../../core/llm-provider.js';
15
16
  import { sanitizeForLLM, hardenSystemPrompt, validateOutput } from '../../core/prompt-guard.js';
@@ -295,21 +296,21 @@ export function createResearchRouter() {
295
296
  break;
296
297
  const fetchStart = Date.now();
297
298
  try {
298
- const result = await Promise.race([
299
- peel(url, {
300
- format: 'markdown',
301
- noEscalate: true, // NEVER launch browser — 512MB container
302
- timeout: urlTimeout,
303
- readable: true,
304
- budget: 3000,
305
- }),
299
+ // Use simpleFetch + cheerio (no peel/pipeline) — keeps memory under 512MB
300
+ const fetchResult = await Promise.race([
301
+ simpleFetch(url, undefined, urlTimeout),
306
302
  new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
307
303
  ]);
308
304
  const fetchTime = Date.now() - fetchStart;
309
- const content = result.content || '';
305
+ // Extract clean text via cheerio (no Readability.js, no markdown pipeline)
306
+ const $ = cheerioLoad(fetchResult.html || '');
307
+ $('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
308
+ const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
309
+ const rawText = $('main, article, [role=main], body').first().text()
310
+ .replace(/\s+/g, ' ').trim();
311
+ const content = rawText.slice(0, 4000); // cap at 4000 characters (roughly 600-800 English words)
310
312
  const wordCount = content.split(/\s+/).filter(Boolean).length;
311
- const pageTitle = result.title || title;
312
- // Build snippet: prefer LLM-extracted summary, else first 500 chars of content
313
+ // Build snippet: first 500 chars of content
313
314
  const sourceSnippet = content.slice(0, 500).replace(/\s+/g, ' ').trim();
314
315
  sources.push({
315
316
  url,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.40",
3
+ "version": "0.21.41",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",