webpeel 0.21.43 → 0.21.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -153,7 +153,7 @@ const VALID_LLM_PROVIDERS = [
|
|
|
153
153
|
'cerebras',
|
|
154
154
|
'cloudflare',
|
|
155
155
|
];
|
|
156
|
-
const MAX_SOURCES_HARD_LIMIT =
|
|
156
|
+
const MAX_SOURCES_HARD_LIMIT = 4; // 512MB container — never fetch more than 4 sources
|
|
157
157
|
const PER_URL_TIMEOUT_MS = 8_000;
|
|
158
158
|
const TOTAL_TIMEOUT_MS = 60_000;
|
|
159
159
|
export function createResearchRouter() {
|
|
@@ -302,8 +302,10 @@ export function createResearchRouter() {
|
|
|
302
302
|
new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
|
|
303
303
|
]);
|
|
304
304
|
const fetchTime = Date.now() - fetchStart;
|
|
305
|
+
// Cap HTML at 100KB before parsing — huge pages (Reddit 500KB+) OOM 512MB container
|
|
306
|
+
const rawHtml = (fetchResult.html || '').slice(0, 100_000);
|
|
305
307
|
// Extract clean text via cheerio (no Readability.js, no markdown pipeline)
|
|
306
|
-
const $ = cheerioLoad(
|
|
308
|
+
const $ = cheerioLoad(rawHtml);
|
|
307
309
|
$('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
|
|
308
310
|
const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
|
|
309
311
|
const rawText = $('main, article, [role=main], body').first().text()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.43",
|
|
3
|
+
"version": "0.21.44",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|