webpeel 0.21.42 → 0.21.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -149,16 +149,33 @@ function unixToIso(sec) {
|
|
|
149
149
|
}
|
|
150
150
|
/**
 * Fetch a URL with the platform `fetch` and parse the response body as JSON.
 *
 * Plain `fetch` is used on purpose (not simpleFetch): simpleFetch adds stealth
 * browser headers (Sec-CH-UA, Sec-Fetch-*, etc.) which confuse API endpoints
 * like api.github.com into returning HTML.
 *
 * @param {string} url - Endpoint to request.
 * @param {Object} [customHeaders] - Extra request headers; spread after the
 *   defaults, so they override `User-Agent` / `Accept` when the keys match.
 * @returns {Promise<*>} Whatever `tryParseJson` returns for the body text. A
 *   `null` result with a non-empty body is logged as a non-JSON response.
 * @throws Rethrows any error from `fetch` or from reading the body, including
 *   the abort error raised when the 15 second timeout fires.
 */
async function fetchJson(url, customHeaders) {
    const timeoutMs = 15000;
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
        const resp = await fetch(url, {
            headers: {
                'User-Agent': 'webpeel/0.21 (https://webpeel.dev)',
                'Accept': 'application/json',
                ...customHeaders,
            },
            signal: controller.signal,
            redirect: 'follow',
        });
        // Keep the abort timer armed while the body is read: fetch() resolves
        // as soon as headers arrive, so a stalled body would otherwise hang
        // with no timeout at all.
        const text = await resp.text();
        const parsed = tryParseJson(text);
        if (parsed === null && text.length > 0) {
            console.warn(`[webpeel:fetchJson] Non-JSON response from ${url} (${text.length} bytes, status: ${resp.status}): ${text.slice(0, 120)}`);
        }
        return parsed;
    }
    finally {
        // Single cleanup point for success, failure and abort.
        clearTimeout(timer);
    }
}
|
|
163
180
|
/** Fetch JSON with exponential backoff retry on 429 / rate-limit errors. */
|
|
164
181
|
async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {
|
|
@@ -153,7 +153,7 @@ const VALID_LLM_PROVIDERS = [
|
|
|
153
153
|
'cerebras',
|
|
154
154
|
'cloudflare',
|
|
155
155
|
];
|
|
156
|
-
const MAX_SOURCES_HARD_LIMIT =
|
|
156
|
+
const MAX_SOURCES_HARD_LIMIT = 4; // 512MB container — never fetch more than 4 sources
|
|
157
157
|
const PER_URL_TIMEOUT_MS = 8_000;
|
|
158
158
|
const TOTAL_TIMEOUT_MS = 60_000;
|
|
159
159
|
export function createResearchRouter() {
|
|
@@ -302,8 +302,10 @@ export function createResearchRouter() {
|
|
|
302
302
|
new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
|
|
303
303
|
]);
|
|
304
304
|
const fetchTime = Date.now() - fetchStart;
|
|
305
|
+
// Cap HTML at 100KB before parsing — huge pages (Reddit 500KB+) OOM 512MB container
|
|
306
|
+
const rawHtml = (fetchResult.html || '').slice(0, 100_000);
|
|
305
307
|
// Extract clean text via cheerio (no Readability.js, no markdown pipeline)
|
|
306
|
-
const $ = cheerioLoad(
|
|
308
|
+
const $ = cheerioLoad(rawHtml);
|
|
307
309
|
$('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
|
|
308
310
|
const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
|
|
309
311
|
const rawText = $('main, article, [role=main], body').first().text()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.44",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|