webpeel 0.21.42 → 0.21.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -149,16 +149,33 @@ function unixToIso(sec) {
149
149
  }
150
150
  /** Fetch JSON from a URL using simpleFetch (reuses WebPeel's HTTP stack). */
151
151
  async function fetchJson(url, customHeaders) {
152
- const result = await simpleFetch(url, 'webpeel/0.21 (https://webpeel.dev)', 15000, {
153
- Accept: 'application/json',
154
- ...customHeaders,
155
- });
156
- const parsed = tryParseJson(result.html);
157
- if (parsed === null && result.html.length > 0) {
158
- // Log when we get non-JSON back (likely an HTML error page)
159
- console.warn(`[webpeel:fetchJson] Non-JSON response from ${url} (${result.html.length} bytes, status: ${result.statusCode}): ${result.html.slice(0, 120)}`);
160
- }
161
- return parsed;
152
+ // Use plain fetch (not simpleFetch) for JSON API calls.
153
+ // simpleFetch adds stealth browser headers (Sec-CH-UA, Sec-Fetch-*, etc.)
154
+ // which confuse API endpoints like api.github.com into returning HTML.
155
+ const controller = new AbortController();
156
+ const timer = setTimeout(() => controller.abort(), 15000);
157
+ try {
158
+ const resp = await fetch(url, {
159
+ headers: {
160
+ 'User-Agent': 'webpeel/0.21 (https://webpeel.dev)',
161
+ 'Accept': 'application/json',
162
+ ...customHeaders,
163
+ },
164
+ signal: controller.signal,
165
+ redirect: 'follow',
166
+ });
167
+ clearTimeout(timer);
168
+ const text = await resp.text();
169
+ const parsed = tryParseJson(text);
170
+ if (parsed === null && text.length > 0) {
171
+ console.warn(`[webpeel:fetchJson] Non-JSON response from ${url} (${text.length} bytes, status: ${resp.status}): ${text.slice(0, 120)}`);
172
+ }
173
+ return parsed;
174
+ }
175
+ catch (e) {
176
+ clearTimeout(timer);
177
+ throw e;
178
+ }
162
179
  }
163
180
  /** Fetch JSON with exponential backoff retry on 429 / rate-limit errors. */
164
181
  async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {
@@ -153,7 +153,7 @@ const VALID_LLM_PROVIDERS = [
153
153
  'cerebras',
154
154
  'cloudflare',
155
155
  ];
156
- const MAX_SOURCES_HARD_LIMIT = 8;
156
+ const MAX_SOURCES_HARD_LIMIT = 4; // 512MB container — never fetch more than 4 sources
157
157
  const PER_URL_TIMEOUT_MS = 8_000;
158
158
  const TOTAL_TIMEOUT_MS = 60_000;
159
159
  export function createResearchRouter() {
@@ -302,8 +302,10 @@ export function createResearchRouter() {
302
302
  new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
303
303
  ]);
304
304
  const fetchTime = Date.now() - fetchStart;
305
+ // Cap HTML at 100KB before parsing — huge pages (Reddit 500KB+) OOM 512MB container
306
+ const rawHtml = (fetchResult.html || '').slice(0, 100_000);
305
307
  // Extract clean text via cheerio (no Readability.js, no markdown pipeline)
306
- const $ = cheerioLoad(fetchResult.html || '');
308
+ const $ = cheerioLoad(rawHtml);
307
309
  $('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
308
310
  const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
309
311
  const rawText = $('main, article, [role=main], body').first().text()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.42",
3
+ "version": "0.21.44",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",