webpeel 0.21.22 → 0.21.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -844,20 +844,27 @@ export class DuckDuckGoProvider {
|
|
|
844
844
|
// Stage 4: Stealth multi-engine (DDG + Bing + Ecosia in parallel)
|
|
845
845
|
// Bypasses bot-detection on datacenter IPs. This is the reliable
|
|
846
846
|
// last resort — but it spins up a browser so it takes a few seconds.
|
|
847
|
+
// DISABLED on memory-constrained servers (512MB) — Playwright OOM kills.
|
|
848
|
+
// Set NO_BROWSER_SEARCH=1 to skip this stage entirely.
|
|
847
849
|
// -----------------------------------------------------------
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
850
|
+
if (!process.env.NO_BROWSER_SEARCH) {
|
|
851
|
+
log.debug('Trying stealth browser search (DDG + Bing + Ecosia)...');
|
|
852
|
+
try {
|
|
853
|
+
const stealthProvider = new StealthSearchProvider();
|
|
854
|
+
// StealthSearchProvider already applies filterRelevantResults internally.
|
|
855
|
+
const stealthResults = await stealthProvider.searchWeb(query, options);
|
|
856
|
+
if (stealthResults.length > 0) {
|
|
857
|
+
log.debug(`source=stealth returned ${stealthResults.length} results`);
|
|
858
|
+
return stealthResults;
|
|
859
|
+
}
|
|
860
|
+
log.debug('Stealth search returned 0 results');
|
|
861
|
+
}
|
|
862
|
+
catch (e) {
|
|
863
|
+
log.debug('Stealth search failed:', e instanceof Error ? e.message : e);
|
|
856
864
|
}
|
|
857
|
-
log.debug('Stealth search returned 0 results');
|
|
858
865
|
}
|
|
859
|
-
|
|
860
|
-
log.debug('Stealth search failed:', e instanceof Error ? e.message : e);
|
|
866
|
+
else {
|
|
867
|
+
log.debug('Stealth browser search skipped (NO_BROWSER_SEARCH=1)');
|
|
861
868
|
}
|
|
862
869
|
return [];
|
|
863
870
|
}
|
|
@@ -341,6 +341,10 @@ export function createFetchRouter(authStore) {
|
|
|
341
341
|
lite: lite === 'true',
|
|
342
342
|
timeout: timeout ? parseInt(timeout, 10) : undefined,
|
|
343
343
|
captionImages: captionImages === 'true',
|
|
344
|
+
// Prevent auto-escalation to browser unless render=true is explicitly requested.
|
|
345
|
+
// On 512MB containers, surprise browser launches cause OOM kills.
|
|
346
|
+
// Domain extractors (GitHub, Wikipedia, npm etc.) use HTTP APIs, not the browser.
|
|
347
|
+
noEscalate: !shouldRender,
|
|
344
348
|
};
|
|
345
349
|
// Auto-budget: default to 4000 tokens for API requests when no budget specified
|
|
346
350
|
// Opt-out: budget=0 explicitly disables. Lite mode disables auto-budget.
|
|
@@ -6,6 +6,7 @@ import { fetch as undiciFetch } from 'undici';
|
|
|
6
6
|
import { load } from 'cheerio';
|
|
7
7
|
import { LRUCache } from 'lru-cache';
|
|
8
8
|
import { peel } from '../../index.js';
|
|
9
|
+
import { simpleFetch } from '../../core/fetcher.js';
|
|
9
10
|
import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
|
|
10
11
|
export function createSearchRouter(authStore) {
|
|
11
12
|
const router = Router();
|
|
@@ -149,25 +150,35 @@ export function createSearchRouter(authStore) {
|
|
|
149
150
|
}
|
|
150
151
|
}
|
|
151
152
|
}
|
|
152
|
-
//
|
|
153
|
-
//
|
|
153
|
+
// Lightweight enrichment — HTTP-only, no browser, no full pipeline
|
|
154
|
+
// Uses simpleFetch + cheerio to extract text without launching Playwright
|
|
155
|
+
// This is intentionally minimal to stay within 512MB container memory limit
|
|
154
156
|
if (enrichCount > 0 && !shouldScrape) {
|
|
155
|
-
const ENRICH_TIMEOUT = 4000;
|
|
157
|
+
const ENRICH_TIMEOUT = 4000;
|
|
156
158
|
const toEnrich = results.slice(0, enrichCount);
|
|
157
159
|
const enrichResults = await Promise.allSettled(toEnrich.map(async (result) => {
|
|
158
|
-
const
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
160
|
+
const t0 = Date.now();
|
|
161
|
+
const fetchPromise = (async () => {
|
|
162
|
+
const fetched = await simpleFetch(result.url, undefined, ENRICH_TIMEOUT);
|
|
163
|
+
if (!fetched.html)
|
|
164
|
+
return { url: result.url, content: null, wordCount: 0, method: 'empty', fetchTimeMs: 0 };
|
|
165
|
+
// Extract visible text with cheerio — lightweight, no full pipeline
|
|
166
|
+
const $ = load(fetched.html);
|
|
167
|
+
$('script, style, nav, header, footer, [aria-hidden="true"], .ad, .advertisement').remove();
|
|
168
|
+
// Try main content selectors first, then body
|
|
169
|
+
const mainEl = $('main, article, [role="main"], .content, .article-body, #content').first();
|
|
170
|
+
const textEl = mainEl.length ? mainEl : $('body');
|
|
171
|
+
const text = textEl.text().replace(/\s+/g, ' ').trim().substring(0, 2000);
|
|
172
|
+
const wordCount = text.split(/\s+/).filter(Boolean).length;
|
|
173
|
+
return {
|
|
174
|
+
url: result.url,
|
|
175
|
+
content: text.substring(0, 1500) || null,
|
|
176
|
+
wordCount,
|
|
177
|
+
method: 'simple',
|
|
178
|
+
fetchTimeMs: Date.now() - t0,
|
|
179
|
+
};
|
|
180
|
+
})();
|
|
181
|
+
const timeoutPromise = new Promise(resolve => setTimeout(() => resolve({ url: result.url, content: null, wordCount: 0, method: 'timeout', fetchTimeMs: ENRICH_TIMEOUT }), ENRICH_TIMEOUT));
|
|
171
182
|
return Promise.race([fetchPromise, timeoutPromise]);
|
|
172
183
|
}));
|
|
173
184
|
// Merge enrichment data back into results
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.22",
|
|
3
|
+
"version": "0.21.24",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|