webpeel 0.21.21 → 0.21.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/server/routes/search.js +27 -13
- package/package.json +1 -1
|
@@ -6,6 +6,7 @@ import { fetch as undiciFetch } from 'undici';
|
|
|
6
6
|
import { load } from 'cheerio';
|
|
7
7
|
import { LRUCache } from 'lru-cache';
|
|
8
8
|
import { peel } from '../../index.js';
|
|
9
|
+
import { simpleFetch } from '../../core/fetcher.js';
|
|
9
10
|
import { getSearchProvider, getBestSearchProvider, } from '../../core/search-provider.js';
|
|
10
11
|
export function createSearchRouter(authStore) {
|
|
11
12
|
const router = Router();
|
|
@@ -149,22 +150,35 @@ export function createSearchRouter(authStore) {
|
|
|
149
150
|
}
|
|
150
151
|
}
|
|
151
152
|
}
|
|
152
|
-
//
|
|
153
|
+
// Lightweight enrichment — HTTP-only, no browser, no full pipeline
|
|
154
|
+
// Uses simpleFetch + cheerio to extract text without launching Playwright
|
|
155
|
+
// This is intentionally minimal to stay within 512MB container memory limit
|
|
153
156
|
if (enrichCount > 0 && !shouldScrape) {
|
|
154
|
-
const ENRICH_TIMEOUT = 4000;
|
|
157
|
+
const ENRICH_TIMEOUT = 4000;
|
|
155
158
|
const toEnrich = results.slice(0, enrichCount);
|
|
156
159
|
const enrichResults = await Promise.allSettled(toEnrich.map(async (result) => {
|
|
157
|
-
const
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
160
|
+
const t0 = Date.now();
|
|
161
|
+
const fetchPromise = (async () => {
|
|
162
|
+
const fetched = await simpleFetch(result.url, undefined, ENRICH_TIMEOUT);
|
|
163
|
+
if (!fetched.html)
|
|
164
|
+
return { url: result.url, content: null, wordCount: 0, method: 'empty', fetchTimeMs: 0 };
|
|
165
|
+
// Extract visible text with cheerio — lightweight, no full pipeline
|
|
166
|
+
const $ = load(fetched.html);
|
|
167
|
+
$('script, style, nav, header, footer, [aria-hidden="true"], .ad, .advertisement').remove();
|
|
168
|
+
// Try main content selectors first, then body
|
|
169
|
+
const mainEl = $('main, article, [role="main"], .content, .article-body, #content').first();
|
|
170
|
+
const textEl = mainEl.length ? mainEl : $('body');
|
|
171
|
+
const text = textEl.text().replace(/\s+/g, ' ').trim().substring(0, 2000);
|
|
172
|
+
const wordCount = text.split(/\s+/).filter(Boolean).length;
|
|
173
|
+
return {
|
|
174
|
+
url: result.url,
|
|
175
|
+
content: text.substring(0, 1500) || null,
|
|
176
|
+
wordCount,
|
|
177
|
+
method: 'simple',
|
|
178
|
+
fetchTimeMs: Date.now() - t0,
|
|
179
|
+
};
|
|
180
|
+
})();
|
|
181
|
+
const timeoutPromise = new Promise(resolve => setTimeout(() => resolve({ url: result.url, content: null, wordCount: 0, method: 'timeout', fetchTimeMs: ENRICH_TIMEOUT }), ENRICH_TIMEOUT));
|
|
168
182
|
return Promise.race([fetchPromise, timeoutPromise]);
|
|
169
183
|
}));
|
|
170
184
|
// Merge enrichment data back into results
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.21",
|
|
3
|
+
"version": "0.21.23",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|