webpeel 0.21.43 → 0.21.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -153,7 +153,7 @@ const VALID_LLM_PROVIDERS = [
|
|
|
153
153
|
'cerebras',
|
|
154
154
|
'cloudflare',
|
|
155
155
|
];
|
|
156
|
-
const MAX_SOURCES_HARD_LIMIT =
|
|
156
|
+
const MAX_SOURCES_HARD_LIMIT = 4; // 512MB container — never fetch more than 4 sources
|
|
157
157
|
const PER_URL_TIMEOUT_MS = 8_000;
|
|
158
158
|
const TOTAL_TIMEOUT_MS = 60_000;
|
|
159
159
|
export function createResearchRouter() {
|
|
@@ -175,6 +175,33 @@ export function createResearchRouter() {
|
|
|
175
175
|
});
|
|
176
176
|
return;
|
|
177
177
|
}
|
|
178
|
+
// ── Hetzner research worker proxy ────────────────────────────────────
|
|
179
|
+
// When RESEARCH_WORKER_URL is set, forward the entire request to the
|
|
180
|
+
// Hetzner VPS worker (local SearXNG + Ollama). Falls back to local if proxy fails.
|
|
181
|
+
if (process.env.RESEARCH_WORKER_URL) {
|
|
182
|
+
try {
|
|
183
|
+
const resp = await fetch(process.env.RESEARCH_WORKER_URL + '/research', {
|
|
184
|
+
method: 'POST',
|
|
185
|
+
headers: {
|
|
186
|
+
'Content-Type': 'application/json',
|
|
187
|
+
'Authorization': 'Bearer ' + (process.env.OLLAMA_SECRET || ''),
|
|
188
|
+
},
|
|
189
|
+
body: JSON.stringify(req.body),
|
|
190
|
+
signal: AbortSignal.timeout(55_000),
|
|
191
|
+
});
|
|
192
|
+
const result = await resp.json();
|
|
193
|
+
// Attach requestId for consistency
|
|
194
|
+
if (result && typeof result === 'object') {
|
|
195
|
+
result.requestId = req.requestId;
|
|
196
|
+
}
|
|
197
|
+
res.json(result);
|
|
198
|
+
return;
|
|
199
|
+
}
|
|
200
|
+
catch (proxyErr) {
|
|
201
|
+
console.warn('[research] Hetzner proxy failed, falling back to local:', proxyErr.message);
|
|
202
|
+
// Fall through to local research pipeline
|
|
203
|
+
}
|
|
204
|
+
}
|
|
178
205
|
// ── Parse & validate body ─────────────────────────────────────────────
|
|
179
206
|
const body = req.body;
|
|
180
207
|
if (!body.query || typeof body.query !== 'string' || body.query.trim().length === 0) {
|
|
@@ -302,8 +329,10 @@ export function createResearchRouter() {
|
|
|
302
329
|
new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
|
|
303
330
|
]);
|
|
304
331
|
const fetchTime = Date.now() - fetchStart;
|
|
332
|
+
// Cap HTML at 100KB before parsing — huge pages (Reddit 500KB+) OOM 512MB container
|
|
333
|
+
const rawHtml = (fetchResult.html || '').slice(0, 100_000);
|
|
305
334
|
// Extract clean text via cheerio (no Readability.js, no markdown pipeline)
|
|
306
|
-
const $ = cheerioLoad(fetchResult.html || '');
|
|
335
|
+
const $ = cheerioLoad(rawHtml);
|
|
307
336
|
$('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
|
|
308
337
|
const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
|
|
309
338
|
const rawText = $('main, article, [role=main], body').first().text()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.43",
|
|
3
|
+
"version": "0.21.45",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|