webpeel 0.21.39 → 0.21.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1061,7 +1061,7 @@ export class DuckDuckGoProvider {
|
|
|
1061
1061
|
const searxResults = await searchViaSearXNG(query, {
|
|
1062
1062
|
count: options.count ?? 10,
|
|
1063
1063
|
signal: options.signal,
|
|
1064
|
-
timeoutMs:
|
|
1064
|
+
timeoutMs: 12000,
|
|
1065
1065
|
});
|
|
1066
1066
|
if (searxResults.length > 0) {
|
|
1067
1067
|
providerStats.record('searxng', true);
|
|
@@ -21,7 +21,7 @@ export async function searchViaSearXNG(query, options = {}) {
|
|
|
21
21
|
const baseUrl = process.env.SEARXNG_URL;
|
|
22
22
|
if (!baseUrl)
|
|
23
23
|
return [];
|
|
24
|
-
const { count = 10, signal, timeoutMs =
|
|
24
|
+
const { count = 10, signal, timeoutMs = 15000, engines = '', language = 'en', } = options;
|
|
25
25
|
const controller = new AbortController();
|
|
26
26
|
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
|
|
27
27
|
if (signal)
|
|
@@ -95,7 +95,7 @@ export async function searchViaSearXNG(query, options = {}) {
|
|
|
95
95
|
*/
|
|
96
96
|
export async function isSearXNGHealthy() {
|
|
97
97
|
try {
|
|
98
|
-
const results = await searchViaSearXNG('test', { count: 1, timeoutMs:
|
|
98
|
+
const results = await searchViaSearXNG('test', { count: 1, timeoutMs: 10000 });
|
|
99
99
|
return results.length > 0;
|
|
100
100
|
}
|
|
101
101
|
catch {
|
|
@@ -9,7 +9,8 @@
|
|
|
9
9
|
* Body: ResearchRequest
|
|
10
10
|
*/
|
|
11
11
|
import { Router } from 'express';
|
|
12
|
-
import {
|
|
12
|
+
import { simpleFetch } from '../../core/fetcher.js';
|
|
13
|
+
import { load as cheerioLoad } from 'cheerio';
|
|
13
14
|
import { getSearchProvider } from '../../core/search-provider.js';
|
|
14
15
|
import { callLLM, } from '../../core/llm-provider.js';
|
|
15
16
|
import { sanitizeForLLM, hardenSystemPrompt, validateOutput } from '../../core/prompt-guard.js';
|
|
@@ -295,21 +296,21 @@ export function createResearchRouter() {
|
|
|
295
296
|
break;
|
|
296
297
|
const fetchStart = Date.now();
|
|
297
298
|
try {
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
noEscalate: true, // NEVER launch browser — 512MB container
|
|
302
|
-
timeout: urlTimeout,
|
|
303
|
-
readable: true,
|
|
304
|
-
budget: 3000,
|
|
305
|
-
}),
|
|
299
|
+
// Use simpleFetch + cheerio (no peel/pipeline) — keeps memory under 512MB
|
|
300
|
+
const fetchResult = await Promise.race([
|
|
301
|
+
simpleFetch(url, undefined, urlTimeout),
|
|
306
302
|
new Promise((_, reject) => setTimeout(() => reject(new Error('per-url timeout')), urlTimeout)),
|
|
307
303
|
]);
|
|
308
304
|
const fetchTime = Date.now() - fetchStart;
|
|
309
|
-
|
|
305
|
+
// Extract clean text via cheerio (no Readability.js, no markdown pipeline)
|
|
306
|
+
const $ = cheerioLoad(fetchResult.html || '');
|
|
307
|
+
$('script,style,nav,footer,header,aside,noscript,[aria-hidden]').remove();
|
|
308
|
+
const pageTitle = ($('title').text() || $('h1').first().text() || title).trim().slice(0, 200);
|
|
309
|
+
const rawText = $('main, article, [role=main], body').first().text()
|
|
310
|
+
.replace(/\s+/g, ' ').trim();
|
|
311
|
+
const content = rawText.slice(0, 4000); // ~3000 words max
|
|
310
312
|
const wordCount = content.split(/\s+/).filter(Boolean).length;
|
|
311
|
-
|
|
312
|
-
// Build snippet: prefer LLM-extracted summary, else first 500 chars of content
|
|
313
|
+
// Build snippet: first 500 chars of content
|
|
313
314
|
const sourceSnippet = content.slice(0, 500).replace(/\s+/g, ' ').trim();
|
|
314
315
|
sources.push({
|
|
315
316
|
url,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.39",
|
|
3
|
+
"version": "0.21.41",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|