webpeel 0.21.31 → 0.21.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -17,6 +17,7 @@ import { load } from 'cheerio';
|
|
|
17
17
|
import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
18
|
import { getWebshareProxy, getWebshareProxyUrl } from './proxy-config.js';
|
|
19
19
|
import { createLogger } from './logger.js';
|
|
20
|
+
import { searchViaSearXNG } from './searxng-provider.js';
|
|
20
21
|
const log = createLogger('search');
|
|
21
22
|
function decodeHtmlEntities(input) {
|
|
22
23
|
// Cheerio usually decodes entities when using `.text()`, but keep this as a
|
|
@@ -1050,6 +1051,33 @@ export class DuckDuckGoProvider {
|
|
|
1050
1051
|
async searchWeb(query, options) {
|
|
1051
1052
|
const attempts = this.buildQueryAttempts(query);
|
|
1052
1053
|
// -----------------------------------------------------------
|
|
1054
|
+
// Stage 0: SearXNG (self-hosted, residential IP — highest reliability)
|
|
1055
|
+
// Uses Mac Mini running SearXNG exposed via Cloudflare Tunnel.
|
|
1056
|
+
// Aggregates Google, Bing, Brave, Startpage — 30-40 results typical.
|
|
1057
|
+
// Env: SEARXNG_URL=https://search.webpeel.dev
|
|
1058
|
+
// -----------------------------------------------------------
|
|
1059
|
+
if (process.env.SEARXNG_URL) {
|
|
1060
|
+
try {
|
|
1061
|
+
const searxResults = await searchViaSearXNG(query, {
|
|
1062
|
+
count: options.count ?? 10,
|
|
1063
|
+
signal: options.signal,
|
|
1064
|
+
timeoutMs: 6000,
|
|
1065
|
+
});
|
|
1066
|
+
if (searxResults.length > 0) {
|
|
1067
|
+
providerStats.record('searxng', true);
|
|
1068
|
+
log.debug(`source=searxng returned ${searxResults.length} results`);
|
|
1069
|
+
const filtered = filterRelevantResults(searxResults, query);
|
|
1070
|
+
return filtered.length > 0 ? filtered : searxResults;
|
|
1071
|
+
}
|
|
1072
|
+
providerStats.record('searxng', false);
|
|
1073
|
+
log.debug('SearXNG returned 0 results, falling through to DDG');
|
|
1074
|
+
}
|
|
1075
|
+
catch (e) {
|
|
1076
|
+
providerStats.record('searxng', false);
|
|
1077
|
+
log.debug('SearXNG failed:', e instanceof Error ? e.message : e);
|
|
1078
|
+
}
|
|
1079
|
+
}
|
|
1080
|
+
// -----------------------------------------------------------
|
|
1053
1081
|
// Stage 1: DDG HTTP
|
|
1054
1082
|
// Skip entirely if the source has a ≥80% failure rate over the
|
|
1055
1083
|
// last 10 attempts. When elevated-but-not-skipped, cap the per-
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearXNG Search Provider
|
|
3
|
+
*
|
|
4
|
+
* Connects to a self-hosted SearXNG instance (running on Mac Mini with residential IP,
|
|
5
|
+
* exposed via Cloudflare Tunnel). SearXNG aggregates Google, Bing, Brave, Startpage, etc.
|
|
6
|
+
* and is not rate-limited or blocked since it runs on a residential IP.
|
|
7
|
+
*
|
|
8
|
+
* Config (env vars):
|
|
9
|
+
* SEARXNG_URL — Base URL of SearXNG instance (e.g. https://search.webpeel.dev)
|
|
10
|
+
*
|
|
11
|
+
* Falls back gracefully if SEARXNG_URL is not set or instance is unreachable.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * One search hit as returned by searchViaSearXNG, built from a single entry of
 * the SearXNG JSON response.
 */
export interface SearXNGSearchResult {
|
|
14
|
+
/** Title of the hit, copied from the SearXNG entry's `title` field. */
title: string;
|
|
15
|
+
/** URL of the hit exactly as SearXNG reported it (the lower-cased form is only used internally for de-duplication). */
url: string;
|
|
16
|
+
/** Snippet text, taken from the SearXNG entry's `content` field; absent when SearXNG sent none. */
description?: string;
|
|
17
|
+
/** Passed through unchanged from the entry's `publishedDate` field. NOTE(review): the string format is whatever SearXNG emits — confirm before parsing. */
publishedDate?: string;
|
|
18
|
+
/** Passed through unchanged from the entry's `score` field. NOTE(review): presumably SearXNG's ranking score — scale not established here. */
score?: number;
|
|
19
|
+
}
|
|
20
|
+
/**
 * Fetches search results from a SearXNG instance.
 * Returns results compatible with WebSearchResult interface in search-provider.ts.
 *
 * The instance base URL is read from the SEARXNG_URL environment variable. When it
 * is unset, when the instance answers with a non-2xx status, or when the request
 * fails or times out, the promise resolves to an empty array instead of rejecting.
 * Results are de-duplicated by URL (case-insensitive, ignoring one trailing slash).
 *
 * @param query   Search terms, sent as the `q` query parameter.
 * @param options Optional settings:
 *   - `count`     upper bound on the number of results returned (default 10);
 *   - `signal`    caller-owned AbortSignal; aborting it cancels the request;
 *   - `timeoutMs` request deadline in milliseconds (default 8000);
 *   - `engines`   value for the `engines` query parameter, omitted when empty;
 *   - `language`  value for the `language` query parameter (default 'en').
 * @returns The collected hits, possibly empty.
 */
|
|
24
|
+
export declare function searchViaSearXNG(query: string, options?: {
|
|
25
|
+
count?: number;
|
|
26
|
+
signal?: AbortSignal;
|
|
27
|
+
timeoutMs?: number;
|
|
28
|
+
engines?: string;
|
|
29
|
+
language?: string;
|
|
30
|
+
}): Promise<SearXNGSearchResult[]>;
|
|
31
|
+
/**
 * Quick health check — true if SearXNG is reachable and returning results.
 *
 * Implemented as a real search for the query 'test', limited to one result with a
 * 5000 ms timeout. Resolves to false when that search yields no results or fails.
 */
|
|
34
|
+
export declare function isSearXNGHealthy(): Promise<boolean>;
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearXNG Search Provider
|
|
3
|
+
*
|
|
4
|
+
* Connects to a self-hosted SearXNG instance (running on Mac Mini with residential IP,
|
|
5
|
+
* exposed via Cloudflare Tunnel). SearXNG aggregates Google, Bing, Brave, Startpage, etc.
|
|
6
|
+
* and is not rate-limited or blocked since it runs on a residential IP.
|
|
7
|
+
*
|
|
8
|
+
* Config (env vars):
|
|
9
|
+
* SEARXNG_URL — Base URL of SearXNG instance (e.g. https://search.webpeel.dev)
|
|
10
|
+
*
|
|
11
|
+
* Falls back gracefully if SEARXNG_URL is not set or instance is unreachable.
|
|
12
|
+
*/
|
|
13
|
+
import { fetch as undiciFetch } from 'undici';
|
|
14
|
+
import { createLogger } from './logger.js';
|
|
15
|
+
const log = createLogger('searxng');
|
|
16
|
+
/**
 * Fetches search results from the SearXNG instance configured in SEARXNG_URL.
 *
 * Results are de-duplicated by URL (case-insensitive, ignoring one trailing slash)
 * and truncated to `count` entries. Expected failures — SEARXNG_URL unset, non-2xx
 * status, timeout, caller abort, network error, unparsable body — are logged at
 * debug level and resolve to an empty array rather than rejecting.
 *
 * @param query   Search terms, sent as the `q` query parameter.
 * @param options Optional settings (may be omitted, undefined or null):
 *   - count     Maximum number of results to return (default 10; <= 0 yields none).
 *   - signal    Caller-owned AbortSignal; aborting it cancels the request.
 *   - timeoutMs Request deadline in milliseconds (default 8000).
 *   - engines   Value for the `engines` query parameter; omitted when empty.
 *   - language  Value for the `language` query parameter (default 'en').
 * @returns Array of `{ title, url, description, publishedDate, score }` objects.
 */
export async function searchViaSearXNG(query, options) {
    const baseUrl = process.env.SEARXNG_URL;
    if (!baseUrl)
        return [];
    const { count = 10, signal, timeoutMs = 8000, engines = '', language = 'en', } = options ?? {};
    // An already-aborted signal never fires 'abort' again, so a listener alone
    // would miss it and the request would run although the caller cancelled.
    if (signal?.aborted) {
        log.debug('aborted by caller before request');
        return [];
    }
    const controller = new AbortController();
    // Remember *why* we aborted instead of guessing from the error message.
    let timedOut = false;
    const timeoutId = setTimeout(() => {
        timedOut = true;
        controller.abort();
    }, timeoutMs);
    const onCallerAbort = () => controller.abort();
    signal?.addEventListener('abort', onCallerAbort, { once: true });
    try {
        const params = new URLSearchParams({
            q: query,
            format: 'json',
            language,
            safesearch: '0',
            categories: 'general',
        });
        if (engines)
            params.set('engines', engines);
        const url = `${baseUrl.replace(/\/$/, '')}/search?${params.toString()}`;
        const response = await undiciFetch(url, {
            signal: controller.signal,
            headers: {
                'Accept': 'application/json',
                'User-Agent': 'WebPeel/1.0 (internal search aggregator)',
            },
        });
        if (!response.ok) {
            log.debug(`HTTP ${response.status}`);
            return [];
        }
        const data = (await response.json());
        // Guard against a payload whose `results` is missing or not a list.
        const results = Array.isArray(data?.results) ? data.results : [];
        if (results.length === 0) {
            log.debug('0 results returned');
            return [];
        }
        const seen = new Set();
        const output = [];
        for (const r of results) {
            // Check the limit before pushing so that count <= 0 returns nothing.
            if (output.length >= count)
                break;
            // Skip malformed entries instead of letting one bad row throw and
            // discard every good result that came with it.
            if (typeof r?.url !== 'string' || !r.url || !r.title)
                continue;
            const normalized = r.url.replace(/\/$/, '').toLowerCase();
            if (seen.has(normalized))
                continue;
            seen.add(normalized);
            output.push({
                title: r.title,
                url: r.url,
                description: r.content ?? undefined,
                publishedDate: r.publishedDate ?? undefined,
                score: r.score ?? undefined,
            });
        }
        log.debug(`${output.length} results for "${query.substring(0, 40)}"`);
        return output;
    }
    catch (e) {
        const msg = e instanceof Error ? e.message : String(e);
        if (timedOut) {
            log.debug(`timed out after ${timeoutMs}ms`);
        }
        else if (controller.signal.aborted) {
            log.debug('aborted by caller');
        }
        else {
            log.debug('fetch error:', msg);
        }
        return [];
    }
    finally {
        clearTimeout(timeoutId);
        // Detach from the caller's signal so long-lived signals do not
        // accumulate one listener per search.
        signal?.removeEventListener('abort', onCallerAbort);
    }
}
|
|
93
|
+
/**
 * Probes the SearXNG instance with a real search.
 *
 * Issues the query 'test' for a single result with a 5000 ms timeout and
 * reports whether at least one result came back.
 *
 * @returns true when the probe search returned results; false when it returned
 *          none or threw.
 */
export async function isSearXNGHealthy() {
    let healthy = false;
    try {
        const probeHits = await searchViaSearXNG('test', { timeoutMs: 5000, count: 1 });
        healthy = probeHits.length > 0;
    }
    catch {
        // Any failure of the probe simply means "not healthy".
    }
    return healthy;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.31",
|
|
3
|
+
"version": "0.21.32",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|