webpeel 0.21.30 → 0.21.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -17,6 +17,7 @@ import { load } from 'cheerio';
|
|
|
17
17
|
import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
18
|
import { getWebshareProxy, getWebshareProxyUrl } from './proxy-config.js';
|
|
19
19
|
import { createLogger } from './logger.js';
|
|
20
|
+
import { searchViaSearXNG } from './searxng-provider.js';
|
|
20
21
|
const log = createLogger('search');
|
|
21
22
|
function decodeHtmlEntities(input) {
|
|
22
23
|
// Cheerio usually decodes entities when using `.text()`, but keep this as a
|
|
@@ -684,25 +685,37 @@ export class DuckDuckGoProvider {
|
|
|
684
685
|
'Upgrade-Insecure-Requests': '1',
|
|
685
686
|
'Referer': 'https://duckduckgo.com/',
|
|
686
687
|
};
|
|
687
|
-
// Try
|
|
688
|
+
// Try direct first, then proxy as fallback.
|
|
689
|
+
// Webshare backbone IPs are blocked by DDG (returns empty results).
|
|
690
|
+
// Render datacenter IPs work intermittently — direct has better odds.
|
|
688
691
|
let response;
|
|
689
|
-
|
|
692
|
+
let html;
|
|
693
|
+
// let usedProxy = false;
|
|
694
|
+
// Attempt 1: Direct fetch (no proxy)
|
|
695
|
+
try {
|
|
696
|
+
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
|
|
697
|
+
html = response.ok ? await response.text() : '';
|
|
698
|
+
}
|
|
699
|
+
catch (directErr) {
|
|
700
|
+
log.debug('DDG direct fetch failed:', directErr instanceof Error ? directErr.message : directErr);
|
|
701
|
+
html = '';
|
|
702
|
+
}
|
|
703
|
+
// Check if direct returned actual results (not empty/CAPTCHA)
|
|
704
|
+
const hasResults = html.includes('class="result"') || html.includes('class="result ');
|
|
705
|
+
if (!hasResults && proxyUrl) {
|
|
706
|
+
// Attempt 2: Proxy fallback
|
|
707
|
+
log.debug('DDG direct returned no results, trying proxy...');
|
|
690
708
|
try {
|
|
709
|
+
// usedProxy = true;
|
|
691
710
|
const dispatcher = new ProxyAgent(proxyUrl);
|
|
692
711
|
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal, dispatcher });
|
|
712
|
+
if (response.ok)
|
|
713
|
+
html = await response.text();
|
|
693
714
|
}
|
|
694
715
|
catch (proxyErr) {
|
|
695
|
-
log.debug('DDG proxy
|
|
696
|
-
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
|
|
716
|
+
log.debug('DDG proxy also failed:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
|
|
697
717
|
}
|
|
698
718
|
}
|
|
699
|
-
else {
|
|
700
|
-
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
|
|
701
|
-
}
|
|
702
|
-
if (!response.ok) {
|
|
703
|
-
throw new Error(`Search failed: HTTP ${response.status}`);
|
|
704
|
-
}
|
|
705
|
-
const html = await response.text();
|
|
706
719
|
const $ = load(html);
|
|
707
720
|
const results = [];
|
|
708
721
|
const seen = new Set();
|
|
@@ -766,22 +779,25 @@ export class DuckDuckGoProvider {
|
|
|
766
779
|
'Referer': 'https://lite.duckduckgo.com/',
|
|
767
780
|
};
|
|
768
781
|
const liteUrl = `https://lite.duckduckgo.com/lite/?${params.toString()}`;
|
|
769
|
-
|
|
770
|
-
|
|
782
|
+
// Direct first, proxy fallback (same reasoning as searchOnce — Webshare IPs blocked by DDG)
|
|
783
|
+
let html = '';
|
|
784
|
+
try {
|
|
785
|
+
const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
|
|
786
|
+
if (resp.ok)
|
|
787
|
+
html = await resp.text();
|
|
788
|
+
}
|
|
789
|
+
catch { /* direct failed */ }
|
|
790
|
+
if (!html.includes('result-link') && liteProxyUrl) {
|
|
771
791
|
try {
|
|
772
792
|
const dispatcher = new ProxyAgent(liteProxyUrl);
|
|
773
|
-
|
|
793
|
+
const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal, dispatcher });
|
|
794
|
+
if (resp.ok)
|
|
795
|
+
html = await resp.text();
|
|
774
796
|
}
|
|
775
|
-
catch {
|
|
776
|
-
response = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
|
|
777
|
-
}
|
|
778
|
-
}
|
|
779
|
-
else {
|
|
780
|
-
response = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
|
|
797
|
+
catch { /* proxy also failed */ }
|
|
781
798
|
}
|
|
782
|
-
if (!
|
|
799
|
+
if (!html)
|
|
783
800
|
return [];
|
|
784
|
-
const html = await response.text();
|
|
785
801
|
const $ = load(html);
|
|
786
802
|
const results = [];
|
|
787
803
|
const seen = new Set();
|
|
@@ -1035,6 +1051,33 @@ export class DuckDuckGoProvider {
|
|
|
1035
1051
|
async searchWeb(query, options) {
|
|
1036
1052
|
const attempts = this.buildQueryAttempts(query);
|
|
1037
1053
|
// -----------------------------------------------------------
|
|
1054
|
+
// Stage 0: SearXNG (self-hosted, residential IP — highest reliability)
|
|
1055
|
+
// Uses Mac Mini running SearXNG exposed via Cloudflare Tunnel.
|
|
1056
|
+
// Aggregates Google, Bing, Brave, Startpage — 30-40 results typical.
|
|
1057
|
+
// Env: SEARXNG_URL=https://search.webpeel.dev
|
|
1058
|
+
// -----------------------------------------------------------
|
|
1059
|
+
if (process.env.SEARXNG_URL) {
|
|
1060
|
+
try {
|
|
1061
|
+
const searxResults = await searchViaSearXNG(query, {
|
|
1062
|
+
count: options.count ?? 10,
|
|
1063
|
+
signal: options.signal,
|
|
1064
|
+
timeoutMs: 6000,
|
|
1065
|
+
});
|
|
1066
|
+
if (searxResults.length > 0) {
|
|
1067
|
+
providerStats.record('searxng', true);
|
|
1068
|
+
log.debug(`source=searxng returned ${searxResults.length} results`);
|
|
1069
|
+
const filtered = filterRelevantResults(searxResults, query);
|
|
1070
|
+
return filtered.length > 0 ? filtered : searxResults;
|
|
1071
|
+
}
|
|
1072
|
+
providerStats.record('searxng', false);
|
|
1073
|
+
log.debug('SearXNG returned 0 results, falling through to DDG');
|
|
1074
|
+
}
|
|
1075
|
+
catch (e) {
|
|
1076
|
+
providerStats.record('searxng', false);
|
|
1077
|
+
log.debug('SearXNG failed:', e instanceof Error ? e.message : e);
|
|
1078
|
+
}
|
|
1079
|
+
}
|
|
1080
|
+
// -----------------------------------------------------------
|
|
1038
1081
|
// Stage 1: DDG HTTP
|
|
1039
1082
|
// Skip entirely if the source has a ≥80% failure rate over the
|
|
1040
1083
|
// last 10 attempts. When elevated-but-not-skipped, cap the per-
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearXNG Search Provider
|
|
3
|
+
*
|
|
4
|
+
* Connects to a self-hosted SearXNG instance (running on Mac Mini with residential IP,
|
|
5
|
+
* exposed via Cloudflare Tunnel). SearXNG aggregates Google, Bing, Brave, Startpage, etc.
|
|
6
|
+
* and is not rate-limited or blocked since it runs on a residential IP.
|
|
7
|
+
*
|
|
8
|
+
* Config (env vars):
|
|
9
|
+
* SEARXNG_URL — Base URL of SearXNG instance (e.g. https://search.webpeel.dev)
|
|
10
|
+
*
|
|
11
|
+
* Falls back gracefully if SEARXNG_URL is not set or instance is unreachable.
|
|
12
|
+
*/
|
|
13
|
+
export interface SearXNGSearchResult {
|
|
14
|
+
title: string;
|
|
15
|
+
url: string;
|
|
16
|
+
description?: string;
|
|
17
|
+
publishedDate?: string;
|
|
18
|
+
score?: number;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Fetches search results from a SearXNG instance.
|
|
22
|
+
* Returns results compatible with WebSearchResult interface in search-provider.ts.
|
|
23
|
+
*/
|
|
24
|
+
export declare function searchViaSearXNG(query: string, options?: {
|
|
25
|
+
count?: number;
|
|
26
|
+
signal?: AbortSignal;
|
|
27
|
+
timeoutMs?: number;
|
|
28
|
+
engines?: string;
|
|
29
|
+
language?: string;
|
|
30
|
+
}): Promise<SearXNGSearchResult[]>;
|
|
31
|
+
/**
|
|
32
|
+
* Quick health check — true if SearXNG is reachable and returning results.
|
|
33
|
+
*/
|
|
34
|
+
export declare function isSearXNGHealthy(): Promise<boolean>;
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SearXNG Search Provider
|
|
3
|
+
*
|
|
4
|
+
* Connects to a self-hosted SearXNG instance (running on Mac Mini with residential IP,
|
|
5
|
+
* exposed via Cloudflare Tunnel). SearXNG aggregates Google, Bing, Brave, Startpage, etc.
|
|
6
|
+
* and is not rate-limited or blocked since it runs on a residential IP.
|
|
7
|
+
*
|
|
8
|
+
* Config (env vars):
|
|
9
|
+
* SEARXNG_URL — Base URL of SearXNG instance (e.g. https://search.webpeel.dev)
|
|
10
|
+
*
|
|
11
|
+
* Falls back gracefully if SEARXNG_URL is not set or instance is unreachable.
|
|
12
|
+
*/
|
|
13
|
+
import { fetch as undiciFetch } from 'undici';
|
|
14
|
+
import { createLogger } from './logger.js';
|
|
15
|
+
const log = createLogger('searxng');
|
|
16
|
+
/**
|
|
17
|
+
* Fetches search results from a SearXNG instance.
|
|
18
|
+
* Returns results compatible with WebSearchResult interface in search-provider.ts.
|
|
19
|
+
*/
|
|
20
|
+
export async function searchViaSearXNG(query, options = {}) {
|
|
21
|
+
const baseUrl = process.env.SEARXNG_URL;
|
|
22
|
+
if (!baseUrl)
|
|
23
|
+
return [];
|
|
24
|
+
const { count = 10, signal, timeoutMs = 8000, engines = '', language = 'en', } = options;
|
|
25
|
+
const controller = new AbortController();
|
|
26
|
+
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
|
|
27
|
+
if (signal)
|
|
28
|
+
signal.addEventListener('abort', () => controller.abort());
|
|
29
|
+
try {
|
|
30
|
+
const params = new URLSearchParams({
|
|
31
|
+
q: query,
|
|
32
|
+
format: 'json',
|
|
33
|
+
language,
|
|
34
|
+
safesearch: '0',
|
|
35
|
+
categories: 'general',
|
|
36
|
+
});
|
|
37
|
+
if (engines)
|
|
38
|
+
params.set('engines', engines);
|
|
39
|
+
const url = `${baseUrl.replace(/\/$/, '')}/search?${params.toString()}`;
|
|
40
|
+
const response = await undiciFetch(url, {
|
|
41
|
+
signal: controller.signal,
|
|
42
|
+
headers: {
|
|
43
|
+
'Accept': 'application/json',
|
|
44
|
+
'User-Agent': 'WebPeel/1.0 (internal search aggregator)',
|
|
45
|
+
},
|
|
46
|
+
});
|
|
47
|
+
if (!response.ok) {
|
|
48
|
+
log.debug(`HTTP ${response.status}`);
|
|
49
|
+
return [];
|
|
50
|
+
}
|
|
51
|
+
const data = (await response.json());
|
|
52
|
+
const results = data?.results ?? [];
|
|
53
|
+
if (results.length === 0) {
|
|
54
|
+
log.debug('0 results returned');
|
|
55
|
+
return [];
|
|
56
|
+
}
|
|
57
|
+
const seen = new Set();
|
|
58
|
+
const output = [];
|
|
59
|
+
for (const r of results) {
|
|
60
|
+
if (!r.url || !r.title)
|
|
61
|
+
continue;
|
|
62
|
+
const normalized = r.url.replace(/\/$/, '').toLowerCase();
|
|
63
|
+
if (seen.has(normalized))
|
|
64
|
+
continue;
|
|
65
|
+
seen.add(normalized);
|
|
66
|
+
output.push({
|
|
67
|
+
title: r.title,
|
|
68
|
+
url: r.url,
|
|
69
|
+
description: r.content ?? undefined,
|
|
70
|
+
publishedDate: r.publishedDate ?? undefined,
|
|
71
|
+
score: r.score ?? undefined,
|
|
72
|
+
});
|
|
73
|
+
if (output.length >= count)
|
|
74
|
+
break;
|
|
75
|
+
}
|
|
76
|
+
log.debug(`${output.length} results for "${query.substring(0, 40)}"`);
|
|
77
|
+
return output;
|
|
78
|
+
}
|
|
79
|
+
catch (e) {
|
|
80
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
81
|
+
if (msg.includes('abort') || msg.includes('timeout') || msg.includes('AbortError')) {
|
|
82
|
+
log.debug(`timed out after ${timeoutMs}ms`);
|
|
83
|
+
}
|
|
84
|
+
else {
|
|
85
|
+
log.debug('fetch error:', msg);
|
|
86
|
+
}
|
|
87
|
+
return [];
|
|
88
|
+
}
|
|
89
|
+
finally {
|
|
90
|
+
clearTimeout(timeoutId);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Quick health check — true if SearXNG is reachable and returning results.
|
|
95
|
+
*/
|
|
96
|
+
export async function isSearXNGHealthy() {
|
|
97
|
+
try {
|
|
98
|
+
const results = await searchViaSearXNG('test', { count: 1, timeoutMs: 5000 });
|
|
99
|
+
return results.length > 0;
|
|
100
|
+
}
|
|
101
|
+
catch {
|
|
102
|
+
return false;
|
|
103
|
+
}
|
|
104
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.30",
|
|
3
|
+
"version": "0.21.32",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|