webpeel 0.21.30 → 0.21.32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@ import { load } from 'cheerio';
17
17
  import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
18
18
  import { getWebshareProxy, getWebshareProxyUrl } from './proxy-config.js';
19
19
  import { createLogger } from './logger.js';
20
+ import { searchViaSearXNG } from './searxng-provider.js';
20
21
  const log = createLogger('search');
21
22
  function decodeHtmlEntities(input) {
22
23
  // Cheerio usually decodes entities when using `.text()`, but keep this as a
@@ -684,25 +685,37 @@ export class DuckDuckGoProvider {
684
685
  'Upgrade-Insecure-Requests': '1',
685
686
  'Referer': 'https://duckduckgo.com/',
686
687
  };
687
- // Try with proxy first (bypasses datacenter IP blocks), fall back to direct
688
+ // Try direct first, then proxy as fallback.
689
+ // Webshare backbone IPs are blocked by DDG (returns empty results).
690
+ // Render datacenter IPs work intermittently — direct has better odds.
688
691
  let response;
689
- if (proxyUrl) {
692
+ let html;
693
+ // let usedProxy = false;
694
+ // Attempt 1: Direct fetch (no proxy)
695
+ try {
696
+ response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
697
+ html = response.ok ? await response.text() : '';
698
+ }
699
+ catch (directErr) {
700
+ log.debug('DDG direct fetch failed:', directErr instanceof Error ? directErr.message : directErr);
701
+ html = '';
702
+ }
703
+ // Check if direct returned actual results (not empty/CAPTCHA)
704
+ const hasResults = html.includes('class="result"') || html.includes('class="result ');
705
+ if (!hasResults && proxyUrl) {
706
+ // Attempt 2: Proxy fallback
707
+ log.debug('DDG direct returned no results, trying proxy...');
690
708
  try {
709
+ // usedProxy = true;
691
710
  const dispatcher = new ProxyAgent(proxyUrl);
692
711
  response = await undiciFetch(searchUrl, { headers: baseHeaders, signal, dispatcher });
712
+ if (response.ok)
713
+ html = await response.text();
693
714
  }
694
715
  catch (proxyErr) {
695
- log.debug('DDG proxy fetch failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
696
- response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
716
+ log.debug('DDG proxy also failed:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
697
717
  }
698
718
  }
699
- else {
700
- response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
701
- }
702
- if (!response.ok) {
703
- throw new Error(`Search failed: HTTP ${response.status}`);
704
- }
705
- const html = await response.text();
706
719
  const $ = load(html);
707
720
  const results = [];
708
721
  const seen = new Set();
@@ -766,22 +779,25 @@ export class DuckDuckGoProvider {
766
779
  'Referer': 'https://lite.duckduckgo.com/',
767
780
  };
768
781
  const liteUrl = `https://lite.duckduckgo.com/lite/?${params.toString()}`;
769
- let response;
770
- if (liteProxyUrl) {
782
+ // Direct first, proxy fallback (same reasoning as searchOnce — Webshare IPs blocked by DDG)
783
+ let html = '';
784
+ try {
785
+ const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
786
+ if (resp.ok)
787
+ html = await resp.text();
788
+ }
789
+ catch { /* direct failed */ }
790
+ if (!html.includes('result-link') && liteProxyUrl) {
771
791
  try {
772
792
  const dispatcher = new ProxyAgent(liteProxyUrl);
773
- response = await undiciFetch(liteUrl, { headers: liteHeaders, signal, dispatcher });
793
+ const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal, dispatcher });
794
+ if (resp.ok)
795
+ html = await resp.text();
774
796
  }
775
- catch {
776
- response = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
777
- }
778
- }
779
- else {
780
- response = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
797
+ catch { /* proxy also failed */ }
781
798
  }
782
- if (!response.ok)
799
+ if (!html)
783
800
  return [];
784
- const html = await response.text();
785
801
  const $ = load(html);
786
802
  const results = [];
787
803
  const seen = new Set();
@@ -1035,6 +1051,33 @@ export class DuckDuckGoProvider {
1035
1051
  async searchWeb(query, options) {
1036
1052
  const attempts = this.buildQueryAttempts(query);
1037
1053
  // -----------------------------------------------------------
1054
+ // Stage 0: SearXNG (self-hosted, residential IP — highest reliability)
1055
+ // Uses Mac Mini running SearXNG exposed via Cloudflare Tunnel.
1056
+ // Aggregates Google, Bing, Brave, Startpage — 30-40 results typical.
1057
+ // Env: SEARXNG_URL=https://search.webpeel.dev
1058
+ // -----------------------------------------------------------
1059
+ if (process.env.SEARXNG_URL) {
1060
+ try {
1061
+ const searxResults = await searchViaSearXNG(query, {
1062
+ count: options.count ?? 10,
1063
+ signal: options.signal,
1064
+ timeoutMs: 6000,
1065
+ });
1066
+ if (searxResults.length > 0) {
1067
+ providerStats.record('searxng', true);
1068
+ log.debug(`source=searxng returned ${searxResults.length} results`);
1069
+ const filtered = filterRelevantResults(searxResults, query);
1070
+ return filtered.length > 0 ? filtered : searxResults;
1071
+ }
1072
+ providerStats.record('searxng', false);
1073
+ log.debug('SearXNG returned 0 results, falling through to DDG');
1074
+ }
1075
+ catch (e) {
1076
+ providerStats.record('searxng', false);
1077
+ log.debug('SearXNG failed:', e instanceof Error ? e.message : e);
1078
+ }
1079
+ }
1080
+ // -----------------------------------------------------------
1038
1081
  // Stage 1: DDG HTTP
1039
1082
  // Skip entirely if the source has a ≥80% failure rate over the
1040
1083
  // last 10 attempts. When elevated-but-not-skipped, cap the per-
@@ -0,0 +1,34 @@
1
+ /**
2
+ * SearXNG Search Provider
3
+ *
4
+ * Connects to a self-hosted SearXNG instance (running on Mac Mini with residential IP,
5
+ * exposed via Cloudflare Tunnel). SearXNG aggregates Google, Bing, Brave, Startpage, etc.
6
+ * and is not rate-limited or blocked since it runs on a residential IP.
7
+ *
8
+ * Config (env vars):
9
+ * SEARXNG_URL — Base URL of SearXNG instance (e.g. https://search.webpeel.dev)
10
+ *
11
+ * Falls back gracefully if SEARXNG_URL is not set or instance is unreachable.
12
+ */
13
// One search hit in the shape that searchViaSearXNG resolves with.
+ export interface SearXNGSearchResult {
14
// Result title; the provider skips raw entries that have no title.
+ title: string;
15
// Result URL, passed through as received (only the dedupe key is normalized).
+ url: string;
16
// Snippet text, copied from the raw result's `content` field when present.
+ description?: string;
17
// Copied from the raw result's `publishedDate` field when present.
+ publishedDate?: string;
18
// Copied from the raw result's `score` field when present.
+ score?: number;
19
+ }
20
+ /**
21
+ * Fetches search results from a SearXNG instance.
22
+ * Returns results compatible with WebSearchResult interface in search-provider.ts.
 *
 * Resolves to an empty array (rather than rejecting) when SEARXNG_URL is unset,
 * when the HTTP response is not OK, when no results come back, or when the
 * request fails or is aborted.
 *
 * @param query   Search text, sent as the `q` query parameter.
 * @param options Optional settings:
 *   - count:     maximum number of results returned (default 10).
 *   - signal:    caller abort signal; its `abort` event cancels the request.
 *   - timeoutMs: request timeout in milliseconds (default 8000).
 *   - engines:   value for the `engines` query parameter; omitted when empty (default '').
 *   - language:  value for the `language` query parameter (default 'en').
 * @returns De-duplicated results (URL compared case-insensitively, ignoring one trailing slash).
23
+ */
24
+ export declare function searchViaSearXNG(query: string, options?: {
25
+ count?: number;
26
+ signal?: AbortSignal;
27
+ timeoutMs?: number;
28
+ engines?: string;
29
+ language?: string;
30
+ }): Promise<SearXNGSearchResult[]>;
31
+ /**
32
+ * Quick health check — true if SearXNG is reachable and returning results.
 *
 * Runs the probe query 'test' through searchViaSearXNG with count 1 and a
 * 5000 ms timeout. Resolves to false (never rejects) when the probe yields
 * no results or throws.
33
+ */
34
+ export declare function isSearXNGHealthy(): Promise<boolean>;
@@ -0,0 +1,104 @@
1
+ /**
2
+ * SearXNG Search Provider
3
+ *
4
+ * Connects to a self-hosted SearXNG instance (running on Mac Mini with residential IP,
5
+ * exposed via Cloudflare Tunnel). SearXNG aggregates Google, Bing, Brave, Startpage, etc.
6
+ * and is not rate-limited or blocked since it runs on a residential IP.
7
+ *
8
+ * Config (env vars):
9
+ * SEARXNG_URL — Base URL of SearXNG instance (e.g. https://search.webpeel.dev)
10
+ *
11
+ * Falls back gracefully if SEARXNG_URL is not set or instance is unreachable.
12
+ */
13
+ import { fetch as undiciFetch } from 'undici';
14
+ import { createLogger } from './logger.js';
15
+ const log = createLogger('searxng');
16
/**
 * Fetches search results from a SearXNG instance.
 * Returns results compatible with WebSearchResult interface in search-provider.ts.
 *
 * Best-effort by design: every failure path (missing SEARXNG_URL, non-OK HTTP
 * status, malformed payload, network error, timeout, caller abort) is logged at
 * debug level and resolves to an empty array so the caller can fall through to
 * another provider.
 *
 * @param {string} query - Search text, sent as the `q` query parameter.
 * @param {object} [options]
 * @param {number} [options.count=10] - Maximum number of results to return.
 * @param {AbortSignal} [options.signal] - Caller signal; aborting it cancels the request.
 * @param {number} [options.timeoutMs=8000] - Request timeout in milliseconds.
 * @param {string} [options.engines=''] - Value for the `engines` parameter; omitted when empty.
 * @param {string} [options.language='en'] - Value for the `language` parameter.
 * @returns {Promise<Array<{title: string, url: string, description?: string, publishedDate?: string, score?: number}>>}
 *   De-duplicated results, at most `count` entries.
 */
export async function searchViaSearXNG(query, options = {}) {
    const baseUrl = process.env.SEARXNG_URL;
    if (!baseUrl)
        return [];
    const { count = 10, signal, timeoutMs = 8000, engines = '', language = 'en', } = options;
    // An already-aborted signal never dispatches 'abort' again, so a listener
    // alone would miss it and the request would still be sent.
    if (signal?.aborted) {
        log.debug('aborted by caller');
        return [];
    }
    const controller = new AbortController();
    // Tracks whether the abort came from our own timer, so the log message
    // does not blame a timeout for a caller-initiated cancellation.
    let timedOut = false;
    const timeoutId = setTimeout(() => {
        timedOut = true;
        controller.abort();
    }, timeoutMs);
    const onCallerAbort = () => controller.abort();
    signal?.addEventListener('abort', onCallerAbort, { once: true });
    try {
        const params = new URLSearchParams({
            q: query,
            format: 'json',
            language,
            safesearch: '0',
            categories: 'general',
        });
        if (engines)
            params.set('engines', engines);
        const url = `${baseUrl.replace(/\/$/, '')}/search?${params.toString()}`;
        const response = await undiciFetch(url, {
            signal: controller.signal,
            headers: {
                'Accept': 'application/json',
                'User-Agent': 'WebPeel/1.0 (internal search aggregator)',
            },
        });
        if (!response.ok) {
            log.debug(`HTTP ${response.status}`);
            return [];
        }
        const data = (await response.json());
        // Guard against a payload whose `results` is missing or not a list.
        const results = Array.isArray(data?.results) ? data.results : [];
        if (results.length === 0) {
            log.debug('0 results returned');
            return [];
        }
        const seen = new Set();
        const output = [];
        for (const r of results) {
            // Check the limit before pushing so that count <= 0 yields no results.
            if (output.length >= count)
                break;
            // Skip malformed entries individually instead of letting one bad
            // entry throw and discard every valid result.
            if (typeof r?.url !== 'string' || typeof r?.title !== 'string' || !r.url || !r.title)
                continue;
            // Dedupe key only: ignore one trailing slash and letter case.
            const normalized = r.url.replace(/\/$/, '').toLowerCase();
            if (seen.has(normalized))
                continue;
            seen.add(normalized);
            output.push({
                title: r.title,
                url: r.url,
                description: r.content ?? undefined,
                publishedDate: r.publishedDate ?? undefined,
                score: r.score ?? undefined,
            });
        }
        log.debug(`${output.length} results for "${query.substring(0, 40)}"`);
        return output;
    }
    catch (e) {
        const msg = e instanceof Error ? e.message : String(e);
        if (timedOut) {
            log.debug(`timed out after ${timeoutMs}ms`);
        }
        else if (controller.signal.aborted) {
            log.debug('aborted by caller');
        }
        else {
            log.debug('fetch error:', msg);
        }
        return [];
    }
    finally {
        clearTimeout(timeoutId);
        // Detach from the caller's signal so long-lived signals do not
        // accumulate one listener per search.
        signal?.removeEventListener('abort', onCallerAbort);
    }
}
93
/**
 * Liveness probe for the configured SearXNG instance.
 *
 * Sends a single throwaway search and reports whether anything came back.
 *
 * @returns {Promise<boolean>} `true` when the probe search yields at least one
 *   result; `false` when it yields none or anything throws along the way.
 */
export async function isSearXNGHealthy() {
    const probeOptions = { count: 1, timeoutMs: 5000 };
    try {
        const { length: hitCount } = await searchViaSearXNG('test', probeOptions);
        return hitCount > 0;
    }
    catch {
        return false;
    }
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.30",
3
+ "version": "0.21.32",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",