webpeel 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +11 -657
- package/README.md +246 -325
- package/dist/cli.js +330 -73
- package/dist/cli.js.map +1 -1
- package/dist/core/browser-fetch.d.ts +12 -0
- package/dist/core/browser-fetch.d.ts.map +1 -1
- package/dist/core/browser-fetch.js +70 -17
- package/dist/core/browser-fetch.js.map +1 -1
- package/dist/core/cf-worker-proxy.d.ts +33 -0
- package/dist/core/cf-worker-proxy.d.ts.map +1 -0
- package/dist/core/cf-worker-proxy.js +88 -0
- package/dist/core/cf-worker-proxy.js.map +1 -0
- package/dist/core/chunker.d.ts +47 -0
- package/dist/core/chunker.d.ts.map +1 -0
- package/dist/core/chunker.js +250 -0
- package/dist/core/chunker.js.map +1 -0
- package/dist/core/cloak-fetch.d.ts +43 -0
- package/dist/core/cloak-fetch.d.ts.map +1 -0
- package/dist/core/cloak-fetch.js +141 -0
- package/dist/core/cloak-fetch.js.map +1 -0
- package/dist/core/crawl-checkpoint.d.ts +55 -0
- package/dist/core/crawl-checkpoint.d.ts.map +1 -0
- package/dist/core/crawl-checkpoint.js +105 -0
- package/dist/core/crawl-checkpoint.js.map +1 -0
- package/dist/core/crawler.d.ts +5 -1
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +60 -5
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/cycle-fetch.d.ts +27 -0
- package/dist/core/cycle-fetch.d.ts.map +1 -0
- package/dist/core/cycle-fetch.js +99 -0
- package/dist/core/cycle-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts.map +1 -1
- package/dist/core/domain-extractors.js +754 -14
- package/dist/core/domain-extractors.js.map +1 -1
- package/dist/core/google-cache.d.ts +30 -0
- package/dist/core/google-cache.d.ts.map +1 -0
- package/dist/core/google-cache.js +181 -0
- package/dist/core/google-cache.js.map +1 -0
- package/dist/core/markdown.d.ts +11 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +43 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/peel-tls.d.ts +26 -0
- package/dist/core/peel-tls.d.ts.map +1 -0
- package/dist/core/peel-tls.js +221 -0
- package/dist/core/peel-tls.js.map +1 -0
- package/dist/core/pipeline.d.ts +5 -1
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +269 -21
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/schema-postprocess.d.ts +33 -0
- package/dist/core/schema-postprocess.d.ts.map +1 -0
- package/dist/core/schema-postprocess.js +470 -0
- package/dist/core/schema-postprocess.js.map +1 -0
- package/dist/core/schema-templates.d.ts +20 -0
- package/dist/core/schema-templates.d.ts.map +1 -0
- package/dist/core/schema-templates.js +131 -0
- package/dist/core/schema-templates.js.map +1 -0
- package/dist/core/search-fallback.d.ts +28 -0
- package/dist/core/search-fallback.d.ts.map +1 -0
- package/dist/core/search-fallback.js +185 -0
- package/dist/core/search-fallback.js.map +1 -0
- package/dist/core/search-provider.d.ts +47 -4
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +278 -7
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/stealth-patches.d.ts +58 -0
- package/dist/core/stealth-patches.d.ts.map +1 -0
- package/dist/core/stealth-patches.js +340 -0
- package/dist/core/stealth-patches.js.map +1 -0
- package/dist/core/strategies.d.ts +20 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +284 -48
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/core/strategy-hooks.d.ts.map +1 -1
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -15
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +109 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +29 -0
- package/dist/server/app.js.map +1 -1
- package/dist/server/middleware/rate-limit.d.ts +2 -1
- package/dist/server/middleware/rate-limit.d.ts.map +1 -1
- package/dist/server/middleware/rate-limit.js +24 -8
- package/dist/server/middleware/rate-limit.js.map +1 -1
- package/dist/server/routes/agent.d.ts +4 -0
- package/dist/server/routes/agent.d.ts.map +1 -1
- package/dist/server/routes/agent.js +196 -9
- package/dist/server/routes/agent.js.map +1 -1
- package/dist/server/routes/batch.js +5 -5
- package/dist/server/routes/batch.js.map +1 -1
- package/dist/server/routes/compat.d.ts.map +1 -1
- package/dist/server/routes/compat.js +1 -0
- package/dist/server/routes/compat.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +60 -6
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +103 -2
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/search.js +1 -1
- package/dist/server/routes/search.js.map +1 -1
- package/dist/types.d.ts +55 -4
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +4 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +55 -125
- package/package.json +15 -1
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Search-as-proxy fallback for blocked pages.
|
|
3
|
+
* When a protected site blocks our fetch, we search for the URL
|
|
4
|
+
* and extract info from search engine snippets.
|
|
5
|
+
*
|
|
6
|
+
* Uses the getBestSearchProvider() chain which handles:
|
|
7
|
+
* Google CSE API → Brave API → Google stealth → DDG
|
|
8
|
+
* This avoids direct HTML scraping which is blocked by CAPTCHAs on datacenter IPs.
|
|
9
|
+
*/
|
|
10
|
+
export interface SearchFallbackResult {
|
|
11
|
+
title: string;
|
|
12
|
+
snippet: string;
|
|
13
|
+
cachedContent: string;
|
|
14
|
+
source: 'duckduckgo' | 'google' | 'none';
|
|
15
|
+
/** Extracted product data if this looks like a product page */
|
|
16
|
+
productData?: {
|
|
17
|
+
price?: string;
|
|
18
|
+
rating?: string;
|
|
19
|
+
brand?: string;
|
|
20
|
+
availability?: string;
|
|
21
|
+
};
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Search for a URL using the best available search provider and extract the snippet.
|
|
25
|
+
* Returns the title, snippet, and any extracted product data.
|
|
26
|
+
*/
|
|
27
|
+
export declare function searchFallback(url: string): Promise<SearchFallbackResult>;
|
|
28
|
+
//# sourceMappingURL=search-fallback.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search-fallback.d.ts","sourceRoot":"","sources":["../../src/core/search-fallback.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE,YAAY,GAAG,QAAQ,GAAG,MAAM,CAAC;IACzC,+DAA+D;IAC/D,WAAW,CAAC,EAAE;QACZ,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,YAAY,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACH;AAmID;;;GAGG;AACH,wBAAsB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC,CA4D/E"}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Search-as-proxy fallback for blocked pages.
|
|
3
|
+
* When a protected site blocks our fetch, we search for the URL
|
|
4
|
+
* and extract info from search engine snippets.
|
|
5
|
+
*
|
|
6
|
+
* Uses the getBestSearchProvider() chain which handles:
|
|
7
|
+
* Google CSE API → Brave API → Google stealth → DDG
|
|
8
|
+
* This avoids direct HTML scraping which is blocked by CAPTCHAs on datacenter IPs.
|
|
9
|
+
*/
|
|
10
|
+
import { getBestSearchProvider } from './search-provider.js';
|
|
11
|
+
/**
 * Heuristically decide whether a URL points at a retail product page.
 *
 * A URL qualifies when EITHER:
 *  - its hostname is one of the known retailers (or a subdomain of one), or
 *  - its path contains a typical product marker (`/dp/`, `/ip/`, `/site/`,
 *    `/p/`, `/product`) or a path segment starting with 5+ digits.
 *
 * Hosts are compared exactly or by dot-suffix, so look-alike domains such as
 * `notamazon.com` or `amazon.com.evil.example` are not treated as retailers.
 *
 * @param url Absolute URL string to inspect.
 * @returns `true` if the URL looks like a product page; `false` otherwise,
 *          including when `url` cannot be parsed.
 */
function isProductUrl(url) {
    try {
        const parsed = new URL(url);
        const host = parsed.hostname.toLowerCase();
        const path = parsed.pathname.toLowerCase();
        const productHosts = ['bestbuy.com', 'walmart.com', 'amazon.com', 'target.com', 'newegg.com', 'ebay.com'];
        // Exact match or true subdomain only — a plain substring test would also
        // accept unrelated hosts that merely contain the retailer name.
        const isProductHost = productHosts.some(h => host === h || host.endsWith(`.${h}`));
        const hasProductPath = path.includes('/dp/') ||
            path.includes('/ip/') ||
            path.includes('/site/') ||
            path.includes('/p/') ||
            path.includes('/product') ||
            /\/\d{5,}/.test(path);
        return isProductHost || hasProductPath;
    }
    catch {
        // new URL() throws on malformed input; treat that as "not a product page".
        return false;
    }
}
|
|
33
|
+
/**
 * Turn a page URL into a search-engine query that is likely to surface it.
 *
 * Up to six keywords are taken from the URL path (split on `-`, `/` and `_`),
 * ignoring tokens of two characters or fewer and purely numeric tokens.
 * Product URLs produce `"<host>" <keywords> price`; everything else produces
 * `site:<host> <keywords>`. If the URL cannot be parsed it is returned as-is.
 */
function buildSearchQuery(url) {
    let target;
    try {
        target = new URL(url);
    }
    catch {
        return url;
    }
    // Collect the first six meaningful path tokens.
    const keywords = [];
    for (const token of target.pathname.split(/[-/_]/)) {
        if (keywords.length === 6)
            break;
        // Tokens this short carry no search value (this also covers 1–2 letter words).
        if (token.length <= 2)
            continue;
        // Numeric IDs do not help the search engine.
        if (/^\d+$/.test(token))
            continue;
        keywords.push(token);
    }
    const terms = keywords.join(' ');
    // Product pages get a price-intent query for richer snippets.
    const query = isProductUrl(url)
        ? `"${target.hostname}" ${terms} price`
        : `site:${target.hostname} ${terms}`;
    return query.trim();
}
|
|
57
|
+
/**
 * Extract product-specific data from a search result's title and snippet.
 *
 * Looks for:
 *  - price:        a dollar amount such as `$20`, `$1,299` or `$1,299.99`
 *                  (trailing punctuation is not captured);
 *  - rating:       a number from 0 to 5 followed by "out of 5", "/5", "★" or
 *                  "star(s)"; the full number is captured, so "4.75 out of 5"
 *                  yields `4.75/5` and "45 stars" is not mistaken for 5 stars;
 *  - availability: "in stock", "out of stock", "available", "sold out",
 *                  "ships in" or "pre-order";
 *  - brand:        a capitalised name after the whole word "by", "brand:" or
 *                  "sold by", ended by a separator (| · • ,) or by
 *                  "at"/"on"/"for"/"price".
 *
 * @param title   Search result title.
 * @param snippet Search result snippet.
 * @returns An object with `price`, `rating`, `brand` and `availability`
 *          (each `undefined` when absent), or `undefined` when none of price,
 *          rating or availability was found — a brand alone is not enough.
 */
function extractProductData(title, snippet) {
    const text = `${title} ${snippet}`;
    const data = {
        price: '',
        rating: '',
        brand: '',
        availability: '',
    };
    // Price: $XX, $XX.XX or $X,XXX.XX — must start with a digit, and commas are
    // only accepted as thousands separators so "$20, ships free" yields "$20".
    const priceMatch = text.match(/\$\d+(?:,\d{3})*(?:\.\d{2})?/);
    if (priceMatch)
        data.price = priceMatch[0];
    // Rating: "4.5 out of 5", "4.5/5", "4.5 stars". The lookbehind stops the
    // match from starting in the middle of a longer number.
    const ratingMatch = text.match(/(?<![\d.])([0-5](?:\.\d+)?)\s*(?:out of 5|\/5|★|\bstars?\b)/i);
    if (ratingMatch)
        data.rating = `${ratingMatch[1]}/5`;
    // Availability: in stock, out of stock, available, ...
    const availMatch = text.match(/\b(in stock|out of stock|available|sold out|ships in|pre-order)\b/i);
    if (availMatch)
        data.availability = availMatch[0];
    // Brand: "by BrandName", "Brand: Name", "Sold by Name". Case is handled
    // explicitly (no `i` flag) so the captured name really must start with a
    // capital letter, and `\b` keeps words like "nearby" from matching "by".
    const brandMatch = text.match(/\b(?:[Bb]y|[Bb]rand:|[Ss]old [Bb]y)\s+([A-Z][a-zA-Z\s&]{2,20}?)(?:\s*[|·•,]|\s+(?:at|on|for|price))/);
    if (brandMatch)
        data.brand = brandMatch[1].trim();
    const hasData = data.price || data.rating || data.availability;
    if (!hasData)
        return undefined;
    return {
        price: data.price || undefined,
        rating: data.rating || undefined,
        brand: data.brand || undefined,
        availability: data.availability || undefined,
    };
}
|
|
95
|
+
/**
 * Render a search result as a small markdown document.
 *
 * Layout: an H1 with the title, an optional block of product facts
 * (price, brand, rating, availability — only the ones present), the snippet
 * if there is one, then a footer naming the source URL and warning that the
 * content is limited because the original page blocked direct access.
 */
function buildCachedContent(url, title, snippet, productData) {
    // Product facts, in display order, followed by a blank separator line.
    const productSection = productData
        ? [
            productData.price && `**Price:** ${productData.price}`,
            productData.brand && `**Brand:** ${productData.brand}`,
            productData.rating && `**Rating:** ${productData.rating}`,
            productData.availability && `**Availability:** ${productData.availability}`,
        ].filter(Boolean).concat('')
        : [];
    const snippetSection = snippet ? [snippet, ''] : [];
    const document = [
        `# ${title}`,
        '',
        ...productSection,
        ...snippetSection,
        `---`,
        `*Source: Search engine snippet for ${url}*`,
        `*⚠️ Limited content — original page blocked direct access. For full data, configure GOOGLE_SEARCH_KEY or BRAVE_SEARCH_KEY.*`,
    ];
    return document.join('\n');
}
|
|
124
|
+
/**
 * Search for a URL using the best available search provider and build a
 * snippet-based stand-in for the blocked page.
 *
 * The provider is asked for three results. Results without a title or snippet
 * are ignored. Among the rest, a hit for the same page (same host and path,
 * ignoring a leading `www.` and trailing slashes) is preferred, then a hit on
 * the same host, and only then the first remaining result — so an unrelated
 * page ranked first does not get attributed to the requested URL when a
 * better match is available.
 *
 * The `source` field is derived from the provider id: `duckduckgo` and
 * `stealth` map to `'duckduckgo'`; `brave`, `google` and any unknown id map
 * to `'google'`.
 *
 * Never throws: an invalid URL, no usable results, or any provider error all
 * yield an empty result with `source: 'none'`.
 *
 * @param url Absolute URL of the page that could not be fetched directly.
 * @returns Title, snippet, markdown `cachedContent`, `source`, and
 *          `productData` when the URL looks like a product page and the
 *          snippet contained product facts.
 */
export async function searchFallback(url) {
    const emptyResult = {
        title: '',
        snippet: '',
        cachedContent: '',
        source: 'none',
    };
    // Validate URL early — invalid URLs return empty result immediately
    let target;
    try {
        target = new URL(url);
    }
    catch {
        return emptyResult;
    }
    try {
        const searchQuery = buildSearchQuery(url);
        const { provider, apiKey } = getBestSearchProvider();
        const results = await provider.searchWeb(searchQuery, {
            count: 3,
            apiKey,
        });
        if (!results || results.length === 0) {
            return emptyResult;
        }
        // Normalisation helpers used only to compare a hit against the target.
        const parseOrNull = (raw) => {
            try {
                return new URL(raw);
            }
            catch {
                return null;
            }
        };
        const bareHost = (u) => u.hostname.replace(/^www\./, '');
        const barePath = (u) => u.pathname.replace(/\/+$/, '');
        // Keep only results that carry some text.
        const candidates = results
            .map(r => ({
            title: r?.title?.trim() || '',
            snippet: r?.snippet?.trim() || '',
            page: parseOrNull(r?.url),
        }))
            .filter(c => c.title || c.snippet);
        if (candidates.length === 0) {
            return emptyResult;
        }
        const onSameHost = (c) => c.page !== null && bareHost(c.page) === bareHost(target);
        const best = candidates.find(c => onSameHost(c) && barePath(c.page) === barePath(target)) ??
            candidates.find(onSameHost) ??
            candidates[0];
        const { title, snippet } = best;
        const productData = isProductUrl(url)
            ? extractProductData(title, snippet)
            : undefined;
        const cachedContent = buildCachedContent(url, title, snippet, productData);
        // Map provider ID to our source type
        const sourceMap = {
            duckduckgo: 'duckduckgo',
            brave: 'google',
            stealth: 'duckduckgo',
            google: 'google',
        };
        const source = sourceMap[provider.id] ?? 'google';
        return {
            title,
            snippet,
            cachedContent,
            source,
            productData,
        };
    }
    catch (e) {
        // Best-effort fallback: failures are only surfaced when DEBUG is set.
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Search fallback failed:', e instanceof Error ? e.message : e);
        }
        return emptyResult;
    }
}
|
|
185
|
+
//# sourceMappingURL=search-fallback.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search-fallback.js","sourceRoot":"","sources":["../../src/core/search-fallback.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,qBAAqB,EAAE,MAAM,sBAAsB,CAAC;AAgB7D;;GAEG;AACH,SAAS,YAAY,CAAC,GAAW;IAC/B,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QACvB,MAAM,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC;QACxB,MAAM,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;QAEtC,MAAM,YAAY,GAAG,CAAC,aAAa,EAAE,aAAa,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,UAAU,CAAC,CAAC;QAC1G,MAAM,aAAa,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,cAAc,GAClB,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;YACrB,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;YACrB,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC;YACvB,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC;YACpB,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC;YACzB,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAExB,OAAO,aAAa,IAAI,cAAc,CAAC;IACzC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,GAAW;IACnC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAE5B,iEAAiE;QACjE,MAAM,SAAS,GAAG,MAAM,CAAC,QAAQ;aAC9B,KAAK,CAAC,OAAO,CAAC;aACd,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;aACxE,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;aACX,IAAI,CAAC,GAAG,CAAC,CAAC;QAEb,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;QAEpC,IAAI,SAAS,EAAE,CAAC;YACd,kEAAkE;YAClE,OAAO,IAAI,MAAM,CAAC,QAAQ,KAAK,SAAS,QAAQ,CAAC,IAAI,EAAE,CAAC;QAC1D,CAAC;QAED,OAAO,QAAQ,MAAM,CAAC,QAAQ,IAAI,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC;IACvD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,GAAG,CAAC;IACb,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,kBAAkB,CAAC,KAAa,EAAE,OAAe;IACxD,MAAM,IAAI,GAAG,GAAG,KAAK,IAAI,OAAO,EAAE,CAAC;IACnC,MAAM,IAAI,GAA+D;QACvE,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,KAAK,EAAE,EAAE;QACT,YAAY,EAAE,EAAE;KACjB,CAAC;IAEF,6BAA6B;IAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;IACtD,IAAI,UAAU;QAAE,IAAI,CAAC,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAE3C,+CAA+C;IAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAA
C,gDAAgD,CAAC,CAAC;IACjF,IAAI,WAAW;QAAE,IAAI,CAAC,MAAM,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC;IAErD,kDAAkD;IAClD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,oEAAoE,CAAC,CAAC;IACpG,IAAI,UAAU;QAAE,IAAI,CAAC,YAAY,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAElD,uDAAuD;IACvD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,wFAAwF,CAAC,CAAC;IACxH,IAAI,UAAU;QAAE,IAAI,CAAC,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAElD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,YAAY,CAAC;IAC/D,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAC;IAE/B,OAAO;QACL,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,SAAS;QAC9B,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,SAAS;QAChC,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,SAAS;QAC9B,YAAY,EAAE,IAAI,CAAC,YAAY,IAAI,SAAS;KAC7C,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,SAAS,kBAAkB,CACzB,GAAW,EACX,KAAa,EACb,OAAe,EACf,WAAiD;IAEjD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;IACzB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,iDAAiD;IACjD,IAAI,WAAW,EAAE,CAAC;QAChB,IAAI,WAAW,CAAC,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,cAAc,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC;QACrE,IAAI,WAAW,CAAC,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,cAAc,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC;QACrE,IAAI,WAAW,CAAC,MAAM;YAAE,KAAK,CAAC,IAAI,CAAC,eAAe,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC;QACxE,IAAI,WAAW,CAAC,YAAY;YAAE,KAAK,CAAC,IAAI,CAAC,qBAAqB,WAAW,CAAC,YAAY,EAAE,CAAC,CAAC;QAC1F,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,IAAI,OAAO,EAAE,CAAC;QACZ,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACpB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAClB,KAAK,CAAC,IAAI,CAAC,sCAAsC,GAAG,GAAG,CAAC,CAAC;IACzD,KAAK,CAAC,IAAI,CAAC,6HAA6H,CAAC,CAAC;IAE1I,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,GAAW;IAC9C,MAAM,WAAW,GAAyB;QACxC,KAAK,EAAE,EAAE;QACT,OAAO,EAAE,EAAE;QACX,aAAa,EAAE,EAAE;QACjB,MAAM,EAAE,MAAM;KACf,CAAC;IAEF,IAAI,CAAC;QACH,oEAAoE;QACpE,IAAI,CAAC;YAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC;YAAC,OAAO,WAAW,CAAC;QAAC,CAAC;QAEnD,MAAM,WAAW,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC;QAC1C,MAAM,E
AAE,QAAQ,EAAE,MAAM,EAAE,GAAG,qBAAqB,EAAE,CAAC;QAErD,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,SAAS,CAAC,WAAW,EAAE;YACpD,KAAK,EAAE,CAAC;YACR,MAAM;SACP,CAAC,CAAC;QAEH,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACrC,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,MAAM,SAAS,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QAC7B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC5C,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAEhD,IAAI,CAAC,KAAK,IAAI,CAAC,OAAO,EAAE,CAAC;YACvB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,MAAM,WAAW,GAAG,YAAY,CAAC,GAAG,CAAC;YACnC,CAAC,CAAC,kBAAkB,CAAC,KAAK,EAAE,OAAO,CAAC;YACpC,CAAC,CAAC,SAAS,CAAC;QAEd,MAAM,aAAa,GAAG,kBAAkB,CAAC,GAAG,EAAE,KAAK,EAAE,OAAO,EAAE,WAAW,CAAC,CAAC;QAE3E,qCAAqC;QACrC,MAAM,SAAS,GAA4C;YACzD,UAAU,EAAE,YAAY;YACxB,KAAK,EAAE,QAAQ;YACf,OAAO,EAAE,YAAY;YACrB,MAAM,EAAE,QAAQ;SACjB,CAAC;QACF,MAAM,MAAM,GAAG,SAAS,CAAC,QAAQ,CAAC,EAAE,CAAC,IAAI,QAAQ,CAAC;QAElD,OAAO;YACL,KAAK;YACL,OAAO;YACP,aAAa;YACb,MAAM;YACN,WAAW;SACZ,CAAC;IACJ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;YACtB,OAAO,CAAC,KAAK,CAAC,WAAW,EAAE,yBAAyB,EAAE,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5F,CAAC;QACD,OAAO,WAAW,CAAC;IACrB,CAAC;AACH,CAAC"}
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* In production with no API keys configured, getBestSearchProvider() returns
|
|
13
13
|
* StealthSearchProvider since DDG HTTP is often blocked on datacenter IPs.
|
|
14
14
|
*/
|
|
15
|
-
export type SearchProviderId = 'duckduckgo' | 'brave' | 'stealth';
|
|
15
|
+
export type SearchProviderId = 'duckduckgo' | 'brave' | 'stealth' | 'google';
|
|
16
16
|
export interface WebSearchResult {
|
|
17
17
|
title: string;
|
|
18
18
|
url: string;
|
|
@@ -86,11 +86,54 @@ export declare class BraveSearchProvider implements SearchProvider {
|
|
|
86
86
|
readonly requiresApiKey = true;
|
|
87
87
|
searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
|
|
88
88
|
}
|
|
89
|
+
/**
|
|
90
|
+
* GoogleSearchProvider — Google Search via stealth browser or Custom Search JSON API
|
|
91
|
+
*
|
|
92
|
+
* Two modes:
|
|
93
|
+
* 1. Custom Search JSON API (BYOK): set GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX env vars.
|
|
94
|
+
* Reliable, structured, 100 free queries/day. Works from any IP.
|
|
95
|
+
* 2. Stealth browser scraping (no API key): uses playwright-extra stealth plugin to
|
|
96
|
+
* scrape google.com/search directly. Works from datacenter IPs where DDG/Bing/Ecosia
|
|
97
|
+
* are blocked. Gracefully returns [] if Playwright is unavailable.
|
|
98
|
+
*
|
|
99
|
+
* Docs: https://developers.google.com/custom-search/v1/overview
|
|
100
|
+
*/
|
|
101
|
+
export declare class GoogleSearchProvider implements SearchProvider {
|
|
102
|
+
readonly id: SearchProviderId;
|
|
103
|
+
/**
|
|
104
|
+
* requiresApiKey is false: works without API keys via stealth browser fallback.
|
|
105
|
+
*/
|
|
106
|
+
readonly requiresApiKey = false;
|
|
107
|
+
/**
|
|
108
|
+
* Map standard freshness values to Google's dateRestrict format.
|
|
109
|
+
* Google dateRestrict: d[n]=past n days, w[n]=past n weeks,
|
|
110
|
+
* m[n]=past n months, y[n]=past n years.
|
|
111
|
+
*/
|
|
112
|
+
private mapFreshnessToDateRestrict;
|
|
113
|
+
/** Validate URL; returns null if invalid/non-http */
|
|
114
|
+
private validateUrl;
|
|
115
|
+
/**
|
|
116
|
+
* Stealth browser scrape of google.com/search.
|
|
117
|
+
* Used when no Custom Search API key is configured.
|
|
118
|
+
* Strategy A: peel() with stealth rendering (consistent with StealthSearchProvider).
|
|
119
|
+
* Strategy B: direct playwright-extra launch (if peel returns no results).
|
|
120
|
+
*/
|
|
121
|
+
private scrapeGoogleStealth;
|
|
122
|
+
/** Parse Google search result HTML using cheerio. No DOM lib types required. */
|
|
123
|
+
private _parseGoogleHtml;
|
|
124
|
+
searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
|
|
125
|
+
}
|
|
89
126
|
export declare function getSearchProvider(id: SearchProviderId | undefined): SearchProvider;
|
|
90
127
|
/**
|
|
91
|
-
* Get the best available search provider based on configured API keys
|
|
92
|
-
*
|
|
93
|
-
*
|
|
128
|
+
* Get the best available search provider based on configured API keys and
|
|
129
|
+
* available runtime dependencies.
|
|
130
|
+
*
|
|
131
|
+
* Priority:
|
|
132
|
+
* 1. Google Custom Search JSON API (if GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX set)
|
|
133
|
+
* 2. Brave Search (if BRAVE_SEARCH_KEY is set)
|
|
134
|
+
* 3. Google stealth browser scraping (works from datacenter IPs; no API key needed)
|
|
135
|
+
* — only when playwright-extra is available in node_modules
|
|
136
|
+
* 4. DuckDuckGo with full fallback chain (DDG HTTP → DDG Lite → stealth multi-engine)
|
|
94
137
|
*/
|
|
95
138
|
export declare function getBestSearchProvider(): {
|
|
96
139
|
provider: SearchProvider;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"search-provider.d.ts","sourceRoot":"","sources":["../../src/core/search-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;
|
|
1
|
+
{"version":3,"file":"search-provider.d.ts","sourceRoot":"","sources":["../../src/core/search-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAOH,MAAM,MAAM,gBAAgB,GAAG,YAAY,GAAG,OAAO,GAAG,SAAS,GAAG,QAAQ,CAAC;AAE7E,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,gBAAgB;IAC/B,+BAA+B;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,iEAAiE;IACjE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,yCAAyC;IACzC,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,wCAAwC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2BAA2B;IAC3B,MAAM,CAAC,EAAE,WAAW,CAAC;CACtB;AAED,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAC;IAC9B,QAAQ,CAAC,cAAc,EAAE,OAAO,CAAC;IAEjC,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC;CACjF;AAwED;;;;;;;;GAQG;AACH,qBAAa,qBAAsB,YAAW,cAAc;IAC1D,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAa;IAC1C,QAAQ,CAAC,cAAc,SAAS;IAEhC,qEAAqE;IACrE,OAAO,CAAC,WAAW;IAUnB;;;OAGG;YACW,SAAS;IAuDvB;;;OAGG;YACW,UAAU;IAqDxB;;;OAGG;YACW,YAAY;IA2DpB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CA8BtF;AAED,qBAAa,kBAAmB,YAAW,cAAc;IACvD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAgB;IAC7C,QAAQ,CAAC,cAAc,SAAS;IAEhC,OAAO,CAAC,kBAAkB;IA8C1B,OAAO,CAAC,cAAc;YAoBR,UAAU;IAuFxB;;;OAGG;YACW,UAAU;IAiElB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CAyCtF;AAED,qBAAa,mBAAoB,YAAW,cAAc;IACxD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAW;IACxC,QAAQ,CAAC,cAAc,QAAQ;IAEzB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CA8DtF;AAED;;;;;;;;;;;GAWG;AACH,qBAAa,oBAAqB,YAAW,cAAc;IACzD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAY;IACzC;;OAEG;IACH,QAAQ,CAAC,cAAc,SAAS;IAEhC;;;;OAIG;IACH,OAAO,CAAC,0BAA0B;IAWlC,qDAAqD;IACrD,OAAO,CAAC,WAAW;IAUnB;;;;;OAKG;YACW,mBAAmB;IA4HjC,gFAAgF;IAChF,OAAO,CAAC,gBAAgB;IA4ClB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CAuCtF;AAED,wBAAgB,iBAAiB,CAAC,EAAE,EAAE,gBAAgB,GAAG,SAAS,GAAG,cAAc,CAQlF;AAkBD;;;;;;;;;;GAUG;AACH,wBAAgB,qBAAqB
,IAAI;IAAE,QAAQ,EAAE,cAAc,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAuBrF"}
|
|
@@ -14,6 +14,8 @@
|
|
|
14
14
|
*/
|
|
15
15
|
import { fetch as undiciFetch } from 'undici';
|
|
16
16
|
import { load } from 'cheerio';
|
|
17
|
+
import { existsSync } from 'fs';
|
|
18
|
+
import { resolve as pathResolve } from 'path';
|
|
17
19
|
function decodeHtmlEntities(input) {
|
|
18
20
|
// Cheerio usually decodes entities when using `.text()`, but keep this as a
|
|
19
21
|
// safety net since DuckDuckGo snippets sometimes leak encoded entities.
|
|
@@ -596,6 +598,243 @@ export class BraveSearchProvider {
|
|
|
596
598
|
return results;
|
|
597
599
|
}
|
|
598
600
|
}
|
|
601
|
+
/**
|
|
602
|
+
* GoogleSearchProvider — Google Search via stealth browser or Custom Search JSON API
|
|
603
|
+
*
|
|
604
|
+
* Two modes:
|
|
605
|
+
* 1. Custom Search JSON API (BYOK): set GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX env vars.
|
|
606
|
+
* Reliable, structured, 100 free queries/day. Works from any IP.
|
|
607
|
+
* 2. Stealth browser scraping (no API key): uses playwright-extra stealth plugin to
|
|
608
|
+
* scrape google.com/search directly. Works from datacenter IPs where DDG/Bing/Ecosia
|
|
609
|
+
* are blocked. Gracefully returns [] if Playwright is unavailable.
|
|
610
|
+
*
|
|
611
|
+
* Docs: https://developers.google.com/custom-search/v1/overview
|
|
612
|
+
*/
|
|
613
|
+
export class GoogleSearchProvider {
|
|
614
|
+
id = 'google';
|
|
615
|
+
/**
|
|
616
|
+
* requiresApiKey is false: works without API keys via stealth browser fallback.
|
|
617
|
+
*/
|
|
618
|
+
requiresApiKey = false;
|
|
619
|
+
/**
|
|
620
|
+
* Map standard freshness values to Google's dateRestrict format.
|
|
621
|
+
* Google dateRestrict: d[n]=past n days, w[n]=past n weeks,
|
|
622
|
+
* m[n]=past n months, y[n]=past n years.
|
|
623
|
+
*/
|
|
624
|
+
mapFreshnessToDateRestrict(tbs) {
|
|
625
|
+
if (!tbs)
|
|
626
|
+
return undefined;
|
|
627
|
+
const map = {
|
|
628
|
+
pd: 'd1',
|
|
629
|
+
pw: 'w1',
|
|
630
|
+
pm: 'm1',
|
|
631
|
+
py: 'y1',
|
|
632
|
+
};
|
|
633
|
+
return map[tbs];
|
|
634
|
+
}
|
|
635
|
+
/** Validate URL; returns null if invalid/non-http */
|
|
636
|
+
validateUrl(rawUrl) {
|
|
637
|
+
try {
|
|
638
|
+
const parsed = new URL(rawUrl);
|
|
639
|
+
if (!['http:', 'https:'].includes(parsed.protocol))
|
|
640
|
+
return null;
|
|
641
|
+
return parsed.href;
|
|
642
|
+
}
|
|
643
|
+
catch {
|
|
644
|
+
return null;
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
/**
|
|
648
|
+
* Stealth browser scrape of google.com/search.
|
|
649
|
+
* Used when no Custom Search API key is configured.
|
|
650
|
+
* Strategy A: peel() with stealth rendering (consistent with StealthSearchProvider).
|
|
651
|
+
* Strategy B: direct playwright-extra launch (if peel returns no results).
|
|
652
|
+
*/
|
|
653
|
+
async scrapeGoogleStealth(query, count) {
|
|
654
|
+
// Strategy A: peel() + cheerio parse
|
|
655
|
+
try {
|
|
656
|
+
const { peel } = await import('../index.js');
|
|
657
|
+
const params = new URLSearchParams({
|
|
658
|
+
q: query,
|
|
659
|
+
num: String(Math.min(count * 2, 20)),
|
|
660
|
+
hl: 'en',
|
|
661
|
+
gl: 'us',
|
|
662
|
+
});
|
|
663
|
+
const url = `https://www.google.com/search?${params.toString()}`;
|
|
664
|
+
const result = await Promise.race([
|
|
665
|
+
peel(url, { render: true, stealth: true, format: 'html', wait: 3000 }),
|
|
666
|
+
new Promise((_, reject) => setTimeout(() => reject(new Error('Google stealth peel timeout')), 20_000)),
|
|
667
|
+
]);
|
|
668
|
+
const html = result.content || '';
|
|
669
|
+
if (html) {
|
|
670
|
+
const $ = load(html);
|
|
671
|
+
const results = [];
|
|
672
|
+
const seen = new Set();
|
|
673
|
+
// Multiple selector patterns for resilience across Google HTML variants
|
|
674
|
+
const resultBlocks = $('#search .g, #rso .g, [data-hveid] .g');
|
|
675
|
+
resultBlocks.each((_i, elem) => {
|
|
676
|
+
if (results.length >= count)
|
|
677
|
+
return;
|
|
678
|
+
const $r = $(elem);
|
|
679
|
+
const $a = $r.find('a[href^="http"]').first();
|
|
680
|
+
const $h3 = $r.find('h3').first();
|
|
681
|
+
if (!$a.length || !$h3.length)
|
|
682
|
+
return;
|
|
683
|
+
const href = $a.attr('href') || '';
|
|
684
|
+
if (href.includes('google.com/') ||
|
|
685
|
+
href.includes('accounts.google') ||
|
|
686
|
+
href.includes('/aclk') ||
|
|
687
|
+
href.startsWith('#'))
|
|
688
|
+
return;
|
|
689
|
+
const validated = this.validateUrl(href);
|
|
690
|
+
if (!validated)
|
|
691
|
+
return;
|
|
692
|
+
const key = normalizeUrlForDedupe(validated);
|
|
693
|
+
if (seen.has(key))
|
|
694
|
+
return;
|
|
695
|
+
seen.add(key);
|
|
696
|
+
const title = cleanText($h3.text(), { maxLen: 200 });
|
|
697
|
+
if (!title)
|
|
698
|
+
return;
|
|
699
|
+
const snippetText = $r.find('[data-sncf]').first().text() ||
|
|
700
|
+
$r.find('.VwiC3b').first().text() ||
|
|
701
|
+
$r.find('[style*="-webkit-line-clamp"]').first().text() ||
|
|
702
|
+
$r.find('.st').first().text() ||
|
|
703
|
+
'';
|
|
704
|
+
const snippet = cleanText(snippetText, { maxLen: 500, stripEllipsisPadding: true });
|
|
705
|
+
results.push({ title, url: validated, snippet });
|
|
706
|
+
});
|
|
707
|
+
if (results.length > 0)
|
|
708
|
+
return results.slice(0, count);
|
|
709
|
+
}
|
|
710
|
+
}
|
|
711
|
+
catch (e) {
|
|
712
|
+
if (process.env.DEBUG) {
|
|
713
|
+
console.debug('[webpeel] Google stealth (peel) error:', e.message);
|
|
714
|
+
}
|
|
715
|
+
}
|
|
716
|
+
// Strategy B: direct playwright-extra + stealth plugin
|
|
717
|
+
let browser;
|
|
718
|
+
let context;
|
|
719
|
+
let page;
|
|
720
|
+
try {
|
|
721
|
+
const pwExtra = await import('playwright-extra');
|
|
722
|
+
const StealthPlugin = (await import('puppeteer-extra-plugin-stealth')).default;
|
|
723
|
+
const stealthChromium = pwExtra.chromium;
|
|
724
|
+
stealthChromium.use(StealthPlugin());
|
|
725
|
+
const params = new URLSearchParams({
|
|
726
|
+
q: query,
|
|
727
|
+
num: String(Math.min(count * 2, 20)),
|
|
728
|
+
hl: 'en',
|
|
729
|
+
gl: 'us',
|
|
730
|
+
});
|
|
731
|
+
const url = `https://www.google.com/search?${params.toString()}`;
|
|
732
|
+
browser = await stealthChromium.launch({
|
|
733
|
+
headless: true,
|
|
734
|
+
args: [
|
|
735
|
+
'--disable-blink-features=AutomationControlled',
|
|
736
|
+
'--disable-dev-shm-usage',
|
|
737
|
+
'--no-sandbox',
|
|
738
|
+
'--disable-setuid-sandbox',
|
|
739
|
+
'--disable-gpu',
|
|
740
|
+
],
|
|
741
|
+
});
|
|
742
|
+
context = await browser.newContext({
|
|
743
|
+
userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
744
|
+
viewport: { width: 1280, height: 720 },
|
|
745
|
+
locale: 'en-US',
|
|
746
|
+
});
|
|
747
|
+
page = await context.newPage();
|
|
748
|
+
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 15_000 });
|
|
749
|
+
// Use page.content() + cheerio to avoid needing DOM lib types in tsconfig
|
|
750
|
+
const html = await page.content();
|
|
751
|
+
return this._parseGoogleHtml(html, count);
|
|
752
|
+
}
|
|
753
|
+
catch (e) {
|
|
754
|
+
if (process.env.DEBUG) {
|
|
755
|
+
console.debug('[webpeel] Google stealth (playwright) error:', e.message);
|
|
756
|
+
}
|
|
757
|
+
return [];
|
|
758
|
+
}
|
|
759
|
+
finally {
|
|
760
|
+
await page?.close().catch(() => { });
|
|
761
|
+
await context?.close().catch(() => { });
|
|
762
|
+
await browser?.close().catch(() => { });
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
/** Parse Google search result HTML using cheerio. No DOM lib types required. */
|
|
766
|
+
_parseGoogleHtml(html, count) {
|
|
767
|
+
const $ = load(html);
|
|
768
|
+
const results = [];
|
|
769
|
+
const seen = new Set();
|
|
770
|
+
const resultBlocks = $('#search .g, #rso .g, [data-hveid] .g');
|
|
771
|
+
resultBlocks.each((_i, elem) => {
|
|
772
|
+
if (results.length >= count)
|
|
773
|
+
return;
|
|
774
|
+
const $r = $(elem);
|
|
775
|
+
const $a = $r.find('a[href^="http"]').first();
|
|
776
|
+
const $h3 = $r.find('h3').first();
|
|
777
|
+
if (!$a.length || !$h3.length)
|
|
778
|
+
return;
|
|
779
|
+
const href = $a.attr('href') || '';
|
|
780
|
+
if (href.includes('google.com/') ||
|
|
781
|
+
href.includes('accounts.google') ||
|
|
782
|
+
href.includes('/aclk') ||
|
|
783
|
+
href.startsWith('#'))
|
|
784
|
+
return;
|
|
785
|
+
const validated = this.validateUrl(href);
|
|
786
|
+
if (!validated)
|
|
787
|
+
return;
|
|
788
|
+
const key = normalizeUrlForDedupe(validated);
|
|
789
|
+
if (seen.has(key))
|
|
790
|
+
return;
|
|
791
|
+
seen.add(key);
|
|
792
|
+
const title = cleanText($h3.text(), { maxLen: 200 });
|
|
793
|
+
if (!title)
|
|
794
|
+
return;
|
|
795
|
+
const snippetText = $r.find('[data-sncf]').first().text() ||
|
|
796
|
+
$r.find('.VwiC3b').first().text() ||
|
|
797
|
+
$r.find('[style*="-webkit-line-clamp"]').first().text() ||
|
|
798
|
+
$r.find('.st').first().text() ||
|
|
799
|
+
'';
|
|
800
|
+
const snippet = cleanText(snippetText, { maxLen: 500, stripEllipsisPadding: true });
|
|
801
|
+
results.push({ title, url: validated, snippet });
|
|
802
|
+
});
|
|
803
|
+
return results.slice(0, count);
|
|
804
|
+
}
|
|
805
|
+
async searchWeb(query, options) {
|
|
806
|
+
const { count, apiKey: optApiKey, tbs } = options;
|
|
807
|
+
const apiKey = optApiKey || process.env.GOOGLE_SEARCH_KEY || process.env.GOOGLE_API_KEY;
|
|
808
|
+
const cx = process.env.GOOGLE_SEARCH_CX;
|
|
809
|
+
// No API key — fall back to stealth browser scraping
|
|
810
|
+
if (!apiKey || !cx) {
|
|
811
|
+
return this.scrapeGoogleStealth(query, count);
|
|
812
|
+
}
|
|
813
|
+
// Custom Search JSON API path
|
|
814
|
+
const params = new URLSearchParams({
|
|
815
|
+
key: apiKey,
|
|
816
|
+
cx: cx,
|
|
817
|
+
q: query,
|
|
818
|
+
num: String(Math.min(count, 10)), // Google CSE max is 10 per request
|
|
819
|
+
});
|
|
820
|
+
const dateRestrict = this.mapFreshnessToDateRestrict(tbs);
|
|
821
|
+
if (dateRestrict)
|
|
822
|
+
params.set('dateRestrict', dateRestrict);
|
|
823
|
+
const response = await fetch(`https://www.googleapis.com/customsearch/v1?${params}`, {
|
|
824
|
+
signal: AbortSignal.timeout(10000),
|
|
825
|
+
});
|
|
826
|
+
if (!response.ok) {
|
|
827
|
+
const text = await response.text();
|
|
828
|
+
throw new Error(`Google search failed (${response.status}): ${text.substring(0, 200)}`);
|
|
829
|
+
}
|
|
830
|
+
const data = await response.json();
|
|
831
|
+
return (data.items || []).map((item) => ({
|
|
832
|
+
url: item.link,
|
|
833
|
+
title: item.title,
|
|
834
|
+
snippet: item.snippet || '',
|
|
835
|
+
}));
|
|
836
|
+
}
|
|
837
|
+
}
|
|
599
838
|
export function getSearchProvider(id) {
|
|
600
839
|
if (!id || id === 'duckduckgo')
|
|
601
840
|
return new DuckDuckGoProvider();
|
|
@@ -603,23 +842,55 @@ export function getSearchProvider(id) {
|
|
|
603
842
|
return new BraveSearchProvider();
|
|
604
843
|
if (id === 'stealth')
|
|
605
844
|
return new StealthSearchProvider();
|
|
845
|
+
if (id === 'google')
|
|
846
|
+
return new GoogleSearchProvider();
|
|
606
847
|
// Exhaustive fallback (should be unreachable due to typing)
|
|
607
848
|
return new DuckDuckGoProvider();
|
|
608
849
|
}
|
|
609
850
|
/**
 * Synchronously report whether a `playwright-extra` package directory can be
 * found near the current working directory.
 *
 * Only two locations are probed: `<cwd>/node_modules/playwright-extra` and
 * `<cwd>/../node_modules/playwright-extra`. The check is a plain
 * `existsSync`, so callers such as getBestSearchProvider can stay
 * synchronous.
 *
 * @returns `true` when either directory exists; `false` otherwise, including
 *          when the lookup itself throws.
 */
function isPlaywrightExtraAvailable() {
    try {
        const workingDir = process.cwd();
        const searchRoots = [workingDir, pathResolve(workingDir, '..')];
        return searchRoots.some((root) => existsSync(pathResolve(root, 'node_modules', 'playwright-extra')));
    }
    catch {
        return false;
    }
}
|
|
864
|
+
/**
 * Pick a search provider from the configured API keys and the runtime
 * dependencies that appear to be installed.
 *
 * Checked in this order; the first match wins:
 *   1. Google Custom Search JSON API, when GOOGLE_SEARCH_KEY (or
 *      GOOGLE_API_KEY) and GOOGLE_SEARCH_CX are both set.
 *   2. Brave Search, when BRAVE_SEARCH_KEY (or BRAVE_API_KEY) is set.
 *   3. Google via stealth browser scraping, when isPlaywrightExtraAvailable()
 *      reports that playwright-extra is present. No API key is returned, and
 *      GoogleSearchProvider.searchWeb() scrapes when it has no key.
 *   4. DuckDuckGo, which runs its own internal fallback chain
 *      (DDG HTTP → DDG Lite → stealth multi-engine).
 *
 * @returns An object with a `provider` instance and, for the API-key
 *          backed choices only, the `apiKey` that was found.
 */
export function getBestSearchProvider() {
    const env = process.env;
    // 1. Google Custom Search JSON API (bring your own key)
    const googleApiKey = env.GOOGLE_SEARCH_KEY || env.GOOGLE_API_KEY;
    const googleEngineId = env.GOOGLE_SEARCH_CX;
    if (googleApiKey && googleEngineId) {
        return { provider: new GoogleSearchProvider(), apiKey: googleApiKey };
    }
    // 2. Brave Search (bring your own key)
    const braveApiKey = env.BRAVE_SEARCH_KEY || env.BRAVE_API_KEY;
    if (braveApiKey) {
        return { provider: new BraveSearchProvider(), apiKey: braveApiKey };
    }
    // 3. Keyless Google through the stealth browser when playwright-extra is
    //    present; 4. otherwise DuckDuckGo and its internal fallback chain.
    const keylessProvider = isPlaywrightExtraAvailable()
        ? new GoogleSearchProvider()
        : new DuckDuckGoProvider();
    return { provider: keylessProvider };
}
|
|
625
896
|
//# sourceMappingURL=search-provider.js.map
|