webpeel 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/LICENSE +11 -657
  2. package/README.md +246 -325
  3. package/dist/cli.js +330 -73
  4. package/dist/cli.js.map +1 -1
  5. package/dist/core/browser-fetch.d.ts +12 -0
  6. package/dist/core/browser-fetch.d.ts.map +1 -1
  7. package/dist/core/browser-fetch.js +70 -17
  8. package/dist/core/browser-fetch.js.map +1 -1
  9. package/dist/core/cf-worker-proxy.d.ts +33 -0
  10. package/dist/core/cf-worker-proxy.d.ts.map +1 -0
  11. package/dist/core/cf-worker-proxy.js +88 -0
  12. package/dist/core/cf-worker-proxy.js.map +1 -0
  13. package/dist/core/chunker.d.ts +47 -0
  14. package/dist/core/chunker.d.ts.map +1 -0
  15. package/dist/core/chunker.js +250 -0
  16. package/dist/core/chunker.js.map +1 -0
  17. package/dist/core/cloak-fetch.d.ts +43 -0
  18. package/dist/core/cloak-fetch.d.ts.map +1 -0
  19. package/dist/core/cloak-fetch.js +141 -0
  20. package/dist/core/cloak-fetch.js.map +1 -0
  21. package/dist/core/crawl-checkpoint.d.ts +55 -0
  22. package/dist/core/crawl-checkpoint.d.ts.map +1 -0
  23. package/dist/core/crawl-checkpoint.js +105 -0
  24. package/dist/core/crawl-checkpoint.js.map +1 -0
  25. package/dist/core/crawler.d.ts +5 -1
  26. package/dist/core/crawler.d.ts.map +1 -1
  27. package/dist/core/crawler.js +60 -5
  28. package/dist/core/crawler.js.map +1 -1
  29. package/dist/core/cycle-fetch.d.ts +27 -0
  30. package/dist/core/cycle-fetch.d.ts.map +1 -0
  31. package/dist/core/cycle-fetch.js +99 -0
  32. package/dist/core/cycle-fetch.js.map +1 -0
  33. package/dist/core/domain-extractors.d.ts.map +1 -1
  34. package/dist/core/domain-extractors.js +754 -14
  35. package/dist/core/domain-extractors.js.map +1 -1
  36. package/dist/core/google-cache.d.ts +30 -0
  37. package/dist/core/google-cache.d.ts.map +1 -0
  38. package/dist/core/google-cache.js +181 -0
  39. package/dist/core/google-cache.js.map +1 -0
  40. package/dist/core/markdown.d.ts +11 -0
  41. package/dist/core/markdown.d.ts.map +1 -1
  42. package/dist/core/markdown.js +43 -0
  43. package/dist/core/markdown.js.map +1 -1
  44. package/dist/core/peel-tls.d.ts +26 -0
  45. package/dist/core/peel-tls.d.ts.map +1 -0
  46. package/dist/core/peel-tls.js +221 -0
  47. package/dist/core/peel-tls.js.map +1 -0
  48. package/dist/core/pipeline.d.ts +5 -1
  49. package/dist/core/pipeline.d.ts.map +1 -1
  50. package/dist/core/pipeline.js +269 -21
  51. package/dist/core/pipeline.js.map +1 -1
  52. package/dist/core/schema-postprocess.d.ts +33 -0
  53. package/dist/core/schema-postprocess.d.ts.map +1 -0
  54. package/dist/core/schema-postprocess.js +470 -0
  55. package/dist/core/schema-postprocess.js.map +1 -0
  56. package/dist/core/schema-templates.d.ts +20 -0
  57. package/dist/core/schema-templates.d.ts.map +1 -0
  58. package/dist/core/schema-templates.js +131 -0
  59. package/dist/core/schema-templates.js.map +1 -0
  60. package/dist/core/search-fallback.d.ts +28 -0
  61. package/dist/core/search-fallback.d.ts.map +1 -0
  62. package/dist/core/search-fallback.js +185 -0
  63. package/dist/core/search-fallback.js.map +1 -0
  64. package/dist/core/search-provider.d.ts +47 -4
  65. package/dist/core/search-provider.d.ts.map +1 -1
  66. package/dist/core/search-provider.js +278 -7
  67. package/dist/core/search-provider.js.map +1 -1
  68. package/dist/core/stealth-patches.d.ts +58 -0
  69. package/dist/core/stealth-patches.d.ts.map +1 -0
  70. package/dist/core/stealth-patches.js +340 -0
  71. package/dist/core/stealth-patches.js.map +1 -0
  72. package/dist/core/strategies.d.ts +20 -0
  73. package/dist/core/strategies.d.ts.map +1 -1
  74. package/dist/core/strategies.js +284 -48
  75. package/dist/core/strategies.js.map +1 -1
  76. package/dist/core/strategy-hooks.d.ts +1 -1
  77. package/dist/core/strategy-hooks.d.ts.map +1 -1
  78. package/dist/index.d.ts +11 -0
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +37 -15
  81. package/dist/index.js.map +1 -1
  82. package/dist/mcp/server.js +109 -4
  83. package/dist/mcp/server.js.map +1 -1
  84. package/dist/server/app.d.ts.map +1 -1
  85. package/dist/server/app.js +29 -0
  86. package/dist/server/app.js.map +1 -1
  87. package/dist/server/middleware/rate-limit.d.ts +2 -1
  88. package/dist/server/middleware/rate-limit.d.ts.map +1 -1
  89. package/dist/server/middleware/rate-limit.js +24 -8
  90. package/dist/server/middleware/rate-limit.js.map +1 -1
  91. package/dist/server/routes/agent.d.ts +4 -0
  92. package/dist/server/routes/agent.d.ts.map +1 -1
  93. package/dist/server/routes/agent.js +196 -9
  94. package/dist/server/routes/agent.js.map +1 -1
  95. package/dist/server/routes/batch.js +5 -5
  96. package/dist/server/routes/batch.js.map +1 -1
  97. package/dist/server/routes/compat.d.ts.map +1 -1
  98. package/dist/server/routes/compat.js +1 -0
  99. package/dist/server/routes/compat.js.map +1 -1
  100. package/dist/server/routes/fetch.d.ts.map +1 -1
  101. package/dist/server/routes/fetch.js +60 -6
  102. package/dist/server/routes/fetch.js.map +1 -1
  103. package/dist/server/routes/mcp.d.ts.map +1 -1
  104. package/dist/server/routes/mcp.js +103 -2
  105. package/dist/server/routes/mcp.js.map +1 -1
  106. package/dist/server/routes/search.js +1 -1
  107. package/dist/server/routes/search.js.map +1 -1
  108. package/dist/types.d.ts +55 -4
  109. package/dist/types.d.ts.map +1 -1
  110. package/dist/types.js +4 -1
  111. package/dist/types.js.map +1 -1
  112. package/llms.txt +55 -125
  113. package/package.json +15 -1
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Search-as-proxy fallback for blocked pages.
3
+ * When a protected site blocks our fetch, we search for the URL
4
+ * and extract info from search engine snippets.
5
+ *
6
+ * Uses the getBestSearchProvider() chain which handles:
7
+ * Google CSE API → Brave API → Google stealth → DDG
8
+ * This avoids direct HTML scraping which is blocked by CAPTCHAs on datacenter IPs.
9
+ */
10
+ export interface SearchFallbackResult {
11
+ title: string;
12
+ snippet: string;
13
+ cachedContent: string;
14
+ source: 'duckduckgo' | 'google' | 'none';
15
+ /** Extracted product data if this looks like a product page */
16
+ productData?: {
17
+ price?: string;
18
+ rating?: string;
19
+ brand?: string;
20
+ availability?: string;
21
+ };
22
+ }
23
+ /**
24
+ * Search for a URL using the best available search provider and extract the snippet.
25
+ * Returns the title, snippet, and any extracted product data.
26
+ */
27
+ export declare function searchFallback(url: string): Promise<SearchFallbackResult>;
28
+ //# sourceMappingURL=search-fallback.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"search-fallback.d.ts","sourceRoot":"","sources":["../../src/core/search-fallback.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE,YAAY,GAAG,QAAQ,GAAG,MAAM,CAAC;IACzC,+DAA+D;IAC/D,WAAW,CAAC,EAAE;QACZ,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,YAAY,CAAC,EAAE,MAAM,CAAC;KACvB,CAAC;CACH;AAmID;;;GAGG;AACH,wBAAsB,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC,CA4D/E"}
@@ -0,0 +1,185 @@
1
+ /**
2
+ * Search-as-proxy fallback for blocked pages.
3
+ * When a protected site blocks our fetch, we search for the URL
4
+ * and extract info from search engine snippets.
5
+ *
6
+ * Uses the getBestSearchProvider() chain which handles:
7
+ * Google CSE API → Brave API → Google stealth → DDG
8
+ * This avoids direct HTML scraping which is blocked by CAPTCHAs on datacenter IPs.
9
+ */
10
+ import { getBestSearchProvider } from './search-provider.js';
11
+ /**
12
+ * Detect if a URL is likely a product page.
13
+ */
14
function isProductUrl(url) {
    // Heuristic: a URL counts as a product page when it is served by a known
    // retailer OR its path carries a typical product marker. Unparseable
    // input is never a product URL.
    try {
        const { hostname, pathname } = new URL(url);
        const path = pathname.toLowerCase();
        const productHosts = ['bestbuy.com', 'walmart.com', 'amazon.com', 'target.com', 'newegg.com', 'ebay.com'];
        // Match the retailer domain on label boundaries only. A raw substring
        // test would also accept unrelated hosts such as "notebay.com".
        // Regional variants ("www.ebay.com.au", "amazon.com.mx") still match.
        const matchesDomain = (domain) => hostname === domain ||
            hostname.endsWith(`.${domain}`) ||
            hostname.startsWith(`${domain}.`) ||
            hostname.includes(`.${domain}.`);
        const isProductHost = productHosts.some(matchesDomain);
        // Path markers: Amazon (/dp/), Walmart (/ip/), Best Buy (/site/),
        // Target (/p/), generic "/product…", or a path segment that starts
        // with a numeric id of five or more digits.
        const hasProductPath = ['/dp/', '/ip/', '/site/', '/p/', '/product'].some((marker) => path.includes(marker)) ||
            /\/\d{5,}/.test(path);
        return isProductHost || hasProductPath;
    }
    catch {
        return false;
    }
}
33
+ /**
34
+ * Build an optimized search query for the given URL.
35
+ * For product pages, adds price/availability keywords for richer snippets.
36
+ */
37
function buildSearchQuery(url) {
    // Turns a page URL into a search-engine query. Product pages get a quoted
    // hostname plus a "price" keyword; everything else gets a `site:` query.
    // If the URL cannot be parsed, the raw input is used as the query.
    try {
        const parsed = new URL(url);
        // Keep at most six meaningful path words: longer than two characters,
        // not purely numeric, and not a one/two-letter lowercase token.
        const isMeaningful = (token) => {
            if (token.length <= 2)
                return false;
            if (/^\d+$/.test(token))
                return false;
            return !/^[a-z]{1,2}$/.test(token);
        };
        const tokens = parsed.pathname.split(/[-/_]/).filter(isMeaningful);
        const pathTerms = tokens.slice(0, 6).join(' ');
        const query = isProductUrl(url)
            ? `"${parsed.hostname}" ${pathTerms} price`
            : `site:${parsed.hostname} ${pathTerms}`;
        return query.trim();
    }
    catch {
        return url;
    }
}
57
+ /**
58
+ * Extract product-specific data from a search snippet.
59
+ * Google/Brave often include price, rating, availability in rich snippets.
60
+ */
61
function extractProductData(title, snippet) {
    // Pulls price, rating, availability and brand out of a search result's
    // title + snippet text. Returns undefined when none of price, rating or
    // availability is found (a brand on its own is not enough); otherwise an
    // object whose missing fields are undefined.
    const text = `${title} ${snippet}`;
    // Price: "$5", "$19.99", "$1,299.99". Thousands groups must be complete,
    // so a trailing comma ("$5, great") is not swallowed into the price.
    const priceMatch = text.match(/\$\d+(?:,\d{3})*(?:\.\d{2})?/);
    const price = priceMatch ? priceMatch[0] : '';
    // Rating: "4.5 out of 5", "4.5/5", "4.5 stars", "4★".
    // The lookbehind stops the match from starting in the middle of a number
    // ("4.85 stars" must not yield 5/5, "10 stars" must not yield 0/5); the
    // leading digit is limited to 0-5 because the value is reported as "/5".
    const ratingMatch = text.match(/(?<![\d.,])([0-5](?:\.\d+)?)\s*(?:out of 5|\/5|★|\bstars?\b)/i);
    const rating = ratingMatch ? `${ratingMatch[1]}/5` : '';
    // Availability: in stock, out of stock, available, ...
    const availMatch = text.match(/\b(in stock|out of stock|available|sold out|ships in|pre-order)\b/i);
    const availability = availMatch ? availMatch[0] : '';
    // Brand: "by BrandName", "Brand: Name", "sold by Name".
    // Keywords are matched case-insensitively, but the brand itself has to
    // start with a capital letter. The /i flag would otherwise defeat the
    // [A-Z] requirement, so it is re-checked per candidate. \b keeps "by"
    // from matching inside words such as "Nearby".
    let brand = '';
    const brandPattern = /\b(?:sold by|by|brand:)\s+([A-Z][a-zA-Z\s&]{2,20}?)(?:\s*[|·•,]|\s+(?:at|on|for|price))/gi;
    for (const candidate of text.matchAll(brandPattern)) {
        if (/^[A-Z]/.test(candidate[1])) {
            brand = candidate[1].trim();
            break;
        }
    }
    const hasData = price || rating || availability;
    if (!hasData)
        return undefined;
    return {
        price: price || undefined,
        rating: rating || undefined,
        brand: brand || undefined,
        availability: availability || undefined,
    };
}
95
+ /**
96
+ * Build a rich markdown content block from search results.
97
+ * Formats differently for product pages vs general pages.
98
+ */
99
function buildCachedContent(url, title, snippet, productData) {
    // Renders the search hit as a small markdown document: heading, optional
    // product facts (price, brand, rating, availability — in that order),
    // optional snippet paragraph, then a footer naming the source URL and
    // warning that the content is limited.
    const productLines = productData
        ? [
            productData.price && `**Price:** ${productData.price}`,
            productData.brand && `**Brand:** ${productData.brand}`,
            productData.rating && `**Rating:** ${productData.rating}`,
            productData.availability && `**Availability:** ${productData.availability}`,
        ].filter(Boolean).concat('') // blank line closes the product section
        : [];
    const snippetLines = snippet ? [snippet, ''] : [];
    const footer = [
        `---`,
        `*Source: Search engine snippet for ${url}*`,
        `*⚠️ Limited content — original page blocked direct access. For full data, configure GOOGLE_SEARCH_KEY or BRAVE_SEARCH_KEY.*`,
    ];
    return [`# ${title}`, '', ...productLines, ...snippetLines, ...footer].join('\n');
}
124
+ /**
125
+ * Search for a URL using the best available search provider and extract the snippet.
126
+ * Returns the title, snippet, and any extracted product data.
127
+ */
128
export async function searchFallback(url) {
    // Looks the URL up through the best available search provider and turns
    // the top hit into a fallback result. Never rejects: invalid URLs, empty
    // result sets, blank hits and provider errors all resolve to the
    // "nothing found" value with source 'none'.
    const nothingFound = { title: '', snippet: '', cachedContent: '', source: 'none' };
    try {
        // Reject unparseable URLs before spending a search request on them.
        try {
            new URL(url);
        }
        catch {
            return nothingFound;
        }
        const query = buildSearchQuery(url);
        const best = getBestSearchProvider();
        const hits = await best.provider.searchWeb(query, { count: 3, apiKey: best.apiKey });
        if (!hits || hits.length === 0) {
            return nothingFound;
        }
        // Only the first hit is used.
        const first = hits[0];
        const title = first.title?.trim() || '';
        const snippet = first.snippet?.trim() || '';
        if (!title && !snippet) {
            return nothingFound;
        }
        let productData;
        if (isProductUrl(url)) {
            productData = extractProductData(title, snippet);
        }
        // Provider ids are collapsed onto the source labels used in the
        // result; ids missing from the table are reported as 'google'.
        const sourceByProvider = {
            duckduckgo: 'duckduckgo',
            brave: 'google',
            stealth: 'duckduckgo',
            google: 'google',
        };
        return {
            title,
            snippet,
            cachedContent: buildCachedContent(url, title, snippet, productData),
            source: sourceByProvider[best.provider.id] ?? 'google',
            productData,
        };
    }
    catch (e) {
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Search fallback failed:', e instanceof Error ? e.message : e);
        }
        return nothingFound;
    }
}
185
+ //# sourceMappingURL=search-fallback.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"search-fallback.js","sourceRoot":"","sources":["../../src/core/search-fallback.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,qBAAqB,EAAE,MAAM,sBAAsB,CAAC;AAgB7D;;GAEG;AACH,SAAS,YAAY,CAAC,GAAW;IAC/B,IAAI,CAAC;QACH,MAAM,CAAC,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QACvB,MAAM,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC;QACxB,MAAM,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;QAEtC,MAAM,YAAY,GAAG,CAAC,aAAa,EAAE,aAAa,EAAE,YAAY,EAAE,YAAY,EAAE,YAAY,EAAE,UAAU,CAAC,CAAC;QAC1G,MAAM,aAAa,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,MAAM,cAAc,GAClB,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;YACrB,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC;YACrB,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC;YACvB,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC;YACpB,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC;YACzB,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAExB,OAAO,aAAa,IAAI,cAAc,CAAC;IACzC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,GAAW;IACnC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAE5B,iEAAiE;QACjE,MAAM,SAAS,GAAG,MAAM,CAAC,QAAQ;aAC9B,KAAK,CAAC,OAAO,CAAC;aACd,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;aACxE,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;aACX,IAAI,CAAC,GAAG,CAAC,CAAC;QAEb,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC;QAEpC,IAAI,SAAS,EAAE,CAAC;YACd,kEAAkE;YAClE,OAAO,IAAI,MAAM,CAAC,QAAQ,KAAK,SAAS,QAAQ,CAAC,IAAI,EAAE,CAAC;QAC1D,CAAC;QAED,OAAO,QAAQ,MAAM,CAAC,QAAQ,IAAI,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC;IACvD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,GAAG,CAAC;IACb,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,SAAS,kBAAkB,CAAC,KAAa,EAAE,OAAe;IACxD,MAAM,IAAI,GAAG,GAAG,KAAK,IAAI,OAAO,EAAE,CAAC;IACnC,MAAM,IAAI,GAA+D;QACvE,KAAK,EAAE,EAAE;QACT,MAAM,EAAE,EAAE;QACV,KAAK,EAAE,EAAE;QACT,YAAY,EAAE,EAAE;KACjB,CAAC;IAEF,6BAA6B;IAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;IACtD,IAAI,UAAU;QAAE,IAAI,CAAC,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAE3C,+CAA+C;IAC/C,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,C
AAC,gDAAgD,CAAC,CAAC;IACjF,IAAI,WAAW;QAAE,IAAI,CAAC,MAAM,GAAG,GAAG,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC;IAErD,kDAAkD;IAClD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,oEAAoE,CAAC,CAAC;IACpG,IAAI,UAAU;QAAE,IAAI,CAAC,YAAY,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAElD,uDAAuD;IACvD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,wFAAwF,CAAC,CAAC;IACxH,IAAI,UAAU;QAAE,IAAI,CAAC,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAElD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,YAAY,CAAC;IAC/D,IAAI,CAAC,OAAO;QAAE,OAAO,SAAS,CAAC;IAE/B,OAAO;QACL,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,SAAS;QAC9B,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,SAAS;QAChC,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,SAAS;QAC9B,YAAY,EAAE,IAAI,CAAC,YAAY,IAAI,SAAS;KAC7C,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,SAAS,kBAAkB,CACzB,GAAW,EACX,KAAa,EACb,OAAe,EACf,WAAiD;IAEjD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;IACzB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,iDAAiD;IACjD,IAAI,WAAW,EAAE,CAAC;QAChB,IAAI,WAAW,CAAC,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,cAAc,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC;QACrE,IAAI,WAAW,CAAC,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,cAAc,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC;QACrE,IAAI,WAAW,CAAC,MAAM;YAAE,KAAK,CAAC,IAAI,CAAC,eAAe,WAAW,CAAC,MAAM,EAAE,CAAC,CAAC;QACxE,IAAI,WAAW,CAAC,YAAY;YAAE,KAAK,CAAC,IAAI,CAAC,qBAAqB,WAAW,CAAC,YAAY,EAAE,CAAC,CAAC;QAC1F,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,IAAI,OAAO,EAAE,CAAC;QACZ,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACpB,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAClB,KAAK,CAAC,IAAI,CAAC,sCAAsC,GAAG,GAAG,CAAC,CAAC;IACzD,KAAK,CAAC,IAAI,CAAC,6HAA6H,CAAC,CAAC;IAE1I,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,GAAW;IAC9C,MAAM,WAAW,GAAyB;QACxC,KAAK,EAAE,EAAE;QACT,OAAO,EAAE,EAAE;QACX,aAAa,EAAE,EAAE;QACjB,MAAM,EAAE,MAAM;KACf,CAAC;IAEF,IAAI,CAAC;QACH,oEAAoE;QACpE,IAAI,CAAC;YAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC;YAAC,OAAO,WAAW,CAAC;QAAC,CAAC;QAEnD,MAAM,WAAW,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC;QAC1C,MAAM
,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,qBAAqB,EAAE,CAAC;QAErD,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,SAAS,CAAC,WAAW,EAAE;YACpD,KAAK,EAAE,CAAC;YACR,MAAM;SACP,CAAC,CAAC;QAEH,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACrC,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,MAAM,SAAS,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QAC7B,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAC5C,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAEhD,IAAI,CAAC,KAAK,IAAI,CAAC,OAAO,EAAE,CAAC;YACvB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,MAAM,WAAW,GAAG,YAAY,CAAC,GAAG,CAAC;YACnC,CAAC,CAAC,kBAAkB,CAAC,KAAK,EAAE,OAAO,CAAC;YACpC,CAAC,CAAC,SAAS,CAAC;QAEd,MAAM,aAAa,GAAG,kBAAkB,CAAC,GAAG,EAAE,KAAK,EAAE,OAAO,EAAE,WAAW,CAAC,CAAC;QAE3E,qCAAqC;QACrC,MAAM,SAAS,GAA4C;YACzD,UAAU,EAAE,YAAY;YACxB,KAAK,EAAE,QAAQ;YACf,OAAO,EAAE,YAAY;YACrB,MAAM,EAAE,QAAQ;SACjB,CAAC;QACF,MAAM,MAAM,GAAG,SAAS,CAAC,QAAQ,CAAC,EAAE,CAAC,IAAI,QAAQ,CAAC;QAElD,OAAO;YACL,KAAK;YACL,OAAO;YACP,aAAa;YACb,MAAM;YACN,WAAW;SACZ,CAAC;IACJ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC;YACtB,OAAO,CAAC,KAAK,CAAC,WAAW,EAAE,yBAAyB,EAAE,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5F,CAAC;QACD,OAAO,WAAW,CAAC;IACrB,CAAC;AACH,CAAC"}
@@ -12,7 +12,7 @@
12
12
  * In production with no API keys configured, getBestSearchProvider() returns
13
13
  * StealthSearchProvider since DDG HTTP is often blocked on datacenter IPs.
14
14
  */
15
- export type SearchProviderId = 'duckduckgo' | 'brave' | 'stealth';
15
+ export type SearchProviderId = 'duckduckgo' | 'brave' | 'stealth' | 'google';
16
16
  export interface WebSearchResult {
17
17
  title: string;
18
18
  url: string;
@@ -86,11 +86,54 @@ export declare class BraveSearchProvider implements SearchProvider {
86
86
  readonly requiresApiKey = true;
87
87
  searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
88
88
  }
89
+ /**
90
+ * GoogleSearchProvider — Google Search via stealth browser or Custom Search JSON API
91
+ *
92
+ * Two modes:
93
+ * 1. Custom Search JSON API (BYOK): set GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX env vars.
94
+ * Reliable, structured, 100 free queries/day. Works from any IP.
95
+ * 2. Stealth browser scraping (no API key): uses playwright-extra stealth plugin to
96
+ * scrape google.com/search directly. Works from datacenter IPs where DDG/Bing/Ecosia
97
+ * are blocked. Gracefully returns [] if Playwright is unavailable.
98
+ *
99
+ * Docs: https://developers.google.com/custom-search/v1/overview
100
+ */
101
+ export declare class GoogleSearchProvider implements SearchProvider {
102
+ readonly id: SearchProviderId;
103
+ /**
104
+ * requiresApiKey is false: works without API keys via stealth browser fallback.
105
+ */
106
+ readonly requiresApiKey = false;
107
+ /**
108
+ * Map standard freshness values to Google's dateRestrict format.
109
+ * Google dateRestrict: d[n]=past n days, w[n]=past n weeks,
110
+ * m[n]=past n months, y[n]=past n years.
111
+ */
112
+ private mapFreshnessToDateRestrict;
113
+ /** Validate URL; returns null if invalid/non-http */
114
+ private validateUrl;
115
+ /**
116
+ * Stealth browser scrape of google.com/search.
117
+ * Used when no Custom Search API key is configured.
118
+ * Strategy A: peel() with stealth rendering (consistent with StealthSearchProvider).
119
+ * Strategy B: direct playwright-extra launch (if peel returns no results).
120
+ */
121
+ private scrapeGoogleStealth;
122
+ /** Parse Google search result HTML using cheerio. No DOM lib types required. */
123
+ private _parseGoogleHtml;
124
+ searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
125
+ }
89
126
  export declare function getSearchProvider(id: SearchProviderId | undefined): SearchProvider;
90
127
  /**
91
- * Get the best available search provider based on configured API keys.
92
- * In production with no API keys configured, returns StealthSearchProvider
93
- * since DDG HTTP is often blocked on datacenter/server IPs.
128
+ * Get the best available search provider based on configured API keys and
129
+ * available runtime dependencies.
130
+ *
131
+ * Priority:
132
+ * 1. Google Custom Search JSON API (if GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX set)
133
+ * 2. Brave Search (if BRAVE_SEARCH_KEY is set)
134
+ * 3. Google stealth browser scraping (works from datacenter IPs; no API key needed)
135
+ * — only when playwright-extra is available in node_modules
136
+ * 4. DuckDuckGo with full fallback chain (DDG HTTP → DDG Lite → stealth multi-engine)
94
137
  */
95
138
  export declare function getBestSearchProvider(): {
96
139
  provider: SearchProvider;
@@ -1 +1 @@
1
- {"version":3,"file":"search-provider.d.ts","sourceRoot":"","sources":["../../src/core/search-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAKH,MAAM,MAAM,gBAAgB,GAAG,YAAY,GAAG,OAAO,GAAG,SAAS,CAAC;AAElE,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,gBAAgB;IAC/B,+BAA+B;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,iEAAiE;IACjE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,yCAAyC;IACzC,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,wCAAwC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2BAA2B;IAC3B,MAAM,CAAC,EAAE,WAAW,CAAC;CACtB;AAED,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAC;IAC9B,QAAQ,CAAC,cAAc,EAAE,OAAO,CAAC;IAEjC,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC;CACjF;AAwED;;;;;;;;GAQG;AACH,qBAAa,qBAAsB,YAAW,cAAc;IAC1D,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAa;IAC1C,QAAQ,CAAC,cAAc,SAAS;IAEhC,qEAAqE;IACrE,OAAO,CAAC,WAAW;IAUnB;;;OAGG;YACW,SAAS;IAuDvB;;;OAGG;YACW,UAAU;IAqDxB;;;OAGG;YACW,YAAY;IA2DpB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CA8BtF;AAED,qBAAa,kBAAmB,YAAW,cAAc;IACvD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAgB;IAC7C,QAAQ,CAAC,cAAc,SAAS;IAEhC,OAAO,CAAC,kBAAkB;IA8C1B,OAAO,CAAC,cAAc;YAoBR,UAAU;IAuFxB;;;OAGG;YACW,UAAU;IAiElB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CAyCtF;AAED,qBAAa,mBAAoB,YAAW,cAAc;IACxD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAW;IACxC,QAAQ,CAAC,cAAc,QAAQ;IAEzB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CA8DtF;AAED,wBAAgB,iBAAiB,CAAC,EAAE,EAAE,gBAAgB,GAAG,SAAS,GAAG,cAAc,CAOlF;AAED;;;;GAIG;AACH,wBAAgB,qBAAqB,IAAI;IAAE,QAAQ,EAAE,cAAc,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAWrF"}
1
+ {"version":3,"file":"search-provider.d.ts","sourceRoot":"","sources":["../../src/core/search-provider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAOH,MAAM,MAAM,gBAAgB,GAAG,YAAY,GAAG,OAAO,GAAG,SAAS,GAAG,QAAQ,CAAC;AAE7E,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,gBAAgB;IAC/B,+BAA+B;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,iEAAiE;IACjE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,yCAAyC;IACzC,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,wCAAwC;IACxC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2BAA2B;IAC3B,MAAM,CAAC,EAAE,WAAW,CAAC;CACtB;AAED,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAC;IAC9B,QAAQ,CAAC,cAAc,EAAE,OAAO,CAAC;IAEjC,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC;CACjF;AAwED;;;;;;;;GAQG;AACH,qBAAa,qBAAsB,YAAW,cAAc;IAC1D,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAa;IAC1C,QAAQ,CAAC,cAAc,SAAS;IAEhC,qEAAqE;IACrE,OAAO,CAAC,WAAW;IAUnB;;;OAGG;YACW,SAAS;IAuDvB;;;OAGG;YACW,UAAU;IAqDxB;;;OAGG;YACW,YAAY;IA2DpB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CA8BtF;AAED,qBAAa,kBAAmB,YAAW,cAAc;IACvD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAgB;IAC7C,QAAQ,CAAC,cAAc,SAAS;IAEhC,OAAO,CAAC,kBAAkB;IA8C1B,OAAO,CAAC,cAAc;YAoBR,UAAU;IAuFxB;;;OAGG;YACW,UAAU;IAiElB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CAyCtF;AAED,qBAAa,mBAAoB,YAAW,cAAc;IACxD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAW;IACxC,QAAQ,CAAC,cAAc,QAAQ;IAEzB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CA8DtF;AAED;;;;;;;;;;;GAWG;AACH,qBAAa,oBAAqB,YAAW,cAAc;IACzD,QAAQ,CAAC,EAAE,EAAE,gBAAgB,CAAY;IACzC;;OAEG;IACH,QAAQ,CAAC,cAAc,SAAS;IAEhC;;;;OAIG;IACH,OAAO,CAAC,0BAA0B;IAWlC,qDAAqD;IACrD,OAAO,CAAC,WAAW;IAUnB;;;;;OAKG;YACW,mBAAmB;IA4HjC,gFAAgF;IAChF,OAAO,CAAC,gBAAgB;IA4ClB,SAAS,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;CAuCtF;AAED,wBAAgB,iBAAiB,CAAC,EAAE,EAAE,gBAAgB,GAAG,SAAS,GAAG,cAAc,CAQlF;AAkBD;;;;;;;;;;GAUG;AACH,wBAAgB,qBAA
qB,IAAI;IAAE,QAAQ,EAAE,cAAc,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAuBrF"}
@@ -14,6 +14,8 @@
14
14
  */
15
15
  import { fetch as undiciFetch } from 'undici';
16
16
  import { load } from 'cheerio';
17
+ import { existsSync } from 'fs';
18
+ import { resolve as pathResolve } from 'path';
17
19
  function decodeHtmlEntities(input) {
18
20
  // Cheerio usually decodes entities when using `.text()`, but keep this as a
19
21
  // safety net since DuckDuckGo snippets sometimes leak encoded entities.
@@ -596,6 +598,243 @@ export class BraveSearchProvider {
596
598
  return results;
597
599
  }
598
600
  }
601
+ /**
602
+ * GoogleSearchProvider — Google Search via stealth browser or Custom Search JSON API
603
+ *
604
+ * Two modes:
605
+ * 1. Custom Search JSON API (BYOK): set GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX env vars.
606
+ * Reliable, structured, 100 free queries/day. Works from any IP.
607
+ * 2. Stealth browser scraping (no API key): uses playwright-extra stealth plugin to
608
+ * scrape google.com/search directly. Works from datacenter IPs where DDG/Bing/Ecosia
609
+ * are blocked. Gracefully returns [] if Playwright is unavailable.
610
+ *
611
+ * Docs: https://developers.google.com/custom-search/v1/overview
612
+ */
613
export class GoogleSearchProvider {
    /** Provider identifier returned to callers. */
    id = 'google';
    /**
     * requiresApiKey is false: works without API keys via stealth browser fallback.
     */
    requiresApiKey = false;
    /**
     * Map standard freshness values to Google's dateRestrict format.
     * Google dateRestrict: d[n]=past n days, w[n]=past n weeks,
     * m[n]=past n months, y[n]=past n years.
     *
     * @param tbs freshness code: 'pd' | 'pw' | 'pm' | 'py'
     * @returns the dateRestrict value, or undefined for missing/unknown codes
     */
    mapFreshnessToDateRestrict(tbs) {
        if (!tbs)
            return undefined;
        // A Map (not an object literal) so that inherited keys such as
        // "constructor" or "toString" can never be returned as a value.
        const dateRestrictByFreshness = new Map([
            ['pd', 'd1'],
            ['pw', 'w1'],
            ['pm', 'm1'],
            ['py', 'y1'],
        ]);
        return dateRestrictByFreshness.get(tbs);
    }
    /** Validate URL; returns the normalized href, or null if invalid/non-http */
    validateUrl(rawUrl) {
        try {
            const parsed = new URL(rawUrl);
            if (!['http:', 'https:'].includes(parsed.protocol))
                return null;
            return parsed.href;
        }
        catch {
            return null;
        }
    }
    /**
     * Stealth browser scrape of google.com/search.
     * Used when no Custom Search API key is configured.
     * Strategy A: peel() with stealth rendering (consistent with StealthSearchProvider).
     * Strategy B: direct playwright-extra launch (if peel returns no results).
     *
     * Never rejects: every failure is logged (when DEBUG is set) and ends in
     * the next strategy or an empty array.
     *
     * @param query search query text
     * @param count maximum number of results to return
     * @returns up to `count` results; [] when both strategies fail
     */
    async scrapeGoogleStealth(query, count) {
        // Built lazily inside each try block so a construction failure is
        // handled like any other strategy failure.
        const buildSearchUrl = () => {
            const params = new URLSearchParams({
                q: query,
                // Over-fetch: some blocks are dropped by filtering and de-duplication.
                num: String(Math.min(count * 2, 20)),
                hl: 'en',
                gl: 'us',
            });
            return `https://www.google.com/search?${params.toString()}`;
        };
        const describeError = (e) => (e instanceof Error ? e.message : e);
        // Strategy A: peel() + cheerio parse
        let peelTimer;
        try {
            const { peel } = await import('../index.js');
            const url = buildSearchUrl();
            const peelTimeout = new Promise((_, reject) => {
                peelTimer = setTimeout(() => reject(new Error('Google stealth peel timeout')), 20_000);
            });
            const result = await Promise.race([
                peel(url, { render: true, stealth: true, format: 'html', wait: 3000 }),
                peelTimeout,
            ]);
            const html = result.content || '';
            if (html) {
                // Same parser as Strategy B, so both paths filter identically.
                const results = this._parseGoogleHtml(html, count);
                if (results.length > 0)
                    return results;
            }
        }
        catch (e) {
            if (process.env.DEBUG) {
                console.debug('[webpeel] Google stealth (peel) error:', describeError(e));
            }
        }
        finally {
            // Do not leave the 20 s timer pending once the race has settled.
            clearTimeout(peelTimer);
        }
        // Strategy B: direct playwright-extra + stealth plugin
        let browser;
        let context;
        let page;
        try {
            const pwExtra = await import('playwright-extra');
            const StealthPlugin = (await import('puppeteer-extra-plugin-stealth')).default;
            const stealthChromium = pwExtra.chromium;
            stealthChromium.use(StealthPlugin());
            const url = buildSearchUrl();
            browser = await stealthChromium.launch({
                headless: true,
                args: [
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-gpu',
                ],
            });
            context = await browser.newContext({
                userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                viewport: { width: 1280, height: 720 },
                locale: 'en-US',
            });
            page = await context.newPage();
            await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 15_000 });
            // Use page.content() + cheerio to avoid needing DOM lib types in tsconfig
            const html = await page.content();
            return this._parseGoogleHtml(html, count);
        }
        catch (e) {
            if (process.env.DEBUG) {
                console.debug('[webpeel] Google stealth (playwright) error:', describeError(e));
            }
            return [];
        }
        finally {
            // Best-effort teardown, innermost resource first.
            await page?.close().catch(() => { });
            await context?.close().catch(() => { });
            await browser?.close().catch(() => { });
        }
    }
    /**
     * Parse Google search result HTML using cheerio. No DOM lib types required.
     *
     * @param html raw HTML of a Google results page
     * @param count maximum number of results to return
     * @returns de-duplicated { title, url, snippet } results, at most `count`
     */
    _parseGoogleHtml(html, count) {
        const $ = load(html);
        const results = [];
        const seen = new Set();
        // Multiple selector patterns for resilience across Google HTML variants
        const resultBlocks = $('#search .g, #rso .g, [data-hveid] .g');
        resultBlocks.each((_i, elem) => {
            if (results.length >= count)
                return;
            const $r = $(elem);
            const $a = $r.find('a[href^="http"]').first();
            const $h3 = $r.find('h3').first();
            // A usable result needs both an absolute link and a heading.
            if (!$a.length || !$h3.length)
                return;
            const href = $a.attr('href') || '';
            // Skip Google's own links, account links and ad click-throughs.
            if (href.includes('google.com/') ||
                href.includes('accounts.google') ||
                href.includes('/aclk') ||
                href.startsWith('#'))
                return;
            const validated = this.validateUrl(href);
            if (!validated)
                return;
            const key = normalizeUrlForDedupe(validated);
            if (seen.has(key))
                return;
            seen.add(key);
            const title = cleanText($h3.text(), { maxLen: 200 });
            if (!title)
                return;
            // Snippet containers tried in order; the first non-empty text wins.
            const snippetText = $r.find('[data-sncf]').first().text() ||
                $r.find('.VwiC3b').first().text() ||
                $r.find('[style*="-webkit-line-clamp"]').first().text() ||
                $r.find('.st').first().text() ||
                '';
            const snippet = cleanText(snippetText, { maxLen: 500, stripEllipsisPadding: true });
            results.push({ title, url: validated, snippet });
        });
        return results.slice(0, count);
    }
    /**
     * Run a web search.
     *
     * With an API key (options.apiKey, GOOGLE_SEARCH_KEY or GOOGLE_API_KEY)
     * AND GOOGLE_SEARCH_CX the Custom Search JSON API is used; otherwise the
     * stealth scraping path is taken.
     *
     * @param query search query text
     * @param options { count, apiKey?, tbs? } — tbs is a freshness code
     * @returns results as { url, title, snippet }
     * @throws Error when the Custom Search API answers with a non-2xx status
     */
    async searchWeb(query, options) {
        const { count, apiKey: optApiKey, tbs } = options;
        const apiKey = optApiKey || process.env.GOOGLE_SEARCH_KEY || process.env.GOOGLE_API_KEY;
        const cx = process.env.GOOGLE_SEARCH_CX;
        // No API key — fall back to stealth browser scraping
        if (!apiKey || !cx) {
            return this.scrapeGoogleStealth(query, count);
        }
        // Custom Search JSON API path. `num` must be an integer from 1 to 10,
        // so clamp it and fall back to 10 when count is not a finite number.
        const num = Number.isFinite(count)
            ? Math.max(1, Math.min(Math.trunc(count), 10))
            : 10;
        const params = new URLSearchParams({
            key: apiKey,
            cx: cx,
            q: query,
            num: String(num),
        });
        const dateRestrict = this.mapFreshnessToDateRestrict(tbs);
        if (dateRestrict)
            params.set('dateRestrict', dateRestrict);
        const response = await fetch(`https://www.googleapis.com/customsearch/v1?${params}`, {
            signal: AbortSignal.timeout(10000),
        });
        if (!response.ok) {
            const text = await response.text();
            throw new Error(`Google search failed (${response.status}): ${text.substring(0, 200)}`);
        }
        const data = await response.json();
        return (data.items || []).map((item) => ({
            url: item.link,
            title: item.title,
            snippet: item.snippet || '',
        }));
    }
}
599
838
  export function getSearchProvider(id) {
600
839
  if (!id || id === 'duckduckgo')
601
840
  return new DuckDuckGoProvider();
@@ -603,23 +842,55 @@ export function getSearchProvider(id) {
603
842
  return new BraveSearchProvider();
604
843
  if (id === 'stealth')
605
844
  return new StealthSearchProvider();
845
+ if (id === 'google')
846
+ return new GoogleSearchProvider();
606
847
  // Exhaustive fallback (should be unreachable due to typing)
607
848
  return new DuckDuckGoProvider();
608
849
  }
609
850
  /**
610
- * Get the best available search provider based on configured API keys.
611
- * In production with no API keys configured, returns StealthSearchProvider
612
- * since DDG HTTP is often blocked on datacenter/server IPs.
851
+ * Check whether playwright-extra is available synchronously.
852
+ * Uses fs.existsSync on node_modules to avoid making getBestSearchProvider async.
853
+ */
854
+ function isPlaywrightExtraAvailable() {
855
+ try {
856
+ const cwd = process.cwd();
857
+ return (existsSync(pathResolve(cwd, 'node_modules', 'playwright-extra')) ||
858
+ existsSync(pathResolve(cwd, '..', 'node_modules', 'playwright-extra')));
859
+ }
860
+ catch {
861
+ return false;
862
+ }
863
+ }
864
+ /**
865
+ * Get the best available search provider based on configured API keys and
866
+ * available runtime dependencies.
867
+ *
868
+ * Priority:
869
+ * 1. Google Custom Search JSON API (if GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX set)
870
+ * 2. Brave Search (if BRAVE_SEARCH_KEY is set)
871
+ * 3. Google stealth browser scraping (works from datacenter IPs; no API key needed)
872
+ * — only when playwright-extra is available in node_modules
873
+ * 4. DuckDuckGo with full fallback chain (DDG HTTP → DDG Lite → stealth multi-engine)
613
874
  */
614
875
  export function getBestSearchProvider() {
615
- // Check for Brave
876
+ // 1. Google Custom Search JSON API (BYOK) — works from any IP
877
+ const googleKey = process.env.GOOGLE_SEARCH_KEY || process.env.GOOGLE_API_KEY;
878
+ const googleCx = process.env.GOOGLE_SEARCH_CX;
879
+ if (googleKey && googleCx) {
880
+ return { provider: new GoogleSearchProvider(), apiKey: googleKey };
881
+ }
882
+ // 2. Brave Search (BYOK)
616
883
  const braveKey = process.env.BRAVE_SEARCH_KEY || process.env.BRAVE_API_KEY;
617
884
  if (braveKey) {
618
885
  return { provider: new BraveSearchProvider(), apiKey: braveKey };
619
886
  }
620
- // Always use DuckDuckGoProviderit has the full fallback chain:
621
- // DDG HTTP DDG Lite Brave (if key) stealth multi-engine.
622
- // This ensures search works even if Playwright isn't available (graceful degradation).
887
+ // 3. Google stealth browser works from datacenter IPs where DDG/Bing/Ecosia fail.
888
+ // GoogleSearchProvider.searchWeb() falls back to stealth scraping when no API key is set.
889
+ if (isPlaywrightExtraAvailable()) {
890
+ return { provider: new GoogleSearchProvider() };
891
+ }
892
+ // 4. DuckDuckGo with full internal fallback chain
893
+ // (DDG HTTP → DDG Lite → stealth multi-engine)
623
894
  return { provider: new DuckDuckGoProvider() };
624
895
  }
625
896
  //# sourceMappingURL=search-provider.js.map