webpeel 0.21.28 → 0.21.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,35 @@
4
4
  */
5
5
  export declare function closePool(): Promise<void>;
6
6
  export declare function createAbortError(): Error;
7
+ /**
8
+ * Domains known to aggressively block datacenter IPs.
9
+ * Requests to these domains automatically route through the Webshare residential
10
+ * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
11
+ */
12
+ export declare const PROXY_PREFERRED_DOMAINS: readonly string[];
13
+ /**
14
+ * Returns true if the URL's domain is on the proxy-preferred blocklist.
15
+ * Matches exact hostname (sans www.) and all subdomains.
16
+ *
17
+ * @example
18
+ * shouldUseProxy('https://www.reddit.com/r/news') // true
19
+ * shouldUseProxy('https://example.com') // false
20
+ */
21
+ export declare function shouldUseProxy(url: string): boolean;
22
+ /**
23
+ * Generate browser-like request headers tailored to the User-Agent type.
24
+ *
25
+ * - Chrome/Edge: full Sec-CH-UA + Sec-Fetch-* header set
26
+ * - Firefox: adjusted Accept, TE header, partial Sec-Fetch-* (no Sec-CH-UA)
27
+ * - Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
28
+ * - Other: basic headers only
29
+ *
30
+ * Automatically adds a Google referer for domains where it helps bypass blocks.
31
+ *
32
+ * @param url - Target URL (used for domain-specific header additions)
33
+ * @param userAgent - User-Agent string (determines which header set is applied)
34
+ */
35
+ export declare function getStealthHeaders(url: string, userAgent: string): Record<string, string>;
7
36
  /**
8
37
  * SECURITY: Validate URL to prevent SSRF attacks
9
38
  * Blocks localhost, private IPs, link-local, and various bypass techniques
@@ -8,7 +8,8 @@
8
8
  // Must run before any network library is used.
9
9
  import dns from 'dns';
10
10
  dns.setDefaultResultOrder('ipv4first');
11
- import { getRealisticUserAgent, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
11
+ import { getHttpUA, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
12
+ import { getWebshareProxyUrl } from './proxy-config.js';
12
13
  import { fetch as undiciFetch, Agent, ProxyAgent } from 'undici';
13
14
  import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
14
15
  import { getCached } from './cache.js';
@@ -145,6 +146,149 @@ export function createAbortError() {
145
146
  error.name = 'AbortError';
146
147
  return error;
147
148
  }
149
+ // ── Stealth headers & proxy routing ──────────────────────────────────────────
150
+ /**
151
+ * Domains known to aggressively block datacenter IPs.
152
+ * Requests to these domains automatically route through the Webshare residential
153
+ * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
154
+ */
155
+ export const PROXY_PREFERRED_DOMAINS = [
156
+ 'reddit.com',
157
+ 'old.reddit.com',
158
+ 'forbes.com',
159
+ 'fortune.com',
160
+ 'cargurus.com',
161
+ 'edmunds.com',
162
+ 'cars.com',
163
+ 'truecar.com',
164
+ 'autotrader.com',
165
+ 'carfax.com',
166
+ 'tesla.com',
167
+ 'nerdwallet.com',
168
+ 'bankrate.com',
169
+ 'homeadvisor.com',
170
+ 'angi.com',
171
+ 'insideevs.com',
172
+ 'electrek.co',
173
+ 'motortrend.com',
174
+ 'jdpower.com',
175
+ ];
176
+ /**
177
+ * Returns true if the URL's domain is on the proxy-preferred blocklist.
178
+ * Matches exact hostname (sans www.) and all subdomains.
179
+ *
180
+ * @example
181
+ * shouldUseProxy('https://www.reddit.com/r/news') // true
182
+ * shouldUseProxy('https://example.com') // false
183
+ */
184
+ export function shouldUseProxy(url) {
185
+ try {
186
+ const host = new URL(url).hostname.replace(/^www\./, '');
187
+ return PROXY_PREFERRED_DOMAINS.some(d => host === d || host.endsWith('.' + d));
188
+ }
189
+ catch {
190
+ return false;
191
+ }
192
+ }
193
+ /**
194
+ * Generate browser-like request headers tailored to the User-Agent type.
195
+ *
196
+ * - Chrome/Edge: full Sec-CH-UA + Sec-Fetch-* header set
197
+ * - Firefox: adjusted Accept, TE header, partial Sec-Fetch-* (no Sec-CH-UA)
198
+ * - Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
199
+ * - Other: basic headers only
200
+ *
201
+ * Automatically adds a Google referer for domains where it helps bypass blocks.
202
+ *
203
+ * @param url - Target URL (used for domain-specific header additions)
204
+ * @param userAgent - User-Agent string (determines which header set is applied)
205
+ */
206
+ export function getStealthHeaders(url, userAgent) {
207
+ const isFirefox = userAgent.includes('Firefox');
208
+ const isSafari = userAgent.includes('Safari') && !userAgent.includes('Chrome');
209
+ const isChrome = !isFirefox && !isSafari && (userAgent.includes('Chrome') || userAgent.includes('Chromium'));
210
+ const isMobile = userAgent.includes('Mobile') || userAgent.includes('Android');
211
+ // Base headers all browsers send
212
+ const headers = {
213
+ 'User-Agent': userAgent,
214
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
215
+ 'Accept-Language': 'en-US,en;q=0.9',
216
+ 'Accept-Encoding': 'gzip, deflate, br',
217
+ 'Cache-Control': 'max-age=0',
218
+ 'DNT': '1',
219
+ 'Upgrade-Insecure-Requests': '1',
220
+ };
221
+ if (isFirefox) {
222
+ // Firefox: different Accept, TE, and partial Sec-Fetch (no Sec-CH-UA)
223
+ headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8';
224
+ headers['Accept-Language'] = 'en-US,en;q=0.5';
225
+ headers['TE'] = 'trailers';
226
+ headers['Sec-Fetch-Dest'] = 'document';
227
+ headers['Sec-Fetch-Mode'] = 'navigate';
228
+ headers['Sec-Fetch-Site'] = 'none';
229
+ // Firefox omits Sec-Fetch-User in many navigations
230
+ }
231
+ else if (isSafari) {
232
+ // Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
233
+ headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
234
+ // Safari does not send Sec-Fetch headers at all
235
+ }
236
+ else if (isChrome) {
237
+ // Chrome/Edge: full set of Sec-Fetch-* and Sec-CH-UA headers
238
+ headers['Sec-Fetch-Dest'] = 'document';
239
+ headers['Sec-Fetch-Mode'] = 'navigate';
240
+ headers['Sec-Fetch-Site'] = 'none';
241
+ headers['Sec-Fetch-User'] = '?1';
242
+ headers['Sec-CH-UA'] = getSecCHUA(userAgent);
243
+ headers['Sec-CH-UA-Mobile'] = isMobile ? '?1' : '?0';
244
+ headers['Sec-CH-UA-Platform'] = getSecCHUAPlatform(userAgent);
245
+ headers['Connection'] = 'keep-alive';
246
+ headers['Priority'] = 'u=0, i';
247
+ }
248
+ // else: custom/API UAs (e.g. "WebPeel/1.0") — basic headers only, no browser fingerprints
249
+ // Add Google Referer for domains where it's known to help bypass blocks
250
+ try {
251
+ const domain = new URL(url).hostname;
252
+ const referrerDomains = [
253
+ 'reddit.com', 'forbes.com', 'cargurus.com', 'edmunds.com',
254
+ 'cars.com', 'truecar.com', 'nerdwallet.com', 'homeadvisor.com',
255
+ 'angi.com', 'motortrend.com', 'jdpower.com', 'electrek.co', 'insideevs.com',
256
+ ];
257
+ if (referrerDomains.some(d => domain.includes(d))) {
258
+ headers['Referer'] = 'https://www.google.com/';
259
+ }
260
+ }
261
+ catch {
262
+ // Non-fatal: URL parsing failed, skip Referer
263
+ }
264
+ return headers;
265
+ }
266
+ /** Pick a different UA than the one currently in use (for 403/503 retries). */
267
+ function getDifferentUA(current) {
268
+ for (let i = 0; i < 10; i++) {
269
+ const ua = getHttpUA();
270
+ if (ua !== current)
271
+ return ua;
272
+ }
273
+ return getHttpUA();
274
+ }
275
+ /**
276
+ * Build the merged request headers: stealth defaults + caller custom headers.
277
+ * Throws WebPeelError if customHeaders attempts to override the Host header.
278
+ */
279
+ function buildMergedHeaders(url, userAgent, customHeaders) {
280
+ const merged = { ...getStealthHeaders(url, userAgent) };
281
+ if (customHeaders) {
282
+ for (const [key, value] of Object.entries(customHeaders)) {
283
+ // SECURITY: Block Host header override
284
+ if (key.toLowerCase() === 'host') {
285
+ throw new WebPeelError('Custom Host header is not allowed');
286
+ }
287
+ merged[key] = value;
288
+ }
289
+ }
290
+ return merged;
291
+ }
148
292
  // ── SSRF / URL validation ─────────────────────────────────────────────────────
149
293
  /**
150
294
  * SECURITY: Validate URL to prevent SSRF attacks
@@ -368,42 +512,19 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
368
512
  // SEC.gov requires a User-Agent with contact info (their documented automated access policy)
369
513
  const hostname = new URL(url).hostname.toLowerCase();
370
514
  const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
371
- const validatedUserAgent = isSecGov
515
+ let activeUserAgent = isSecGov
372
516
  ? 'WebPeel/1.0 (support@webpeel.dev)'
373
- : (userAgent ? validateUserAgent(userAgent) : getRealisticUserAgent());
374
- // SECURITY: Merge custom headers with defaults, block Host header override
375
- const defaultHeaders = {
376
- 'User-Agent': validatedUserAgent,
377
- 'Accept': 'text/markdown, text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
378
- 'Accept-Language': 'en-US,en;q=0.9',
379
- 'Accept-Encoding': 'br, gzip, deflate',
380
- 'DNT': '1',
381
- 'Connection': 'keep-alive',
382
- 'Upgrade-Insecure-Requests': '1',
383
- 'Sec-CH-UA': getSecCHUA(validatedUserAgent),
384
- 'Sec-CH-UA-Mobile': '?0',
385
- 'Sec-CH-UA-Platform': getSecCHUAPlatform(validatedUserAgent),
386
- 'Sec-Fetch-Dest': 'document',
387
- 'Sec-Fetch-Mode': 'navigate',
388
- 'Sec-Fetch-Site': 'none',
389
- 'Sec-Fetch-User': '?1',
390
- 'Cache-Control': 'max-age=0',
391
- 'Priority': 'u=0, i',
392
- };
393
- const mergedHeaders = { ...defaultHeaders };
394
- if (customHeaders) {
395
- for (const [key, value] of Object.entries(customHeaders)) {
396
- // SECURITY: Block Host header override
397
- if (key.toLowerCase() === 'host') {
398
- throw new WebPeelError('Custom Host header is not allowed');
399
- }
400
- mergedHeaders[key] = value;
401
- }
402
- }
517
+ : (userAgent ? validateUserAgent(userAgent) : getHttpUA());
518
+ // Build stealth headers merged with any caller-supplied custom headers
519
+ let mergedHeaders = buildMergedHeaders(url, activeUserAgent, customHeaders);
520
+ // Auto-route through residential proxy for sites known to block datacenter IPs.
521
+ // The explicit `proxy` param always wins; auto-proxy only kicks in when unset.
522
+ const effectiveProxy = proxy ?? (shouldUseProxy(url) ? (getWebshareProxyUrl() ?? undefined) : undefined);
403
523
  const MAX_REDIRECTS = 10;
404
524
  let redirectCount = 0;
405
525
  let currentUrl = url;
406
526
  const seenUrls = new Set();
527
+ let retried = false; // track whether we've already retried with a different UA
407
528
  try {
408
529
  const hostname = new URL(url).hostname;
409
530
  void resolveAndCache(hostname).catch(() => {
@@ -436,8 +557,8 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
436
557
  if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
437
558
  requestHeaders['If-Modified-Since'] = validators.lastModified;
438
559
  }
439
- // Use proxy if provided, otherwise use shared connection pool
440
- const dispatcher = proxy ? new ProxyAgent(proxy) : httpPool;
560
+ // Use proxy if provided or auto-selected, otherwise use shared connection pool
561
+ const dispatcher = effectiveProxy ? new ProxyAgent(effectiveProxy) : httpPool;
441
562
  const response = await undiciFetch(currentUrl, {
442
563
  headers: requestHeaders,
443
564
  signal,
@@ -475,6 +596,16 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
475
596
  }
476
597
  if (!response.ok) {
477
598
  if (response.status === 403 || response.status === 503) {
599
+ // Retry once with a different UA — cheap and catches UA-based blocks
600
+ if (!retried && !userAgent) {
601
+ retried = true;
602
+ activeUserAgent = getDifferentUA(activeUserAgent);
603
+ mergedHeaders = buildMergedHeaders(currentUrl, activeUserAgent, customHeaders);
604
+ // Allow the retry to re-visit the same URL (not a redirect loop)
605
+ seenUrls.delete(currentUrl);
606
+ log.debug(`HTTP ${response.status} on first attempt; retrying with different UA`);
607
+ continue;
608
+ }
478
609
  throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
479
610
  }
480
611
  const statusText = response.statusText || HTTP_STATUS_TEXT[response.status] || 'Unknown Error';
@@ -44,18 +44,21 @@ declare class ProviderStatsTracker {
44
44
  private readonly windowSize;
45
45
  private readonly failThreshold;
46
46
  private readonly minSamples;
47
- constructor(windowSize?: number, failThreshold?: number, minSamples?: number);
47
+ private readonly decayMs;
48
+ constructor(windowSize?: number, failThreshold?: number, minSamples?: number, decayMs?: number);
48
49
  /** Record the outcome of a single attempt for the given source. */
49
50
  record(sourceId: string, success: boolean): void;
50
51
  /**
51
52
  * Returns the failure rate (0–1) for the given source based on
52
53
  * the sliding window of recorded attempts. Returns 0 if fewer
53
- * than minSamples have been recorded.
54
+ * than minSamples have been recorded, or if all samples are older
55
+ * than decayMs (failures expire so cold-start blips don't permanently
56
+ * lock out a provider).
54
57
  */
55
58
  getFailureRate(sourceId: string): number;
56
59
  /**
57
60
  * Returns true when the source should be skipped (failure rate >=
58
- * failThreshold with at least minSamples recorded).
61
+ * failThreshold with at least minSamples recent recorded).
59
62
  */
60
63
  shouldSkip(sourceId: string): boolean;
61
64
  /** Debug snapshot for a source. */
@@ -73,6 +76,11 @@ declare class ProviderStatsTracker {
73
76
  * (e.g. in tests) and to log diagnostics.
74
77
  */
75
78
  export declare const providerStats: ProviderStatsTracker;
79
+ /**
80
+ * Merge results from multiple sources, deduplicating by normalized URL.
81
+ * Preserves original order (first occurrence wins) and limits to maxCount.
82
+ */
83
+ export declare function mergeSearchResults(results: WebSearchResult[], maxCount: number): WebSearchResult[];
76
84
  /**
77
85
  * Filter and rank results by relevance to the original query.
78
86
  *
@@ -128,6 +136,19 @@ export declare class DuckDuckGoProvider implements SearchProvider {
128
136
  * works when the main HTML endpoint is temporarily blocked on datacenter IPs.
129
137
  */
130
138
  private searchLite;
139
+ /**
140
+ * HTTP-only Bing scraping via undici + cheerio. No browser required.
141
+ * Routes through Webshare proxy (proxy first, direct fallback).
142
+ * Tracks stats via providerStats('bing-http').
143
+ */
144
+ private _searchBingHttp;
145
+ /**
146
+ * HTTP-only Google scraping via undici + cheerio. No browser required.
147
+ * Routes through Webshare proxy (proxy first, direct fallback).
148
+ * Sends CONSENT cookie to bypass Google consent page.
149
+ * Tracks stats via providerStats('google-http').
150
+ */
151
+ private _searchGoogleHttp;
131
152
  searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
132
153
  /**
133
154
  * Exposed for testing: score and filter a pre-fetched result list against a query.
@@ -114,15 +114,17 @@ class ProviderStatsTracker {
114
114
  windowSize;
115
115
  failThreshold;
116
116
  minSamples;
117
- constructor(windowSize = 10, failThreshold = 0.8, minSamples = 3) {
117
+ decayMs; // failures older than this are ignored
118
+ constructor(windowSize = 10, failThreshold = 0.8, minSamples = 5, decayMs = 5 * 60 * 1000) {
118
119
  this.windowSize = windowSize;
119
120
  this.failThreshold = failThreshold;
120
121
  this.minSamples = minSamples;
122
+ this.decayMs = decayMs; // default 5 minutes: old failures don't permanently lock a provider
121
123
  }
122
124
  /** Record the outcome of a single attempt for the given source. */
123
125
  record(sourceId, success) {
124
126
  const arr = this.history.get(sourceId) ?? [];
125
- arr.push({ success });
127
+ arr.push({ success, ts: Date.now() });
126
128
  if (arr.length > this.windowSize)
127
129
  arr.splice(0, arr.length - this.windowSize);
128
130
  this.history.set(sourceId, arr);
@@ -130,18 +132,24 @@ class ProviderStatsTracker {
130
132
  /**
131
133
  * Returns the failure rate (0–1) for the given source based on
132
134
  * the sliding window of recorded attempts. Returns 0 if fewer
133
- * than minSamples have been recorded.
135
+ * than minSamples have been recorded, or if all samples are older
136
+ * than decayMs (failures expire so cold-start blips don't permanently
137
+ * lock out a provider).
134
138
  */
135
139
  getFailureRate(sourceId) {
136
140
  const arr = this.history.get(sourceId);
137
141
  if (!arr || arr.length < this.minSamples)
138
142
  return 0;
139
- const failures = arr.filter(a => !a.success).length;
140
- return failures / arr.length;
143
+ const cutoff = Date.now() - this.decayMs;
144
+ const recent = arr.filter(a => a.ts >= cutoff);
145
+ if (recent.length < this.minSamples)
146
+ return 0; // not enough recent samples
147
+ const failures = recent.filter(a => !a.success).length;
148
+ return failures / recent.length;
141
149
  }
142
150
  /**
143
151
  * Returns true when the source should be skipped (failure rate >=
144
- * failThreshold with at least minSamples recorded).
152
+ * failThreshold with at least minSamples recent recorded).
145
153
  */
146
154
  shouldSkip(sourceId) {
147
155
  return this.getFailureRate(sourceId) >= this.failThreshold;
@@ -195,6 +203,24 @@ function normalizeUrlForDedupe(rawUrl) {
195
203
  .replace(/\/+$/g, '');
196
204
  }
197
205
  }
206
+ /**
207
+ * Merge results from multiple sources, deduplicating by normalized URL.
208
+ * Preserves original order (first occurrence wins) and limits to maxCount.
209
+ */
210
+ export function mergeSearchResults(results, maxCount) {
211
+ const seen = new Set();
212
+ const merged = [];
213
+ for (const r of results) {
214
+ if (merged.length >= maxCount)
215
+ break;
216
+ const key = normalizeUrlForDedupe(r.url);
217
+ if (seen.has(key))
218
+ continue;
219
+ seen.add(key);
220
+ merged.push(r);
221
+ }
222
+ return merged;
223
+ }
198
224
  // ============================================================
199
225
  // Result Relevance Filtering
200
226
  // Lightweight keyword-overlap scoring — no external deps.
@@ -206,6 +232,9 @@ const STOP_WORDS = new Set([
206
232
  'of', 'with', 'how', 'what', 'where', 'when', 'why', 'best', 'top', 'most',
207
233
  'and', 'or', 'but', 'not', 'do', 'does', 'did', 'be', 'been', 'have', 'has',
208
234
  'buy', 'get', 'find', 'about', 'from', 'by', 'its', 'it', 'this', 'that',
235
+ 'much', 'very', 'can', 'will', 'would', 'could', 'should', 'per', 'than',
236
+ 'some', 'just', 'also', 'more', 'like', 'make', 'any', 'each', 'all', 'my',
237
+ 'your', 'our', 'their', 'me', 'us', 'them', 'so', 'if', 'then', 'here',
209
238
  ]);
210
239
  /**
211
240
  * Extract meaningful keywords from a search query by stripping stop words and
@@ -271,8 +300,10 @@ export function filterRelevantResults(results, query) {
271
300
  score: scoreResult(r, keywords),
272
301
  idx,
273
302
  }));
274
- // Drop results with zero overlap
275
- const relevant = scored.filter(s => s.score > 0);
303
+ // Drop results with insufficient overlap — require ≥15% keyword match
304
+ // to filter out dictionary/definition pages that match on a single common word
305
+ const minScore = keywords.length >= 3 ? 0.15 : 0.01;
306
+ const relevant = scored.filter(s => s.score >= minScore);
276
307
  // Sort by score descending, original order as tiebreaker
277
308
  relevant.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.idx - b.idx));
278
309
  return relevant.map(s => ({
@@ -571,9 +602,21 @@ export class DuckDuckGoProvider {
571
602
  const attempts = [];
572
603
  // Required retry strategy order:
573
604
  // 1) original query
574
- // 2) quoted query
575
- // 3) query site:*
605
+ // 2) keywords-only (strip question words, articles, prepositions)
606
+ // 3) quoted query
607
+ // 4) query site:*
576
608
  attempts.push(q);
609
+ // For long queries (>5 words), extract just the meaningful keywords
610
+ // "how much does a used 2023 Tesla Model 3 cost per month" → "2023 Tesla Model 3 cost month"
611
+ const words = q.split(/\s+/);
612
+ if (words.length > 5) {
613
+ const keywordsOnly = words
614
+ .filter(w => !STOP_WORDS.has(w.toLowerCase()) && w.length >= 2)
615
+ .join(' ');
616
+ if (keywordsOnly && keywordsOnly !== q) {
617
+ attempts.push(keywordsOnly);
618
+ }
619
+ }
577
620
  if (!/^".*"$/.test(q))
578
621
  attempts.push(`"${q}"`);
579
622
  attempts.push(`${q} site:*`);
@@ -776,6 +819,219 @@ export class DuckDuckGoProvider {
776
819
  });
777
820
  return results;
778
821
  }
822
+ /**
823
+ * HTTP-only Bing scraping via undici + cheerio. No browser required.
824
+ * Routes through Webshare proxy (proxy first, direct fallback).
825
+ * Tracks stats via providerStats('bing-http').
826
+ */
827
+ // @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
828
+ async _searchBingHttp(query, options) {
829
+ const { count, signal } = options;
830
+ const bingRate = providerStats.getFailureRate('bing-http');
831
+ const timeoutMs = bingRate > 0.5 ? 3_000 : 8_000;
832
+ const bingSignal = createTimeoutSignal(timeoutMs, signal);
833
+ const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=10`;
834
+ const headers = {
835
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
836
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
837
+ 'Accept-Language': 'en-US,en;q=0.9',
838
+ 'Sec-Fetch-Dest': 'document',
839
+ 'Sec-Fetch-Mode': 'navigate',
840
+ 'Sec-Fetch-Site': 'none',
841
+ 'Sec-Fetch-User': '?1',
842
+ 'Upgrade-Insecure-Requests': '1',
843
+ };
844
+ const proxyUrl = getWebshareProxyUrl();
845
+ let response;
846
+ try {
847
+ if (proxyUrl) {
848
+ try {
849
+ const dispatcher = new ProxyAgent(proxyUrl);
850
+ response = await undiciFetch(url, { headers, signal: bingSignal, dispatcher });
851
+ }
852
+ catch (proxyErr) {
853
+ log.debug('Bing HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
854
+ response = await undiciFetch(url, { headers, signal: bingSignal });
855
+ }
856
+ }
857
+ else {
858
+ response = await undiciFetch(url, { headers, signal: bingSignal });
859
+ }
860
+ if (!response.ok) {
861
+ providerStats.record('bing-http', false);
862
+ return [];
863
+ }
864
+ const html = await response.text();
865
+ const $ = load(html);
866
+ const results = [];
867
+ const seen = new Set();
868
+ // Parse Bing organic results; skip ad containers
869
+ $('li.b_algo').each((_i, elem) => {
870
+ if (results.length >= count)
871
+ return;
872
+ const $r = $(elem);
873
+ // Skip if inside a .b_ad block or is itself an ad container
874
+ if ($r.hasClass('b_ad') || $r.closest('.b_ad').length > 0)
875
+ return;
876
+ const $a = $r.find('h2 > a').first();
877
+ const title = cleanText($a.text(), { maxLen: 200 });
878
+ const rawUrl = $a.attr('href') || '';
879
+ if (!title || !rawUrl)
880
+ return;
881
+ // Decode Bing redirect URLs:
882
+ // Relative: /ck/a?!&&p=...&u=a1<base64url>&ntb=1
883
+ // Absolute: https://www.bing.com/ck/a?...&u=a1<base64url>&ntb=1
884
+ let finalUrl = rawUrl;
885
+ try {
886
+ const base = rawUrl.startsWith('/') ? `https://www.bing.com${rawUrl}` : rawUrl;
887
+ const ckUrl = new URL(base);
888
+ if (ckUrl.hostname.endsWith('bing.com') && ckUrl.pathname.startsWith('/ck/')) {
889
+ const u = ckUrl.searchParams.get('u');
890
+ if (u && u.startsWith('a1')) {
891
+ const decoded = Buffer.from(u.slice(2), 'base64url').toString('utf-8');
892
+ if (decoded.startsWith('http'))
893
+ finalUrl = decoded;
894
+ }
895
+ }
896
+ }
897
+ catch { /* use rawUrl as-is */ }
898
+ // Validate: HTTP/HTTPS only
899
+ try {
900
+ const parsed = new URL(finalUrl);
901
+ if (!['http:', 'https:'].includes(parsed.protocol))
902
+ return;
903
+ finalUrl = parsed.href;
904
+ }
905
+ catch {
906
+ return;
907
+ }
908
+ const key = normalizeUrlForDedupe(finalUrl);
909
+ if (seen.has(key))
910
+ return;
911
+ seen.add(key);
912
+ const snippetRaw = $r.find('.b_caption p').first().text() ||
913
+ $r.find('.b_caption').first().text();
914
+ const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
915
+ results.push({ title, url: finalUrl, snippet });
916
+ });
917
+ providerStats.record('bing-http', results.length > 0);
918
+ return results;
919
+ }
920
+ catch (e) {
921
+ log.debug('Bing HTTP search failed:', e instanceof Error ? e.message : e);
922
+ providerStats.record('bing-http', false);
923
+ return [];
924
+ }
925
+ }
926
+ /**
927
+ * HTTP-only Google scraping via undici + cheerio. No browser required.
928
+ * Routes through Webshare proxy (proxy first, direct fallback).
929
+ * Sends CONSENT cookie to bypass Google consent page.
930
+ * Tracks stats via providerStats('google-http').
931
+ */
932
+ // @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
933
+ async _searchGoogleHttp(query, options) {
934
+ const { count, signal } = options;
935
+ const googleRate = providerStats.getFailureRate('google-http');
936
+ const timeoutMs = googleRate > 0.5 ? 3_000 : 8_000;
937
+ const googleSignal = createTimeoutSignal(timeoutMs, signal);
938
+ const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=10&hl=en`;
939
+ const headers = {
940
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
941
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
942
+ 'Accept-Language': 'en-US,en;q=0.9',
943
+ // Skip Google consent/cookie wall
944
+ 'Cookie': 'CONSENT=YES+; SOCS=CAESEwgDEgk0OTg3ODQ2NzMaAmVuIAEaBgiA0LqmBg',
945
+ 'Sec-Fetch-Dest': 'document',
946
+ 'Sec-Fetch-Mode': 'navigate',
947
+ 'Sec-Fetch-Site': 'none',
948
+ 'Sec-Fetch-User': '?1',
949
+ 'Upgrade-Insecure-Requests': '1',
950
+ };
951
+ const proxyUrl = getWebshareProxyUrl();
952
+ let response;
953
+ try {
954
+ if (proxyUrl) {
955
+ try {
956
+ const dispatcher = new ProxyAgent(proxyUrl);
957
+ response = await undiciFetch(url, { headers, signal: googleSignal, dispatcher });
958
+ }
959
+ catch (proxyErr) {
960
+ log.debug('Google HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
961
+ response = await undiciFetch(url, { headers, signal: googleSignal });
962
+ }
963
+ }
964
+ else {
965
+ response = await undiciFetch(url, { headers, signal: googleSignal });
966
+ }
967
+ if (!response.ok) {
968
+ providerStats.record('google-http', false);
969
+ return [];
970
+ }
971
+ const html = await response.text();
972
+ const $ = load(html);
973
+ const results = [];
974
+ const seen = new Set();
975
+ // Google organic results live in div.g blocks.
976
+ // Skip ad blocks (data-text-ad attr), People Also Ask, and related searches.
977
+ $('div.g').each((_i, elem) => {
978
+ if (results.length >= count)
979
+ return;
980
+ const $r = $(elem);
981
+ // Skip ad containers (data-text-ad may be on div.g itself or on a descendant)
982
+ if ($r.attr('data-text-ad') !== undefined || $r.find('[data-text-ad]').length > 0)
983
+ return;
984
+ if ($r.closest('.commercial-unit-desktop-top, .ads-ad').length > 0)
985
+ return;
986
+ const $h3 = $r.find('h3').first();
987
+ if (!$h3.length)
988
+ return;
989
+ // Find a valid external link (starts with http, not a Google domain)
990
+ const $a = $r.find('a[href]').filter((_j, el) => {
991
+ const href = $(el).attr('href') || '';
992
+ return href.startsWith('http') && !href.includes('google.com/');
993
+ }).first();
994
+ if (!$a.length)
995
+ return;
996
+ const href = $a.attr('href') || '';
997
+ // Validate URL
998
+ let finalUrl;
999
+ try {
1000
+ const parsed = new URL(href);
1001
+ if (!['http:', 'https:'].includes(parsed.protocol))
1002
+ return;
1003
+ if (parsed.hostname.includes('google.com'))
1004
+ return;
1005
+ finalUrl = parsed.href;
1006
+ }
1007
+ catch {
1008
+ return;
1009
+ }
1010
+ const key = normalizeUrlForDedupe(finalUrl);
1011
+ if (seen.has(key))
1012
+ return;
1013
+ seen.add(key);
1014
+ const title = cleanText($h3.text(), { maxLen: 200 });
1015
+ if (!title)
1016
+ return;
1017
+ // Snippet: try multiple known Google snippet CSS classes/attrs
1018
+ const snippetRaw = $r.find('.VwiC3b').first().text() ||
1019
+ $r.find('[data-sncf]').first().text() ||
1020
+ $r.find('[style*="-webkit-line-clamp"]').first().text() ||
1021
+ $r.find('.st').first().text() ||
1022
+ '';
1023
+ const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
1024
+ results.push({ title, url: finalUrl, snippet });
1025
+ });
1026
+ providerStats.record('google-http', results.length > 0);
1027
+ return results;
1028
+ }
1029
+ catch (e) {
1030
+ log.debug('Google HTTP search failed:', e instanceof Error ? e.message : e);
1031
+ providerStats.record('google-http', false);
1032
+ return [];
1033
+ }
1034
+ }
779
1035
  async searchWeb(query, options) {
780
1036
  const attempts = this.buildQueryAttempts(query);
781
1037
  // -----------------------------------------------------------
@@ -867,6 +1123,17 @@ export class DuckDuckGoProvider {
867
1123
  }
868
1124
  }
869
1125
  // -----------------------------------------------------------
1126
+ // Stage 3.5: HTTP-based Bing + Google (no browser, no API key)
1127
+ // DISABLED: Both Bing and Google detect non-browser HTTP clients and
1128
+ // serve different/irrelevant content (dictionary pages, random sites).
1129
+ // The scrapers are built (searchBingHttp, searchGoogleHttp) but need
1130
+ // further work on request fingerprinting to get real results.
1131
+ // TODO: Re-enable when fingerprinting is improved.
1132
+ // -----------------------------------------------------------
1133
+ // const skipBingHttp = providerStats.shouldSkip('bing-http');
1134
+ // const skipGoogleHttp = providerStats.shouldSkip('google-http');
1135
+ // if (!skipBingHttp || !skipGoogleHttp) { ... }
1136
+ // -----------------------------------------------------------
870
1137
  // Stage 4: Stealth multi-engine (DDG + Bing + Ecosia in parallel)
871
1138
  // Bypasses bot-detection on datacenter IPs. This is the reliable
872
1139
  // last resort — but it spins up a browser so it takes a few seconds.
@@ -9,6 +9,11 @@
9
9
  * Also provides `getSecCHUA()` for generating correct Sec-CH-UA header values
10
10
  * that match the selected user agent (version-accurate brand hints).
11
11
  */
12
+ /**
13
+ * Full UA pool for HTTP-only requests (Chrome + Firefox + Safari + Edge + Mobile).
14
+ * NOT for browser contexts — use getRealisticUserAgent() there (Chrome-only).
15
+ */
16
+ export declare const HTTP_UAS: readonly string[];
12
17
  /**
13
18
  * Returns a realistic, recent Chrome user agent string.
14
19
  * Randomly picks from a curated list of real-world UAs (Chrome 132-136 range).
@@ -32,7 +37,27 @@ export declare function getRealisticUserAgent(platform?: 'windows' | 'mac' | 'li
32
37
  */
33
38
  export declare function getRandomUA(): string;
34
39
  /**
35
- * The full curated list of realistic user agents.
40
+ * Returns a realistic user agent for HTTP-only (non-browser) requests.
41
+ * Unlike `getRealisticUserAgent()` which is Chrome-only for browser contexts,
42
+ * this function returns from a wider pool: Chrome, Firefox, Safari, Edge, and Mobile.
43
+ *
44
+ * Weight distribution (approximate):
45
+ * - Chrome Windows: ~30%
46
+ * - Chrome macOS: ~25%
47
+ * - Chrome Linux: ~10%
48
+ * - Firefox: ~15%
49
+ * - Safari: ~10%
50
+ * - Edge: ~5%
51
+ * - Mobile Chrome: ~5%
52
+ *
53
+ * @example
54
+ * ```ts
55
+ * const ua = getHttpUA(); // e.g. "Mozilla/5.0 ... Firefox/133.0"
56
+ * ```
57
+ */
58
+ export declare function getHttpUA(): string;
59
+ /**
60
+ * The full curated list of realistic user agents (Chrome-only, all platforms).
36
61
  * Exported for inspection / testing.
37
62
  */
38
63
  export declare const REALISTIC_USER_AGENTS: readonly string[];
@@ -42,8 +42,44 @@ const LINUX_UAS = [
42
42
  // Chrome 136 Linux
43
43
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
44
44
  ];
45
- /** All UAs combined (fallback when no platform is specified) */
45
+ /** All Chrome UAs combined (fallback when no platform is specified) */
46
46
  const ALL_UAS = [...WINDOWS_UAS, ...MAC_UAS, ...LINUX_UAS];
47
// ── Extended pools for non-Chrome browsers (HTTP-only use) ───────────────────
// NOTE: these pools are deliberately kept out of ALL_UAS / REALISTIC_USER_AGENTS
// so that browser contexts (which must match a real Chrome fingerprint) never
// receive a Firefox/Safari/Edge/Mobile UA. They are only reachable via
// HTTP_UAS and getHttpUA().
/** Firefox UAs — Windows, Mac, Linux (Firefox 133/134 stable range) */
const FIREFOX_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0',
];
/** Safari UAs — macOS only (Safari 17.5–18.2) */
const SAFARI_UAS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 15_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
];
/** Microsoft Edge UAs — Windows (Chromium-based, Edg/ token) */
const EDGE_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0',
];
/** Mobile Chrome UAs — Android (Samsung, Pixel) and iOS (CriOS) */
const MOBILE_CHROME_UAS = [
    'Mozilla/5.0 (Linux; Android 14; SM-S928B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.6778.103 Mobile/15E148 Safari/604.1',
];
/**
 * Full UA pool for HTTP-only requests (Chrome + Firefox + Safari + Edge + Mobile).
 * NOT for browser contexts — use getRealisticUserAgent() there (Chrome-only).
 */
export const HTTP_UAS = [
    ...ALL_UAS,
    ...FIREFOX_UAS,
    ...SAFARI_UAS,
    ...EDGE_UAS,
    ...MOBILE_CHROME_UAS,
];
47
83
  // ── Public API ────────────────────────────────────────────────────────────────
48
84
  /**
49
85
  * Returns a realistic, recent Chrome user agent string.
@@ -97,7 +133,52 @@ export function getRandomUA() {
97
133
  return ALL_UAS[idx];
98
134
  }
99
135
/**
 * Returns a realistic user agent for HTTP-only (non-browser) requests.
 * Unlike `getRealisticUserAgent()` which is Chrome-only for browser contexts,
 * this function draws from the wider pool: Chrome, Firefox, Safari, Edge, Mobile.
 *
 * Weight distribution (approximate):
 * - Chrome Windows: ~30%
 * - Chrome macOS: ~25%
 * - Chrome Linux: ~10%
 * - Firefox: ~15%
 * - Safari: ~10%
 * - Edge: ~5%
 * - Mobile Chrome: ~5%
 *
 * @example
 * ```ts
 * const ua = getHttpUA(); // e.g. "Mozilla/5.0 ... Firefox/133.0"
 * ```
 */
export function getHttpUA() {
    // Cumulative-probability table: the first entry whose upper bound exceeds
    // the roll wins. Bounds encode the weights documented above.
    const weightedPools = [
        [0.30, WINDOWS_UAS],
        [0.55, MAC_UAS],
        [0.65, LINUX_UAS],
        [0.80, FIREFOX_UAS],
        [0.90, SAFARI_UAS],
        [0.95, EDGE_UAS],
        [Infinity, MOBILE_CHROME_UAS], // catch-all for the remaining ~5%
    ];
    const roll = Math.random();
    const [, chosenPool] = weightedPools.find(([upperBound]) => roll < upperBound);
    // Second roll picks uniformly within the chosen pool.
    return chosenPool[Math.floor(Math.random() * chosenPool.length)];
}
180
+ /**
181
+ * The full curated list of realistic user agents (Chrome-only, all platforms).
101
182
  * Exported for inspection / testing.
102
183
  */
103
184
  export const REALISTIC_USER_AGENTS = ALL_UAS;
@@ -28,6 +28,7 @@ import { createJobsRouter } from './routes/jobs.js';
28
28
  import { createBatchRouter } from './routes/batch.js';
29
29
  import { createAnswerRouter } from './routes/answer.js';
30
30
  import { createDeepResearchRouter } from './routes/deep-research.js';
31
+ import { createResearchRouter } from './routes/research.js';
31
32
  import { createAskRouter } from './routes/ask.js';
32
33
  import { createMcpRouter } from './routes/mcp.js';
33
34
  import { createDoRouter } from './routes/do.js';
@@ -291,6 +292,9 @@ export function createApp(config = {}) {
291
292
  app.use('/v1/screenshot', requireScope('full', 'read'));
292
293
  app.use(createScreenshotRouter(authStore));
293
294
  app.use(createSearchRouter(authStore));
295
+ // /v1/research — lightweight research (search → fetch → compile), BYOK LLM optional
296
+ app.use('/v1/research', requireScope('full', 'read'));
297
+ app.use(createResearchRouter());
294
298
  app.use(createBillingPortalRouter(pool));
295
299
  app.use(createUserRouter());
296
300
  app.use(createOAuthRouter());
@@ -0,0 +1,13 @@
1
/**
 * POST /v1/research
 *
 * Lightweight research endpoint that chains search → fetch → compile.
 * No LLM required for baseline results; optional BYOK LLM synthesis.
 *
 * Auth: API key required (full or read scope)
 * Body: ResearchRequest
 */
import { Router } from 'express';
/**
 * Expand a raw query into up to 3 search variations using simple heuristics
 * (append current year to time-sensitive queries; rephrase "how much" /
 * "how to" / "what is" prefixes). The original query is always first.
 */
export declare function expandQuery(query: string): string[];
/**
 * Extract up to `maxFacts` sentences from `content` most relevant to `query`,
 * scored by keyword overlap with a boost for sentences containing numeric data.
 * Returns [] when content/query is empty or there is no keyword overlap.
 */
export declare function extractKeyFacts(content: string, query: string, maxFacts?: number): string[];
/** Build the Express router that serves POST /v1/research. */
export declare function createResearchRouter(): Router;
@@ -0,0 +1,401 @@
1
+ /**
2
+ * POST /v1/research
3
+ *
4
+ * Lightweight research endpoint that chains search → fetch → compile.
5
+ * No LLM required for baseline results; optional BYOK LLM synthesis.
6
+ *
7
+ * Auth: API key required (full or read scope)
8
+ * Body: ResearchRequest
9
+ */
10
+ import { Router } from 'express';
11
+ import { peel } from '../../index.js';
12
+ import { getSearchProvider } from '../../core/search-provider.js';
13
+ import { callLLM, } from '../../core/llm-provider.js';
14
// ---------------------------------------------------------------------------
// Query expansion — simple heuristics, no LLM needed
// ---------------------------------------------------------------------------
const CURRENT_YEAR = new Date().getFullYear();
// Keywords that suggest the query is time-sensitive
const TIME_SENSITIVE_PATTERNS = /\b(price|cost|best|top|latest|current|now|today|new|salary|rate|speed|version|release|stock|review)\b/i;
// Prefixes that can be rephrased
const HOW_MUCH_RE = /^how much (?:does|do|is|are) (.+?)(?:\s+cost|\s+price|\s+charge)?[\s?]*$/i;
const HOW_TO_RE = /^how (?:to|do(?:es)?) (.+?)[\s?]*$/i;
const WHAT_IS_RE = /^(?:what (?:is|are)) (.+?)[\s?]*$/i;
/**
 * Expand a query into up to three search variations.
 *
 * The original (trimmed) query always comes first. A "<query> <year>" variant
 * is appended for time-sensitive queries lacking a year, then question-style
 * prefixes are rephrased into keyword form ("how much does X" → "X cost price",
 * "how to X" → "X guide", "what is X" → "X overview"). Duplicates are skipped.
 *
 * @param query - Raw user query.
 * @returns 1–3 query strings, original first.
 */
export function expandQuery(query) {
    const trimmed = query.trim();
    const variations = [trimmed];
    // Year variant only when the query is time-sensitive and not already dated.
    const alreadyDated = /\b(20\d{2}|19\d{2})\b/.test(trimmed);
    if (!alreadyDated && TIME_SENSITIVE_PATTERNS.test(trimmed)) {
        variations.push(`${trimmed} ${CURRENT_YEAR}`);
    }
    // Rephrase rules, applied in order: prefix pattern → suffix appended to the
    // captured subject.
    const rephraseRules = [
        [HOW_MUCH_RE, 'cost price'],
        [HOW_TO_RE, 'guide'],
        [WHAT_IS_RE, 'overview'],
    ];
    for (const [pattern, suffix] of rephraseRules) {
        const match = pattern.exec(trimmed);
        if (!match) {
            continue;
        }
        const rephrased = `${match[1].trim()} ${suffix}`;
        if (!variations.includes(rephrased)) {
            variations.push(rephrased);
        }
    }
    // Cap at 3 variations
    return variations.slice(0, 3);
}
62
// ---------------------------------------------------------------------------
// Key-fact extraction — score sentences by keyword overlap
// ---------------------------------------------------------------------------
/**
 * Lower-cases `text` and splits on non-word runs, keeping only tokens longer
 * than two characters (1–2 letter words carry almost no relevance signal).
 */
function tokenize(text) {
    return text
        .toLowerCase()
        .split(/\W+/)
        .filter(w => w.length > 2);
}
// Common English stop-words to skip when scoring
const STOP_WORDS = new Set([
    'the', 'and', 'for', 'are', 'was', 'were', 'but', 'not', 'you', 'all',
    'can', 'her', 'his', 'its', 'our', 'out', 'one', 'had', 'has', 'have',
    'this', 'that', 'with', 'they', 'from', 'your', 'what', 'when', 'how',
    'will', 'been', 'than', 'more', 'also', 'into', 'which', 'about',
]);
// Sentences containing prices/rates/percentages/durations likely carry real
// data. Hoisted to module scope so the regex object is not re-created for
// every scored sentence.
const DATA_VALUE_RE = /\$[\d,]+|[\d,]+\/mo|\d+%|\d+\s*year|\d+\s*month|\d+,\d{3}/;
/**
 * Extract up to `maxFacts` sentences from `content` that are most relevant to
 * `query`. Sentences are scored by unique query-keyword overlap (stop-words
 * excluded), with a 1.5x boost when the sentence contains numeric data.
 * Markdown headers, link-heavy navigation lines, image lines, and common
 * teaser/boilerplate openings are filtered out before scoring.
 *
 * @param content - Page text (markdown or plain text).
 * @param query - User query whose keywords drive the scoring.
 * @param maxFacts - Maximum number of sentences to return (default 5).
 * @returns Top-scoring sentences, deduplicated, best first; [] when content,
 *          query, or keyword overlap is empty.
 */
export function extractKeyFacts(content, query, maxFacts = 5) {
    // FIX: previously a non-positive maxFacts still returned one fact, because
    // the cap was only checked after the first push.
    if (!content || !query || maxFacts <= 0)
        return [];
    const queryKeywords = new Set(tokenize(query).filter(w => !STOP_WORDS.has(w)));
    if (queryKeywords.size === 0)
        return [];
    // Split into sentences on common terminators
    const sentences = content
        .replace(/\n{2,}/g, ' ')
        .split(/(?<=[.!?])\s+/)
        .map(s => s.trim())
        // Filter length
        .filter(s => s.length > 40 && s.length < 500)
        // Skip markdown headers (## Heading, # Title)
        .filter(s => !/^#{1,4}\s/.test(s))
        // Skip navigation/link-heavy lines (lots of []() markdown)
        .filter(s => (s.match(/\[.*?\]\(.*?\)/g) || []).length < 3)
        // Skip lines that are just questions or teasers with no data
        .filter(s => !/^(thinking about|wondering|let's|let me|in this article|we'll|here's|read on|click|sign up|subscribe|after diving|but the big question|for full data|source:|select make|select model)/i.test(s))
        // Skip lines that are just italicized markdown filler (_text_)
        .filter(s => !s.startsWith('_') || s.includes('$') || s.includes('%') || /\d/.test(s))
        // Skip markdown image lines (![...](...))
        .filter(s => !/^!\[/.test(s))
        // Skip "Read more about..." lines
        .filter(s => !/^\[read more|^\[learn more|\[read more|\[learn more/i.test(s));
    if (sentences.length === 0)
        return [];
    // Score each sentence by unique keyword overlap (0..1); sentences with
    // numbers (prices, percentages, years) are boosted, not required.
    const scored = sentences.map(sentence => {
        const words = tokenize(sentence);
        let hits = 0;
        const seen = new Set();
        for (const w of words) {
            if (queryKeywords.has(w) && !seen.has(w)) {
                hits++;
                seen.add(w);
            }
        }
        let score = hits / queryKeywords.size;
        if (DATA_VALUE_RE.test(sentence)) {
            score *= 1.5;
        }
        return { sentence, score };
    });
    scored.sort((a, b) => b.score - a.score);
    // Return top N, deduped by the first 80 chars (case-insensitive)
    const seen = new Set();
    const result = [];
    for (const { sentence, score } of scored) {
        if (score === 0)
            break; // sorted desc — everything after has no keyword overlap
        const normalized = sentence.toLowerCase().slice(0, 80);
        if (seen.has(normalized))
            continue;
        seen.add(normalized);
        result.push(sentence);
        if (result.length >= maxFacts)
            break;
    }
    return result;
}
142
// ---------------------------------------------------------------------------
// Route factory
// ---------------------------------------------------------------------------
// LLM providers accepted for optional BYOK synthesis.
const VALID_LLM_PROVIDERS = [
    'openai',
    'anthropic',
    'google',
    'ollama',
    'cerebras',
    'cloudflare',
];
// Hard ceiling on fetched sources, regardless of the client's maxSources.
const MAX_SOURCES_HARD_LIMIT = 8;
// Budget for fetching a single URL, and for the whole request.
const PER_URL_TIMEOUT_MS = 15_000;
const TOTAL_TIMEOUT_MS = 60_000;
/**
 * Build the Express router serving POST /v1/research.
 *
 * Pipeline: expand query → search → fetch top URLs (HTTP only, no browser
 * escalation) → extract key facts → optional BYOK LLM synthesis. Individual
 * search/fetch/LLM failures are non-fatal; the response contains whatever was
 * gathered before the TOTAL_TIMEOUT_MS deadline.
 */
export function createResearchRouter() {
    const router = Router();
    router.post('/v1/research', async (req, res) => {
        const startTime = Date.now();
        // ── Auth ─────────────────────────────────────────────────────────────────
        const authId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (!authId) {
            res.status(401).json({
                success: false,
                error: {
                    type: 'authentication_required',
                    message: 'API key required. Get one at https://app.webpeel.dev/keys',
                    hint: 'Get a free API key at https://app.webpeel.dev/keys',
                    docs: 'https://webpeel.dev/docs/errors#authentication_required',
                },
                requestId: req.requestId,
            });
            return;
        }
        // ── Parse & validate body ─────────────────────────────────────────────
        // FIX: req.body is undefined when no JSON body was sent (or the body
        // parser did not run). Fall back to {} so validation answers 400
        // instead of throwing an uncaught TypeError (this runs before the try).
        const body = req.body ?? {};
        if (!body.query || typeof body.query !== 'string' || body.query.trim().length === 0) {
            res.status(400).json({
                success: false,
                error: {
                    type: 'invalid_request',
                    message: 'Missing or empty "query" field.',
                    hint: 'Send JSON: { "query": "your research question" }',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                },
                requestId: req.requestId,
            });
            return;
        }
        const query = body.query.trim().slice(0, 500); // hard cap
        const depth = body.depth ?? 'quick';
        if (depth !== 'quick' && depth !== 'deep') {
            res.status(400).json({
                success: false,
                error: {
                    type: 'invalid_request',
                    message: 'Invalid "depth" value: must be "quick" or "deep".',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                },
                requestId: req.requestId,
            });
            return;
        }
        // Depth-based defaults: deep = more sources, more search results,
        // more query variations.
        const defaultMaxSources = depth === 'deep' ? 8 : 3;
        const defaultSearchCount = depth === 'deep' ? 10 : 5;
        const numSearchQueries = depth === 'deep' ? 3 : 1;
        const requestedMax = typeof body.maxSources === 'number' ? body.maxSources : defaultMaxSources;
        const maxSources = Math.min(Math.max(1, requestedMax), MAX_SOURCES_HARD_LIMIT);
        // ── Optional BYOK LLM config ──────────────────────────────────────────
        let llmConfig;
        if (body.llm) {
            const { provider, apiKey, model } = body.llm;
            if (!provider || typeof provider !== 'string') {
                res.status(400).json({
                    success: false,
                    error: {
                        type: 'invalid_request',
                        message: 'llm.provider is required when providing llm config.',
                        docs: 'https://webpeel.dev/docs/api-reference#research',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            if (!VALID_LLM_PROVIDERS.includes(provider)) {
                res.status(400).json({
                    success: false,
                    error: {
                        type: 'invalid_request',
                        message: `Invalid llm.provider. Must be one of: ${VALID_LLM_PROVIDERS.join(', ')}`,
                        docs: 'https://webpeel.dev/docs/api-reference#research',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            if (!apiKey || typeof apiKey !== 'string' || apiKey.trim().length === 0) {
                res.status(400).json({
                    success: false,
                    error: {
                        type: 'invalid_request',
                        message: 'llm.apiKey is required when providing llm config.',
                        docs: 'https://webpeel.dev/docs/api-reference#research',
                    },
                    requestId: req.requestId,
                });
                return;
            }
            llmConfig = {
                provider: provider,
                apiKey: apiKey.trim(),
                model: model,
            };
        }
        // ── Set up total-timeout deadline ─────────────────────────────────────
        const overallDeadline = startTime + TOTAL_TIMEOUT_MS;
        try {
            // ── 1. Query expansion ────────────────────────────────────────────────
            const allQueries = expandQuery(query);
            const searchQueries = allQueries.slice(0, numSearchQueries);
            // ── 2. Search all query variations, collect unique URLs ───────────────
            const searchProvider = getSearchProvider('duckduckgo');
            const seenUrls = new Set();
            const urlQueue = [];
            for (const sq of searchQueries) {
                if (Date.now() > overallDeadline - 5_000)
                    break; // stop if < 5s left
                try {
                    const results = await searchProvider.searchWeb(sq, { count: defaultSearchCount });
                    for (const r of results) {
                        if (!r.url || seenUrls.has(r.url))
                            continue;
                        seenUrls.add(r.url);
                        urlQueue.push({ url: r.url, title: r.title, snippet: r.snippet });
                    }
                }
                catch {
                    // Search failure — continue with whatever URLs we have
                }
            }
            // ── 3. Fetch top N unique URLs sequentially ───────────────────────────
            const sources = [];
            const fetchedContents = [];
            for (const { url, title, snippet } of urlQueue) {
                if (sources.length >= maxSources)
                    break;
                if (Date.now() > overallDeadline - 2_000)
                    break;
                const timeLeft = overallDeadline - Date.now();
                const urlTimeout = Math.min(PER_URL_TIMEOUT_MS, timeLeft);
                if (urlTimeout < 1000)
                    break;
                const fetchStart = Date.now();
                // FIX: keep a handle on the race timer so it can be cleared.
                // Previously the setTimeout was never cancelled, leaving one
                // live timer (up to PER_URL_TIMEOUT_MS) per fetched URL that
                // kept the event loop busy after peel() resolved.
                let raceTimer;
                try {
                    const result = await Promise.race([
                        peel(url, {
                            format: 'markdown',
                            noEscalate: true, // NEVER launch browser — 512MB container
                            timeout: urlTimeout,
                            readable: true,
                            budget: 3000,
                        }),
                        new Promise((_, reject) => {
                            raceTimer = setTimeout(() => reject(new Error('per-url timeout')), urlTimeout);
                        }),
                    ]);
                    const fetchTime = Date.now() - fetchStart;
                    const content = result.content || '';
                    const wordCount = content.split(/\s+/).filter(Boolean).length;
                    // FIX: both result.title and the search-result title may be
                    // missing; fall back to the URL so .slice() cannot throw and
                    // a successfully fetched page is not silently dropped.
                    const pageTitle = result.title || title || url;
                    // Snippet: prefer the fetched content, else the search snippet.
                    const sourceSnippet = content.slice(0, 500).replace(/\s+/g, ' ').trim();
                    sources.push({
                        url,
                        title: pageTitle.slice(0, 200),
                        // FIX: snippet from the search result may be undefined.
                        snippet: sourceSnippet || (snippet ?? '').slice(0, 500),
                        wordCount,
                        fetchTime,
                    });
                    if (content.length > 0) {
                        fetchedContents.push({ url, content });
                    }
                }
                catch {
                    // Skip failed URLs, continue to next
                }
                finally {
                    clearTimeout(raceTimer);
                }
            }
            // ── 4. Extract key facts across all fetched pages ─────────────────────
            const allFacts = [];
            const seenFacts = new Set();
            for (const { content } of fetchedContents) {
                const pageFacts = extractKeyFacts(content, query, 5);
                for (const fact of pageFacts) {
                    const key = fact.toLowerCase().slice(0, 100);
                    if (!seenFacts.has(key)) {
                        seenFacts.add(key);
                        allFacts.push(fact);
                    }
                }
                if (allFacts.length >= 20)
                    break; // global cap
            }
            // ── 5. Optional LLM synthesis (non-fatal on failure) ──────────────────
            let summary;
            if (llmConfig && fetchedContents.length > 0 && Date.now() < overallDeadline - 3_000) {
                try {
                    const sourcesText = fetchedContents
                        .map((fc, i) => `[${i + 1}] ${fc.url}\n${fc.content.slice(0, 2000)}`)
                        .join('\n\n---\n\n');
                    const llmResult = await callLLM(llmConfig, {
                        messages: [
                            {
                                role: 'system',
                                content: 'You are a research assistant. Synthesize the following sources into a clear, ' +
                                    'comprehensive answer to the user\'s question. Cite sources by number [1], [2], etc. ' +
                                    'Be concise but thorough. Use plain text without excessive markdown.',
                            },
                            {
                                role: 'user',
                                content: `Question: ${query}\n\nSources:\n\n${sourcesText}`,
                            },
                        ],
                        maxTokens: 1000,
                    });
                    summary = llmResult.text;
                }
                catch {
                    // LLM synthesis failure is non-fatal — return results without summary
                }
            }
            const elapsed = Date.now() - startTime;
            res.json({
                success: true,
                data: {
                    query,
                    ...(summary !== undefined ? { summary } : {}),
                    sources,
                    keyFacts: allFacts,
                    totalSources: sources.length,
                    searchQueries,
                    elapsed,
                },
                requestId: req.requestId,
            });
        }
        catch (error) {
            console.error('[research] Unexpected error:', error);
            if (res.headersSent)
                return;
            res.status(500).json({
                success: false,
                error: {
                    type: 'research_failed',
                    message: 'Research request failed. Please try again.',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                },
                requestId: req.requestId,
            });
        }
    });
    return router;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.28",
3
+ "version": "0.21.30",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",