webpeel 0.21.28 → 0.21.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/http-fetch.d.ts +29 -0
- package/dist/core/http-fetch.js +165 -34
- package/dist/core/search-provider.d.ts +18 -0
- package/dist/core/search-provider.js +263 -4
- package/dist/core/user-agents.d.ts +26 -1
- package/dist/core/user-agents.js +83 -2
- package/dist/server/app.js +4 -0
- package/dist/server/routes/research.d.ts +13 -0
- package/dist/server/routes/research.js +401 -0
- package/package.json +1 -1
|
@@ -4,6 +4,35 @@
|
|
|
4
4
|
*/
|
|
5
5
|
export declare function closePool(): Promise<void>;
|
|
6
6
|
export declare function createAbortError(): Error;
|
|
7
|
+
/**
|
|
8
|
+
* Domains known to aggressively block datacenter IPs.
|
|
9
|
+
* Requests to these domains automatically route through the Webshare residential
|
|
10
|
+
* proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
|
|
11
|
+
*/
|
|
12
|
+
export declare const PROXY_PREFERRED_DOMAINS: readonly string[];
|
|
13
|
+
/**
|
|
14
|
+
* Returns true if the URL's domain is on the proxy-preferred blocklist.
|
|
15
|
+
* Matches exact hostname (sans www.) and all subdomains.
|
|
16
|
+
*
|
|
17
|
+
* @example
|
|
18
|
+
* shouldUseProxy('https://www.reddit.com/r/news') // true
|
|
19
|
+
* shouldUseProxy('https://example.com') // false
|
|
20
|
+
*/
|
|
21
|
+
export declare function shouldUseProxy(url: string): boolean;
|
|
22
|
+
/**
|
|
23
|
+
* Generate browser-like request headers tailored to the User-Agent type.
|
|
24
|
+
*
|
|
25
|
+
* - Chrome/Edge: full Sec-CH-UA + Sec-Fetch-* header set
|
|
26
|
+
* - Firefox: adjusted Accept, TE header, partial Sec-Fetch-* (no Sec-CH-UA)
|
|
27
|
+
* - Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
|
|
28
|
+
* - Other: basic headers only
|
|
29
|
+
*
|
|
30
|
+
* Automatically adds a Google referer for domains where it helps bypass blocks.
|
|
31
|
+
*
|
|
32
|
+
* @param url - Target URL (used for domain-specific header additions)
|
|
33
|
+
* @param userAgent - User-Agent string (determines which header set is applied)
|
|
34
|
+
*/
|
|
35
|
+
export declare function getStealthHeaders(url: string, userAgent: string): Record<string, string>;
|
|
7
36
|
/**
|
|
8
37
|
* SECURITY: Validate URL to prevent SSRF attacks
|
|
9
38
|
* Blocks localhost, private IPs, link-local, and various bypass techniques
|
package/dist/core/http-fetch.js
CHANGED
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
// Must run before any network library is used.
|
|
9
9
|
import dns from 'dns';
|
|
10
10
|
dns.setDefaultResultOrder('ipv4first');
|
|
11
|
-
import {
|
|
11
|
+
import { getHttpUA, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
|
|
12
|
+
import { getWebshareProxyUrl } from './proxy-config.js';
|
|
12
13
|
import { fetch as undiciFetch, Agent, ProxyAgent } from 'undici';
|
|
13
14
|
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
14
15
|
import { getCached } from './cache.js';
|
|
@@ -145,6 +146,149 @@ export function createAbortError() {
|
|
|
145
146
|
error.name = 'AbortError';
|
|
146
147
|
return error;
|
|
147
148
|
}
|
|
149
|
+
// ── Stealth headers & proxy routing ──────────────────────────────────────────
|
|
150
|
+
/**
 * Domains known to aggressively block datacenter IPs.
 * Requests to these domains automatically route through the Webshare residential
 * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
 */
export const PROXY_PREFERRED_DOMAINS = [
    'reddit.com',
    'old.reddit.com',
    'forbes.com',
    'fortune.com',
    'cargurus.com',
    'edmunds.com',
    'cars.com',
    'truecar.com',
    'autotrader.com',
    'carfax.com',
    'tesla.com',
    'nerdwallet.com',
    'bankrate.com',
    'homeadvisor.com',
    'angi.com',
    'insideevs.com',
    'electrek.co',
    'motortrend.com',
    'jdpower.com',
];
/**
 * Returns true if the URL's domain is on the proxy-preferred blocklist.
 * Matches exact hostname (sans www.) and all subdomains.
 *
 * @example
 * shouldUseProxy('https://www.reddit.com/r/news') // true
 * shouldUseProxy('https://example.com') // false
 */
export function shouldUseProxy(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname;
    }
    catch {
        // Unparseable URL: never route through the proxy.
        return false;
    }
    // A leading "www." is irrelevant to the blocklist match.
    const host = hostname.replace(/^www\./, '');
    for (const domain of PROXY_PREFERRED_DOMAINS) {
        // Exact match, or any subdomain (".reddit.com" suffix).
        if (host === domain || host.endsWith(`.${domain}`)) {
            return true;
        }
    }
    return false;
}
|
|
193
|
+
/**
 * Generate browser-like request headers tailored to the User-Agent type.
 *
 * - Chrome/Edge: full Sec-CH-UA + Sec-Fetch-* header set
 * - Firefox: adjusted Accept, TE header, partial Sec-Fetch-* (no Sec-CH-UA)
 * - Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
 * - Other: basic headers only
 *
 * Automatically adds a Google referer for domains where it helps bypass blocks.
 *
 * @param url - Target URL (used for domain-specific header additions)
 * @param userAgent - User-Agent string (determines which header set is applied)
 * @returns Plain object of header name → value pairs
 */
export function getStealthHeaders(url, userAgent) {
    // Classify the UA family. Chrome UAs also contain "Safari", so Safari is
    // only detected when "Chrome" is absent; Chrome is the fallthrough case.
    const isFirefox = userAgent.includes('Firefox');
    const isSafari = userAgent.includes('Safari') && !userAgent.includes('Chrome');
    const isChrome = !isFirefox && !isSafari && (userAgent.includes('Chrome') || userAgent.includes('Chromium'));
    const isMobile = userAgent.includes('Mobile') || userAgent.includes('Android');
    // Base headers all browsers send
    const headers = {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
    };
    if (isFirefox) {
        // Firefox: different Accept, TE, and partial Sec-Fetch (no Sec-CH-UA)
        headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8';
        headers['Accept-Language'] = 'en-US,en;q=0.5';
        headers['TE'] = 'trailers';
        headers['Sec-Fetch-Dest'] = 'document';
        headers['Sec-Fetch-Mode'] = 'navigate';
        headers['Sec-Fetch-Site'] = 'none';
        // Firefox omits Sec-Fetch-User in many navigations
    }
    else if (isSafari) {
        // Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
        headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
        // Safari does not send Sec-Fetch headers at all
    }
    else if (isChrome) {
        // Chrome/Edge: full set of Sec-Fetch-* and Sec-CH-UA headers
        headers['Sec-Fetch-Dest'] = 'document';
        headers['Sec-Fetch-Mode'] = 'navigate';
        headers['Sec-Fetch-Site'] = 'none';
        headers['Sec-Fetch-User'] = '?1';
        headers['Sec-CH-UA'] = getSecCHUA(userAgent);
        headers['Sec-CH-UA-Mobile'] = isMobile ? '?1' : '?0';
        headers['Sec-CH-UA-Platform'] = getSecCHUAPlatform(userAgent);
        headers['Connection'] = 'keep-alive';
        headers['Priority'] = 'u=0, i';
    }
    // else: custom/API UAs (e.g. "WebPeel/1.0") — basic headers only, no browser fingerprints
    // Add Google Referer for domains where it's known to help bypass blocks
    try {
        // FIX: match exact domain or subdomain (same rule as shouldUseProxy)
        // instead of a raw substring test — `hostname.includes('reddit.com')`
        // previously also matched unrelated hosts like 'reddit.com.evil.example'.
        const domain = new URL(url).hostname.replace(/^www\./, '');
        const referrerDomains = [
            'reddit.com', 'forbes.com', 'cargurus.com', 'edmunds.com',
            'cars.com', 'truecar.com', 'nerdwallet.com', 'homeadvisor.com',
            'angi.com', 'motortrend.com', 'jdpower.com', 'electrek.co', 'insideevs.com',
        ];
        if (referrerDomains.some(d => domain === d || domain.endsWith('.' + d))) {
            headers['Referer'] = 'https://www.google.com/';
        }
    }
    catch {
        // Non-fatal: URL parsing failed, skip Referer
    }
    return headers;
}
|
|
266
|
+
/** Pick a different UA than the one currently in use (for 403/503 retries). */
function getDifferentUA(current) {
    // Bounded sampling: the pool is random, so just redraw until we get a
    // different UA, giving up after a fixed number of attempts.
    const MAX_DRAWS = 10;
    for (let draw = 0; draw < MAX_DRAWS; draw++) {
        const candidate = getHttpUA();
        if (candidate !== current) {
            return candidate;
        }
    }
    // Extremely unlikely (tiny pool): accept whatever comes back rather
    // than loop forever.
    return getHttpUA();
}
|
|
275
|
+
/**
 * Build the merged request headers: stealth defaults + caller custom headers.
 * Custom headers win on conflict with the stealth defaults.
 * Throws WebPeelError if customHeaders attempts to override the Host header.
 */
function buildMergedHeaders(url, userAgent, customHeaders) {
    const merged = { ...getStealthHeaders(url, userAgent) };
    if (!customHeaders) {
        return merged;
    }
    for (const [name, value] of Object.entries(customHeaders)) {
        // SECURITY: never allow callers to spoof the Host header
        if (name.toLowerCase() === 'host') {
            throw new WebPeelError('Custom Host header is not allowed');
        }
        merged[name] = value;
    }
    return merged;
}
|
|
148
292
|
// ── SSRF / URL validation ─────────────────────────────────────────────────────
|
|
149
293
|
/**
|
|
150
294
|
* SECURITY: Validate URL to prevent SSRF attacks
|
|
@@ -368,42 +512,19 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
368
512
|
// SEC.gov requires a User-Agent with contact info (their documented automated access policy)
|
|
369
513
|
const hostname = new URL(url).hostname.toLowerCase();
|
|
370
514
|
const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
|
|
371
|
-
|
|
515
|
+
let activeUserAgent = isSecGov
|
|
372
516
|
? 'WebPeel/1.0 (support@webpeel.dev)'
|
|
373
|
-
: (userAgent ? validateUserAgent(userAgent) :
|
|
374
|
-
//
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
'Accept-Encoding': 'br, gzip, deflate',
|
|
380
|
-
'DNT': '1',
|
|
381
|
-
'Connection': 'keep-alive',
|
|
382
|
-
'Upgrade-Insecure-Requests': '1',
|
|
383
|
-
'Sec-CH-UA': getSecCHUA(validatedUserAgent),
|
|
384
|
-
'Sec-CH-UA-Mobile': '?0',
|
|
385
|
-
'Sec-CH-UA-Platform': getSecCHUAPlatform(validatedUserAgent),
|
|
386
|
-
'Sec-Fetch-Dest': 'document',
|
|
387
|
-
'Sec-Fetch-Mode': 'navigate',
|
|
388
|
-
'Sec-Fetch-Site': 'none',
|
|
389
|
-
'Sec-Fetch-User': '?1',
|
|
390
|
-
'Cache-Control': 'max-age=0',
|
|
391
|
-
'Priority': 'u=0, i',
|
|
392
|
-
};
|
|
393
|
-
const mergedHeaders = { ...defaultHeaders };
|
|
394
|
-
if (customHeaders) {
|
|
395
|
-
for (const [key, value] of Object.entries(customHeaders)) {
|
|
396
|
-
// SECURITY: Block Host header override
|
|
397
|
-
if (key.toLowerCase() === 'host') {
|
|
398
|
-
throw new WebPeelError('Custom Host header is not allowed');
|
|
399
|
-
}
|
|
400
|
-
mergedHeaders[key] = value;
|
|
401
|
-
}
|
|
402
|
-
}
|
|
517
|
+
: (userAgent ? validateUserAgent(userAgent) : getHttpUA());
|
|
518
|
+
// Build stealth headers merged with any caller-supplied custom headers
|
|
519
|
+
let mergedHeaders = buildMergedHeaders(url, activeUserAgent, customHeaders);
|
|
520
|
+
// Auto-route through residential proxy for sites known to block datacenter IPs.
|
|
521
|
+
// The explicit `proxy` param always wins; auto-proxy only kicks in when unset.
|
|
522
|
+
const effectiveProxy = proxy ?? (shouldUseProxy(url) ? (getWebshareProxyUrl() ?? undefined) : undefined);
|
|
403
523
|
const MAX_REDIRECTS = 10;
|
|
404
524
|
let redirectCount = 0;
|
|
405
525
|
let currentUrl = url;
|
|
406
526
|
const seenUrls = new Set();
|
|
527
|
+
let retried = false; // track whether we've already retried with a different UA
|
|
407
528
|
try {
|
|
408
529
|
const hostname = new URL(url).hostname;
|
|
409
530
|
void resolveAndCache(hostname).catch(() => {
|
|
@@ -436,8 +557,8 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
436
557
|
if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
|
|
437
558
|
requestHeaders['If-Modified-Since'] = validators.lastModified;
|
|
438
559
|
}
|
|
439
|
-
// Use proxy if provided, otherwise use shared connection pool
|
|
440
|
-
const dispatcher =
|
|
560
|
+
// Use proxy if provided or auto-selected, otherwise use shared connection pool
|
|
561
|
+
const dispatcher = effectiveProxy ? new ProxyAgent(effectiveProxy) : httpPool;
|
|
441
562
|
const response = await undiciFetch(currentUrl, {
|
|
442
563
|
headers: requestHeaders,
|
|
443
564
|
signal,
|
|
@@ -475,6 +596,16 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
|
|
|
475
596
|
}
|
|
476
597
|
if (!response.ok) {
|
|
477
598
|
if (response.status === 403 || response.status === 503) {
|
|
599
|
+
// Retry once with a different UA — cheap and catches UA-based blocks
|
|
600
|
+
if (!retried && !userAgent) {
|
|
601
|
+
retried = true;
|
|
602
|
+
activeUserAgent = getDifferentUA(activeUserAgent);
|
|
603
|
+
mergedHeaders = buildMergedHeaders(currentUrl, activeUserAgent, customHeaders);
|
|
604
|
+
// Allow the retry to re-visit the same URL (not a redirect loop)
|
|
605
|
+
seenUrls.delete(currentUrl);
|
|
606
|
+
log.debug(`HTTP ${response.status} on first attempt; retrying with different UA`);
|
|
607
|
+
continue;
|
|
608
|
+
}
|
|
478
609
|
throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
|
|
479
610
|
}
|
|
480
611
|
const statusText = response.statusText || HTTP_STATUS_TEXT[response.status] || 'Unknown Error';
|
|
@@ -73,6 +73,11 @@ declare class ProviderStatsTracker {
|
|
|
73
73
|
* (e.g. in tests) and to log diagnostics.
|
|
74
74
|
*/
|
|
75
75
|
export declare const providerStats: ProviderStatsTracker;
|
|
76
|
+
/**
|
|
77
|
+
* Merge results from multiple sources, deduplicating by normalized URL.
|
|
78
|
+
* Preserves original order (first occurrence wins) and limits to maxCount.
|
|
79
|
+
*/
|
|
80
|
+
export declare function mergeSearchResults(results: WebSearchResult[], maxCount: number): WebSearchResult[];
|
|
76
81
|
/**
|
|
77
82
|
* Filter and rank results by relevance to the original query.
|
|
78
83
|
*
|
|
@@ -128,6 +133,19 @@ export declare class DuckDuckGoProvider implements SearchProvider {
|
|
|
128
133
|
* works when the main HTML endpoint is temporarily blocked on datacenter IPs.
|
|
129
134
|
*/
|
|
130
135
|
private searchLite;
|
|
136
|
+
/**
|
|
137
|
+
* HTTP-only Bing scraping via undici + cheerio. No browser required.
|
|
138
|
+
* Routes through Webshare proxy (proxy first, direct fallback).
|
|
139
|
+
* Tracks stats via providerStats('bing-http').
|
|
140
|
+
*/
|
|
141
|
+
private _searchBingHttp;
|
|
142
|
+
/**
|
|
143
|
+
* HTTP-only Google scraping via undici + cheerio. No browser required.
|
|
144
|
+
* Routes through Webshare proxy (proxy first, direct fallback).
|
|
145
|
+
* Sends CONSENT cookie to bypass Google consent page.
|
|
146
|
+
* Tracks stats via providerStats('google-http').
|
|
147
|
+
*/
|
|
148
|
+
private _searchGoogleHttp;
|
|
131
149
|
searchWeb(query: string, options: WebSearchOptions): Promise<WebSearchResult[]>;
|
|
132
150
|
/**
|
|
133
151
|
* Exposed for testing: score and filter a pre-fetched result list against a query.
|
|
@@ -195,6 +195,24 @@ function normalizeUrlForDedupe(rawUrl) {
|
|
|
195
195
|
.replace(/\/+$/g, '');
|
|
196
196
|
}
|
|
197
197
|
}
|
|
198
|
+
/**
 * Merge results from multiple sources, deduplicating by normalized URL.
 * Preserves original order (first occurrence wins) and limits to maxCount.
 */
export function mergeSearchResults(results, maxCount) {
    const merged = [];
    const seenKeys = new Set();
    for (const result of results) {
        // Stop as soon as the cap is reached — later duplicates are irrelevant.
        if (merged.length >= maxCount) {
            break;
        }
        const dedupeKey = normalizeUrlForDedupe(result.url);
        if (!seenKeys.has(dedupeKey)) {
            seenKeys.add(dedupeKey);
            merged.push(result);
        }
    }
    return merged;
}
|
|
198
216
|
// ============================================================
|
|
199
217
|
// Result Relevance Filtering
|
|
200
218
|
// Lightweight keyword-overlap scoring — no external deps.
|
|
@@ -206,6 +224,9 @@ const STOP_WORDS = new Set([
|
|
|
206
224
|
'of', 'with', 'how', 'what', 'where', 'when', 'why', 'best', 'top', 'most',
|
|
207
225
|
'and', 'or', 'but', 'not', 'do', 'does', 'did', 'be', 'been', 'have', 'has',
|
|
208
226
|
'buy', 'get', 'find', 'about', 'from', 'by', 'its', 'it', 'this', 'that',
|
|
227
|
+
'much', 'very', 'can', 'will', 'would', 'could', 'should', 'per', 'than',
|
|
228
|
+
'some', 'just', 'also', 'more', 'like', 'make', 'any', 'each', 'all', 'my',
|
|
229
|
+
'your', 'our', 'their', 'me', 'us', 'them', 'so', 'if', 'then', 'here',
|
|
209
230
|
]);
|
|
210
231
|
/**
|
|
211
232
|
* Extract meaningful keywords from a search query by stripping stop words and
|
|
@@ -271,8 +292,10 @@ export function filterRelevantResults(results, query) {
|
|
|
271
292
|
score: scoreResult(r, keywords),
|
|
272
293
|
idx,
|
|
273
294
|
}));
|
|
274
|
-
// Drop results with
|
|
275
|
-
|
|
295
|
+
// Drop results with insufficient overlap — require ≥15% keyword match
|
|
296
|
+
// to filter out dictionary/definition pages that match on a single common word
|
|
297
|
+
const minScore = keywords.length >= 3 ? 0.15 : 0.01;
|
|
298
|
+
const relevant = scored.filter(s => s.score >= minScore);
|
|
276
299
|
// Sort by score descending, original order as tiebreaker
|
|
277
300
|
relevant.sort((a, b) => (b.score !== a.score ? b.score - a.score : a.idx - b.idx));
|
|
278
301
|
return relevant.map(s => ({
|
|
@@ -571,9 +594,21 @@ export class DuckDuckGoProvider {
|
|
|
571
594
|
const attempts = [];
|
|
572
595
|
// Required retry strategy order:
|
|
573
596
|
// 1) original query
|
|
574
|
-
// 2)
|
|
575
|
-
// 3) query
|
|
597
|
+
// 2) keywords-only (strip question words, articles, prepositions)
|
|
598
|
+
// 3) quoted query
|
|
599
|
+
// 4) query site:*
|
|
576
600
|
attempts.push(q);
|
|
601
|
+
// For long queries (>5 words), extract just the meaningful keywords
|
|
602
|
+
// "how much does a used 2023 Tesla Model 3 cost per month" → "2023 Tesla Model 3 cost month"
|
|
603
|
+
const words = q.split(/\s+/);
|
|
604
|
+
if (words.length > 5) {
|
|
605
|
+
const keywordsOnly = words
|
|
606
|
+
.filter(w => !STOP_WORDS.has(w.toLowerCase()) && w.length >= 2)
|
|
607
|
+
.join(' ');
|
|
608
|
+
if (keywordsOnly && keywordsOnly !== q) {
|
|
609
|
+
attempts.push(keywordsOnly);
|
|
610
|
+
}
|
|
611
|
+
}
|
|
577
612
|
if (!/^".*"$/.test(q))
|
|
578
613
|
attempts.push(`"${q}"`);
|
|
579
614
|
attempts.push(`${q} site:*`);
|
|
@@ -776,6 +811,219 @@ export class DuckDuckGoProvider {
|
|
|
776
811
|
});
|
|
777
812
|
return results;
|
|
778
813
|
}
|
|
814
|
+
/**
 * HTTP-only Bing scraping via undici + cheerio. No browser required.
 * Routes through Webshare proxy (proxy first, direct fallback).
 * Tracks stats via providerStats('bing-http').
 *
 * Currently disabled at the call site (Stage 3.5 in searchWeb) — kept for
 * future re-enablement once request fingerprinting is improved.
 *
 * @param query - Raw search query string (URL-encoded into the Bing request)
 * @param options - Expects { count, signal }: max results and an optional abort signal
 * @returns Array of { title, url, snippet } objects; empty array on any failure
 */
// @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
async _searchBingHttp(query, options) {
    const { count, signal } = options;
    // Adaptive timeout: if this provider has been failing more than half the
    // time recently, fail fast (3s) instead of waiting the full 8s budget.
    const bingRate = providerStats.getFailureRate('bing-http');
    const timeoutMs = bingRate > 0.5 ? 3_000 : 8_000;
    // Combined signal: fires on either the timeout or the caller's signal.
    const bingSignal = createTimeoutSignal(timeoutMs, signal);
    const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=10`;
    // Static Chrome-on-macOS fingerprint for the search request.
    const headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
    };
    const proxyUrl = getWebshareProxyUrl();
    let response;
    try {
        // Proxy-first strategy: try the residential proxy when configured,
        // fall back to a direct request if the proxy itself errors.
        if (proxyUrl) {
            try {
                const dispatcher = new ProxyAgent(proxyUrl);
                response = await undiciFetch(url, { headers, signal: bingSignal, dispatcher });
            }
            catch (proxyErr) {
                log.debug('Bing HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
                response = await undiciFetch(url, { headers, signal: bingSignal });
            }
        }
        else {
            response = await undiciFetch(url, { headers, signal: bingSignal });
        }
        if (!response.ok) {
            // Non-2xx is treated as a provider failure for circuit-breaker stats.
            providerStats.record('bing-http', false);
            return [];
        }
        const html = await response.text();
        const $ = load(html);
        const results = [];
        const seen = new Set();
        // Parse Bing organic results; skip ad containers
        // NOTE(review): a bare `return` inside .each only skips the current
        // element — cheerio keeps iterating past `count` (returning false
        // would break early). Harmless, just a little wasted parsing.
        $('li.b_algo').each((_i, elem) => {
            if (results.length >= count)
                return;
            const $r = $(elem);
            // Skip if inside a .b_ad block or is itself an ad container
            if ($r.hasClass('b_ad') || $r.closest('.b_ad').length > 0)
                return;
            const $a = $r.find('h2 > a').first();
            const title = cleanText($a.text(), { maxLen: 200 });
            const rawUrl = $a.attr('href') || '';
            if (!title || !rawUrl)
                return;
            // Decode Bing redirect URLs:
            // Relative: /ck/a?!&&p=...&u=a1<base64url>&ntb=1
            // Absolute: https://www.bing.com/ck/a?...&u=a1<base64url>&ntb=1
            let finalUrl = rawUrl;
            try {
                const base = rawUrl.startsWith('/') ? `https://www.bing.com${rawUrl}` : rawUrl;
                const ckUrl = new URL(base);
                if (ckUrl.hostname.endsWith('bing.com') && ckUrl.pathname.startsWith('/ck/')) {
                    const u = ckUrl.searchParams.get('u');
                    // The 'u' param carries the real target as "a1" + base64url(url).
                    if (u && u.startsWith('a1')) {
                        const decoded = Buffer.from(u.slice(2), 'base64url').toString('utf-8');
                        if (decoded.startsWith('http'))
                            finalUrl = decoded;
                    }
                }
            }
            catch { /* use rawUrl as-is */ }
            // Validate: HTTP/HTTPS only
            try {
                const parsed = new URL(finalUrl);
                if (!['http:', 'https:'].includes(parsed.protocol))
                    return;
                // Re-serialize through URL to normalize the href.
                finalUrl = parsed.href;
            }
            catch {
                return;
            }
            // Dedupe across result blocks by normalized URL.
            const key = normalizeUrlForDedupe(finalUrl);
            if (seen.has(key))
                return;
            seen.add(key);
            // Prefer the caption paragraph; fall back to the whole caption block.
            const snippetRaw = $r.find('.b_caption p').first().text() ||
                $r.find('.b_caption').first().text();
            const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
            results.push({ title, url: finalUrl, snippet });
        });
        // Zero parsed results counts as a failure (likely a block/consent page).
        providerStats.record('bing-http', results.length > 0);
        return results;
    }
    catch (e) {
        // Network/timeout/abort errors: log at debug, record failure, degrade to [].
        log.debug('Bing HTTP search failed:', e instanceof Error ? e.message : e);
        providerStats.record('bing-http', false);
        return [];
    }
}
|
|
918
|
+
/**
 * HTTP-only Google scraping via undici + cheerio. No browser required.
 * Routes through Webshare proxy (proxy first, direct fallback).
 * Sends CONSENT cookie to bypass Google consent page.
 * Tracks stats via providerStats('google-http').
 *
 * Currently disabled at the call site (Stage 3.5 in searchWeb) — kept for
 * future re-enablement once request fingerprinting is improved.
 *
 * @param query - Raw search query string (URL-encoded into the Google request)
 * @param options - Expects { count, signal }: max results and an optional abort signal
 * @returns Array of { title, url, snippet } objects; empty array on any failure
 */
// @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
async _searchGoogleHttp(query, options) {
    const { count, signal } = options;
    // Adaptive timeout: if this provider has been failing more than half the
    // time recently, fail fast (3s) instead of waiting the full 8s budget.
    const googleRate = providerStats.getFailureRate('google-http');
    const timeoutMs = googleRate > 0.5 ? 3_000 : 8_000;
    // Combined signal: fires on either the timeout or the caller's signal.
    const googleSignal = createTimeoutSignal(timeoutMs, signal);
    const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=10&hl=en`;
    // Static Chrome-on-macOS fingerprint for the search request.
    const headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        // Skip Google consent/cookie wall
        'Cookie': 'CONSENT=YES+; SOCS=CAESEwgDEgk0OTg3ODQ2NzMaAmVuIAEaBgiA0LqmBg',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
    };
    const proxyUrl = getWebshareProxyUrl();
    let response;
    try {
        // Proxy-first strategy: try the residential proxy when configured,
        // fall back to a direct request if the proxy itself errors.
        if (proxyUrl) {
            try {
                const dispatcher = new ProxyAgent(proxyUrl);
                response = await undiciFetch(url, { headers, signal: googleSignal, dispatcher });
            }
            catch (proxyErr) {
                log.debug('Google HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
                response = await undiciFetch(url, { headers, signal: googleSignal });
            }
        }
        else {
            response = await undiciFetch(url, { headers, signal: googleSignal });
        }
        if (!response.ok) {
            // Non-2xx is treated as a provider failure for circuit-breaker stats.
            providerStats.record('google-http', false);
            return [];
        }
        const html = await response.text();
        const $ = load(html);
        const results = [];
        const seen = new Set();
        // Google organic results live in div.g blocks.
        // Skip ad blocks (data-text-ad attr), People Also Ask, and related searches.
        // NOTE(review): a bare `return` inside .each only skips the current
        // element — cheerio keeps iterating past `count` (returning false
        // would break early). Harmless, just a little wasted parsing.
        $('div.g').each((_i, elem) => {
            if (results.length >= count)
                return;
            const $r = $(elem);
            // Skip ad containers (data-text-ad may be on div.g itself or on a descendant)
            if ($r.attr('data-text-ad') !== undefined || $r.find('[data-text-ad]').length > 0)
                return;
            if ($r.closest('.commercial-unit-desktop-top, .ads-ad').length > 0)
                return;
            const $h3 = $r.find('h3').first();
            if (!$h3.length)
                return;
            // Find a valid external link (starts with http, not a Google domain)
            const $a = $r.find('a[href]').filter((_j, el) => {
                const href = $(el).attr('href') || '';
                return href.startsWith('http') && !href.includes('google.com/');
            }).first();
            if (!$a.length)
                return;
            const href = $a.attr('href') || '';
            // Validate URL
            let finalUrl;
            try {
                const parsed = new URL(href);
                if (!['http:', 'https:'].includes(parsed.protocol))
                    return;
                // NOTE(review): substring check rejects any host that merely
                // contains "google.com" — presumably intended to drop all
                // Google properties; confirm this over-exclusion is acceptable.
                if (parsed.hostname.includes('google.com'))
                    return;
                finalUrl = parsed.href;
            }
            catch {
                return;
            }
            // Dedupe across result blocks by normalized URL.
            const key = normalizeUrlForDedupe(finalUrl);
            if (seen.has(key))
                return;
            seen.add(key);
            const title = cleanText($h3.text(), { maxLen: 200 });
            if (!title)
                return;
            // Snippet: try multiple known Google snippet CSS classes/attrs
            // (ordered from most to least commonly observed markup).
            const snippetRaw = $r.find('.VwiC3b').first().text() ||
                $r.find('[data-sncf]').first().text() ||
                $r.find('[style*="-webkit-line-clamp"]').first().text() ||
                $r.find('.st').first().text() ||
                '';
            const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
            results.push({ title, url: finalUrl, snippet });
        });
        // Zero parsed results counts as a failure (likely a block/consent page).
        providerStats.record('google-http', results.length > 0);
        return results;
    }
    catch (e) {
        // Network/timeout/abort errors: log at debug, record failure, degrade to [].
        log.debug('Google HTTP search failed:', e instanceof Error ? e.message : e);
        providerStats.record('google-http', false);
        return [];
    }
}
|
|
779
1027
|
async searchWeb(query, options) {
|
|
780
1028
|
const attempts = this.buildQueryAttempts(query);
|
|
781
1029
|
// -----------------------------------------------------------
|
|
@@ -867,6 +1115,17 @@ export class DuckDuckGoProvider {
|
|
|
867
1115
|
}
|
|
868
1116
|
}
|
|
869
1117
|
// -----------------------------------------------------------
|
|
1118
|
+
// Stage 3.5: HTTP-based Bing + Google (no browser, no API key)
|
|
1119
|
+
// DISABLED: Both Bing and Google detect non-browser HTTP clients and
|
|
1120
|
+
// serve different/irrelevant content (dictionary pages, random sites).
|
|
1121
|
+
// The scrapers are built (searchBingHttp, searchGoogleHttp) but need
|
|
1122
|
+
// further work on request fingerprinting to get real results.
|
|
1123
|
+
// TODO: Re-enable when fingerprinting is improved.
|
|
1124
|
+
// -----------------------------------------------------------
|
|
1125
|
+
// const skipBingHttp = providerStats.shouldSkip('bing-http');
|
|
1126
|
+
// const skipGoogleHttp = providerStats.shouldSkip('google-http');
|
|
1127
|
+
// if (!skipBingHttp || !skipGoogleHttp) { ... }
|
|
1128
|
+
// -----------------------------------------------------------
|
|
870
1129
|
// Stage 4: Stealth multi-engine (DDG + Bing + Ecosia in parallel)
|
|
871
1130
|
// Bypasses bot-detection on datacenter IPs. This is the reliable
|
|
872
1131
|
// last resort — but it spins up a browser so it takes a few seconds.
|
|
@@ -9,6 +9,11 @@
|
|
|
9
9
|
* Also provides `getSecCHUA()` for generating correct Sec-CH-UA header values
|
|
10
10
|
* that match the selected user agent (version-accurate brand hints).
|
|
11
11
|
*/
|
|
12
|
+
/**
|
|
13
|
+
* Full UA pool for HTTP-only requests (Chrome + Firefox + Safari + Edge + Mobile).
|
|
14
|
+
* NOT for browser contexts — use getRealisticUserAgent() there (Chrome-only).
|
|
15
|
+
*/
|
|
16
|
+
export declare const HTTP_UAS: readonly string[];
|
|
12
17
|
/**
|
|
13
18
|
* Returns a realistic, recent Chrome user agent string.
|
|
14
19
|
* Randomly picks from a curated list of real-world UAs (Chrome 132-136 range).
|
|
@@ -32,7 +37,27 @@ export declare function getRealisticUserAgent(platform?: 'windows' | 'mac' | 'li
|
|
|
32
37
|
*/
|
|
33
38
|
export declare function getRandomUA(): string;
|
|
34
39
|
/**
|
|
35
|
-
*
|
|
40
|
+
* Returns a realistic user agent for HTTP-only (non-browser) requests.
|
|
41
|
+
* Unlike `getRealisticUserAgent()` which is Chrome-only for browser contexts,
|
|
42
|
+
* this function returns from a wider pool: Chrome, Firefox, Safari, Edge, and Mobile.
|
|
43
|
+
*
|
|
44
|
+
* Weight distribution (approximate):
|
|
45
|
+
* - Chrome Windows: ~30%
|
|
46
|
+
* - Chrome macOS: ~25%
|
|
47
|
+
* - Chrome Linux: ~10%
|
|
48
|
+
* - Firefox: ~15%
|
|
49
|
+
* - Safari: ~10%
|
|
50
|
+
* - Edge: ~5%
|
|
51
|
+
* - Mobile Chrome: ~5%
|
|
52
|
+
*
|
|
53
|
+
* @example
|
|
54
|
+
* ```ts
|
|
55
|
+
* const ua = getHttpUA(); // e.g. "Mozilla/5.0 ... Firefox/133.0"
|
|
56
|
+
* ```
|
|
57
|
+
*/
|
|
58
|
+
export declare function getHttpUA(): string;
|
|
59
|
+
/**
|
|
60
|
+
* The full curated list of realistic user agents (Chrome-only, all platforms).
|
|
36
61
|
* Exported for inspection / testing.
|
|
37
62
|
*/
|
|
38
63
|
export declare const REALISTIC_USER_AGENTS: readonly string[];
|
package/dist/core/user-agents.js
CHANGED
|
@@ -42,8 +42,44 @@ const LINUX_UAS = [
|
|
|
42
42
|
// Chrome 136 Linux
|
|
43
43
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
|
|
44
44
|
];
|
|
45
|
-
/** All UAs combined (fallback when no platform is specified) */
|
|
45
|
+
/** All Chrome UAs combined (fallback when no platform is specified) */
const ALL_UAS = [...WINDOWS_UAS, ...MAC_UAS, ...LINUX_UAS];
// ── Extended pools for non-Chrome browsers (HTTP-only use) ───────────────────
/** Firefox UAs — Windows, Mac, Linux (Gecko engine). */
const FIREFOX_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0',
];
/** Safari UAs — macOS only (WebKit engine). */
const SAFARI_UAS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 15_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
];
/** Microsoft Edge UAs — Windows (Chromium-based, "Edg/" token). */
const EDGE_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0',
];
/** Mobile Chrome UAs — Android and iOS (CriOS). */
const MOBILE_CHROME_UAS = [
    'Mozilla/5.0 (Linux; Android 14; SM-S928B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.6778.103 Mobile/15E148 Safari/604.1',
];
/**
 * Full UA pool for HTTP-only requests (Chrome + Firefox + Safari + Edge + Mobile).
 * NOT for browser contexts — use getRealisticUserAgent() there (Chrome-only).
 */
export const HTTP_UAS = ALL_UAS.concat(FIREFOX_UAS, SAFARI_UAS, EDGE_UAS, MOBILE_CHROME_UAS);
|
|
47
83
|
// ── Public API ────────────────────────────────────────────────────────────────
|
|
48
84
|
/**
|
|
49
85
|
* Returns a realistic, recent Chrome user agent string.
|
|
@@ -97,7 +133,52 @@ export function getRandomUA() {
|
|
|
97
133
|
return ALL_UAS[idx];
|
|
98
134
|
}
|
|
99
135
|
/**
 * Returns a realistic user agent for HTTP-only (non-browser) requests.
 * Unlike `getRealisticUserAgent()` which is Chrome-only for browser contexts,
 * this function draws from a wider pool: Chrome, Firefox, Safari, Edge, and Mobile.
 *
 * Weight distribution (approximate):
 * - Chrome Windows: ~30%
 * - Chrome macOS: ~25%
 * - Chrome Linux: ~10%
 * - Firefox: ~15%
 * - Safari: ~10%
 * - Edge: ~5%
 * - Mobile Chrome: ~5%
 *
 * @example
 * ```ts
 * const ua = getHttpUA(); // e.g. "Mozilla/5.0 ... Firefox/133.0"
 * ```
 */
export function getHttpUA() {
    // Cumulative probability table: the first bucket whose upper bound
    // exceeds the roll wins. Bounds mirror the weights documented above.
    const buckets = [
        { bound: 0.30, pool: WINDOWS_UAS },
        { bound: 0.55, pool: MAC_UAS },
        { bound: 0.65, pool: LINUX_UAS },
        { bound: 0.80, pool: FIREFOX_UAS },
        { bound: 0.90, pool: SAFARI_UAS },
        { bound: 0.95, pool: EDGE_UAS },
        { bound: Infinity, pool: MOBILE_CHROME_UAS },
    ];
    const roll = Math.random();
    const { pool } = buckets.find(b => roll < b.bound);
    // Uniform pick within the chosen pool.
    return pool[Math.floor(Math.random() * pool.length)];
}
|
|
180
|
+
/**
 * The full curated list of realistic user agents (Chrome-only, all platforms).
 * Exported for inspection / testing.
 *
 * Alias of ALL_UAS (the Chrome pool that getRandomUA() draws from); the wider
 * multi-browser pool for HTTP-only requests is exported separately as HTTP_UAS.
 */
export const REALISTIC_USER_AGENTS = ALL_UAS;
|
package/dist/server/app.js
CHANGED
|
@@ -28,6 +28,7 @@ import { createJobsRouter } from './routes/jobs.js';
|
|
|
28
28
|
import { createBatchRouter } from './routes/batch.js';
|
|
29
29
|
import { createAnswerRouter } from './routes/answer.js';
|
|
30
30
|
import { createDeepResearchRouter } from './routes/deep-research.js';
|
|
31
|
+
import { createResearchRouter } from './routes/research.js';
|
|
31
32
|
import { createAskRouter } from './routes/ask.js';
|
|
32
33
|
import { createMcpRouter } from './routes/mcp.js';
|
|
33
34
|
import { createDoRouter } from './routes/do.js';
|
|
@@ -291,6 +292,9 @@ export function createApp(config = {}) {
|
|
|
291
292
|
app.use('/v1/screenshot', requireScope('full', 'read'));
|
|
292
293
|
app.use(createScreenshotRouter(authStore));
|
|
293
294
|
app.use(createSearchRouter(authStore));
|
|
295
|
+
// /v1/research — lightweight research (search → fetch → compile), BYOK LLM optional
|
|
296
|
+
app.use('/v1/research', requireScope('full', 'read'));
|
|
297
|
+
app.use(createResearchRouter());
|
|
294
298
|
app.use(createBillingPortalRouter(pool));
|
|
295
299
|
app.use(createUserRouter());
|
|
296
300
|
app.use(createOAuthRouter());
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * POST /v1/research
 *
 * Lightweight research endpoint that chains search → fetch → compile.
 * No LLM required for baseline results; optional BYOK LLM synthesis.
 *
 * Auth: API key required (full or read scope)
 * Body: ResearchRequest
 */
import { Router } from 'express';
/**
 * Expand a raw query into up to three search variations using cheap
 * heuristics (year suffix for time-sensitive queries, keyword rephrasing
 * of "how much / how to / what is" prefixes). No LLM involved.
 */
export declare function expandQuery(query: string): string[];
/**
 * Rank the sentences of `content` by unique-keyword overlap with `query`
 * (stop-words ignored) and return up to `maxFacts` (default 5) deduplicated
 * sentences, best first. Sentences with zero overlap are never returned.
 */
export declare function extractKeyFacts(content: string, query: string, maxFacts?: number): string[];
/** Factory for the Express router that serves POST /v1/research. */
export declare function createResearchRouter(): Router;
|
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* POST /v1/research
|
|
3
|
+
*
|
|
4
|
+
* Lightweight research endpoint that chains search → fetch → compile.
|
|
5
|
+
* No LLM required for baseline results; optional BYOK LLM synthesis.
|
|
6
|
+
*
|
|
7
|
+
* Auth: API key required (full or read scope)
|
|
8
|
+
* Body: ResearchRequest
|
|
9
|
+
*/
|
|
10
|
+
import { Router } from 'express';
|
|
11
|
+
import { peel } from '../../index.js';
|
|
12
|
+
import { getSearchProvider } from '../../core/search-provider.js';
|
|
13
|
+
import { callLLM, } from '../../core/llm-provider.js';
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Query expansion — simple heuristics, no LLM needed
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// Current calendar year, appended to time-sensitive queries.
const CURRENT_YEAR = new Date().getFullYear();
// Keywords that suggest the query is time-sensitive
const TIME_SENSITIVE_PATTERNS = /\b(price|cost|best|top|latest|current|now|today|new|salary|rate|speed|version|release|stock|review)\b/i;
// Question prefixes that can be rephrased into keyword-style queries
const HOW_MUCH_RE = /^how much (?:does|do|is|are) (.+?)(?:\s+cost|\s+price|\s+charge)?[\s?]*$/i;
const HOW_TO_RE = /^how (?:to|do(?:es)?) (.+?)[\s?]*$/i;
const WHAT_IS_RE = /^(?:what (?:is|are)) (.+?)[\s?]*$/i;
/**
 * Expand a search query into up to three variations using cheap heuristics
 * (no LLM): the original query, a year-suffixed variant for time-sensitive
 * queries, and keyword-style rephrasings of common question prefixes.
 */
export function expandQuery(query) {
    const base = query.trim();
    const variants = [base];
    // Push a variant only if it is not already present.
    const addUnique = (candidate) => {
        if (!variants.includes(candidate)) {
            variants.push(candidate);
        }
    };
    // Year variant: only when time-sensitive and no year already present.
    const alreadyDated = /\b(20\d{2}|19\d{2})\b/.test(base);
    if (!alreadyDated && TIME_SENSITIVE_PATTERNS.test(base)) {
        variants.push(`${base} ${CURRENT_YEAR}`);
    }
    // Question-prefix rephrasings: "how much does X" → "X cost price",
    // "how to X" → "X guide", "what is X" → "X overview".
    const rephrasings = [
        { pattern: HOW_MUCH_RE, suffix: 'cost price' },
        { pattern: HOW_TO_RE, suffix: 'guide' },
        { pattern: WHAT_IS_RE, suffix: 'overview' },
    ];
    for (const { pattern, suffix } of rephrasings) {
        const match = pattern.exec(base);
        if (match) {
            addUnique(`${match[1].trim()} ${suffix}`);
        }
    }
    // Never return more than three variations.
    return variants.slice(0, 3);
}
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
// Key-fact extraction — score sentences by keyword overlap
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
/** Lowercase word tokens of length ≥ 3 (shorter tokens carry little signal). */
function tokenize(text) {
    return text
        .toLowerCase()
        .split(/\W+/)
        .filter(w => w.length > 2);
}
// Common English stop-words to skip when scoring
const STOP_WORDS = new Set([
    'the', 'and', 'for', 'are', 'was', 'were', 'but', 'not', 'you', 'all',
    'can', 'her', 'his', 'its', 'our', 'out', 'one', 'had', 'has', 'have',
    'this', 'that', 'with', 'they', 'from', 'your', 'what', 'when', 'how',
    'will', 'been', 'than', 'more', 'also', 'into', 'which', 'about',
]);
/**
 * Pick the sentences of `content` most relevant to `query`.
 *
 * Sentences are scored by unique-keyword overlap with the query (stop-words
 * ignored), with a 1.5× boost for sentences containing numeric data such as
 * prices, percentages, or durations. Markdown headers, images, link-heavy
 * navigation, and teaser lines are filtered out before scoring. Returns up to
 * `maxFacts` deduplicated sentences, best first; sentences with zero keyword
 * overlap are never returned.
 */
export function extractKeyFacts(content, query, maxFacts = 5) {
    if (!content || !query)
        return [];
    const queryKeywords = new Set(tokenize(query).filter(w => !STOP_WORDS.has(w)));
    if (queryKeywords.size === 0)
        return [];
    // A sentence survives only if it looks like real prose: reasonable length,
    // not a markdown header/image, not link-heavy, not a teaser/navigation
    // line, and not italic filler unless it carries numbers.
    const isProse = (s) => s.length > 40
        && s.length < 500
        && !/^#{1,4}\s/.test(s)
        && (s.match(/\[.*?\]\(.*?\)/g) || []).length < 3
        && !/^(thinking about|wondering|let's|let me|in this article|we'll|here's|read on|click|sign up|subscribe|after diving|but the big question|for full data|source:|select make|select model)/i.test(s)
        && (!s.startsWith('_') || s.includes('$') || s.includes('%') || /\d/.test(s))
        && !/^!\[/.test(s)
        && !/^\[read more|^\[learn more|\[read more|\[learn more/i.test(s);
    // Split into sentences on common terminators, then filter.
    const sentences = content
        .replace(/\n{2,}/g, ' ')
        .split(/(?<=[.!?])\s+/)
        .map(s => s.trim())
        .filter(isProse);
    if (sentences.length === 0)
        return [];
    // Fraction of unique query keywords present, boosted when the sentence
    // carries concrete numbers (prices, percentages, durations).
    const scoreOf = (sentence) => {
        const matched = new Set(tokenize(sentence).filter(w => queryKeywords.has(w)));
        const base = matched.size / queryKeywords.size;
        const hasData = /\$[\d,]+|[\d,]+\/mo|\d+%|\d+\s*year|\d+\s*month|\d+,\d{3}/.test(sentence);
        return hasData ? base * 1.5 : base;
    };
    const ranked = sentences
        .map(sentence => ({ sentence, score: scoreOf(sentence) }))
        .sort((a, b) => b.score - a.score);
    // Collect the top sentences, skipping near-duplicates (same first 80 chars).
    const picked = [];
    const fingerprints = new Set();
    for (const { sentence, score } of ranked) {
        if (score === 0)
            break; // everything below has zero keyword overlap
        const fp = sentence.toLowerCase().slice(0, 80);
        if (fingerprints.has(fp))
            continue;
        fingerprints.add(fp);
        picked.push(sentence);
        if (picked.length >= maxFacts)
            break;
    }
    return picked;
}
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
// Route factory
|
|
144
|
+
// ---------------------------------------------------------------------------
|
|
145
|
+
/** LLM providers accepted for optional BYOK synthesis. */
const VALID_LLM_PROVIDERS = [
    'openai',
    'anthropic',
    'google',
    'ollama',
    'cerebras',
    'cloudflare',
];
/** Hard cap on fetched sources, regardless of what the client requests. */
const MAX_SOURCES_HARD_LIMIT = 8;
/** Budget for fetching a single URL. */
const PER_URL_TIMEOUT_MS = 15_000;
/** Budget for the whole request (search + fetch + optional LLM synthesis). */
const TOTAL_TIMEOUT_MS = 60_000;
/**
 * Write the standard error envelope used by this route.
 *
 * @param res       Express response
 * @param requestId Correlation id echoed back to the client
 * @param status    HTTP status code
 * @param error     Error payload ({ type, message, hint?, docs? })
 */
function sendResearchError(res, requestId, status, error) {
    res.status(status).json({ success: false, error, requestId });
}
/**
 * Creates the router handling POST /v1/research.
 *
 * Pipeline: expand query → search (DuckDuckGo) → fetch top URLs without a
 * browser → extract key facts → optional BYOK LLM synthesis. All stages share
 * a single deadline so the endpoint never runs past TOTAL_TIMEOUT_MS.
 *
 * Auth: API key required (full or read scope).
 */
export function createResearchRouter() {
    const router = Router();
    router.post('/v1/research', async (req, res) => {
        const startTime = Date.now();
        // ── Auth ─────────────────────────────────────────────────────────────
        const authId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (!authId) {
            sendResearchError(res, req.requestId, 401, {
                type: 'authentication_required',
                message: 'API key required. Get one at https://app.webpeel.dev/keys',
                hint: 'Get a free API key at https://app.webpeel.dev/keys',
                docs: 'https://webpeel.dev/docs/errors#authentication_required',
            });
            return;
        }
        // ── Parse & validate body ─────────────────────────────────────────
        const body = req.body;
        if (!body.query || typeof body.query !== 'string' || body.query.trim().length === 0) {
            sendResearchError(res, req.requestId, 400, {
                type: 'invalid_request',
                message: 'Missing or empty "query" field.',
                hint: 'Send JSON: { "query": "your research question" }',
                docs: 'https://webpeel.dev/docs/api-reference#research',
            });
            return;
        }
        const query = body.query.trim().slice(0, 500); // hard cap
        const depth = body.depth ?? 'quick';
        if (depth !== 'quick' && depth !== 'deep') {
            sendResearchError(res, req.requestId, 400, {
                type: 'invalid_request',
                message: 'Invalid "depth" value: must be "quick" or "deep".',
                docs: 'https://webpeel.dev/docs/api-reference#research',
            });
            return;
        }
        // Depth-based defaults
        const defaultMaxSources = depth === 'deep' ? 8 : 3;
        const defaultSearchCount = depth === 'deep' ? 10 : 5;
        const numSearchQueries = depth === 'deep' ? 3 : 1;
        const requestedMax = typeof body.maxSources === 'number' ? body.maxSources : defaultMaxSources;
        const maxSources = Math.min(Math.max(1, requestedMax), MAX_SOURCES_HARD_LIMIT);
        // ── Optional LLM config (BYOK) ────────────────────────────────────
        let llmConfig;
        if (body.llm) {
            const { provider, apiKey, model } = body.llm;
            if (!provider || typeof provider !== 'string') {
                sendResearchError(res, req.requestId, 400, {
                    type: 'invalid_request',
                    message: 'llm.provider is required when providing llm config.',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                });
                return;
            }
            if (!VALID_LLM_PROVIDERS.includes(provider)) {
                sendResearchError(res, req.requestId, 400, {
                    type: 'invalid_request',
                    message: `Invalid llm.provider. Must be one of: ${VALID_LLM_PROVIDERS.join(', ')}`,
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                });
                return;
            }
            if (!apiKey || typeof apiKey !== 'string' || apiKey.trim().length === 0) {
                sendResearchError(res, req.requestId, 400, {
                    type: 'invalid_request',
                    message: 'llm.apiKey is required when providing llm config.',
                    docs: 'https://webpeel.dev/docs/api-reference#research',
                });
                return;
            }
            llmConfig = {
                provider,
                apiKey: apiKey.trim(),
                model,
            };
        }
        // ── Set up total-timeout deadline shared by all stages ────────────
        const overallDeadline = startTime + TOTAL_TIMEOUT_MS;
        try {
            // ── 1. Query expansion ────────────────────────────────────────
            const allQueries = expandQuery(query);
            const searchQueries = allQueries.slice(0, numSearchQueries);
            // ── 2. Search all query variations, collect unique URLs ───────
            const searchProvider = getSearchProvider('duckduckgo');
            const seenUrls = new Set();
            const urlQueue = [];
            for (const sq of searchQueries) {
                if (Date.now() > overallDeadline - 5_000)
                    break; // stop searching if < 5s of budget left
                try {
                    const results = await searchProvider.searchWeb(sq, { count: defaultSearchCount });
                    for (const r of results) {
                        if (!r.url || seenUrls.has(r.url))
                            continue;
                        seenUrls.add(r.url);
                        urlQueue.push({ url: r.url, title: r.title, snippet: r.snippet });
                    }
                }
                catch {
                    // Search failure — continue with whatever URLs we have
                }
            }
            // ── 3. Fetch top N unique URLs sequentially ───────────────────
            const sources = [];
            const fetchedContents = [];
            for (const { url, title, snippet } of urlQueue) {
                if (sources.length >= maxSources)
                    break;
                if (Date.now() > overallDeadline - 2_000)
                    break;
                const timeLeft = overallDeadline - Date.now();
                const urlTimeout = Math.min(PER_URL_TIMEOUT_MS, timeLeft);
                if (urlTimeout < 1000)
                    break;
                const fetchStart = Date.now();
                // FIX: keep a handle on the race timer so it can be cleared —
                // previously each fast fetch leaked a live timer for up to 15s.
                let timeoutHandle;
                try {
                    const result = await Promise.race([
                        peel(url, {
                            format: 'markdown',
                            noEscalate: true, // NEVER launch browser — 512MB container
                            timeout: urlTimeout,
                            readable: true,
                            budget: 3000,
                        }),
                        new Promise((_, reject) => {
                            timeoutHandle = setTimeout(() => reject(new Error('per-url timeout')), urlTimeout);
                        }),
                    ]);
                    const fetchTime = Date.now() - fetchStart;
                    const content = result.content || '';
                    const wordCount = content.split(/\s+/).filter(Boolean).length;
                    // FIX: search results may omit title/snippet — coalesce to ''
                    // instead of throwing, which silently dropped the source.
                    const pageTitle = result.title || title || '';
                    // Build snippet: prefer fetched content, else search snippet
                    const sourceSnippet = content.slice(0, 500).replace(/\s+/g, ' ').trim();
                    sources.push({
                        url,
                        title: pageTitle.slice(0, 200),
                        snippet: sourceSnippet || (snippet ?? '').slice(0, 500),
                        wordCount,
                        fetchTime,
                    });
                    if (content.length > 0) {
                        fetchedContents.push({ url, content });
                    }
                }
                catch {
                    // Skip failed URLs, continue to next
                }
                finally {
                    clearTimeout(timeoutHandle);
                }
            }
            // ── 4. Extract key facts across all fetched pages ─────────────
            const allFacts = [];
            const seenFacts = new Set();
            for (const { content } of fetchedContents) {
                const pageFacts = extractKeyFacts(content, query, 5);
                for (const fact of pageFacts) {
                    // Dedupe facts across pages by their first 100 chars.
                    const key = fact.toLowerCase().slice(0, 100);
                    if (!seenFacts.has(key)) {
                        seenFacts.add(key);
                        allFacts.push(fact);
                    }
                }
                if (allFacts.length >= 20)
                    break; // global cap
            }
            // ── 5. Optional LLM synthesis (non-fatal on failure) ──────────
            let summary;
            if (llmConfig && fetchedContents.length > 0 && Date.now() < overallDeadline - 3_000) {
                try {
                    const sourcesText = fetchedContents
                        .map((fc, i) => `[${i + 1}] ${fc.url}\n${fc.content.slice(0, 2000)}`)
                        .join('\n\n---\n\n');
                    const llmResult = await callLLM(llmConfig, {
                        messages: [
                            {
                                role: 'system',
                                content: 'You are a research assistant. Synthesize the following sources into a clear, ' +
                                    'comprehensive answer to the user\'s question. Cite sources by number [1], [2], etc. ' +
                                    'Be concise but thorough. Use plain text without excessive markdown.',
                            },
                            {
                                role: 'user',
                                content: `Question: ${query}\n\nSources:\n\n${sourcesText}`,
                            },
                        ],
                        maxTokens: 1000,
                    });
                    summary = llmResult.text;
                }
                catch {
                    // LLM synthesis failure is non-fatal — return results without summary
                }
            }
            const elapsed = Date.now() - startTime;
            res.json({
                success: true,
                data: {
                    query,
                    ...(summary !== undefined ? { summary } : {}),
                    sources,
                    keyFacts: allFacts,
                    totalSources: sources.length,
                    searchQueries,
                    elapsed,
                },
                requestId: req.requestId,
            });
        }
        catch (error) {
            console.error('[research] Unexpected error:', error);
            if (res.headersSent)
                return;
            sendResearchError(res, req.requestId, 500, {
                type: 'research_failed',
                message: 'Research request failed. Please try again.',
                docs: 'https://webpeel.dev/docs/api-reference#research',
            });
        }
    });
    return router;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.29",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|