webpeel 0.21.6 → 0.21.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/utils.js +63 -14
- package/dist/core/proxy-config.d.ts +55 -0
- package/dist/core/proxy-config.js +79 -0
- package/dist/core/search-provider.js +7 -0
- package/dist/core/strategies.js +8 -2
- package/dist/core/structured-extract.js +190 -23
- package/dist/core/youtube.js +6 -2
- package/dist/server/app.js +2 -2
- package/dist/server/routes/fetch.js +76 -34
- package/dist/types.d.ts +12 -0
- package/package.json +1 -1
package/dist/cli/utils.js
CHANGED
|
@@ -131,22 +131,30 @@ export function parseActions(actionStrings) {
|
|
|
131
131
|
*/
|
|
132
132
|
export function formatError(error, _url, options) {
|
|
133
133
|
const msg = error.message || String(error);
|
|
134
|
+
const errorType = error.errorType || '';
|
|
134
135
|
const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
}
|
|
138
|
-
else if (msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
|
|
136
|
+
// Check structured errorType from API first (takes precedence over message heuristics)
|
|
137
|
+
if (errorType === 'timeout' || msg.includes('took too long') || msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
|
|
139
138
|
lines.push('\x1b[33m💡 Try increasing timeout: --timeout 60000\x1b[0m');
|
|
140
139
|
if (!options.render) {
|
|
141
140
|
lines.push('\x1b[33m💡 Site may need browser rendering: --render\x1b[0m');
|
|
142
141
|
}
|
|
143
142
|
}
|
|
144
|
-
else if (msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
|
|
143
|
+
else if (errorType === 'blocked' || msg.includes('blocking automated') || msg.includes('bot protection') || msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
|
|
145
144
|
if (!options.stealth) {
|
|
146
145
|
lines.push('\x1b[33m💡 Try stealth mode to bypass bot detection: --stealth\x1b[0m');
|
|
147
146
|
}
|
|
148
147
|
lines.push('\x1b[33m💡 Try a different user agent: --ua "Mozilla/5.0..."\x1b[0m');
|
|
149
148
|
}
|
|
149
|
+
else if (errorType === 'not_found' || msg.includes('domain may not exist') || msg.includes('not found') || msg.includes('ENOTFOUND') || msg.includes('net::ERR_') || msg.includes('ECONNREFUSED')) {
|
|
150
|
+
lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
|
|
151
|
+
}
|
|
152
|
+
else if (errorType === 'network' || msg.includes('Could not reach') || msg.includes('could not connect') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
|
|
153
|
+
lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
|
|
154
|
+
}
|
|
155
|
+
else if (errorType === 'server_error' || msg.includes('server error')) {
|
|
156
|
+
lines.push('\x1b[33m💡 The target site returned a server error. Try again in a moment.\x1b[0m');
|
|
157
|
+
}
|
|
150
158
|
else if (msg.includes('empty') || msg.includes('no content') || msg.includes('0 tokens')) {
|
|
151
159
|
if (!options.render) {
|
|
152
160
|
lines.push('\x1b[33m💡 Page may be JavaScript-rendered. Try: --render\x1b[0m');
|
|
@@ -213,18 +221,39 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
|
|
|
213
221
|
if (!res.ok) {
|
|
214
222
|
const body = await res.text().catch(() => '');
|
|
215
223
|
// Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
|
|
216
|
-
const isHtml = body.trimStart().startsWith('<');
|
|
224
|
+
const isHtml = body.trimStart().startsWith('<') || body.includes('<!DOCTYPE') || body.includes('<html');
|
|
217
225
|
let errorMsg;
|
|
226
|
+
let errorType;
|
|
218
227
|
if (res.status === 502 || res.status === 503 || res.status === 504) {
|
|
219
|
-
errorMsg = `Could not reach this website
|
|
228
|
+
errorMsg = `Could not reach this website. The site may be blocking our server or timing out.`;
|
|
229
|
+
errorType = res.status === 504 ? 'timeout' : 'network';
|
|
220
230
|
}
|
|
221
231
|
else if (isHtml) {
|
|
222
|
-
errorMsg = `Server returned an error page`;
|
|
232
|
+
errorMsg = `Server returned an error page (${res.status})`;
|
|
223
233
|
}
|
|
224
234
|
else {
|
|
225
|
-
|
|
235
|
+
// Try to parse a structured JSON error response
|
|
236
|
+
try {
|
|
237
|
+
const json = JSON.parse(body);
|
|
238
|
+
const errObj = json?.error;
|
|
239
|
+
if (errObj && typeof errObj === 'object') {
|
|
240
|
+
errorMsg = typeof errObj.message === 'string' ? errObj.message : (body.slice(0, 200) || 'Unknown error');
|
|
241
|
+
if (typeof errObj.type === 'string')
|
|
242
|
+
errorType = errObj.type;
|
|
243
|
+
}
|
|
244
|
+
else {
|
|
245
|
+
errorMsg = body.slice(0, 200) || 'Unknown error';
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
catch {
|
|
249
|
+
errorMsg = body.slice(0, 200) || 'Unknown error';
|
|
250
|
+
}
|
|
226
251
|
}
|
|
227
|
-
|
|
252
|
+
const err = new Error(`${errorMsg}`);
|
|
253
|
+
if (errorType)
|
|
254
|
+
err.errorType = errorType;
|
|
255
|
+
err.statusCode = res.status;
|
|
256
|
+
throw err;
|
|
228
257
|
}
|
|
229
258
|
const data = await res.json();
|
|
230
259
|
// Map API response to PeelResult shape that the CLI already handles
|
|
@@ -405,20 +434,40 @@ export function classifyErrorCode(error) {
|
|
|
405
434
|
// Check for our custom _code first (set in pre-fetch validation)
|
|
406
435
|
if (error._code)
|
|
407
436
|
return error._code;
|
|
437
|
+
// Check for structured errorType from API responses (set by fetchViaApi)
|
|
438
|
+
const errorType = error.errorType;
|
|
439
|
+
if (errorType) {
|
|
440
|
+
const typeMap = {
|
|
441
|
+
timeout: 'TIMEOUT',
|
|
442
|
+
blocked: 'BLOCKED',
|
|
443
|
+
not_found: 'NOT_FOUND',
|
|
444
|
+
server_error: 'SERVER_ERROR',
|
|
445
|
+
network: 'NETWORK',
|
|
446
|
+
unknown: 'FETCH_FAILED',
|
|
447
|
+
};
|
|
448
|
+
if (typeMap[errorType])
|
|
449
|
+
return typeMap[errorType];
|
|
450
|
+
}
|
|
408
451
|
const msg = error.message.toLowerCase();
|
|
409
452
|
const name = error.name || '';
|
|
410
|
-
if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out')) {
|
|
453
|
+
if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out') || msg.includes('took too long')) {
|
|
411
454
|
return 'TIMEOUT';
|
|
412
455
|
}
|
|
413
|
-
if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
|
|
456
|
+
if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare') || msg.includes('bot protection')) {
|
|
414
457
|
return 'BLOCKED';
|
|
415
458
|
}
|
|
416
|
-
if (msg.includes('
|
|
417
|
-
return '
|
|
459
|
+
if (msg.includes('domain may not exist') || msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed')) {
|
|
460
|
+
return 'NOT_FOUND';
|
|
461
|
+
}
|
|
462
|
+
if (msg.includes('http 404') || msg.includes('page was not found')) {
|
|
463
|
+
return 'NOT_FOUND';
|
|
418
464
|
}
|
|
419
465
|
if (msg.includes('invalid url') || msg.includes('invalid hostname') || msg.includes('only http')) {
|
|
420
466
|
return 'INVALID_URL';
|
|
421
467
|
}
|
|
468
|
+
if (msg.includes('could not reach') || msg.includes('could not connect') || msg.includes('econnrefused')) {
|
|
469
|
+
return 'NETWORK';
|
|
470
|
+
}
|
|
422
471
|
return 'FETCH_FAILED';
|
|
423
472
|
}
|
|
424
473
|
/**
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Webshare residential proxy configuration.
|
|
3
|
+
*
|
|
4
|
+
* WebPeel uses Webshare residential proxies (configured via env vars) to route
|
|
5
|
+
* requests through US residential IPs, bypassing datacenter IP blocks from
|
|
6
|
+
* DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
|
|
7
|
+
*
|
|
8
|
+
* Proxy credentials are loaded from environment variables:
|
|
9
|
+
* WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
|
|
10
|
+
* WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
|
|
11
|
+
* WEBSHARE_PROXY_USER — proxy username (without slot suffix)
|
|
12
|
+
* WEBSHARE_PROXY_PASS — proxy password
|
|
13
|
+
* WEBSHARE_PROXY_SLOTS — number of available US residential slots
|
|
14
|
+
*
|
|
15
|
+
* With the Webshare backbone plan each US slot has its own port:
|
|
16
|
+
* slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Connection details for one Webshare proxy slot.
 */
export interface ProxyConfig {
    /** Proxy endpoint, formatted as "http://host:port". */
    server: string;
    /** Slot-qualified proxy username, e.g. "user-US-42". */
    username: string;
    /** Password used together with `username`. */
    password: string;
}
/**
 * Pick a random Webshare residential proxy slot.
 *
 * Slots are chosen at random across all available US slots so load is spread
 * evenly (the same approach as youtube.ts proxyRequestSlotted()).
 *
 * @returns The config for the chosen slot, or null when the proxy is not
 *   configured (env vars missing or slots = 0).
 */
export declare function getWebshareProxy(): ProxyConfig | null;
/**
 * Report whether Webshare proxy credentials are set (env vars present and
 * non-empty).
 *
 * This only checks that credentials exist; it does NOT guarantee that the
 * proxy is reachable.
 */
export declare function hasWebshareProxy(): boolean;
/**
 * Turn a ProxyConfig into a Playwright-compatible proxy object, suitable for
 * passing directly to browser.newContext({ proxy: ... }).
 *
 * @param config Proxy slot to convert.
 */
export declare function toPlaywrightProxy(config: ProxyConfig): {
    server: string;
    username: string;
    password: string;
};
/**
 * Pick a random Webshare proxy and return it as a fully-qualified URL with
 * embedded credentials: `http://username:password@host:port`.
 *
 * Intended for the strategies.ts proxy option, which expects a URL string.
 *
 * @returns The proxy URL, or null if proxies are not configured.
 */
export declare function getWebshareProxyUrl(): string | null;
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Webshare residential proxy configuration.
|
|
3
|
+
*
|
|
4
|
+
* WebPeel uses Webshare residential proxies (configured via env vars) to route
|
|
5
|
+
* requests through US residential IPs, bypassing datacenter IP blocks from
|
|
6
|
+
* DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
|
|
7
|
+
*
|
|
8
|
+
* Proxy credentials are loaded from environment variables:
|
|
9
|
+
* WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
|
|
10
|
+
* WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
|
|
11
|
+
* WEBSHARE_PROXY_USER — proxy username (without slot suffix)
|
|
12
|
+
* WEBSHARE_PROXY_PASS — proxy password
|
|
13
|
+
* WEBSHARE_PROXY_SLOTS — number of available US residential slots
|
|
14
|
+
*
|
|
15
|
+
* With the Webshare backbone plan each US slot has its own port:
|
|
16
|
+
* slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Get a random Webshare residential proxy config.
 *
 * Reads WEBSHARE_PROXY_HOST / _USER / _PASS / _PORT / _SLOTS from the
 * environment on every call, picks a slot at random from 1..slots, and derives
 * that slot's port (base port + slot - 1) and username (`USER-US-<slot>`).
 * The base port defaults to 10000 and the slot count defaults to 0.
 *
 * @returns The proxy config for the chosen slot, or null when the proxy is
 *   not usable: host/user/password missing, slot count missing, zero or
 *   non-numeric, or base port non-numeric or outside 1..65535.
 */
export function getWebshareProxy() {
    const { WEBSHARE_PROXY_HOST: host, WEBSHARE_PROXY_USER: user, WEBSHARE_PROXY_PASS: pass, } = process.env;
    if (!host || !user || !pass)
        return null;
    const basePort = Number.parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
    const slots = Number.parseInt(process.env.WEBSHARE_PROXY_SLOTS || '0', 10);
    // parseInt yields NaN for non-numeric input and `NaN <= 0` is false, so the
    // numeric settings are validated explicitly; otherwise a typo in the env
    // would produce "http://host:NaN" and "user-US-NaN".
    if (!Number.isInteger(slots) || slots <= 0)
        return null;
    if (!Number.isInteger(basePort) || basePort < 1 || basePort > 65535)
        return null;
    // Slots are 1-based; slot N listens on basePort + N - 1.
    const slot = Math.floor(Math.random() * slots) + 1;
    const port = basePort + slot - 1;
    return {
        server: `http://${host}:${port}`,
        username: `${user}-US-${slot}`,
        password: pass,
    };
}
|
|
41
|
+
/**
 * Report whether Webshare proxy credentials are present in the environment.
 *
 * Only WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER and WEBSHARE_PROXY_PASS are
 * inspected; each must be set to a non-empty string. The port and slot-count
 * variables are not consulted, and no connection is attempted, so a true
 * result does not mean the proxy is reachable.
 *
 * @returns true when all three credential variables are non-empty.
 */
export function hasWebshareProxy() {
    const requiredVars = ['WEBSHARE_PROXY_HOST', 'WEBSHARE_PROXY_USER', 'WEBSHARE_PROXY_PASS'];
    return requiredVars.every((name) => Boolean(process.env[name]));
}
|
|
50
|
+
/**
 * Build a Playwright-compatible proxy object from a ProxyConfig, e.g. for
 * browser.newContext({ proxy: ... }).
 *
 * A new object is returned containing only the `server`, `username` and
 * `password` fields of the input.
 *
 * @param config Proxy config to convert.
 * @returns A fresh `{ server, username, password }` object.
 */
export function toPlaywrightProxy(config) {
    const { server, username, password } = config;
    return { server, username, password };
}
|
|
61
|
+
/**
 * Pick a random Webshare proxy and format it as a URL with embedded
 * credentials: `http://username:password@host:port`.
 *
 * Username and password are percent-encoded with encodeURIComponent so that
 * reserved characters cannot corrupt the URL. The host[:port] part is taken
 * from parsing the config's `server` URL.
 *
 * @returns The proxy URL, or null when no proxy is configured or the URL
 *   could not be built (any error while parsing/encoding is swallowed).
 */
export function getWebshareProxyUrl() {
    const proxy = getWebshareProxy();
    if (!proxy) {
        return null;
    }
    try {
        const { host: hostAndPort } = new URL(proxy.server);
        const credentials = [proxy.username, proxy.password]
            .map((part) => encodeURIComponent(part))
            .join(':');
        return `http://${credentials}@${hostAndPort}`;
    }
    catch {
        // Best-effort: a malformed server URL or un-encodable credential means
        // "no usable proxy URL" rather than a crash.
        return null;
    }
}
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
import { fetch as undiciFetch } from 'undici';
|
|
16
16
|
import { load } from 'cheerio';
|
|
17
17
|
import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { getWebshareProxy } from './proxy-config.js';
|
|
18
19
|
import { createLogger } from './logger.js';
|
|
19
20
|
const log = createLogger('search');
|
|
20
21
|
function decodeHtmlEntities(input) {
|
|
@@ -236,10 +237,12 @@ export class StealthSearchProvider {
|
|
|
236
237
|
const browser = await getStealthBrowser();
|
|
237
238
|
const params = new URLSearchParams({ q: query });
|
|
238
239
|
const url = `https://html.duckduckgo.com/html/?${params.toString()}`;
|
|
240
|
+
const proxy = getWebshareProxy();
|
|
239
241
|
ctx = await browser.newContext({
|
|
240
242
|
userAgent: getRandomUserAgent(),
|
|
241
243
|
locale: 'en-US',
|
|
242
244
|
timezoneId: 'America/New_York',
|
|
245
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
243
246
|
});
|
|
244
247
|
const page = await ctx.newPage();
|
|
245
248
|
await applyStealthScripts(page);
|
|
@@ -303,10 +306,12 @@ export class StealthSearchProvider {
|
|
|
303
306
|
const browser = await getStealthBrowser();
|
|
304
307
|
const params = new URLSearchParams({ q: query });
|
|
305
308
|
const url = `https://www.bing.com/search?${params.toString()}`;
|
|
309
|
+
const proxy = getWebshareProxy();
|
|
306
310
|
ctx = await browser.newContext({
|
|
307
311
|
userAgent: getRandomUserAgent(),
|
|
308
312
|
locale: 'en-US',
|
|
309
313
|
timezoneId: 'America/New_York',
|
|
314
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
310
315
|
});
|
|
311
316
|
const page = await ctx.newPage();
|
|
312
317
|
await applyStealthScripts(page);
|
|
@@ -380,10 +385,12 @@ export class StealthSearchProvider {
|
|
|
380
385
|
const browser = await getStealthBrowser();
|
|
381
386
|
const params = new URLSearchParams({ q: query });
|
|
382
387
|
const url = `https://www.ecosia.org/search?${params.toString()}`;
|
|
388
|
+
const proxy = getWebshareProxy();
|
|
383
389
|
ctx = await browser.newContext({
|
|
384
390
|
userAgent: getRandomUserAgent(),
|
|
385
391
|
locale: 'en-US',
|
|
386
392
|
timezoneId: 'America/New_York',
|
|
393
|
+
...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
|
|
387
394
|
});
|
|
388
395
|
const page = await ctx.newPage();
|
|
389
396
|
await applyStealthScripts(page);
|
package/dist/core/strategies.js
CHANGED
|
@@ -10,6 +10,7 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
|
|
|
10
10
|
import { getCached, setCached as setBasicCache } from './cache.js';
|
|
11
11
|
import { resolveAndCache } from './dns-cache.js';
|
|
12
12
|
import { BlockedError, NetworkError } from '../types.js';
|
|
13
|
+
import { getWebshareProxyUrl } from './proxy-config.js';
|
|
13
14
|
import { detectChallenge } from './challenge-detection.js';
|
|
14
15
|
import { getStrategyHooks, } from './strategy-hooks.js';
|
|
15
16
|
import { createLogger } from './logger.js';
|
|
@@ -310,10 +311,15 @@ async function fetchWithBrowserStrategy(url, options) {
|
|
|
310
311
|
export async function smartFetch(url, options = {}) {
|
|
311
312
|
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
|
|
312
313
|
const usePeelTLS = tls || cycle;
|
|
313
|
-
// Build effective proxy list: explicit proxies array, or single proxy, or empty
|
|
314
|
+
// Build effective proxy list: explicit proxies array, or single proxy, or empty.
|
|
315
|
+
// When no explicit proxy is configured and Webshare is available, automatically
|
|
316
|
+
// add it as a fallback: try direct connection first (fast), then Webshare on block.
|
|
314
317
|
const effectiveProxies = proxies?.length ? proxies :
|
|
315
318
|
proxy ? [proxy] :
|
|
316
|
-
|
|
319
|
+
(() => {
|
|
320
|
+
const wsUrl = getWebshareProxyUrl();
|
|
321
|
+
return wsUrl ? [undefined, wsUrl] : [undefined];
|
|
322
|
+
})();
|
|
317
323
|
const firstProxy = effectiveProxies[0];
|
|
318
324
|
const hooks = getStrategyHooks();
|
|
319
325
|
const fetchStartMs = Date.now();
|
|
@@ -86,56 +86,223 @@ function parseLLMJson(text) {
|
|
|
86
86
|
/**
|
|
87
87
|
* For string fields: search for field name in content, extract surrounding text.
|
|
88
88
|
*/
|
|
89
|
-
|
|
89
|
+
/**
 * Return the text of the first level-1 markdown heading (`# ...`) in content.
 *
 * The markdown emphasis/code characters `*`, `_` and backtick are removed and
 * the result is trimmed.
 *
 * @param content Markdown text to scan.
 * @returns The cleaned heading text, or null when there is no `# ` heading.
 */
function extractPageTitle(content) {
    const headingMatch = content.match(/^#\s+(.+)$/m);
    const rawHeading = headingMatch?.[1];
    if (!rawHeading) {
        return null;
    }
    return rawHeading.replace(/[*_`]/g, '').trim();
}
|
|
96
|
+
/**
 * Return the first substantial line of text that follows a heading.
 *
 * Blank lines are ignored. Scanning only starts after the first line that
 * begins with `#`; later heading lines are skipped, as are lines that both
 * start and end with `*` (treated as a byline such as "*5 min read*"). The
 * first remaining line longer than 30 characters is returned with `*`, `_`
 * and backtick removed, trimmed, and capped at 300 characters.
 *
 * @param content Markdown text to scan.
 * @returns The description text, or null when no qualifying line exists.
 */
function extractDescription(content) {
    const nonBlank = content.split('\n').filter((text) => text.trim());
    const firstHeading = nonBlank.findIndex((text) => text.startsWith('#'));
    if (firstHeading === -1) {
        return null;
    }
    const isByline = (text) => text.startsWith('*') && text.endsWith('*');
    const paragraph = nonBlank
        .slice(firstHeading + 1)
        .find((text) => !text.startsWith('#') && !isByline(text) && text.length > 30);
    if (paragraph === undefined) {
        return null;
    }
    return paragraph.replace(/[*_`]/g, '').trim().slice(0, 300);
}
|
|
113
|
+
/**
 * Extract the company/brand portion of a page title, i.e. the text before the
 * first title separator.
 *
 * Recognised separators are `|`, `·` and `—` (with or without surrounding
 * whitespace) and `-` / `–` only when surrounded by whitespace. Requiring
 * whitespace around a hyphen keeps hyphenated names intact: "Coca-Cola — Home"
 * yields "Coca-Cola", not "Coca".
 *
 * @param title Page title text.
 * @returns The trimmed text before the first separator; when the title has no
 *   separator (or nothing precedes it), the trimmed title capped at 60
 *   characters.
 */
function extractCompanyFromTitle(title) {
    const separated = title.match(/^(.+?)(?:\s+[-–]\s+|\s*[|·—]\s*)/);
    const brand = separated?.[1]?.trim();
    if (brand) {
        return brand;
    }
    return title.trim().slice(0, 60);
}
|
|
120
|
+
/**
 * Smart field-name-aware string extractor.
 *
 * Tries a series of "concept" rules keyed on the (lower-cased) field name —
 * company, title, description, URL, author, date, email, price, language,
 * stars, license — and falls back to generic "Field Name: value" style
 * patterns built from the field name itself.
 *
 * @param fieldName Schema field name, e.g. "company_name" or "published_date".
 * @param content   Markdown page content to search.
 * @param pageUrl   URL of the page; returned as-is for URL-like fields when set.
 * @returns The extracted string, or null when nothing matched.
 */
function heuristicExtractString(fieldName, content, pageUrl) {
    const lf = fieldName.toLowerCase();
    // The field name is interpolated into RegExp sources below, so regex
    // metacharacters in it (e.g. "c++", "price($)") must be escaped or
    // `new RegExp` throws a SyntaxError / matches the wrong thing.
    const escapeForRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const fieldPattern = escapeForRegExp(fieldName);
    const humanPattern = escapeForRegExp(fieldName.replace(/_/g, ' '));
    const title = extractPageTitle(content);
    // --- Concept-aware extraction ---
    // Company/brand/organization name
    if (/company|brand|organization|org_name/.test(lf)) {
        if (title)
            return extractCompanyFromTitle(title);
        // Fallback: extract from first heading of any level
        const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
        if (anyHeading?.[1])
            return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
    }
    // Title/name/product → first H1 or any heading, stripped of markdown
    if (/^(title|name|product_name|product|heading)$/.test(lf)) {
        const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
        if (rawTitle) {
            // Strip markdown links [text](url) → text, images ![alt](url) → '', etc.
            return rawTitle
                .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images
                .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
                .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
                .replace(/[*_`[\]]/g, '')
                .replace(/&[a-z]+;/g, '') // HTML entities
                .replace(/\s+/g, ' ')
                .trim().slice(0, 150);
        }
    }
    // Description/summary/about → first paragraph (no generic fallback)
    if (/description|summary|about|overview/.test(lf)) {
        return extractDescription(content) ?? null;
    }
    // URL/website/link → use the URL if we have it
    if (/^(url|website|link|homepage|site)$/.test(lf)) {
        if (pageUrl)
            return pageUrl;
    }
    // Author/writer/by. "by" must be its own token ("posted_by", "writtenBy"),
    // otherwise unrelated fields such as "hobby", "nearby" or "ruby_version"
    // would be treated as author fields.
    const isAuthorField = /author|writer|byline|(?:^|_)by(?:_|$)/.test(lf)
        || /[a-z]By(?:[A-Z_]|$)/.test(fieldName);
    if (isAuthorField) {
        const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
        if (m?.[1])
            return m[1].trim().slice(0, 100);
    }
    // Date/published/updated
    if (/date|published|updated|modified/.test(lf)) {
        const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
            ?? content.match(/(\d{4}-\d{2}-\d{2})/);
        if (m?.[1])
            return m[1];
    }
    // Email
    if (/email|contact/.test(lf)) {
        const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
        if (m?.[0])
            return m[0];
    }
    // Price/cost/pricing → extract value near $
    if (/price|cost|pricing|fee/.test(lf)) {
        const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
            ?? content.match(/(free|no cost|no charge)/i);
        if (m?.[0])
            return m[0].trim();
    }
    // Language (for GitHub repos)
    if (/language|lang|tech/.test(lf)) {
        const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
        if (m?.[1])
            return m[1];
    }
    // Stars (for GitHub)
    if (/stars?/.test(lf)) {
        const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
        if (m?.[1])
            return m[1].replace(/,/g, '');
    }
    // License
    if (/license/.test(lf)) {
        const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
        if (m?.[1])
            return m[1];
    }
    // --- Generic patterns (exact-ish match) ---
    const patterns = [
        // "Field Name: value" at the start of a line
        new RegExp(`(?:^|\\n)[ \\t]*${humanPattern}[:\\s]+([^\\n]{5,200})`, 'i'),
        // JSON-like "field_name": "value"
        new RegExp(`"${fieldPattern}"\\s*:\\s*"([^"]{1,300})"`, 'i'),
        // Markdown bold **Field Name**: value
        new RegExp(`\\*{1,2}${humanPattern}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'),
        // Heading named after the field, followed by content
        new RegExp(`#+\\s*${humanPattern}\\s*\\n+([^\\n]{5,300})`, 'i'),
    ];
    for (const pattern of patterns) {
        const match = content.match(pattern);
        if (match?.[1])
            return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
    }
    return null;
}
|
|
109
217
|
/**
 * For boolean fields: derive true/false from positive/negative indicators in
 * the page content.
 *
 * Known concepts (free tier, open source, API availability, authentication)
 * are decided by searching the ENTIRE lower-cased content. Any other field
 * falls back to looking for yes/no style words in a window of text around
 * the first occurrence of the field name (80 chars before, 200 after).
 *
 * Negated phrases are never counted as positive evidence: "not supported",
 * "unavailable", "no free tier" and "not open source" yield false, and
 * indicator words are matched as whole words so that "no" does not match
 * inside "not"/"know".
 *
 * @param fieldName Schema field name, e.g. "has_free_tier".
 * @param content   Page content to search.
 * @returns true or false when an indicator was found, otherwise null.
 */
function heuristicExtractBoolean(fieldName, content) {
    const lf = fieldName.toLowerCase();
    const ctx = content.toLowerCase();
    // Concept-aware boolean extraction — search entire content, not just near field name
    // Free tier / free plan
    if (/free_tier|has_free|is_free/.test(lf)) {
        // Lookbehind keeps "no free tier" from counting as "free tier".
        if (/(?<!\bno\s+)(?:free tier|free plan|free forever)|\$0|no cost|no charge/.test(ctx))
            return true;
        if (/no free|paid only|subscription required/.test(ctx))
            return false;
    }
    // Open source. "oss" must be its own token so that e.g. "cross_platform"
    // is not mistaken for an open-source field.
    if (/open_source|is_open|(?:^|_)oss(?:_|$)/.test(lf)) {
        if (/(?<!\bnot\s+)open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
            return true;
        if (/closed[- ]source|proprietary|commercial license|not\s+open[- ]source/.test(ctx))
            return false;
    }
    // API availability
    if (/has_api|api_available|has_rest/.test(lf)) {
        if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
            return true;
    }
    // Authentication
    if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
        if (/login|sign in|authentication|api key|bearer token/.test(ctx))
            return true;
    }
    // General approach: search near field name concept
    const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
    let fieldIdx = ctx.indexOf(lf);
    if (fieldIdx === -1)
        fieldIdx = ctx.indexOf(humanName);
    if (fieldIdx !== -1) {
        const nearby = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
        // Whole-word positives that are not directly preceded by "not".
        const positive = /(?<!\bnot\s+)\b(?:yes|true|open[ -]sourced?|available|enabled|supported|free|included)\b/;
        // Whole-word negatives, plus explicitly negated positives.
        const negative = /\b(?:no|false|closed|proprietary|unavailable|disabled|excluded)\b|\bnot\s+(?:open[ -]sourced?|available|enabled|supported|free|included)\b/;
        // Positive evidence still wins when both kinds appear in the window.
        if (positive.test(nearby))
            return true;
        if (negative.test(nearby))
            return false;
    }
    return null;
}
|
|
135
268
|
/**
|
|
136
269
|
* For number fields: find digits near the field name.
|
|
137
270
|
*/
|
|
138
271
|
function heuristicExtractNumber(fieldName, content) {
|
|
272
|
+
const lf = fieldName.toLowerCase();
|
|
273
|
+
// Stars (GitHub)
|
|
274
|
+
if (/stars?/.test(lf)) {
|
|
275
|
+
const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
|
|
276
|
+
if (m?.[1]) {
|
|
277
|
+
const n = parseFloat(m[1].replace(/,/g, ''));
|
|
278
|
+
return isNaN(n) ? null : n;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
// Forks
|
|
282
|
+
if (/forks?/.test(lf)) {
|
|
283
|
+
const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
|
|
284
|
+
if (m?.[1]) {
|
|
285
|
+
const n = parseFloat(m[1].replace(/,/g, ''));
|
|
286
|
+
return isNaN(n) ? null : n;
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
// Rating/score
|
|
290
|
+
if (/rating|score/.test(lf)) {
|
|
291
|
+
const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
|
|
292
|
+
if (m?.[1]) {
|
|
293
|
+
const n = parseFloat(m[1]);
|
|
294
|
+
return isNaN(n) ? null : n;
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
// Year
|
|
298
|
+
if (/year/.test(lf)) {
|
|
299
|
+
const m = content.match(/\b(20\d{2})\b/);
|
|
300
|
+
if (m?.[1]) {
|
|
301
|
+
const n = parseInt(m[1]);
|
|
302
|
+
return isNaN(n) ? null : n;
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
// Generic: find number near field name
|
|
139
306
|
const humanName = fieldName.replace(/_/g, '[\\s_-]*');
|
|
140
307
|
const pattern = new RegExp(`${humanName}[:\\s$]*([\\d,]+\\.?\\d*)`, 'i');
|
|
141
308
|
const match = content.match(pattern);
|
package/dist/core/youtube.js
CHANGED
|
@@ -15,6 +15,7 @@ import { join } from 'node:path';
|
|
|
15
15
|
import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
|
|
16
16
|
import { simpleFetch } from './fetcher.js';
|
|
17
17
|
import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
|
|
18
19
|
import { createLogger } from './logger.js';
|
|
19
20
|
// ---------------------------------------------------------------------------
|
|
20
21
|
// yt-dlp startup diagnostics
|
|
@@ -239,8 +240,10 @@ export function extractSummary(fullText) {
|
|
|
239
240
|
// ---------------------------------------------------------------------------
|
|
240
241
|
// Proxy-based InnerTube transcript extraction
|
|
241
242
|
// ---------------------------------------------------------------------------
|
|
242
|
-
// Webshare residential proxy config — reads from env vars
|
|
243
|
+
// Webshare residential proxy config — reads from env vars via proxy-config.ts.
|
|
243
244
|
// Locally, falls back to direct fetch (residential IP already works).
|
|
245
|
+
// These constants are kept for use in proxyRequestSlotted() which does
|
|
246
|
+
// low-level HTTP CONNECT tunneling (not Playwright-level proxy).
|
|
244
247
|
const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
|
|
245
248
|
const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
|
|
246
249
|
const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
|
|
@@ -249,7 +252,8 @@ const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
|
|
|
249
252
|
// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
|
|
250
253
|
const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10);
|
|
251
254
|
/**
 * Whether Webshare proxy credentials are configured.
 *
 * Thin wrapper over the shared proxy-config helper (imported here as
 * `_hasWebshareProxy`) so this module and proxy-config agree on what
 * "configured" means.
 */
function isProxyConfigured() {
    const configured = _hasWebshareProxy();
    return configured;
}
|
|
254
258
|
/**
|
|
255
259
|
* Make an HTTP(S) request through the Webshare CONNECT proxy with a specific
|
package/dist/server/app.js
CHANGED
|
@@ -106,8 +106,8 @@ export function createApp(config = {}) {
|
|
|
106
106
|
timeoutMs = 120000; // 2min for batch
|
|
107
107
|
else if (path.includes('/screenshot'))
|
|
108
108
|
timeoutMs = 60000; // 1min for screenshots
|
|
109
|
-
else if (req.query?.render === 'true')
|
|
110
|
-
timeoutMs = 60000; // 1min for
|
|
109
|
+
else if (req.query?.render === 'true' || req.query?.stealth === 'true')
|
|
110
|
+
timeoutMs = 60000; // 1min for browser/stealth fetches
|
|
111
111
|
else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
|
|
112
112
|
timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
|
|
113
113
|
req.setTimeout(timeoutMs);
|
|
@@ -13,6 +13,52 @@ import { getSchemaTemplate } from '../../core/schema-templates.js';
|
|
|
13
13
|
import { quickAnswer } from '../../core/quick-answer.js';
|
|
14
14
|
import { sendUsageAlertEmail } from '../email-service.js';
|
|
15
15
|
import { extractLinks } from '../../core/links.js';
|
|
16
|
+
// ── Helper: classify an error thrown by peel() into a FetchErrorType ─────────
|
|
17
|
+
/**
 * Map an error thrown while fetching a URL onto one of the public error
 * categories: 'timeout' | 'blocked' | 'not_found' | 'server_error' |
 * 'network' | 'unknown'.
 *
 * Classification uses, in order of the checks below, the structured error
 * code (`err.code`, falling back to `err.name`), the error class name
 * (`TimeoutError`, `BlockedError`, `NetworkError`) and finally keyword
 * heuristics on the lower-cased message. The first matching category wins,
 * so DNS-style failures are reported as 'not_found' even when the error
 * carries the NETWORK code.
 *
 * @param err - the caught value; may be any shape (null and non-string
 *   messages are tolerated and yield 'unknown' unless a code/name matches).
 * @returns the category string.
 */
function classifyFetchError(err) {
    const code = err?.code || err?.name || '';
    // Errors are also recognised by class name: callers accept errors that
    // only carry a name (e.g. 'TimeoutError') and no uppercase code.
    const name = err?.name || '';
    // Guard against non-string messages so classification never throws.
    const msg = (typeof err?.message === 'string' ? err.message : '').toLowerCase();
    const mentions = (...needles) => needles.some((needle) => msg.includes(needle));
    if (code === 'TIMEOUT' || name === 'TimeoutError' || mentions('timeout', 'timed out')) {
        return 'timeout';
    }
    // 'http 403' is included because the documented 'blocked' category covers 403 responses.
    if (code === 'BLOCKED' || name === 'BlockedError' || mentions('blocked', 'http 403', 'cloudflare challenge', 'captcha', 'bot detection')) {
        return 'blocked';
    }
    // Checked before the network branch on purpose: unresolved domains count as "not found".
    if (mentions('http 404', 'not found', 'dns resolution failed', 'enotfound', 'getaddrinfo')) {
        return 'not_found';
    }
    if (/http\s+5\d{2}/.test(msg) || mentions('server error', 'internal server')) {
        return 'server_error';
    }
    if (code === 'NETWORK' || name === 'NetworkError' || mentions('network', 'econnrefused', 'connection refused', 'connection reset')) {
        return 'network';
    }
    return 'unknown';
}
|
|
37
|
+
// ── Helper: build a clean, user-facing error message from a peel() error ─────
|
|
38
|
+
/**
 * Build the user-facing description of a fetch failure.
 *
 * The error is first categorised with classifyFetchError(). Every category
 * except 'unknown' gets a fixed, pre-written message and hint; for 'unknown'
 * the error's own message is used after sanitising it.
 *
 * @param err - the caught error (only `message` is read here).
 * @returns `{ type, message, hint }` where `hint` is undefined for 'unknown'.
 */
function buildFetchErrorMessage(err) {
    const type = classifyFetchError(err);
    const FALLBACK_MESSAGE = 'An unexpected error occurred while fetching the URL';
    // Upper bound on echoed error text so a huge upstream message cannot bloat the response.
    const MAX_MESSAGE_LENGTH = 500;
    const hints = {
        timeout: 'Try increasing timeout with ?timeout=20000, or use render=true for JS-heavy sites.',
        blocked: 'This site blocks automated requests. Try render=true or stealth=true.',
        not_found: 'Verify the URL is correct and the site is accessible.',
        server_error: 'The target site returned a server error. Try again later.',
        network: 'Could not connect to the target URL. Verify the URL is correct and the site is online.',
        unknown: undefined,
    };
    // Sanitize message: strip HTML chars, truncate. Non-string messages are
    // ignored, and a message that is empty after sanitising falls back to the
    // generic text instead of producing an empty `message` field.
    const rawMessage = typeof err?.message === 'string' ? err.message : '';
    const cleaned = rawMessage
        .replace(/[<>"']/g, '')
        .trim()
        .slice(0, MAX_MESSAGE_LENGTH);
    const safeMsg = cleaned || FALLBACK_MESSAGE;
    const messages = {
        timeout: 'The website took too long to respond. Try with render=true or stealth=true for JavaScript-heavy sites.',
        blocked: 'This website is blocking automated access (bot protection detected).',
        not_found: 'The URL could not be reached — the domain may not exist or the page was not found.',
        server_error: 'The target website returned a server error while processing the request.',
        network: 'Could not reach this website. The server may be down or the URL may be incorrect.',
        unknown: safeMsg,
    };
    return { type, message: messages[type] || safeMsg, hint: hints[type] };
}
|
|
16
62
|
// ── Helper: extractive summarizer (TF-IDF-like sentence scoring) ─────────────
|
|
17
63
|
function extractSummary(content, maxWords = 150) {
|
|
18
64
|
if (!content)
|
|
@@ -527,26 +573,24 @@ export function createFetchRouter(authStore) {
|
|
|
527
573
|
});
|
|
528
574
|
}
|
|
529
575
|
// SECURITY: Sanitize error messages to prevent information disclosure
|
|
530
|
-
if (
|
|
576
|
+
if (res.headersSent)
|
|
577
|
+
return; // Timeout middleware already responded
|
|
578
|
+
const requestUrl = req.query.url;
|
|
579
|
+
if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
|
|
531
580
|
// WebPeelError from core library - safe to expose with helpful context
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
: 500;
|
|
539
|
-
const hints = {
|
|
540
|
-
TIMEOUT: 'Try increasing timeout with ?wait=10000, or use render=true for JS-heavy sites.',
|
|
541
|
-
BLOCKED: 'This site blocks automated requests. Try adding render=true or use stealth mode (costs 5 credits).',
|
|
542
|
-
NETWORK: 'Could not reach the target URL. Verify the URL is correct and the site is online.',
|
|
543
|
-
};
|
|
581
|
+
const { type, message, hint } = buildFetchErrorMessage(err);
|
|
582
|
+
const statusCode = type === 'timeout' ? 504
|
|
583
|
+
: type === 'blocked' ? 403
|
|
584
|
+
: type === 'not_found' ? 404
|
|
585
|
+
: type === 'network' || type === 'server_error' ? 502
|
|
586
|
+
: 500;
|
|
544
587
|
res.status(statusCode).json({
|
|
545
588
|
success: false,
|
|
546
589
|
error: {
|
|
547
|
-
type
|
|
548
|
-
message
|
|
549
|
-
|
|
590
|
+
type,
|
|
591
|
+
message,
|
|
592
|
+
url: requestUrl,
|
|
593
|
+
...(hint ? { hint } : {}),
|
|
550
594
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
551
595
|
},
|
|
552
596
|
requestId: req.requestId,
|
|
@@ -555,13 +599,12 @@ export function createFetchRouter(authStore) {
|
|
|
555
599
|
else {
|
|
556
600
|
// Unexpected error - generic message only
|
|
557
601
|
console.error('Fetch error:', err); // Log full error server-side
|
|
558
|
-
if (res.headersSent)
|
|
559
|
-
return; // Timeout middleware already responded
|
|
560
602
|
res.status(500).json({
|
|
561
603
|
success: false,
|
|
562
604
|
error: {
|
|
563
|
-
type: '
|
|
605
|
+
type: 'unknown',
|
|
564
606
|
message: 'An unexpected error occurred while fetching the URL. If this persists, check https://webpeel.dev/status',
|
|
607
|
+
url: requestUrl,
|
|
565
608
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
566
609
|
},
|
|
567
610
|
requestId: req.requestId,
|
|
@@ -1028,23 +1071,21 @@ export function createFetchRouter(authStore) {
|
|
|
1028
1071
|
console.error('POST fetch/scrape error:', err);
|
|
1029
1072
|
if (res.headersSent)
|
|
1030
1073
|
return; // Timeout middleware already responded
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
const
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
BLOCKED: 'Site blocks automated requests. Try render:true or stealth mode.',
|
|
1040
|
-
NETWORK: 'Could not reach the target URL. Verify it is correct and online.',
|
|
1041
|
-
};
|
|
1074
|
+
const postUrl = req.body?.url;
|
|
1075
|
+
if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
|
|
1076
|
+
const { type, message, hint } = buildFetchErrorMessage(err);
|
|
1077
|
+
const statusCode = type === 'timeout' ? 504
|
|
1078
|
+
: type === 'blocked' ? 403
|
|
1079
|
+
: type === 'not_found' ? 404
|
|
1080
|
+
: type === 'network' || type === 'server_error' ? 502
|
|
1081
|
+
: 500;
|
|
1042
1082
|
res.status(statusCode).json({
|
|
1043
1083
|
success: false,
|
|
1044
1084
|
error: {
|
|
1045
|
-
type
|
|
1046
|
-
message
|
|
1047
|
-
|
|
1085
|
+
type,
|
|
1086
|
+
message,
|
|
1087
|
+
url: postUrl,
|
|
1088
|
+
...(hint ? { hint } : {}),
|
|
1048
1089
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
1049
1090
|
},
|
|
1050
1091
|
requestId: req.requestId,
|
|
@@ -1054,8 +1095,9 @@ export function createFetchRouter(authStore) {
|
|
|
1054
1095
|
res.status(500).json({
|
|
1055
1096
|
success: false,
|
|
1056
1097
|
error: {
|
|
1057
|
-
type: '
|
|
1098
|
+
type: 'unknown',
|
|
1058
1099
|
message: 'An unexpected error occurred. If this persists, check https://webpeel.dev/status',
|
|
1100
|
+
url: postUrl,
|
|
1059
1101
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
1060
1102
|
},
|
|
1061
1103
|
requestId: req.requestId,
|
package/dist/types.d.ts
CHANGED
|
@@ -419,6 +419,18 @@ export interface PeelEnvelope {
|
|
|
419
419
|
*/
|
|
420
420
|
totalAvailable?: number;
|
|
421
421
|
}
|
|
422
|
+
/**
|
|
423
|
+
* Programmatic error classification for fetch failures.
|
|
424
|
+
* Returned in the `error.type` field of API error responses.
|
|
425
|
+
*
|
|
426
|
+
* - `timeout` — Site took too long to respond
|
|
427
|
+
* - `blocked` — Site actively blocked the request (403, CAPTCHA, bot detection)
|
|
428
|
+
* - `not_found` — 404 or the domain/URL does not exist
|
|
429
|
+
* - `server_error` — Target site returned a 5xx error
|
|
430
|
+
* - `network` — Connection refused/reset or other network-level issue (DNS resolution failures are reported as `not_found`)
|
|
431
|
+
* - `unknown` — Unclassified error
|
|
432
|
+
*/
|
|
433
|
+
export type FetchErrorType = 'timeout' | 'blocked' | 'not_found' | 'server_error' | 'network' | 'unknown';
|
|
422
434
|
export declare class WebPeelError extends Error {
|
|
423
435
|
code?: string | undefined;
|
|
424
436
|
constructor(message: string, code?: string | undefined);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.8",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|