webpeel 0.21.6 → 0.21.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/utils.js CHANGED
@@ -131,22 +131,30 @@ export function parseActions(actionStrings) {
131
131
  */
132
132
  export function formatError(error, _url, options) {
133
133
  const msg = error.message || String(error);
134
+ const errorType = error.errorType || '';
134
135
  const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
135
- if (msg.includes('net::ERR_') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
136
- lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
137
- }
138
- else if (msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
136
+ // Check structured errorType from API first (takes precedence over message heuristics)
137
+ if (errorType === 'timeout' || msg.includes('took too long') || msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
139
138
  lines.push('\x1b[33m💡 Try increasing timeout: --timeout 60000\x1b[0m');
140
139
  if (!options.render) {
141
140
  lines.push('\x1b[33m💡 Site may need browser rendering: --render\x1b[0m');
142
141
  }
143
142
  }
144
- else if (msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
143
+ else if (errorType === 'blocked' || msg.includes('blocking automated') || msg.includes('bot protection') || msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
145
144
  if (!options.stealth) {
146
145
  lines.push('\x1b[33m💡 Try stealth mode to bypass bot detection: --stealth\x1b[0m');
147
146
  }
148
147
  lines.push('\x1b[33m💡 Try a different user agent: --ua "Mozilla/5.0..."\x1b[0m');
149
148
  }
149
+ else if (errorType === 'not_found' || msg.includes('domain may not exist') || msg.includes('not found') || msg.includes('ENOTFOUND') || msg.includes('net::ERR_') || msg.includes('ECONNREFUSED')) {
150
+ lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
151
+ }
152
+ else if (errorType === 'network' || msg.includes('Could not reach') || msg.includes('could not connect') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
153
+ lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
154
+ }
155
+ else if (errorType === 'server_error' || msg.includes('server error')) {
156
+ lines.push('\x1b[33m💡 The target site returned a server error. Try again in a moment.\x1b[0m');
157
+ }
150
158
  else if (msg.includes('empty') || msg.includes('no content') || msg.includes('0 tokens')) {
151
159
  if (!options.render) {
152
160
  lines.push('\x1b[33m💡 Page may be JavaScript-rendered. Try: --render\x1b[0m');
@@ -213,18 +221,39 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
213
221
  if (!res.ok) {
214
222
  const body = await res.text().catch(() => '');
215
223
  // Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
216
- const isHtml = body.trimStart().startsWith('<');
224
+ const isHtml = body.trimStart().startsWith('<') || body.includes('<!DOCTYPE') || body.includes('<html');
217
225
  let errorMsg;
226
+ let errorType;
218
227
  if (res.status === 502 || res.status === 503 || res.status === 504) {
219
- errorMsg = `Could not reach this website (gateway error)`;
228
+ errorMsg = `Could not reach this website. The site may be blocking our server or timing out.`;
229
+ errorType = res.status === 504 ? 'timeout' : 'network';
220
230
  }
221
231
  else if (isHtml) {
222
- errorMsg = `Server returned an error page`;
232
+ errorMsg = `Server returned an error page (${res.status})`;
223
233
  }
224
234
  else {
225
- errorMsg = body.slice(0, 200) || 'Unknown error';
235
+ // Try to parse a structured JSON error response
236
+ try {
237
+ const json = JSON.parse(body);
238
+ const errObj = json?.error;
239
+ if (errObj && typeof errObj === 'object') {
240
+ errorMsg = typeof errObj.message === 'string' ? errObj.message : (body.slice(0, 200) || 'Unknown error');
241
+ if (typeof errObj.type === 'string')
242
+ errorType = errObj.type;
243
+ }
244
+ else {
245
+ errorMsg = body.slice(0, 200) || 'Unknown error';
246
+ }
247
+ }
248
+ catch {
249
+ errorMsg = body.slice(0, 200) || 'Unknown error';
250
+ }
226
251
  }
227
- throw new Error(`API error ${res.status}: ${errorMsg}`);
252
+ const err = new Error(`${errorMsg}`);
253
+ if (errorType)
254
+ err.errorType = errorType;
255
+ err.statusCode = res.status;
256
+ throw err;
228
257
  }
229
258
  const data = await res.json();
230
259
  // Map API response to PeelResult shape that the CLI already handles
@@ -405,20 +434,40 @@ export function classifyErrorCode(error) {
405
434
  // Check for our custom _code first (set in pre-fetch validation)
406
435
  if (error._code)
407
436
  return error._code;
437
+ // Check for structured errorType from API responses (set by fetchViaApi)
438
+ const errorType = error.errorType;
439
+ if (errorType) {
440
+ const typeMap = {
441
+ timeout: 'TIMEOUT',
442
+ blocked: 'BLOCKED',
443
+ not_found: 'NOT_FOUND',
444
+ server_error: 'SERVER_ERROR',
445
+ network: 'NETWORK',
446
+ unknown: 'FETCH_FAILED',
447
+ };
448
+ if (typeMap[errorType])
449
+ return typeMap[errorType];
450
+ }
408
451
  const msg = error.message.toLowerCase();
409
452
  const name = error.name || '';
410
- if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out')) {
453
+ if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out') || msg.includes('took too long')) {
411
454
  return 'TIMEOUT';
412
455
  }
413
- if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
456
+ if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare') || msg.includes('bot protection')) {
414
457
  return 'BLOCKED';
415
458
  }
416
- if (msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed') || msg.includes('not found')) {
417
- return 'DNS_FAILED';
459
+ if (msg.includes('domain may not exist') || msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed')) {
460
+ return 'NOT_FOUND';
461
+ }
462
+ if (msg.includes('http 404') || msg.includes('page was not found')) {
463
+ return 'NOT_FOUND';
418
464
  }
419
465
  if (msg.includes('invalid url') || msg.includes('invalid hostname') || msg.includes('only http')) {
420
466
  return 'INVALID_URL';
421
467
  }
468
+ if (msg.includes('could not reach') || msg.includes('could not connect') || msg.includes('econnrefused')) {
469
+ return 'NETWORK';
470
+ }
422
471
  return 'FETCH_FAILED';
423
472
  }
424
473
  /**
@@ -0,0 +1,55 @@
1
+ /**
2
+ * Shared Webshare residential proxy configuration.
3
+ *
4
+ * WebPeel uses Webshare residential proxies (configured via env vars) to route
5
+ * requests through US residential IPs, bypassing datacenter IP blocks from
6
+ * DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
7
+ *
8
+ * Proxy credentials are loaded from environment variables:
9
+ * WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
10
+ * WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
11
+ * WEBSHARE_PROXY_USER — proxy username (without slot suffix)
12
+ * WEBSHARE_PROXY_PASS — proxy password
13
+ * WEBSHARE_PROXY_SLOTS — number of available US residential slots
14
+ *
15
+ * With the Webshare backbone plan each US slot has its own port:
16
+ * slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
17
+ */
18
+ export interface ProxyConfig {
19
+ /** Proxy server URL in the format "http://host:port" */
20
+ server: string;
21
+ /** Proxy username (includes slot suffix, e.g. "user-US-42") */
22
+ username: string;
23
+ /** Proxy password */
24
+ password: string;
25
+ }
26
+ /**
27
+ * Get a random Webshare residential proxy config.
28
+ * Returns null if the proxy is not configured (env vars missing or slots = 0).
29
+ *
30
+ * Uses random slot selection across all available US slots for even load
31
+ * distribution — same approach as youtube.ts proxyRequestSlotted().
32
+ */
33
+ export declare function getWebshareProxy(): ProxyConfig | null;
34
+ /**
35
+ * Check if Webshare proxies are configured (env vars are present and non-empty).
36
+ * Does NOT guarantee the proxy is reachable — just that credentials are set.
37
+ */
38
+ export declare function hasWebshareProxy(): boolean;
39
+ /**
40
+ * Convert a ProxyConfig to a Playwright-compatible proxy object.
41
+ * Useful for passing directly to browser.newContext({ proxy: ... }).
42
+ */
43
+ export declare function toPlaywrightProxy(config: ProxyConfig): {
44
+ server: string;
45
+ username: string;
46
+ password: string;
47
+ };
48
+ /**
49
+ * Get a random Webshare proxy as a fully-qualified URL string with embedded
50
+ * credentials. The format is: `http://username:password@host:port`
51
+ *
52
+ * Useful for passing to strategies.ts proxy option (which expects a URL string).
53
+ * Returns null if proxies are not configured.
54
+ */
55
+ export declare function getWebshareProxyUrl(): string | null;
@@ -0,0 +1,79 @@
1
+ /**
2
+ * Shared Webshare residential proxy configuration.
3
+ *
4
+ * WebPeel uses Webshare residential proxies (configured via env vars) to route
5
+ * requests through US residential IPs, bypassing datacenter IP blocks from
6
+ * DuckDuckGo, Amazon, BestBuy, CarGurus, and other sites with anti-bot detection.
7
+ *
8
+ * Proxy credentials are loaded from environment variables:
9
+ * WEBSHARE_PROXY_HOST — proxy hostname (e.g. p.webshare.io)
10
+ * WEBSHARE_PROXY_PORT — base port number (e.g. 10000)
11
+ * WEBSHARE_PROXY_USER — proxy username (without slot suffix)
12
+ * WEBSHARE_PROXY_PASS — proxy password
13
+ * WEBSHARE_PROXY_SLOTS — number of available US residential slots
14
+ *
15
+ * With the Webshare backbone plan each US slot has its own port:
16
+ * slot N → port (WEBSHARE_PROXY_PORT + N - 1), username: USER-US-N
17
+ */
18
/**
 * Pick a random Webshare residential proxy slot and return its connection details.
 *
 * Reads WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER, WEBSHARE_PROXY_PASS,
 * WEBSHARE_PROXY_PORT (default "10000") and WEBSHARE_PROXY_SLOTS (default "0")
 * from the environment on every call. Slot N maps to port (base port + N - 1)
 * and username `USER-US-N`; the slot is chosen uniformly at random from 1..slots.
 *
 * @returns {{ server: string, username: string, password: string } | null}
 *   The config for one slot, or null when host/user/password is missing or when
 *   the port or slot count is not a positive integer.
 */
export function getWebshareProxy() {
    const host = process.env.WEBSHARE_PROXY_HOST;
    const user = process.env.WEBSHARE_PROXY_USER;
    const pass = process.env.WEBSHARE_PROXY_PASS;
    if (!host || !user || !pass)
        return null;
    const basePort = Number.parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
    const slots = Number.parseInt(process.env.WEBSHARE_PROXY_SLOTS || '0', 10);
    // parseInt yields NaN for non-numeric input and `NaN <= 0` is false, so a
    // plain range check would let "http://host:NaN" through. Validate explicitly.
    if (!Number.isInteger(slots) || slots <= 0)
        return null;
    if (!Number.isInteger(basePort) || basePort <= 0)
        return null;
    const slot = Math.floor(Math.random() * slots) + 1;
    const port = basePort + slot - 1;
    return {
        server: `http://${host}:${port}`,
        username: `${user}-US-${slot}`,
        password: pass,
    };
}
41
/**
 * Report whether Webshare proxy credentials are present in the environment.
 *
 * Only checks that WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER and
 * WEBSHARE_PROXY_PASS are all set to non-empty strings; it does not look at
 * the port or slot count and does not test connectivity.
 *
 * @returns {boolean} true when all three variables are non-empty.
 */
export function hasWebshareProxy() {
    const { WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER, WEBSHARE_PROXY_PASS } = process.env;
    const required = [WEBSHARE_PROXY_HOST, WEBSHARE_PROXY_USER, WEBSHARE_PROXY_PASS];
    return required.every(Boolean);
}
50
/**
 * Project a proxy config onto the three fields used for a Playwright-style
 * proxy option (`server`, `username`, `password`).
 *
 * Returns a fresh object; any other properties on `config` are not copied.
 *
 * @param config Object carrying `server`, `username` and `password`.
 * @returns {{ server: string, username: string, password: string }}
 */
export function toPlaywrightProxy(config) {
    const { server, username, password } = config;
    return { server, username, password };
}
61
/**
 * Build a proxy URL with embedded credentials for a randomly chosen slot,
 * in the form `http://username:password@host:port`.
 *
 * The username and password are percent-encoded with encodeURIComponent.
 *
 * @returns {string | null} The URL, or null when getWebshareProxy() yields no
 *   config or when the server URL / credentials cannot be parsed or encoded.
 */
export function getWebshareProxyUrl() {
    const proxy = getWebshareProxy();
    if (!proxy) {
        return null;
    }
    try {
        const { host: hostAndPort } = new URL(proxy.server);
        const encodedUser = encodeURIComponent(proxy.username);
        const encodedPass = encodeURIComponent(proxy.password);
        return `http://${encodedUser}:${encodedPass}@${hostAndPort}`;
    }
    catch {
        // Malformed server URL or un-encodable credentials: treat as "no proxy".
        return null;
    }
}
@@ -15,6 +15,7 @@
15
15
  import { fetch as undiciFetch } from 'undici';
16
16
  import { load } from 'cheerio';
17
17
  import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
18
+ import { getWebshareProxy } from './proxy-config.js';
18
19
  import { createLogger } from './logger.js';
19
20
  const log = createLogger('search');
20
21
  function decodeHtmlEntities(input) {
@@ -236,10 +237,12 @@ export class StealthSearchProvider {
236
237
  const browser = await getStealthBrowser();
237
238
  const params = new URLSearchParams({ q: query });
238
239
  const url = `https://html.duckduckgo.com/html/?${params.toString()}`;
240
+ const proxy = getWebshareProxy();
239
241
  ctx = await browser.newContext({
240
242
  userAgent: getRandomUserAgent(),
241
243
  locale: 'en-US',
242
244
  timezoneId: 'America/New_York',
245
+ ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
243
246
  });
244
247
  const page = await ctx.newPage();
245
248
  await applyStealthScripts(page);
@@ -303,10 +306,12 @@ export class StealthSearchProvider {
303
306
  const browser = await getStealthBrowser();
304
307
  const params = new URLSearchParams({ q: query });
305
308
  const url = `https://www.bing.com/search?${params.toString()}`;
309
+ const proxy = getWebshareProxy();
306
310
  ctx = await browser.newContext({
307
311
  userAgent: getRandomUserAgent(),
308
312
  locale: 'en-US',
309
313
  timezoneId: 'America/New_York',
314
+ ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
310
315
  });
311
316
  const page = await ctx.newPage();
312
317
  await applyStealthScripts(page);
@@ -380,10 +385,12 @@ export class StealthSearchProvider {
380
385
  const browser = await getStealthBrowser();
381
386
  const params = new URLSearchParams({ q: query });
382
387
  const url = `https://www.ecosia.org/search?${params.toString()}`;
388
+ const proxy = getWebshareProxy();
383
389
  ctx = await browser.newContext({
384
390
  userAgent: getRandomUserAgent(),
385
391
  locale: 'en-US',
386
392
  timezoneId: 'America/New_York',
393
+ ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
387
394
  });
388
395
  const page = await ctx.newPage();
389
396
  await applyStealthScripts(page);
@@ -10,6 +10,7 @@ import { simpleFetch, browserFetch, retryFetch } from './fetcher.js';
10
10
  import { getCached, setCached as setBasicCache } from './cache.js';
11
11
  import { resolveAndCache } from './dns-cache.js';
12
12
  import { BlockedError, NetworkError } from '../types.js';
13
+ import { getWebshareProxyUrl } from './proxy-config.js';
13
14
  import { detectChallenge } from './challenge-detection.js';
14
15
  import { getStrategyHooks, } from './strategy-hooks.js';
15
16
  import { createLogger } from './logger.js';
@@ -310,10 +311,15 @@ async function fetchWithBrowserStrategy(url, options) {
310
311
  export async function smartFetch(url, options = {}) {
311
312
  const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, } = options;
312
313
  const usePeelTLS = tls || cycle;
313
- // Build effective proxy list: explicit proxies array, or single proxy, or empty
314
+ // Build effective proxy list: explicit proxies array, or single proxy, or empty.
315
+ // When no explicit proxy is configured and Webshare is available, automatically
316
+ // add it as a fallback: try direct connection first (fast), then Webshare on block.
314
317
  const effectiveProxies = proxies?.length ? proxies :
315
318
  proxy ? [proxy] :
316
- [undefined]; // undefined = direct connection (no proxy)
319
+ (() => {
320
+ const wsUrl = getWebshareProxyUrl();
321
+ return wsUrl ? [undefined, wsUrl] : [undefined];
322
+ })();
317
323
  const firstProxy = effectiveProxies[0];
318
324
  const hooks = getStrategyHooks();
319
325
  const fetchStartMs = Date.now();
@@ -86,56 +86,223 @@ function parseLLMJson(text) {
86
86
  /**
87
87
  * For string fields: search for field name in content, extract surrounding text.
88
88
  */
89
- function heuristicExtractString(fieldName, content) {
89
/**
 * Return the text of the first level-1 markdown heading (`# ...`) in `content`,
 * with `*`, `_` and backtick characters removed and surrounding whitespace
 * trimmed. Returns null when there is no such heading.
 */
function extractPageTitle(content) {
    const headingMatch = content.match(/^#\s+(.+)$/m);
    const headingText = headingMatch ? headingMatch[1] : undefined;
    return headingText ? headingText.replace(/[*_`]/g, '').trim() : null;
}
96
/**
 * Return the first descriptive line that follows a markdown heading.
 *
 * Blank lines are ignored. Everything before the first line starting with `#`
 * is skipped; after that, heading lines and byline-style lines (starting and
 * ending with `*`) are skipped, and the first remaining line longer than 30
 * characters is returned with `*`, `_` and backticks removed, trimmed, and
 * capped at 300 characters. Returns null when no such line exists.
 */
function extractDescription(content) {
    const nonBlank = content.split('\n').filter((text) => text.trim());
    const firstHeadingAt = nonBlank.findIndex((text) => text.startsWith('#'));
    if (firstHeadingAt === -1) {
        return null;
    }
    const isByline = (text) => text.startsWith('*') && text.endsWith('*');
    const paragraph = nonBlank
        .slice(firstHeadingAt + 1)
        .find((text) => !text.startsWith('#') && !isByline(text) && text.length > 30);
    if (paragraph === undefined) {
        return null;
    }
    return paragraph.replace(/[*_`]/g, '').trim().slice(0, 300);
}
113
/**
 * Extract a company/brand name from a page title by taking the text before the
 * first title separator: `|`, `·`, an em/en dash, or an ASCII hyphen that is
 * surrounded by whitespace (" - ").
 *
 * A hyphen inside a word is NOT treated as a separator, so names such as
 * "Coca-Cola" or "T-Mobile" are kept intact.
 *
 * @param title Page title text.
 * @returns The trimmed text before the first separator, or — when there is no
 *   separator or nothing precedes it — the trimmed title capped at 60 characters.
 */
function extractCompanyFromTitle(title) {
    // Lazy head, then either a symbol separator (optional leading whitespace)
    // or a whitespace-delimited hyphen.
    const match = title.match(/^(.+?)(?:\s*[|·—–]|\s+-(?:\s|$))/);
    const head = match ? match[1].trim() : '';
    if (head)
        return head;
    return title.trim().slice(0, 60);
}
120
/**
 * Heuristically extract a string value for `fieldName` from markdown `content`.
 *
 * Concept-aware rules run first, keyed on the lower-cased field name
 * (company/brand, title/name, description, url, author, date, email, price,
 * language, stars, license). A rule that matches the field name but finds
 * nothing in the content falls through to the later rules, except the
 * description rule, which always returns its result (possibly null).
 * Finally, generic "label: value" style patterns built from the field name
 * are tried.
 *
 * @param fieldName Schema field name (underscores are treated as spaces for
 *   the generic label patterns).
 * @param content Markdown text to search.
 * @param pageUrl Optional page URL, returned for url/website/link-type fields.
 * @returns The extracted string, or null when nothing matched.
 */
function heuristicExtractString(fieldName, content, pageUrl) {
    const lf = fieldName.toLowerCase();
    const title = extractPageTitle(content);
    // --- Concept-aware extraction ---
    // Company/brand/organization name
    if (/company|brand|organization|org_name/.test(lf)) {
        if (title)
            return extractCompanyFromTitle(title);
        // Fallback: extract from first heading of any level (up to ###)
        const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
        if (anyHeading?.[1])
            return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
    }
    // Title/name/product → first H1 or any heading, stripped of markdown
    if (/^(title|name|product_name|product|heading)$/.test(lf)) {
        const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
        if (rawTitle) {
            return rawTitle
                .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images/badges
                .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
                .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
                .replace(/[*_`[\]]/g, '')
                .replace(/&[a-z]+;/g, '') // HTML entities
                .replace(/\s+/g, ' ')
                .trim().slice(0, 150);
        }
    }
    // Description/summary/about → first paragraph
    if (/description|summary|about|overview/.test(lf)) {
        return extractDescription(content) ?? null;
    }
    // URL/website/link → use the page URL if we have it
    if (/^(url|website|link|homepage|site)$/.test(lf)) {
        if (pageUrl)
            return pageUrl;
    }
    // Author/writer/by
    if (/author|writer|by/.test(lf)) {
        const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
        if (m?.[1])
            return m[1].trim().slice(0, 100);
    }
    // Date/published/updated
    if (/date|published|updated|modified/.test(lf)) {
        const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
            ?? content.match(/(\d{4}-\d{2}-\d{2})/);
        if (m?.[1])
            return m[1];
    }
    // Email
    if (/email|contact/.test(lf)) {
        const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
        if (m?.[0])
            return m[0];
    }
    // Price/cost/pricing → extract value near $
    if (/price|cost|pricing|fee/.test(lf)) {
        const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
            ?? content.match(/(free|no cost|no charge)/i);
        if (m?.[0])
            return m[0].trim();
    }
    // Language (for GitHub repos)
    if (/language|lang|tech/.test(lf)) {
        const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
        if (m?.[1])
            return m[1];
    }
    // Stars (for GitHub)
    if (/stars?/.test(lf)) {
        const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
        if (m?.[1])
            return m[1].replace(/,/g, '');
    }
    // License
    if (/license/.test(lf)) {
        const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
        if (m?.[1])
            return m[1];
    }
    // --- Generic patterns (exact-ish match) ---
    // The field name is caller-supplied text, not a pattern: escape regex
    // metacharacters so names like "c++" or "price ($)" neither throw a
    // SyntaxError nor change the meaning of the expression.
    const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const rawName = escapeRegExp(fieldName);
    const humanName = escapeRegExp(fieldName.replace(/_/g, ' '));
    const patterns = [
        // "field name: value" at the start of a line
        new RegExp(`(?:^|\\n)[ \\t]*${humanName}[:\\s]+([^\\n]{5,200})`, 'i'),
        // JSON-like "field_name": "value"
        new RegExp(`"${rawName}"\\s*:\\s*"([^"]{1,300})"`, 'i'),
        // Markdown bold **Field Name**: value
        new RegExp(`\\*{1,2}${humanName}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'),
        // Heading followed by content on the next non-empty line
        new RegExp(`#+\\s*${humanName}\\s*\\n+([^\\n]{5,300})`, 'i'),
    ];
    for (const pattern of patterns) {
        const match = content.match(pattern);
        if (match?.[1])
            return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
    }
    return null;
}
109
217
  /**
110
- * For boolean fields: search for positive/negative indicators near the field name.
218
+ * For boolean fields: search the ENTIRE content for positive/negative indicators.
111
219
  */
112
220
  function heuristicExtractBoolean(fieldName, content) {
113
- const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
221
+ const lf = fieldName.toLowerCase();
114
222
  const ctx = content.toLowerCase();
115
- // Search both underscore and spaced variants
116
- let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
117
- if (fieldIdx === -1)
118
- fieldIdx = ctx.indexOf(humanName);
119
- if (fieldIdx === -1)
120
- return null;
121
- // Look at a window of ±150 chars around the field name
122
- const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
123
- const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
124
- const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
125
- for (const pos of positive) {
126
- if (window.includes(pos))
223
+ // Concept-aware boolean extraction search entire content, not just near field name
224
+ // Free tier / free plan
225
+ if (/free_tier|has_free|is_free/.test(lf)) {
226
+ if (/free tier|free plan|\$0|no cost|no charge|free forever/.test(ctx))
127
227
  return true;
228
+ if (/no free|paid only|subscription required/.test(ctx))
229
+ return false;
128
230
  }
129
- for (const neg of negative) {
130
- if (window.includes(neg))
231
+ // Open source
232
+ if (/open_source|is_open|oss/.test(lf)) {
233
+ if (/open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
234
+ return true;
235
+ if (/closed[- ]source|proprietary|commercial license/.test(ctx))
131
236
  return false;
132
237
  }
238
+ // API availability
239
+ if (/has_api|api_available|has_rest/.test(lf)) {
240
+ if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
241
+ return true;
242
+ }
243
+ // Authentication
244
+ if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
245
+ if (/login|sign in|authentication|api key|bearer token/.test(ctx))
246
+ return true;
247
+ }
248
+ // General approach: search near field name concept
249
+ const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
250
+ let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
251
+ if (fieldIdx === -1)
252
+ fieldIdx = ctx.indexOf(humanName);
253
+ if (fieldIdx !== -1) {
254
+ const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
255
+ const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
256
+ const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
257
+ for (const pos of positive) {
258
+ if (window.includes(pos))
259
+ return true;
260
+ }
261
+ for (const neg of negative) {
262
+ if (window.includes(neg))
263
+ return false;
264
+ }
265
+ }
133
266
  return null;
134
267
  }
135
268
  /**
136
269
  * For number fields: find digits near the field name.
137
270
  */
138
271
  function heuristicExtractNumber(fieldName, content) {
272
+ const lf = fieldName.toLowerCase();
273
+ // Stars (GitHub)
274
+ if (/stars?/.test(lf)) {
275
+ const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
276
+ if (m?.[1]) {
277
+ const n = parseFloat(m[1].replace(/,/g, ''));
278
+ return isNaN(n) ? null : n;
279
+ }
280
+ }
281
+ // Forks
282
+ if (/forks?/.test(lf)) {
283
+ const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
284
+ if (m?.[1]) {
285
+ const n = parseFloat(m[1].replace(/,/g, ''));
286
+ return isNaN(n) ? null : n;
287
+ }
288
+ }
289
+ // Rating/score
290
+ if (/rating|score/.test(lf)) {
291
+ const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
292
+ if (m?.[1]) {
293
+ const n = parseFloat(m[1]);
294
+ return isNaN(n) ? null : n;
295
+ }
296
+ }
297
+ // Year
298
+ if (/year/.test(lf)) {
299
+ const m = content.match(/\b(20\d{2})\b/);
300
+ if (m?.[1]) {
301
+ const n = parseInt(m[1]);
302
+ return isNaN(n) ? null : n;
303
+ }
304
+ }
305
+ // Generic: find number near field name
139
306
  const humanName = fieldName.replace(/_/g, '[\\s_-]*');
140
307
  const pattern = new RegExp(`${humanName}[:\\s$]*([\\d,]+\\.?\\d*)`, 'i');
141
308
  const match = content.match(pattern);
@@ -15,6 +15,7 @@ import { join } from 'node:path';
15
15
  import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
16
16
  import { simpleFetch } from './fetcher.js';
17
17
  import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
18
+ import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
18
19
  import { createLogger } from './logger.js';
19
20
  // ---------------------------------------------------------------------------
20
21
  // yt-dlp startup diagnostics
@@ -239,8 +240,10 @@ export function extractSummary(fullText) {
239
240
  // ---------------------------------------------------------------------------
240
241
  // Proxy-based InnerTube transcript extraction
241
242
  // ---------------------------------------------------------------------------
242
- // Webshare residential proxy config — reads from env vars on Render.
243
+ // Webshare residential proxy config — reads from env vars via proxy-config.ts.
243
244
  // Locally, falls back to direct fetch (residential IP already works).
245
+ // These constants are kept for use in proxyRequestSlotted() which does
246
+ // low-level HTTP CONNECT tunneling (not Playwright-level proxy).
244
247
  const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
245
248
  const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
246
249
  const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
@@ -249,7 +252,8 @@ const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
249
252
  // slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
250
253
  const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10);
251
254
  function isProxyConfigured() {
252
- return !!(PROXY_USER && PROXY_PASS);
255
+ // Delegate to the shared proxy-config helper for consistency
256
+ return _hasWebshareProxy();
253
257
  }
254
258
  /**
255
259
  * Make an HTTP(S) request through the Webshare CONNECT proxy with a specific
@@ -106,8 +106,8 @@ export function createApp(config = {}) {
106
106
  timeoutMs = 120000; // 2min for batch
107
107
  else if (path.includes('/screenshot'))
108
108
  timeoutMs = 60000; // 1min for screenshots
109
- else if (req.query?.render === 'true')
110
- timeoutMs = 60000; // 1min for rendered fetches
109
+ else if (req.query?.render === 'true' || req.query?.stealth === 'true')
110
+ timeoutMs = 60000; // 1min for browser/stealth fetches
111
111
  else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
112
112
  timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
113
113
  req.setTimeout(timeoutMs);
@@ -13,6 +13,52 @@ import { getSchemaTemplate } from '../../core/schema-templates.js';
13
13
  import { quickAnswer } from '../../core/quick-answer.js';
14
14
  import { sendUsageAlertEmail } from '../email-service.js';
15
15
  import { extractLinks } from '../../core/links.js';
16
// ── Helper: classify an error thrown by peel() into a FetchErrorType ─────────
/**
 * Map an error to one of: 'timeout', 'blocked', 'not_found', 'server_error',
 * 'network', 'unknown'.
 *
 * Classification uses, in order of the checks below, the error's `code`
 * (falling back to `name`), its error-class `name` ('TimeoutError',
 * 'BlockedError', 'NetworkError'), and keywords in the lower-cased message.
 * Recognising the class names keeps errors that carry no matching keyword
 * from falling through to 'unknown'.
 *
 * @param err Error-like object; `code`, `name` and `message` are all optional.
 * @returns The error type string.
 */
function classifyFetchError(err) {
    const name = err?.name || '';
    const tag = err?.code || name;
    const msg = (err?.message || '').toLowerCase();
    const mentions = (...needles) => needles.some((needle) => msg.includes(needle));
    if (tag === 'TIMEOUT' || name === 'TimeoutError' || mentions('timeout', 'timed out')) {
        return 'timeout';
    }
    if (tag === 'BLOCKED' || name === 'BlockedError' || mentions('blocked', 'cloudflare challenge', 'captcha', 'bot detection')) {
        return 'blocked';
    }
    // Message-based checks run before the NetworkError name check so that a
    // DNS failure wrapped in a NetworkError is still reported as not_found.
    if (mentions('http 404', 'not found', 'dns resolution failed', 'enotfound', 'getaddrinfo')) {
        return 'not_found';
    }
    if (/http\s+5\d{2}/.test(msg) || mentions('server error', 'internal server')) {
        return 'server_error';
    }
    if (tag === 'NETWORK' || name === 'NetworkError' || mentions('network', 'econnrefused', 'connection refused', 'connection reset')) {
        return 'network';
    }
    return 'unknown';
}
37
// ── Helper: build a clean, user-facing error message from a peel() error ─────
/**
 * Turn an error into `{ type, message, hint }` for the API error response.
 *
 * `type` comes from classifyFetchError(). Every type except 'unknown' gets a
 * fixed, canned message; 'unknown' falls back to the error's own message with
 * `<`, `>`, `"` and `'` removed, trimmed, and capped at 500 characters so that
 * arbitrarily long upstream text is not echoed to clients. `hint` is undefined
 * for 'unknown'.
 *
 * @param err Error-like object with an optional `message`.
 * @returns {{ type: string, message: string, hint: string | undefined }}
 */
function buildFetchErrorMessage(err) {
    const MAX_MESSAGE_LENGTH = 500;
    const type = classifyFetchError(err);
    const hints = {
        timeout: 'Try increasing timeout with ?timeout=20000, or use render=true for JS-heavy sites.',
        blocked: 'This site blocks automated requests. Try render=true or stealth=true.',
        not_found: 'Verify the URL is correct and the site is accessible.',
        server_error: 'The target site returned a server error. Try again later.',
        network: 'Could not connect to the target URL. Verify the URL is correct and the site is online.',
        unknown: undefined,
    };
    // Sanitize message: strip HTML chars, truncate. A non-string or empty
    // message falls back to the generic text instead of throwing on .replace().
    const rawMsg = typeof err.message === 'string' && err.message
        ? err.message
        : 'An unexpected error occurred while fetching the URL';
    const safeMsg = rawMsg
        .replace(/[<>"']/g, '')
        .trim()
        .slice(0, MAX_MESSAGE_LENGTH);
    const messages = {
        timeout: `The website took too long to respond. Try with render=true or stealth=true for JavaScript-heavy sites.`,
        blocked: `This website is blocking automated access (bot protection detected).`,
        not_found: `The URL could not be reached — the domain may not exist or the page was not found.`,
        server_error: `The target website returned a server error while processing the request.`,
        network: `Could not reach this website. The server may be down or the URL may be incorrect.`,
        unknown: safeMsg,
    };
    return { type, message: messages[type] || safeMsg, hint: hints[type] };
}
16
62
  // ── Helper: extractive summarizer (TF-IDF-like sentence scoring) ─────────────
17
63
  function extractSummary(content, maxWords = 150) {
18
64
  if (!content)
@@ -527,26 +573,24 @@ export function createFetchRouter(authStore) {
527
573
  });
528
574
  }
529
575
  // SECURITY: Sanitize error messages to prevent information disclosure
530
- if (err.code) {
576
+ if (res.headersSent)
577
+ return; // Timeout middleware already responded
578
+ const requestUrl = req.query.url;
579
+ if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
531
580
  // WebPeelError from core library - safe to expose with helpful context
532
- if (res.headersSent)
533
- return; // Timeout middleware already responded
534
- const safeMessage = err.message.replace(/[<>"']/g, ''); // Remove HTML chars
535
- const statusCode = err.code === 'TIMEOUT' ? 504
536
- : err.code === 'BLOCKED' ? 403
537
- : err.code === 'NETWORK' ? 502
538
- : 500;
539
- const hints = {
540
- TIMEOUT: 'Try increasing timeout with ?wait=10000, or use render=true for JS-heavy sites.',
541
- BLOCKED: 'This site blocks automated requests. Try adding render=true or use stealth mode (costs 5 credits).',
542
- NETWORK: 'Could not reach the target URL. Verify the URL is correct and the site is online.',
543
- };
581
+ const { type, message, hint } = buildFetchErrorMessage(err);
582
+ const statusCode = type === 'timeout' ? 504
583
+ : type === 'blocked' ? 403
584
+ : type === 'not_found' ? 404
585
+ : type === 'network' || type === 'server_error' ? 502
586
+ : 500;
544
587
  res.status(statusCode).json({
545
588
  success: false,
546
589
  error: {
547
- type: err.code,
548
- message: safeMessage,
549
- hint: hints[err.code] || undefined,
590
+ type,
591
+ message,
592
+ url: requestUrl,
593
+ ...(hint ? { hint } : {}),
550
594
  docs: 'https://webpeel.dev/docs/api-reference#errors',
551
595
  },
552
596
  requestId: req.requestId,
@@ -555,13 +599,12 @@ export function createFetchRouter(authStore) {
555
599
  else {
556
600
  // Unexpected error - generic message only
557
601
  console.error('Fetch error:', err); // Log full error server-side
558
- if (res.headersSent)
559
- return; // Timeout middleware already responded
560
602
  res.status(500).json({
561
603
  success: false,
562
604
  error: {
563
- type: 'internal_error',
605
+ type: 'unknown',
564
606
  message: 'An unexpected error occurred while fetching the URL. If this persists, check https://webpeel.dev/status',
607
+ url: requestUrl,
565
608
  docs: 'https://webpeel.dev/docs/api-reference#errors',
566
609
  },
567
610
  requestId: req.requestId,
@@ -1028,23 +1071,21 @@ export function createFetchRouter(authStore) {
1028
1071
  console.error('POST fetch/scrape error:', err);
1029
1072
  if (res.headersSent)
1030
1073
  return; // Timeout middleware already responded
1031
- if (err.code) {
1032
- const safeMessage = err.message.replace(/[<>"']/g, '');
1033
- const statusCode = err.code === 'TIMEOUT' ? 504
1034
- : err.code === 'BLOCKED' ? 403
1035
- : err.code === 'NETWORK' ? 502
1036
- : 500;
1037
- const hints = {
1038
- TIMEOUT: 'Try increasing timeout, or set render:true for JS-heavy sites.',
1039
- BLOCKED: 'Site blocks automated requests. Try render:true or stealth mode.',
1040
- NETWORK: 'Could not reach the target URL. Verify it is correct and online.',
1041
- };
1074
+ const postUrl = req.body?.url;
1075
+ if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
1076
+ const { type, message, hint } = buildFetchErrorMessage(err);
1077
+ const statusCode = type === 'timeout' ? 504
1078
+ : type === 'blocked' ? 403
1079
+ : type === 'not_found' ? 404
1080
+ : type === 'network' || type === 'server_error' ? 502
1081
+ : 500;
1042
1082
  res.status(statusCode).json({
1043
1083
  success: false,
1044
1084
  error: {
1045
- type: err.code,
1046
- message: safeMessage,
1047
- hint: hints[err.code] || undefined,
1085
+ type,
1086
+ message,
1087
+ url: postUrl,
1088
+ ...(hint ? { hint } : {}),
1048
1089
  docs: 'https://webpeel.dev/docs/api-reference#errors',
1049
1090
  },
1050
1091
  requestId: req.requestId,
@@ -1054,8 +1095,9 @@ export function createFetchRouter(authStore) {
1054
1095
  res.status(500).json({
1055
1096
  success: false,
1056
1097
  error: {
1057
- type: 'internal_error',
1098
+ type: 'unknown',
1058
1099
  message: 'An unexpected error occurred. If this persists, check https://webpeel.dev/status',
1100
+ url: postUrl,
1059
1101
  docs: 'https://webpeel.dev/docs/api-reference#errors',
1060
1102
  },
1061
1103
  requestId: req.requestId,
package/dist/types.d.ts CHANGED
@@ -419,6 +419,18 @@ export interface PeelEnvelope {
419
419
  */
420
420
  totalAvailable?: number;
421
421
  }
422
+ /**
423
+ * Programmatic error classification for fetch failures.
424
+ * Returned in the `error.type` field of API error responses.
425
+ *
426
+ * - `timeout` — Site took too long to respond
427
+ * - `blocked` — Site actively blocked the request (403, CAPTCHA, bot detection)
428
+ * - `not_found` — 404 or the domain/URL does not exist
429
+ * - `server_error` — Target site returned a 5xx error
430
+ * - `network` — DNS failure, connection refused, or other network-level issue
431
+ * - `unknown` — Unclassified error
432
+ */
433
+ export type FetchErrorType = 'timeout' | 'blocked' | 'not_found' | 'server_error' | 'network' | 'unknown';
422
434
  export declare class WebPeelError extends Error {
423
435
  code?: string | undefined;
424
436
  constructor(message: string, code?: string | undefined);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.6",
3
+ "version": "0.21.8",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",