webpeel 0.21.5 → 0.21.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/utils.js CHANGED
@@ -131,22 +131,30 @@ export function parseActions(actionStrings) {
131
131
  */
132
132
  export function formatError(error, _url, options) {
133
133
  const msg = error.message || String(error);
134
+ const errorType = error.errorType || '';
134
135
  const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
135
- if (msg.includes('net::ERR_') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
136
- lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
137
- }
138
- else if (msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
136
+ // Check structured errorType from API first (takes precedence over message heuristics)
137
+ if (errorType === 'timeout' || msg.includes('took too long') || msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
139
138
  lines.push('\x1b[33m💡 Try increasing timeout: --timeout 60000\x1b[0m');
140
139
  if (!options.render) {
141
140
  lines.push('\x1b[33m💡 Site may need browser rendering: --render\x1b[0m');
142
141
  }
143
142
  }
144
- else if (msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
143
+ else if (errorType === 'blocked' || msg.includes('blocking automated') || msg.includes('bot protection') || msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
145
144
  if (!options.stealth) {
146
145
  lines.push('\x1b[33m💡 Try stealth mode to bypass bot detection: --stealth\x1b[0m');
147
146
  }
148
147
  lines.push('\x1b[33m💡 Try a different user agent: --ua "Mozilla/5.0..."\x1b[0m');
149
148
  }
149
+ else if (errorType === 'not_found' || msg.includes('domain may not exist') || msg.includes('not found') || msg.includes('ENOTFOUND') || msg.includes('net::ERR_') || msg.includes('ECONNREFUSED')) {
150
+ lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
151
+ }
152
+ else if (errorType === 'network' || msg.includes('Could not reach') || msg.includes('could not connect') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
153
+ lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
154
+ }
155
+ else if (errorType === 'server_error' || msg.includes('server error')) {
156
+ lines.push('\x1b[33m💡 The target site returned a server error. Try again in a moment.\x1b[0m');
157
+ }
150
158
  else if (msg.includes('empty') || msg.includes('no content') || msg.includes('0 tokens')) {
151
159
  if (!options.render) {
152
160
  lines.push('\x1b[33m💡 Page may be JavaScript-rendered. Try: --render\x1b[0m');
@@ -212,7 +220,40 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
212
220
  }
213
221
  if (!res.ok) {
214
222
  const body = await res.text().catch(() => '');
215
- throw new Error(`API error ${res.status}: ${body.slice(0, 200)}`);
223
+ // Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
224
+ const isHtml = body.trimStart().startsWith('<') || body.includes('<!DOCTYPE') || body.includes('<html');
225
+ let errorMsg;
226
+ let errorType;
227
+ if (res.status === 502 || res.status === 503 || res.status === 504) {
228
+ errorMsg = `Could not reach this website. The site may be blocking our server or timing out.`;
229
+ errorType = res.status === 504 ? 'timeout' : 'network';
230
+ }
231
+ else if (isHtml) {
232
+ errorMsg = `Server returned an error page (${res.status})`;
233
+ }
234
+ else {
235
+ // Try to parse a structured JSON error response
236
+ try {
237
+ const json = JSON.parse(body);
238
+ const errObj = json?.error;
239
+ if (errObj && typeof errObj === 'object') {
240
+ errorMsg = typeof errObj.message === 'string' ? errObj.message : (body.slice(0, 200) || 'Unknown error');
241
+ if (typeof errObj.type === 'string')
242
+ errorType = errObj.type;
243
+ }
244
+ else {
245
+ errorMsg = body.slice(0, 200) || 'Unknown error';
246
+ }
247
+ }
248
+ catch {
249
+ errorMsg = body.slice(0, 200) || 'Unknown error';
250
+ }
251
+ }
252
+ const err = new Error(`${errorMsg}`);
253
+ if (errorType)
254
+ err.errorType = errorType;
255
+ err.statusCode = res.status;
256
+ throw err;
216
257
  }
217
258
  const data = await res.json();
218
259
  // Map API response to PeelResult shape that the CLI already handles
@@ -393,20 +434,40 @@ export function classifyErrorCode(error) {
393
434
  // Check for our custom _code first (set in pre-fetch validation)
394
435
  if (error._code)
395
436
  return error._code;
437
+ // Check for structured errorType from API responses (set by fetchViaApi)
438
+ const errorType = error.errorType;
439
+ if (errorType) {
440
+ const typeMap = {
441
+ timeout: 'TIMEOUT',
442
+ blocked: 'BLOCKED',
443
+ not_found: 'NOT_FOUND',
444
+ server_error: 'SERVER_ERROR',
445
+ network: 'NETWORK',
446
+ unknown: 'FETCH_FAILED',
447
+ };
448
+ if (typeMap[errorType])
449
+ return typeMap[errorType];
450
+ }
396
451
  const msg = error.message.toLowerCase();
397
452
  const name = error.name || '';
398
- if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out')) {
453
+ if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out') || msg.includes('took too long')) {
399
454
  return 'TIMEOUT';
400
455
  }
401
- if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
456
+ if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare') || msg.includes('bot protection')) {
402
457
  return 'BLOCKED';
403
458
  }
404
- if (msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed') || msg.includes('not found')) {
405
- return 'DNS_FAILED';
459
+ if (msg.includes('domain may not exist') || msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed')) {
460
+ return 'NOT_FOUND';
461
+ }
462
+ if (msg.includes('http 404') || msg.includes('page was not found')) {
463
+ return 'NOT_FOUND';
406
464
  }
407
465
  if (msg.includes('invalid url') || msg.includes('invalid hostname') || msg.includes('only http')) {
408
466
  return 'INVALID_URL';
409
467
  }
468
+ if (msg.includes('could not reach') || msg.includes('could not connect') || msg.includes('econnrefused')) {
469
+ return 'NETWORK';
470
+ }
410
471
  return 'FETCH_FAILED';
411
472
  }
412
473
  /**
@@ -1274,9 +1274,27 @@ async function youtubeExtractor(_html, url) {
1274
1274
  const parts = [];
1275
1275
  parts.push(`# ${title}`);
1276
1276
  parts.push(headerLine);
1277
+ /**
1278
+ * Strip music note symbols from transcript/caption text.
1279
+ * YouTube auto-captions include ♪ and 🎵 as music cues.
1280
+ * Patterns cleaned:
1281
+ * [♪♪♪] → (removed)
1282
+ * ♪ text ♪ → text
1283
+ * standalone ♪ / 🎵 → (removed)
1284
+ */
1285
+ const cleanMusicNotes = (text) => text
1286
+ // Remove bracketed music cues: [♪], [♪♪♪], [🎵🎵🎵], etc.
1287
+ .replace(/\[[♪🎵]+\]/g, '')
1288
+ // Unwrap ♪ text ♪ → text (keep the words between notes)
1289
+ .replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
1290
+ // Remove any remaining standalone ♪ or 🎵
1291
+ .replace(/[♪🎵]+/g, '')
1292
+ // Collapse extra whitespace introduced by removals
1293
+ .replace(/\s{2,}/g, ' ')
1294
+ .trim();
1277
1295
  // Summary section
1278
1296
  if (transcript.summary && hasTranscript) {
1279
- let summaryText = transcript.summary;
1297
+ let summaryText = cleanMusicNotes(transcript.summary);
1280
1298
  summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
1281
1299
  parts.push(`## Summary\n\n${summaryText}`);
1282
1300
  }
@@ -1296,7 +1314,7 @@ async function youtubeExtractor(_html, url) {
1296
1314
  // Full Transcript section (only if we have real transcript segments)
1297
1315
  // Add intelligent paragraph breaks for readability
1298
1316
  if (hasTranscript) {
1299
- let readableText = transcript.fullText;
1317
+ let readableText = cleanMusicNotes(transcript.fullText);
1300
1318
  // Break into paragraphs: after sentence-ending punctuation followed by a capital letter
1301
1319
  readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
1302
1320
  // Collapse any triple+ newlines
@@ -193,19 +193,32 @@ export async function handleYouTube(ctx) {
193
193
  headerParts.push(`**${viewStr}**`);
194
194
  if (publishStr)
195
195
  headerParts.push(`**Published:** ${publishStr}`);
196
+ /**
197
+ * Strip music note symbols from YouTube auto-caption text.
198
+ * Cleans: [♪♪♪], [🎵🎵🎵], ♪ text ♪ (keeps inner text), standalone ♪ / 🎵
199
+ */
200
+ const cleanMusicNotes = (text) => text
201
+ .replace(/\[[♪🎵]+\]/g, '')
202
+ .replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
203
+ .replace(/[♪🎵]+/g, '')
204
+ .replace(/\s{2,}/g, ' ')
205
+ .trim();
196
206
  // Add paragraph breaks to transcript for readability
197
- let readableText = transcript.fullText;
207
+ let readableText = cleanMusicNotes(transcript.fullText);
198
208
  readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
199
209
  readableText = readableText.replace(/\n{3,}/g, '\n\n');
200
210
  // Build a clean markdown representation of the video + transcript
201
211
  const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
202
212
  if (transcript.summary) {
203
- let summaryText = transcript.summary;
213
+ let summaryText = cleanMusicNotes(transcript.summary);
204
214
  summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
205
215
  parts.push(`## Summary\n\n${summaryText}`);
206
216
  }
207
217
  if (transcript.keyPoints && transcript.keyPoints.length > 0) {
208
- parts.push(`## Key Points\n\n${transcript.keyPoints.map(kp => `- ${kp}`).join('\n')}`);
218
+ const cleanedKps = transcript.keyPoints.map((kp) => cleanMusicNotes(kp)).filter((kp) => kp.length > 0);
219
+ if (cleanedKps.length > 0) {
220
+ parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
221
+ }
209
222
  }
210
223
  if (transcript.chapters && transcript.chapters.length > 0) {
211
224
  parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
@@ -927,6 +940,10 @@ export async function postProcess(ctx) {
927
940
  if (ddResult) {
928
941
  ctx.domainData = ddResult;
929
942
  ctx.content = ddResult.cleanContent;
943
+ // Update title from domain extractor (takes precedence over HTML page title)
944
+ if (ddResult.structured?.title) {
945
+ ctx.title = ddResult.structured.title;
946
+ }
930
947
  }
931
948
  }
932
949
  catch (e) {
@@ -86,56 +86,223 @@ function parseLLMJson(text) {
86
86
  /**
87
87
  * For string fields: search for field name in content, extract surrounding text.
88
88
  */
89
- function heuristicExtractString(fieldName, content) {
89
+ /** Extract first H1 or page title from markdown content */
90
+ function extractPageTitle(content) {
91
+ const h1 = content.match(/^#\s+(.+)$/m);
92
+ if (h1?.[1])
93
+ return h1[1].replace(/[*_`]/g, '').trim();
94
+ return null;
95
+ }
96
+ /** Extract meta description (after *X min read* pattern common in WebPeel output) */
97
+ function extractDescription(content) {
98
+ // First paragraph after the title
99
+ const lines = content.split('\n').filter(l => l.trim());
100
+ let seenH1 = false;
101
+ for (const line of lines) {
102
+ if (line.startsWith('#')) {
103
+ seenH1 = true;
104
+ continue;
105
+ }
106
+ if (line.startsWith('*') && line.endsWith('*'))
107
+ continue; // byline
108
+ if (seenH1 && line.length > 30)
109
+ return line.replace(/[*_`]/g, '').trim().slice(0, 300);
110
+ }
111
+ return null;
112
+ }
113
+ /** Extract company/brand name from title (before " — ", " - ", " | ", " · ") */
114
+ function extractCompanyFromTitle(title) {
115
+ const sep = title.match(/^([^|·\-—]+)[|·\-—]/);
116
+ if (sep?.[1])
117
+ return sep[1].trim();
118
+ return title.trim().slice(0, 60);
119
+ }
120
+ /** Smart field-name-aware string extractor */
121
+ function heuristicExtractString(fieldName, content, pageUrl) {
122
+ const lf = fieldName.toLowerCase();
90
123
  const humanName = fieldName.replace(/_/g, ' ');
124
+ const title = extractPageTitle(content);
125
+ // --- Concept-aware extraction ---
126
+ // Company/brand/organization name
127
+ if (/company|brand|organization|org_name/.test(lf)) {
128
+ if (title)
129
+ return extractCompanyFromTitle(title);
130
+ // Fallback: extract from first heading of any level
131
+ const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
132
+ if (anyHeading?.[1])
133
+ return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
134
+ }
135
+ // Title/name/product → first H1 or any heading, stripped of markdown
136
+ if (/^(title|name|product_name|product|heading)$/.test(lf)) {
137
+ const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
138
+ if (rawTitle) {
139
+ // Strip markdown links [text](url) → text, badges ![...](url) → '', etc.
140
+ return rawTitle
141
+ .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images
142
+ .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
143
+ .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
144
+ .replace(/[*_`[\]]/g, '')
145
+ .replace(/&[a-z]+;/g, '') // HTML entities
146
+ .replace(/\s+/g, ' ')
147
+ .trim().slice(0, 150);
148
+ }
149
+ }
150
+ // Description/summary/about → first paragraph
151
+ if (/description|summary|about|overview/.test(lf)) {
152
+ return extractDescription(content) ?? null;
153
+ }
154
+ // URL/website/link → use the URL if we have it
155
+ if (/^(url|website|link|homepage|site)$/.test(lf)) {
156
+ if (pageUrl)
157
+ return pageUrl;
158
+ }
159
+ // Author/writer/by
160
+ if (/author|writer|by/.test(lf)) {
161
+ const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
162
+ if (m?.[1])
163
+ return m[1].trim().slice(0, 100);
164
+ }
165
+ // Date/published/updated
166
+ if (/date|published|updated|modified/.test(lf)) {
167
+ const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
168
+ ?? content.match(/(\d{4}-\d{2}-\d{2})/);
169
+ if (m?.[1])
170
+ return m[1];
171
+ }
172
+ // Email
173
+ if (/email|contact/.test(lf)) {
174
+ const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
175
+ if (m?.[0])
176
+ return m[0];
177
+ }
178
+ // Price/cost/pricing → extract value near $
179
+ if (/price|cost|pricing|fee/.test(lf)) {
180
+ const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
181
+ ?? content.match(/(free|no cost|no charge)/i);
182
+ if (m?.[0])
183
+ return m[0].trim();
184
+ }
185
+ // Language (for GitHub repos)
186
+ if (/language|lang|tech/.test(lf)) {
187
+ const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
188
+ if (m?.[1])
189
+ return m[1];
190
+ }
191
+ // Stars (for GitHub)
192
+ if (/stars?/.test(lf)) {
193
+ const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
194
+ if (m?.[1])
195
+ return m[1].replace(/,/g, '');
196
+ }
197
+ // License
198
+ if (/license/.test(lf)) {
199
+ const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
200
+ if (m?.[1])
201
+ return m[1];
202
+ }
203
+ // --- Generic patterns (exact-ish match) ---
91
204
  const patterns = [
92
- // "field_name: value" or "Field Name: value" patterns
93
205
  new RegExp(`(?:^|\\n)[ \\t]*${humanName}[:\\s]+([^\\n]{5,200})`, 'i'),
94
- // JSON-like "field": "value"
95
206
  new RegExp(`"${fieldName}"\\s*:\\s*"([^"]{1,300})"`, 'i'),
96
- // Markdown bold **Field Name**: value
97
207
  new RegExp(`\\*{1,2}${humanName}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'),
98
- // Heading followed by content
99
208
  new RegExp(`#+\\s*${humanName}\\s*\\n+([^\\n]{5,300})`, 'i'),
100
209
  ];
101
210
  for (const pattern of patterns) {
102
211
  const match = content.match(pattern);
103
- if (match?.[1]) {
212
+ if (match?.[1])
104
213
  return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
105
- }
106
214
  }
107
215
  return null;
108
216
  }
109
217
  /**
110
- * For boolean fields: search for positive/negative indicators near the field name.
218
+ * For boolean fields: search the ENTIRE content for positive/negative indicators.
111
219
  */
112
220
  function heuristicExtractBoolean(fieldName, content) {
113
- const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
221
+ const lf = fieldName.toLowerCase();
114
222
  const ctx = content.toLowerCase();
115
- // Search both underscore and spaced variants
116
- let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
117
- if (fieldIdx === -1)
118
- fieldIdx = ctx.indexOf(humanName);
119
- if (fieldIdx === -1)
120
- return null;
121
- // Look at a window of ±150 chars around the field name
122
- const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
123
- const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
124
- const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
125
- for (const pos of positive) {
126
- if (window.includes(pos))
223
+ // Concept-aware boolean extraction search entire content, not just near field name
224
+ // Free tier / free plan
225
+ if (/free_tier|has_free|is_free/.test(lf)) {
226
+ if (/free tier|free plan|\$0|no cost|no charge|free forever/.test(ctx))
127
227
  return true;
228
+ if (/no free|paid only|subscription required/.test(ctx))
229
+ return false;
128
230
  }
129
- for (const neg of negative) {
130
- if (window.includes(neg))
231
+ // Open source
232
+ if (/open_source|is_open|oss/.test(lf)) {
233
+ if (/open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
234
+ return true;
235
+ if (/closed[- ]source|proprietary|commercial license/.test(ctx))
131
236
  return false;
132
237
  }
238
+ // API availability
239
+ if (/has_api|api_available|has_rest/.test(lf)) {
240
+ if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
241
+ return true;
242
+ }
243
+ // Authentication
244
+ if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
245
+ if (/login|sign in|authentication|api key|bearer token/.test(ctx))
246
+ return true;
247
+ }
248
+ // General approach: search near field name concept
249
+ const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
250
+ let fieldIdx = ctx.indexOf(fieldName.toLowerCase());
251
+ if (fieldIdx === -1)
252
+ fieldIdx = ctx.indexOf(humanName);
253
+ if (fieldIdx !== -1) {
254
+ const window = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
255
+ const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
256
+ const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
257
+ for (const pos of positive) {
258
+ if (window.includes(pos))
259
+ return true;
260
+ }
261
+ for (const neg of negative) {
262
+ if (window.includes(neg))
263
+ return false;
264
+ }
265
+ }
133
266
  return null;
134
267
  }
135
268
  /**
136
269
  * For number fields: find digits near the field name.
137
270
  */
138
271
  function heuristicExtractNumber(fieldName, content) {
272
+ const lf = fieldName.toLowerCase();
273
+ // Stars (GitHub)
274
+ if (/stars?/.test(lf)) {
275
+ const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
276
+ if (m?.[1]) {
277
+ const n = parseFloat(m[1].replace(/,/g, ''));
278
+ return isNaN(n) ? null : n;
279
+ }
280
+ }
281
+ // Forks
282
+ if (/forks?/.test(lf)) {
283
+ const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
284
+ if (m?.[1]) {
285
+ const n = parseFloat(m[1].replace(/,/g, ''));
286
+ return isNaN(n) ? null : n;
287
+ }
288
+ }
289
+ // Rating/score
290
+ if (/rating|score/.test(lf)) {
291
+ const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
292
+ if (m?.[1]) {
293
+ const n = parseFloat(m[1]);
294
+ return isNaN(n) ? null : n;
295
+ }
296
+ }
297
+ // Year
298
+ if (/year/.test(lf)) {
299
+ const m = content.match(/\b(20\d{2})\b/);
300
+ if (m?.[1]) {
301
+ const n = parseInt(m[1]);
302
+ return isNaN(n) ? null : n;
303
+ }
304
+ }
305
+ // Generic: find number near field name
139
306
  const humanName = fieldName.replace(/_/g, '[\\s_-]*');
140
307
  const pattern = new RegExp(`${humanName}[:\\s$]*([\\d,]+\\.?\\d*)`, 'i');
141
308
  const match = content.match(pattern);
@@ -106,8 +106,8 @@ export function createApp(config = {}) {
106
106
  timeoutMs = 120000; // 2min for batch
107
107
  else if (path.includes('/screenshot'))
108
108
  timeoutMs = 60000; // 1min for screenshots
109
- else if (req.query?.render === 'true')
110
- timeoutMs = 60000; // 1min for rendered fetches
109
+ else if (req.query?.render === 'true' || req.query?.stealth === 'true')
110
+ timeoutMs = 60000; // 1min for browser/stealth fetches
111
111
  else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
112
112
  timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
113
113
  req.setTimeout(timeoutMs);
@@ -13,6 +13,52 @@ import { getSchemaTemplate } from '../../core/schema-templates.js';
13
13
  import { quickAnswer } from '../../core/quick-answer.js';
14
14
  import { sendUsageAlertEmail } from '../email-service.js';
15
15
  import { extractLinks } from '../../core/links.js';
16
+ // ── Helper: classify an error thrown by peel() into a FetchErrorType ─────────
17
+ function classifyFetchError(err) {
18
+ const code = err.code || err.name || '';
19
+ const msg = (err.message || '').toLowerCase();
20
+ if (code === 'TIMEOUT' || msg.includes('timeout') || msg.includes('timed out')) {
21
+ return 'timeout';
22
+ }
23
+ if (code === 'BLOCKED' || msg.includes('blocked') || msg.includes('cloudflare challenge') || msg.includes('captcha') || msg.includes('bot detection')) {
24
+ return 'blocked';
25
+ }
26
+ if (msg.includes('http 404') || msg.includes('not found') || msg.includes('dns resolution failed') || msg.includes('enotfound') || msg.includes('getaddrinfo')) {
27
+ return 'not_found';
28
+ }
29
+ if (msg.match(/http\s+5\d{2}/) || msg.includes('server error') || msg.includes('internal server')) {
30
+ return 'server_error';
31
+ }
32
+ if (code === 'NETWORK' || msg.includes('network') || msg.includes('econnrefused') || msg.includes('connection refused') || msg.includes('connection reset')) {
33
+ return 'network';
34
+ }
35
+ return 'unknown';
36
+ }
37
+ // ── Helper: build a clean, user-facing error message from a peel() error ─────
38
+ function buildFetchErrorMessage(err) {
39
+ const type = classifyFetchError(err);
40
+ const hints = {
41
+ timeout: 'Try increasing timeout with ?timeout=20000, or use render=true for JS-heavy sites.',
42
+ blocked: 'This site blocks automated requests. Try render=true or stealth=true.',
43
+ not_found: 'Verify the URL is correct and the site is accessible.',
44
+ server_error: 'The target site returned a server error. Try again later.',
45
+ network: 'Could not connect to the target URL. Verify the URL is correct and the site is online.',
46
+ unknown: undefined,
47
+ };
48
+ // Sanitize message: strip HTML chars, truncate
49
+ const safeMsg = (err.message || 'An unexpected error occurred while fetching the URL')
50
+ .replace(/[<>"']/g, '')
51
+ .trim();
52
+ const messages = {
53
+ timeout: `The website took too long to respond. Try with render=true or stealth=true for JavaScript-heavy sites.`,
54
+ blocked: `This website is blocking automated access (bot protection detected).`,
55
+ not_found: `The URL could not be reached — the domain may not exist or the page was not found.`,
56
+ server_error: `The target website returned a server error while processing the request.`,
57
+ network: `Could not reach this website. The server may be down or the URL may be incorrect.`,
58
+ unknown: safeMsg,
59
+ };
60
+ return { type, message: messages[type] || safeMsg, hint: hints[type] };
61
+ }
16
62
  // ── Helper: extractive summarizer (TF-IDF-like sentence scoring) ─────────────
17
63
  function extractSummary(content, maxWords = 150) {
18
64
  if (!content)
@@ -527,26 +573,24 @@ export function createFetchRouter(authStore) {
527
573
  });
528
574
  }
529
575
  // SECURITY: Sanitize error messages to prevent information disclosure
530
- if (err.code) {
576
+ if (res.headersSent)
577
+ return; // Timeout middleware already responded
578
+ const requestUrl = req.query.url;
579
+ if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
531
580
  // WebPeelError from core library - safe to expose with helpful context
532
- if (res.headersSent)
533
- return; // Timeout middleware already responded
534
- const safeMessage = err.message.replace(/[<>"']/g, ''); // Remove HTML chars
535
- const statusCode = err.code === 'TIMEOUT' ? 504
536
- : err.code === 'BLOCKED' ? 403
537
- : err.code === 'NETWORK' ? 502
538
- : 500;
539
- const hints = {
540
- TIMEOUT: 'Try increasing timeout with ?wait=10000, or use render=true for JS-heavy sites.',
541
- BLOCKED: 'This site blocks automated requests. Try adding render=true or use stealth mode (costs 5 credits).',
542
- NETWORK: 'Could not reach the target URL. Verify the URL is correct and the site is online.',
543
- };
581
+ const { type, message, hint } = buildFetchErrorMessage(err);
582
+ const statusCode = type === 'timeout' ? 504
583
+ : type === 'blocked' ? 403
584
+ : type === 'not_found' ? 404
585
+ : type === 'network' || type === 'server_error' ? 502
586
+ : 500;
544
587
  res.status(statusCode).json({
545
588
  success: false,
546
589
  error: {
547
- type: err.code,
548
- message: safeMessage,
549
- hint: hints[err.code] || undefined,
590
+ type,
591
+ message,
592
+ url: requestUrl,
593
+ ...(hint ? { hint } : {}),
550
594
  docs: 'https://webpeel.dev/docs/api-reference#errors',
551
595
  },
552
596
  requestId: req.requestId,
@@ -555,13 +599,12 @@ export function createFetchRouter(authStore) {
555
599
  else {
556
600
  // Unexpected error - generic message only
557
601
  console.error('Fetch error:', err); // Log full error server-side
558
- if (res.headersSent)
559
- return; // Timeout middleware already responded
560
602
  res.status(500).json({
561
603
  success: false,
562
604
  error: {
563
- type: 'internal_error',
605
+ type: 'unknown',
564
606
  message: 'An unexpected error occurred while fetching the URL. If this persists, check https://webpeel.dev/status',
607
+ url: requestUrl,
565
608
  docs: 'https://webpeel.dev/docs/api-reference#errors',
566
609
  },
567
610
  requestId: req.requestId,
@@ -1028,23 +1071,21 @@ export function createFetchRouter(authStore) {
1028
1071
  console.error('POST fetch/scrape error:', err);
1029
1072
  if (res.headersSent)
1030
1073
  return; // Timeout middleware already responded
1031
- if (err.code) {
1032
- const safeMessage = err.message.replace(/[<>"']/g, '');
1033
- const statusCode = err.code === 'TIMEOUT' ? 504
1034
- : err.code === 'BLOCKED' ? 403
1035
- : err.code === 'NETWORK' ? 502
1036
- : 500;
1037
- const hints = {
1038
- TIMEOUT: 'Try increasing timeout, or set render:true for JS-heavy sites.',
1039
- BLOCKED: 'Site blocks automated requests. Try render:true or stealth mode.',
1040
- NETWORK: 'Could not reach the target URL. Verify it is correct and online.',
1041
- };
1074
+ const postUrl = req.body?.url;
1075
+ if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
1076
+ const { type, message, hint } = buildFetchErrorMessage(err);
1077
+ const statusCode = type === 'timeout' ? 504
1078
+ : type === 'blocked' ? 403
1079
+ : type === 'not_found' ? 404
1080
+ : type === 'network' || type === 'server_error' ? 502
1081
+ : 500;
1042
1082
  res.status(statusCode).json({
1043
1083
  success: false,
1044
1084
  error: {
1045
- type: err.code,
1046
- message: safeMessage,
1047
- hint: hints[err.code] || undefined,
1085
+ type,
1086
+ message,
1087
+ url: postUrl,
1088
+ ...(hint ? { hint } : {}),
1048
1089
  docs: 'https://webpeel.dev/docs/api-reference#errors',
1049
1090
  },
1050
1091
  requestId: req.requestId,
@@ -1054,8 +1095,9 @@ export function createFetchRouter(authStore) {
1054
1095
  res.status(500).json({
1055
1096
  success: false,
1056
1097
  error: {
1057
- type: 'internal_error',
1098
+ type: 'unknown',
1058
1099
  message: 'An unexpected error occurred. If this persists, check https://webpeel.dev/status',
1100
+ url: postUrl,
1059
1101
  docs: 'https://webpeel.dev/docs/api-reference#errors',
1060
1102
  },
1061
1103
  requestId: req.requestId,
package/dist/types.d.ts CHANGED
@@ -419,6 +419,18 @@ export interface PeelEnvelope {
419
419
  */
420
420
  totalAvailable?: number;
421
421
  }
422
+ /**
423
+ * Programmatic error classification for fetch failures.
424
+ * Returned in the `error.type` field of API error responses.
425
+ *
426
+ * - `timeout` — Site took too long to respond
427
+ * - `blocked` — Site actively blocked the request (403, CAPTCHA, bot detection)
428
+ * - `not_found` — 404 or the domain/URL does not exist
429
+ * - `server_error` — Target site returned a 5xx error
430
+ * - `network` — DNS failure, connection refused, or other network-level issue
431
+ * - `unknown` — Unclassified error
432
+ */
433
+ export type FetchErrorType = 'timeout' | 'blocked' | 'not_found' | 'server_error' | 'network' | 'unknown';
422
434
  export declare class WebPeelError extends Error {
423
435
  code?: string | undefined;
424
436
  constructor(message: string, code?: string | undefined);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.5",
3
+ "version": "0.21.7",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",