webpeel 0.21.5 → 0.21.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/utils.js +71 -10
- package/dist/core/domain-extractors.js +20 -2
- package/dist/core/pipeline.js +20 -3
- package/dist/core/structured-extract.js +190 -23
- package/dist/server/app.js +2 -2
- package/dist/server/routes/fetch.js +76 -34
- package/dist/types.d.ts +12 -0
- package/package.json +1 -1
package/dist/cli/utils.js
CHANGED
|
@@ -131,22 +131,30 @@ export function parseActions(actionStrings) {
|
|
|
131
131
|
*/
|
|
132
132
|
export function formatError(error, _url, options) {
|
|
133
133
|
const msg = error.message || String(error);
|
|
134
|
+
const errorType = error.errorType || '';
|
|
134
135
|
const lines = [`\x1b[31m✖ ${msg}\x1b[0m`];
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
}
|
|
138
|
-
else if (msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
|
|
136
|
+
// Check structured errorType from API first (takes precedence over message heuristics)
|
|
137
|
+
if (errorType === 'timeout' || msg.includes('took too long') || msg.includes('timeout') || msg.includes('Timeout') || msg.includes('Navigation timeout')) {
|
|
139
138
|
lines.push('\x1b[33m💡 Try increasing timeout: --timeout 60000\x1b[0m');
|
|
140
139
|
if (!options.render) {
|
|
141
140
|
lines.push('\x1b[33m💡 Site may need browser rendering: --render\x1b[0m');
|
|
142
141
|
}
|
|
143
142
|
}
|
|
144
|
-
else if (msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
|
|
143
|
+
else if (errorType === 'blocked' || msg.includes('blocking automated') || msg.includes('bot protection') || msg.includes('blocked') || msg.includes('403') || msg.includes('Access Denied') || msg.includes('challenge')) {
|
|
145
144
|
if (!options.stealth) {
|
|
146
145
|
lines.push('\x1b[33m💡 Try stealth mode to bypass bot detection: --stealth\x1b[0m');
|
|
147
146
|
}
|
|
148
147
|
lines.push('\x1b[33m💡 Try a different user agent: --ua "Mozilla/5.0..."\x1b[0m');
|
|
149
148
|
}
|
|
149
|
+
else if (errorType === 'not_found' || msg.includes('domain may not exist') || msg.includes('not found') || msg.includes('ENOTFOUND') || msg.includes('net::ERR_') || msg.includes('ECONNREFUSED')) {
|
|
150
|
+
lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
|
|
151
|
+
}
|
|
152
|
+
else if (errorType === 'network' || msg.includes('Could not reach') || msg.includes('could not connect') || msg.includes('ECONNREFUSED') || msg.includes('ENOTFOUND')) {
|
|
153
|
+
lines.push('\x1b[33m💡 Check the URL is correct and the site is accessible.\x1b[0m');
|
|
154
|
+
}
|
|
155
|
+
else if (errorType === 'server_error' || msg.includes('server error')) {
|
|
156
|
+
lines.push('\x1b[33m💡 The target site returned a server error. Try again in a moment.\x1b[0m');
|
|
157
|
+
}
|
|
150
158
|
else if (msg.includes('empty') || msg.includes('no content') || msg.includes('0 tokens')) {
|
|
151
159
|
if (!options.render) {
|
|
152
160
|
lines.push('\x1b[33m💡 Page may be JavaScript-rendered. Try: --render\x1b[0m');
|
|
@@ -212,7 +220,40 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
|
|
|
212
220
|
}
|
|
213
221
|
if (!res.ok) {
|
|
214
222
|
const body = await res.text().catch(() => '');
|
|
215
|
-
|
|
223
|
+
// Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
|
|
224
|
+
const isHtml = body.trimStart().startsWith('<') || body.includes('<!DOCTYPE') || body.includes('<html');
|
|
225
|
+
let errorMsg;
|
|
226
|
+
let errorType;
|
|
227
|
+
if (res.status === 502 || res.status === 503 || res.status === 504) {
|
|
228
|
+
errorMsg = `Could not reach this website. The site may be blocking our server or timing out.`;
|
|
229
|
+
errorType = res.status === 504 ? 'timeout' : 'network';
|
|
230
|
+
}
|
|
231
|
+
else if (isHtml) {
|
|
232
|
+
errorMsg = `Server returned an error page (${res.status})`;
|
|
233
|
+
}
|
|
234
|
+
else {
|
|
235
|
+
// Try to parse a structured JSON error response
|
|
236
|
+
try {
|
|
237
|
+
const json = JSON.parse(body);
|
|
238
|
+
const errObj = json?.error;
|
|
239
|
+
if (errObj && typeof errObj === 'object') {
|
|
240
|
+
errorMsg = typeof errObj.message === 'string' ? errObj.message : (body.slice(0, 200) || 'Unknown error');
|
|
241
|
+
if (typeof errObj.type === 'string')
|
|
242
|
+
errorType = errObj.type;
|
|
243
|
+
}
|
|
244
|
+
else {
|
|
245
|
+
errorMsg = body.slice(0, 200) || 'Unknown error';
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
catch {
|
|
249
|
+
errorMsg = body.slice(0, 200) || 'Unknown error';
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
const err = new Error(`${errorMsg}`);
|
|
253
|
+
if (errorType)
|
|
254
|
+
err.errorType = errorType;
|
|
255
|
+
err.statusCode = res.status;
|
|
256
|
+
throw err;
|
|
216
257
|
}
|
|
217
258
|
const data = await res.json();
|
|
218
259
|
// Map API response to PeelResult shape that the CLI already handles
|
|
@@ -393,20 +434,40 @@ export function classifyErrorCode(error) {
|
|
|
393
434
|
// Check for our custom _code first (set in pre-fetch validation)
|
|
394
435
|
if (error._code)
|
|
395
436
|
return error._code;
|
|
437
|
+
// Check for structured errorType from API responses (set by fetchViaApi)
|
|
438
|
+
const errorType = error.errorType;
|
|
439
|
+
if (errorType) {
|
|
440
|
+
const typeMap = {
|
|
441
|
+
timeout: 'TIMEOUT',
|
|
442
|
+
blocked: 'BLOCKED',
|
|
443
|
+
not_found: 'NOT_FOUND',
|
|
444
|
+
server_error: 'SERVER_ERROR',
|
|
445
|
+
network: 'NETWORK',
|
|
446
|
+
unknown: 'FETCH_FAILED',
|
|
447
|
+
};
|
|
448
|
+
if (typeMap[errorType])
|
|
449
|
+
return typeMap[errorType];
|
|
450
|
+
}
|
|
396
451
|
const msg = error.message.toLowerCase();
|
|
397
452
|
const name = error.name || '';
|
|
398
|
-
if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out')) {
|
|
453
|
+
if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out') || msg.includes('took too long')) {
|
|
399
454
|
return 'TIMEOUT';
|
|
400
455
|
}
|
|
401
|
-
if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
|
|
456
|
+
if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare') || msg.includes('bot protection')) {
|
|
402
457
|
return 'BLOCKED';
|
|
403
458
|
}
|
|
404
|
-
if (msg.includes('
|
|
405
|
-
return '
|
|
459
|
+
if (msg.includes('domain may not exist') || msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed')) {
|
|
460
|
+
return 'NOT_FOUND';
|
|
461
|
+
}
|
|
462
|
+
if (msg.includes('http 404') || msg.includes('page was not found')) {
|
|
463
|
+
return 'NOT_FOUND';
|
|
406
464
|
}
|
|
407
465
|
if (msg.includes('invalid url') || msg.includes('invalid hostname') || msg.includes('only http')) {
|
|
408
466
|
return 'INVALID_URL';
|
|
409
467
|
}
|
|
468
|
+
if (msg.includes('could not reach') || msg.includes('could not connect') || msg.includes('econnrefused')) {
|
|
469
|
+
return 'NETWORK';
|
|
470
|
+
}
|
|
410
471
|
return 'FETCH_FAILED';
|
|
411
472
|
}
|
|
412
473
|
/**
|
|
@@ -1274,9 +1274,27 @@ async function youtubeExtractor(_html, url) {
|
|
|
1274
1274
|
const parts = [];
|
|
1275
1275
|
parts.push(`# ${title}`);
|
|
1276
1276
|
parts.push(headerLine);
|
|
1277
|
+
/**
 * Strip music note symbols from transcript/caption text.
 * YouTube auto-captions include ♪ and 🎵 as music cues.
 * Patterns cleaned:
 *   [♪♪♪]               → (removed)
 *   ♪ text ♪            → text
 *   standalone ♪ / 🎵   → (removed)
 *
 * The note-matching patterns carry the `u` flag. 🎵 (U+1F3B5) is stored as a
 * surrogate pair, and without `u` a character class treats the two halves as
 * independent members. The high half (\uD83C) is shared by many unrelated
 * emoji (🎉, 🎶, 🏆 …), so it would be stripped on its own and leave a lone,
 * invalid low surrogate behind.
 *
 * @param {string} text Raw caption, summary or key-point text.
 * @returns {string} Text without music cues, whitespace-collapsed and trimmed.
 */
const cleanMusicNotes = (text) => text
    // Remove bracketed music cues: [♪], [♪♪♪], [🎵🎵🎵], etc.
    .replace(/\[[♪🎵]+\]/gu, '')
    // Unwrap ♪ text ♪ → text (keep the words between notes)
    .replace(/♪\s*([^♪]*?)\s*♪/gu, (_, inner) => inner.trim())
    // Remove any remaining standalone ♪ or 🎵 (whole code points only)
    .replace(/[♪🎵]+/gu, '')
    // Collapse extra whitespace introduced by removals
    .replace(/\s{2,}/g, ' ')
    .trim();
|
|
1277
1295
|
// Summary section
|
|
1278
1296
|
if (transcript.summary && hasTranscript) {
|
|
1279
|
-
let summaryText = transcript.summary;
|
|
1297
|
+
let summaryText = cleanMusicNotes(transcript.summary);
|
|
1280
1298
|
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1281
1299
|
parts.push(`## Summary\n\n${summaryText}`);
|
|
1282
1300
|
}
|
|
@@ -1296,7 +1314,7 @@ async function youtubeExtractor(_html, url) {
|
|
|
1296
1314
|
// Full Transcript section (only if we have real transcript segments)
|
|
1297
1315
|
// Add intelligent paragraph breaks for readability
|
|
1298
1316
|
if (hasTranscript) {
|
|
1299
|
-
let readableText = transcript.fullText;
|
|
1317
|
+
let readableText = cleanMusicNotes(transcript.fullText);
|
|
1300
1318
|
// Break into paragraphs: after sentence-ending punctuation followed by a capital letter
|
|
1301
1319
|
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1302
1320
|
// Collapse any triple+ newlines
|
package/dist/core/pipeline.js
CHANGED
|
@@ -193,19 +193,32 @@ export async function handleYouTube(ctx) {
|
|
|
193
193
|
headerParts.push(`**${viewStr}**`);
|
|
194
194
|
if (publishStr)
|
|
195
195
|
headerParts.push(`**Published:** ${publishStr}`);
|
|
196
|
+
/**
 * Strip music note symbols from YouTube auto-caption text.
 * Cleans: [♪♪♪], [🎵🎵🎵], ♪ text ♪ (keeps inner text), standalone ♪ / 🎵
 *
 * The note-matching patterns use the `u` flag so 🎵 is matched as one code
 * point. Without it the character class holds the two surrogate halves
 * separately and strips the shared high surrogate out of other emoji
 * (🎉, 🎶, 🏆 …), corrupting them.
 *
 * @param {string} text Raw caption, summary or key-point text.
 * @returns {string} Text without music cues, whitespace-collapsed and trimmed.
 */
const cleanMusicNotes = (text) => text
    // Bracketed cues such as [♪♪♪] or [🎵🎵🎵]
    .replace(/\[[♪🎵]+\]/gu, '')
    // ♪ lyrics ♪ → lyrics
    .replace(/♪\s*([^♪]*?)\s*♪/gu, (_, inner) => inner.trim())
    // Leftover standalone notes
    .replace(/[♪🎵]+/gu, '')
    // Collapse the gaps the removals leave behind
    .replace(/\s{2,}/g, ' ')
    .trim();
|
|
196
206
|
// Add paragraph breaks to transcript for readability
|
|
197
|
-
let readableText = transcript.fullText;
|
|
207
|
+
let readableText = cleanMusicNotes(transcript.fullText);
|
|
198
208
|
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
199
209
|
readableText = readableText.replace(/\n{3,}/g, '\n\n');
|
|
200
210
|
// Build a clean markdown representation of the video + transcript
|
|
201
211
|
const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
|
|
202
212
|
if (transcript.summary) {
|
|
203
|
-
let summaryText = transcript.summary;
|
|
213
|
+
let summaryText = cleanMusicNotes(transcript.summary);
|
|
204
214
|
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
205
215
|
parts.push(`## Summary\n\n${summaryText}`);
|
|
206
216
|
}
|
|
207
217
|
if (transcript.keyPoints && transcript.keyPoints.length > 0) {
|
|
208
|
-
|
|
218
|
+
const cleanedKps = transcript.keyPoints.map((kp) => cleanMusicNotes(kp)).filter((kp) => kp.length > 0);
|
|
219
|
+
if (cleanedKps.length > 0) {
|
|
220
|
+
parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
|
|
221
|
+
}
|
|
209
222
|
}
|
|
210
223
|
if (transcript.chapters && transcript.chapters.length > 0) {
|
|
211
224
|
parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
|
|
@@ -927,6 +940,10 @@ export async function postProcess(ctx) {
|
|
|
927
940
|
if (ddResult) {
|
|
928
941
|
ctx.domainData = ddResult;
|
|
929
942
|
ctx.content = ddResult.cleanContent;
|
|
943
|
+
// Update title from domain extractor (takes precedence over HTML page title)
|
|
944
|
+
if (ddResult.structured?.title) {
|
|
945
|
+
ctx.title = ddResult.structured.title;
|
|
946
|
+
}
|
|
930
947
|
}
|
|
931
948
|
}
|
|
932
949
|
catch (e) {
|
|
@@ -86,56 +86,223 @@ function parseLLMJson(text) {
|
|
|
86
86
|
/**
|
|
87
87
|
* For string fields: search for field name in content, extract surrounding text.
|
|
88
88
|
*/
|
|
89
|
-
|
|
89
|
+
/**
 * Extract the first level-1 markdown heading (`# Title`) from content.
 *
 * The gap between `#` and the heading text is matched with `[ \t]+` rather
 * than `\s+`: `\s` also matches newlines, so an empty `#` line would otherwise
 * swallow the following paragraph and report it as the title.
 *
 * @param {string} content Markdown content to scan.
 * @returns {string|null} Heading text with `*`, `_` and backtick characters
 *   removed and trimmed, or null when there is no H1 or it is empty once the
 *   markup is stripped.
 */
function extractPageTitle(content) {
    const h1 = content.match(/^#[ \t]+(.+)$/m);
    if (!h1)
        return null;
    const cleaned = h1[1].replace(/[*_`]/g, '').trim();
    // A heading made only of markup ("# ***") carries no usable title.
    return cleaned || null;
}
|
|
96
|
+
/**
 * Extract a description: the first substantial line that follows a heading.
 *
 * Walks the non-empty lines in order. Any line starting with `#` marks that a
 * heading has been seen; lines wrapped in `*…*` are treated as bylines
 * (e.g. `*5 min read*`) and skipped; the first remaining line longer than 30
 * characters after a heading is returned.
 *
 * Each line is trimmed before it is classified, so CRLF line endings or
 * leading indentation do not stop a byline from being recognised.
 *
 * @param {string} content Markdown content to scan.
 * @returns {string|null} Up to 300 characters of the description with `*`,
 *   `_` and backtick characters removed, or null when nothing qualifies.
 */
function extractDescription(content) {
    let seenHeading = false;
    for (const rawLine of content.split('\n')) {
        const line = rawLine.trim();
        if (!line)
            continue;
        if (line.startsWith('#')) {
            seenHeading = true;
            continue;
        }
        if (line.startsWith('*') && line.endsWith('*'))
            continue; // byline
        if (seenHeading && line.length > 30)
            return line.replace(/[*_`]/g, '').trim().slice(0, 300);
    }
    return null;
}
|
|
113
|
+
/**
 * Extract the company/brand name from a page title: the part before the first
 * title separator (" — ", " – ", " - ", "|", "·").
 *
 * A hyphen only counts as a separator when it has whitespace on both sides,
 * so hyphenated brand names ("Coca-Cola — Official Site") are kept whole.
 *
 * @param {string} title Page title, expected to be a single line.
 * @returns {string} The trimmed text before the separator. When there is no
 *   separator, or nothing but whitespace precedes it, the trimmed title
 *   truncated to 60 characters.
 */
function extractCompanyFromTitle(title) {
    const sep = title.match(/^(.+?)(?:\s+-\s+|\s*[|·—–]\s*)/);
    const head = sep?.[1]?.trim();
    if (head)
        return head;
    return title.trim().slice(0, 60);
}
|
|
120
|
+
/**
 * Smart field-name-aware string extractor.
 *
 * Tries a series of concept-specific rules keyed off the (lower-cased) field
 * name, in order: company/brand, title/name, description, URL, author, date,
 * email, price, language, stars, license. A rule that recognises the field
 * but finds nothing in the content falls through to the later rules, except
 * the description rule, which always returns its result. If no concept rule
 * produces a value, generic "Field Name: value" style patterns are tried.
 *
 * @param {string} fieldName Schema field name, e.g. `company_name`.
 * @param {string} content Markdown content of the page.
 * @param {string} [pageUrl] URL of the page; returned for URL-like fields.
 * @returns {string|null} The extracted value, or null when nothing matched.
 */
function heuristicExtractString(fieldName, content, pageUrl) {
    const lf = fieldName.toLowerCase();
    // The field name is interpolated into RegExp sources below. Escape regex
    // metacharacters so names such as "c++" or "price($)" cannot throw a
    // SyntaxError or change the meaning of the pattern.
    const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const humanName = escapeRegExp(fieldName.replace(/_/g, ' '));
    const safeFieldName = escapeRegExp(fieldName);
    const title = extractPageTitle(content);
    // --- Concept-aware extraction ---
    // Company/brand/organization name
    if (/company|brand|organization|org_name/.test(lf)) {
        if (title)
            return extractCompanyFromTitle(title);
        // Fallback: extract from first heading of any level
        const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
        if (anyHeading?.[1])
            return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
    }
    // Title/name/product → first H1 or any heading, stripped of markdown
    if (/^(title|name|product_name|product|heading)$/.test(lf)) {
        const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
        if (rawTitle) {
            // Strip markdown links [text](url) → text, image badges → '', etc.
            return rawTitle
                .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images
                .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
                .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
                .replace(/[*_`[\]]/g, '')
                .replace(/&[a-z]+;/g, '') // HTML entities
                .replace(/\s+/g, ' ')
                .trim().slice(0, 150);
        }
    }
    // Description/summary/about → first paragraph
    if (/description|summary|about|overview/.test(lf)) {
        return extractDescription(content) ?? null;
    }
    // URL/website/link → use the URL if we have it
    if (/^(url|website|link|homepage|site)$/.test(lf)) {
        if (pageUrl)
            return pageUrl;
    }
    // Author/writer/by — "by" must be a whole underscore-delimited word so
    // that unrelated fields such as "hobby" or "nearby" are not treated as
    // author fields.
    if (/author|writer|byline|(?:^|_)by(?:_|$)/.test(lf)) {
        const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
        if (m?.[1])
            return m[1].trim().slice(0, 100);
    }
    // Date/published/updated
    if (/date|published|updated|modified/.test(lf)) {
        const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
            ?? content.match(/(\d{4}-\d{2}-\d{2})/);
        if (m?.[1])
            return m[1];
    }
    // Email
    if (/email|contact/.test(lf)) {
        const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
        if (m?.[0])
            return m[0];
    }
    // Price/cost/pricing → extract value near $
    if (/price|cost|pricing|fee/.test(lf)) {
        const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
            ?? content.match(/(free|no cost|no charge)/i);
        if (m?.[0])
            return m[0].trim();
    }
    // Language (for GitHub repos)
    if (/language|lang|tech/.test(lf)) {
        const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
        if (m?.[1])
            return m[1];
    }
    // Stars (for GitHub)
    if (/stars?/.test(lf)) {
        const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
        if (m?.[1])
            return m[1].replace(/,/g, '');
    }
    // License
    if (/license/.test(lf)) {
        const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
        if (m?.[1])
            return m[1];
    }
    // --- Generic patterns (exact-ish match) ---
    const patterns = [
        // "Field Name: value" at the start of a line
        new RegExp(`(?:^|\\n)[ \\t]*${humanName}[:\\s]+([^\\n]{5,200})`, 'i'),
        // JSON-like "field": "value"
        new RegExp(`"${safeFieldName}"\\s*:\\s*"([^"]{1,300})"`, 'i'),
        // Markdown bold **Field Name**: value
        new RegExp(`\\*{1,2}${humanName}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'),
        // Heading followed by content
        new RegExp(`#+\\s*${humanName}\\s*\\n+([^\\n]{5,300})`, 'i'),
    ];
    for (const pattern of patterns) {
        const match = content.match(pattern);
        if (match?.[1])
            return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
    }
    return null;
}
|
|
109
217
|
/**
 * For boolean fields: search the ENTIRE content for positive/negative indicators.
 *
 * Concept-specific rules (free tier, open source, API availability,
 * authentication) are tried first and look at the whole lower-cased content.
 * If none applies or none decides, the text around the first occurrence of
 * the field name (80 characters before, 200 after) is scanned for generic
 * yes/no words.
 *
 * Negative phrases that contain a positive word ("not supported",
 * "unavailable", …) are checked before the positive list; otherwise the
 * embedded positive ("supported", "available") would win and the field would
 * be reported as true.
 *
 * @param {string} fieldName Schema field name, e.g. `has_free_tier`.
 * @param {string} content Page content to search.
 * @returns {boolean|null} true/false when an indicator is found, null when
 *   the content gives no signal.
 */
function heuristicExtractBoolean(fieldName, content) {
    const lf = fieldName.toLowerCase();
    const ctx = content.toLowerCase();
    // Concept-aware boolean extraction — search entire content, not just near field name
    // Free tier / free plan
    if (/free_tier|has_free|is_free/.test(lf)) {
        if (/free tier|free plan|\$0|no cost|no charge|free forever/.test(ctx))
            return true;
        if (/no free|paid only|subscription required/.test(ctx))
            return false;
    }
    // Open source
    if (/open_source|is_open|oss/.test(lf)) {
        if (/open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
            return true;
        if (/closed[- ]source|proprietary|commercial license/.test(ctx))
            return false;
    }
    // API availability
    if (/has_api|api_available|has_rest/.test(lf)) {
        if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
            return true;
    }
    // Authentication
    if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
        if (/login|sign in|authentication|api key|bearer token/.test(ctx))
            return true;
    }
    // General approach: search near field name concept
    const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
    let fieldIdx = ctx.indexOf(lf);
    if (fieldIdx === -1)
        fieldIdx = ctx.indexOf(humanName);
    if (fieldIdx === -1)
        return null;
    const nearby = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx + 200);
    // Negatives that embed a positive word must be tested first.
    const shadowingNegatives = [
        'not supported', 'unsupported', 'unavailable', 'not available',
        'not enabled', 'not included', 'not free', 'not open source', 'not open-source',
    ];
    const positive = ['yes', 'true', 'open source', 'open-source', 'available', 'enabled', 'supported', 'free', 'included'];
    const negative = ['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded'];
    if (shadowingNegatives.some((phrase) => nearby.includes(phrase)))
        return false;
    if (positive.some((phrase) => nearby.includes(phrase)))
        return true;
    if (negative.some((phrase) => nearby.includes(phrase)))
        return false;
    return null;
}
|
|
135
268
|
/**
|
|
136
269
|
* For number fields: find digits near the field name.
|
|
137
270
|
*/
|
|
138
271
|
function heuristicExtractNumber(fieldName, content) {
|
|
272
|
+
const lf = fieldName.toLowerCase();
|
|
273
|
+
// Stars (GitHub)
|
|
274
|
+
if (/stars?/.test(lf)) {
|
|
275
|
+
const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
|
|
276
|
+
if (m?.[1]) {
|
|
277
|
+
const n = parseFloat(m[1].replace(/,/g, ''));
|
|
278
|
+
return isNaN(n) ? null : n;
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
// Forks
|
|
282
|
+
if (/forks?/.test(lf)) {
|
|
283
|
+
const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
|
|
284
|
+
if (m?.[1]) {
|
|
285
|
+
const n = parseFloat(m[1].replace(/,/g, ''));
|
|
286
|
+
return isNaN(n) ? null : n;
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
// Rating/score
|
|
290
|
+
if (/rating|score/.test(lf)) {
|
|
291
|
+
const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
|
|
292
|
+
if (m?.[1]) {
|
|
293
|
+
const n = parseFloat(m[1]);
|
|
294
|
+
return isNaN(n) ? null : n;
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
// Year
|
|
298
|
+
if (/year/.test(lf)) {
|
|
299
|
+
const m = content.match(/\b(20\d{2})\b/);
|
|
300
|
+
if (m?.[1]) {
|
|
301
|
+
const n = parseInt(m[1]);
|
|
302
|
+
return isNaN(n) ? null : n;
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
// Generic: find number near field name
|
|
139
306
|
const humanName = fieldName.replace(/_/g, '[\\s_-]*');
|
|
140
307
|
const pattern = new RegExp(`${humanName}[:\\s$]*([\\d,]+\\.?\\d*)`, 'i');
|
|
141
308
|
const match = content.match(pattern);
|
package/dist/server/app.js
CHANGED
|
@@ -106,8 +106,8 @@ export function createApp(config = {}) {
|
|
|
106
106
|
timeoutMs = 120000; // 2min for batch
|
|
107
107
|
else if (path.includes('/screenshot'))
|
|
108
108
|
timeoutMs = 60000; // 1min for screenshots
|
|
109
|
-
else if (req.query?.render === 'true')
|
|
110
|
-
timeoutMs = 60000; // 1min for
|
|
109
|
+
else if (req.query?.render === 'true' || req.query?.stealth === 'true')
|
|
110
|
+
timeoutMs = 60000; // 1min for browser/stealth fetches
|
|
111
111
|
else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
|
|
112
112
|
timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
|
|
113
113
|
req.setTimeout(timeoutMs);
|
|
@@ -13,6 +13,52 @@ import { getSchemaTemplate } from '../../core/schema-templates.js';
|
|
|
13
13
|
import { quickAnswer } from '../../core/quick-answer.js';
|
|
14
14
|
import { sendUsageAlertEmail } from '../email-service.js';
|
|
15
15
|
import { extractLinks } from '../../core/links.js';
|
|
16
|
+
// ── Helper: classify an error thrown by peel() into a FetchErrorType ─────────
/**
 * Map an error onto one of the fetch error types
 * ('timeout' | 'blocked' | 'not_found' | 'server_error' | 'network' | 'unknown').
 *
 * Checks run in the order listed and the first match wins. Each check looks at
 * the error's code (falling back to its name), its class name, and its
 * lower-cased message.
 *
 * Besides the TIMEOUT/BLOCKED/NETWORK codes, the error class names
 * (TimeoutError, BlockedError, NetworkError), common Node.js system error
 * codes (ETIMEDOUT, ECONNRESET, …) and "HTTP 403" messages are recognised, so
 * those errors are not reported as 'unknown'.
 *
 * @param {{code?: string, name?: string, message?: string}} err Error to classify.
 * @returns {string} The error type.
 */
function classifyFetchError(err) {
    const code = err?.code || err?.name || '';
    const name = err?.name || '';
    const msg = String(err?.message || '').toLowerCase();
    const codeIs = (...candidates) => candidates.includes(code);
    if (codeIs('TIMEOUT', 'ETIMEDOUT', 'ESOCKETTIMEDOUT') || name === 'TimeoutError'
        || msg.includes('timeout') || msg.includes('timed out') || msg.includes('etimedout')) {
        return 'timeout';
    }
    if (code === 'BLOCKED' || name === 'BlockedError'
        || msg.includes('blocked') || msg.includes('cloudflare challenge') || msg.includes('captcha') || msg.includes('bot detection')
        || /\bhttp\s+403\b/.test(msg)) {
        return 'blocked';
    }
    if (code === 'ENOTFOUND'
        || msg.includes('http 404') || msg.includes('not found') || msg.includes('dns resolution failed') || msg.includes('enotfound') || msg.includes('getaddrinfo')) {
        return 'not_found';
    }
    if (/http\s+5\d{2}/.test(msg) || msg.includes('server error') || msg.includes('internal server')) {
        return 'server_error';
    }
    if (codeIs('NETWORK', 'ECONNREFUSED', 'ECONNRESET', 'EHOSTUNREACH', 'ENETUNREACH') || name === 'NetworkError'
        || msg.includes('network') || msg.includes('econnrefused') || msg.includes('econnreset')
        || msg.includes('connection refused') || msg.includes('connection reset') || msg.includes('socket hang up')) {
        return 'network';
    }
    return 'unknown';
}
|
|
37
|
+
// ── Helper: build a clean, user-facing error message from a peel() error ─────
/**
 * Build the user-facing parts of an API error response.
 *
 * The error is classified with classifyFetchError(). Every type except
 * 'unknown' gets a fixed message and a hint. 'unknown' falls back to the
 * error's own message, sanitized: the characters < > " ' are removed, the
 * text is trimmed and capped at 300 characters. If nothing is left, a generic
 * message is used.
 *
 * @param {{code?: string, name?: string, message?: string}} err Error thrown while fetching.
 * @returns {{type: string, message: string, hint: (string|undefined)}}
 */
function buildFetchErrorMessage(err) {
    const MAX_MESSAGE_LENGTH = 300;
    const fallbackMsg = 'An unexpected error occurred while fetching the URL';
    const type = classifyFetchError(err);
    const hints = {
        timeout: 'Try increasing timeout with ?timeout=20000, or use render=true for JS-heavy sites.',
        blocked: 'This site blocks automated requests. Try render=true or stealth=true.',
        not_found: 'Verify the URL is correct and the site is accessible.',
        server_error: 'The target site returned a server error. Try again later.',
        network: 'Could not connect to the target URL. Verify the URL is correct and the site is online.',
        unknown: undefined,
    };
    // Sanitize message: strip HTML chars, truncate. A non-string message is
    // ignored, and a message that sanitizes to nothing uses the generic fallback.
    const rawMsg = typeof err?.message === 'string' ? err.message : '';
    const safeMsg = rawMsg.replace(/[<>"']/g, '').trim().slice(0, MAX_MESSAGE_LENGTH) || fallbackMsg;
    const messages = {
        timeout: `The website took too long to respond. Try with render=true or stealth=true for JavaScript-heavy sites.`,
        blocked: `This website is blocking automated access (bot protection detected).`,
        not_found: `The URL could not be reached — the domain may not exist or the page was not found.`,
        server_error: `The target website returned a server error while processing the request.`,
        network: `Could not reach this website. The server may be down or the URL may be incorrect.`,
        unknown: safeMsg,
    };
    return { type, message: messages[type] || safeMsg, hint: hints[type] };
}
|
|
16
62
|
// ── Helper: extractive summarizer (TF-IDF-like sentence scoring) ─────────────
|
|
17
63
|
function extractSummary(content, maxWords = 150) {
|
|
18
64
|
if (!content)
|
|
@@ -527,26 +573,24 @@ export function createFetchRouter(authStore) {
|
|
|
527
573
|
});
|
|
528
574
|
}
|
|
529
575
|
// SECURITY: Sanitize error messages to prevent information disclosure
|
|
530
|
-
if (
|
|
576
|
+
if (res.headersSent)
|
|
577
|
+
return; // Timeout middleware already responded
|
|
578
|
+
const requestUrl = req.query.url;
|
|
579
|
+
if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
|
|
531
580
|
// WebPeelError from core library - safe to expose with helpful context
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
: 500;
|
|
539
|
-
const hints = {
|
|
540
|
-
TIMEOUT: 'Try increasing timeout with ?wait=10000, or use render=true for JS-heavy sites.',
|
|
541
|
-
BLOCKED: 'This site blocks automated requests. Try adding render=true or use stealth mode (costs 5 credits).',
|
|
542
|
-
NETWORK: 'Could not reach the target URL. Verify the URL is correct and the site is online.',
|
|
543
|
-
};
|
|
581
|
+
const { type, message, hint } = buildFetchErrorMessage(err);
|
|
582
|
+
const statusCode = type === 'timeout' ? 504
|
|
583
|
+
: type === 'blocked' ? 403
|
|
584
|
+
: type === 'not_found' ? 404
|
|
585
|
+
: type === 'network' || type === 'server_error' ? 502
|
|
586
|
+
: 500;
|
|
544
587
|
res.status(statusCode).json({
|
|
545
588
|
success: false,
|
|
546
589
|
error: {
|
|
547
|
-
type
|
|
548
|
-
message
|
|
549
|
-
|
|
590
|
+
type,
|
|
591
|
+
message,
|
|
592
|
+
url: requestUrl,
|
|
593
|
+
...(hint ? { hint } : {}),
|
|
550
594
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
551
595
|
},
|
|
552
596
|
requestId: req.requestId,
|
|
@@ -555,13 +599,12 @@ export function createFetchRouter(authStore) {
|
|
|
555
599
|
else {
|
|
556
600
|
// Unexpected error - generic message only
|
|
557
601
|
console.error('Fetch error:', err); // Log full error server-side
|
|
558
|
-
if (res.headersSent)
|
|
559
|
-
return; // Timeout middleware already responded
|
|
560
602
|
res.status(500).json({
|
|
561
603
|
success: false,
|
|
562
604
|
error: {
|
|
563
|
-
type: '
|
|
605
|
+
type: 'unknown',
|
|
564
606
|
message: 'An unexpected error occurred while fetching the URL. If this persists, check https://webpeel.dev/status',
|
|
607
|
+
url: requestUrl,
|
|
565
608
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
566
609
|
},
|
|
567
610
|
requestId: req.requestId,
|
|
@@ -1028,23 +1071,21 @@ export function createFetchRouter(authStore) {
|
|
|
1028
1071
|
console.error('POST fetch/scrape error:', err);
|
|
1029
1072
|
if (res.headersSent)
|
|
1030
1073
|
return; // Timeout middleware already responded
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
const
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
BLOCKED: 'Site blocks automated requests. Try render:true or stealth mode.',
|
|
1040
|
-
NETWORK: 'Could not reach the target URL. Verify it is correct and online.',
|
|
1041
|
-
};
|
|
1074
|
+
const postUrl = req.body?.url;
|
|
1075
|
+
if (err.code || err.name === 'TimeoutError' || err.name === 'BlockedError' || err.name === 'NetworkError' || err.name === 'WebPeelError') {
|
|
1076
|
+
const { type, message, hint } = buildFetchErrorMessage(err);
|
|
1077
|
+
const statusCode = type === 'timeout' ? 504
|
|
1078
|
+
: type === 'blocked' ? 403
|
|
1079
|
+
: type === 'not_found' ? 404
|
|
1080
|
+
: type === 'network' || type === 'server_error' ? 502
|
|
1081
|
+
: 500;
|
|
1042
1082
|
res.status(statusCode).json({
|
|
1043
1083
|
success: false,
|
|
1044
1084
|
error: {
|
|
1045
|
-
type
|
|
1046
|
-
message
|
|
1047
|
-
|
|
1085
|
+
type,
|
|
1086
|
+
message,
|
|
1087
|
+
url: postUrl,
|
|
1088
|
+
...(hint ? { hint } : {}),
|
|
1048
1089
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
1049
1090
|
},
|
|
1050
1091
|
requestId: req.requestId,
|
|
@@ -1054,8 +1095,9 @@ export function createFetchRouter(authStore) {
|
|
|
1054
1095
|
res.status(500).json({
|
|
1055
1096
|
success: false,
|
|
1056
1097
|
error: {
|
|
1057
|
-
type: '
|
|
1098
|
+
type: 'unknown',
|
|
1058
1099
|
message: 'An unexpected error occurred. If this persists, check https://webpeel.dev/status',
|
|
1100
|
+
url: postUrl,
|
|
1059
1101
|
docs: 'https://webpeel.dev/docs/api-reference#errors',
|
|
1060
1102
|
},
|
|
1061
1103
|
requestId: req.requestId,
|
package/dist/types.d.ts
CHANGED
|
@@ -419,6 +419,18 @@ export interface PeelEnvelope {
|
|
|
419
419
|
*/
|
|
420
420
|
totalAvailable?: number;
|
|
421
421
|
}
|
|
422
|
+
/**
 * Programmatic error classification for fetch failures.
 * Returned in the `error.type` field of API error responses.
 *
 * - `timeout`      — Site took too long to respond
 * - `blocked`      — Site actively blocked the request (403, CAPTCHA, bot detection)
 * - `not_found`    — 404, DNS resolution failure, or the domain/URL does not exist
 * - `server_error` — Target site returned a 5xx error
 * - `network`      — Connection refused/reset or other network-level issue
 * - `unknown`      — Unclassified error
 */
export type FetchErrorType = 'timeout' | 'blocked' | 'not_found' | 'server_error' | 'network' | 'unknown';
|
|
422
434
|
export declare class WebPeelError extends Error {
|
|
423
435
|
code?: string | undefined;
|
|
424
436
|
constructor(message: string, code?: string | undefined);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.7",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|