webpeel 0.21.5 → 0.21.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/utils.js +13 -1
- package/dist/core/domain-extractors.js +20 -2
- package/dist/core/pipeline.js +20 -3
- package/package.json +1 -1
package/dist/cli/utils.js
CHANGED
|
@@ -212,7 +212,19 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
|
|
|
212
212
|
}
|
|
213
213
|
if (!res.ok) {
|
|
214
214
|
const body = await res.text().catch(() => '');
|
|
215
|
-
|
|
215
|
+
// Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
|
|
216
|
+
const isHtml = body.trimStart().startsWith('<');
|
|
217
|
+
let errorMsg;
|
|
218
|
+
if (res.status === 502 || res.status === 503 || res.status === 504) {
|
|
219
|
+
errorMsg = `Could not reach this website (gateway error)`;
|
|
220
|
+
}
|
|
221
|
+
else if (isHtml) {
|
|
222
|
+
errorMsg = `Server returned an error page`;
|
|
223
|
+
}
|
|
224
|
+
else {
|
|
225
|
+
errorMsg = body.slice(0, 200) || 'Unknown error';
|
|
226
|
+
}
|
|
227
|
+
throw new Error(`API error ${res.status}: ${errorMsg}`);
|
|
216
228
|
}
|
|
217
229
|
const data = await res.json();
|
|
218
230
|
// Map API response to PeelResult shape that the CLI already handles
|
package/dist/core/domain-extractors.js
CHANGED
|
@@ -1274,9 +1274,27 @@ async function youtubeExtractor(_html, url) {
|
|
|
1274
1274
|
const parts = [];
|
|
1275
1275
|
parts.push(`# ${title}`);
|
|
1276
1276
|
parts.push(headerLine);
|
|
1277
|
+
/**
|
|
1278
|
+
* Strip music note symbols from transcript/caption text.
|
|
1279
|
+
* YouTube auto-captions include ♪ and 🎵 as music cues.
|
|
1280
|
+
* Patterns cleaned:
|
|
1281
|
+
* [♪♪♪] → (removed)
|
|
1282
|
+
* ♪ text ♪ → text
|
|
1283
|
+
* standalone ♪ / 🎵 → (removed)
|
|
1284
|
+
*/
|
|
1285
|
+
const cleanMusicNotes = (text) => text
|
|
1286
|
+
// Remove bracketed music cues: [♪], [♪♪♪], [🎵🎵🎵], etc.
|
|
1287
|
+
.replace(/\[[♪🎵]+\]/g, '')
|
|
1288
|
+
// Unwrap ♪ text ♪ → text (keep the words between notes)
|
|
1289
|
+
.replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
|
|
1290
|
+
// Remove any remaining standalone ♪ or 🎵
|
|
1291
|
+
.replace(/[♪🎵]+/g, '')
|
|
1292
|
+
// Collapse extra whitespace introduced by removals
|
|
1293
|
+
.replace(/\s{2,}/g, ' ')
|
|
1294
|
+
.trim();
|
|
1277
1295
|
// Summary section
|
|
1278
1296
|
if (transcript.summary && hasTranscript) {
|
|
1279
|
-
let summaryText = transcript.summary;
|
|
1297
|
+
let summaryText = cleanMusicNotes(transcript.summary);
|
|
1280
1298
|
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1281
1299
|
parts.push(`## Summary\n\n${summaryText}`);
|
|
1282
1300
|
}
|
|
@@ -1296,7 +1314,7 @@ async function youtubeExtractor(_html, url) {
|
|
|
1296
1314
|
// Full Transcript section (only if we have real transcript segments)
|
|
1297
1315
|
// Add intelligent paragraph breaks for readability
|
|
1298
1316
|
if (hasTranscript) {
|
|
1299
|
-
let readableText = transcript.fullText;
|
|
1317
|
+
let readableText = cleanMusicNotes(transcript.fullText);
|
|
1300
1318
|
// Break into paragraphs: after sentence-ending punctuation followed by a capital letter
|
|
1301
1319
|
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1302
1320
|
// Collapse any triple+ newlines
|
package/dist/core/pipeline.js
CHANGED
|
@@ -193,19 +193,32 @@ export async function handleYouTube(ctx) {
|
|
|
193
193
|
headerParts.push(`**${viewStr}**`);
|
|
194
194
|
if (publishStr)
|
|
195
195
|
headerParts.push(`**Published:** ${publishStr}`);
|
|
196
|
+
/**
|
|
197
|
+
* Strip music note symbols from YouTube auto-caption text.
|
|
198
|
+
* Cleans: [♪♪♪], [🎵🎵🎵], ♪ text ♪ (keeps inner text), standalone ♪ / 🎵
|
|
199
|
+
*/
|
|
200
|
+
const cleanMusicNotes = (text) => text
|
|
201
|
+
.replace(/\[[♪🎵]+\]/g, '')
|
|
202
|
+
.replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
|
|
203
|
+
.replace(/[♪🎵]+/g, '')
|
|
204
|
+
.replace(/\s{2,}/g, ' ')
|
|
205
|
+
.trim();
|
|
196
206
|
// Add paragraph breaks to transcript for readability
|
|
197
|
-
let readableText = transcript.fullText;
|
|
207
|
+
let readableText = cleanMusicNotes(transcript.fullText);
|
|
198
208
|
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
199
209
|
readableText = readableText.replace(/\n{3,}/g, '\n\n');
|
|
200
210
|
// Build a clean markdown representation of the video + transcript
|
|
201
211
|
const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
|
|
202
212
|
if (transcript.summary) {
|
|
203
|
-
let summaryText = transcript.summary;
|
|
213
|
+
let summaryText = cleanMusicNotes(transcript.summary);
|
|
204
214
|
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
205
215
|
parts.push(`## Summary\n\n${summaryText}`);
|
|
206
216
|
}
|
|
207
217
|
if (transcript.keyPoints && transcript.keyPoints.length > 0) {
|
|
208
|
-
|
|
218
|
+
const cleanedKps = transcript.keyPoints.map((kp) => cleanMusicNotes(kp)).filter((kp) => kp.length > 0);
|
|
219
|
+
if (cleanedKps.length > 0) {
|
|
220
|
+
parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
|
|
221
|
+
}
|
|
209
222
|
}
|
|
210
223
|
if (transcript.chapters && transcript.chapters.length > 0) {
|
|
211
224
|
parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
|
|
@@ -927,6 +940,10 @@ export async function postProcess(ctx) {
|
|
|
927
940
|
if (ddResult) {
|
|
928
941
|
ctx.domainData = ddResult;
|
|
929
942
|
ctx.content = ddResult.cleanContent;
|
|
943
|
+
// Update title from domain extractor (takes precedence over HTML page title)
|
|
944
|
+
if (ddResult.structured?.title) {
|
|
945
|
+
ctx.title = ddResult.structured.title;
|
|
946
|
+
}
|
|
930
947
|
}
|
|
931
948
|
}
|
|
932
949
|
catch (e) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.5",
|
|
3
|
+
"version": "0.21.6",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|