webpeel 0.21.5 → 0.21.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/utils.js CHANGED
@@ -212,7 +212,19 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
212
212
  }
213
213
  if (!res.ok) {
214
214
  const body = await res.text().catch(() => '');
215
- throw new Error(`API error ${res.status}: ${body.slice(0, 200)}`);
215
+ // Sanitize error message — don't expose raw HTML (e.g. Cloudflare 502 pages)
216
+ const isHtml = body.trimStart().startsWith('<');
217
+ let errorMsg;
218
+ if (res.status === 502 || res.status === 503 || res.status === 504) {
219
+ errorMsg = `Could not reach this website (gateway error)`;
220
+ }
221
+ else if (isHtml) {
222
+ errorMsg = `Server returned an error page`;
223
+ }
224
+ else {
225
+ errorMsg = body.slice(0, 200) || 'Unknown error';
226
+ }
227
+ throw new Error(`API error ${res.status}: ${errorMsg}`);
216
228
  }
217
229
  const data = await res.json();
218
230
  // Map API response to PeelResult shape that the CLI already handles
@@ -1274,9 +1274,27 @@ async function youtubeExtractor(_html, url) {
1274
1274
  const parts = [];
1275
1275
  parts.push(`# ${title}`);
1276
1276
  parts.push(headerLine);
1277
/**
 * Strip music-cue symbols (♪ and 🎵) from transcript/caption text.
 *
 * Patterns cleaned, in this order:
 *   [♪♪♪] / [🎵🎵🎵]   → removed entirely
 *   ♪ text ♪           → text (the words between the notes are kept)
 *   standalone ♪ / 🎵   → removed
 * Afterwards every run of two or more whitespace characters (newlines
 * included) collapses to a single space and the result is trimmed.
 *
 * @param {string} text - Caption text; null/undefined is treated as ''.
 * @returns {string} The text with music-note symbols removed.
 */
const cleanMusicNotes = (text) => (text == null ? '' : String(text))
    // The `u` flag is required: 🎵 (U+1F3B5) is a surrogate pair, and without
    // `u` the character class would match its two halves independently,
    // tearing apart every other emoji that shares the high surrogate U+D83C
    // (🎉, 🎶, 🎤, ...) and leaving lone surrogates in the output.
    // Remove bracketed music cues: [♪], [♪♪♪], [🎵🎵🎵], etc.
    .replace(/\[[♪🎵]+\]/gu, '')
    // Unwrap ♪ text ♪ → text (keep the words between the notes)
    .replace(/♪\s*([^♪]*?)\s*♪/gu, (_, inner) => inner.trim())
    // Remove any remaining standalone ♪ or 🎵
    .replace(/[♪🎵]+/gu, '')
    // Collapse extra whitespace introduced by the removals
    .replace(/\s{2,}/g, ' ')
    .trim();
1277
1295
  // Summary section
1278
1296
  if (transcript.summary && hasTranscript) {
1279
- let summaryText = transcript.summary;
1297
+ let summaryText = cleanMusicNotes(transcript.summary);
1280
1298
  summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
1281
1299
  parts.push(`## Summary\n\n${summaryText}`);
1282
1300
  }
@@ -1296,7 +1314,7 @@ async function youtubeExtractor(_html, url) {
1296
1314
  // Full Transcript section (only if we have real transcript segments)
1297
1315
  // Add intelligent paragraph breaks for readability
1298
1316
  if (hasTranscript) {
1299
- let readableText = transcript.fullText;
1317
+ let readableText = cleanMusicNotes(transcript.fullText);
1300
1318
  // Break into paragraphs: after sentence-ending punctuation followed by a capital letter
1301
1319
  readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
1302
1320
  // Collapse any triple+ newlines
@@ -193,19 +193,32 @@ export async function handleYouTube(ctx) {
193
193
  headerParts.push(`**${viewStr}**`);
194
194
  if (publishStr)
195
195
  headerParts.push(`**Published:** ${publishStr}`);
196
/**
 * Strip music note symbols from YouTube auto-caption text.
 *
 * Cleans, in order: bracketed cues such as [♪♪♪] or [🎵🎵🎵] (removed),
 * ♪ text ♪ (inner text kept), and any standalone ♪ / 🎵 (removed). Runs of
 * two or more whitespace characters are then collapsed to one space and
 * the result is trimmed.
 *
 * @param {string} text - Caption text; null/undefined is treated as ''.
 * @returns {string} The cleaned text (possibly empty).
 */
const cleanMusicNotes = (text) => {
    // Tolerate missing values so one absent field cannot abort the render.
    const input = text == null ? '' : String(text);
    return input
        // `u` flag: 🎵 (U+1F3B5) is a surrogate pair. Without `u` the class
        // matches each half separately and strips the shared high surrogate
        // U+D83C out of unrelated emoji (🎉, 🎶, ...), corrupting them.
        .replace(/\[[♪🎵]+\]/gu, '')
        .replace(/♪\s*([^♪]*?)\s*♪/gu, (_, inner) => inner.trim())
        .replace(/[♪🎵]+/gu, '')
        .replace(/\s{2,}/g, ' ')
        .trim();
};
196
206
  // Add paragraph breaks to transcript for readability
197
- let readableText = transcript.fullText;
207
+ let readableText = cleanMusicNotes(transcript.fullText);
198
208
  readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
199
209
  readableText = readableText.replace(/\n{3,}/g, '\n\n');
200
210
  // Build a clean markdown representation of the video + transcript
201
211
  const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
202
212
  if (transcript.summary) {
203
- let summaryText = transcript.summary;
213
+ let summaryText = cleanMusicNotes(transcript.summary);
204
214
  summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
205
215
  parts.push(`## Summary\n\n${summaryText}`);
206
216
  }
207
217
  if (transcript.keyPoints && transcript.keyPoints.length > 0) {
208
- parts.push(`## Key Points\n\n${transcript.keyPoints.map(kp => `- ${kp}`).join('\n')}`);
218
+ const cleanedKps = transcript.keyPoints.map((kp) => cleanMusicNotes(kp)).filter((kp) => kp.length > 0);
219
+ if (cleanedKps.length > 0) {
220
+ parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
221
+ }
209
222
  }
210
223
  if (transcript.chapters && transcript.chapters.length > 0) {
211
224
  parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
@@ -927,6 +940,10 @@ export async function postProcess(ctx) {
927
940
  if (ddResult) {
928
941
  ctx.domainData = ddResult;
929
942
  ctx.content = ddResult.cleanContent;
943
+ // Update title from domain extractor (takes precedence over HTML page title)
944
+ if (ddResult.structured?.title) {
945
+ ctx.title = ddResult.structured.title;
946
+ }
930
947
  }
931
948
  }
932
949
  catch (e) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.5",
3
+ "version": "0.21.6",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",