webpeel 0.20.17 → 0.20.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +44 -2
- package/dist/core/pipeline.js +50 -15
- package/dist/core/youtube.d.ts +4 -0
- package/dist/core/youtube.js +12 -0
- package/package.json +1 -1
|
@@ -1169,14 +1169,37 @@ async function youtubeExtractor(_html, url) {
|
|
|
1169
1169
|
const transcriptPromise = withTimeout(getYouTubeTranscript(url), 30000);
|
|
1170
1170
|
const oembedPromise = fetchJson(`https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`);
|
|
1171
1171
|
const noembedPromise = fetchJson(`https://noembed.com/embed?url=${encodeURIComponent(url)}`).catch(() => null);
|
|
1172
|
-
|
|
1172
|
+
// Fetch subscriber count from channel page (lightweight, parallel)
|
|
1173
|
+
const subscriberPromise = (async () => {
|
|
1174
|
+
try {
|
|
1175
|
+
// Wait for oEmbed to get channel URL, then fetch subscriber count from channel page
|
|
1176
|
+
const oembed = await oembedPromise;
|
|
1177
|
+
const channelUrl = oembed?.author_url;
|
|
1178
|
+
if (!channelUrl)
|
|
1179
|
+
return '';
|
|
1180
|
+
const resp = await fetch(channelUrl, {
|
|
1181
|
+
headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' },
|
|
1182
|
+
signal: AbortSignal.timeout(5000),
|
|
1183
|
+
});
|
|
1184
|
+
const html = await resp.text();
|
|
1185
|
+
// Look for subscriber count in page metadata (e.g. "4.12M subscribers")
|
|
1186
|
+
const subMatch = html.match(/(\d+(?:\.\d+)?[KMBkmb]?)\s*subscribers/i);
|
|
1187
|
+
return subMatch ? subMatch[1] + ' subscribers' : '';
|
|
1188
|
+
}
|
|
1189
|
+
catch {
|
|
1190
|
+
return '';
|
|
1191
|
+
}
|
|
1192
|
+
})();
|
|
1193
|
+
const [transcriptResult, oembedResult, noembedResult, subscriberResult] = await Promise.allSettled([
|
|
1173
1194
|
transcriptPromise,
|
|
1174
1195
|
oembedPromise,
|
|
1175
1196
|
noembedPromise,
|
|
1197
|
+
subscriberPromise,
|
|
1176
1198
|
]);
|
|
1177
1199
|
const transcript = transcriptResult.status === 'fulfilled' ? transcriptResult.value : null;
|
|
1178
1200
|
const oembedData = oembedResult.status === 'fulfilled' ? oembedResult.value : null;
|
|
1179
1201
|
const noembedData = noembedResult.status === 'fulfilled' ? noembedResult.value : null;
|
|
1202
|
+
const subscriberCount = subscriberResult.status === 'fulfilled' ? subscriberResult.value : '';
|
|
1180
1203
|
if (process.env.DEBUG) {
|
|
1181
1204
|
if (transcriptResult.status === 'rejected') {
|
|
1182
1205
|
console.debug('[webpeel]', 'YouTube transcript failed:', transcriptResult.reason instanceof Error ? transcriptResult.reason.message : transcriptResult.reason);
|
|
@@ -1198,12 +1221,15 @@ async function youtubeExtractor(_html, url) {
|
|
|
1198
1221
|
title,
|
|
1199
1222
|
channel,
|
|
1200
1223
|
channelUrl,
|
|
1224
|
+
subscriberCount: subscriberCount || undefined,
|
|
1201
1225
|
duration: transcript.duration,
|
|
1202
1226
|
publishDate,
|
|
1203
1227
|
language: transcript.language,
|
|
1204
1228
|
availableLanguages: transcript.availableLanguages,
|
|
1205
1229
|
transcriptSegments: transcript.segments.length,
|
|
1206
1230
|
wordCount: transcript.wordCount ?? 0,
|
|
1231
|
+
viewCount: transcript.viewCount ?? '',
|
|
1232
|
+
likeCount: transcript.likeCount ?? '',
|
|
1207
1233
|
description,
|
|
1208
1234
|
thumbnailUrl,
|
|
1209
1235
|
chapters: transcript.chapters ?? [],
|
|
@@ -1221,10 +1247,26 @@ async function youtubeExtractor(_html, url) {
|
|
|
1221
1247
|
publishStr = publishDate;
|
|
1222
1248
|
}
|
|
1223
1249
|
}
|
|
1250
|
+
// Format view count (e.g. "1,234,567" → "1.2M views")
|
|
1251
|
+
let viewStr = '';
|
|
1252
|
+
if (transcript.viewCount) {
|
|
1253
|
+
const v = parseInt(transcript.viewCount, 10);
|
|
1254
|
+
if (!isNaN(v)) {
|
|
1255
|
+
if (v >= 1_000_000)
|
|
1256
|
+
viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
|
|
1257
|
+
else if (v >= 1_000)
|
|
1258
|
+
viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
|
|
1259
|
+
else
|
|
1260
|
+
viewStr = `${v.toLocaleString()} views`;
|
|
1261
|
+
}
|
|
1262
|
+
}
|
|
1224
1263
|
// Build header line
|
|
1225
|
-
const headerParts = [`**Channel:** ${channel}`];
|
|
1264
|
+
const channelPart = subscriberCount ? `${channel} (${subscriberCount})` : channel;
|
|
1265
|
+
const headerParts = [`**Channel:** ${channelPart}`];
|
|
1226
1266
|
if (transcript.duration && transcript.duration !== '0:00')
|
|
1227
1267
|
headerParts.push(`**Duration:** ${transcript.duration}`);
|
|
1268
|
+
if (viewStr)
|
|
1269
|
+
headerParts.push(`**${viewStr}**`);
|
|
1228
1270
|
if (publishStr)
|
|
1229
1271
|
headerParts.push(`**Published:** ${publishStr}`);
|
|
1230
1272
|
const headerLine = headerParts.join(' | ');
|
package/dist/core/pipeline.js
CHANGED
|
@@ -161,22 +161,57 @@ export async function handleYouTube(ctx) {
|
|
|
161
161
|
const transcript = await getYouTubeTranscript(ctx.url, {
|
|
162
162
|
language: ctx.options.language ?? 'en',
|
|
163
163
|
});
|
|
164
|
+
// Format view count
|
|
165
|
+
let viewStr = '';
|
|
166
|
+
if (transcript.viewCount) {
|
|
167
|
+
const v = parseInt(transcript.viewCount, 10);
|
|
168
|
+
if (!isNaN(v)) {
|
|
169
|
+
if (v >= 1_000_000)
|
|
170
|
+
viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
|
|
171
|
+
else if (v >= 1_000)
|
|
172
|
+
viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
|
|
173
|
+
else
|
|
174
|
+
viewStr = `${v.toLocaleString()} views`;
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
// Format publish date
|
|
178
|
+
let publishStr = '';
|
|
179
|
+
if (transcript.publishDate) {
|
|
180
|
+
try {
|
|
181
|
+
const d = new Date(transcript.publishDate);
|
|
182
|
+
publishStr = d.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
|
|
183
|
+
}
|
|
184
|
+
catch {
|
|
185
|
+
publishStr = transcript.publishDate;
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
// Build header metadata line
|
|
189
|
+
const headerParts = [`**Channel:** ${transcript.channel}`];
|
|
190
|
+
if (transcript.duration && transcript.duration !== '0:00')
|
|
191
|
+
headerParts.push(`**Duration:** ${transcript.duration}`);
|
|
192
|
+
if (viewStr)
|
|
193
|
+
headerParts.push(`**${viewStr}**`);
|
|
194
|
+
if (publishStr)
|
|
195
|
+
headerParts.push(`**Published:** ${publishStr}`);
|
|
196
|
+
// Add paragraph breaks to transcript for readability
|
|
197
|
+
let readableText = transcript.fullText;
|
|
198
|
+
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
199
|
+
readableText = readableText.replace(/\n{3,}/g, '\n\n');
|
|
164
200
|
// Build a clean markdown representation of the video + transcript
|
|
165
|
-
const
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
const videoInfoContent = videoInfoLines.join('\n');
|
|
201
|
+
const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
|
|
202
|
+
if (transcript.summary) {
|
|
203
|
+
let summaryText = transcript.summary;
|
|
204
|
+
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
205
|
+
parts.push(`## Summary\n\n${summaryText}`);
|
|
206
|
+
}
|
|
207
|
+
if (transcript.keyPoints && transcript.keyPoints.length > 0) {
|
|
208
|
+
parts.push(`## Key Points\n\n${transcript.keyPoints.map(kp => `- ${kp}`).join('\n')}`);
|
|
209
|
+
}
|
|
210
|
+
if (transcript.chapters && transcript.chapters.length > 0) {
|
|
211
|
+
parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
|
|
212
|
+
}
|
|
213
|
+
parts.push(`## Full Transcript\n\n${readableText}`);
|
|
214
|
+
const videoInfoContent = parts.join('\n\n');
|
|
180
215
|
const elapsed = Date.now() - ytStartTime;
|
|
181
216
|
const tokens = estimateTokens(videoInfoContent);
|
|
182
217
|
const fingerprint = createHash('sha256').update(videoInfoContent).digest('hex').slice(0, 16);
|
package/dist/core/youtube.d.ts
CHANGED
|
@@ -42,6 +42,10 @@ export interface YouTubeTranscript {
|
|
|
42
42
|
summary?: string;
|
|
43
43
|
/** Total word count of transcript */
|
|
44
44
|
wordCount?: number;
|
|
45
|
+
/** View count (numeric string) */
|
|
46
|
+
viewCount?: string;
|
|
47
|
+
/** Like count (numeric string, may be empty) */
|
|
48
|
+
likeCount?: string;
|
|
45
49
|
}
|
|
46
50
|
export interface YouTubeVideoInfo {
|
|
47
51
|
videoId: string;
|
package/dist/core/youtube.js
CHANGED
|
@@ -417,6 +417,8 @@ async function getTranscriptViaProxy(videoId, preferredLang) {
|
|
|
417
417
|
const chapters = parseChaptersFromDescription(description);
|
|
418
418
|
const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
|
|
419
419
|
const summary = extractSummary(fullText);
|
|
420
|
+
const viewCount = vd.viewCount ?? mf.viewCount ?? '';
|
|
421
|
+
const likeCount = vd.likeCount ?? '';
|
|
420
422
|
console.log(`[webpeel] [youtube] Proxy slot ${slot} success: ${segments.length} segments, ${wordCount} words`);
|
|
421
423
|
return {
|
|
422
424
|
videoId,
|
|
@@ -433,6 +435,8 @@ async function getTranscriptViaProxy(videoId, preferredLang) {
|
|
|
433
435
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
434
436
|
summary,
|
|
435
437
|
wordCount,
|
|
438
|
+
viewCount: viewCount || undefined,
|
|
439
|
+
likeCount: likeCount || undefined,
|
|
436
440
|
};
|
|
437
441
|
}
|
|
438
442
|
catch (err) {
|
|
@@ -541,6 +545,8 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
541
545
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
542
546
|
summary,
|
|
543
547
|
wordCount,
|
|
548
|
+
viewCount: undefined, // not available in this path without extra fetch
|
|
549
|
+
likeCount: undefined,
|
|
544
550
|
};
|
|
545
551
|
}
|
|
546
552
|
console.log('[webpeel] [youtube] Path 0 returned empty segments');
|
|
@@ -649,6 +655,8 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
649
655
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
650
656
|
summary,
|
|
651
657
|
wordCount,
|
|
658
|
+
viewCount: (videoDetails.viewCount ?? microformat.viewCount ?? '') || undefined,
|
|
659
|
+
likeCount: (videoDetails.likeCount ?? '') || undefined,
|
|
652
660
|
};
|
|
653
661
|
}
|
|
654
662
|
catch (err) {
|
|
@@ -761,6 +769,8 @@ async function getTranscriptViaYtDlp(videoId, preferredLang) {
|
|
|
761
769
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
762
770
|
summary,
|
|
763
771
|
wordCount,
|
|
772
|
+
viewCount: (infoData.view_count?.toString() ?? '') || undefined,
|
|
773
|
+
likeCount: (infoData.like_count?.toString() ?? '') || undefined,
|
|
764
774
|
});
|
|
765
775
|
}
|
|
766
776
|
catch {
|
|
@@ -887,6 +897,8 @@ async function getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang
|
|
|
887
897
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
888
898
|
summary,
|
|
889
899
|
wordCount,
|
|
900
|
+
viewCount: undefined, // browser path doesn't reliably get this
|
|
901
|
+
likeCount: undefined,
|
|
890
902
|
};
|
|
891
903
|
}
|
|
892
904
|
finally {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.17",
|
|
3
|
+
"version": "0.20.18",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|