webpeel 0.20.14 → 0.20.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +56 -6
- package/dist/core/pipeline.js +50 -15
- package/dist/core/youtube.d.ts +4 -0
- package/dist/core/youtube.js +244 -0
- package/package.json +1 -1
|
@@ -1165,18 +1165,41 @@ async function youtubeExtractor(_html, url) {
|
|
|
1165
1165
|
]);
|
|
1166
1166
|
}
|
|
1167
1167
|
// Run transcript fetch and oEmbed fetch in parallel
|
|
1168
|
-
//
|
|
1169
|
-
const transcriptPromise = withTimeout(getYouTubeTranscript(url),
|
|
1168
|
+
// Proxy-based extraction takes 2-5s, but retry logic may need more time
|
|
1169
|
+
const transcriptPromise = withTimeout(getYouTubeTranscript(url), 30000);
|
|
1170
1170
|
const oembedPromise = fetchJson(`https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`);
|
|
1171
1171
|
const noembedPromise = fetchJson(`https://noembed.com/embed?url=${encodeURIComponent(url)}`).catch(() => null);
|
|
1172
|
-
|
|
1172
|
+
// Fetch subscriber count from channel page (lightweight, parallel)
|
|
1173
|
+
const subscriberPromise = (async () => {
|
|
1174
|
+
try {
|
|
1175
|
+
// Wait for oEmbed to get channel URL, then fetch subscriber count from channel page
|
|
1176
|
+
const oembed = await oembedPromise;
|
|
1177
|
+
const channelUrl = oembed?.author_url;
|
|
1178
|
+
if (!channelUrl)
|
|
1179
|
+
return '';
|
|
1180
|
+
const resp = await fetch(channelUrl, {
|
|
1181
|
+
headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' },
|
|
1182
|
+
signal: AbortSignal.timeout(5000),
|
|
1183
|
+
});
|
|
1184
|
+
const html = await resp.text();
|
|
1185
|
+
// Look for subscriber count in page metadata (e.g. "4.12M subscribers")
|
|
1186
|
+
const subMatch = html.match(/(\d+(?:\.\d+)?[KMBkmb]?)\s*subscribers/i);
|
|
1187
|
+
return subMatch ? subMatch[1] + ' subscribers' : '';
|
|
1188
|
+
}
|
|
1189
|
+
catch {
|
|
1190
|
+
return '';
|
|
1191
|
+
}
|
|
1192
|
+
})();
|
|
1193
|
+
const [transcriptResult, oembedResult, noembedResult, subscriberResult] = await Promise.allSettled([
|
|
1173
1194
|
transcriptPromise,
|
|
1174
1195
|
oembedPromise,
|
|
1175
1196
|
noembedPromise,
|
|
1197
|
+
subscriberPromise,
|
|
1176
1198
|
]);
|
|
1177
1199
|
const transcript = transcriptResult.status === 'fulfilled' ? transcriptResult.value : null;
|
|
1178
1200
|
const oembedData = oembedResult.status === 'fulfilled' ? oembedResult.value : null;
|
|
1179
1201
|
const noembedData = noembedResult.status === 'fulfilled' ? noembedResult.value : null;
|
|
1202
|
+
const subscriberCount = subscriberResult.status === 'fulfilled' ? subscriberResult.value : '';
|
|
1180
1203
|
if (process.env.DEBUG) {
|
|
1181
1204
|
if (transcriptResult.status === 'rejected') {
|
|
1182
1205
|
console.debug('[webpeel]', 'YouTube transcript failed:', transcriptResult.reason instanceof Error ? transcriptResult.reason.message : transcriptResult.reason);
|
|
@@ -1198,12 +1221,15 @@ async function youtubeExtractor(_html, url) {
|
|
|
1198
1221
|
title,
|
|
1199
1222
|
channel,
|
|
1200
1223
|
channelUrl,
|
|
1224
|
+
subscriberCount: subscriberCount || undefined,
|
|
1201
1225
|
duration: transcript.duration,
|
|
1202
1226
|
publishDate,
|
|
1203
1227
|
language: transcript.language,
|
|
1204
1228
|
availableLanguages: transcript.availableLanguages,
|
|
1205
1229
|
transcriptSegments: transcript.segments.length,
|
|
1206
1230
|
wordCount: transcript.wordCount ?? 0,
|
|
1231
|
+
viewCount: transcript.viewCount ?? '',
|
|
1232
|
+
likeCount: transcript.likeCount ?? '',
|
|
1207
1233
|
description,
|
|
1208
1234
|
thumbnailUrl,
|
|
1209
1235
|
chapters: transcript.chapters ?? [],
|
|
@@ -1221,10 +1247,26 @@ async function youtubeExtractor(_html, url) {
|
|
|
1221
1247
|
publishStr = publishDate;
|
|
1222
1248
|
}
|
|
1223
1249
|
}
|
|
1250
|
+
// Format view count (e.g. "1,234,567" → "1.2M views")
|
|
1251
|
+
let viewStr = '';
|
|
1252
|
+
if (transcript.viewCount) {
|
|
1253
|
+
const v = parseInt(transcript.viewCount, 10);
|
|
1254
|
+
if (!isNaN(v)) {
|
|
1255
|
+
if (v >= 1_000_000)
|
|
1256
|
+
viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
|
|
1257
|
+
else if (v >= 1_000)
|
|
1258
|
+
viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
|
|
1259
|
+
else
|
|
1260
|
+
viewStr = `${v.toLocaleString()} views`;
|
|
1261
|
+
}
|
|
1262
|
+
}
|
|
1224
1263
|
// Build header line
|
|
1225
|
-
const
|
|
1264
|
+
const channelPart = subscriberCount ? `${channel} (${subscriberCount})` : channel;
|
|
1265
|
+
const headerParts = [`**Channel:** ${channelPart}`];
|
|
1226
1266
|
if (transcript.duration && transcript.duration !== '0:00')
|
|
1227
1267
|
headerParts.push(`**Duration:** ${transcript.duration}`);
|
|
1268
|
+
if (viewStr)
|
|
1269
|
+
headerParts.push(`**${viewStr}**`);
|
|
1228
1270
|
if (publishStr)
|
|
1229
1271
|
headerParts.push(`**Published:** ${publishStr}`);
|
|
1230
1272
|
const headerLine = headerParts.join(' | ');
|
|
@@ -1233,7 +1275,9 @@ async function youtubeExtractor(_html, url) {
|
|
|
1233
1275
|
parts.push(headerLine);
|
|
1234
1276
|
// Summary section
|
|
1235
1277
|
if (transcript.summary && hasTranscript) {
|
|
1236
|
-
|
|
1278
|
+
let summaryText = transcript.summary;
|
|
1279
|
+
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1280
|
+
parts.push(`## Summary\n\n${summaryText}`);
|
|
1237
1281
|
}
|
|
1238
1282
|
else if (!hasTranscript && transcript.fullText) {
|
|
1239
1283
|
parts.push(`## Description\n\n${transcript.fullText}`);
|
|
@@ -1249,8 +1293,14 @@ async function youtubeExtractor(_html, url) {
|
|
|
1249
1293
|
parts.push(`## Chapters\n\n${chLines}`);
|
|
1250
1294
|
}
|
|
1251
1295
|
// Full Transcript section (only if we have real transcript segments)
|
|
1296
|
+
// Add intelligent paragraph breaks for readability
|
|
1252
1297
|
if (hasTranscript) {
|
|
1253
|
-
|
|
1298
|
+
let readableText = transcript.fullText;
|
|
1299
|
+
// Break into paragraphs: after sentence-ending punctuation followed by a capital letter
|
|
1300
|
+
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1301
|
+
// Collapse any triple+ newlines
|
|
1302
|
+
readableText = readableText.replace(/\n{3,}/g, '\n\n');
|
|
1303
|
+
parts.push(`## Full Transcript\n\n${readableText}`);
|
|
1254
1304
|
}
|
|
1255
1305
|
const cleanContent = parts.join('\n\n');
|
|
1256
1306
|
return { domain: 'youtube.com', type: 'video', structured, cleanContent };
|
package/dist/core/pipeline.js
CHANGED
|
@@ -161,22 +161,57 @@ export async function handleYouTube(ctx) {
|
|
|
161
161
|
const transcript = await getYouTubeTranscript(ctx.url, {
|
|
162
162
|
language: ctx.options.language ?? 'en',
|
|
163
163
|
});
|
|
164
|
+
// Format view count
|
|
165
|
+
let viewStr = '';
|
|
166
|
+
if (transcript.viewCount) {
|
|
167
|
+
const v = parseInt(transcript.viewCount, 10);
|
|
168
|
+
if (!isNaN(v)) {
|
|
169
|
+
if (v >= 1_000_000)
|
|
170
|
+
viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
|
|
171
|
+
else if (v >= 1_000)
|
|
172
|
+
viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
|
|
173
|
+
else
|
|
174
|
+
viewStr = `${v.toLocaleString()} views`;
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
// Format publish date
|
|
178
|
+
let publishStr = '';
|
|
179
|
+
if (transcript.publishDate) {
|
|
180
|
+
try {
|
|
181
|
+
const d = new Date(transcript.publishDate);
|
|
182
|
+
publishStr = d.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
|
|
183
|
+
}
|
|
184
|
+
catch {
|
|
185
|
+
publishStr = transcript.publishDate;
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
// Build header metadata line
|
|
189
|
+
const headerParts = [`**Channel:** ${transcript.channel}`];
|
|
190
|
+
if (transcript.duration && transcript.duration !== '0:00')
|
|
191
|
+
headerParts.push(`**Duration:** ${transcript.duration}`);
|
|
192
|
+
if (viewStr)
|
|
193
|
+
headerParts.push(`**${viewStr}**`);
|
|
194
|
+
if (publishStr)
|
|
195
|
+
headerParts.push(`**Published:** ${publishStr}`);
|
|
196
|
+
// Add paragraph breaks to transcript for readability
|
|
197
|
+
let readableText = transcript.fullText;
|
|
198
|
+
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
199
|
+
readableText = readableText.replace(/\n{3,}/g, '\n\n');
|
|
164
200
|
// Build a clean markdown representation of the video + transcript
|
|
165
|
-
const
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
const videoInfoContent = videoInfoLines.join('\n');
|
|
201
|
+
const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
|
|
202
|
+
if (transcript.summary) {
|
|
203
|
+
let summaryText = transcript.summary;
|
|
204
|
+
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
205
|
+
parts.push(`## Summary\n\n${summaryText}`);
|
|
206
|
+
}
|
|
207
|
+
if (transcript.keyPoints && transcript.keyPoints.length > 0) {
|
|
208
|
+
parts.push(`## Key Points\n\n${transcript.keyPoints.map(kp => `- ${kp}`).join('\n')}`);
|
|
209
|
+
}
|
|
210
|
+
if (transcript.chapters && transcript.chapters.length > 0) {
|
|
211
|
+
parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
|
|
212
|
+
}
|
|
213
|
+
parts.push(`## Full Transcript\n\n${readableText}`);
|
|
214
|
+
const videoInfoContent = parts.join('\n\n');
|
|
180
215
|
const elapsed = Date.now() - ytStartTime;
|
|
181
216
|
const tokens = estimateTokens(videoInfoContent);
|
|
182
217
|
const fingerprint = createHash('sha256').update(videoInfoContent).digest('hex').slice(0, 16);
|
package/dist/core/youtube.d.ts
CHANGED
|
@@ -42,6 +42,10 @@ export interface YouTubeTranscript {
|
|
|
42
42
|
summary?: string;
|
|
43
43
|
/** Total word count of transcript */
|
|
44
44
|
wordCount?: number;
|
|
45
|
+
/** View count (numeric string) */
|
|
46
|
+
viewCount?: string;
|
|
47
|
+
/** Like count (numeric string, may be empty) */
|
|
48
|
+
likeCount?: string;
|
|
45
49
|
}
|
|
46
50
|
export interface YouTubeVideoInfo {
|
|
47
51
|
videoId: string;
|
package/dist/core/youtube.js
CHANGED
|
@@ -6,6 +6,9 @@
|
|
|
6
6
|
* track URLs, fetch the timedtext XML, and return structured transcript data.
|
|
7
7
|
*/
|
|
8
8
|
import { execFile } from 'node:child_process';
|
|
9
|
+
import * as http from 'node:http';
|
|
10
|
+
import * as https from 'node:https';
|
|
11
|
+
import * as tls from 'node:tls';
|
|
9
12
|
import { readFile, unlink } from 'node:fs/promises';
|
|
10
13
|
import { tmpdir } from 'node:os';
|
|
11
14
|
import { join } from 'node:path';
|
|
@@ -231,6 +234,221 @@ export function extractSummary(fullText) {
|
|
|
231
234
|
return words.slice(0, 200).join(' ') + '...';
|
|
232
235
|
}
|
|
233
236
|
// ---------------------------------------------------------------------------
|
|
237
|
+
// Proxy-based InnerTube transcript extraction
|
|
238
|
+
// ---------------------------------------------------------------------------
|
|
239
|
+
// Webshare residential proxy config — reads from env vars on Render.
// Locally, falls back to direct fetch (residential IP already works).
/**
 * Read a positive integer from an environment variable.
 *
 * A missing, empty, non-numeric, zero or negative value yields `fallback`, so a
 * malformed setting can never leak `NaN` into port arithmetic or slot selection.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Value used when the variable is unusable.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function readPositiveIntEnv(name, fallback) {
    const parsed = Number.parseInt(process.env[name] ?? '', 10);
    return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}
const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
const PROXY_BASE_PORT = readPositiveIntEnv('WEBSHARE_PROXY_PORT', 10000);
// Empty credentials mean "proxy not configured" (see isProxyConfigured).
const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
// With paid Webshare backbone plan, each US slot has its own port:
// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
const PROXY_MAX_US_SLOTS = readPositiveIntEnv('WEBSHARE_PROXY_SLOTS', 44744);
|
|
248
|
+
/**
 * Tell whether proxy credentials are available.
 *
 * @returns {boolean} `true` only when both PROXY_USER and PROXY_PASS are non-empty.
 */
function isProxyConfigured() {
    if (!PROXY_USER) {
        return false;
    }
    return Boolean(PROXY_PASS);
}
|
|
251
|
+
/**
 * Send one HTTPS request to `targetUrl` through an HTTP CONNECT tunnel opened on
 * PROXY_HOST, authenticating with a specific slotted username (the caller builds
 * it, e.g. `${PROXY_USER}-US-5`) and PROXY_PASS.
 *
 * Callers reuse the same user/port pair for related requests (the /player call
 * and the caption XML fetch); presumably that keeps both on the same exit IP —
 * this depends on the proxy provider and cannot be confirmed from this file.
 *
 * The tunnel always targets port 443 and the request is always sent over TLS,
 * whatever scheme or port `targetUrl` carries.
 *
 * @param {string} slottedUser - Proxy username used for Basic proxy auth.
 * @param {number} proxyPort - Port on PROXY_HOST to open the tunnel through.
 * @param {string} targetUrl - Absolute URL to request.
 * @param {{ method?: string, headers?: Record<string, string>, body?: string | Buffer, timeoutMs?: number }} [opts]
 *   Request options; `timeoutMs` defaults to 20000 and covers the whole exchange.
 * @returns {Promise<{ status: number, body: string }>} Resolves for any HTTP status
 *   (non-2xx is not an error); rejects on proxy refusal, socket/TLS errors or timeout.
 */
function proxyRequestSlotted(slottedUser, proxyPort, targetUrl, opts = {}) {
    const url = new URL(targetUrl);
    const timeout = opts.timeoutMs ?? 20000;
    return new Promise((resolve, reject) => {
        // Every request/socket opened for this call, so a failure or timeout can
        // tear all of them down instead of leaving the tunnel open.
        const openHandles = [];
        let settled = false;
        let timer = null;
        const succeed = (result) => {
            if (settled)
                return;
            settled = true;
            clearTimeout(timer);
            resolve(result);
        };
        const fail = (err) => {
            if (settled)
                return;
            settled = true;
            clearTimeout(timer);
            for (const handle of openHandles)
                handle.destroy();
            reject(err);
        };
        const proxyAuth = Buffer.from(`${slottedUser}:${PROXY_PASS}`).toString('base64');
        const proxyReq = http.request({
            host: PROXY_HOST,
            port: proxyPort,
            method: 'CONNECT',
            path: `${url.hostname}:443`,
            headers: { 'Proxy-Authorization': `Basic ${proxyAuth}` },
        });
        openHandles.push(proxyReq);
        timer = setTimeout(() => fail(new Error('Proxy request timed out')), timeout);
        proxyReq.on('connect', (res, socket) => {
            openHandles.push(socket);
            if (res.statusCode !== 200) {
                fail(new Error(`Proxy CONNECT failed: ${res.statusCode}`));
                return;
            }
            // Layer TLS on top of the raw tunnel socket, then issue the real request over it.
            const tlsSocket = tls.connect({ host: url.hostname, socket, servername: url.hostname }, () => {
                const reqHeaders = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Cookie': 'CONSENT=YES+; SOCS=CAI',
                    // Caller-supplied headers override the defaults above.
                    ...(opts.headers ?? {}),
                };
                const req = https.request({
                    hostname: url.hostname,
                    path: url.pathname + url.search,
                    method: opts.method ?? 'GET',
                    createConnection: () => tlsSocket,
                    headers: reqHeaders,
                }, (response) => {
                    // Decode as a stream so a multi-byte UTF-8 character split across
                    // two chunks is not turned into replacement characters.
                    response.setEncoding('utf8');
                    let data = '';
                    response.on('data', (chunk) => {
                        data += chunk;
                    });
                    response.on('end', () => {
                        succeed({ status: response.statusCode ?? 0, body: data });
                    });
                    response.on('error', fail);
                });
                openHandles.push(req);
                req.on('error', fail);
                if (opts.body)
                    req.write(opts.body);
                req.end();
            });
            openHandles.push(tlsSocket);
            tlsSocket.on('error', fail);
        });
        proxyReq.on('error', fail);
        proxyReq.end();
    });
}
|
|
322
|
+
/**
 * Fetch a YouTube transcript via the InnerTube /player API through the Webshare proxy.
 *
 * This replicates the approach used by the Python `youtube-transcript-api` library:
 *   1. POST to /youtubei/v1/player with ANDROID client context
 *   2. Get caption track URLs WITHOUT the `exp=xpe` parameter
 *   3. Fetch caption XML from those clean URLs (returns actual data, not 0 bytes)
 *
 * Up to MAX_RETRIES randomly chosen proxy slots are tried. Every failure mode
 * (non-200 /player response, no caption tracks, unusable caption URL, empty or
 * blocked caption XML, unparseable XML, thrown error) is logged and moves on to
 * the next slot.
 *
 * @param {string} videoId - YouTube video id sent to the /player endpoint.
 * @param {string} preferredLang - Preferred caption language code; falls back to
 *   'en', then to the first available track.
 * @returns {Promise<object|null>} Transcript object (segments, fullText, metadata,
 *   viewCount/likeCount when present) or `null` when every attempt failed.
 */
async function getTranscriptViaProxy(videoId, preferredLang) {
    // Try multiple proxy slots from the US residential pool.
    // Pick random slots across the pool for even distribution and to avoid
    // rate-limited IPs. Try up to MAX_RETRIES different slots.
    const MAX_RETRIES = 5;
    const usedSlots = new Set();
    const INNERTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8';
    // Convert a caption time attribute to seconds. The unit is decided by the
    // caption format rather than by whether the value has a decimal point:
    // a whole-second value such as start="12" must not be mistaken for 12 ms.
    const toSeconds = (raw, isMilliseconds) => {
        const value = Number.parseFloat(raw);
        if (!Number.isFinite(value))
            return 0;
        return isMilliseconds ? value / 1000 : value;
    };
    for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
        // Pick a random US slot we haven't tried yet (a repeat is accepted only
        // once every slot in the pool has already been used).
        let slot;
        do {
            slot = Math.floor(Math.random() * PROXY_MAX_US_SLOTS) + 1;
        } while (usedSlots.has(slot) && usedSlots.size < PROXY_MAX_US_SLOTS);
        usedSlots.add(slot);
        const proxyUser = `${PROXY_USER}-US-${slot}`;
        const proxyPort = PROXY_BASE_PORT + slot - 1;
        const doProxyRequest = (url, opts = {}) => proxyRequestSlotted(proxyUser, proxyPort, url, opts);
        try {
            // Step 1: Call InnerTube /player with ANDROID client
            // ANDROID client returns caption URLs WITHOUT exp=xpe (avoids 0-byte responses).
            const playerResp = await doProxyRequest(`https://www.youtube.com/youtubei/v1/player?key=${INNERTUBE_API_KEY}`, {
                method: 'POST',
                body: JSON.stringify({
                    context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
                    videoId,
                }),
                headers: { 'Content-Type': 'application/json' },
            });
            if (playerResp.status !== 200) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): /player returned ${playerResp.status}`);
                continue;
            }
            const playerData = JSON.parse(playerResp.body);
            const captionTracks = playerData?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
            if (!captionTracks || captionTracks.length === 0) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no caption tracks`);
                continue;
            }
            // Pick best matching language track
            let track = captionTracks.find((t) => t.languageCode === preferredLang);
            if (!track) {
                track = captionTracks.find((t) => t.languageCode === 'en') ?? captionTracks[0];
            }
            const captionUrl = track.baseUrl;
            if (typeof captionUrl !== 'string' || captionUrl === '') {
                // Report the real cause instead of an opaque TypeError from the catch below.
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption track has no baseUrl`);
                continue;
            }
            if (captionUrl.includes('exp=xpe')) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption URL has exp=xpe, skipping`);
                continue;
            }
            // Step 2: Fetch caption XML through the SAME proxy slot (same residential IP)
            const capResp = await doProxyRequest(captionUrl);
            if (!capResp.body ||
                capResp.body.length === 0 ||
                capResp.status === 429 ||
                capResp.body.includes('<title>Sorry...</title>')) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption XML failed (status=${capResp.status}, bytes=${capResp.body?.length ?? 0})`);
                continue; // Try next slot
            }
            // Parse XML segments — handles both <text start="" dur=""> (seconds)
            // and <p t="" d=""> (milliseconds) formats. Group 1 is the tag name.
            const xmlSegments = [
                ...capResp.body.matchAll(/<(text|p)\s[^>]*?(?:start|t)="([^"]*)"[^>]*?(?:dur|d)="([^"]*)"[^>]*>([\s\S]*?)<\/(?:text|p)>/g),
            ];
            if (xmlSegments.length === 0) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no segments parsed from XML`);
                continue;
            }
            const segments = xmlSegments
                .map((m) => {
                    const isMilliseconds = m[1] === 'p';
                    return {
                        // Strip inline markup, flatten newlines, then decode entities.
                        text: decodeHtmlEntities(m[4].replace(/<[^>]+>/g, '').replace(/\n/g, ' ').trim()),
                        start: toSeconds(m[2], isMilliseconds),
                        duration: toSeconds(m[3], isMilliseconds),
                    };
                })
                .filter((s) => s.text.length > 0);
            if (segments.length === 0)
                continue;
            // Extract metadata from player response
            const vd = playerData.videoDetails ?? {};
            const mf = playerData.microformat?.playerMicroformatRenderer ?? {};
            const title = vd.title ?? '';
            const channel = vd.author ?? '';
            const lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
            const description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
            const publishDate = mf.publishDate ?? mf.uploadDate ?? '';
            const availableLanguages = captionTracks.map((t) => t.languageCode);
            const fullText = segments.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim();
            const wordCount = fullText.split(/\s+/).filter(Boolean).length;
            const chapters = parseChaptersFromDescription(description);
            const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
            const summary = extractSummary(fullText);
            const viewCount = vd.viewCount ?? mf.viewCount ?? '';
            const likeCount = vd.likeCount ?? '';
            console.log(`[webpeel] [youtube] Proxy slot ${slot} success: ${segments.length} segments, ${wordCount} words`);
            return {
                videoId,
                title,
                channel,
                duration: formatDuration(lengthSeconds),
                language: track.languageCode ?? preferredLang,
                segments,
                fullText,
                availableLanguages,
                description,
                publishDate,
                // Empty collections / strings are reported as undefined.
                chapters: chapters.length > 0 ? chapters : undefined,
                keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                summary,
                wordCount,
                viewCount: viewCount || undefined,
                likeCount: likeCount || undefined,
            };
        }
        catch (err) {
            console.log(`[webpeel] [youtube] Proxy slot ${slot} error:`, err?.message);
            continue;
        }
    }
    // All slots exhausted
    console.log('[webpeel] [youtube] All proxy slots exhausted');
    return null;
}
|
|
451
|
+
// ---------------------------------------------------------------------------
|
|
234
452
|
// Transcript extraction
|
|
235
453
|
// ---------------------------------------------------------------------------
|
|
236
454
|
/**
|
|
@@ -246,6 +464,24 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
246
464
|
}
|
|
247
465
|
const preferredLang = options.language ?? 'en';
|
|
248
466
|
const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
|
|
467
|
+
// --- Path P: Proxy-based InnerTube (primary for cloud servers) ---
|
|
468
|
+
// Uses Webshare residential proxy + ANDROID InnerTube /player API.
|
|
469
|
+
// This is the approach used by every major YouTube transcript service
|
|
470
|
+
// (youtubetotranscript.com, youtube-transcript.io, etc.)
|
|
471
|
+
if (!process.env.VITEST && isProxyConfigured()) {
|
|
472
|
+
console.log('[webpeel] [youtube] Trying path P: proxy-based InnerTube (residential proxy)');
|
|
473
|
+
try {
|
|
474
|
+
const proxyResult = await getTranscriptViaProxy(videoId, preferredLang);
|
|
475
|
+
if (proxyResult && proxyResult.segments.length > 0) {
|
|
476
|
+
console.log(`[webpeel] [youtube] Path P success: ${proxyResult.segments.length} segments, ${proxyResult.wordCount} words`);
|
|
477
|
+
return proxyResult;
|
|
478
|
+
}
|
|
479
|
+
console.log('[webpeel] [youtube] Path P returned empty/null, falling through');
|
|
480
|
+
}
|
|
481
|
+
catch (err) {
|
|
482
|
+
console.log('[webpeel] [youtube] Path P failed:', err?.message);
|
|
483
|
+
}
|
|
484
|
+
}
|
|
249
485
|
// --- Path 0: youtube-transcript-plus (fastest — uses InnerTube API, ~1s) ---
|
|
250
486
|
// This library calls YouTube's internal InnerTube API directly via POST request,
|
|
251
487
|
// bypassing the IP-locked timedtext XML URLs. Works reliably from cloud servers.
|
|
@@ -309,6 +545,8 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
309
545
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
310
546
|
summary,
|
|
311
547
|
wordCount,
|
|
548
|
+
viewCount: undefined, // not available in this path without extra fetch
|
|
549
|
+
likeCount: undefined,
|
|
312
550
|
};
|
|
313
551
|
}
|
|
314
552
|
console.log('[webpeel] [youtube] Path 0 returned empty segments');
|
|
@@ -417,6 +655,8 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
417
655
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
418
656
|
summary,
|
|
419
657
|
wordCount,
|
|
658
|
+
viewCount: (videoDetails.viewCount ?? microformat.viewCount ?? '') || undefined,
|
|
659
|
+
likeCount: (videoDetails.likeCount ?? '') || undefined,
|
|
420
660
|
};
|
|
421
661
|
}
|
|
422
662
|
catch (err) {
|
|
@@ -529,6 +769,8 @@ async function getTranscriptViaYtDlp(videoId, preferredLang) {
|
|
|
529
769
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
530
770
|
summary,
|
|
531
771
|
wordCount,
|
|
772
|
+
viewCount: (infoData.view_count?.toString() ?? '') || undefined,
|
|
773
|
+
likeCount: (infoData.like_count?.toString() ?? '') || undefined,
|
|
532
774
|
});
|
|
533
775
|
}
|
|
534
776
|
catch {
|
|
@@ -655,6 +897,8 @@ async function getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang
|
|
|
655
897
|
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
656
898
|
summary,
|
|
657
899
|
wordCount,
|
|
900
|
+
viewCount: undefined, // browser path doesn't reliably get this
|
|
901
|
+
likeCount: undefined,
|
|
658
902
|
};
|
|
659
903
|
}
|
|
660
904
|
finally {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.14",
|
|
3
|
+
"version": "0.20.18",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|