webpeel 0.20.13 → 0.20.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +12 -4
- package/dist/core/youtube.js +405 -20
- package/dist/server/app.js +1 -1
- package/package.json +4 -2
|
@@ -1165,8 +1165,8 @@ async function youtubeExtractor(_html, url) {
|
|
|
1165
1165
|
]);
|
|
1166
1166
|
}
|
|
1167
1167
|
// Run transcript fetch and oEmbed fetch in parallel
|
|
1168
|
-
//
|
|
1169
|
-
const transcriptPromise = withTimeout(getYouTubeTranscript(url),
|
|
1168
|
+
// Proxy-based extraction takes 2-5s, but retry logic may need more time
|
|
1169
|
+
const transcriptPromise = withTimeout(getYouTubeTranscript(url), 30000);
|
|
1170
1170
|
const oembedPromise = fetchJson(`https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`);
|
|
1171
1171
|
const noembedPromise = fetchJson(`https://noembed.com/embed?url=${encodeURIComponent(url)}`).catch(() => null);
|
|
1172
1172
|
const [transcriptResult, oembedResult, noembedResult] = await Promise.allSettled([
|
|
@@ -1233,7 +1233,9 @@ async function youtubeExtractor(_html, url) {
|
|
|
1233
1233
|
parts.push(headerLine);
|
|
1234
1234
|
// Summary section
|
|
1235
1235
|
if (transcript.summary && hasTranscript) {
|
|
1236
|
-
|
|
1236
|
+
let summaryText = transcript.summary;
|
|
1237
|
+
summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1238
|
+
parts.push(`## Summary\n\n${summaryText}`);
|
|
1237
1239
|
}
|
|
1238
1240
|
else if (!hasTranscript && transcript.fullText) {
|
|
1239
1241
|
parts.push(`## Description\n\n${transcript.fullText}`);
|
|
@@ -1249,8 +1251,14 @@ async function youtubeExtractor(_html, url) {
|
|
|
1249
1251
|
parts.push(`## Chapters\n\n${chLines}`);
|
|
1250
1252
|
}
|
|
1251
1253
|
// Full Transcript section (only if we have real transcript segments)
|
|
1254
|
+
// Add intelligent paragraph breaks for readability
|
|
1252
1255
|
if (hasTranscript) {
|
|
1253
|
-
|
|
1256
|
+
let readableText = transcript.fullText;
|
|
1257
|
+
// Break into paragraphs: after sentence-ending punctuation followed by a capital letter
|
|
1258
|
+
readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
|
|
1259
|
+
// Collapse any triple+ newlines
|
|
1260
|
+
readableText = readableText.replace(/\n{3,}/g, '\n\n');
|
|
1261
|
+
parts.push(`## Full Transcript\n\n${readableText}`);
|
|
1254
1262
|
}
|
|
1255
1263
|
const cleanContent = parts.join('\n\n');
|
|
1256
1264
|
return { domain: 'youtube.com', type: 'video', structured, cleanContent };
|
package/dist/core/youtube.js
CHANGED
|
@@ -6,12 +6,38 @@
|
|
|
6
6
|
* track URLs, fetch the timedtext XML, and return structured transcript data.
|
|
7
7
|
*/
|
|
8
8
|
import { execFile } from 'node:child_process';
|
|
9
|
+
import * as http from 'node:http';
|
|
10
|
+
import * as https from 'node:https';
|
|
11
|
+
import * as tls from 'node:tls';
|
|
9
12
|
import { readFile, unlink } from 'node:fs/promises';
|
|
10
13
|
import { tmpdir } from 'node:os';
|
|
11
14
|
import { join } from 'node:path';
|
|
15
|
+
import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
|
|
12
16
|
import { simpleFetch } from './fetcher.js';
|
|
13
17
|
import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
14
18
|
// ---------------------------------------------------------------------------
|
|
19
|
+
// yt-dlp startup diagnostics
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Check yt-dlp availability on startup.
|
|
22
|
+
// Skipped in test environments (VITEST) to avoid interfering with mocked paths.
|
|
23
|
+
let ytdlpAvailable = false;
|
|
24
|
+
(async () => {
|
|
25
|
+
if (process.env.VITEST)
|
|
26
|
+
return;
|
|
27
|
+
try {
|
|
28
|
+
const { execFileSync } = await import('node:child_process');
|
|
29
|
+
const version = execFileSync('yt-dlp', ['--version'], {
|
|
30
|
+
timeout: 5000,
|
|
31
|
+
env: { ...process.env, PATH: `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}` },
|
|
32
|
+
}).toString().trim();
|
|
33
|
+
ytdlpAvailable = true;
|
|
34
|
+
console.log(`[webpeel] [youtube] yt-dlp available: v${version}`);
|
|
35
|
+
}
|
|
36
|
+
catch {
|
|
37
|
+
console.log('[webpeel] [youtube] yt-dlp NOT available — falling back to HTTP extraction');
|
|
38
|
+
}
|
|
39
|
+
})();
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
15
41
|
// URL parsing
|
|
16
42
|
// ---------------------------------------------------------------------------
|
|
17
43
|
/**
|
|
@@ -208,6 +234,217 @@ export function extractSummary(fullText) {
|
|
|
208
234
|
return words.slice(0, 200).join(' ') + '...';
|
|
209
235
|
}
|
|
210
236
|
// ---------------------------------------------------------------------------
|
|
237
|
+
// Proxy-based InnerTube transcript extraction
|
|
238
|
+
// ---------------------------------------------------------------------------
|
|
239
|
+
// Webshare residential proxy config — reads from env vars on Render.
|
|
240
|
+
// Locally, falls back to direct fetch (residential IP already works).
|
|
241
|
+
const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
|
|
242
|
+
const PROXY_BASE_PORT = parseInt(process.env.WEBSHARE_PROXY_PORT || '10000', 10);
|
|
243
|
+
const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
|
|
244
|
+
const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
|
|
245
|
+
// With paid Webshare backbone plan, each US slot has its own port:
|
|
246
|
+
// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
|
|
247
|
+
const PROXY_MAX_US_SLOTS = parseInt(process.env.WEBSHARE_PROXY_SLOTS || '44744', 10);
|
|
248
|
+
function isProxyConfigured() {
|
|
249
|
+
return !!(PROXY_USER && PROXY_PASS);
|
|
250
|
+
}
|
|
251
|
+
/**
|
|
252
|
+
* Make an HTTP(S) request through the Webshare CONNECT proxy with a specific
|
|
253
|
+
* slotted username (e.g. "argtnlhz-US-5"). This ensures both the /player call
|
|
254
|
+
* and the caption XML fetch go through the same residential IP.
|
|
255
|
+
*/
|
|
256
|
+
function proxyRequestSlotted(slottedUser, proxyPort, targetUrl, opts = {}) {
|
|
257
|
+
const url = new URL(targetUrl);
|
|
258
|
+
const timeout = opts.timeoutMs ?? 20000;
|
|
259
|
+
return new Promise((resolve, reject) => {
|
|
260
|
+
const proxyAuth = Buffer.from(`${slottedUser}:${PROXY_PASS}`).toString('base64');
|
|
261
|
+
const proxyReq = http.request({
|
|
262
|
+
host: PROXY_HOST,
|
|
263
|
+
port: proxyPort,
|
|
264
|
+
method: 'CONNECT',
|
|
265
|
+
path: `${url.hostname}:443`,
|
|
266
|
+
headers: { 'Proxy-Authorization': `Basic ${proxyAuth}` },
|
|
267
|
+
});
|
|
268
|
+
const timer = setTimeout(() => {
|
|
269
|
+
proxyReq.destroy();
|
|
270
|
+
reject(new Error('Proxy request timed out'));
|
|
271
|
+
}, timeout);
|
|
272
|
+
proxyReq.on('connect', (res, socket) => {
|
|
273
|
+
if (res.statusCode !== 200) {
|
|
274
|
+
clearTimeout(timer);
|
|
275
|
+
socket.destroy();
|
|
276
|
+
reject(new Error(`Proxy CONNECT failed: ${res.statusCode}`));
|
|
277
|
+
return;
|
|
278
|
+
}
|
|
279
|
+
const tlsSocket = tls.connect({ host: url.hostname, socket, servername: url.hostname }, () => {
|
|
280
|
+
const reqHeaders = {
|
|
281
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
282
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
283
|
+
'Cookie': 'CONSENT=YES+; SOCS=CAI',
|
|
284
|
+
...(opts.headers ?? {}),
|
|
285
|
+
};
|
|
286
|
+
const req = https.request({
|
|
287
|
+
hostname: url.hostname,
|
|
288
|
+
path: url.pathname + url.search,
|
|
289
|
+
method: opts.method ?? 'GET',
|
|
290
|
+
createConnection: () => tlsSocket,
|
|
291
|
+
headers: reqHeaders,
|
|
292
|
+
}, (response) => {
|
|
293
|
+
let data = '';
|
|
294
|
+
response.on('data', (chunk) => {
|
|
295
|
+
data += chunk;
|
|
296
|
+
});
|
|
297
|
+
response.on('end', () => {
|
|
298
|
+
clearTimeout(timer);
|
|
299
|
+
resolve({ status: response.statusCode ?? 0, body: data });
|
|
300
|
+
});
|
|
301
|
+
});
|
|
302
|
+
req.on('error', (e) => {
|
|
303
|
+
clearTimeout(timer);
|
|
304
|
+
reject(e);
|
|
305
|
+
});
|
|
306
|
+
if (opts.body)
|
|
307
|
+
req.write(opts.body);
|
|
308
|
+
req.end();
|
|
309
|
+
});
|
|
310
|
+
tlsSocket.on('error', (e) => {
|
|
311
|
+
clearTimeout(timer);
|
|
312
|
+
reject(e);
|
|
313
|
+
});
|
|
314
|
+
});
|
|
315
|
+
proxyReq.on('error', (e) => {
|
|
316
|
+
clearTimeout(timer);
|
|
317
|
+
reject(e);
|
|
318
|
+
});
|
|
319
|
+
proxyReq.end();
|
|
320
|
+
});
|
|
321
|
+
}
|
|
322
|
+
/**
|
|
323
|
+
* Fetch YouTube transcript via InnerTube /player API through Webshare proxy.
|
|
324
|
+
*
|
|
325
|
+
* This replicates the approach used by the Python `youtube-transcript-api` library:
|
|
326
|
+
* 1. POST to /youtubei/v1/player with ANDROID client context
|
|
327
|
+
* 2. Get caption track URLs WITHOUT the `exp=xpe` parameter
|
|
328
|
+
* 3. Fetch caption XML from those clean URLs (returns actual data, not 0 bytes)
|
|
329
|
+
*
|
|
330
|
+
* All requests go through the residential proxy to bypass YouTube's cloud IP blocking.
|
|
331
|
+
*/
|
|
332
|
+
async function getTranscriptViaProxy(videoId, preferredLang) {
|
|
333
|
+
// Try multiple proxy slots from the 44K+ US residential pool.
|
|
334
|
+
// Pick random slots across the pool for even distribution and to avoid
|
|
335
|
+
// rate-limited IPs. Try up to MAX_RETRIES different slots.
|
|
336
|
+
const MAX_RETRIES = 5;
|
|
337
|
+
const usedSlots = new Set();
|
|
338
|
+
const INNERTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8';
|
|
339
|
+
for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
|
|
340
|
+
// Pick a random US slot we haven't tried yet
|
|
341
|
+
let slot;
|
|
342
|
+
do {
|
|
343
|
+
slot = Math.floor(Math.random() * PROXY_MAX_US_SLOTS) + 1;
|
|
344
|
+
} while (usedSlots.has(slot) && usedSlots.size < PROXY_MAX_US_SLOTS);
|
|
345
|
+
usedSlots.add(slot);
|
|
346
|
+
const proxyUser = `${PROXY_USER}-US-${slot}`;
|
|
347
|
+
const proxyPort = PROXY_BASE_PORT + slot - 1;
|
|
348
|
+
const doProxyRequest = (url, opts = {}) => proxyRequestSlotted(proxyUser, proxyPort, url, opts);
|
|
349
|
+
try {
|
|
350
|
+
// Step 1: Call InnerTube /player with ANDROID client
|
|
351
|
+
// ANDROID client returns caption URLs WITHOUT exp=xpe (avoids 0-byte responses).
|
|
352
|
+
const playerResp = await doProxyRequest(`https://www.youtube.com/youtubei/v1/player?key=${INNERTUBE_API_KEY}`, {
|
|
353
|
+
method: 'POST',
|
|
354
|
+
body: JSON.stringify({
|
|
355
|
+
context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
|
|
356
|
+
videoId,
|
|
357
|
+
}),
|
|
358
|
+
headers: { 'Content-Type': 'application/json' },
|
|
359
|
+
});
|
|
360
|
+
if (playerResp.status !== 200) {
|
|
361
|
+
console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): /player returned ${playerResp.status}`);
|
|
362
|
+
continue;
|
|
363
|
+
}
|
|
364
|
+
const playerData = JSON.parse(playerResp.body);
|
|
365
|
+
const captionTracks = playerData?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
|
|
366
|
+
if (!captionTracks || captionTracks.length === 0) {
|
|
367
|
+
console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no caption tracks`);
|
|
368
|
+
continue;
|
|
369
|
+
}
|
|
370
|
+
// Pick best matching language track
|
|
371
|
+
let track = captionTracks.find((t) => t.languageCode === preferredLang);
|
|
372
|
+
if (!track) {
|
|
373
|
+
track = captionTracks.find((t) => t.languageCode === 'en') ?? captionTracks[0];
|
|
374
|
+
}
|
|
375
|
+
const captionUrl = track.baseUrl;
|
|
376
|
+
if (captionUrl.includes('exp=xpe')) {
|
|
377
|
+
console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption URL has exp=xpe, skipping`);
|
|
378
|
+
continue;
|
|
379
|
+
}
|
|
380
|
+
// Step 2: Fetch caption XML through the SAME proxy slot (same residential IP)
|
|
381
|
+
const capResp = await doProxyRequest(captionUrl);
|
|
382
|
+
if (!capResp.body ||
|
|
383
|
+
capResp.body.length === 0 ||
|
|
384
|
+
capResp.status === 429 ||
|
|
385
|
+
capResp.body.includes('<title>Sorry...</title>')) {
|
|
386
|
+
console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption XML failed (status=${capResp.status}, bytes=${capResp.body?.length ?? 0})`);
|
|
387
|
+
continue; // Try next slot
|
|
388
|
+
}
|
|
389
|
+
// Parse XML segments — handles both <text start="" dur=""> and <p t="" d=""> formats
|
|
390
|
+
const xmlSegments = [
|
|
391
|
+
...capResp.body.matchAll(/<(?:text|p)\s[^>]*?(?:start|t)="([^"]*)"[^>]*?(?:dur|d)="([^"]*)"[^>]*>([\s\S]*?)<\/(?:text|p)>/g),
|
|
392
|
+
];
|
|
393
|
+
if (xmlSegments.length === 0) {
|
|
394
|
+
console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no segments parsed from XML`);
|
|
395
|
+
continue;
|
|
396
|
+
}
|
|
397
|
+
const segments = xmlSegments
|
|
398
|
+
.map((m) => ({
|
|
399
|
+
text: decodeHtmlEntities(m[3].replace(/<[^>]+>/g, '').replace(/\n/g, ' ').trim()),
|
|
400
|
+
start: parseFloat(m[1]) / (m[1].includes('.') ? 1 : 1000),
|
|
401
|
+
duration: parseFloat(m[2]) / (m[2].includes('.') ? 1 : 1000),
|
|
402
|
+
}))
|
|
403
|
+
.filter((s) => s.text.length > 0);
|
|
404
|
+
if (segments.length === 0)
|
|
405
|
+
continue;
|
|
406
|
+
// Extract metadata from player response
|
|
407
|
+
const vd = playerData.videoDetails ?? {};
|
|
408
|
+
const mf = playerData.microformat?.playerMicroformatRenderer ?? {};
|
|
409
|
+
const title = vd.title ?? '';
|
|
410
|
+
const channel = vd.author ?? '';
|
|
411
|
+
const lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
|
|
412
|
+
const description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
|
|
413
|
+
const publishDate = mf.publishDate ?? mf.uploadDate ?? '';
|
|
414
|
+
const availableLanguages = captionTracks.map((t) => t.languageCode);
|
|
415
|
+
const fullText = segments.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim();
|
|
416
|
+
const wordCount = fullText.split(/\s+/).filter(Boolean).length;
|
|
417
|
+
const chapters = parseChaptersFromDescription(description);
|
|
418
|
+
const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
|
|
419
|
+
const summary = extractSummary(fullText);
|
|
420
|
+
console.log(`[webpeel] [youtube] Proxy slot ${slot} success: ${segments.length} segments, ${wordCount} words`);
|
|
421
|
+
return {
|
|
422
|
+
videoId,
|
|
423
|
+
title,
|
|
424
|
+
channel,
|
|
425
|
+
duration: formatDuration(lengthSeconds),
|
|
426
|
+
language: track.languageCode ?? preferredLang,
|
|
427
|
+
segments,
|
|
428
|
+
fullText,
|
|
429
|
+
availableLanguages,
|
|
430
|
+
description,
|
|
431
|
+
publishDate,
|
|
432
|
+
chapters: chapters.length > 0 ? chapters : undefined,
|
|
433
|
+
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
434
|
+
summary,
|
|
435
|
+
wordCount,
|
|
436
|
+
};
|
|
437
|
+
}
|
|
438
|
+
catch (err) {
|
|
439
|
+
console.log(`[webpeel] [youtube] Proxy slot ${slot} error:`, err?.message);
|
|
440
|
+
continue;
|
|
441
|
+
}
|
|
442
|
+
}
|
|
443
|
+
// All slots exhausted
|
|
444
|
+
console.log('[webpeel] [youtube] All proxy slots exhausted');
|
|
445
|
+
return null;
|
|
446
|
+
}
|
|
447
|
+
// ---------------------------------------------------------------------------
|
|
211
448
|
// Transcript extraction
|
|
212
449
|
// ---------------------------------------------------------------------------
|
|
213
450
|
/**
|
|
@@ -223,17 +460,149 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
223
460
|
}
|
|
224
461
|
const preferredLang = options.language ?? 'en';
|
|
225
462
|
const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
|
|
226
|
-
// --- Path
|
|
227
|
-
//
|
|
228
|
-
//
|
|
463
|
+
// --- Path P: Proxy-based InnerTube (primary for cloud servers) ---
|
|
464
|
+
// Uses Webshare residential proxy + ANDROID InnerTube /player API.
|
|
465
|
+
// This is the approach used by every major YouTube transcript service
|
|
466
|
+
// (youtubetotranscript.com, youtube-transcript.io, etc.)
|
|
467
|
+
if (!process.env.VITEST && isProxyConfigured()) {
|
|
468
|
+
console.log('[webpeel] [youtube] Trying path P: proxy-based InnerTube (residential proxy)');
|
|
469
|
+
try {
|
|
470
|
+
const proxyResult = await getTranscriptViaProxy(videoId, preferredLang);
|
|
471
|
+
if (proxyResult && proxyResult.segments.length > 0) {
|
|
472
|
+
console.log(`[webpeel] [youtube] Path P success: ${proxyResult.segments.length} segments, ${proxyResult.wordCount} words`);
|
|
473
|
+
return proxyResult;
|
|
474
|
+
}
|
|
475
|
+
console.log('[webpeel] [youtube] Path P returned empty/null, falling through');
|
|
476
|
+
}
|
|
477
|
+
catch (err) {
|
|
478
|
+
console.log('[webpeel] [youtube] Path P failed:', err?.message);
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
// --- Path 0: youtube-transcript-plus (fastest — uses InnerTube API, ~1s) ---
|
|
482
|
+
// This library calls YouTube's internal InnerTube API directly via POST request,
|
|
483
|
+
// bypassing the IP-locked timedtext XML URLs. Works reliably from cloud servers.
|
|
484
|
+
// Skip in test mode — tests use mocked HTTP, but this path makes real InnerTube calls.
|
|
485
|
+
if (!process.env.VITEST) {
|
|
486
|
+
console.log('[webpeel] [youtube] Trying path 0: youtube-transcript-plus (InnerTube API)');
|
|
487
|
+
try {
|
|
488
|
+
const ytpSegments = await ytpFetchTranscript(videoId, { lang: preferredLang });
|
|
489
|
+
if (ytpSegments && ytpSegments.length > 0) {
|
|
490
|
+
// We have transcript segments — now fetch page metadata (title, channel, etc.)
|
|
491
|
+
let title = '', channel = '', lengthSeconds = 0, description = '', publishDate = '';
|
|
492
|
+
let availableLanguages = [preferredLang];
|
|
493
|
+
try {
|
|
494
|
+
const metaResp = await fetch(videoUrl, {
|
|
495
|
+
headers: {
|
|
496
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
497
|
+
'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
|
|
498
|
+
},
|
|
499
|
+
signal: AbortSignal.timeout(8000),
|
|
500
|
+
});
|
|
501
|
+
const html = await metaResp.text();
|
|
502
|
+
const pr = extractPlayerResponse(html);
|
|
503
|
+
if (pr) {
|
|
504
|
+
const vd = pr.videoDetails ?? {};
|
|
505
|
+
const mf = pr.microformat?.playerMicroformatRenderer ?? {};
|
|
506
|
+
title = vd.title ?? '';
|
|
507
|
+
channel = vd.author ?? '';
|
|
508
|
+
lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
|
|
509
|
+
description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
|
|
510
|
+
publishDate = mf.publishDate ?? mf.uploadDate ?? '';
|
|
511
|
+
const tracks = extractCaptionTracks(pr);
|
|
512
|
+
if (tracks.length > 0)
|
|
513
|
+
availableLanguages = tracks.map(t => t.languageCode);
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
catch { /* metadata fetch failed — segments are enough */ }
|
|
517
|
+
// Convert youtube-transcript-plus format to our format
|
|
518
|
+
const segments = ytpSegments.map(s => ({
|
|
519
|
+
text: decodeHtmlEntities((s.text ?? '').replace(/\n/g, ' ').trim()),
|
|
520
|
+
start: (s.offset ?? 0) / 1000, // offset is in ms
|
|
521
|
+
duration: (s.duration ?? 0) / 1000,
|
|
522
|
+
})).filter(s => s.text.length > 0);
|
|
523
|
+
const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
|
|
524
|
+
const wordCount = fullText.split(/\s+/).filter(Boolean).length;
|
|
525
|
+
const chapters = parseChaptersFromDescription(description);
|
|
526
|
+
const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
|
|
527
|
+
const summary = extractSummary(fullText);
|
|
528
|
+
console.log(`[webpeel] [youtube] Path 0 success: ${segments.length} segments, ${wordCount} words`);
|
|
529
|
+
return {
|
|
530
|
+
videoId,
|
|
531
|
+
title,
|
|
532
|
+
channel,
|
|
533
|
+
duration: formatDuration(lengthSeconds),
|
|
534
|
+
language: ytpSegments[0]?.lang ?? preferredLang,
|
|
535
|
+
segments,
|
|
536
|
+
fullText,
|
|
537
|
+
availableLanguages,
|
|
538
|
+
description,
|
|
539
|
+
publishDate,
|
|
540
|
+
chapters: chapters.length > 0 ? chapters : undefined,
|
|
541
|
+
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
542
|
+
summary,
|
|
543
|
+
wordCount,
|
|
544
|
+
};
|
|
545
|
+
}
|
|
546
|
+
console.log('[webpeel] [youtube] Path 0 returned empty segments');
|
|
547
|
+
}
|
|
548
|
+
catch (err) {
|
|
549
|
+
console.log('[webpeel] [youtube] Path 0 failed:', err?.message);
|
|
550
|
+
}
|
|
551
|
+
} // end VITEST guard
|
|
229
552
|
const ytUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
|
|
230
553
|
const ytHeaders = {
|
|
231
554
|
'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
|
|
232
555
|
'Accept-Language': 'en-US,en;q=0.9',
|
|
233
556
|
};
|
|
557
|
+
// --- Path 1: yt-dlp approach (most reliable on cloud servers — handles signature challenges internally) ---
|
|
558
|
+
if (ytdlpAvailable) {
|
|
559
|
+
console.log('[webpeel] [youtube] Trying path 1: yt-dlp');
|
|
560
|
+
try {
|
|
561
|
+
const ytdlpResult = await getTranscriptViaYtDlp(videoId, preferredLang);
|
|
562
|
+
if (ytdlpResult && ytdlpResult.segments.length > 0) {
|
|
563
|
+
return ytdlpResult;
|
|
564
|
+
}
|
|
565
|
+
console.log('[webpeel] [youtube] Path 1 failed: yt-dlp returned no segments');
|
|
566
|
+
}
|
|
567
|
+
catch (err) {
|
|
568
|
+
console.log('[webpeel] [youtube] Path 1 failed:', err?.message);
|
|
569
|
+
}
|
|
570
|
+
}
|
|
571
|
+
else {
|
|
572
|
+
console.log('[webpeel] [youtube] Skipping path 1: yt-dlp not available');
|
|
573
|
+
}
|
|
574
|
+
// --- Path 2: HTTP fetch (simpleFetch first; if our challenge detection fires, fall back to native fetch) ---
|
|
575
|
+
// YouTube serves consent/challenge pages to server IPs without cookies.
|
|
576
|
+
// Setting SOCS consent cookie bypasses this — same approach as youtube-transcript npm.
|
|
577
|
+
// On cloud servers, simpleFetch may throw BlockedError due to our own challenge detection;
|
|
578
|
+
// in that case we retry with native fetch() which bypasses that guard.
|
|
579
|
+
console.log('[webpeel] [youtube] Trying path 2: native fetch');
|
|
234
580
|
try {
|
|
235
|
-
|
|
236
|
-
|
|
581
|
+
let html;
|
|
582
|
+
try {
|
|
583
|
+
const fetchResult = await simpleFetch(videoUrl, ytUserAgent, 15000, ytHeaders);
|
|
584
|
+
html = fetchResult.html;
|
|
585
|
+
}
|
|
586
|
+
catch (simpleFetchErr) {
|
|
587
|
+
// If our own challenge detection threw BlockedError, retry with raw native fetch
|
|
588
|
+
const errMsg = (simpleFetchErr?.message ?? '').toLowerCase();
|
|
589
|
+
const isBlocked = simpleFetchErr?.constructor?.name === 'BlockedError' ||
|
|
590
|
+
errMsg.includes('blocked') ||
|
|
591
|
+
errMsg.includes('challenge') ||
|
|
592
|
+
errMsg.includes('cloudflare');
|
|
593
|
+
if (!isBlocked)
|
|
594
|
+
throw simpleFetchErr;
|
|
595
|
+
console.log('[webpeel] [youtube] simpleFetch BlockedError — retrying with native fetch');
|
|
596
|
+
const fetchResponse = await fetch(videoUrl, {
|
|
597
|
+
headers: {
|
|
598
|
+
'User-Agent': ytUserAgent,
|
|
599
|
+
...ytHeaders,
|
|
600
|
+
},
|
|
601
|
+
redirect: 'follow',
|
|
602
|
+
signal: AbortSignal.timeout(15000),
|
|
603
|
+
});
|
|
604
|
+
html = await fetchResponse.text();
|
|
605
|
+
}
|
|
237
606
|
if (!html.includes('ytInitialPlayerResponse') && !html.includes('ytInitialData')) {
|
|
238
607
|
throw new Error('YouTube served non-video page (likely challenge/consent)');
|
|
239
608
|
}
|
|
@@ -257,7 +626,7 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
257
626
|
const segments = parseCaptionXml(captionXml);
|
|
258
627
|
if (segments.length === 0) {
|
|
259
628
|
// Caption URL returned empty content (common when ip=0.0.0.0 in signature)
|
|
260
|
-
// Fall through to
|
|
629
|
+
// Fall through to browser intercept path
|
|
261
630
|
throw new Error('Caption XML returned empty — session-locked URL');
|
|
262
631
|
}
|
|
263
632
|
const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
|
|
@@ -288,23 +657,14 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
288
657
|
if (msg.includes('No captions available') || msg.includes('Not a valid YouTube URL')) {
|
|
289
658
|
throw err;
|
|
290
659
|
}
|
|
660
|
+
console.log('[webpeel] [youtube] Path 2 failed:', msg);
|
|
291
661
|
// Network/parsing failures — fall through to browser intercept approach
|
|
292
662
|
}
|
|
293
|
-
// --- Path 2: yt-dlp approach (fast, reliable, handles signature challenges) ---
|
|
294
|
-
try {
|
|
295
|
-
const ytdlpResult = await getTranscriptViaYtDlp(videoId, preferredLang);
|
|
296
|
-
if (ytdlpResult && ytdlpResult.segments.length > 0) {
|
|
297
|
-
return ytdlpResult;
|
|
298
|
-
}
|
|
299
|
-
}
|
|
300
|
-
catch (err) {
|
|
301
|
-
if (process.env.DEBUG)
|
|
302
|
-
console.debug('[webpeel]', 'yt-dlp transcript failed:', err?.message);
|
|
303
|
-
}
|
|
304
663
|
// --- Path 3: Browser intercept approach ---
|
|
305
664
|
// YouTube's caption URLs are session-specific (they return empty when fetched
|
|
306
665
|
// from a different HTTP client). We intercept the timedtext network request
|
|
307
666
|
// that the YouTube player makes automatically when loading the page.
|
|
667
|
+
console.log('[webpeel] [youtube] Trying path 3: browser intercept');
|
|
308
668
|
return getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang);
|
|
309
669
|
}
|
|
310
670
|
/**
|
|
@@ -333,7 +693,7 @@ async function getTranscriptViaYtDlp(videoId, preferredLang) {
|
|
|
333
693
|
...process.env,
|
|
334
694
|
PATH: `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}`,
|
|
335
695
|
};
|
|
336
|
-
const proc = execFile('yt-dlp', args, { timeout:
|
|
696
|
+
const proc = execFile('yt-dlp', args, { timeout: 60000, env: execEnv }, async (err) => {
|
|
337
697
|
try {
|
|
338
698
|
if (err) {
|
|
339
699
|
// yt-dlp not installed, timed out, or failed
|
|
@@ -687,10 +1047,35 @@ function selectBestTrack(tracks, preferredLang) {
|
|
|
687
1047
|
/**
|
|
688
1048
|
* Fetch the caption XML from YouTube's timedtext API.
|
|
689
1049
|
* Must use same cookies/UA as the page fetch — URLs are session-locked.
|
|
1050
|
+
* Tries simpleFetch first; falls back to native fetch() if BlockedError is thrown
|
|
1051
|
+
* (our own challenge detection fires on cloud server IPs).
|
|
690
1052
|
*/
|
|
691
1053
|
async function fetchCaptionXml(baseUrl, userAgent, headers) {
|
|
692
|
-
|
|
693
|
-
|
|
1054
|
+
try {
|
|
1055
|
+
const result = await simpleFetch(baseUrl, userAgent, 10000, headers);
|
|
1056
|
+
return result.html;
|
|
1057
|
+
}
|
|
1058
|
+
catch (simpleFetchErr) {
|
|
1059
|
+
const errMsg = (simpleFetchErr?.message ?? '').toLowerCase();
|
|
1060
|
+
const isBlocked = simpleFetchErr?.constructor?.name === 'BlockedError' ||
|
|
1061
|
+
errMsg.includes('blocked') ||
|
|
1062
|
+
errMsg.includes('challenge') ||
|
|
1063
|
+
errMsg.includes('cloudflare');
|
|
1064
|
+
if (!isBlocked)
|
|
1065
|
+
throw simpleFetchErr;
|
|
1066
|
+
// BlockedError: retry with native fetch
|
|
1067
|
+
const fetchHeaders = {};
|
|
1068
|
+
if (userAgent)
|
|
1069
|
+
fetchHeaders['User-Agent'] = userAgent;
|
|
1070
|
+
if (headers)
|
|
1071
|
+
Object.assign(fetchHeaders, headers);
|
|
1072
|
+
const response = await fetch(baseUrl, {
|
|
1073
|
+
headers: fetchHeaders,
|
|
1074
|
+
redirect: 'follow',
|
|
1075
|
+
signal: AbortSignal.timeout(10000),
|
|
1076
|
+
});
|
|
1077
|
+
return response.text();
|
|
1078
|
+
}
|
|
694
1079
|
}
|
|
695
1080
|
/**
|
|
696
1081
|
* Parse YouTube caption XML into transcript segments.
|
package/dist/server/app.js
CHANGED
|
@@ -104,7 +104,7 @@ export function createApp(config = {}) {
|
|
|
104
104
|
else if (req.query?.render === 'true')
|
|
105
105
|
timeoutMs = 60000; // 1min for rendered fetches
|
|
106
106
|
else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
|
|
107
|
-
timeoutMs =
|
|
107
|
+
timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
|
|
108
108
|
req.setTimeout(timeoutMs);
|
|
109
109
|
res.setTimeout(timeoutMs, () => {
|
|
110
110
|
if (!res.headersSent) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.13",
|
|
3
|
+
"version": "0.20.17",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|
|
@@ -112,7 +112,9 @@
|
|
|
112
112
|
"resend": "^6.9.3",
|
|
113
113
|
"turndown": "^7.2.0",
|
|
114
114
|
"turndown-plugin-gfm": "^1.0.2",
|
|
115
|
-
"undici": "^7.2.0"
|
|
115
|
+
"undici": "^7.2.0",
|
|
116
|
+
"youtube-transcript": "^1.2.1",
|
|
117
|
+
"youtube-transcript-plus": "^1.2.0"
|
|
116
118
|
},
|
|
117
119
|
"optionalDependencies": {
|
|
118
120
|
"@sentry/node": "^7.120.4",
|