webpeel 0.20.13 → 0.20.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/youtube.js +173 -20
- package/dist/server/app.js +1 -1
- package/package.json +4 -2
package/dist/core/youtube.js
CHANGED
|
@@ -9,9 +9,32 @@ import { execFile } from 'node:child_process';
|
|
|
9
9
|
import { readFile, unlink } from 'node:fs/promises';
|
|
10
10
|
import { tmpdir } from 'node:os';
|
|
11
11
|
import { join } from 'node:path';
|
|
12
|
+
import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
|
|
12
13
|
import { simpleFetch } from './fetcher.js';
|
|
13
14
|
import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
14
15
|
// ---------------------------------------------------------------------------
|
|
16
|
+
// yt-dlp startup diagnostics
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Check yt-dlp availability on startup.
|
|
19
|
+
// Skipped in test environments (VITEST) to avoid interfering with mocked paths.
|
|
20
|
+
let ytdlpAvailable = false;
|
|
21
|
+
(async () => {
|
|
22
|
+
if (process.env.VITEST)
|
|
23
|
+
return;
|
|
24
|
+
try {
|
|
25
|
+
const { execFileSync } = await import('node:child_process');
|
|
26
|
+
const version = execFileSync('yt-dlp', ['--version'], {
|
|
27
|
+
timeout: 5000,
|
|
28
|
+
env: { ...process.env, PATH: `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}` },
|
|
29
|
+
}).toString().trim();
|
|
30
|
+
ytdlpAvailable = true;
|
|
31
|
+
console.log(`[webpeel] [youtube] yt-dlp available: v${version}`);
|
|
32
|
+
}
|
|
33
|
+
catch {
|
|
34
|
+
console.log('[webpeel] [youtube] yt-dlp NOT available — falling back to HTTP extraction');
|
|
35
|
+
}
|
|
36
|
+
})();
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
15
38
|
// URL parsing
|
|
16
39
|
// ---------------------------------------------------------------------------
|
|
17
40
|
/**
|
|
@@ -223,17 +246,131 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
223
246
|
}
|
|
224
247
|
const preferredLang = options.language ?? 'en';
|
|
225
248
|
const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
|
|
226
|
-
// --- Path
|
|
227
|
-
// YouTube
|
|
228
|
-
//
|
|
249
|
+
// --- Path 0: youtube-transcript-plus (fastest — uses InnerTube API, ~1s) ---
|
|
250
|
+
// This library calls YouTube's internal InnerTube API directly via POST request,
|
|
251
|
+
// bypassing the IP-locked timedtext XML URLs. Works reliably from cloud servers.
|
|
252
|
+
// Skip in test mode — tests use mocked HTTP, but this path makes real InnerTube calls.
|
|
253
|
+
if (!process.env.VITEST) {
|
|
254
|
+
console.log('[webpeel] [youtube] Trying path 0: youtube-transcript-plus (InnerTube API)');
|
|
255
|
+
try {
|
|
256
|
+
const ytpSegments = await ytpFetchTranscript(videoId, { lang: preferredLang });
|
|
257
|
+
if (ytpSegments && ytpSegments.length > 0) {
|
|
258
|
+
// We have transcript segments — now fetch page metadata (title, channel, etc.)
|
|
259
|
+
let title = '', channel = '', lengthSeconds = 0, description = '', publishDate = '';
|
|
260
|
+
let availableLanguages = [preferredLang];
|
|
261
|
+
try {
|
|
262
|
+
const metaResp = await fetch(videoUrl, {
|
|
263
|
+
headers: {
|
|
264
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
265
|
+
'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
|
|
266
|
+
},
|
|
267
|
+
signal: AbortSignal.timeout(8000),
|
|
268
|
+
});
|
|
269
|
+
const html = await metaResp.text();
|
|
270
|
+
const pr = extractPlayerResponse(html);
|
|
271
|
+
if (pr) {
|
|
272
|
+
const vd = pr.videoDetails ?? {};
|
|
273
|
+
const mf = pr.microformat?.playerMicroformatRenderer ?? {};
|
|
274
|
+
title = vd.title ?? '';
|
|
275
|
+
channel = vd.author ?? '';
|
|
276
|
+
lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
|
|
277
|
+
description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
|
|
278
|
+
publishDate = mf.publishDate ?? mf.uploadDate ?? '';
|
|
279
|
+
const tracks = extractCaptionTracks(pr);
|
|
280
|
+
if (tracks.length > 0)
|
|
281
|
+
availableLanguages = tracks.map(t => t.languageCode);
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
catch { /* metadata fetch failed — segments are enough */ }
|
|
285
|
+
// Convert youtube-transcript-plus format to our format
|
|
286
|
+
const segments = ytpSegments.map(s => ({
|
|
287
|
+
text: decodeHtmlEntities((s.text ?? '').replace(/\n/g, ' ').trim()),
|
|
288
|
+
start: (s.offset ?? 0) / 1000, // offset is in ms
|
|
289
|
+
duration: (s.duration ?? 0) / 1000,
|
|
290
|
+
})).filter(s => s.text.length > 0);
|
|
291
|
+
const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
|
|
292
|
+
const wordCount = fullText.split(/\s+/).filter(Boolean).length;
|
|
293
|
+
const chapters = parseChaptersFromDescription(description);
|
|
294
|
+
const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
|
|
295
|
+
const summary = extractSummary(fullText);
|
|
296
|
+
console.log(`[webpeel] [youtube] Path 0 success: ${segments.length} segments, ${wordCount} words`);
|
|
297
|
+
return {
|
|
298
|
+
videoId,
|
|
299
|
+
title,
|
|
300
|
+
channel,
|
|
301
|
+
duration: formatDuration(lengthSeconds),
|
|
302
|
+
language: ytpSegments[0]?.lang ?? preferredLang,
|
|
303
|
+
segments,
|
|
304
|
+
fullText,
|
|
305
|
+
availableLanguages,
|
|
306
|
+
description,
|
|
307
|
+
publishDate,
|
|
308
|
+
chapters: chapters.length > 0 ? chapters : undefined,
|
|
309
|
+
keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
|
|
310
|
+
summary,
|
|
311
|
+
wordCount,
|
|
312
|
+
};
|
|
313
|
+
}
|
|
314
|
+
console.log('[webpeel] [youtube] Path 0 returned empty segments');
|
|
315
|
+
}
|
|
316
|
+
catch (err) {
|
|
317
|
+
console.log('[webpeel] [youtube] Path 0 failed:', err?.message);
|
|
318
|
+
}
|
|
319
|
+
} // end VITEST guard
|
|
229
320
|
const ytUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
|
|
230
321
|
const ytHeaders = {
|
|
231
322
|
'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
|
|
232
323
|
'Accept-Language': 'en-US,en;q=0.9',
|
|
233
324
|
};
|
|
325
|
+
// --- Path 1: yt-dlp approach (most reliable on cloud servers — handles signature challenges internally) ---
|
|
326
|
+
if (ytdlpAvailable) {
|
|
327
|
+
console.log('[webpeel] [youtube] Trying path 1: yt-dlp');
|
|
328
|
+
try {
|
|
329
|
+
const ytdlpResult = await getTranscriptViaYtDlp(videoId, preferredLang);
|
|
330
|
+
if (ytdlpResult && ytdlpResult.segments.length > 0) {
|
|
331
|
+
return ytdlpResult;
|
|
332
|
+
}
|
|
333
|
+
console.log('[webpeel] [youtube] Path 1 failed: yt-dlp returned no segments');
|
|
334
|
+
}
|
|
335
|
+
catch (err) {
|
|
336
|
+
console.log('[webpeel] [youtube] Path 1 failed:', err?.message);
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
else {
|
|
340
|
+
console.log('[webpeel] [youtube] Skipping path 1: yt-dlp not available');
|
|
341
|
+
}
|
|
342
|
+
// --- Path 2: HTTP fetch (simpleFetch first; if our challenge detection fires, fall back to native fetch) ---
|
|
343
|
+
// YouTube serves consent/challenge pages to server IPs without cookies.
|
|
344
|
+
// Setting SOCS consent cookie bypasses this — same approach as youtube-transcript npm.
|
|
345
|
+
// On cloud servers, simpleFetch may throw BlockedError due to our own challenge detection;
|
|
346
|
+
// in that case we retry with native fetch() which bypasses that guard.
|
|
347
|
+
console.log('[webpeel] [youtube] Trying path 2: native fetch');
|
|
234
348
|
try {
|
|
235
|
-
|
|
236
|
-
|
|
349
|
+
let html;
|
|
350
|
+
try {
|
|
351
|
+
const fetchResult = await simpleFetch(videoUrl, ytUserAgent, 15000, ytHeaders);
|
|
352
|
+
html = fetchResult.html;
|
|
353
|
+
}
|
|
354
|
+
catch (simpleFetchErr) {
|
|
355
|
+
// If our own challenge detection threw BlockedError, retry with raw native fetch
|
|
356
|
+
const errMsg = (simpleFetchErr?.message ?? '').toLowerCase();
|
|
357
|
+
const isBlocked = simpleFetchErr?.constructor?.name === 'BlockedError' ||
|
|
358
|
+
errMsg.includes('blocked') ||
|
|
359
|
+
errMsg.includes('challenge') ||
|
|
360
|
+
errMsg.includes('cloudflare');
|
|
361
|
+
if (!isBlocked)
|
|
362
|
+
throw simpleFetchErr;
|
|
363
|
+
console.log('[webpeel] [youtube] simpleFetch BlockedError — retrying with native fetch');
|
|
364
|
+
const fetchResponse = await fetch(videoUrl, {
|
|
365
|
+
headers: {
|
|
366
|
+
'User-Agent': ytUserAgent,
|
|
367
|
+
...ytHeaders,
|
|
368
|
+
},
|
|
369
|
+
redirect: 'follow',
|
|
370
|
+
signal: AbortSignal.timeout(15000),
|
|
371
|
+
});
|
|
372
|
+
html = await fetchResponse.text();
|
|
373
|
+
}
|
|
237
374
|
if (!html.includes('ytInitialPlayerResponse') && !html.includes('ytInitialData')) {
|
|
238
375
|
throw new Error('YouTube served non-video page (likely challenge/consent)');
|
|
239
376
|
}
|
|
@@ -257,7 +394,7 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
257
394
|
const segments = parseCaptionXml(captionXml);
|
|
258
395
|
if (segments.length === 0) {
|
|
259
396
|
// Caption URL returned empty content (common when ip=0.0.0.0 in signature)
|
|
260
|
-
// Fall through to
|
|
397
|
+
// Fall through to browser intercept path
|
|
261
398
|
throw new Error('Caption XML returned empty — session-locked URL');
|
|
262
399
|
}
|
|
263
400
|
const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
|
|
@@ -288,23 +425,14 @@ export async function getYouTubeTranscript(url, options = {}) {
|
|
|
288
425
|
if (msg.includes('No captions available') || msg.includes('Not a valid YouTube URL')) {
|
|
289
426
|
throw err;
|
|
290
427
|
}
|
|
428
|
+
console.log('[webpeel] [youtube] Path 2 failed:', msg);
|
|
291
429
|
// Network/parsing failures — fall through to browser intercept approach
|
|
292
430
|
}
|
|
293
|
-
// --- Path 2: yt-dlp approach (fast, reliable, handles signature challenges) ---
|
|
294
|
-
try {
|
|
295
|
-
const ytdlpResult = await getTranscriptViaYtDlp(videoId, preferredLang);
|
|
296
|
-
if (ytdlpResult && ytdlpResult.segments.length > 0) {
|
|
297
|
-
return ytdlpResult;
|
|
298
|
-
}
|
|
299
|
-
}
|
|
300
|
-
catch (err) {
|
|
301
|
-
if (process.env.DEBUG)
|
|
302
|
-
console.debug('[webpeel]', 'yt-dlp transcript failed:', err?.message);
|
|
303
|
-
}
|
|
304
431
|
// --- Path 3: Browser intercept approach ---
|
|
305
432
|
// YouTube's caption URLs are session-specific (they return empty when fetched
|
|
306
433
|
// from a different HTTP client). We intercept the timedtext network request
|
|
307
434
|
// that the YouTube player makes automatically when loading the page.
|
|
435
|
+
console.log('[webpeel] [youtube] Trying path 3: browser intercept');
|
|
308
436
|
return getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang);
|
|
309
437
|
}
|
|
310
438
|
/**
|
|
@@ -333,7 +461,7 @@ async function getTranscriptViaYtDlp(videoId, preferredLang) {
|
|
|
333
461
|
...process.env,
|
|
334
462
|
PATH: `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}`,
|
|
335
463
|
};
|
|
336
|
-
const proc = execFile('yt-dlp', args, { timeout:
|
|
464
|
+
const proc = execFile('yt-dlp', args, { timeout: 60000, env: execEnv }, async (err) => {
|
|
337
465
|
try {
|
|
338
466
|
if (err) {
|
|
339
467
|
// yt-dlp not installed, timed out, or failed
|
|
@@ -687,10 +815,35 @@ function selectBestTrack(tracks, preferredLang) {
|
|
|
687
815
|
/**
|
|
688
816
|
* Fetch the caption XML from YouTube's timedtext API.
|
|
689
817
|
* Must use same cookies/UA as the page fetch — URLs are session-locked.
|
|
818
|
+
* Tries simpleFetch first; falls back to native fetch() if BlockedError is thrown
|
|
819
|
+
* (our own challenge detection fires on cloud server IPs).
|
|
690
820
|
*/
|
|
691
821
|
async function fetchCaptionXml(baseUrl, userAgent, headers) {
|
|
692
|
-
|
|
693
|
-
|
|
822
|
+
try {
|
|
823
|
+
const result = await simpleFetch(baseUrl, userAgent, 10000, headers);
|
|
824
|
+
return result.html;
|
|
825
|
+
}
|
|
826
|
+
catch (simpleFetchErr) {
|
|
827
|
+
const errMsg = (simpleFetchErr?.message ?? '').toLowerCase();
|
|
828
|
+
const isBlocked = simpleFetchErr?.constructor?.name === 'BlockedError' ||
|
|
829
|
+
errMsg.includes('blocked') ||
|
|
830
|
+
errMsg.includes('challenge') ||
|
|
831
|
+
errMsg.includes('cloudflare');
|
|
832
|
+
if (!isBlocked)
|
|
833
|
+
throw simpleFetchErr;
|
|
834
|
+
// BlockedError: retry with native fetch
|
|
835
|
+
const fetchHeaders = {};
|
|
836
|
+
if (userAgent)
|
|
837
|
+
fetchHeaders['User-Agent'] = userAgent;
|
|
838
|
+
if (headers)
|
|
839
|
+
Object.assign(fetchHeaders, headers);
|
|
840
|
+
const response = await fetch(baseUrl, {
|
|
841
|
+
headers: fetchHeaders,
|
|
842
|
+
redirect: 'follow',
|
|
843
|
+
signal: AbortSignal.timeout(10000),
|
|
844
|
+
});
|
|
845
|
+
return response.text();
|
|
846
|
+
}
|
|
694
847
|
}
|
|
695
848
|
/**
|
|
696
849
|
* Parse YouTube caption XML into transcript segments.
|
package/dist/server/app.js
CHANGED
|
@@ -104,7 +104,7 @@ export function createApp(config = {}) {
|
|
|
104
104
|
else if (req.query?.render === 'true')
|
|
105
105
|
timeoutMs = 60000; // 1min for rendered fetches
|
|
106
106
|
else if (urlParam.includes('youtube.com') || urlParam.includes('youtu.be'))
|
|
107
|
-
timeoutMs =
|
|
107
|
+
timeoutMs = 90000; // 90s for YouTube (yt-dlp needs time after simpleFetch fails)
|
|
108
108
|
req.setTimeout(timeoutMs);
|
|
109
109
|
res.setTimeout(timeoutMs, () => {
|
|
110
110
|
if (!res.headersSent) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.20.13",
|
|
3
|
+
"version": "0.20.14",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|
|
@@ -112,7 +112,9 @@
|
|
|
112
112
|
"resend": "^6.9.3",
|
|
113
113
|
"turndown": "^7.2.0",
|
|
114
114
|
"turndown-plugin-gfm": "^1.0.2",
|
|
115
|
-
"undici": "^7.2.0"
|
|
115
|
+
"undici": "^7.2.0",
|
|
116
|
+
"youtube-transcript": "^1.2.1",
|
|
117
|
+
"youtube-transcript-plus": "^1.2.0"
|
|
116
118
|
},
|
|
117
119
|
"optionalDependencies": {
|
|
118
120
|
"@sentry/node": "^7.120.4",
|