youtube-transcript-plus 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +153 -7
- package/dist/cache/fs-cache.d.ts +18 -0
- package/dist/cache/in-memory-cache.d.ts +14 -0
- package/dist/errors.d.ts +14 -0
- package/dist/formatters.d.ts +57 -0
- package/dist/index.d.ts +141 -10
- package/dist/types.d.ts +125 -1
- package/dist/utils.d.ts +20 -0
- package/dist/youtube-transcript-plus.cjs +732 -0
- package/dist/youtube-transcript-plus.js +19 -16
- package/dist/youtube-transcript-plus.mjs +716 -0
- package/package.json +26 -15
|
@@ -0,0 +1,716 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
|
|
4
|
+
/******************************************************************************
|
|
5
|
+
Copyright (c) Microsoft Corporation.
|
|
6
|
+
|
|
7
|
+
Permission to use, copy, modify, and/or distribute this software for any
|
|
8
|
+
purpose with or without fee is hereby granted.
|
|
9
|
+
|
|
10
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
|
|
11
|
+
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
|
|
12
|
+
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
|
|
13
|
+
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
|
14
|
+
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
|
15
|
+
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
16
|
+
PERFORMANCE OF THIS SOFTWARE.
|
|
17
|
+
***************************************************************************** */
|
|
18
|
+
/* global Reflect, Promise, SuppressedError, Symbol, Iterator */
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
/**
 * Compiler-emitted helper that drives a generator function as if it were an
 * async function (this has the shape of the TypeScript `__awaiter` helper).
 *
 * @param thisArg - `this` value the generator function is applied with.
 * @param _arguments - Arguments forwarded to the generator function; `[]` when falsy.
 * @param P - Promise constructor to use; falls back to the global `Promise` when falsy.
 * @param generator - Generator function whose yielded values are awaited one by one.
 * @returns A promise resolved with the generator's return value, or rejected
 *          with the first error that escapes the generator.
 */
function __awaiter(thisArg, _arguments, P, generator) {
    // Wrap anything that is not already a P instance so `.then` can be called on it.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the settled value; an error escaping the generator rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Throw the rejection reason into the generator so its own try/catch can observe it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Finish when the generator is done, otherwise wait on the value it yielded.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        // Instantiate the generator (reusing the `generator` binding) and start it.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
}
|
|
30
|
+
|
|
31
|
+
// NOTE(review): this expression statement has no effect — its result is never
// assigned or used. It looks like the leftover of a bundled `SuppressedError`
// fallback whose binding was stripped; confirm before removing it.
typeof SuppressedError === "function" ? SuppressedError : function (error, suppressed, message) {
    var e = new Error(message);
    return e.name = "SuppressedError", e.error = error, e.suppressed = suppressed, e;
};
|
|
35
|
+
|
|
36
|
+
/** `User-Agent` value used when the caller does not provide one. */
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
/**
 * Captures an 11-character video ID (group 1) that follows one of the URL
 * fragments listed in the alternation (`v=`, `/`, `embed/`, `youtu.be/`, …).
 * Case-insensitive. Several alternatives are repeated; the duplicates do not change what matches.
 */
const RE_YOUTUBE = /(?:v=|\/|v\/|embed\/|watch\?.*v=|youtu\.be\/|\/v\/|e\/|watch\?.*vi?=|\/embed\/|\/v\/|vi?\/|watch\?.*vi?=|youtu\.be\/|\/vi?\/|\/e\/)([a-zA-Z0-9_-]{11})/i;
/**
 * Matches one `<text start="…" dur="…">…</text>` element of the transcript XML.
 * Group 1 = `start`, group 2 = `dur`, group 3 = the element's text (no nested tags).
 */
const RE_XML_TRANSCRIPT = /<text start="([^"]*)" dur="([^"]*)">([^<]*)<\/text>/g;
/** Default cache time-to-live. */
const DEFAULT_CACHE_TTL = 3600000; // 1 hour in milliseconds
|
|
40
|
+
|
|
41
|
+
/**
 * Error raised when YouTube is throttling requests coming from the caller's
 * IP address. Carries no extra fields beyond the fixed message and `name`.
 */
class YoutubeTranscriptTooManyRequestError extends Error {
    constructor() {
        const message = 'YouTube is receiving too many requests from your IP address. Please try again later or use a proxy. If the issue persists, consider reducing the frequency of requests.';
        super(message);
        Object.assign(this, { name: 'YoutubeTranscriptTooManyRequestError' });
    }
}
|
|
48
|
+
/**
 * Error raised when the requested video is unavailable or has been removed.
 *
 * The offending ID is exposed on the instance as `videoId`.
 */
class YoutubeTranscriptVideoUnavailableError extends Error {
    /** @param videoId - ID of the video that could not be loaded. */
    constructor(videoId) {
        const message = `The video with ID "${videoId}" is no longer available or has been removed. Please check the video URL or ID and try again.`;
        super(message);
        Object.assign(this, {
            name: 'YoutubeTranscriptVideoUnavailableError',
            videoId,
        });
    }
}
|
|
56
|
+
/**
 * Error raised when transcripts are disabled for a video by its owner.
 *
 * The affected ID is exposed on the instance as `videoId`.
 */
class YoutubeTranscriptDisabledError extends Error {
    /** @param videoId - ID of the video whose transcripts are disabled. */
    constructor(videoId) {
        const message = `Transcripts are disabled for the video with ID "${videoId}". This may be due to the video owner disabling captions or the video not supporting transcripts.`;
        super(message);
        Object.assign(this, {
            name: 'YoutubeTranscriptDisabledError',
            videoId,
        });
    }
}
|
|
64
|
+
/**
 * Error raised when a video has no transcripts available at all.
 *
 * The affected ID is exposed on the instance as `videoId`.
 */
class YoutubeTranscriptNotAvailableError extends Error {
    /** @param videoId - ID of the video that has no transcript. */
    constructor(videoId) {
        const message = `No transcripts are available for the video with ID "${videoId}". This may be because the video does not have captions or the captions are not accessible.`;
        super(message);
        Object.assign(this, {
            name: 'YoutubeTranscriptNotAvailableError',
            videoId,
        });
    }
}
|
|
72
|
+
/**
 * Error raised when a video has transcripts, but none in the requested language.
 *
 * Exposes `videoId`, the requested `lang`, and the `availableLangs` array so
 * callers can pick a fallback.
 */
class YoutubeTranscriptNotAvailableLanguageError extends Error {
    /**
     * @param lang - Language code that was requested.
     * @param availableLangs - Language codes the video does offer (must support `join`).
     * @param videoId - ID of the video that was queried.
     */
    constructor(lang, availableLangs, videoId) {
        const message = `No transcripts are available in "${lang}" for the video with ID "${videoId}". Available languages: ${availableLangs.join(', ')}. Please try a different language.`;
        super(message);
        Object.assign(this, {
            name: 'YoutubeTranscriptNotAvailableLanguageError',
            videoId,
            lang,
            availableLangs,
        });
    }
}
|
|
82
|
+
/**
 * Error raised when the `lang` option does not look like a BCP 47 language code.
 *
 * The rejected value is exposed on the instance as `lang`.
 */
class YoutubeTranscriptInvalidLangError extends Error {
    /** @param lang - The language code that failed validation. */
    constructor(lang) {
        const message = `Invalid language code "${lang}". Please provide a valid BCP 47 language code (e.g., "en", "fr", "pt-BR").`;
        super(message);
        Object.assign(this, {
            name: 'YoutubeTranscriptInvalidLangError',
            lang,
        });
    }
}
|
|
90
|
+
/**
 * Error raised when the supplied video ID or URL cannot be resolved to a
 * video ID. Carries no extra fields beyond the fixed message and `name`.
 */
class YoutubeTranscriptInvalidVideoIdError extends Error {
    constructor() {
        const message = 'Invalid YouTube video ID or URL. Please provide a valid video ID or URL. Example: "dQw4w9WgXcQ" or "https://www.youtube.com/watch?v=dQw4w9WgXcQ".';
        super(message);
        Object.assign(this, { name: 'YoutubeTranscriptInvalidVideoIdError' });
    }
}
|
|
97
|
+
|
|
98
|
+
/** Matches a string that is exactly 11 characters of letters, digits, `_` or `-` (a bare video ID). */
const RE_VIDEO_ID = /^[a-zA-Z0-9_-]{11}$/;
/**
 * BCP 47-like shape check: a 2–3 letter primary subtag followed by any number
 * of `-`-separated alphanumeric subtags of 2–8 characters (e.g. `en`, `pt-BR`).
 */
const RE_BCP47_LANG = /^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$/;
|
|
100
|
+
/**
 * Lookup table from entity reference to the character it stands for.
 *
 * Keys are the complete entity text (`&name;`) so that a match produced by
 * `RE_XML_ENTITY` can be used directly as the lookup key.
 */
const XML_ENTITIES = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&apos;': "'",
};
/** Matches exactly the entity references that `XML_ENTITIES` knows how to decode. */
const RE_XML_ENTITY = /&(?:amp|lt|gt|quot|apos|#39);/g;
/**
 * Replace the supported XML entity references in `text` with their characters.
 *
 * Decoding is a single pass: the output of one replacement is not scanned
 * again, so `&amp;lt;` becomes `&lt;` rather than `<`. Unknown entities are
 * left untouched.
 *
 * @param text - Raw text taken from the transcript XML.
 * @returns The text with `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;` and `&apos;` decoded.
 */
function decodeXmlEntities(text) {
    return text.replace(RE_XML_ENTITY, (entity) => {
        const decoded = XML_ENTITIES[entity];
        // Defensive fallback: keep the original text if the table has no entry.
        return decoded !== null && decoded !== undefined ? decoded : entity;
    });
}
|
|
112
|
+
/**
 * Resolve a user-supplied value to a bare video ID.
 *
 * A value that already matches `RE_VIDEO_ID` is returned unchanged; otherwise
 * the ID is taken from the first capture group of `RE_YOUTUBE`.
 *
 * @param videoId - A video ID or a URL containing one.
 * @returns The extracted video ID.
 * @throws {YoutubeTranscriptInvalidVideoIdError} if neither pattern matches.
 */
function retrieveVideoId(videoId) {
    const isBareId = RE_VIDEO_ID.test(videoId);
    if (isBareId) {
        return videoId;
    }
    const urlMatch = videoId.match(RE_YOUTUBE);
    if (!urlMatch || !urlMatch.length) {
        throw new YoutubeTranscriptInvalidVideoIdError();
    }
    return urlMatch[1];
}
|
|
122
|
+
/**
 * Check a language code against the BCP 47-like pattern `RE_BCP47_LANG`.
 *
 * Returns nothing on success.
 *
 * @param lang - Language code to check.
 * @throws {@link YoutubeTranscriptInvalidLangError} if the language code is invalid.
 */
function validateLang(lang) {
    const isWellFormed = RE_BCP47_LANG.test(lang);
    if (isWellFormed) {
        return;
    }
    throw new YoutubeTranscriptInvalidLangError(lang);
}
|
|
131
|
+
/**
 * Default request implementation built on the global `fetch`.
 *
 * Header precedence (lowest to highest): the `User-Agent` default, an
 * `Accept-Language` header derived from `lang`, then the caller's `headers`.
 * A `body` is only attached for `POST` requests.
 *
 * @param params - Request description: `url`, optional `lang`, `userAgent`,
 *                 `method` (default `'GET'`), `body`, `headers` (default `{}`), `signal`.
 * @returns A promise for the `fetch` response.
 */
async function defaultFetch(params) {
    const { url, lang, userAgent, method = 'GET', body, headers = {}, signal } = params;
    const requestHeaders = { 'User-Agent': userAgent || DEFAULT_USER_AGENT };
    if (lang) {
        requestHeaders['Accept-Language'] = lang;
    }
    // Caller-supplied headers are applied last so they override the defaults.
    Object.assign(requestHeaders, headers);
    const init = { method, headers: requestHeaders, signal };
    if (body && method === 'POST') {
        init.body = body;
    }
    return fetch(url, init);
}
|
|
146
|
+
/**
 * Decide whether an HTTP status code is worth retrying.
 *
 * @param status - HTTP status code.
 * @returns `true` for 429 and for any code in the 500–599 range, otherwise `false`.
 */
function isRetryableStatus(status) {
    if (status === 429) {
        return true;
    }
    return status >= 500 && status <= 599;
}
|
|
150
|
+
/**
 * Wait for the given number of milliseconds, aborting early if the signal fires.
 *
 * The `abort` listener is detached once the timer completes, so sleeping
 * repeatedly against the same long-lived signal does not accumulate listeners.
 *
 * @param ms - Delay in milliseconds.
 * @param signal - Optional AbortSignal; if it is already aborted the promise
 *                 rejects immediately, and if it aborts during the wait the
 *                 timer is cleared and the promise rejects with `signal.reason`.
 * @returns A promise that resolves (with no value) after `ms` milliseconds.
 */
function sleep(ms, signal) {
    return new Promise((resolve, reject) => {
        // Throwing inside the executor rejects the promise with the abort reason.
        signal === null || signal === void 0 ? void 0 : signal.throwIfAborted();
        let timer;
        const onAbort = () => {
            clearTimeout(timer);
            reject(signal.reason);
        };
        timer = setTimeout(() => {
            if (signal) {
                // Detach the listener so it does not outlive this sleep.
                signal.removeEventListener('abort', onAbort);
            }
            resolve();
        }, ms);
        if (signal) {
            signal.addEventListener('abort', onAbort, { once: true });
        }
    });
}
|
|
166
|
+
/**
 * Run a fetch call, retrying with exponential backoff on retryable responses.
 *
 * A response is retried when its status is 429 (Too Many Requests) or 5xx
 * (Server Errors). Any other response is returned immediately, as is the
 * response of the final attempt, whatever its status.
 *
 * @param fetchFn - Function that performs the fetch call.
 * @param retries - Maximum number of retry attempts (0 = no retries).
 * @param retryDelay - Base delay in milliseconds; attempt `n` waits `retryDelay * 2^n`.
 * @param signal - Optional AbortSignal, checked before each attempt and during each wait.
 * @returns The fetch Response.
 */
async function fetchWithRetry(fetchFn, retries, retryDelay, signal) {
    let attempt = 0;
    while (attempt <= retries) {
        if (signal !== null && signal !== undefined) {
            signal.throwIfAborted();
        }
        const response = await fetchFn();
        const shouldRetry = isRetryableStatus(response.status) && attempt !== retries;
        if (!shouldRetry) {
            return response;
        }
        // Exponential backoff: the wait doubles with every attempt.
        const backoff = retryDelay * 2 ** attempt;
        await sleep(backoff, signal);
        attempt += 1;
    }
    // Only reached when the loop body never runs (e.g. a negative `retries`).
    throw new Error('Unexpected: retry loop exited without returning');
}
|
|
194
|
+
|
|
195
|
+
/**
 * Turn a cache key into a string usable as a file name.
 *
 * @param key - Cache key.
 * @returns The key with every character other than letters, digits, `_` and `-` replaced by `_`.
 */
function sanitizeKey(key) {
    const unsafeChars = /[^a-zA-Z0-9_-]/g;
    return key.replace(unsafeChars, '_');
}
|
|
198
|
+
/**
 * Cache backed by the file system.
 *
 * Every entry lives in its own JSON file (`{ value, expires }`) inside the
 * cache directory; the file name is the sanitized cache key. An entry found
 * to be expired on read is deleted.
 *
 * @example
 * ```typescript
 * import { fetchTranscript, FsCache } from 'youtube-transcript-plus';
 * const transcript = await fetchTranscript('dQw4w9WgXcQ', {
 *   cache: new FsCache('./my-cache-dir', 86400000), // 1 day TTL
 * });
 * ```
 */
class FsCache {
    /**
     * @param cacheDir - Directory to store cache files. Created (recursively) if it doesn't exist.
     * @param defaultTTL - Default time-to-live in milliseconds. Defaults to 1 hour.
     */
    constructor(cacheDir = './cache', defaultTTL = DEFAULT_CACHE_TTL) {
        this.cacheDir = cacheDir;
        this.defaultTTL = defaultTTL;
        // Directory creation starts now; get/set wait on this promise before touching files.
        this.ready = fs.mkdir(cacheDir, { recursive: true }).then(() => { });
    }
    /**
     * Read an entry.
     *
     * @param key - Cache key.
     * @returns The stored value, or `null` when the entry is missing, expired or unreadable.
     */
    async get(key) {
        await this.ready;
        const entryPath = path.join(this.cacheDir, sanitizeKey(key));
        try {
            const raw = await fs.readFile(entryPath, 'utf-8');
            const { value: storedValue, expires: expiresAt } = JSON.parse(raw);
            if (expiresAt > Date.now()) {
                return storedValue;
            }
            await fs.unlink(entryPath);
        }
        catch (_error) {
            // Best effort: a missing, corrupt or undeletable file counts as a cache miss.
        }
        return null;
    }
    /**
     * Write an entry.
     *
     * @param key - Cache key.
     * @param value - Value to store; must be JSON-serializable.
     * @param ttl - Time-to-live in milliseconds; the instance default is used when null/undefined.
     */
    async set(key, value, ttl) {
        await this.ready;
        const entryPath = path.join(this.cacheDir, sanitizeKey(key));
        const lifetime = ttl !== null && ttl !== undefined ? ttl : this.defaultTTL;
        const payload = JSON.stringify({ value, expires: Date.now() + lifetime });
        await fs.writeFile(entryPath, payload, 'utf-8');
    }
}
|
|
247
|
+
|
|
248
|
+
/**
 * Cache that keeps its entries in a `Map` held in memory.
 *
 * An entry that is missing or past its expiry on read is removed from the map.
 *
 * @example
 * ```typescript
 * import { fetchTranscript, InMemoryCache } from 'youtube-transcript-plus';
 * const transcript = await fetchTranscript('dQw4w9WgXcQ', {
 *   cache: new InMemoryCache(1800000), // 30 minutes TTL
 * });
 * ```
 */
class InMemoryCache {
    /** @param defaultTTL - Default time-to-live in milliseconds. Defaults to 1 hour. */
    constructor(defaultTTL = DEFAULT_CACHE_TTL) {
        this.cache = new Map();
        this.defaultTTL = defaultTTL;
    }
    /**
     * Read an entry.
     *
     * @param key - Cache key.
     * @returns The stored value, or `null` when there is no unexpired entry.
     */
    async get(key) {
        const entry = this.cache.get(key);
        const isFresh = Boolean(entry) && entry.expires > Date.now();
        if (!isFresh) {
            // Drop the stale entry (a no-op when the key was never stored).
            this.cache.delete(key);
            return null;
        }
        return entry.value;
    }
    /**
     * Store an entry.
     *
     * @param key - Cache key.
     * @param value - Value to store.
     * @param ttl - Time-to-live in milliseconds; the instance default is used when null/undefined.
     */
    async set(key, value, ttl) {
        const lifetime = ttl !== null && ttl !== undefined ? ttl : this.defaultTTL;
        this.cache.set(key, { value, expires: Date.now() + lifetime });
    }
}
|
|
284
|
+
|
|
285
|
+
/**
 * Format seconds as an SRT timestamp: `HH:MM:SS,mmm`
 * SRT uses comma as the decimal separator per specification.
 *
 * The value is rounded to whole milliseconds once, up front, and every field
 * is derived from that total. This makes a fraction such as `.9996` carry
 * into the seconds field instead of producing a 4-digit `1000` millisecond field.
 *
 * @param seconds - Time in seconds (may be fractional).
 * @returns The timestamp string; hours are padded to at least two digits.
 */
function formatSrtTimestamp(seconds) {
    const totalMs = Math.round(seconds * 1000);
    const ms = totalMs % 1000;
    const totalSeconds = Math.floor(totalMs / 1000);
    const s = totalSeconds % 60;
    const m = Math.floor(totalSeconds / 60) % 60;
    const h = Math.floor(totalSeconds / 3600);
    const hh = String(h).padStart(2, '0');
    const mm = String(m).padStart(2, '0');
    const ss = String(s).padStart(2, '0');
    const mmm = String(ms).padStart(3, '0');
    return `${hh}:${mm}:${ss},${mmm}`;
}
|
|
302
|
+
/**
 * Format seconds as a VTT timestamp: `HH:MM:SS.mmm`
 * VTT uses period as the decimal separator per specification.
 *
 * The value is rounded to whole milliseconds once, up front, and every field
 * is derived from that total. This makes a fraction such as `.9996` carry
 * into the seconds field instead of producing a 4-digit `1000` millisecond field.
 *
 * @param seconds - Time in seconds (may be fractional).
 * @returns The timestamp string; hours are padded to at least two digits.
 */
function formatVttTimestamp(seconds) {
    const totalMs = Math.round(seconds * 1000);
    const ms = totalMs % 1000;
    const totalSeconds = Math.floor(totalMs / 1000);
    const s = totalSeconds % 60;
    const m = Math.floor(totalSeconds / 60) % 60;
    const h = Math.floor(totalSeconds / 3600);
    const hh = String(h).padStart(2, '0');
    const mm = String(m).padStart(2, '0');
    const ss = String(s).padStart(2, '0');
    const mmm = String(ms).padStart(3, '0');
    return `${hh}:${mm}:${ss}.${mmm}`;
}
|
|
319
|
+
/**
 * Render transcript segments as a SubRip (SRT) document.
 *
 * Each segment becomes one cue: a 1-based sequence number, a
 * `start --> end` line (end = offset + duration), then the segment text.
 * Cues are separated by a blank line.
 *
 * @param segments - Array of transcript segments from {@link fetchTranscript}.
 * @returns A string in SRT format with sequence numbers and `HH:MM:SS,mmm` timestamps.
 *
 * @example
 * ```typescript
 * import { fetchTranscript, toSRT } from 'youtube-transcript-plus';
 * const transcript = await fetchTranscript('dQw4w9WgXcQ');
 * const srt = toSRT(transcript);
 *
 * // With videoDetails enabled, use result.segments:
 * const result = await fetchTranscript('dQw4w9WgXcQ', { videoDetails: true });
 * const srt2 = toSRT(result.segments);
 * ```
 */
function toSRT(segments) {
    const renderCue = (segment, index) => {
        const startStamp = formatSrtTimestamp(segment.offset);
        const endStamp = formatSrtTimestamp(segment.offset + segment.duration);
        const sequence = index + 1;
        return `${sequence}\n${startStamp} --> ${endStamp}\n${segment.text}`;
    };
    const cues = segments.map(renderCue);
    return cues.join('\n\n');
}
|
|
345
|
+
/**
 * Render transcript segments as a WebVTT (VTT) document.
 *
 * The output starts with the `WEBVTT` header and a blank line, followed by
 * one cue per segment: a `start --> end` line (end = offset + duration) and
 * the segment text. Cues are separated by a blank line.
 *
 * @param segments - Array of transcript segments from {@link fetchTranscript}.
 * @returns A string in VTT format with `WEBVTT` header and `HH:MM:SS.mmm` timestamps.
 *
 * @example
 * ```typescript
 * import { fetchTranscript, toVTT } from 'youtube-transcript-plus';
 * const transcript = await fetchTranscript('dQw4w9WgXcQ');
 * const vtt = toVTT(transcript);
 *
 * // With videoDetails enabled, use result.segments:
 * const result = await fetchTranscript('dQw4w9WgXcQ', { videoDetails: true });
 * const vtt2 = toVTT(result.segments);
 * ```
 */
function toVTT(segments) {
    const renderCue = (segment) => {
        const startStamp = formatVttTimestamp(segment.offset);
        const endStamp = formatVttTimestamp(segment.offset + segment.duration);
        return `${startStamp} --> ${endStamp}\n${segment.text}`;
    };
    const body = segments.map(renderCue).join('\n\n');
    return `WEBVTT\n\n${body}`;
}
|
|
372
|
+
/**
 * Flatten transcript segments into plain text by joining their `text` fields.
 *
 * @param segments - Array of transcript segments from {@link fetchTranscript}.
 * @param separator - String placed between segments. Defaults to `'\n'`.
 * @returns A plain text string with segments joined by the separator.
 *
 * @example
 * ```typescript
 * import { fetchTranscript, toPlainText } from 'youtube-transcript-plus';
 * const transcript = await fetchTranscript('dQw4w9WgXcQ');
 * const text = toPlainText(transcript);
 * const paragraph = toPlainText(transcript, ' ');
 *
 * // With videoDetails enabled, use result.segments:
 * const result = await fetchTranscript('dQw4w9WgXcQ', { videoDetails: true });
 * const text2 = toPlainText(result.segments);
 * ```
 */
function toPlainText(segments, separator = '\n') {
    const lines = segments.map(({ text }) => text);
    return lines.join(separator);
}
|
|
394
|
+
|
|
395
|
+
/**
|
|
396
|
+
* Fetches YouTube video transcripts and caption metadata using the Innertube API.
|
|
397
|
+
*
|
|
398
|
+
* Can be used as an instance (with shared config) or via static/convenience methods.
|
|
399
|
+
*
|
|
400
|
+
* @example
|
|
401
|
+
* ```typescript
|
|
402
|
+
* // Instance usage with shared config
|
|
403
|
+
* const yt = new YoutubeTranscript({ lang: 'en' });
|
|
404
|
+
* const transcript = await yt.fetchTranscript('dQw4w9WgXcQ');
|
|
405
|
+
* const languages = await yt.listLanguages('dQw4w9WgXcQ');
|
|
406
|
+
*
|
|
407
|
+
* // Static method
|
|
408
|
+
* const transcript = await YoutubeTranscript.fetchTranscript('dQw4w9WgXcQ', { lang: 'en' });
|
|
409
|
+
*
|
|
410
|
+
* // Opt-in to video details
|
|
411
|
+
* const { videoDetails, segments } = await YoutubeTranscript.fetchTranscript('dQw4w9WgXcQ', {
|
|
412
|
+
* videoDetails: true,
|
|
413
|
+
* });
|
|
414
|
+
*
|
|
415
|
+
* // Convenience export
|
|
416
|
+
* const transcript = await fetchTranscript('dQw4w9WgXcQ');
|
|
417
|
+
* const languages = await listLanguages('dQw4w9WgXcQ');
|
|
418
|
+
* ```
|
|
419
|
+
*/
|
|
420
|
+
class YoutubeTranscript {
|
|
421
|
+
constructor(config) {
|
|
422
|
+
this.config = config;
|
|
423
|
+
}
|
|
424
|
+
/**
|
|
425
|
+
* Fetch caption tracks and the player response from the Innertube player API.
|
|
426
|
+
* Shared logic used by both fetchTranscript and listLanguages.
|
|
427
|
+
*/
|
|
428
|
+
_fetchCaptionTracks(identifier, lang) {
|
|
429
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
430
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l;
|
|
431
|
+
const userAgent = (_b = (_a = this.config) === null || _a === void 0 ? void 0 : _a.userAgent) !== null && _b !== void 0 ? _b : DEFAULT_USER_AGENT;
|
|
432
|
+
const protocol = ((_c = this.config) === null || _c === void 0 ? void 0 : _c.disableHttps) ? 'http' : 'https';
|
|
433
|
+
const retries = (_e = (_d = this.config) === null || _d === void 0 ? void 0 : _d.retries) !== null && _e !== void 0 ? _e : 0;
|
|
434
|
+
const retryDelay = (_g = (_f = this.config) === null || _f === void 0 ? void 0 : _f.retryDelay) !== null && _g !== void 0 ? _g : 1000;
|
|
435
|
+
const signal = (_h = this.config) === null || _h === void 0 ? void 0 : _h.signal;
|
|
436
|
+
// 1) Fetch the watch page to extract an Innertube API key
|
|
437
|
+
const watchUrl = `${protocol}://www.youtube.com/watch?v=${identifier}`;
|
|
438
|
+
const watchFetchParams = { url: watchUrl, lang, userAgent, signal };
|
|
439
|
+
const videoPageResponse = yield fetchWithRetry(() => {
|
|
440
|
+
var _a;
|
|
441
|
+
return ((_a = this.config) === null || _a === void 0 ? void 0 : _a.videoFetch)
|
|
442
|
+
? this.config.videoFetch(watchFetchParams)
|
|
443
|
+
: defaultFetch(watchFetchParams);
|
|
444
|
+
}, retries, retryDelay, signal);
|
|
445
|
+
if (!videoPageResponse.ok) {
|
|
446
|
+
throw new YoutubeTranscriptVideoUnavailableError(identifier);
|
|
447
|
+
}
|
|
448
|
+
const videoPageBody = yield videoPageResponse.text();
|
|
449
|
+
// Basic bot/recaptcha detection preserves old error behavior
|
|
450
|
+
if (videoPageBody.includes('class="g-recaptcha"')) {
|
|
451
|
+
throw new YoutubeTranscriptTooManyRequestError();
|
|
452
|
+
}
|
|
453
|
+
// 2) Extract Innertube API key from the page
|
|
454
|
+
const apiKeyMatch = videoPageBody.match(/"INNERTUBE_API_KEY":"([^"]+)"/) ||
|
|
455
|
+
videoPageBody.match(/INNERTUBE_API_KEY\\":\\"([^\\"]+)\\"/);
|
|
456
|
+
if (!apiKeyMatch) {
|
|
457
|
+
throw new YoutubeTranscriptNotAvailableError(identifier);
|
|
458
|
+
}
|
|
459
|
+
const apiKey = apiKeyMatch[1];
|
|
460
|
+
// 3) Call Innertube player as ANDROID client to retrieve captionTracks
|
|
461
|
+
const playerEndpoint = `${protocol}://www.youtube.com/youtubei/v1/player?key=${apiKey}`;
|
|
462
|
+
const playerBody = {
|
|
463
|
+
context: {
|
|
464
|
+
client: {
|
|
465
|
+
clientName: 'ANDROID',
|
|
466
|
+
clientVersion: '20.10.38',
|
|
467
|
+
},
|
|
468
|
+
},
|
|
469
|
+
videoId: identifier,
|
|
470
|
+
};
|
|
471
|
+
const playerFetchParams = {
|
|
472
|
+
url: playerEndpoint,
|
|
473
|
+
method: 'POST',
|
|
474
|
+
lang,
|
|
475
|
+
userAgent,
|
|
476
|
+
headers: { 'Content-Type': 'application/json' },
|
|
477
|
+
body: JSON.stringify(playerBody),
|
|
478
|
+
signal,
|
|
479
|
+
};
|
|
480
|
+
const playerRes = yield fetchWithRetry(() => {
|
|
481
|
+
var _a;
|
|
482
|
+
return ((_a = this.config) === null || _a === void 0 ? void 0 : _a.playerFetch)
|
|
483
|
+
? this.config.playerFetch(playerFetchParams)
|
|
484
|
+
: defaultFetch(playerFetchParams);
|
|
485
|
+
}, retries, retryDelay, signal);
|
|
486
|
+
if (!playerRes.ok) {
|
|
487
|
+
throw new YoutubeTranscriptVideoUnavailableError(identifier);
|
|
488
|
+
}
|
|
489
|
+
const playerJson = (yield playerRes.json());
|
|
490
|
+
const tracklist = (_k = (_j = playerJson.captions) === null || _j === void 0 ? void 0 : _j.playerCaptionsTracklistRenderer) !== null && _k !== void 0 ? _k : playerJson.playerCaptionsTracklistRenderer;
|
|
491
|
+
const tracks = tracklist === null || tracklist === void 0 ? void 0 : tracklist.captionTracks;
|
|
492
|
+
const isPlayableOk = ((_l = playerJson.playabilityStatus) === null || _l === void 0 ? void 0 : _l.status) === 'OK';
|
|
493
|
+
// If `captions` is entirely missing, treat as "not available"
|
|
494
|
+
if (!playerJson.captions || !tracklist) {
|
|
495
|
+
// If video is playable but captions aren't provided, treat as "disabled"
|
|
496
|
+
if (isPlayableOk) {
|
|
497
|
+
throw new YoutubeTranscriptDisabledError(identifier);
|
|
498
|
+
}
|
|
499
|
+
// Otherwise we can't assert they're disabled; treat as "not available"
|
|
500
|
+
throw new YoutubeTranscriptNotAvailableError(identifier);
|
|
501
|
+
}
|
|
502
|
+
// If `captions` exists but there are zero tracks, treat as "disabled"
|
|
503
|
+
if (!Array.isArray(tracks) || tracks.length === 0) {
|
|
504
|
+
throw new YoutubeTranscriptDisabledError(identifier);
|
|
505
|
+
}
|
|
506
|
+
return { tracks, playerJson };
|
|
507
|
+
});
|
|
508
|
+
}
|
|
509
|
+
/**
|
|
510
|
+
* Extract VideoDetails from the Innertube player response.
|
|
511
|
+
*/
|
|
512
|
+
_extractVideoDetails(playerJson, identifier) {
|
|
513
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l;
|
|
514
|
+
const raw = playerJson.videoDetails;
|
|
515
|
+
return {
|
|
516
|
+
videoId: (_a = raw === null || raw === void 0 ? void 0 : raw.videoId) !== null && _a !== void 0 ? _a : identifier,
|
|
517
|
+
title: (_b = raw === null || raw === void 0 ? void 0 : raw.title) !== null && _b !== void 0 ? _b : '',
|
|
518
|
+
author: (_c = raw === null || raw === void 0 ? void 0 : raw.author) !== null && _c !== void 0 ? _c : '',
|
|
519
|
+
channelId: (_d = raw === null || raw === void 0 ? void 0 : raw.channelId) !== null && _d !== void 0 ? _d : '',
|
|
520
|
+
lengthSeconds: parseInt((_e = raw === null || raw === void 0 ? void 0 : raw.lengthSeconds) !== null && _e !== void 0 ? _e : '0', 10),
|
|
521
|
+
viewCount: parseInt((_f = raw === null || raw === void 0 ? void 0 : raw.viewCount) !== null && _f !== void 0 ? _f : '0', 10),
|
|
522
|
+
description: (_g = raw === null || raw === void 0 ? void 0 : raw.shortDescription) !== null && _g !== void 0 ? _g : '',
|
|
523
|
+
keywords: (_h = raw === null || raw === void 0 ? void 0 : raw.keywords) !== null && _h !== void 0 ? _h : [],
|
|
524
|
+
thumbnails: (_k = (_j = raw === null || raw === void 0 ? void 0 : raw.thumbnail) === null || _j === void 0 ? void 0 : _j.thumbnails) !== null && _k !== void 0 ? _k : [],
|
|
525
|
+
isLiveContent: (_l = raw === null || raw === void 0 ? void 0 : raw.isLiveContent) !== null && _l !== void 0 ? _l : false,
|
|
526
|
+
};
|
|
527
|
+
}
|
|
528
|
+
/**
|
|
529
|
+
* Fetch the transcript for a YouTube video.
|
|
530
|
+
*
|
|
531
|
+
* When `videoDetails` is set to `true` in the config, returns a {@link TranscriptResult}
|
|
532
|
+
* containing both video metadata and transcript segments. Otherwise returns an array of
|
|
533
|
+
* {@link TranscriptSegment} objects.
|
|
534
|
+
*
|
|
535
|
+
* **Note:** The instance method returns a union type because `videoDetails` is set at
|
|
536
|
+
* construction time. For automatic type narrowing, use the static method or the
|
|
537
|
+
* `fetchTranscript` convenience export instead.
|
|
538
|
+
*
|
|
539
|
+
* @param videoId - A YouTube video ID (11 characters) or full YouTube URL.
|
|
540
|
+
* @returns An array of transcript segments, or a TranscriptResult if `videoDetails` is enabled.
|
|
541
|
+
* @throws {@link YoutubeTranscriptInvalidVideoIdError} if the video ID/URL is invalid.
|
|
542
|
+
* @throws {@link YoutubeTranscriptVideoUnavailableError} if the video is unavailable.
|
|
543
|
+
* @throws {@link YoutubeTranscriptDisabledError} if transcripts are disabled.
|
|
544
|
+
* @throws {@link YoutubeTranscriptNotAvailableError} if no transcript is available.
|
|
545
|
+
* @throws {@link YoutubeTranscriptNotAvailableLanguageError} if the requested language is unavailable.
|
|
546
|
+
* @throws {@link YoutubeTranscriptTooManyRequestError} if rate-limited by YouTube.
|
|
547
|
+
*/
|
|
548
|
+
fetchTranscript(videoId) {
|
|
549
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
550
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o;
|
|
551
|
+
const identifier = retrieveVideoId(videoId);
|
|
552
|
+
const lang = (_a = this.config) === null || _a === void 0 ? void 0 : _a.lang;
|
|
553
|
+
if (lang) {
|
|
554
|
+
validateLang(lang);
|
|
555
|
+
}
|
|
556
|
+
const userAgent = (_c = (_b = this.config) === null || _b === void 0 ? void 0 : _b.userAgent) !== null && _c !== void 0 ? _c : DEFAULT_USER_AGENT;
|
|
557
|
+
const includeDetails = ((_d = this.config) === null || _d === void 0 ? void 0 : _d.videoDetails) === true;
|
|
558
|
+
// Cache lookup (if provided)
|
|
559
|
+
const cache = (_e = this.config) === null || _e === void 0 ? void 0 : _e.cache;
|
|
560
|
+
const cacheTTL = (_f = this.config) === null || _f === void 0 ? void 0 : _f.cacheTTL;
|
|
561
|
+
const cacheKey = includeDetails
|
|
562
|
+
? `yt:transcript+details:${identifier}:${lang !== null && lang !== void 0 ? lang : ''}`
|
|
563
|
+
: `yt:transcript:${identifier}:${lang !== null && lang !== void 0 ? lang : ''}`;
|
|
564
|
+
if (cache) {
|
|
565
|
+
const cached = yield cache.get(cacheKey);
|
|
566
|
+
if (cached) {
|
|
567
|
+
try {
|
|
568
|
+
return JSON.parse(cached);
|
|
569
|
+
}
|
|
570
|
+
catch (_p) {
|
|
571
|
+
// ignore parse errors and continue
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
const { tracks, playerJson } = yield this._fetchCaptionTracks(identifier, lang);
|
|
576
|
+
// Respect requested language or fallback to first track
|
|
577
|
+
const selectedTrack = lang
|
|
578
|
+
? tracks.find((t) => t.languageCode === lang)
|
|
579
|
+
: tracks[0];
|
|
580
|
+
if (!selectedTrack) {
|
|
581
|
+
const available = tracks.map((t) => t.languageCode).filter(Boolean);
|
|
582
|
+
throw new YoutubeTranscriptNotAvailableLanguageError(lang, available, identifier);
|
|
583
|
+
}
|
|
584
|
+
// Build transcript URL; prefer XML by stripping fmt if present
|
|
585
|
+
const transcriptBaseURL = (_g = selectedTrack.baseUrl) !== null && _g !== void 0 ? _g : selectedTrack.url;
|
|
586
|
+
if (!transcriptBaseURL) {
|
|
587
|
+
throw new YoutubeTranscriptNotAvailableError(identifier);
|
|
588
|
+
}
|
|
589
|
+
let transcriptURL = transcriptBaseURL;
|
|
590
|
+
transcriptURL = transcriptURL.replace(/&fmt=[^&]+/, '');
|
|
591
|
+
if ((_h = this.config) === null || _h === void 0 ? void 0 : _h.disableHttps) {
|
|
592
|
+
transcriptURL = transcriptURL.replace(/^https:\/\//, 'http://');
|
|
593
|
+
}
|
|
594
|
+
// Fetch transcript XML using the same hook surface as before
|
|
595
|
+
const retries = (_k = (_j = this.config) === null || _j === void 0 ? void 0 : _j.retries) !== null && _k !== void 0 ? _k : 0;
|
|
596
|
+
const retryDelay = (_m = (_l = this.config) === null || _l === void 0 ? void 0 : _l.retryDelay) !== null && _m !== void 0 ? _m : 1000;
|
|
597
|
+
const signal = (_o = this.config) === null || _o === void 0 ? void 0 : _o.signal;
|
|
598
|
+
const transcriptFetchParams = { url: transcriptURL, lang, userAgent, signal };
|
|
599
|
+
const transcriptResponse = yield fetchWithRetry(() => {
|
|
600
|
+
var _a;
|
|
601
|
+
return ((_a = this.config) === null || _a === void 0 ? void 0 : _a.transcriptFetch)
|
|
602
|
+
? this.config.transcriptFetch(transcriptFetchParams)
|
|
603
|
+
: defaultFetch(transcriptFetchParams);
|
|
604
|
+
}, retries, retryDelay, signal);
|
|
605
|
+
if (!transcriptResponse.ok) {
|
|
606
|
+
// Preserve legacy behavior
|
|
607
|
+
if (transcriptResponse.status === 429) {
|
|
608
|
+
throw new YoutubeTranscriptTooManyRequestError();
|
|
609
|
+
}
|
|
610
|
+
throw new YoutubeTranscriptNotAvailableError(identifier);
|
|
611
|
+
}
|
|
612
|
+
const transcriptBody = yield transcriptResponse.text();
|
|
613
|
+
// Parse XML into TranscriptSegment objects
|
|
614
|
+
const results = [...transcriptBody.matchAll(RE_XML_TRANSCRIPT)];
|
|
615
|
+
const segments = results.map((m) => ({
|
|
616
|
+
text: decodeXmlEntities(m[3]),
|
|
617
|
+
duration: parseFloat(m[2]),
|
|
618
|
+
offset: parseFloat(m[1]),
|
|
619
|
+
lang: lang !== null && lang !== void 0 ? lang : selectedTrack.languageCode,
|
|
620
|
+
}));
|
|
621
|
+
if (segments.length === 0) {
|
|
622
|
+
throw new YoutubeTranscriptNotAvailableError(identifier);
|
|
623
|
+
}
|
|
624
|
+
// Build the result based on whether videoDetails was requested
|
|
625
|
+
const result = includeDetails
|
|
626
|
+
? { videoDetails: this._extractVideoDetails(playerJson, identifier), segments }
|
|
627
|
+
: segments;
|
|
628
|
+
// Cache store
|
|
629
|
+
if (cache) {
|
|
630
|
+
try {
|
|
631
|
+
yield cache.set(cacheKey, JSON.stringify(result), cacheTTL);
|
|
632
|
+
}
|
|
633
|
+
catch (_q) {
|
|
634
|
+
// non-fatal
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
return result;
|
|
638
|
+
});
|
|
639
|
+
}
|
|
640
|
+
/**
|
|
641
|
+
* List available caption languages for a YouTube video.
|
|
642
|
+
*
|
|
643
|
+
* Queries the Innertube player API to discover what caption tracks exist,
|
|
644
|
+
* without downloading any transcript data.
|
|
645
|
+
*
|
|
646
|
+
* @param videoId - A YouTube video ID (11 characters) or full YouTube URL.
|
|
647
|
+
* @returns An array of available caption track info objects.
|
|
648
|
+
* @throws {@link YoutubeTranscriptInvalidVideoIdError} if the video ID/URL is invalid.
|
|
649
|
+
* @throws {@link YoutubeTranscriptVideoUnavailableError} if the video is unavailable.
|
|
650
|
+
* @throws {@link YoutubeTranscriptDisabledError} if transcripts are disabled.
|
|
651
|
+
* @throws {@link YoutubeTranscriptNotAvailableError} if no captions are available.
|
|
652
|
+
* @throws {@link YoutubeTranscriptTooManyRequestError} if rate-limited by YouTube.
|
|
653
|
+
*
|
|
654
|
+
* @example
|
|
655
|
+
* ```typescript
|
|
656
|
+
* const yt = new YoutubeTranscript();
|
|
657
|
+
* const languages = await yt.listLanguages('dQw4w9WgXcQ');
|
|
658
|
+
* // [
|
|
659
|
+
* // { languageCode: 'en', languageName: 'English', isAutoGenerated: false },
|
|
660
|
+
* // { languageCode: 'es', languageName: 'Spanish (auto-generated)', isAutoGenerated: true },
|
|
661
|
+
* // ]
|
|
662
|
+
* ```
|
|
663
|
+
*/
|
|
664
|
+
listLanguages(videoId) {
|
|
665
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
666
|
+
const identifier = retrieveVideoId(videoId);
|
|
667
|
+
const { tracks } = yield this._fetchCaptionTracks(identifier);
|
|
668
|
+
return tracks.map((track) => {
|
|
669
|
+
var _a, _b;
|
|
670
|
+
return ({
|
|
671
|
+
languageCode: track.languageCode,
|
|
672
|
+
languageName: (_b = (_a = track.name) === null || _a === void 0 ? void 0 : _a.simpleText) !== null && _b !== void 0 ? _b : track.languageCode,
|
|
673
|
+
isAutoGenerated: track.kind === 'asr',
|
|
674
|
+
});
|
|
675
|
+
});
|
|
676
|
+
});
|
|
677
|
+
}
|
|
678
|
+
static fetchTranscript(videoId, config) {
|
|
679
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
680
|
+
const instance = new YoutubeTranscript(config);
|
|
681
|
+
return instance.fetchTranscript(videoId);
|
|
682
|
+
});
|
|
683
|
+
}
|
|
684
|
+
/**
|
|
685
|
+
* Static convenience method to list available caption languages without creating an instance.
|
|
686
|
+
*
|
|
687
|
+
* @param videoId - A YouTube video ID (11 characters) or full YouTube URL.
|
|
688
|
+
* @param config - Optional configuration options.
|
|
689
|
+
* @returns An array of available caption track info objects.
|
|
690
|
+
*/
|
|
691
|
+
static listLanguages(videoId, config) {
|
|
692
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
693
|
+
const instance = new YoutubeTranscript(config);
|
|
694
|
+
return instance.listLanguages(videoId);
|
|
695
|
+
});
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
/**
 * Convenience function to fetch a transcript for a YouTube video.
 *
 * Forwards both arguments to the static `YoutubeTranscript.fetchTranscript`
 * and hands back whatever that call returns.
 *
 * @param videoId - Passed through unchanged to `YoutubeTranscript.fetchTranscript`.
 * @param config - Optional configuration options, passed through unchanged.
 * @returns The value returned by `YoutubeTranscript.fetchTranscript`.
 */
function fetchTranscript(videoId, config) {
    const pendingTranscript = YoutubeTranscript.fetchTranscript(videoId, config);
    return pendingTranscript;
}
|
|
701
|
+
/**
 * Convenience function to list available caption languages for a YouTube video.
 *
 * Delegates to the static `YoutubeTranscript.listLanguages`. It is declared as
 * a wrapper function (matching the module-level `fetchTranscript`) rather than
 * a detached reference to the static method, so the call is always made with
 * `YoutubeTranscript` as the receiver and is resolved at call time.
 *
 * @param videoId - A YouTube video ID (11 characters) or full YouTube URL.
 * @param config - Optional configuration options.
 * @returns An array of available caption track info objects (as a promise).
 *
 * @example
 * ```typescript
 * import { listLanguages } from 'youtube-transcript-plus';
 * const languages = await listLanguages('dQw4w9WgXcQ');
 * ```
 */
function listLanguages(videoId, config) {
    return YoutubeTranscript.listLanguages(videoId, config);
}
|
|
715
|
+
|
|
716
|
+
export { FsCache, InMemoryCache, YoutubeTranscript, YoutubeTranscriptDisabledError, YoutubeTranscriptInvalidLangError, YoutubeTranscriptInvalidVideoIdError, YoutubeTranscriptNotAvailableError, YoutubeTranscriptNotAvailableLanguageError, YoutubeTranscriptTooManyRequestError, YoutubeTranscriptVideoUnavailableError, fetchTranscript, listLanguages, toPlainText, toSRT, toVTT };
|