@hallelx/youtube-transcript 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,25 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 hallelx2
4
+
5
+ This package is a TypeScript port of the Python library `youtube-transcript-api`
6
+ by Jonas Depoix (jdepoix), which is also released under the MIT license.
7
+ Original project: https://github.com/jdepoix/youtube-transcript-api
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,270 @@
1
+ # @hallelx/youtube-transcript
2
+
3
+ Fetch transcripts and subtitles from YouTube videos. Works with both manually
4
+ created captions and auto-generated transcripts. Supports translation and
5
+ multiple output formats (JSON, text, SRT, WebVTT, pretty).
6
+
7
+ This is a faithful TypeScript port of the excellent Python library
8
+ [`youtube-transcript-api`](https://github.com/jdepoix/youtube-transcript-api)
9
+ by [jdepoix](https://github.com/jdepoix). It uses the same internal
10
+ `youtubei/v1/player` endpoint, so it does **not** scrape the YouTube web page
11
+ DOM and is much more resilient than HTML-scraping alternatives.
12
+
13
+ Runs on **Node.js (>=18)**, **Bun**, and **Deno** (on Deno, proxy support
14
+ requires a custom `fetchFn`). Zero runtime dependencies in the common (non-proxy) path; proxy use on Node.js needs the optional `undici` peer dependency.
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ npm install @hallelx/youtube-transcript
20
+ # or
21
+ bun add @hallelx/youtube-transcript
22
+ # or
23
+ pnpm add @hallelx/youtube-transcript
24
+ ```
25
+
26
+ ## Quick start
27
+
28
+ ```ts
29
+ import { YouTubeTranscriptApi } from '@hallelx/youtube-transcript';
30
+
31
+ const api = new YouTubeTranscriptApi();
32
+ const transcript = await api.fetch('arj7oStGLkU');
33
+
34
+ for (const snippet of transcript) {
35
+ console.log(`[${snippet.start}s] ${snippet.text}`);
36
+ }
37
+ ```
38
+
39
+ `fetch(videoId, options?)` returns a `FetchedTranscript` containing snippets,
40
+ the language, and metadata. The default language is English (`en`); pass
41
+ `languages` to specify a priority list:
42
+
43
+ ```ts
44
+ const transcript = await api.fetch('arj7oStGLkU', {
45
+ languages: ['de', 'en'], // try German first, fall back to English
46
+ });
47
+ ```
48
+
49
+ ## Listing available transcripts
50
+
51
+ ```ts
52
+ const list = await api.list('arj7oStGLkU');
53
+
54
+ for (const transcript of list) {
55
+ console.log(transcript.languageCode, transcript.language, transcript.isGenerated);
56
+ }
57
+
58
+ // Find a specific kind:
59
+ const manual = list.findManuallyCreatedTranscript(['en']);
60
+ const generated = list.findGeneratedTranscript(['en']);
61
+ const fetched = await manual.fetch();
62
+ ```
63
+
64
+ ## Translation
65
+
66
+ ```ts
67
+ const list = await api.list('arj7oStGLkU');
68
+ const en = list.findTranscript(['en']);
69
+
70
+ if (en.isTranslatable) {
71
+ const french = en.translate('fr');
72
+ const fetched = await french.fetch();
73
+ console.log(fetched.snippets);
74
+ }
75
+ ```
76
+
77
+ ## Output formatters
78
+
79
+ ```ts
80
+ import {
81
+ YouTubeTranscriptApi,
82
+ JSONFormatter,
83
+ SRTFormatter,
84
+ WebVTTFormatter,
85
+ TextFormatter,
86
+ } from '@hallelx/youtube-transcript';
87
+
88
+ const transcript = await new YouTubeTranscriptApi().fetch('arj7oStGLkU');
89
+
90
+ console.log(new JSONFormatter().formatTranscript(transcript, { indent: 2 }));
91
+ console.log(new SRTFormatter().formatTranscript(transcript));
92
+ console.log(new WebVTTFormatter().formatTranscript(transcript));
93
+ console.log(new TextFormatter().formatTranscript(transcript));
94
+ ```
95
+
96
+ ## Preserving HTML formatting
97
+
98
+ By default, all HTML tags are stripped from snippet text. To preserve a small
99
+ whitelist of formatting tags (`<strong>`, `<em>`, `<b>`, `<i>`, `<mark>`,
100
+ `<small>`, `<del>`, `<ins>`, `<sub>`, `<sup>`), pass `preserveFormatting: true`:
101
+
102
+ ```ts
103
+ const transcript = await api.fetch('arj7oStGLkU', { preserveFormatting: true });
104
+ ```
105
+
106
+ ## CLI
107
+
108
+ The package ships a `youtube-transcript` binary:
109
+
110
+ ```bash
111
+ youtube-transcript --list-transcripts arj7oStGLkU
112
+ youtube-transcript --languages en --format srt arj7oStGLkU
113
+ youtube-transcript --languages de en --format json arj7oStGLkU dQw4w9WgXcQ
114
+ youtube-transcript --translate fr arj7oStGLkU
115
+ ```
116
+
117
+ Run `youtube-transcript --help` for the full list of options.
118
+
119
+ ## Working around IP bans (proxies)
120
+
121
+ YouTube blocks IPs that make too many requests, especially from cloud
122
+ providers. The library exposes two proxy configurations:
123
+
124
+ ### Generic HTTP/HTTPS proxy
125
+
126
+ ```ts
127
+ import { YouTubeTranscriptApi, GenericProxyConfig } from '@hallelx/youtube-transcript';
128
+
129
+ const api = new YouTubeTranscriptApi({
130
+ proxyConfig: new GenericProxyConfig({
131
+ httpUrl: 'http://user:pass@proxy.example.com:8080',
132
+ httpsUrl: 'http://user:pass@proxy.example.com:8080',
133
+ }),
134
+ });
135
+ ```
136
+
137
+ ### Webshare rotating residential proxies (recommended)
138
+
139
+ ```ts
140
+ import { YouTubeTranscriptApi, WebshareProxyConfig } from '@hallelx/youtube-transcript';
141
+
142
+ const api = new YouTubeTranscriptApi({
143
+ proxyConfig: new WebshareProxyConfig({
144
+ proxyUsername: 'your-webshare-username',
145
+ proxyPassword: 'your-webshare-password',
146
+ }),
147
+ });
148
+ ```
149
+
150
+ ### Runtime support for proxies
151
+
152
+ - **Node.js**: requires the optional peer dependency `undici`. Install it once
153
+ with `npm install undici`. The library lazy-loads it only when a proxy is in
154
+ use.
155
+ - **Bun**: uses Bun's native `fetch({ proxy })` option — no extra deps needed.
156
+ - **Deno**: pass a custom `fetchFn` configured with `Deno.createHttpClient`.
157
+
158
+ ### Custom `fetchFn`
159
+
160
+ For full control (custom HTTPS agents, retries, telemetry), inject your own
161
+ fetch implementation:
162
+
163
+ ```ts
164
+ const api = new YouTubeTranscriptApi({
165
+ fetchFn: async (input, init) => {
166
+ // wrap the global fetch, plug in middleware, etc.
167
+ return fetch(input, init);
168
+ },
169
+ });
170
+ ```
171
+
172
+ ## Error handling
173
+
174
+ All errors thrown by the library extend `YouTubeTranscriptApiException`. The most useful
175
+ subclasses are:
176
+
177
+ | Error | When |
178
+ | --- | --- |
179
+ | `VideoUnavailable` | Video doesn't exist or has been removed |
180
+ | `InvalidVideoId` | A URL was passed instead of a video ID |
181
+ | `VideoUnplayable` | Region-blocked, copyright strike, or similar |
182
+ | `AgeRestricted` | Video requires sign-in for age verification |
183
+ | `TranscriptsDisabled` | Video has no captions enabled |
184
+ | `NoTranscriptFound` | No transcript exists in any of the requested languages |
185
+ | `NotTranslatable` | Tried to translate a non-translatable transcript |
186
+ | `TranslationLanguageNotAvailable` | Translation target language unavailable |
187
+ | `RequestBlocked` / `IpBlocked` | YouTube blocked your IP |
188
+ | `PoTokenRequired` | Video requires a PO Token (rare) |
189
+ | `FailedToCreateConsentCookie` | Could not bypass the EU consent screen |
190
+ | `YouTubeRequestFailed` | Underlying HTTP request failed |
191
+ | `YouTubeDataUnparsable` | YouTube response shape changed unexpectedly |
192
+
193
+ ```ts
194
+ import {
195
+ YouTubeTranscriptApi,
196
+ TranscriptsDisabled,
197
+ NoTranscriptFound,
198
+ } from '@hallelx/youtube-transcript';
199
+
200
+ try {
201
+ const transcript = await new YouTubeTranscriptApi().fetch('xxx');
202
+ } catch (err) {
203
+ if (err instanceof TranscriptsDisabled) {
204
+ console.log('No captions on this video');
205
+ } else if (err instanceof NoTranscriptFound) {
206
+ console.log('No transcript in the requested language');
207
+ } else {
208
+ throw err;
209
+ }
210
+ }
211
+ ```
212
+
213
+ ## API surface
214
+
215
+ ```ts
216
+ class YouTubeTranscriptApi {
217
+ constructor(options?: { proxyConfig?: ProxyConfig; fetchFn?: typeof fetch });
218
+ fetch(videoId: string, options?: { languages?: string[]; preserveFormatting?: boolean }): Promise<FetchedTranscript>;
219
+ list(videoId: string): Promise<TranscriptList>;
220
+ }
221
+
222
+ class TranscriptList implements Iterable<Transcript> {
223
+ videoId: string;
224
+ findTranscript(languageCodes: Iterable<string>): Transcript;
225
+ findManuallyCreatedTranscript(languageCodes: Iterable<string>): Transcript;
226
+ findGeneratedTranscript(languageCodes: Iterable<string>): Transcript;
227
+ }
228
+
229
+ class Transcript {
230
+ videoId: string;
231
+ language: string;
232
+ languageCode: string;
233
+ isGenerated: boolean;
234
+ isTranslatable: boolean;
235
+ translationLanguages: readonly TranslationLanguage[];
236
+ fetch(options?: { preserveFormatting?: boolean }): Promise<FetchedTranscript>;
237
+ translate(languageCode: string): Transcript;
238
+ }
239
+
240
+ class FetchedTranscript implements Iterable<FetchedTranscriptSnippet> {
241
+ snippets: FetchedTranscriptSnippet[];
242
+ videoId: string;
243
+ language: string;
244
+ languageCode: string;
245
+ isGenerated: boolean;
246
+ toRawData(): Array<{ text: string; start: number; duration: number }>;
247
+ }
248
+
249
+ interface FetchedTranscriptSnippet {
250
+ text: string;
251
+ start: number; // seconds
252
+ duration: number; // seconds
253
+ }
254
+ ```
255
+
256
+ ## Differences from the Python library
257
+
258
+ - `pretty` formatter uses `JSON.stringify(data, null, 2)` instead of Python's
259
+ `pprint`. The output is intended for human reading; the data structure is the
260
+ same as in Python, only the printed layout differs.
261
+ - `WebshareProxyConfig` percent-encodes the username and password when building
262
+ the proxy URL (Python relies on the `requests` library to handle this).
263
+ - The constructor takes `fetchFn?: typeof fetch` rather than a
264
+ `requests.Session` instance.
265
+
266
+ ## License
267
+
268
+ MIT. This package is a port of
269
+ [`youtube-transcript-api`](https://github.com/jdepoix/youtube-transcript-api)
270
+ by jdepoix, also MIT-licensed. Please consider supporting the upstream project.