@nadimtuhin/ytranscript 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +301 -0
- package/dist/cli.js +2569 -0
- package/dist/index.js +616 -0
- package/dist/mcp.js +28709 -0
- package/package.json +65 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
# ytranscript
|
|
2
|
+
|
|
3
|
+
Fast YouTube transcript extraction with bulk processing, Google Takeout support, MCP server, and multiple output formats.
|
|
4
|
+
|
|
5
|
+
Built with [Bun](https://bun.sh) for maximum performance.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Direct YouTube API** - No third-party services, uses YouTube's innertube API
|
|
10
|
+
- **MCP Server** - Use with Claude, Cursor, and other AI assistants via Model Context Protocol
|
|
11
|
+
- **Bulk processing** - Process thousands of videos with concurrency control
|
|
12
|
+
- **Google Takeout support** - Import from watch history JSON and watch-later CSV
|
|
13
|
+
- **Resume-safe** - Automatically skips already-processed videos
|
|
14
|
+
- **Multiple output formats** - JSON, JSONL, CSV, SRT, VTT, plain text
|
|
15
|
+
- **Language selection** - Choose preferred transcript languages
|
|
16
|
+
- **Programmatic API** - Use as a library in your TypeScript/JavaScript projects
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Install globally
|
|
22
|
+
bun install -g @nadimtuhin/ytranscript
|
|
23
|
+
|
|
24
|
+
# Or use locally in a project
|
|
25
|
+
bun add @nadimtuhin/ytranscript
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## CLI Usage
|
|
29
|
+
|
|
30
|
+
### Fetch a single transcript
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Basic usage (outputs plain text)
|
|
34
|
+
ytranscript get dQw4w9WgXcQ
|
|
35
|
+
|
|
36
|
+
# From URL
|
|
37
|
+
ytranscript get "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
|
38
|
+
|
|
39
|
+
# With specific language
|
|
40
|
+
ytranscript get dQw4w9WgXcQ --lang es
|
|
41
|
+
|
|
42
|
+
# Output as SRT subtitles
|
|
43
|
+
ytranscript get dQw4w9WgXcQ --format srt -o video.srt
|
|
44
|
+
|
|
45
|
+
# Output as JSON with timestamps
|
|
46
|
+
ytranscript get dQw4w9WgXcQ --format json
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Check available languages
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
ytranscript info dQw4w9WgXcQ
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Bulk processing
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
# From Google Takeout exports
|
|
59
|
+
ytranscript bulk \
|
|
60
|
+
--history "Takeout/YouTube and YouTube Music/history/watch-history.json" \
|
|
61
|
+
--watch-later "Takeout/YouTube and YouTube Music/playlists/Watch later-videos.csv" \
|
|
62
|
+
--out-jsonl transcripts.jsonl \
|
|
63
|
+
--out-csv transcripts.csv
|
|
64
|
+
|
|
65
|
+
# From a list of video IDs
|
|
66
|
+
ytranscript bulk --videos "dQw4w9WgXcQ,jNQXAC9IVRw,9bZkp7q19f0"
|
|
67
|
+
|
|
68
|
+
# From a file (one ID or URL per line)
|
|
69
|
+
ytranscript bulk --file videos.txt
|
|
70
|
+
|
|
71
|
+
# Resume a previous run
|
|
72
|
+
ytranscript bulk --history watch-history.json --resume
|
|
73
|
+
|
|
74
|
+
# Control concurrency and rate limiting
|
|
75
|
+
ytranscript bulk \
|
|
76
|
+
--history watch-history.json \
|
|
77
|
+
--concurrency 8 \
|
|
78
|
+
--pause-after 20 \
|
|
79
|
+
--pause-ms 3000
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Programmatic API
|
|
83
|
+
|
|
84
|
+
### Fetch a single transcript
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
import { fetchTranscript } from '@nadimtuhin/ytranscript';
|
|
88
|
+
|
|
89
|
+
const transcript = await fetchTranscript('dQw4w9WgXcQ', {
|
|
90
|
+
languages: ['en', 'es'], // Preference order
|
|
91
|
+
includeAutoGenerated: true,
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
console.log(transcript.text); // Full transcript text
|
|
95
|
+
console.log(transcript.segments); // Array of { text, start, duration }
|
|
96
|
+
console.log(transcript.language); // 'en'
|
|
97
|
+
console.log(transcript.isAutoGenerated); // true/false
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Bulk processing
|
|
101
|
+
|
|
102
|
+
```typescript
|
|
103
|
+
import {
|
|
104
|
+
loadWatchHistory,
|
|
105
|
+
loadWatchLater,
|
|
106
|
+
mergeVideoSources,
|
|
107
|
+
processVideos,
|
|
108
|
+
} from '@nadimtuhin/ytranscript';
|
|
109
|
+
|
|
110
|
+
// Load from Google Takeout
|
|
111
|
+
const history = await loadWatchHistory('./watch-history.json');
|
|
112
|
+
const watchLater = await loadWatchLater('./watch-later.csv');
|
|
113
|
+
|
|
114
|
+
// Merge and deduplicate
|
|
115
|
+
const videos = mergeVideoSources(history, watchLater);
|
|
116
|
+
|
|
117
|
+
// Process with progress callback
|
|
118
|
+
const results = await processVideos(videos, {
|
|
119
|
+
concurrency: 4,
|
|
120
|
+
pauseAfter: 10,
|
|
121
|
+
pauseDuration: 5000,
|
|
122
|
+
onProgress: (completed, total, result) => {
|
|
123
|
+
const status = result.transcript ? 'OK' : 'FAIL';
|
|
124
|
+
console.log(`[${completed}/${total}] ${result.meta.videoId}: ${status}`);
|
|
125
|
+
},
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
// Filter successful results
|
|
129
|
+
const transcripts = results.filter((r) => r.transcript);
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Streaming for large datasets
|
|
133
|
+
|
|
134
|
+
```typescript
|
|
135
|
+
import { streamVideos, appendJsonl } from '@nadimtuhin/ytranscript';
|
|
136
|
+
|
|
137
|
+
for await (const result of streamVideos(videos, { concurrency: 4 })) {
|
|
138
|
+
// Write each result immediately (resume-safe)
|
|
139
|
+
await appendJsonl(result, 'output.jsonl');
|
|
140
|
+
}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Output formatting
|
|
144
|
+
|
|
145
|
+
```typescript
|
|
146
|
+
import { fetchTranscript, formatSrt, formatVtt, formatText } from '@nadimtuhin/ytranscript';
|
|
147
|
+
|
|
148
|
+
const transcript = await fetchTranscript('dQw4w9WgXcQ');
|
|
149
|
+
|
|
150
|
+
// SRT subtitles
|
|
151
|
+
const srt = formatSrt(transcript);
|
|
152
|
+
await Bun.write('video.srt', srt);
|
|
153
|
+
|
|
154
|
+
// VTT subtitles
|
|
155
|
+
const vtt = formatVtt(transcript);
|
|
156
|
+
await Bun.write('video.vtt', vtt);
|
|
157
|
+
|
|
158
|
+
// Plain text with timestamps
|
|
159
|
+
const text = formatText(transcript, true);
|
|
160
|
+
// [0:00] First line of transcript
|
|
161
|
+
// [0:05] Second line...
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Google Takeout
|
|
165
|
+
|
|
166
|
+
To export your YouTube data:
|
|
167
|
+
|
|
168
|
+
1. Go to [Google Takeout](https://takeout.google.com/)
|
|
169
|
+
2. Deselect all, then select only "YouTube and YouTube Music"
|
|
170
|
+
3. Click "All YouTube data included" and select:
|
|
171
|
+
   - History → Watch history
|
|
172
|
+
   - Playlists (includes Watch Later)
|
|
173
|
+
4. Export and download
|
|
174
|
+
5. Extract the archive
|
|
175
|
+
|
|
176
|
+
The relevant files are:
|
|
177
|
+
- `Takeout/YouTube and YouTube Music/history/watch-history.json`
|
|
178
|
+
- `Takeout/YouTube and YouTube Music/playlists/Watch later-videos.csv`
|
|
179
|
+
|
|
180
|
+
## API Reference
|
|
181
|
+
|
|
182
|
+
### Types
|
|
183
|
+
|
|
184
|
+
```typescript
|
|
185
|
+
interface Transcript {
|
|
186
|
+
videoId: string;
|
|
187
|
+
text: string;
|
|
188
|
+
segments: TranscriptSegment[];
|
|
189
|
+
language: string;
|
|
190
|
+
isAutoGenerated: boolean;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
interface TranscriptSegment {
|
|
194
|
+
text: string;
|
|
195
|
+
start: number; // seconds
|
|
196
|
+
duration: number; // seconds
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
interface TranscriptResult {
|
|
200
|
+
meta: WatchHistoryMeta;
|
|
201
|
+
transcript: Transcript | null;
|
|
202
|
+
error?: string;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
interface FetchOptions {
|
|
206
|
+
languages?: string[];
|
|
207
|
+
timeout?: number;
|
|
208
|
+
includeAutoGenerated?: boolean;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
interface BulkOptions extends FetchOptions {
|
|
212
|
+
concurrency?: number;
|
|
213
|
+
pauseAfter?: number;
|
|
214
|
+
pauseDuration?: number;
|
|
215
|
+
skipIds?: Set<string>;
|
|
216
|
+
onProgress?: (completed: number, total: number, result: TranscriptResult) => void;
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## License
|
|
221
|
+
|
|
222
|
+
MIT
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## MCP Server (Model Context Protocol)
|
|
227
|
+
|
|
228
|
+
ytranscript includes an MCP server that allows AI assistants like Claude to fetch YouTube transcripts directly.
|
|
229
|
+
|
|
230
|
+
### Available Tools
|
|
231
|
+
|
|
232
|
+
| Tool | Description |
|
|
233
|
+
|------|-------------|
|
|
234
|
+
| `get_transcript` | Fetch transcript for a YouTube video with format options (text, segments, srt, vtt) |
|
|
235
|
+
| `get_transcript_languages` | List available caption languages for a video |
|
|
236
|
+
| `extract_video_id` | Extract video ID from various YouTube URL formats |
|
|
237
|
+
| `get_transcripts_bulk` | Fetch transcripts for multiple videos at once |
|
|
238
|
+
|
|
239
|
+
### Setup with Claude Desktop
|
|
240
|
+
|
|
241
|
+
Add to your Claude Desktop config (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
|
|
242
|
+
|
|
243
|
+
```json
|
|
244
|
+
{
|
|
245
|
+
"mcpServers": {
|
|
246
|
+
"ytranscript": {
|
|
247
|
+
"command": "npx",
|
|
248
|
+
"args": ["-y", "--package=@nadimtuhin/ytranscript", "ytranscript-mcp"]
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Or if installed globally:
|
|
255
|
+
|
|
256
|
+
```json
|
|
257
|
+
{
|
|
258
|
+
"mcpServers": {
|
|
259
|
+
"ytranscript": {
|
|
260
|
+
"command": "ytranscript-mcp"
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Setup with Cursor
|
|
267
|
+
|
|
268
|
+
Add to your Cursor MCP settings:
|
|
269
|
+
|
|
270
|
+
```json
|
|
271
|
+
{
|
|
272
|
+
"mcpServers": {
|
|
273
|
+
"ytranscript": {
|
|
274
|
+
"command": "npx",
|
|
275
|
+
"args": ["-y", "--package=@nadimtuhin/ytranscript", "ytranscript-mcp"]
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Example Usage in Claude
|
|
282
|
+
|
|
283
|
+
Once configured, you can ask Claude:
|
|
284
|
+
|
|
285
|
+
- "Get the transcript for this YouTube video: https://youtube.com/watch?v=dQw4w9WgXcQ"
|
|
286
|
+
- "What languages are available for this video?"
|
|
287
|
+
- "Summarize the transcript of this video"
|
|
288
|
+
- "Get transcripts for these 5 videos and compare their content"
|
|
289
|
+
|
|
290
|
+
### Running the MCP Server Manually
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
# Via npx
|
|
294
|
+
npx --package=@nadimtuhin/ytranscript ytranscript-mcp
|
|
295
|
+
|
|
296
|
+
# Or if installed globally
|
|
297
|
+
ytranscript-mcp
|
|
298
|
+
|
|
299
|
+
# For development
|
|
300
|
+
bun run dev:mcp
|
|
301
|
+
```
|