mcp-headless-youtube-transcript 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -11
- package/build/index.js +52 -7
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -6,43 +6,115 @@ An MCP (Model Context Protocol) server that extracts YouTube video transcripts u
|
|
|
6
6
|
|
|
7
7
|
- Extract transcripts from YouTube videos using video ID or full URL
|
|
8
8
|
- Support for multiple languages
|
|
9
|
-
-
|
|
9
|
+
- Automatic pagination for large transcripts (98k character chunks)
|
|
10
|
+
- Clean text output optimized for LLM consumption
|
|
10
11
|
- Built with TypeScript and the MCP SDK
|
|
11
12
|
|
|
12
13
|
## Installation
|
|
13
14
|
|
|
15
|
+
Install via npm:
|
|
16
|
+
|
|
14
17
|
```bash
|
|
15
|
-
npm install
|
|
16
|
-
|
|
18
|
+
npm install -g mcp-headless-youtube-transcript
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Or use directly with npx:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npx mcp-headless-youtube-transcript
|
|
17
25
|
```
|
|
18
26
|
|
|
19
|
-
##
|
|
27
|
+
## MCP Configuration
|
|
20
28
|
|
|
21
|
-
|
|
29
|
+
Add this server to your MCP settings:
|
|
22
30
|
|
|
23
|
-
|
|
31
|
+
```json
|
|
32
|
+
{
|
|
33
|
+
"mcpServers": {
|
|
34
|
+
"youtube-transcript": {
|
|
35
|
+
"command": "npx",
|
|
36
|
+
"args": ["-y", "mcp-headless-youtube-transcript"]
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
```
|
|
24
41
|
|
|
25
|
-
|
|
42
|
+
## Tools Available
|
|
26
43
|
|
|
27
|
-
|
|
44
|
+
### `get_youtube_transcript`
|
|
28
45
|
|
|
29
|
-
Extracts transcript/captions from a YouTube video.
|
|
46
|
+
Extracts transcript/captions from a YouTube video with automatic pagination for large transcripts.
|
|
30
47
|
|
|
31
48
|
**Parameters:**
|
|
32
49
|
- `videoId` (required): YouTube video ID or full URL
|
|
33
50
|
- `lang` (optional): Language code for captions (e.g., "en", "es", "ko"). Defaults to "en"
|
|
51
|
+
- `segment` (optional): Segment number to retrieve (1-based). Each segment is ~98k characters. Defaults to 1
|
|
34
52
|
|
|
35
|
-
**
|
|
53
|
+
**Examples:**
|
|
54
|
+
|
|
55
|
+
Basic usage:
|
|
56
|
+
```json
|
|
57
|
+
{
|
|
58
|
+
"name": "get_youtube_transcript",
|
|
59
|
+
"arguments": {
|
|
60
|
+
"videoId": "dQw4w9WgXcQ"
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
With language:
|
|
36
66
|
```json
|
|
37
67
|
{
|
|
38
68
|
"name": "get_youtube_transcript",
|
|
39
69
|
"arguments": {
|
|
40
70
|
"videoId": "dQw4w9WgXcQ",
|
|
41
|
-
"lang": "
|
|
71
|
+
"lang": "es"
|
|
42
72
|
}
|
|
43
73
|
}
|
|
44
74
|
```
|
|
45
75
|
|
|
76
|
+
With pagination:
|
|
77
|
+
```json
|
|
78
|
+
{
|
|
79
|
+
"name": "get_youtube_transcript",
|
|
80
|
+
"arguments": {
|
|
81
|
+
"videoId": "dQw4w9WgXcQ",
|
|
82
|
+
"segment": 2
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Response Format
|
|
88
|
+
|
|
89
|
+
The tool returns the raw transcript text. For large transcripts, the response includes pagination information:
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
[Segment 1 of 3]
|
|
93
|
+
|
|
94
|
+
this is the actual transcript text content...
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
When multiple segments are available, you can retrieve subsequent segments by incrementing the `segment` parameter.
|
|
98
|
+
|
|
99
|
+
## Caching
|
|
100
|
+
|
|
101
|
+
The server includes built-in caching to improve performance for paginated requests. The cache behavior can be configured with an environment variable:
|
|
102
|
+
|
|
103
|
+
- `TRANSCRIPT_CACHE_TTL`: Cache duration in seconds (default: 300 = 5 minutes)
|
|
104
|
+
|
|
105
|
+
### Cache Features:
|
|
106
|
+
- Full transcripts are cached on first fetch
|
|
107
|
+
- Cache expiration time is updated on each read or write
|
|
108
|
+
- Expired entries are automatically cleaned up after each request
|
|
109
|
+
- Each video+language combination is cached separately
|
|
110
|
+
|
|
111
|
+
### Setting Cache Duration:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Set cache to 10 minutes
|
|
115
|
+
TRANSCRIPT_CACHE_TTL=600 npx mcp-headless-youtube-transcript
|
|
116
|
+
```
|
|
117
|
+
|
|
46
118
|
## Supported URL Formats
|
|
47
119
|
|
|
48
120
|
- Video ID: `dQw4w9WgXcQ`
|
package/build/index.js
CHANGED
|
@@ -4,6 +4,41 @@ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
|
|
|
4
4
|
import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
5
5
|
import { getSubtitles } from 'headless-youtube-captions';
|
|
6
6
|
import { extractVideoId } from './utils.js';
|
|
7
|
+
// In-memory cache
|
|
8
|
+
const transcriptCache = new Map();
|
|
9
|
+
// Get cache TTL from environment variable (default 5 minutes)
|
|
10
|
+
/**
 * Parses the cache time-to-live, in seconds, from a raw environment value.
 *
 * A missing or empty value, or one that does not start with a decimal
 * integer, falls back to the default of 300 seconds (5 minutes). Without this
 * fallback a non-numeric value would produce NaN, every computed expiry
 * timestamp would be NaN, and `now > NaN` is always false, so cache entries
 * would never expire.
 *
 * @param {string | undefined} rawValue - Raw TRANSCRIPT_CACHE_TTL value.
 * @returns {number} TTL in seconds; never NaN.
 */
function parseCacheTtlSeconds(rawValue) {
    const DEFAULT_TTL_SECONDS = 300;
    // Always pass the radix so the value is read as a decimal number of seconds.
    const parsed = Number.parseInt(rawValue || String(DEFAULT_TTL_SECONDS), 10);
    return Number.isNaN(parsed) ? DEFAULT_TTL_SECONDS : parsed;
}
const CACHE_TTL_SECONDS = parseCacheTtlSeconds(process.env.TRANSCRIPT_CACHE_TTL);
|
|
11
|
+
// Cache helper functions
|
|
12
|
+
/**
 * Builds the cache key for a transcript by joining the video ID and the
 * language code with a colon.
 *
 * @param videoId - Video ID component of the key.
 * @param lang - Language code component of the key.
 * @returns {string} Key of the form `<videoId>:<lang>`.
 */
function getCacheKey(videoId, lang) {
|
|
13
|
+
return `${videoId}:${lang}`;
|
|
14
|
+
}
|
|
15
|
+
/**
 * Looks up a cached transcript for a video/language pair.
 *
 * Returns null when there is no entry or when the entry's `expiresAt`
 * timestamp is in the past (the stale entry is deleted in that case).
 * On a hit, the entry's expiry is pushed forward by CACHE_TTL_SECONDS from
 * the current time before the transcript is returned.
 *
 * @param videoId - Video ID used to build the cache key.
 * @param lang - Language code used to build the cache key.
 * @returns The cached `transcript` value, or null on a miss.
 */
function getCachedTranscript(videoId, lang) {
|
|
16
|
+
const key = getCacheKey(videoId, lang);
|
|
17
|
+
const entry = transcriptCache.get(key);
|
|
18
|
+
if (!entry)
|
|
19
|
+
return null;
|
|
20
|
+
const now = Date.now();
|
|
21
|
+
// Stale entry: evict it and report a miss
|
if (now > entry.expiresAt) {
|
|
22
|
+
transcriptCache.delete(key);
|
|
23
|
+
return null;
|
|
24
|
+
}
|
|
25
|
+
// Update expiration time on read
|
|
26
|
+
entry.expiresAt = now + (CACHE_TTL_SECONDS * 1000);
|
|
27
|
+
return entry.transcript;
|
|
28
|
+
}
|
|
29
|
+
/**
 * Stores a transcript in the in-memory cache under the video/language key,
 * with an expiry timestamp of CACHE_TTL_SECONDS from now (in milliseconds).
 * Any existing entry under the same key is replaced.
 *
 * @param videoId - Video ID used to build the cache key.
 * @param lang - Language code used to build the cache key.
 * @param transcript - Transcript value to cache.
 */
function setCachedTranscript(videoId, lang, transcript) {
|
|
30
|
+
const key = getCacheKey(videoId, lang);
|
|
31
|
+
const expiresAt = Date.now() + (CACHE_TTL_SECONDS * 1000);
|
|
32
|
+
transcriptCache.set(key, { transcript, expiresAt });
|
|
33
|
+
}
|
|
34
|
+
/**
 * Walks every entry in the transcript cache and deletes those whose
 * `expiresAt` timestamp is earlier than the current time.
 */
function cleanupExpiredCache() {
|
|
35
|
+
const now = Date.now();
|
|
36
|
+
for (const [key, entry] of transcriptCache.entries()) {
|
|
37
|
+
if (now > entry.expiresAt) {
|
|
38
|
+
transcriptCache.delete(key);
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
7
42
|
const server = new Server({
|
|
8
43
|
name: 'mcp-headless-youtube-transcript',
|
|
9
44
|
version: '1.0.0',
|
|
@@ -54,13 +89,19 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
54
89
|
if (!extractedVideoId) {
|
|
55
90
|
throw new Error('Invalid YouTube video ID or URL');
|
|
56
91
|
}
|
|
57
|
-
//
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
92
|
+
// Check cache first
|
|
93
|
+
let fullTranscript = getCachedTranscript(extractedVideoId, lang);
|
|
94
|
+
if (!fullTranscript) {
|
|
95
|
+
// Get subtitles using headless-youtube-captions
|
|
96
|
+
const subtitles = await getSubtitles({
|
|
97
|
+
videoID: extractedVideoId,
|
|
98
|
+
lang: lang,
|
|
99
|
+
});
|
|
100
|
+
// Get the full raw text content
|
|
101
|
+
fullTranscript = subtitles.map(s => s.text).join(' ');
|
|
102
|
+
// Cache the full transcript
|
|
103
|
+
setCachedTranscript(extractedVideoId, lang, fullTranscript);
|
|
104
|
+
}
|
|
64
105
|
// Split into 98k character chunks
|
|
65
106
|
const chunkSize = 98000;
|
|
66
107
|
const chunks = [];
|
|
@@ -106,6 +147,10 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
106
147
|
isError: true,
|
|
107
148
|
};
|
|
108
149
|
}
|
|
150
|
+
finally {
|
|
151
|
+
// Cleanup expired cache entries after each request
|
|
152
|
+
cleanupExpiredCache();
|
|
153
|
+
}
|
|
109
154
|
}
|
|
110
155
|
throw new Error(`Unknown tool: ${name}`);
|
|
111
156
|
});
|