mcp-headless-youtube-transcript 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/README.md +83 -11
  2. package/build/index.js +52 -7
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -6,43 +6,115 @@ An MCP (Model Context Protocol) server that extracts YouTube video transcripts u
6
6
 
7
7
  - Extract transcripts from YouTube videos using video ID or full URL
8
8
  - Support for multiple languages
9
- - Timestamped transcript output
9
+ - Automatic pagination for large transcripts (98k character chunks)
10
+ - Clean text output optimized for LLM consumption
10
11
  - Built with TypeScript and the MCP SDK
11
12
 
12
13
  ## Installation
13
14
 
15
+ Install via npm:
16
+
14
17
  ```bash
15
- npm install
16
- npm run build
18
+ npm install -g mcp-headless-youtube-transcript
19
+ ```
20
+
21
+ Or use directly with npx:
22
+
23
+ ```bash
24
+ npx mcp-headless-youtube-transcript
17
25
  ```
18
26
 
19
- ## Usage
27
+ ## MCP Configuration
20
28
 
21
- ### As an MCP Server
29
+ Add this server to your MCP settings:
22
30
 
23
- This server implements the Model Context Protocol and can be used with MCP clients.
31
+ ```json
32
+ {
33
+ "mcpServers": {
34
+ "youtube-transcript": {
35
+ "command": "npx",
36
+ "args": ["-y", "mcp-headless-youtube-transcript"]
37
+ }
38
+ }
39
+ }
40
+ ```
24
41
 
25
- ### Tools Available
42
+ ## Tools Available
26
43
 
27
- #### `get_youtube_transcript`
44
+ ### `get_youtube_transcript`
28
45
 
29
- Extracts transcript/captions from a YouTube video.
46
+ Extracts transcript/captions from a YouTube video with automatic pagination for large transcripts.
30
47
 
31
48
  **Parameters:**
32
49
  - `videoId` (required): YouTube video ID or full URL
33
50
  - `lang` (optional): Language code for captions (e.g., "en", "es", "ko"). Defaults to "en"
51
+ - `segment` (optional): Segment number to retrieve (1-based). Each segment is ~98k characters. Defaults to 1
34
52
 
35
- **Example:**
53
+ **Examples:**
54
+
55
+ Basic usage:
56
+ ```json
57
+ {
58
+ "name": "get_youtube_transcript",
59
+ "arguments": {
60
+ "videoId": "dQw4w9WgXcQ"
61
+ }
62
+ }
63
+ ```
64
+
65
+ With language:
36
66
  ```json
37
67
  {
38
68
  "name": "get_youtube_transcript",
39
69
  "arguments": {
40
70
  "videoId": "dQw4w9WgXcQ",
41
- "lang": "en"
71
+ "lang": "es"
42
72
  }
43
73
  }
44
74
  ```
45
75
 
76
+ With pagination:
77
+ ```json
78
+ {
79
+ "name": "get_youtube_transcript",
80
+ "arguments": {
81
+ "videoId": "dQw4w9WgXcQ",
82
+ "segment": 2
83
+ }
84
+ }
85
+ ```
86
+
87
+ ## Response Format
88
+
89
+ The tool returns the raw transcript text. For large transcripts, the response includes pagination information:
90
+
91
+ ```
92
+ [Segment 1 of 3]
93
+
94
+ this is the actual transcript text content...
95
+ ```
96
+
97
+ When multiple segments are available, you can retrieve subsequent segments by incrementing the `segment` parameter.
98
+
99
+ ## Caching
100
+
101
+ The server includes built-in caching to improve performance for paginated requests. The cache behavior can be configured with an environment variable:
102
+
103
+ - `TRANSCRIPT_CACHE_TTL`: Cache duration in seconds (default: 300 = 5 minutes)
104
+
105
+ ### Cache Features:
106
+ - Full transcripts are cached on first fetch
107
+ - Cache expiration time is updated on each read or write
108
+ - Expired entries are automatically cleaned up after each request
109
+ - Each video+language combination is cached separately
110
+
111
+ ### Setting Cache Duration:
112
+
113
+ ```bash
114
+ # Set cache to 10 minutes
115
+ TRANSCRIPT_CACHE_TTL=600 npx mcp-headless-youtube-transcript
116
+ ```
117
+
46
118
  ## Supported URL Formats
47
119
 
48
120
  - Video ID: `dQw4w9WgXcQ`
package/build/index.js CHANGED
@@ -4,6 +4,41 @@ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
4
4
  import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
5
5
  import { getSubtitles } from 'headless-youtube-captions';
6
6
  import { extractVideoId } from './utils.js';
7
+ // In-memory cache
8
+ const transcriptCache = new Map();
9
+ // Get cache TTL from environment variable (default 5 minutes)
10
+ const CACHE_TTL_SECONDS = parseInt(process.env.TRANSCRIPT_CACHE_TTL || '300');
11
+ // Cache helper functions
12
+ function getCacheKey(videoId, lang) {
13
+ return `${videoId}:${lang}`;
14
+ }
15
+ function getCachedTranscript(videoId, lang) {
16
+ const key = getCacheKey(videoId, lang);
17
+ const entry = transcriptCache.get(key);
18
+ if (!entry)
19
+ return null;
20
+ const now = Date.now();
21
+ if (now > entry.expiresAt) {
22
+ transcriptCache.delete(key);
23
+ return null;
24
+ }
25
+ // Update expiration time on read
26
+ entry.expiresAt = now + (CACHE_TTL_SECONDS * 1000);
27
+ return entry.transcript;
28
+ }
29
+ function setCachedTranscript(videoId, lang, transcript) {
30
+ const key = getCacheKey(videoId, lang);
31
+ const expiresAt = Date.now() + (CACHE_TTL_SECONDS * 1000);
32
+ transcriptCache.set(key, { transcript, expiresAt });
33
+ }
34
+ function cleanupExpiredCache() {
35
+ const now = Date.now();
36
+ for (const [key, entry] of transcriptCache.entries()) {
37
+ if (now > entry.expiresAt) {
38
+ transcriptCache.delete(key);
39
+ }
40
+ }
41
+ }
7
42
  const server = new Server({
8
43
  name: 'mcp-headless-youtube-transcript',
9
44
  version: '1.0.0',
@@ -54,13 +89,19 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
54
89
  if (!extractedVideoId) {
55
90
  throw new Error('Invalid YouTube video ID or URL');
56
91
  }
57
- // Get subtitles using headless-youtube-captions
58
- const subtitles = await getSubtitles({
59
- videoID: extractedVideoId,
60
- lang: lang,
61
- });
62
- // Get the full raw text content
63
- const fullTranscript = subtitles.map(s => s.text).join(' ');
92
+ // Check cache first
93
+ let fullTranscript = getCachedTranscript(extractedVideoId, lang);
94
+ if (!fullTranscript) {
95
+ // Get subtitles using headless-youtube-captions
96
+ const subtitles = await getSubtitles({
97
+ videoID: extractedVideoId,
98
+ lang: lang,
99
+ });
100
+ // Get the full raw text content
101
+ fullTranscript = subtitles.map(s => s.text).join(' ');
102
+ // Cache the full transcript
103
+ setCachedTranscript(extractedVideoId, lang, fullTranscript);
104
+ }
64
105
  // Split into 98k character chunks
65
106
  const chunkSize = 98000;
66
107
  const chunks = [];
@@ -106,6 +147,10 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
106
147
  isError: true,
107
148
  };
108
149
  }
150
+ finally {
151
+ // Cleanup expired cache entries after each request
152
+ cleanupExpiredCache();
153
+ }
109
154
  }
110
155
  throw new Error(`Unknown tool: ${name}`);
111
156
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mcp-headless-youtube-transcript",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "MCP server for extracting YouTube video transcripts using headless-youtube-captions",
5
5
  "main": "build/index.js",
6
6
  "bin": {