yttranscript-mcp 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ytt/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ """yttranscript-mcp — fetch, clean, and search YouTube transcripts.
2
+
3
+ Captions-first (fast, low-bandwidth, rate-limit resistant) with an optional
4
+ local Whisper fallback, plus LLM-friendly cleaning that strips rolling-caption
5
+ duplication and timestamps.
6
+ """
7
+
8
+ from .cleaner import clean_segments, estimate_tokens, merge_overlapping
9
+ from .search_service import search, search_and_get_transcripts
10
+ from .searcher import VideoSearchResult
11
+ from .service import ServiceResult, get_transcript, get_transcripts_batch
12
+
13
# Package version string; it is also listed in __all__ below.
__version__ = "0.2.0"

# Names exposed by a star-import of the package. Each one is imported from a
# submodule (cleaner, search_service, searcher, service) earlier in this file,
# except __version__, which is defined just above.
__all__ = [
    "get_transcript",
    "get_transcripts_batch",
    "ServiceResult",
    "VideoSearchResult",
    "search",
    "search_and_get_transcripts",
    "clean_segments",
    "merge_overlapping",
    "estimate_tokens",
    "__version__",
]
ytt/cache.py ADDED
@@ -0,0 +1,238 @@
1
+ """SQLite-backed transcript cache for avoiding redundant fetches."""
2
+
3
+ import asyncio
4
+ import json
5
+ from dataclasses import dataclass
6
+ from datetime import datetime, timedelta
7
+
8
+ import aiosqlite
9
+
10
+ from .config import config
11
+
12
+
13
@dataclass
class CachedTranscript:
    """In-memory view of one row of the transcript cache.

    ``created_at`` and ``expires_at`` are compared against ``datetime.now()``,
    i.e. a naive local timestamp.
    """

    video_id: str
    language: str
    source: str  # 'innertube' or 'whisper'
    raw_data: dict  # JSON representation
    created_at: datetime
    expires_at: datetime

    def is_expired(self) -> bool:
        """Return True once the current time has reached ``expires_at``."""
        current_time = datetime.now()
        return current_time >= self.expires_at
27
+
28
+
29
def _row_to_cached_transcript(row: tuple) -> CachedTranscript:
    """Build a CachedTranscript from a six-column ``transcripts`` row.

    The row must be ordered as (video_id, lang, source, raw_data, created_at,
    expires_at). Both timestamps are ISO-8601 strings; ``raw_data`` is decoded
    from JSON when it arrives as text and passed through unchanged otherwise.
    """
    video_id, language, source, payload, created_text, expires_text = row

    # Timestamps first, then the JSON payload (same order as before, so the
    # first error raised for a malformed row is unchanged).
    created = datetime.fromisoformat(created_text)
    expires = datetime.fromisoformat(expires_text)
    decoded = json.loads(payload) if isinstance(payload, str) else payload

    return CachedTranscript(
        video_id=video_id,
        language=language,
        source=source,
        raw_data=decoded,
        created_at=created,
        expires_at=expires,
    )
49
+
50
+
51
class TranscriptCache:
    """SQLite-backed cache for transcript data.

    Stores fetched transcripts to avoid redundant API calls. Entries are keyed
    by (video_id, lang) and carry an ``expires_at`` timestamp; entries written
    without an explicit TTL expire after ``config.CACHE_TTL_DAYS`` days.

    Every operation opens its own aiosqlite connection to ``db_path``.
    Timestamps are naive ``datetime.now()`` values stored as ISO-8601 text.
    """

    def __init__(self, db_path: str | None = None):
        """Remember the database path; the schema is created lazily.

        Args:
            db_path: SQLite file to use. Falls back to ``config.CACHE_DB_PATH``
                when omitted (or empty).
        """
        self.db_path = db_path or config.CACHE_DB_PATH
        self._initialized = False
        self._lock = asyncio.Lock()

    async def initialize(self) -> None:
        """Create the ``transcripts`` table and its expiry index if missing.

        Safe to call repeatedly: once the schema has been created the call
        returns immediately.
        """
        if self._initialized:
            return

        async with self._lock:
            # Re-check under the lock: another coroutine may have finished the
            # setup while this one was waiting.
            if self._initialized:
                return

            async with aiosqlite.connect(self.db_path) as db:
                await db.execute("""
                    CREATE TABLE IF NOT EXISTS transcripts (
                        video_id TEXT NOT NULL,
                        lang TEXT NOT NULL,
                        source TEXT NOT NULL,
                        raw_data TEXT NOT NULL,
                        created_at TEXT NOT NULL,
                        expires_at TEXT NOT NULL,
                        PRIMARY KEY (video_id, lang)
                    )
                """)
                await db.execute("""
                    CREATE INDEX IF NOT EXISTS idx_expires_at
                    ON transcripts(expires_at)
                """)
                await db.commit()

            self._initialized = True

    async def get(self, video_id: str, language: str = "en") -> CachedTranscript | None:
        """Get a cached transcript if it exists and is not expired.

        An expired entry is deleted as a side effect of the lookup.

        Args:
            video_id: The YouTube video ID.
            language: The language code (default: 'en').

        Returns:
            CachedTranscript if found and not expired, None otherwise.
        """
        await self.initialize()

        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row
            cursor = await db.execute(
                """
                SELECT video_id, lang, source, raw_data, created_at, expires_at
                FROM transcripts
                WHERE video_id = ? AND lang = ?
                """,
                (video_id, language),
            )
            row = await cursor.fetchone()

        if row is None:
            return None

        transcript = _row_to_cached_transcript(tuple(row))

        # Drop stale rows eagerly so they do not linger until the next cleanup.
        if transcript.is_expired():
            await self.delete(video_id, language)
            return None

        return transcript

    async def set(
        self,
        video_id: str,
        language: str,
        raw_data: dict,
        source: str,
        ttl_days: int | None = None,
    ) -> None:
        """Store a transcript in the cache, replacing any existing entry.

        Args:
            video_id: The YouTube video ID.
            language: The language code.
            raw_data: The transcript data as a JSON-serialisable dict.
            source: 'innertube' or 'whisper'.
            ttl_days: Override the default TTL. ``None`` selects
                ``config.CACHE_TTL_DAYS``; an explicit ``0`` is honoured and
                yields an entry that is already expired.
        """
        await self.initialize()

        # ``is None`` rather than ``or``: an explicit ttl_days=0 must not be
        # silently replaced by the configured default.
        ttl = config.CACHE_TTL_DAYS if ttl_days is None else ttl_days
        now = datetime.now()
        expires_at = now + timedelta(days=ttl)

        raw_json = json.dumps(raw_data, ensure_ascii=False)

        async with aiosqlite.connect(self.db_path) as db:
            await db.execute(
                """
                INSERT OR REPLACE INTO transcripts
                (video_id, lang, source, raw_data, created_at, expires_at)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (
                    video_id,
                    language,
                    source,
                    raw_json,
                    now.isoformat(),
                    expires_at.isoformat(),
                ),
            )
            await db.commit()

    async def delete(self, video_id: str, language: str = "en") -> None:
        """Delete a cached transcript (no error if it does not exist).

        Args:
            video_id: The YouTube video ID.
            language: The language code.
        """
        await self.initialize()

        async with aiosqlite.connect(self.db_path) as db:
            await db.execute(
                "DELETE FROM transcripts WHERE video_id = ? AND lang = ?",
                (video_id, language),
            )
            await db.commit()

    async def cleanup_expired(self) -> int:
        """Remove all expired cache entries.

        Returns:
            Number of entries removed.
        """
        await self.initialize()

        # ISO-8601 strings produced by isoformat() sort chronologically, so a
        # plain text comparison is enough here.
        now = datetime.now().isoformat()

        async with aiosqlite.connect(self.db_path) as db:
            cursor = await db.execute(
                "DELETE FROM transcripts WHERE expires_at < ?",
                (now,),
            )
            await db.commit()
            return cursor.rowcount

    async def get_stats(self) -> dict:
        """Get cache statistics.

        Returns:
            Dict with total_entries, expired_entries, and entries_by_source
            (a mapping of source name to row count).
        """
        await self.initialize()

        async with aiosqlite.connect(self.db_path) as db:
            # Total entries
            cursor = await db.execute("SELECT COUNT(*) FROM transcripts")
            total = (await cursor.fetchone())[0]

            # Expired entries (still stored, but past their expiry time)
            now = datetime.now().isoformat()
            cursor = await db.execute(
                "SELECT COUNT(*) FROM transcripts WHERE expires_at < ?",
                (now,),
            )
            expired = (await cursor.fetchone())[0]

            # By source: each row is a (source, count) pair
            cursor = await db.execute("SELECT source, COUNT(*) FROM transcripts GROUP BY source")
            by_source = dict(await cursor.fetchall())

        return {
            "total_entries": total,
            "expired_entries": expired,
            "entries_by_source": by_source,
        }
235
+
236
+
237
# Global cache instance, created when this module is imported. No db_path is
# passed, so TranscriptCache falls back to config.CACHE_DB_PATH.
cache = TranscriptCache()
ytt/cleaner.py ADDED
@@ -0,0 +1,109 @@
1
+ """Clean transcripts for LLM ingestion without wasting context.
2
+
3
+ YouTube auto-captions (ASR) "roll": each cue repeats the tail of the previous
4
+ cue plus a few new words, so naively joining cues produces 2-3x duplicated
5
+ text. This module merges that overlap, decodes HTML entities, strips caption
6
+ markup, and re-flows the result into compact paragraphs — typically cutting
7
+ token count by half or more versus the raw segment dump.
8
+ """
9
+
10
+ import html
11
+ import re
12
+
13
# Angle-bracket caption markup: <c>, </c>, <00:00:01.234> and stray formatting tags.
_TAG_RE = re.compile(r"<[^>]+>")
# Any run of whitespace; collapsed to a single space during normalisation.
_WS_RE = re.compile(r"\s+")


def _normalize(text: str) -> str:
    """Return one cue as plain single-spaced text.

    HTML entities are decoded first, then angle-bracket markup is removed, and
    finally whitespace runs (including newlines) become single spaces with the
    ends trimmed.
    """
    decoded = html.unescape(text)
    single_line = decoded.replace("\n", " ")
    without_markup = _TAG_RE.sub("", single_line)
    return _WS_RE.sub(" ", without_markup).strip()
25
+
26
+
27
def merge_overlapping(texts: list[str]) -> str:
    """Join cue texts into one string, dropping rolling-caption repeats.

    Each cue is normalised, then compared word-by-word (case-insensitively)
    against the tail of the text accumulated so far. The longest tail that
    matches the cue's leading words is treated as overlap and only the words
    after it are appended, so ``"a b c"`` followed by ``"b c d"`` yields
    ``"a b c d"``. A cue that exactly repeats the current tail adds nothing.
    Cues that are empty after normalisation are skipped.
    """
    merged: list[str] = []
    merged_lc: list[str] = []  # lowercase mirror of ``merged`` used for matching

    for cue in texts:
        cleaned = _normalize(cue)
        if not cleaned:
            continue

        tokens = cleaned.split()
        tokens_lc = [token.lower() for token in tokens]

        # Try the longest possible overlap first and stop at the first match.
        longest = min(len(merged), len(tokens))
        shared = next(
            (k for k in range(longest, 0, -1) if merged_lc[-k:] == tokens_lc[:k]),
            0,
        )

        merged += tokens[shared:]
        merged_lc += tokens_lc[shared:]

    return " ".join(merged)
56
+
57
+
58
def to_paragraphs(text: str, words_per_paragraph: int = 80) -> str:
    """Break a dense string into paragraphs separated by blank lines.

    When the text splits into more than one sentence (on whitespace following
    ``.``, ``!`` or ``?``), whole sentences are accumulated until a paragraph
    holds at least ``words_per_paragraph`` words. Otherwise the text is cut
    into fixed-size groups of ``words_per_paragraph`` words. Blank input gives
    an empty string.
    """
    body = text.strip()
    if not body:
        return ""

    sentences = re.split(r"(?<=[.!?])\s+", body)

    # Unpunctuated text (a single "sentence"): fixed-size word chunks.
    if len(sentences) <= 1:
        tokens = body.split()
        return "\n\n".join(
            " ".join(tokens[start : start + words_per_paragraph])
            for start in range(0, len(tokens), words_per_paragraph)
        )

    # Punctuated text: close a paragraph as soon as it reaches the word budget.
    blocks: list[str] = []
    pending: list[str] = []
    pending_words = 0
    for sentence in sentences:
        pending.append(sentence)
        pending_words += len(sentence.split())
        if pending_words < words_per_paragraph:
            continue
        blocks.append(" ".join(pending))
        pending = []
        pending_words = 0

    if pending:
        blocks.append(" ".join(pending))
    return "\n\n".join(blocks)
91
+
92
+
93
def clean_segments(segments, paragraphs: bool = True) -> str:
    """Turn caption segments into clean, de-duplicated transcript text.

    Args:
        segments: Iterable of objects with a ``.text`` attribute (TimedText /
            WhisperSegment). An object without the attribute contributes an
            empty string.
        paragraphs: When True the merged text is re-flowed into paragraphs;
            when False it is returned as one dense line.

    Returns:
        Cleaned transcript text optimised for LLM ingestion.
    """
    cue_texts = [getattr(segment, "text", "") for segment in segments]
    dense = merge_overlapping(cue_texts)
    if not paragraphs:
        return dense
    return to_paragraphs(dense)
105
+
106
+
107
def estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` at about four characters per token.

    The result is never below 1, including for empty text.
    """
    approx = len(text) // 4
    return approx if approx > 1 else 1
ytt/cli.py ADDED
@@ -0,0 +1,304 @@
1
+ #!/usr/bin/env python3
2
+ """CLI for YouTube transcript fetching."""
3
+
4
+ import asyncio
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import click
9
+ from rich.console import Console
10
+ from rich.panel import Panel
11
+ from rich.syntax import Syntax
12
+
13
+ from .service import get_transcript, get_transcripts_batch
14
+ from .cache import cache
15
+ from .search_service import search, search_and_get_transcripts
16
+
17
# Shared Rich console used by every command in this module.
# NOTE(review): force_terminal=True presumably keeps colour/control codes on
# even when output is piped or redirected, and legacy_windows=False opts out of
# Rich's legacy Windows rendering; confirm against the Rich Console docs.
console = Console(legacy_windows=False, force_terminal=True)
18
+
19
+
20
def validate_video_id(ctx, param, value):
    """Click callback that accepts a YouTube URL or a bare 11-character video ID.

    Args:
        ctx: Click context (unused).
        param: Click parameter being validated (unused).
        value: Raw argument value, or None when the argument was not supplied.

    Returns:
        ``value`` unchanged when it is None, looks like a URL, or is a
        well-formed video ID.

    Raises:
        click.BadParameter: If ``value`` is neither URL-like nor an 11-character
            ID made of letters, digits, ``-`` and ``_``.
    """
    # Local import: re is not among this module's top-level imports.
    import re

    if value is None:
        return None

    # Anything containing "/" or "?" is treated as a URL and handed to the
    # service layer for parsing.
    if "/" in value or "?" in value:
        return value

    # A bare ID must be exactly 11 characters from the URL-safe base64
    # alphabet; a length check alone would also accept input such as
    # "hello world".
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", value):
        return value

    raise click.BadParameter("Must be a valid YouTube video ID or URL")
35
+
36
+
37
# Root command group; the sub-commands below attach to it via @cli.command().
# The docstring doubles as the --help text, so it is the whole body.
@click.group()
def cli():
    """YouTube Transcript Fetcher - Get transcripts from any YouTube video."""
41
+
42
+
43
@cli.command()
@click.argument("video_id", callback=validate_video_id)
@click.option(
    "--language",
    "-l",
    default="en",
    help="Language code (e.g., en, es, fr, de)",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["clean", "text", "json", "srt", "vtt"]),
    default="clean",
    help="Output format (clean = deduplicated, LLM-friendly)",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output file (default: stdout)",
)
@click.option(
    "--no-cache",
    is_flag=True,
    help="Skip cache lookup",
)
@click.option(
    "--no-whisper",
    is_flag=True,
    help="Disable Whisper fallback",
)
def transcript(video_id, language, format, output, no_cache, no_whisper):
    """Get transcript for a YouTube video."""
    # The docstring above is the command's --help text, so notes live here.
    #
    # Fetches one transcript, shows it in a panel (JSON is syntax-highlighted,
    # other formats are previewed up to 2000 characters) and, when --output is
    # given, writes the full content to that file. Exits with status 1 if the
    # fetch raises.
    #
    # Local import: only this command needs the markup escaper.
    from rich.markup import escape

    async def fetch():
        return await get_transcript(
            video_id,
            language=language,
            output_format=format,
            use_cache=not no_cache,
            use_whisper_fallback=not no_whisper,
        )

    console.print("Fetching transcript...")
    try:
        result = asyncio.run(fetch())
    except Exception as e:
        # Escape the message: brackets in exception text would otherwise be
        # parsed as Rich markup (and can raise MarkupError).
        console.print(f"[red]Error:[/red] {escape(str(e))}")
        sys.exit(1)

    # Display result
    if format == "json":
        syntax = Syntax(result.content, "json", theme="monokai", line_numbers=False)
        panel = Panel(syntax, title=f"[bold]{result.video_id}[/bold]")
    else:
        # Truncate the raw text first, then escape it, so caption cues such as
        # "[music]" are shown literally instead of being consumed as markup.
        preview = result.content[:2000] + ("..." if len(result.content) > 2000 else "")
        panel = Panel(
            escape(preview),
            title=f"[bold]{result.video_id}[/bold]",
            subtitle=f"Source: {result.source} | Language: {result.language}",
        )

    console.print(panel)

    if output:
        Path(output).write_text(result.content, encoding="utf-8")
        # The path is user input and may itself contain square brackets.
        console.print(f"[green]Saved to {escape(str(output))}[/green]")
109
+
110
+
111
@cli.command()
@click.argument("video_ids", nargs=-1, callback=lambda ctx, param, values: values)
@click.option(
    "--language",
    "-l",
    default="en",
    help="Language code",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["clean", "text", "json", "srt", "vtt"]),
    default="clean",
    help="Output format (clean = deduplicated, LLM-friendly)",
)
@click.option(
    "--workers",
    "-w",
    default=4,
    help="Max concurrent workers",
)
def batch(video_ids, language, format, workers):
    """Get transcripts for multiple videos."""
    # The docstring above is the command's --help text, so notes live here.
    #
    # Fetches every ID through get_transcripts_batch and prints one status line
    # per video. A result that is an Exception instance counts as a failure;
    # anything else counts as a success. Exits with status 1 if the batch call
    # itself raises.
    #
    # Local import: only this command needs the markup escaper.
    from rich.markup import escape

    if not video_ids:
        console.print("[yellow]No video IDs provided[/yellow]")
        return

    async def fetch_all():
        return await get_transcripts_batch(
            video_ids,
            language=language,
            output_format=format,
            max_workers=workers,
        )

    console.print(f"Fetching transcripts for {len(video_ids)} videos (workers={workers})...")

    # Report a failure of the whole batch the same way the single-video and
    # search commands do, instead of surfacing a traceback.
    try:
        results = asyncio.run(fetch_all())
    except Exception as e:
        console.print(f"[red]Error:[/red] {escape(str(e))}")
        sys.exit(1)

    success = sum(1 for r in results if not isinstance(r, Exception))
    failed = len(results) - success

    console.print(f"\n[green]Success: {success}[/green] | [red]Failed: {failed}[/red]")

    for vid, r in zip(video_ids, results):
        # IDs and exception text are escaped so stray brackets are printed
        # literally rather than interpreted as Rich markup.
        if isinstance(r, Exception):
            console.print(f"  [red][FAIL][/red] {escape(str(vid))}: {escape(str(r))}")
        else:
            console.print(f"  [green][OK][/green] {escape(str(vid))} ({r.source})")
161
+
162
+
163
# Prints a panel with the cache's total, expired and per-source entry counts.
# With --clean, expired entries are removed first.
@cli.command()
@click.option("--clean", is_flag=True, help="Remove expired entries")
def cache_stats(clean):
    """Show cache statistics."""

    # One coroutine so the optional cleanup and the stats query run inside a
    # single asyncio.run() call.
    async def do_cache():
        if clean:
            # Purge first so the numbers printed below describe the cleaned cache.
            deleted = await cache.cleanup_expired()
            console.print(f"[yellow]Removed {deleted} expired entries[/yellow]")

        stats = await cache.get_stats()

        # NOTE(review): the continuation lines of the f-string below are kept
        # flush-left as they appear in the reviewed listing; confirm the inner
        # whitespace against the packaged file.
        console.print(
            Panel(
                f"""[bold]Cache Statistics[/bold]

Total entries: {stats['total_entries']}
Expired: {stats['expired_entries']}
By source: {stats['entries_by_source']}
""",
                title="Cache",
            )
        )

    asyncio.run(do_cache())
188
+
189
+
190
@cli.command()
@click.argument("query")
@click.option(
    "--limit",
    "-n",
    default=5,
    type=click.IntRange(1, 20),
    help="Number of results to return",
)
@click.option(
    "--format",
    "-f",
    type=click.Choice(["text", "json", "table"]),
    default="table",
    help="Output format",
)
@click.option(
    "--with-transcripts",
    is_flag=True,
    help="Also fetch transcripts for each result",
)
@click.option(
    "--language",
    "-l",
    default="en",
    help="Language for transcripts",
)
@click.option(
    "--no-cache",
    is_flag=True,
    help="Skip cache lookup",
)
def search_cmd(query, limit, format, with_transcripts, language, no_cache):
    """Search YouTube for videos."""
    # The docstring above is the command's --help text, so notes live here.
    #
    # Output formats:
    #   json  - a JSON array of result objects, with a "transcript" key when
    #           one was fetched
    #   text  - one tab-separated line per video: id, title, channel,
    #           duration, views
    #   table - a Rich table, titles shortened to 50 characters
    # Exits with status 1 if the search raises.
    from rich.markup import escape
    from rich.table import Table

    def split_result(result):
        # Plain searches yield video objects; --with-transcripts yields
        # (video, transcript) pairs where transcript may be None.
        if hasattr(result, "video_id"):
            return result, None
        return result[0], result[1]

    def plain(value):
        # Field types are not visible here, so coerce to text and show a
        # missing value as an empty cell.
        return "" if value is None else str(value)

    async def do_search():
        if with_transcripts:
            return await search_and_get_transcripts(
                query, max_results=limit, language=language, use_cache=not no_cache
            )
        return await search(query, max_results=limit, use_cache=not no_cache)

    # The query is user input: escape it so brackets are not parsed as markup.
    console.print(f"Searching YouTube for: '{escape(query)}'...")

    try:
        results = asyncio.run(do_search())
    except Exception as e:
        console.print(f"[red]Error:[/red] {escape(str(e))}")
        sys.exit(1)

    if not results:
        console.print("[yellow]No results found[/yellow]")
        return

    pairs = [split_result(r) for r in results]

    if format == "json":
        output = []
        for video, fetched in pairs:
            item = {
                "video_id": video.video_id,
                "title": video.title,
                "channel": video.channel_name,
                "duration": video.duration,
                "views": video.view_count,
            }
            if fetched:
                item["transcript"] = fetched.content
            output.append(item)
        # print_json renders the data as JSON without markup parsing or hard
        # wrapping, either of which can corrupt the document.
        console.print_json(data=output, indent=2)
        return

    if format == "text":
        # Plain, script-friendly listing written without Rich styling.
        for video, _ in pairs:
            click.echo(
                "\t".join(
                    [
                        plain(video.video_id),
                        plain(video.title),
                        plain(video.channel_name),
                        plain(video.duration),
                        plain(video.view_count),
                    ]
                )
            )
    else:
        table = Table(title=f"Search Results: '{escape(query)}'")
        table.add_column("Video ID", style="cyan")
        table.add_column("Title", style="white")
        table.add_column("Channel", style="green")
        table.add_column("Duration", justify="right")
        table.add_column("Views", justify="right")

        for video, _ in pairs:
            title = video.title[:50] + "..." if len(video.title) > 50 else video.title
            # Cell strings are rendered as markup, so escape every value.
            table.add_row(
                escape(plain(video.video_id)),
                escape(title),
                escape(plain(video.channel_name)),
                escape(plain(video.duration)),
                escape(plain(video.view_count)),
            )

        console.print(table)

    # Summary is printed for the text and table formats only, so JSON output
    # stays a single parseable document.
    if with_transcripts:
        success_count = sum(1 for _, fetched in pairs if fetched is not None)
        console.print(f"\n[dim]Transcripts fetched for {success_count}/{len(results)} videos[/dim]")
301
+
302
+
303
# Invoke the click command group when this file is run as a script.
if __name__ == "__main__":
    cli()