PyPI - yttranscript-mcp - Versions diffs - 0.2.0__py3-none-any.whl - Mend

yttranscript-mcp 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

ytt/__init__.py +26 -0
ytt/cache.py +238 -0
ytt/cleaner.py +109 -0
ytt/cli.py +304 -0
ytt/config.py +188 -0
ytt/cuda_dll_manager.py +200 -0
ytt/exceptions.py +45 -0
ytt/fetcher.py +300 -0
ytt/formatters.py +244 -0
ytt/http.py +129 -0
ytt/mcp/__init__.py +1 -0
ytt/mcp/server.py +201 -0
ytt/parser.py +241 -0
ytt/rate_limiter.py +74 -0
ytt/search_cache.py +133 -0
ytt/search_service.py +109 -0
ytt/searcher.py +154 -0
ytt/service.py +239 -0
ytt/whisper_runner.py +300 -0
yttranscript_mcp-0.2.0.dist-info/METADATA +164 -0
yttranscript_mcp-0.2.0.dist-info/RECORD +24 -0
yttranscript_mcp-0.2.0.dist-info/WHEEL +4 -0
yttranscript_mcp-0.2.0.dist-info/entry_points.txt +3 -0
yttranscript_mcp-0.2.0.dist-info/licenses/LICENSE +202 -0

ytt/__init__.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""yttranscript-mcp — fetch, clean, and search YouTube transcripts.
+Captions-first (fast, low-bandwidth, rate-limit resistant) with an optional
+local Whisper fallback, plus LLM-friendly cleaning that strips rolling-caption
+duplication and timestamps.
+"""
+from .cleaner import clean_segments, estimate_tokens, merge_overlapping
+from .search_service import search, search_and_get_transcripts
+from .searcher import VideoSearchResult
+from .service import ServiceResult, get_transcript, get_transcripts_batch
+# Version string exposed to importers of this package.
+__version__ = "0.2.0"
+# Public API of the package: every name listed here is either imported above
+# (service, search, searcher and cleaner helpers) or defined in this module.
+__all__ = [
+    "get_transcript",
+    "get_transcripts_batch",
+    "ServiceResult",
+    "VideoSearchResult",
+    "search",
+    "search_and_get_transcripts",
+    "clean_segments",
+    "merge_overlapping",
+    "estimate_tokens",
+    "__version__",
+]

ytt/cache.py ADDED Viewed

@@ -0,0 +1,238 @@
+"""SQLite-backed transcript cache for avoiding redundant fetches."""
+import asyncio
+import json
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+import aiosqlite
+from .config import config
+@dataclass
+class CachedTranscript:
+    """One transcript record held by the cache, together with its validity window."""
+    video_id: str  # YouTube video ID the transcript belongs to
+    language: str  # language code of the transcript
+    source: str  # origin of the transcript: 'innertube' or 'whisper'
+    raw_data: dict  # transcript payload (decoded JSON representation)
+    created_at: datetime  # when the entry was written
+    expires_at: datetime  # moment from which the entry counts as stale
+    def is_expired(self) -> bool:
+        """Return True once the current local time has reached ``expires_at``."""
+        current_time = datetime.now()
+        return self.expires_at <= current_time
+def _row_to_cached_transcript(row: tuple) -> CachedTranscript:
+    """Build a CachedTranscript from one six-column database row.
+    The row must hold, in order: video_id, lang, source, raw_data, created_at
+    and expires_at. Both timestamps are parsed with ``datetime.fromisoformat``;
+    raw_data is decoded from JSON when it arrives as a string and is passed
+    through unchanged otherwise.
+    """
+    video_id, language, source, payload, created_iso, expires_iso = row
+    # Timestamps first, then the payload, so parse errors surface in that order.
+    created_at, expires_at = (
+        datetime.fromisoformat(stamp) for stamp in (created_iso, expires_iso)
+    )
+    decoded = json.loads(payload) if isinstance(payload, str) else payload
+    return CachedTranscript(video_id, language, source, decoded, created_at, expires_at)
+class TranscriptCache:
+    """SQLite-backed cache for transcript data.
+    Stores fetched transcripts to avoid redundant API calls. Entries are keyed
+    by (video_id, lang) and expire after ``config.CACHE_TTL_DAYS`` days unless
+    a per-entry TTL is passed to :meth:`set`. Every operation opens its own
+    short-lived aiosqlite connection to ``db_path``.
+    """
+    def __init__(self, db_path: str | None = None):
+        """Remember the database location; the schema is created on first use.
+        Args:
+            db_path: SQLite database path. Falls back to
+                ``config.CACHE_DB_PATH`` when omitted or empty.
+        """
+        self.db_path = db_path or config.CACHE_DB_PATH
+        self._initialized = False
+        # Guards first-time schema creation against concurrent callers.
+        self._lock = asyncio.Lock()
+    async def initialize(self) -> None:
+        """Create the ``transcripts`` table and its expiry index if missing.
+        Safe to call repeatedly: after the first successful run it returns
+        immediately.
+        """
+        if self._initialized:
+            return
+        async with self._lock:
+            # Re-check under the lock: another coroutine may have finished
+            # the setup while this one was waiting.
+            if self._initialized:
+                return
+            async with aiosqlite.connect(self.db_path) as db:
+                await db.execute("""
+                    CREATE TABLE IF NOT EXISTS transcripts (
+                        video_id    TEXT NOT NULL,
+                        lang        TEXT NOT NULL,
+                        source      TEXT NOT NULL,
+                        raw_data    TEXT NOT NULL,
+                        created_at  TEXT NOT NULL,
+                        expires_at  TEXT NOT NULL,
+                        PRIMARY KEY (video_id, lang)
+                    )
+                """)
+                await db.execute("""
+                    CREATE INDEX IF NOT EXISTS idx_expires_at
+                    ON transcripts(expires_at)
+                """)
+                await db.commit()
+            self._initialized = True
+    async def get(self, video_id: str, language: str = "en") -> CachedTranscript | None:
+        """Get a cached transcript if it exists and is not expired.
+        An expired row is deleted as a side effect before None is returned.
+        Args:
+            video_id: The YouTube video ID.
+            language: The language code (default: 'en').
+        Returns:
+            CachedTranscript if found and not expired, None otherwise.
+        """
+        await self.initialize()
+        async with aiosqlite.connect(self.db_path) as db:
+            db.row_factory = aiosqlite.Row
+            cursor = await db.execute(
+                """
+                SELECT video_id, lang, source, raw_data, created_at, expires_at
+                FROM transcripts
+                WHERE video_id = ? AND lang = ?
+                """,
+                (video_id, language),
+            )
+            row = await cursor.fetchone()
+        if row is None:
+            return None
+        # The column order of the SELECT matches what the row converter unpacks.
+        transcript = _row_to_cached_transcript(tuple(row))
+        if transcript.is_expired():
+            await self.delete(video_id, language)
+            return None
+        return transcript
+    async def set(
+        self,
+        video_id: str,
+        language: str,
+        raw_data: dict,
+        source: str,
+        ttl_days: int | None = None,
+    ) -> None:
+        """Store a transcript in the cache, replacing any existing entry.
+        Args:
+            video_id: The YouTube video ID.
+            language: The language code.
+            raw_data: The transcript data as a dict (stored as JSON text).
+            source: 'innertube' or 'whisper'.
+            ttl_days: Override the default TTL. ``None`` means use
+                ``config.CACHE_TTL_DAYS``; an explicit ``0`` is honoured and
+                yields an entry that is already expired.
+        """
+        await self.initialize()
+        # ``is None`` rather than ``or``: an explicit ttl_days=0 must not
+        # silently fall back to the configured default.
+        ttl = config.CACHE_TTL_DAYS if ttl_days is None else ttl_days
+        now = datetime.now()
+        expires_at = now + timedelta(days=ttl)
+        raw_json = json.dumps(raw_data, ensure_ascii=False)
+        async with aiosqlite.connect(self.db_path) as db:
+            await db.execute(
+                """
+                INSERT OR REPLACE INTO transcripts
+                (video_id, lang, source, raw_data, created_at, expires_at)
+                VALUES (?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    video_id,
+                    language,
+                    source,
+                    raw_json,
+                    now.isoformat(),
+                    expires_at.isoformat(),
+                ),
+            )
+            await db.commit()
+    async def delete(self, video_id: str, language: str = "en") -> None:
+        """Delete a cached transcript; a missing entry is not an error.
+        Args:
+            video_id: The YouTube video ID.
+            language: The language code.
+        """
+        await self.initialize()
+        async with aiosqlite.connect(self.db_path) as db:
+            await db.execute(
+                "DELETE FROM transcripts WHERE video_id = ? AND lang = ?",
+                (video_id, language),
+            )
+            await db.commit()
+    async def cleanup_expired(self) -> int:
+        """Remove all expired cache entries.
+        Returns:
+            Number of entries removed.
+        """
+        await self.initialize()
+        now = datetime.now().isoformat()
+        async with aiosqlite.connect(self.db_path) as db:
+            # ``<=`` mirrors CachedTranscript.is_expired (now >= expires_at),
+            # so cleanup and lookups agree on the exact expiry instant.
+            cursor = await db.execute(
+                "DELETE FROM transcripts WHERE expires_at <= ?",
+                (now,),
+            )
+            await db.commit()
+            return cursor.rowcount
+    async def get_stats(self) -> dict:
+        """Get cache statistics.
+        Returns:
+            Dict with total_entries, expired_entries, and entries_by_source.
+        """
+        await self.initialize()
+        async with aiosqlite.connect(self.db_path) as db:
+            cursor = await db.execute("SELECT COUNT(*) FROM transcripts")
+            total = (await cursor.fetchone())[0]
+            # Same boundary as cleanup_expired, so "expired" here is exactly
+            # what a cleanup would remove.
+            now = datetime.now().isoformat()
+            cursor = await db.execute(
+                "SELECT COUNT(*) FROM transcripts WHERE expires_at <= ?",
+                (now,),
+            )
+            expired = (await cursor.fetchone())[0]
+            # Each row is a (source, count) pair, which dict() consumes directly.
+            cursor = await db.execute("SELECT source, COUNT(*) FROM transcripts GROUP BY source")
+            by_source = dict(await cursor.fetchall())
+        return {
+            "total_entries": total,
+            "expired_entries": expired,
+            "entries_by_source": by_source,
+        }
+# Global cache instance (imported by ytt/cli.py). Constructed at import time
+# with no explicit path, so it uses ``config.CACHE_DB_PATH``; no database I/O
+# happens until one of its methods is awaited.
+cache = TranscriptCache()

ytt/cleaner.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""Clean transcripts for LLM ingestion without wasting context.
+YouTube auto-captions (ASR) "roll": each cue repeats the tail of the previous
+cue plus a few new words, so naively joining cues produces 2-3x duplicated
+text. This module merges that overlap, decodes HTML entities, strips caption
+markup, and re-flows the result into compact paragraphs — typically cutting
+token count by half or more versus the raw segment dump.
+"""
+import html
+import re
+# Matches caption markup such as <c>, </c>, <00:00:01.234> and stray formatting tags.
+_TAG_RE = re.compile(r"<[^>]+>")
+# Matches any run of whitespace.
+_WS_RE = re.compile(r"\s+")
+def _normalize(text: str) -> str:
+    """Return one cue as plain text: entities decoded, tags removed, spacing tidied."""
+    decoded = html.unescape(text).replace("\n", " ")
+    without_tags = _TAG_RE.sub("", decoded)
+    return _WS_RE.sub(" ", without_tags).strip()
+def merge_overlapping(texts: list[str]) -> str:
+    """Join cue texts into one string, dropping rolling-caption repetition.
+    Each cue is normalised and split into words. The longest run of words at
+    the end of the text built so far that equals the start of the cue
+    (compared case-insensitively) is treated as overlap, and only the words
+    after it are appended: ``"a b c" + "b c d" -> "a b c d"``. A cue that
+    merely repeats the current tail adds nothing; cues that normalise to an
+    empty string are skipped.
+    """
+    merged: list[str] = []
+    merged_lower: list[str] = []  # lowercase mirror of ``merged`` used for matching
+    for cue in texts:
+        cue_words = _normalize(cue).split()
+        if not cue_words:
+            continue
+        cue_lower = [word.lower() for word in cue_words]
+        longest_possible = min(len(merged), len(cue_words))
+        # Candidates run from largest to smallest, so the first hit is the
+        # longest overlap; 0 when nothing matches.
+        shared = next(
+            (
+                k
+                for k in range(longest_possible, 0, -1)
+                if merged_lower[-k:] == cue_lower[:k]
+            ),
+            0,
+        )
+        merged += cue_words[shared:]
+        merged_lower += cue_lower[shared:]
+    return " ".join(merged)
+def to_paragraphs(text: str, words_per_paragraph: int = 80) -> str:
+    """Break a dense transcript string into blank-line separated paragraphs.
+    When the text contains at least one sentence boundary (``.``, ``!`` or
+    ``?`` followed by whitespace), whole sentences are accumulated until a
+    paragraph holds ``words_per_paragraph`` words or more. Otherwise, as is
+    typical for unpunctuated auto-captions, the words are cut into fixed-size
+    groups. Only newlines are added, so the token cost is negligible.
+    Args:
+        text: The text to re-flow; surrounding whitespace is ignored.
+        words_per_paragraph: Target paragraph size in words.
+    Returns:
+        Paragraphs joined by a blank line, or "" for blank input.
+    """
+    body = text.strip()
+    if not body:
+        return ""
+    sentences = re.split(r"(?<=[.!?])\s+", body)
+    if len(sentences) <= 1:
+        # No sentence boundary found: chunk purely by word count.
+        tokens = body.split()
+        chunks = [
+            " ".join(tokens[start : start + words_per_paragraph])
+            for start in range(0, len(tokens), words_per_paragraph)
+        ]
+        return "\n\n".join(chunks)
+    finished: list[str] = []
+    pending: list[str] = []
+    pending_words = 0
+    for sentence in sentences:
+        pending.append(sentence)
+        pending_words += len(sentence.split())
+        if pending_words < words_per_paragraph:
+            continue
+        # Paragraph is full: flush it and start a new one.
+        finished.append(" ".join(pending))
+        pending = []
+        pending_words = 0
+    if pending:
+        finished.append(" ".join(pending))
+    return "\n\n".join(finished)
+def clean_segments(segments, paragraphs: bool = True) -> str:
+    """Turn caption segments into clean, deduplicated transcript text.
+    Args:
+        segments: Iterable of objects whose ``.text`` attribute holds the cue
+            text (TimedText / WhisperSegment); an object without the
+            attribute contributes an empty cue.
+        paragraphs: When True the merged text is re-flowed into paragraphs;
+            when False it is returned as one dense line.
+    Returns:
+        Cleaned transcript text optimised for LLM ingestion.
+    """
+    cue_texts = [getattr(segment, "text", "") for segment in segments]
+    dense = merge_overlapping(cue_texts)
+    if paragraphs:
+        return to_paragraphs(dense)
+    return dense
+def estimate_tokens(text: str) -> int:
+    """Approximate the token count of ``text`` at about 4 characters per token.
+    The result is never below 1, even for an empty string.
+    """
+    approx = len(text) // 4
+    return approx if approx > 1 else 1

ytt/cli.py ADDED Viewed

@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+"""CLI for YouTube transcript fetching."""
+import asyncio
+import sys
+from pathlib import Path
+import click
+from rich.console import Console
+from rich.panel import Panel
+from rich.syntax import Syntax
+from .service import get_transcript, get_transcripts_batch
+from .cache import cache
+from .search_service import search, search_and_get_transcripts
+# Shared Rich console used by every command in this module. It is created with
+# terminal output forced on and the legacy Windows renderer switched off.
+console = Console(legacy_windows=False, force_terminal=True)
+def validate_video_id(ctx, param, value):
+    """Click callback that accepts a URL-looking value or an 11-character video ID.
+    Args:
+        ctx: Click context (unused).
+        param: Click parameter being validated (unused).
+        value: The raw argument, or None when it was not supplied.
+    Returns:
+        ``value`` unchanged when accepted, or None when ``value`` is None.
+    Raises:
+        click.BadParameter: If the value is neither URL-like nor 11 characters long.
+    """
+    if value is None:
+        return None
+    # Anything containing "/" or "?" is treated as a URL and left for the
+    # service layer to resolve.
+    looks_like_url = any(marker in value for marker in ("/", "?"))
+    if looks_like_url or len(value) == 11:
+        return value
+    raise click.BadParameter("Must be a valid YouTube video ID or URL")
+# Root command group; the sub-commands below attach themselves through
+# ``@cli.command()``. The docstring is kept verbatim because Click shows it as
+# the group's help text.
+@click.group()
+def cli():
+    """YouTube Transcript Fetcher - Get transcripts from any YouTube video."""
+@cli.command()
+@click.argument("video_id", callback=validate_video_id)
+@click.option(
+    "--language",
+    "-l",
+    default="en",
+    help="Language code (e.g., en, es, fr, de)",
+)
+@click.option(
+    "--format",
+    "-f",
+    type=click.Choice(["clean", "text", "json", "srt", "vtt"]),
+    default="clean",
+    help="Output format (clean = deduplicated, LLM-friendly)",
+)
+@click.option(
+    "--output",
+    "-o",
+    type=click.Path(),
+    help="Output file (default: stdout)",
+)
+@click.option(
+    "--no-cache",
+    is_flag=True,
+    help="Skip cache lookup",
+)
+@click.option(
+    "--no-whisper",
+    is_flag=True,
+    help="Disable Whisper fallback",
+)
+def transcript(video_id, language, format, output, no_cache, no_whisper):
+    """Get transcript for a YouTube video."""
+    # The docstring above doubles as the command's --help text, so the details
+    # live in comments:
+    #   * fetches the transcript through get_transcript();
+    #   * prints it in a panel (JSON is syntax highlighted, any other format
+    #     is shown as a preview capped at 2000 characters);
+    #   * writes the full, untruncated content to --output when given;
+    #   * exits with status 1 if fetching raises.
+    # NOTE(review): without --output only the capped preview reaches stdout,
+    # although the option help says "default: stdout" - confirm intent.
+    from rich.markup import escape  # local import: rich is already a dependency of this module
+    async def fetch():
+        return await get_transcript(
+            video_id,
+            language=language,
+            output_format=format,
+            use_cache=not no_cache,
+            use_whisper_fallback=not no_whisper,
+        )
+    console.print("Fetching transcript...")
+    try:
+        result = asyncio.run(fetch())
+    except Exception as e:
+        # Exception text is arbitrary; escape it so "[...]" inside the message
+        # is printed literally instead of being parsed as Rich markup.
+        console.print(f"[red]Error:[/red] {escape(str(e))}")
+        sys.exit(1)
+    # Display result
+    if format == "json":
+        syntax = Syntax(result.content, "json", theme="monokai", line_numbers=False)
+        panel = Panel(syntax, title=f"[bold]{result.video_id}[/bold]")
+    else:
+        # Truncate first, then escape, so an escape sequence is never cut in half.
+        preview = result.content[:2000] + ("..." if len(result.content) > 2000 else "")
+        panel = Panel(
+            # Caption text routinely contains bracketed cues such as "[music]";
+            # unescaped, Rich would treat them as style tags and drop them.
+            escape(preview),
+            title=f"[bold]{result.video_id}[/bold]",
+            subtitle=f"Source: {result.source} | Language: {result.language}",
+        )
+    console.print(panel)
+    if output:
+        Path(output).write_text(result.content, encoding="utf-8")
+        console.print(f"[green]Saved to {output}[/green]")
+@cli.command()
+@click.argument("video_ids", nargs=-1, callback=lambda ctx, param, values: values)
+@click.option(
+    "--language",
+    "-l",
+    default="en",
+    help="Language code",
+)
+@click.option(
+    "--format",
+    "-f",
+    type=click.Choice(["clean", "text", "json", "srt", "vtt"]),
+    default="clean",
+    help="Output format (clean = deduplicated, LLM-friendly)",
+)
+@click.option(
+    "--workers",
+    "-w",
+    default=4,
+    help="Max concurrent workers",
+)
+def batch(video_ids, language, format, workers):
+    """Get transcripts for multiple videos."""
+    # The docstring above doubles as the command's --help text, so the details
+    # live in comments:
+    #   * passes all IDs to get_transcripts_batch() with the chosen worker count;
+    #   * a result that is an Exception instance counts as a failure, anything
+    #     else as a success;
+    #   * prints a summary line followed by one status line per video.
+    from rich.markup import escape  # local import: rich is already a dependency of this module
+    if not video_ids:
+        console.print("[yellow]No video IDs provided[/yellow]")
+        return
+    async def fetch_all():
+        return await get_transcripts_batch(
+            video_ids,
+            language=language,
+            output_format=format,
+            max_workers=workers,
+        )
+    console.print(f"Fetching transcripts for {len(video_ids)} videos (workers={workers})...")
+    results = asyncio.run(fetch_all())
+    success = sum(1 for r in results if not isinstance(r, Exception))
+    failed = len(results) - success
+    console.print(f"\n[green]Success: {success}[/green] | [red]Failed: {failed}[/red]")
+    for vid, r in zip(video_ids, results):
+        # IDs come straight from the command line (they are not validated
+        # here) and error text is arbitrary, so both are escaped before being
+        # embedded in a markup string.
+        shown_id = escape(str(vid))
+        if isinstance(r, Exception):
+            console.print(f"  [red][FAIL][/red] {shown_id}: {escape(str(r))}")
+        else:
+            console.print(f"  [green][OK][/green] {shown_id} ({r.source})")
+@cli.command()
+@click.option("--clean", is_flag=True, help="Remove expired entries")
+def cache_stats(clean):
+    """Show cache statistics."""
+    # The docstring above doubles as the command's --help text. With --clean,
+    # expired entries are purged first, so the statistics shown afterwards
+    # describe the cache after the cleanup.
+    async def do_cache():
+        if clean:
+            removed = await cache.cleanup_expired()
+            console.print(f"[yellow]Removed {removed} expired entries[/yellow]")
+        stats = await cache.get_stats()
+        report = f"""[bold]Cache Statistics[/bold]
+Total entries: {stats['total_entries']}
+Expired: {stats['expired_entries']}
+By source: {stats['entries_by_source']}
+"""
+        console.print(Panel(report, title="Cache"))
+    asyncio.run(do_cache())
+@cli.command()
+@click.argument("query")
+@click.option(
+    "--limit",
+    "-n",
+    default=5,
+    type=click.IntRange(1, 20),
+    help="Number of results to return",
+)
+@click.option(
+    "--format",
+    "-f",
+    type=click.Choice(["text", "json", "table"]),
+    default="table",
+    help="Output format",
+)
+@click.option(
+    "--with-transcripts",
+    is_flag=True,
+    help="Also fetch transcripts for each result",
+)
+@click.option(
+    "--language",
+    "-l",
+    default="en",
+    help="Language for transcripts",
+)
+@click.option(
+    "--no-cache",
+    is_flag=True,
+    help="Skip cache lookup",
+)
+def search_cmd(query, limit, format, with_transcripts, language, no_cache):
+    """Search YouTube for videos."""
+    # The docstring above doubles as the command's --help text, so the details
+    # live in comments:
+    #   * without --with-transcripts each result is a video object; with it,
+    #     each result is a (video, transcript-or-None) pair;
+    #   * "json" prints a JSON array, every other format renders a table;
+    #   * exits with status 1 if the search raises.
+    # NOTE(review): "text" currently renders exactly like "table" - confirm intent.
+    from rich.markup import escape  # local import: rich is already a dependency of this module
+    from rich.table import Table
+    def plain(value):
+        # Escape Rich markup in string cells; non-string values pass through untouched.
+        return escape(value) if isinstance(value, str) else value
+    async def do_search():
+        if with_transcripts:
+            return await search_and_get_transcripts(
+                query, max_results=limit, language=language, use_cache=not no_cache
+            )
+        return await search(query, max_results=limit, use_cache=not no_cache)
+    # The query is user input and may contain "[...]"; escape it for markup.
+    console.print(f"Searching YouTube for: '{escape(query)}'...")
+    try:
+        results = asyncio.run(do_search())
+    except Exception as e:
+        console.print(f"[red]Error:[/red] {escape(str(e))}")
+        sys.exit(1)
+    if not results:
+        console.print("[yellow]No results found[/yellow]")
+        return
+    if format == "json":
+        import json
+        output = []
+        for r in results:
+            # Normalise both result shapes to (video, transcript-or-None).
+            video, fetched = r if with_transcripts else (r, None)
+            item = {
+                "video_id": video.video_id,
+                "title": video.title,
+                "channel": video.channel_name,
+                "duration": video.duration,
+                "views": video.view_count,
+            }
+            if fetched:
+                item["transcript"] = fetched.content
+            output.append(item)
+        # markup=False: titles and transcripts may contain "[...]" which Rich
+        # would otherwise swallow as style tags, corrupting the data.
+        # soft_wrap=True: keeps Rich from inserting line breaks inside long
+        # JSON string values.
+        console.print(json.dumps(output, indent=2), markup=False, soft_wrap=True)
+    else:
+        table = Table(title=f"Search Results: '{escape(query)}'")
+        table.add_column("Video ID", style="cyan")
+        table.add_column("Title", style="white")
+        table.add_column("Channel", style="green")
+        table.add_column("Duration", justify="right")
+        table.add_column("Views", justify="right")
+        for r in results:
+            video = r if hasattr(r, "video_id") else r[0]
+            # Titles longer than 50 characters are cut and marked with "...".
+            title = (video.title[:50] + "...") if len(video.title) > 50 else video.title
+            table.add_row(
+                video.video_id,
+                plain(title),
+                plain(video.channel_name),
+                video.duration,
+                video.view_count,
+            )
+        console.print(table)
+    if with_transcripts:
+        # Count the pairs whose transcript slot is filled; the hasattr check
+        # keeps this at 0 if the results turn out to be bare video objects.
+        success_count = (
+            sum(1 for r in results if r[1] is not None)
+            if results and not hasattr(results[0], "video_id")
+            else 0
+        )
+        console.print(f"\n[dim]Transcripts fetched for {success_count}/{len(results)} videos[/dim]")
+# Run the Click command group when this module is executed as a script.
+if __name__ == "__main__":
+    cli()