PyPI - broadcastx - Versions diffs - 0.1.0__tar.gz - Mend

broadcastx 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

broadcastx-0.1.0/PKG-INFO +10 -0
broadcastx-0.1.0/README.md +196 -0
broadcastx-0.1.0/broadcastx/__init__.py +3 -0
broadcastx-0.1.0/broadcastx/cli.py +244 -0
broadcastx-0.1.0/broadcastx/config.py +67 -0
broadcastx-0.1.0/broadcastx/downloader.py +337 -0
broadcastx-0.1.0/broadcastx/monitor.py +424 -0
broadcastx-0.1.0/broadcastx/pause_detector.py +382 -0
broadcastx-0.1.0/broadcastx/rotation.py +564 -0
broadcastx-0.1.0/broadcastx/scanner.py +290 -0
broadcastx-0.1.0/broadcastx/scrape_broadcasts.py +909 -0
broadcastx-0.1.0/broadcastx.egg-info/PKG-INFO +10 -0
broadcastx-0.1.0/broadcastx.egg-info/SOURCES.txt +21 -0
broadcastx-0.1.0/broadcastx.egg-info/dependency_links.txt +1 -0
broadcastx-0.1.0/broadcastx.egg-info/entry_points.txt +2 -0
broadcastx-0.1.0/broadcastx.egg-info/requires.txt +5 -0
broadcastx-0.1.0/broadcastx.egg-info/top_level.txt +1 -0
broadcastx-0.1.0/pyproject.toml +22 -0
broadcastx-0.1.0/setup.cfg +4 -0
broadcastx-0.1.0/tests/test_broadcast_url_validation.py +29 -0
broadcastx-0.1.0/tests/test_downloader_rotation.py +129 -0
broadcastx-0.1.0/tests/test_monitor_auth.py +50 -0
broadcastx-0.1.0/tests/test_rotation_sidecar.py +168 -0

broadcastx-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,10 @@
+Metadata-Version: 2.4
+Name: broadcastx
+Version: 0.1.0
+Summary: Discover and download X/Twitter broadcast videos from user timelines
+Requires-Python: >=3.11
+Requires-Dist: playwright>=1.40
+Requires-Dist: rich>=13.0
+Requires-Dist: click>=8.0
+Requires-Dist: httpx>=0.27
+Requires-Dist: twscrape>=0.18

broadcastx-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,196 @@
+# BroadcastX
+<a href="README_ZH.md">🇨🇳 中文版</a> | <a href="README.md">🇬🇧 English</a>
+[![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/) [![PyPI](https://img.shields.io/pypi/v/broadcastx.svg)](https://pypi.org/project/broadcastx/)
+Discover, monitor, and download X/Twitter broadcast videos from user timelines.
+**BroadcastX** is a CLI tool that helps you:
+- **Scan** — Find broadcast links in a user's timeline
+- **Download** — Download broadcast videos with automatic phone-rotation correction
+- **Monitor** — Watch a profile for live broadcasts and auto-download replays
+## Features
+### Scan
+Uses Playwright browser automation to scroll through a user's X profile and intercept GraphQL API responses to extract broadcast URLs. More reliable than DOM scraping.
+### Download with Auto-Rotation
+Downloads broadcast videos via `yt-dlp` and post-processes the video to correct phone orientation. Broadcasts streamed from a phone in portrait mode appear upright after processing. A `.rotation.jsonl` sidecar file is written alongside the video for inspection.
+### Monitor
+Continuously monitors a user's profile. When a live broadcast is detected, periodically checks its status. When the broadcast ends, automatically downloads the replay.
+## Prerequisites
+- **Python 3.11+**
+- **[yt-dlp](https://github.com/yt-dlp/yt-dlp)** — `brew install yt-dlp`
+- **[ffmpeg](https://ffmpeg.org/)** — `brew install ffmpeg`
+- **Google Chrome** (installed separately)
+## Installation
+```bash
+# Create virtual environment
+python3 -m venv .venv
+source .venv/bin/activate
+# Install BroadcastX and dependencies
+pip install -e .
+# Install Playwright's browser driver
+playwright install chromium
+```
+## Quick Start
+```bash
+# Scan a user's timeline for broadcast links
+broadcastx scan @username
+# Download broadcasts from scan results
+broadcastx download --from output/broadcasts.json
+# Monitor a user for live broadcasts
+broadcastx monitor @username
+```
+## Usage
+### Scan a timeline for broadcasts
+```bash
+broadcastx scan @username
+# Options:
+#   --max-scrolls 100      Maximum scroll actions
+#   --scroll-delay 2.0     Seconds between scrolls
+#   --idle-timeout 10.0    Stop after N seconds with no new data
+#   --output FILE          Output path (default: output/broadcasts.json)
+#   --headless             Run browser without visible window
+```
+The scanner opens the user's X profile in Chrome, scrolls through the timeline, and intercepts API responses. Broadcast URLs are extracted from tweet cards. If you are not logged in, the browser shows the login page — log in manually, then press Enter in the terminal to continue. Your session is saved to `~/.broadcastx/chrome-profile/` for future runs.
+### Download broadcasts
+```bash
+# Single broadcast
+broadcastx download https://x.com/i/broadcasts/1vAxRkBbDRzKl
+# From scan results
+broadcastx download --from output/broadcasts.json
+# Multiple concurrent downloads
+broadcastx download --from output/broadcasts.json -p 3
+# Custom output directory
+broadcastx download --from output/broadcasts.json -o ./videos
+# Use Firefox cookies
+broadcastx download --from output/broadcasts.json --browser firefox
+# Verbose yt-dlp output
+broadcastx download --from output/broadcasts.json -v
+```
+BroadcastX **automatically corrects phone rotation**: if the broadcast carries phone-orientation metadata in the HLS stream, the video is re-encoded so it displays upright in any player.
+### Monitor a profile for live broadcasts
+```bash
+broadcastx monitor @username
+# One-shot test cycle (no loop)
+broadcastx monitor @username --once
+# Download to a custom directory
+broadcastx monitor @username --output-dir ./my_videos
+# Custom check intervals (seconds)
+broadcastx monitor @username --check-interval 1800 --live-interval 300
+# Detect only, skip download
+broadcastx monitor @username --no-download
+```
+The monitor runs in a loop:
+1. **Profile check** (every `check-interval`, default 30 min) — Opens the profile and looks for broadcast cards.
+2. **Live detection** — When a candidate is found, checks whether it is currently live.
+3. **Live check** (every `live-interval`, default 5 min) — Re-checks status until the broadcast ends.
+4. **Download** — Downloads the replay automatically.
+Events are logged to `output/monitor_events.json`.
+### Scrape all past broadcasts
+```bash
+broadcastx scrape @username
+# Ignore saved state and start from the beginning
+broadcastx scrape @username --fresh
+# Add delay and verbose output
+broadcastx scrape @username --delay 2.0 -v
+# Supply credentials directly (skips browser login)
+broadcastx scrape @username \
+  --auth-token "your_auth_token" \
+  --csrf-token "your_ct0" \
+  --user-id "1234567890"
+```
+Uses GraphQL API pagination with cursor-based resumption for full history traversal. State is saved locally, so you can pause and resume after rate limits.
+## Output Structure
+```
+output/
+├── broadcasts.json          # Scan results
+├── monitor_events.json      # Monitor event log
+└── videos/
+    ├── [title] [id].mp4     # Downloaded broadcast
+    ├── [id].rotation.jsonl  # Rotation timeline sidecar
+    └── ...
+```
+## Pipeline Examples
+```bash
+# Scan + download all found broadcasts
+broadcastx scan @username
+broadcastx download --from output/broadcasts.json
+# Monitor with auto-download
+broadcastx monitor @username --output-dir ./videos
+# Bulk scrape + download
+broadcastx scrape @username
+broadcastx download --from output/username_broadcasts.json
+```
+## How It Works
+### Scanner
+Uses Playwright to intercept Twitter's GraphQL API responses (`UserTweets` / `TweetDetail`). This is more stable than DOM scraping because JSON response structures change less frequently than HTML.
+### Downloader
+Wraps `yt-dlp` (which has a built-in `TwitterBroadcastIE` extractor) and adds:
+- **Rotation sidecar extraction** — Parses timed-ID3 metadata from HLS segments
+- **Auto-rotation** — Re-encodes the video with correct orientation via ffmpeg
+### Rotation Sidecar
+The JSONL sidecar (`[id].rotation.jsonl`) contains one record per HLS segment:
+- `raw_rotation` — Original sensor angle from Periscope
+- `rotation` — Quantized to 0°, 90°, 180°, or 270° with hysteresis
+- `ntp` — NTP timestamp for timeline reconstruction
+## License
+MIT

broadcastx-0.1.0/broadcastx/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""BroadcastX - Discover and download X/Twitter broadcast videos."""
+# Package version string; the CLI imports it for its --version option.
+__version__ = "0.1.0"

broadcastx-0.1.0/broadcastx/cli.py ADDED Viewed

@@ -0,0 +1,244 @@
+"""
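+BroadcastX CLI — Discover and download X/Twitter broadcast videos.
+Usage:
+    broadcastx scan @username
+    broadcastx scrape @username
+    broadcastx monitor @username
+    broadcastx download https://x.com/i/broadcasts/...
+    broadcastx download --from broadcasts.json
+    broadcastx trim-pauses https://x.com/i/broadcasts/...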
+"""
+import asyncio
+from pathlib import Path
+import click
+from rich.console import Console
+from . import __version__
+from .config import DEFAULT_BROADCASTS_FILE, DEFAULT_BROWSER, DEFAULT_VIDEOS_DIR
+from .downloader import check_ffmpeg, check_yt_dlp, download_all, download_broadcast
+from .monitor import monitor_user
+from .pause_detector import detect_pauses, pause_report, trim_intervals
+from .scanner import scan_user
+from .scrape_broadcasts import scrape_broadcasts
+console = Console()
+@click.group()
+@click.version_option(version=__version__, prog_name="broadcastx")
+def main():
+    """BroadcastX — Discover and download X/Twitter broadcast videos."""
+    pass
+@main.command()
+@click.argument("username")
+@click.option("--max-scrolls", "-n", default=100, help="Maximum scroll actions (default: 100)")
+@click.option("--scroll-delay", "-d", default=2.0, help="Delay between scrolls in seconds (default: 2.0)")
+@click.option("--idle-timeout", "-t", default=10.0, help="Stop after N seconds with no new data (default: 10)")
+@click.option("--output", "-o", default=None, help="Output JSON file path")
+@click.option("--headless/--no-headless", default=False, help="Run browser headless (default: visible)")
+def scan(username, max_scrolls, scroll_delay, idle_timeout, output, headless):
+    """Scan a user's timeline for broadcast links.
+    USERNAME can be with or without @ (e.g., @elonmusk or elonmusk).
+    """
+    asyncio.run(scan_user(
+        username=username,
+        max_scrolls=max_scrolls,
+        scroll_delay=scroll_delay,
+        idle_timeout=idle_timeout,
+        headless=headless,
+        output_file=output,
+    ))
+@main.command()
+@click.argument("username")
+@click.option("--output", "-o", default=None, help="Output JSON file path")
+@click.option("--delay", default=1.0, help="Delay between API calls in seconds (default: 1.0)")
+@click.option("--headless/--no-headless", default=False, help="Run browser headless (default: visible)")
+@click.option("--verbose", "-v", is_flag=True, help="Show detailed output")
+@click.option("--fresh", is_flag=True, help="Ignore saved state, start from beginning")
+@click.option("--auth-token", default=None, help="Manual auth_token cookie (skips browser)")
+@click.option("--csrf-token", default=None, help="Manual ct0/CSRF token (skips browser)")
+@click.option("--user-id", default=None, help="Manual user ID (skips user ID lookup)")
+def scrape(username, output, delay, headless, verbose, fresh, auth_token, csrf_token, user_id):
+    """Scrape ALL past broadcasts from a user's timeline.
+    Uses GraphQL API pagination. Saves cursor state so you can resume
+    after rate limits. Run the same command again to continue.
+    USERNAME can be with or without @ (e.g., @SpaceX or SpaceX).
+    Examples:
+        broadcastx scrape @SpaceX
+        broadcastx scrape @SpaceX --fresh    # ignore saved state
+        broadcastx scrape @SpaceX --delay 2.0 -v
+    """
+    if fresh:
+        from .scrape_broadcasts import _state_file
+        state_path = _state_file(username.lstrip("@"))
+        if state_path.exists():
+            state_path.unlink()
+            console.print(f"[dim]Cleared saved state: {state_path}[/dim]")
+    asyncio.run(scrape_broadcasts(
+        username=username,
+        headless=headless,
+        output_file=output,
+        delay=delay,
+        verbose=verbose,
+        auth_token=auth_token,
+        csrf_token=csrf_token,
+        user_id=user_id,
+    ))
+@main.command()
+@click.argument("username")
+@click.option("--check-interval", default=30 * 60, help="Seconds between profile checks (default: 1800)")
+@click.option("--live-interval", default=5 * 60, help="Seconds between live-status checks (default: 300)")
+@click.option("--output", "-o", default=None, help="Monitor event JSON file path")
+@click.option("--output-dir", default=None, help="Directory for downloaded videos")
+@click.option("--browser", "-b", default=DEFAULT_BROWSER, help=f"Browser for yt-dlp cookies (default: {DEFAULT_BROWSER})")
+@click.option("--headless/--no-headless", default=False, help="Run browser headless (default: visible)")
+@click.option("--download/--no-download", default=True, help="Download when broadcast ends (default: download)")
+@click.option("--once", is_flag=True, help="Run one detection cycle, useful for testing")
+def monitor(username, check_interval, live_interval, output, output_dir, browser, headless, download, once):
+    """Monitor a profile for current live broadcasts and download ended replays.
+    USERNAME can be with or without @ (e.g., @SpaceX or SpaceX).
+    """
+    if download and not check_yt_dlp():
+        console.print("[red]✗ yt-dlp not found.[/red]")
+        console.print("  Install with: [bold]brew install yt-dlp[/bold]")
+        raise SystemExit(1)
+    if download and not check_ffmpeg():
+        console.print("[red]✗ ffmpeg not found.[/red]")
+        console.print("  Install with: [bold]brew install ffmpeg[/bold]")
+        raise SystemExit(1)
+    asyncio.run(monitor_user(
+        username=username,
+        check_interval=check_interval,
+        live_interval=live_interval,
+        headless=headless,
+        output_file=output,
+        output_dir=output_dir,
+        browser=browser,
+        download=download,
+        once=once,
+    ))
+@main.command()
+@click.argument("urls", nargs=-1)
+@click.option("--from", "from_file", default=None, type=click.Path(), help="Load URLs from a JSON file")
+@click.option("--output-dir", "-o", default=None, help="Output directory for videos")
+@click.option("--browser", "-b", default=DEFAULT_BROWSER, help=f"Browser for cookies (default: {DEFAULT_BROWSER})")
+@click.option("--verbose", "-v", is_flag=True, help="Show yt-dlp output")
+@click.option("--parallel", "-p", default=1, help="Number of concurrent downloads (default: 1)")
+def download(urls, from_file, output_dir, browser, verbose, parallel):
+    """Download broadcast video(s).
+    Pass one or more broadcast URLs directly, or use --from to load from a JSON file.
+    Examples:
+        broadcastx download https://x.com/i/broadcasts/1vAxRkBbDRzKl
+        broadcastx download --from output/broadcasts.json
+        broadcastx download --from output/broadcasts.json -o ./my_videos
+    Rotation correction is applied automatically: if the broadcast carries
+    phone-orientation metadata, the downloaded video is re-encoded so it
+    displays upright. A `.rotation.jsonl` sidecar is also written alongside
+    the video for inspection.
+    """
+    # Pre-flight checks
+    if not check_yt_dlp():
+        console.print("[red]✗ yt-dlp not found.[/red]")
+        console.print("  Install with: [bold]brew install yt-dlp[/bold]")
+        raise SystemExit(1)
+    if not check_ffmpeg():
+        console.print("[red]✗ ffmpeg not found.[/red]")
+        console.print("  Install with: [bold]brew install ffmpeg[/bold]")
+        raise SystemExit(1)
+    if not urls and not from_file:
+        console.print("[yellow]Provide URLs or use --from <file>.[/yellow]")
+        raise SystemExit(1)
+    out = Path(output_dir) if output_dir else DEFAULT_VIDEOS_DIR
+    results = download_all(
+        urls=list(urls),
+        from_file=from_file,
+        output_dir=out,
+        browser=browser,
+        verbose=verbose,
+        parallel=parallel,
+    )
+    # Exit with error code if any downloads failed
+    if any(not r.success for r in results):
+        raise SystemExit(1)
+if __name__ == "__main__":
+    main()
+@main.command()
+@click.argument("broadcast_url")
+@click.option("--browser", "-b", default=DEFAULT_BROWSER, help=f"Browser for cookies (default: {DEFAULT_BROWSER})")
+@click.option("--trim/--detect-only", default=False, help="Actually trim paused sections (default: detect only)")
+@click.option("--output", "-o", default=None, help="Output video for --trim (default: <video>.trimmed.mp4)")
+@click.option("--size-ratio", default=0.50, help="Size-drop threshold (default 0.50)")
+@click.option("--gap-density", default=0.50, help="PDT-gap density threshold (default 0.50)")
+@click.option("--min-pause", default=10.0, help="Minimum pause duration in seconds (default 10)")
+def trim_pauses(broadcast_url, browser, trim, output, size_ratio, gap_density, min_pause):
+    """Detect (and optionally trim) paused sections in a broadcast.
+    Analyses HLS segments via HTTP HEAD requests (no full download) and
+    playlist PDT timestamps to find sections where the video was paused
+    while audio continued.  Default: detect-only.  Pass --trim to cut.
+    """
+    if trim and not check_ffmpeg():
+        console.print("[red]ffmpeg not found - install with: brew install ffmpeg[/red]")
+        raise SystemExit(1)
+    console.print("[bold]Analysing HLS segments for pauses...[/bold]")
+    pauses = detect_pauses(
+        broadcast_url,
+        browser=browser,
+        size_ratio_threshold=size_ratio,
+        gap_density_threshold=gap_density,
+        min_pause_sec=min_pause,
+    )
+    console.print(pause_report(pauses))
+    if trim and pauses:
+        video_path = Path("output") / "videos" / f"{broadcast_url.split('/')[-1]}.mp4"
+        if not video_path.exists():
+            console.print(f"[red]Video not found: {video_path}")
+            console.print("  Download first: broadcastx download <url>")
+            raise SystemExit(1)
+        out = Path(output) if output else Path(str(video_path).replace(".mp4", ".trimmed.mp4"))
+        console.print(f"\n[bold]Trimming -> {out}...")
+        try:
+            trim_intervals(video_path, pauses, out)
+            console.print(f"  [green]Done -> {out}")
+        except Exception as e:
+            console.print(f"  [red]Failed: {e}")
+            raise SystemExit(1)
+    elif trim and not pauses:
+        console.print("[green]Nothing to trim.")

broadcastx-0.1.0/broadcastx/config.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""Shared configuration and constants for BroadcastX."""
+import re
+from pathlib import Path
+# Default output locations. The paths are relative, so they resolve against
+# the current working directory.
+DEFAULT_OUTPUT_DIR = Path("output")
+DEFAULT_VIDEOS_DIR = DEFAULT_OUTPUT_DIR / "videos"
+DEFAULT_BROADCASTS_FILE = DEFAULT_OUTPUT_DIR / "broadcasts.json"
+# Browser to extract cookies from (for yt-dlp)
+DEFAULT_BROWSER = "chrome"
+# X broadcast IDs observed in real broadcast URLs are opaque alphanumeric
+# tokens, e.g. 1vAxRkBbDRzKl. Reject tiny fragments such as /broadcasts/1.
+# Despite the _RE suffix this is a pattern *string*, not a compiled regex; it
+# is interpolated into the compiled patterns below.
+BROADCAST_ID_RE = r"[A-Za-z0-9]{8,}"
+# Broadcast URL patterns to match. Each pattern captures the broadcast ID in
+# group 1; the trailing negative lookahead refuses a match when the ID is
+# immediately followed by another letter, digit or underscore.
+# NOTE(review): the x.com/twitter.com pattern accepts no subdomain, while the
+# pscp.tv pattern allows an optional "www." — confirm this is intentional.
+BROADCAST_PATTERNS = [
+    re.compile(rf"https?://(?:x|twitter)\.com/i/broadcasts/({BROADCAST_ID_RE})(?![A-Za-z0-9_])"),
+    re.compile(rf"https?://(?:www\.)?pscp\.tv/w/({BROADCAST_ID_RE})(?![A-Za-z0-9_])"),
+]
+# Twitter GraphQL endpoints to intercept
+GRAPHQL_ENDPOINTS = [
+    "UserTweets",
+    "UserTweetsAndReplies",
+    "TweetDetail",
+    "SearchTimeline",
+]
+# Scanner defaults
+DEFAULT_MAX_SCROLLS = 100       # Maximum number of scroll actions
+DEFAULT_SCROLL_DELAY = 2.0      # Seconds between scrolls
+DEFAULT_IDLE_TIMEOUT = 10.0     # Stop after N seconds with no new tweets
+DEFAULT_HEADLESS = False        # Show browser by default (useful for login)
+# yt-dlp output template — the file name starts with the broadcast ID,
+# followed by a bracketed "YYYY-MM-DD HH.MM.SS" timestamp and the title.
+YTDLP_OUTPUT_TEMPLATE = "%(id)s [%(timestamp>%Y-%m-%d %H.%M.%S)s] %(title)s.%(ext)s"
+def ensure_output_dirs():
+    """Create output directories if they don't exist."""
+    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    DEFAULT_VIDEOS_DIR.mkdir(parents=True, exist_ok=True)
+def extract_broadcast_id(url: str) -> str | None:
+    """Extract broadcast ID from a broadcast URL."""
+    for pattern in BROADCAST_PATTERNS:
+        match = pattern.search(url)
+        if match:
+            return match.group(1)
+    return None
+def is_broadcast_url(url: str) -> bool:
+    """Check if a URL is a broadcast URL."""
+    return extract_broadcast_id(url) is not None
+def normalize_broadcast_url(url: str) -> str | None:
+    """Normalize a broadcast URL to the canonical x.com format."""
+    bid = extract_broadcast_id(url)
+    if bid:
+        return f"https://x.com/i/broadcasts/{bid}"
+    return None