@heylemon/lemonade 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import os
4
+ import re
5
+ import subprocess
6
+ import sys
7
+ import tempfile
8
+ from pathlib import Path
9
+
10
def clean_vtt(content: str) -> str:
    """
    Clean WebVTT content to plain text.

    Drops the ``WEBVTT`` header block (including metadata lines that follow
    it, such as ``Kind:`` / ``Language:``), ``NOTE`` / ``STYLE`` blocks,
    numeric cue identifiers and cue timing lines. Inline markup such as
    ``<c>`` or ``<00:00:01.000>`` is stripped from the remaining text, and
    consecutive identical lines are collapsed into one.

    Args:
        content: Raw text of a ``.vtt`` file.

    Returns:
        The caption text, one line per kept caption line, joined with
        newlines. An empty string when nothing remains.
    """
    # WebVTT allows the hours field to be omitted (MM:SS.mmm), so it is optional.
    timestamp_pattern = re.compile(
        r'(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}\s+-->\s+(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}'
    )
    tag_pattern = re.compile(r'<[^>]+>')

    text_lines = []
    in_metadata_block = False  # inside a header / NOTE / STYLE block
    at_block_start = True      # previous line was blank (or this is line 1)

    # A leading byte-order mark would otherwise hide the WEBVTT signature.
    for raw_line in content.lstrip('\ufeff').splitlines():
        line = raw_line.strip()

        if not line:
            # A blank line terminates whatever block we were in.
            in_metadata_block = False
            at_block_start = True
            continue
        starts_block, at_block_start = at_block_start, False

        if in_metadata_block:
            # Tolerate files with no blank line after the header: a cue
            # timing line always ends the metadata block.
            if timestamp_pattern.match(line):
                in_metadata_block = False
            continue

        if line.startswith(('WEBVTT', 'NOTE', 'STYLE')):
            # Skip the rest of the block only when this line opens one;
            # mid-cue, just this single line is dropped.
            in_metadata_block = starts_block
            continue

        if line.isdigit() or timestamp_pattern.match(line):
            continue

        # Strip markup BEFORE comparing, so tagged repeats are recognised
        # as duplicates of the already-cleaned previous line.
        line = tag_pattern.sub('', line).strip()
        if not line or (text_lines and text_lines[-1] == line):
            continue

        text_lines.append(line)

    return '\n'.join(text_lines)
38
+
39
def get_transcript(url: str) -> None:
    """
    Download English subtitles for ``url`` with yt-dlp and print them as text.

    yt-dlp is run inside a temporary directory, asking for both uploaded and
    automatically generated English subtitles without downloading the video.
    The first ``.vtt`` file found (in sorted name order) is cleaned with
    ``clean_vtt`` and written to stdout.

    Args:
        url: Video URL handed to yt-dlp.

    Exits the process with status 1 (after printing a message to stderr) when
    yt-dlp is not installed, when yt-dlp fails, or when no ``.vtt`` file was
    produced.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        cmd = [
            "yt-dlp",
            "--write-subs",
            "--write-auto-subs",
            "--skip-download",
            "--sub-lang", "en",
            "--output", "subs",
            # End of options: a URL beginning with "-" must not be parsed
            # as a yt-dlp flag.
            "--",
            url,
        ]

        try:
            subprocess.run(cmd, cwd=temp_dir, check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            # Decode leniently so a non-UTF-8 byte in yt-dlp's output cannot
            # mask the real error with a UnicodeDecodeError.
            stderr_text = (e.stderr or b"").decode("utf-8", errors="replace")
            print(f"Error running yt-dlp: {stderr_text}", file=sys.stderr)
            sys.exit(1)
        except FileNotFoundError:
            print("Error: yt-dlp not found. Please install it.", file=sys.stderr)
            sys.exit(1)

        # glob() order is unspecified; sort so the chosen file is deterministic.
        vtt_files = sorted(Path(temp_dir).glob("*.vtt"))

        if not vtt_files:
            print("No subtitles found.", file=sys.stderr)
            sys.exit(1)

        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM if present.
        content = vtt_files[0].read_text(encoding="utf-8-sig")
        print(clean_vtt(content))
72
+
73
def main():
    """Command-line entry point: read the URL argument and print its transcript."""
    cli = argparse.ArgumentParser(description="Fetch YouTube transcript.")
    cli.add_argument("url", help="YouTube video URL")
    get_transcript(cli.parse_args().url)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()