videonut 1.2.7 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -272
- package/USER_GUIDE.md +90 -90
- package/agents/core/eic.md +771 -771
- package/agents/creative/director.md +246 -246
- package/agents/creative/scriptwriter.md +207 -207
- package/agents/research/investigator.md +394 -394
- package/agents/technical/archivist.md +288 -288
- package/agents/technical/scavenger.md +247 -247
- package/bin/videonut.js +37 -21
- package/config.yaml +61 -61
- package/docs/scriptwriter.md +42 -42
- package/file_validator.py +186 -186
- package/memory/short_term/asset_manifest.md +64 -64
- package/memory/short_term/investigation_dossier.md +31 -31
- package/memory/short_term/master_script.md +51 -51
- package/package.json +61 -64
- package/requirements.txt +8 -8
- package/setup.js +33 -15
- package/tools/check_env.py +76 -76
- package/tools/downloaders/caption_reader.py +237 -237
- package/tools/downloaders/clip_grabber.py +82 -82
- package/tools/downloaders/image_grabber.py +105 -105
- package/tools/downloaders/pdf_reader.py +163 -163
- package/tools/downloaders/screenshotter.py +58 -58
- package/tools/downloaders/web_reader.py +69 -69
- package/tools/validators/link_checker.py +45 -45
- package/workflow_orchestrator.py +336 -336
- package/.claude/commands/archivist.toml +0 -12
- package/.claude/commands/director.toml +0 -12
- package/.claude/commands/eic.toml +0 -12
- package/.claude/commands/investigator.toml +0 -12
- package/.claude/commands/prompt.toml +0 -12
- package/.claude/commands/scavenger.toml +0 -12
- package/.claude/commands/scout.toml +0 -12
- package/.claude/commands/scriptwriter.toml +0 -12
- package/.claude/commands/seo.toml +0 -12
- package/.claude/commands/thumbnail.toml +0 -12
- package/.claude/commands/topic_scout.toml +0 -12
- package/.gemini/commands/archivist.toml +0 -12
- package/.gemini/commands/director.toml +0 -12
- package/.gemini/commands/eic.toml +0 -12
- package/.gemini/commands/investigator.toml +0 -12
- package/.gemini/commands/prompt.toml +0 -12
- package/.gemini/commands/scavenger.toml +0 -12
- package/.gemini/commands/scout.toml +0 -12
- package/.gemini/commands/scriptwriter.toml +0 -12
- package/.gemini/commands/seo.toml +0 -12
- package/.gemini/commands/thumbnail.toml +0 -12
- package/.gemini/commands/topic_scout.toml +0 -12
- package/.qwen/commands/archivist.toml +0 -12
- package/.qwen/commands/director.toml +0 -12
- package/.qwen/commands/eic.toml +0 -12
- package/.qwen/commands/investigator.toml +0 -12
- package/.qwen/commands/prompt.toml +0 -12
- package/.qwen/commands/scavenger.toml +0 -12
- package/.qwen/commands/scout.toml +0 -12
- package/.qwen/commands/scriptwriter.toml +0 -12
- package/.qwen/commands/seo.toml +0 -12
- package/.qwen/commands/thumbnail.toml +0 -12
- package/.qwen/commands/topic_scout.toml +0 -12
package/tools/check_env.py
CHANGED
|
@@ -1,77 +1,77 @@
|
|
|
1
|
-
import shutil
|
|
2
|
-
import sys
|
|
3
|
-
import os
|
|
4
|
-
import subprocess
|
|
5
|
-
|
|
6
|
-
def check_command(cmd, name):
|
|
7
|
-
path = shutil.which(cmd)
|
|
8
|
-
if path:
|
|
9
|
-
print(f"✅ {name} found at: {path}")
|
|
10
|
-
return True
|
|
11
|
-
else:
|
|
12
|
-
print(f"❌ {name} NOT found in PATH.")
|
|
13
|
-
return False
|
|
14
|
-
|
|
15
|
-
def check_import(module_name):
|
|
16
|
-
try:
|
|
17
|
-
__import__(module_name)
|
|
18
|
-
print(f"✅ Python module '{module_name}' is installed.")
|
|
19
|
-
return True
|
|
20
|
-
except ImportError:
|
|
21
|
-
print(f"❌ Python module '{module_name}' is MISSING.")
|
|
22
|
-
return False
|
|
23
|
-
|
|
24
|
-
def main():
|
|
25
|
-
print("🔍 VideoNut Environment Check...")
|
|
26
|
-
print("-" * 30)
|
|
27
|
-
|
|
28
|
-
all_good = True
|
|
29
|
-
|
|
30
|
-
# 1. Check Python version
|
|
31
|
-
if sys.version_info < (3, 8):
|
|
32
|
-
print("❌ Python 3.8+ is required.")
|
|
33
|
-
all_good = False
|
|
34
|
-
else:
|
|
35
|
-
print(f"✅ Python Version: {sys.version}")
|
|
36
|
-
|
|
37
|
-
# 2. Check FFmpeg
|
|
38
|
-
if not check_command("ffmpeg", "FFmpeg"):
|
|
39
|
-
# Check local bin fallback
|
|
40
|
-
local_bin = os.path.join(os.path.dirname(__file__), "bin", "ffmpeg.exe")
|
|
41
|
-
if os.path.exists(local_bin):
|
|
42
|
-
print(f"✅ FFmpeg found in local bin: {local_bin}")
|
|
43
|
-
else:
|
|
44
|
-
print(" (Please install FFmpeg or place it in tools/bin/)")
|
|
45
|
-
all_good = False
|
|
46
|
-
|
|
47
|
-
# 3. Check Python Packages
|
|
48
|
-
if not check_import("yt_dlp"): all_good = False
|
|
49
|
-
if not check_import("playwright"): all_good = False
|
|
50
|
-
if not check_import("requests"): all_good = False
|
|
51
|
-
if not check_import("bs4"): all_good = False
|
|
52
|
-
if not check_import("youtube_transcript_api"): all_good = False
|
|
53
|
-
if not check_import("pypdf"): all_good = False
|
|
54
|
-
|
|
55
|
-
# 4. Check for new tools
|
|
56
|
-
tools_dir = os.path.join(os.path.dirname(__file__), "downloaders")
|
|
57
|
-
new_tools = [
|
|
58
|
-
("caption_reader.py", os.path.join(tools_dir, "caption_reader.py")),
|
|
59
|
-
]
|
|
60
|
-
|
|
61
|
-
for tool_name, tool_path in new_tools:
|
|
62
|
-
if os.path.exists(tool_path):
|
|
63
|
-
print(f"✅ Tool found: {tool_name}")
|
|
64
|
-
else:
|
|
65
|
-
print(f"❌ Tool missing: {tool_name} at {tool_path}")
|
|
66
|
-
all_good = False
|
|
67
|
-
|
|
68
|
-
print("-" * 30)
|
|
69
|
-
if all_good:
|
|
70
|
-
print("🚀 System is READY for VideoNut Agents.")
|
|
71
|
-
sys.exit(0)
|
|
72
|
-
else:
|
|
73
|
-
print("⚠️ System has ISSUES. Please fix missing dependencies.")
|
|
74
|
-
sys.exit(1)
|
|
75
|
-
|
|
76
|
-
if __name__ == "__main__":
|
|
1
|
+
import shutil
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
|
|
6
|
+
def check_command(cmd, name):
|
|
7
|
+
path = shutil.which(cmd)
|
|
8
|
+
if path:
|
|
9
|
+
print(f"✅ {name} found at: {path}")
|
|
10
|
+
return True
|
|
11
|
+
else:
|
|
12
|
+
print(f"❌ {name} NOT found in PATH.")
|
|
13
|
+
return False
|
|
14
|
+
|
|
15
|
+
def check_import(module_name):
|
|
16
|
+
try:
|
|
17
|
+
__import__(module_name)
|
|
18
|
+
print(f"✅ Python module '{module_name}' is installed.")
|
|
19
|
+
return True
|
|
20
|
+
except ImportError:
|
|
21
|
+
print(f"❌ Python module '{module_name}' is MISSING.")
|
|
22
|
+
return False
|
|
23
|
+
|
|
24
|
+
def main():
|
|
25
|
+
print("🔍 VideoNut Environment Check...")
|
|
26
|
+
print("-" * 30)
|
|
27
|
+
|
|
28
|
+
all_good = True
|
|
29
|
+
|
|
30
|
+
# 1. Check Python version
|
|
31
|
+
if sys.version_info < (3, 8):
|
|
32
|
+
print("❌ Python 3.8+ is required.")
|
|
33
|
+
all_good = False
|
|
34
|
+
else:
|
|
35
|
+
print(f"✅ Python Version: {sys.version}")
|
|
36
|
+
|
|
37
|
+
# 2. Check FFmpeg
|
|
38
|
+
if not check_command("ffmpeg", "FFmpeg"):
|
|
39
|
+
# Check local bin fallback
|
|
40
|
+
local_bin = os.path.join(os.path.dirname(__file__), "bin", "ffmpeg.exe")
|
|
41
|
+
if os.path.exists(local_bin):
|
|
42
|
+
print(f"✅ FFmpeg found in local bin: {local_bin}")
|
|
43
|
+
else:
|
|
44
|
+
print(" (Please install FFmpeg or place it in tools/bin/)")
|
|
45
|
+
all_good = False
|
|
46
|
+
|
|
47
|
+
# 3. Check Python Packages
|
|
48
|
+
if not check_import("yt_dlp"): all_good = False
|
|
49
|
+
if not check_import("playwright"): all_good = False
|
|
50
|
+
if not check_import("requests"): all_good = False
|
|
51
|
+
if not check_import("bs4"): all_good = False
|
|
52
|
+
if not check_import("youtube_transcript_api"): all_good = False
|
|
53
|
+
if not check_import("pypdf"): all_good = False
|
|
54
|
+
|
|
55
|
+
# 4. Check for new tools
|
|
56
|
+
tools_dir = os.path.join(os.path.dirname(__file__), "downloaders")
|
|
57
|
+
new_tools = [
|
|
58
|
+
("caption_reader.py", os.path.join(tools_dir, "caption_reader.py")),
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
for tool_name, tool_path in new_tools:
|
|
62
|
+
if os.path.exists(tool_path):
|
|
63
|
+
print(f"✅ Tool found: {tool_name}")
|
|
64
|
+
else:
|
|
65
|
+
print(f"❌ Tool missing: {tool_name} at {tool_path}")
|
|
66
|
+
all_good = False
|
|
67
|
+
|
|
68
|
+
print("-" * 30)
|
|
69
|
+
if all_good:
|
|
70
|
+
print("🚀 System is READY for VideoNut Agents.")
|
|
71
|
+
sys.exit(0)
|
|
72
|
+
else:
|
|
73
|
+
print("⚠️ System has ISSUES. Please fix missing dependencies.")
|
|
74
|
+
sys.exit(1)
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
77
|
main()
|
package/tools/downloaders/caption_reader.py
CHANGED
|
@@ -1,238 +1,238 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
YouTube Caption/Transcript Reader for VideoNut
|
|
4
|
-
Extracts captions from YouTube videos with optional timestamp display.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import sys
|
|
8
|
-
import argparse
|
|
9
|
-
import json
|
|
10
|
-
from youtube_transcript_api import YouTubeTranscriptApi
|
|
11
|
-
from youtube_transcript_api.formatters import TextFormatter, JSONFormatter
|
|
12
|
-
import re
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def extract_video_id(url):
|
|
16
|
-
"""
|
|
17
|
-
Extract YouTube video ID from various URL formats
|
|
18
|
-
"""
|
|
19
|
-
# Patterns for different YouTube URL formats
|
|
20
|
-
patterns = [
|
|
21
|
-
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11,12})',
|
|
22
|
-
r'(?:https?:\/\/)?(?:www\.)?youtu\.be\/([a-zA-Z0-9_-]{11,12})',
|
|
23
|
-
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/embed\/([a-zA-Z0-9_-]{11,12})',
|
|
24
|
-
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/v\/([a-zA-Z0-9_-]{11,12})',
|
|
25
|
-
]
|
|
26
|
-
|
|
27
|
-
for pattern in patterns:
|
|
28
|
-
match = re.search(pattern, url)
|
|
29
|
-
if match:
|
|
30
|
-
return match.group(1)
|
|
31
|
-
|
|
32
|
-
return None
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def format_timestamp(seconds):
|
|
36
|
-
"""Convert seconds to HH:MM:SS format"""
|
|
37
|
-
hours = int(seconds // 3600)
|
|
38
|
-
minutes = int((seconds % 3600) // 60)
|
|
39
|
-
secs = int(seconds % 60)
|
|
40
|
-
|
|
41
|
-
if hours > 0:
|
|
42
|
-
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
|
|
43
|
-
else:
|
|
44
|
-
return f"{minutes:02d}:{secs:02d}"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def get_youtube_captions(url, languages=None, with_timestamps=False, search_term=None):
|
|
48
|
-
"""
|
|
49
|
-
Get YouTube video captions/transcript
|
|
50
|
-
|
|
51
|
-
Args:
|
|
52
|
-
url: YouTube video URL
|
|
53
|
-
languages: List of preferred language codes
|
|
54
|
-
with_timestamps: If True, include timestamps with each line
|
|
55
|
-
search_term: If provided, only return lines containing this term (with timestamps)
|
|
56
|
-
|
|
57
|
-
Returns:
|
|
58
|
-
Formatted transcript string
|
|
59
|
-
"""
|
|
60
|
-
if languages is None:
|
|
61
|
-
# Default to English and other common languages
|
|
62
|
-
languages = ['en', 'en-US', 'en-GB', 'hi', 'te', 'ta', 'mr', 'es', 'fr', 'de']
|
|
63
|
-
|
|
64
|
-
video_id = extract_video_id(url)
|
|
65
|
-
|
|
66
|
-
if not video_id:
|
|
67
|
-
print(f"Error: Could not extract video ID from URL: {url}", file=sys.stderr)
|
|
68
|
-
sys.exit(1)
|
|
69
|
-
|
|
70
|
-
try:
|
|
71
|
-
# Instantiate the API class
|
|
72
|
-
api = YouTubeTranscriptApi()
|
|
73
|
-
|
|
74
|
-
# Fetch the transcript data directly using the instance method
|
|
75
|
-
transcript_data = api.fetch(video_id, languages=languages)
|
|
76
|
-
|
|
77
|
-
# If searching for a term, filter and return with timestamps
|
|
78
|
-
if search_term:
|
|
79
|
-
search_lower = search_term.lower()
|
|
80
|
-
matches = []
|
|
81
|
-
for entry in transcript_data:
|
|
82
|
-
if search_lower in entry.text.lower():
|
|
83
|
-
timestamp = format_timestamp(entry.start)
|
|
84
|
-
duration = getattr(entry, 'duration', 0)
|
|
85
|
-
end_timestamp = format_timestamp(entry.start + duration)
|
|
86
|
-
matches.append({
|
|
87
|
-
'timestamp': timestamp,
|
|
88
|
-
'end_timestamp': end_timestamp,
|
|
89
|
-
'start_seconds': entry.start,
|
|
90
|
-
'text': entry.text
|
|
91
|
-
})
|
|
92
|
-
|
|
93
|
-
if not matches:
|
|
94
|
-
return f"No matches found for '{search_term}' in transcript."
|
|
95
|
-
|
|
96
|
-
output = [f"\n🔍 Found {len(matches)} matches for '{search_term}':\n"]
|
|
97
|
-
for match in matches:
|
|
98
|
-
output.append(f"[{match['timestamp']}] {match['text']}")
|
|
99
|
-
output.append(f"\n📋 Suggested clip range: {matches[0]['timestamp']} - {matches[-1]['end_timestamp']}")
|
|
100
|
-
return '\n'.join(output)
|
|
101
|
-
|
|
102
|
-
# If with_timestamps, format each line with timestamp
|
|
103
|
-
if with_timestamps:
|
|
104
|
-
output = []
|
|
105
|
-
output.append(f"\n📝 Transcript with Timestamps:\n")
|
|
106
|
-
output.append("=" * 60)
|
|
107
|
-
for entry in transcript_data:
|
|
108
|
-
timestamp = format_timestamp(entry.start)
|
|
109
|
-
output.append(f"[{timestamp}] {entry.text}")
|
|
110
|
-
return '\n'.join(output)
|
|
111
|
-
|
|
112
|
-
# Default: plain text format - join all text entries
|
|
113
|
-
plain_text = ' '.join([entry.text for entry in transcript_data])
|
|
114
|
-
|
|
115
|
-
return plain_text
|
|
116
|
-
|
|
117
|
-
except Exception as e:
|
|
118
|
-
print(f"Error retrieving captions: {str(e)}", file=sys.stderr)
|
|
119
|
-
sys.exit(1)
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
def find_timestamp_for_quote(url, quote, context_seconds=30):
|
|
123
|
-
"""
|
|
124
|
-
Find the timestamp where a specific quote appears in the video.
|
|
125
|
-
Returns the start and end timestamps for a clip containing that quote.
|
|
126
|
-
|
|
127
|
-
Args:
|
|
128
|
-
url: YouTube video URL
|
|
129
|
-
quote: The quote to search for
|
|
130
|
-
context_seconds: How many seconds of context to include before/after
|
|
131
|
-
|
|
132
|
-
Returns:
|
|
133
|
-
Dict with start_time, end_time, and surrounding text
|
|
134
|
-
"""
|
|
135
|
-
video_id = extract_video_id(url)
|
|
136
|
-
if not video_id:
|
|
137
|
-
return None
|
|
138
|
-
|
|
139
|
-
try:
|
|
140
|
-
api = YouTubeTranscriptApi()
|
|
141
|
-
# Use fetch to get the default transcript or specify languages
|
|
142
|
-
transcript_data = api.fetch(video_id)
|
|
143
|
-
|
|
144
|
-
quote_lower = quote.lower()
|
|
145
|
-
|
|
146
|
-
for i, entry in enumerate(transcript_data):
|
|
147
|
-
if quote_lower in entry.text.lower():
|
|
148
|
-
# Found the quote
|
|
149
|
-
start_time = max(0, entry.start - context_seconds)
|
|
150
|
-
end_time = entry.start + getattr(entry, 'duration', 5) + context_seconds
|
|
151
|
-
|
|
152
|
-
# Get surrounding context
|
|
153
|
-
context_entries = []
|
|
154
|
-
for j in range(max(0, i-3), min(len(transcript_data), i+4)):
|
|
155
|
-
context_entries.append({
|
|
156
|
-
'timestamp': format_timestamp(transcript_data[j].start),
|
|
157
|
-
'text': transcript_data[j].text
|
|
158
|
-
})
|
|
159
|
-
|
|
160
|
-
return {
|
|
161
|
-
'found': True,
|
|
162
|
-
'quote': entry.text,
|
|
163
|
-
'timestamp': format_timestamp(entry.start),
|
|
164
|
-
'clip_start': format_timestamp(start_time),
|
|
165
|
-
'clip_end': format_timestamp(end_time),
|
|
166
|
-
'context': context_entries
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
return {'found': False, 'message': f"Quote not found: {quote}"}
|
|
170
|
-
|
|
171
|
-
except Exception as e:
|
|
172
|
-
return {'found': False, 'message': str(e)}
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
def main():
|
|
176
|
-
parser = argparse.ArgumentParser(
|
|
177
|
-
description="Extract captions from YouTube videos with optional timestamps.",
|
|
178
|
-
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
179
|
-
epilog="""
|
|
180
|
-
Examples:
|
|
181
|
-
# Get plain transcript
|
|
182
|
-
python caption_reader.py --url "https://youtube.com/watch?v=xxx"
|
|
183
|
-
|
|
184
|
-
# Get transcript with timestamps
|
|
185
|
-
python caption_reader.py --url "https://youtube.com/watch?v=xxx" --timestamps
|
|
186
|
-
|
|
187
|
-
# Search for specific term and get timestamps
|
|
188
|
-
python caption_reader.py --url "https://youtube.com/watch?v=xxx" --search "electoral bonds"
|
|
189
|
-
|
|
190
|
-
# Find timestamp for a specific quote
|
|
191
|
-
python caption_reader.py --url "https://youtube.com/watch?v=xxx" --find-quote "corruption" --json
|
|
192
|
-
"""
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
parser.add_argument("--url", required=True, help="YouTube video URL")
|
|
196
|
-
parser.add_argument("--languages", nargs="*", default=None,
|
|
197
|
-
help="Preferred language codes (e.g., en hi te). Default: en and Indian languages")
|
|
198
|
-
parser.add_argument("--timestamps", "-t", action="store_true",
|
|
199
|
-
help="Include timestamps with each line")
|
|
200
|
-
parser.add_argument("--search", "-s", help="Search for specific term and show timestamps")
|
|
201
|
-
parser.add_argument("--find-quote", "-f", help="Find exact timestamp for a quote")
|
|
202
|
-
parser.add_argument("--context", "-c", type=int, default=30,
|
|
203
|
-
help="Seconds of context around found quote (default: 30)")
|
|
204
|
-
parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
|
|
205
|
-
|
|
206
|
-
args = parser.parse_args()
|
|
207
|
-
|
|
208
|
-
if args.find_quote:
|
|
209
|
-
# Find timestamp for specific quote
|
|
210
|
-
result = find_timestamp_for_quote(args.url, args.find_quote, args.context)
|
|
211
|
-
if args.json:
|
|
212
|
-
print(json.dumps(result, indent=2, ensure_ascii=False))
|
|
213
|
-
else:
|
|
214
|
-
if result.get('found'):
|
|
215
|
-
print(f"\n✅ Quote Found!")
|
|
216
|
-
print(f" Timestamp: {result['timestamp']}")
|
|
217
|
-
print(f" Text: {result['quote']}")
|
|
218
|
-
print(f"\n🎬 Suggested Clip:")
|
|
219
|
-
print(f" Start: {result['clip_start']}")
|
|
220
|
-
print(f" End: {result['clip_end']}")
|
|
221
|
-
print(f"\n📄 Context:")
|
|
222
|
-
for entry in result['context']:
|
|
223
|
-
print(f" [{entry['timestamp']}] {entry['text']}")
|
|
224
|
-
else:
|
|
225
|
-
print(f"❌ {result.get('message', 'Quote not found')}")
|
|
226
|
-
else:
|
|
227
|
-
# Get transcript
|
|
228
|
-
captions = get_youtube_captions(
|
|
229
|
-
args.url,
|
|
230
|
-
args.languages,
|
|
231
|
-
with_timestamps=args.timestamps,
|
|
232
|
-
search_term=args.search
|
|
233
|
-
)
|
|
234
|
-
print(captions)
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
if __name__ == "__main__":
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
YouTube Caption/Transcript Reader for VideoNut
|
|
4
|
+
Extracts captions from YouTube videos with optional timestamp display.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import sys
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
11
|
+
from youtube_transcript_api.formatters import TextFormatter, JSONFormatter
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def extract_video_id(url):
|
|
16
|
+
"""
|
|
17
|
+
Extract YouTube video ID from various URL formats
|
|
18
|
+
"""
|
|
19
|
+
# Patterns for different YouTube URL formats
|
|
20
|
+
patterns = [
|
|
21
|
+
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11,12})',
|
|
22
|
+
r'(?:https?:\/\/)?(?:www\.)?youtu\.be\/([a-zA-Z0-9_-]{11,12})',
|
|
23
|
+
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/embed\/([a-zA-Z0-9_-]{11,12})',
|
|
24
|
+
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/v\/([a-zA-Z0-9_-]{11,12})',
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
for pattern in patterns:
|
|
28
|
+
match = re.search(pattern, url)
|
|
29
|
+
if match:
|
|
30
|
+
return match.group(1)
|
|
31
|
+
|
|
32
|
+
return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def format_timestamp(seconds):
|
|
36
|
+
"""Convert seconds to HH:MM:SS format"""
|
|
37
|
+
hours = int(seconds // 3600)
|
|
38
|
+
minutes = int((seconds % 3600) // 60)
|
|
39
|
+
secs = int(seconds % 60)
|
|
40
|
+
|
|
41
|
+
if hours > 0:
|
|
42
|
+
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
|
|
43
|
+
else:
|
|
44
|
+
return f"{minutes:02d}:{secs:02d}"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_youtube_captions(url, languages=None, with_timestamps=False, search_term=None):
|
|
48
|
+
"""
|
|
49
|
+
Get YouTube video captions/transcript
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
url: YouTube video URL
|
|
53
|
+
languages: List of preferred language codes
|
|
54
|
+
with_timestamps: If True, include timestamps with each line
|
|
55
|
+
search_term: If provided, only return lines containing this term (with timestamps)
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
Formatted transcript string
|
|
59
|
+
"""
|
|
60
|
+
if languages is None:
|
|
61
|
+
# Default to English and other common languages
|
|
62
|
+
languages = ['en', 'en-US', 'en-GB', 'hi', 'te', 'ta', 'mr', 'es', 'fr', 'de']
|
|
63
|
+
|
|
64
|
+
video_id = extract_video_id(url)
|
|
65
|
+
|
|
66
|
+
if not video_id:
|
|
67
|
+
print(f"Error: Could not extract video ID from URL: {url}", file=sys.stderr)
|
|
68
|
+
sys.exit(1)
|
|
69
|
+
|
|
70
|
+
try:
|
|
71
|
+
# Instantiate the API class
|
|
72
|
+
api = YouTubeTranscriptApi()
|
|
73
|
+
|
|
74
|
+
# Fetch the transcript data directly using the instance method
|
|
75
|
+
transcript_data = api.fetch(video_id, languages=languages)
|
|
76
|
+
|
|
77
|
+
# If searching for a term, filter and return with timestamps
|
|
78
|
+
if search_term:
|
|
79
|
+
search_lower = search_term.lower()
|
|
80
|
+
matches = []
|
|
81
|
+
for entry in transcript_data:
|
|
82
|
+
if search_lower in entry.text.lower():
|
|
83
|
+
timestamp = format_timestamp(entry.start)
|
|
84
|
+
duration = getattr(entry, 'duration', 0)
|
|
85
|
+
end_timestamp = format_timestamp(entry.start + duration)
|
|
86
|
+
matches.append({
|
|
87
|
+
'timestamp': timestamp,
|
|
88
|
+
'end_timestamp': end_timestamp,
|
|
89
|
+
'start_seconds': entry.start,
|
|
90
|
+
'text': entry.text
|
|
91
|
+
})
|
|
92
|
+
|
|
93
|
+
if not matches:
|
|
94
|
+
return f"No matches found for '{search_term}' in transcript."
|
|
95
|
+
|
|
96
|
+
output = [f"\n🔍 Found {len(matches)} matches for '{search_term}':\n"]
|
|
97
|
+
for match in matches:
|
|
98
|
+
output.append(f"[{match['timestamp']}] {match['text']}")
|
|
99
|
+
output.append(f"\n📋 Suggested clip range: {matches[0]['timestamp']} - {matches[-1]['end_timestamp']}")
|
|
100
|
+
return '\n'.join(output)
|
|
101
|
+
|
|
102
|
+
# If with_timestamps, format each line with timestamp
|
|
103
|
+
if with_timestamps:
|
|
104
|
+
output = []
|
|
105
|
+
output.append(f"\n📝 Transcript with Timestamps:\n")
|
|
106
|
+
output.append("=" * 60)
|
|
107
|
+
for entry in transcript_data:
|
|
108
|
+
timestamp = format_timestamp(entry.start)
|
|
109
|
+
output.append(f"[{timestamp}] {entry.text}")
|
|
110
|
+
return '\n'.join(output)
|
|
111
|
+
|
|
112
|
+
# Default: plain text format - join all text entries
|
|
113
|
+
plain_text = ' '.join([entry.text for entry in transcript_data])
|
|
114
|
+
|
|
115
|
+
return plain_text
|
|
116
|
+
|
|
117
|
+
except Exception as e:
|
|
118
|
+
print(f"Error retrieving captions: {str(e)}", file=sys.stderr)
|
|
119
|
+
sys.exit(1)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def find_timestamp_for_quote(url, quote, context_seconds=30):
|
|
123
|
+
"""
|
|
124
|
+
Find the timestamp where a specific quote appears in the video.
|
|
125
|
+
Returns the start and end timestamps for a clip containing that quote.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
url: YouTube video URL
|
|
129
|
+
quote: The quote to search for
|
|
130
|
+
context_seconds: How many seconds of context to include before/after
|
|
131
|
+
|
|
132
|
+
Returns:
|
|
133
|
+
Dict with start_time, end_time, and surrounding text
|
|
134
|
+
"""
|
|
135
|
+
video_id = extract_video_id(url)
|
|
136
|
+
if not video_id:
|
|
137
|
+
return None
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
api = YouTubeTranscriptApi()
|
|
141
|
+
# Use fetch to get the default transcript or specify languages
|
|
142
|
+
transcript_data = api.fetch(video_id)
|
|
143
|
+
|
|
144
|
+
quote_lower = quote.lower()
|
|
145
|
+
|
|
146
|
+
for i, entry in enumerate(transcript_data):
|
|
147
|
+
if quote_lower in entry.text.lower():
|
|
148
|
+
# Found the quote
|
|
149
|
+
start_time = max(0, entry.start - context_seconds)
|
|
150
|
+
end_time = entry.start + getattr(entry, 'duration', 5) + context_seconds
|
|
151
|
+
|
|
152
|
+
# Get surrounding context
|
|
153
|
+
context_entries = []
|
|
154
|
+
for j in range(max(0, i-3), min(len(transcript_data), i+4)):
|
|
155
|
+
context_entries.append({
|
|
156
|
+
'timestamp': format_timestamp(transcript_data[j].start),
|
|
157
|
+
'text': transcript_data[j].text
|
|
158
|
+
})
|
|
159
|
+
|
|
160
|
+
return {
|
|
161
|
+
'found': True,
|
|
162
|
+
'quote': entry.text,
|
|
163
|
+
'timestamp': format_timestamp(entry.start),
|
|
164
|
+
'clip_start': format_timestamp(start_time),
|
|
165
|
+
'clip_end': format_timestamp(end_time),
|
|
166
|
+
'context': context_entries
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
return {'found': False, 'message': f"Quote not found: {quote}"}
|
|
170
|
+
|
|
171
|
+
except Exception as e:
|
|
172
|
+
return {'found': False, 'message': str(e)}
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def main():
|
|
176
|
+
parser = argparse.ArgumentParser(
|
|
177
|
+
description="Extract captions from YouTube videos with optional timestamps.",
|
|
178
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
179
|
+
epilog="""
|
|
180
|
+
Examples:
|
|
181
|
+
# Get plain transcript
|
|
182
|
+
python caption_reader.py --url "https://youtube.com/watch?v=xxx"
|
|
183
|
+
|
|
184
|
+
# Get transcript with timestamps
|
|
185
|
+
python caption_reader.py --url "https://youtube.com/watch?v=xxx" --timestamps
|
|
186
|
+
|
|
187
|
+
# Search for specific term and get timestamps
|
|
188
|
+
python caption_reader.py --url "https://youtube.com/watch?v=xxx" --search "electoral bonds"
|
|
189
|
+
|
|
190
|
+
# Find timestamp for a specific quote
|
|
191
|
+
python caption_reader.py --url "https://youtube.com/watch?v=xxx" --find-quote "corruption" --json
|
|
192
|
+
"""
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
parser.add_argument("--url", required=True, help="YouTube video URL")
|
|
196
|
+
parser.add_argument("--languages", nargs="*", default=None,
|
|
197
|
+
help="Preferred language codes (e.g., en hi te). Default: en and Indian languages")
|
|
198
|
+
parser.add_argument("--timestamps", "-t", action="store_true",
|
|
199
|
+
help="Include timestamps with each line")
|
|
200
|
+
parser.add_argument("--search", "-s", help="Search for specific term and show timestamps")
|
|
201
|
+
parser.add_argument("--find-quote", "-f", help="Find exact timestamp for a quote")
|
|
202
|
+
parser.add_argument("--context", "-c", type=int, default=30,
|
|
203
|
+
help="Seconds of context around found quote (default: 30)")
|
|
204
|
+
parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
|
|
205
|
+
|
|
206
|
+
args = parser.parse_args()
|
|
207
|
+
|
|
208
|
+
if args.find_quote:
|
|
209
|
+
# Find timestamp for specific quote
|
|
210
|
+
result = find_timestamp_for_quote(args.url, args.find_quote, args.context)
|
|
211
|
+
if args.json:
|
|
212
|
+
print(json.dumps(result, indent=2, ensure_ascii=False))
|
|
213
|
+
else:
|
|
214
|
+
if result.get('found'):
|
|
215
|
+
print(f"\n✅ Quote Found!")
|
|
216
|
+
print(f" Timestamp: {result['timestamp']}")
|
|
217
|
+
print(f" Text: {result['quote']}")
|
|
218
|
+
print(f"\n🎬 Suggested Clip:")
|
|
219
|
+
print(f" Start: {result['clip_start']}")
|
|
220
|
+
print(f" End: {result['clip_end']}")
|
|
221
|
+
print(f"\n📄 Context:")
|
|
222
|
+
for entry in result['context']:
|
|
223
|
+
print(f" [{entry['timestamp']}] {entry['text']}")
|
|
224
|
+
else:
|
|
225
|
+
print(f"❌ {result.get('message', 'Quote not found')}")
|
|
226
|
+
else:
|
|
227
|
+
# Get transcript
|
|
228
|
+
captions = get_youtube_captions(
|
|
229
|
+
args.url,
|
|
230
|
+
args.languages,
|
|
231
|
+
with_timestamps=args.timestamps,
|
|
232
|
+
search_term=args.search
|
|
233
|
+
)
|
|
234
|
+
print(captions)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
if __name__ == "__main__":
|
|
238
238
|
main()
|