videonut 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.antigravity/config.toml +8 -0
- package/.claude/commands/archivist.toml +12 -0
- package/.claude/commands/director.toml +12 -0
- package/.claude/commands/eic.toml +12 -0
- package/.claude/commands/investigator.toml +12 -0
- package/.claude/commands/prompt.toml +12 -0
- package/.claude/commands/scavenger.toml +12 -0
- package/.claude/commands/scout.toml +12 -0
- package/.claude/commands/scriptwriter.toml +12 -0
- package/.claude/commands/seo.toml +12 -0
- package/.claude/commands/thumbnail.toml +12 -0
- package/.claude/commands/topic_scout.toml +12 -0
- package/.gemini/commands/archivist.toml +12 -0
- package/.gemini/commands/director.toml +12 -0
- package/.gemini/commands/eic.toml +12 -0
- package/.gemini/commands/investigator.toml +12 -0
- package/.gemini/commands/prompt.toml +12 -0
- package/.gemini/commands/scavenger.toml +12 -0
- package/.gemini/commands/scout.toml +12 -0
- package/.gemini/commands/scriptwriter.toml +12 -0
- package/.gemini/commands/seo.toml +12 -0
- package/.gemini/commands/thumbnail.toml +12 -0
- package/.gemini/commands/topic_scout.toml +12 -0
- package/.qwen/commands/archivist.toml +12 -0
- package/.qwen/commands/director.toml +12 -0
- package/.qwen/commands/eic.toml +12 -0
- package/.qwen/commands/investigator.toml +12 -0
- package/.qwen/commands/prompt.toml +12 -0
- package/.qwen/commands/scavenger.toml +12 -0
- package/.qwen/commands/scout.toml +12 -0
- package/.qwen/commands/scriptwriter.toml +12 -0
- package/.qwen/commands/seo.toml +12 -0
- package/.qwen/commands/thumbnail.toml +12 -0
- package/.qwen/commands/topic_scout.toml +12 -0
- package/USER_GUIDE.md +90 -0
- package/agents/core/eic.md +772 -0
- package/agents/core/prompt_agent.md +264 -0
- package/agents/core/self_review_protocol.md +143 -0
- package/agents/creative/director.md +247 -0
- package/agents/creative/scriptwriter.md +208 -0
- package/agents/creative/seo.md +316 -0
- package/agents/creative/thumbnail.md +285 -0
- package/agents/research/investigator.md +395 -0
- package/agents/research/topic_scout.md +419 -0
- package/agents/technical/archivist.md +289 -0
- package/agents/technical/scavenger.md +248 -0
- package/bin/videonut.js +389 -107
- package/config.yaml +62 -0
- package/docs/AUDIT_REPORT.md +364 -0
- package/docs/LIFECYCLE.md +651 -0
- package/docs/scriptwriter.md +43 -0
- package/file_validator.py +187 -0
- package/memory/short_term/asset_manifest.md +64 -0
- package/memory/short_term/investigation_dossier.md +31 -0
- package/memory/short_term/master_script.md +51 -0
- package/package.json +16 -3
- package/requirements.txt +9 -0
- package/scripts/setup.js +8 -0
- package/tools/check_env.py +77 -0
- package/tools/downloaders/__pycache__/caption_reader.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/image_grabber.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/pdf_reader.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/screenshotter.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/web_reader.cpython-312.pyc +0 -0
- package/tools/downloaders/article_screenshotter.py +388 -0
- package/tools/downloaders/caption_reader.py +238 -0
- package/tools/downloaders/clip_grabber.py +83 -0
- package/tools/downloaders/image_grabber.py +106 -0
- package/tools/downloaders/pdf_reader.py +163 -0
- package/tools/downloaders/pdf_screenshotter.py +240 -0
- package/tools/downloaders/screenshotter.py +58 -0
- package/tools/downloaders/web_reader.py +69 -0
- package/tools/downloaders/youtube_search.py +174 -0
- package/tools/logging/search_logger.py +334 -0
- package/tools/validators/__pycache__/archive_url.cpython-312.pyc +0 -0
- package/tools/validators/__pycache__/link_checker.cpython-312.pyc +0 -0
- package/tools/validators/archive_url.py +269 -0
- package/tools/validators/link_checker.py +45 -0
- package/workflow_orchestrator.py +337 -0
package/tools/downloaders/article_screenshotter.py

@@ -0,0 +1,388 @@
+#!/usr/bin/env python3
+"""
+Smart Article Screenshotter for VideoNut
+Takes screenshots of news articles with specific quotes highlighted.
+Can scroll to specific text, highlight it, and capture focused screenshots.
+
+USAGE:
+    # Find and highlight a specific quote
+    python article_screenshotter.py --url "https://example.com/article" --quote "exact words to find" --output "quote.png"
+
+    # Screenshot without highlighting
+    python article_screenshotter.py --url "https://example.com" --quote "text" --no-highlight --output "quote.png"
+"""
+
+import sys
+import os
+import argparse
+import time
+from random import uniform
+
+try:
+    from playwright.sync_api import sync_playwright
+except ImportError:
+    print("Error: Playwright not installed. Install with: pip install playwright && playwright install chromium")
+    sys.exit(1)
+
+
+def normalize_text(text):
+    """Normalize text for comparison - remove extra spaces, newlines."""
+    import re
+    return re.sub(r'\s+', ' ', text.strip().lower())
+
+
+def find_quote_in_page(page, quote):
+    """
+    Find the specific element containing the quote using multiple strategies.
+    Returns the element locator or None.
+    """
+    quote_normalized = normalize_text(quote)
+
+    # Strategy 1: Try exact text match with Playwright
+    print(f"   Strategy 1: Exact text match...")
+    locator = page.get_by_text(quote, exact=False)
+    if locator.count() > 0:
+        print(f"   ✅ Found with Strategy 1")
+        return locator.first
+
+    # Strategy 2: Try first few words (in case quote is long)
+    words = quote.split()
+    if len(words) > 5:
+        short_quote = ' '.join(words[:5])
+        print(f"   Strategy 2: First 5 words: '{short_quote}'...")
+        locator = page.get_by_text(short_quote, exact=False)
+        if locator.count() > 0:
+            print(f"   ✅ Found with Strategy 2")
+            return locator.first
+
+    # Strategy 3: JavaScript search across all text nodes
+    print(f"   Strategy 3: JavaScript deep search...")
+    element_handle = page.evaluate_handle('''(searchText) => {
+        const normalizeText = (t) => t.replace(/\\s+/g, ' ').trim().toLowerCase();
+        const searchNorm = normalizeText(searchText);
+
+        // Search in common content elements
+        const selectors = ['p', 'span', 'div', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'li', 'td', 'article'];
+
+        for (const selector of selectors) {
+            const elements = document.querySelectorAll(selector);
+            for (const el of elements) {
+                const text = normalizeText(el.innerText || '');
+                if (text.includes(searchNorm)) {
+                    return el;
+                }
+            }
+        }
+
+        // Fallback: search entire body
+        const allElements = document.querySelectorAll('*');
+        for (const el of allElements) {
+            if (el.innerText) {
+                const text = normalizeText(el.innerText);
+                if (text.includes(searchNorm) && el.innerText.length < 2000) {
+                    return el;
+                }
+            }
+        }
+
+        return null;
+    }''', quote)
+
+    if element_handle:
+        element = element_handle.as_element()
+        if element:
+            print(f"   ✅ Found with Strategy 3 (JavaScript)")
+            return element
+
+    print(f"   ❌ Quote not found with any strategy")
+    return None
+
+
+def take_quote_screenshot(url, output_path, quote=None, highlight=True, width=1280, height=900):
+    """
+    Take a screenshot of a webpage, focusing on a specific quote.
+
+    Args:
+        url: URL of the article
+        output_path: Where to save the screenshot
+        quote: Text to find and focus on (REQUIRED for meaningful screenshot)
+        highlight: Whether to highlight the found text
+        width: Viewport width
+        height: Viewport height
+
+    Returns:
+        Dict with success status and details
+    """
+    # Rate limiting
+    delay = uniform(1, 2)
+    print(f"⏳ Rate limiting: Waiting {delay:.2f} seconds...")
+    time.sleep(delay)
+
+    result = {
+        'success': False,
+        'quote_found': False,
+        'message': '',
+        'output_path': output_path
+    }
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        context = browser.new_context(
+            viewport={'width': width, 'height': height},
+            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        )
+        page = context.new_page()
+
+        # Set headers to appear like real browser
+        page.set_extra_http_headers({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+        })
+
+        try:
+            print(f"🌐 Navigating to {url}...")
+            # Use networkidle for better dynamic content handling
+            try:
+                page.goto(url, timeout=45000, wait_until='networkidle')
+            except:
+                # Fallback to domcontentloaded if networkidle times out
+                print("   ⚠️ networkidle timeout, using domcontentloaded...")
+                page.goto(url, timeout=30000, wait_until='domcontentloaded')
+
+            # Wait for dynamic content (increased from 3s to 5s for JS-heavy sites)
+            print("   ⏳ Waiting for dynamic content to load...")
+            page.wait_for_timeout(5000)
+
+            # Try to close cookie popups, modals, and ads
+            print("🧹 Closing popups...")
+            for selector in [
+                'button:has-text("Accept")',
+                'button:has-text("I Agree")',
+                'button:has-text("Got it")',
+                'button:has-text("Continue")',
+                '.close-button',
+                '[aria-label="Close"]',
+                '.modal-close',
+                '.popup-close',
+                '#close-btn'
+            ]:
+                try:
+                    if page.locator(selector).count() > 0:
+                        page.locator(selector).first.click(timeout=1000)
+                        page.wait_for_timeout(500)
+                except:
+                    pass
+
+            if quote:
+                print(f"🔍 Searching for quote: '{quote[:60]}{'...' if len(quote) > 60 else ''}'")
+
+                # Find the quote element
+                element = find_quote_in_page(page, quote)
+
+                if element:
+                    result['quote_found'] = True
+
+                    # Step 1: Try multiple scroll methods (some sites block certain approaches)
+                    print("📜 Scrolling quote to center of viewport...")
+
+                    # Method 1: scrollIntoView with block center (most reliable)
+                    try:
+                        page.evaluate('''(el) => {
+                            el.scrollIntoView({ behavior: 'instant', block: 'center', inline: 'nearest' });
+                        }''', element)
+                        page.wait_for_timeout(500)
+                        print("   ✅ Scroll method 1 (scrollIntoView) succeeded")
+                    except Exception as scroll_err:
+                        print(f"   ⚠️ Scroll method 1 failed: {scroll_err}")
+
+                        # Method 2: Manual scrollTo calculation as fallback
+                        try:
+                            page.evaluate('''(el) => {
+                                const rect = el.getBoundingClientRect();
+                                const scrollTop = window.pageYOffset + rect.top - (window.innerHeight / 2) + (rect.height / 2);
+                                window.scrollTo({ top: Math.max(0, scrollTop), behavior: 'instant' });
+                            }''', element)
+                            page.wait_for_timeout(500)
+                            print("   ✅ Scroll method 2 (scrollTo) succeeded")
+                        except Exception as scroll_err2:
+                            print(f"   ⚠️ Scroll method 2 also failed: {scroll_err2}")
+
+                    # Step 1.5: Verify element is now visible in viewport
+                    is_visible = page.evaluate('''(el) => {
+                        const rect = el.getBoundingClientRect();
+                        return rect.top >= 0 && rect.bottom <= window.innerHeight;
+                    }''', element)
+
+                    if not is_visible:
+                        print("   ⚠️ Element not fully visible, trying Playwright scroll...")
+                        try:
+                            element.scroll_into_view_if_needed()
+                            page.wait_for_timeout(500)
+                        except:
+                            pass
+
+                    # Step 2: Highlight the element
+                    if highlight:
+                        print("🎨 Highlighting quote...")
+                        page.evaluate('''(el) => {
+                            // Save original styles
+                            el.setAttribute('data-original-style', el.getAttribute('style') || '');
+
+                            // Apply highlight styles with !important to override site CSS
+                            el.style.setProperty('background-color', '#ffff00', 'important');
+                            el.style.setProperty('color', '#000000', 'important');
+                            el.style.setProperty('padding', '10px', 'important');
+                            el.style.setProperty('border-radius', '4px', 'important');
+                            el.style.setProperty('border', '4px solid #ff6600', 'important');
+                            el.style.setProperty('box-shadow', '0 0 30px rgba(255, 102, 0, 0.8)', 'important');
+                            el.style.setProperty('position', 'relative', 'important');
+                            el.style.setProperty('z-index', '99999', 'important');
+                            el.style.setProperty('display', 'block', 'important');
+                        }''', element)
+                        print("   ✅ Quote highlighted with yellow background + orange border")
+
+                    # Step 3: Wait for CSS to apply and re-render
+                    page.wait_for_timeout(800)
+
+                    # Step 4: Take the screenshot
+                    print("📸 Taking screenshot...")
+                    page.screenshot(path=output_path)
+                    result['success'] = True
+                    result['message'] = f"Quote found, centered, and captured: '{quote[:40]}...'"
+
+                else:
+                    # Quote NOT found - try fuzzy fallback
+                    print("⚠️ Exact quote not found. Trying fuzzy search...")
+
+                    # Try with just the first 3 words
+                    words = quote.split()
+                    if len(words) >= 3:
+                        fuzzy_quote = ' '.join(words[:3])
+                        fuzzy_element = find_quote_in_page(page, fuzzy_quote)
+
+                        if fuzzy_element:
+                            print(f"   ✅ Found partial match with: '{fuzzy_quote}'")
+
+                            # Scroll and highlight
+                            page.evaluate('''(el) => {
+                                const rect = el.getBoundingClientRect();
+                                const scrollTop = window.pageYOffset + rect.top - (window.innerHeight / 2);
+                                window.scrollTo({ top: scrollTop, behavior: 'instant' });
+                            }''', fuzzy_element)
+
+                            if highlight:
+                                page.evaluate('''(el) => {
+                                    el.style.backgroundColor = '#ffff00';
+                                    el.style.border = '3px solid #ff6600';
+                                    el.style.padding = '8px';
+                                }''', fuzzy_element)
+
+                            page.wait_for_timeout(300)
+                            page.screenshot(path=output_path)
+                            result['success'] = True
+                            result['quote_found'] = True
+                            result['message'] = f"Partial match found: '{fuzzy_quote}'"
+                        else:
+                            # Complete failure
+                            result['success'] = False
+                            result['quote_found'] = False
+                            result['message'] = f"ERROR: Quote not found on page: '{quote[:50]}...'"
+                            print(f"   ❌ {result['message']}")
+                            # Don't take useless screenshot
+                    else:
+                        result['success'] = False
+                        result['message'] = f"ERROR: Quote too short and not found: '{quote}'"
+
+            else:
+                # No quote provided - just screenshot the article content
+                print("📸 No quote specified. Taking article screenshot...")
+
+                # Try to find and scroll to main article content
+                for selector in ['article', '.article-content', '.story-content',
+                                 '.post-content', 'main', '#content', '.entry-content']:
+                    if page.locator(selector).count() > 0:
+                        page.locator(selector).first.scroll_into_view_if_needed()
+                        break
+
+                page.screenshot(path=output_path)
+                result['success'] = True
+                result['message'] = "Article screenshot captured (no specific quote)"
+
+            # Validate file was created
+            if result['success'] and os.path.exists(output_path):
+                size = os.path.getsize(output_path)
+                if size > 0:
+                    print(f"✅ Screenshot saved: {output_path} ({size:,} bytes)")
+                else:
+                    result['success'] = False
+                    result['message'] = "Screenshot file is empty"
+
+        except Exception as e:
+            result['message'] = f"Error: {str(e)}"
+            print(f"❌ Error: {e}")
+
+        finally:
+            browser.close()
+
+    return result
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Take screenshots of news articles with quote highlighting.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Find and highlight a specific quote (RECOMMENDED)
+  python article_screenshotter.py --url "https://timesofindia.com/article" --quote "PM Modi said this is important" --output "quote.png"
+
+  # Screenshot without highlighting
+  python article_screenshotter.py --url "https://example.com" --quote "important text" --no-highlight --output "quote.png"
+
+  # Just capture article (no specific quote)
+  python article_screenshotter.py --url "https://example.com" --output "article.png"
+
+NOTE: Always provide --quote for meaningful screenshots. Without it, you just get the page header.
+        """
+    )
+
+    parser.add_argument("--url", "-u", required=True, help="URL of the article")
+    parser.add_argument("--output", "-o", required=True, help="Output file path for screenshot")
+    parser.add_argument("--quote", "-q", help="Specific quote/text to find, center, and highlight (REQUIRED for useful screenshots)")
+    parser.add_argument("--no-highlight", action="store_true", help="Don't highlight the found text")
+    parser.add_argument("--width", "-w", type=int, default=1280, help="Viewport width (default: 1280)")
+    parser.add_argument("--height", "-H", type=int, default=900, help="Viewport height (default: 900)")
+
+    args = parser.parse_args()
+
+    # Warn if no quote provided
+    if not args.quote:
+        print("⚠️ WARNING: No --quote provided. Screenshot will just be the page header.")
+        print("   For useful screenshots, always provide the specific text you want to capture.")
+
+    # Ensure output directory exists
+    output_dir = os.path.dirname(args.output)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    result = take_quote_screenshot(
+        url=args.url,
+        output_path=args.output,
+        quote=args.quote,
+        highlight=not args.no_highlight,
+        width=args.width,
+        height=args.height
+    )
+
+    if result['success']:
+        print(f"\n✅ SUCCESS: {result['message']}")
+        sys.exit(0)
+    else:
+        print(f"\n❌ FAILED: {result['message']}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
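For orientation, a minimal sketch of driving the new screenshotter from Python rather than from the CLI shown in its docstring. It assumes the `tools/downloaders` directory is on `sys.path`; the function name, parameters, and result keys come from the diff above, while the URL, quote, and output path are placeholders.

```python
# Minimal sketch (assumptions noted above): call the screenshotter directly
# instead of via the command line. URL, quote, and output path are placeholders.
from article_screenshotter import take_quote_screenshot

result = take_quote_screenshot(
    url="https://example.com/article",
    output_path="assets/quote.png",
    quote="exact words to find",
    highlight=True,
)

# The function returns a dict with 'success', 'quote_found', 'message', 'output_path'.
if result["success"]:
    print("Saved:", result["output_path"])
else:
    print("Failed:", result["message"])
```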
package/tools/downloaders/caption_reader.py

@@ -0,0 +1,238 @@
+#!/usr/bin/env python3
+"""
+YouTube Caption/Transcript Reader for VideoNut
+Extracts captions from YouTube videos with optional timestamp display.
+"""
+
+import sys
+import argparse
+import json
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api.formatters import TextFormatter, JSONFormatter
+import re
+
+
+def extract_video_id(url):
+    """
+    Extract YouTube video ID from various URL formats
+    """
+    # Patterns for different YouTube URL formats
+    patterns = [
+        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11,12})',
+        r'(?:https?:\/\/)?(?:www\.)?youtu\.be\/([a-zA-Z0-9_-]{11,12})',
+        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/embed\/([a-zA-Z0-9_-]{11,12})',
+        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/v\/([a-zA-Z0-9_-]{11,12})',
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+
+    return None
+
+
+def format_timestamp(seconds):
+    """Convert seconds to HH:MM:SS format"""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+
+    if hours > 0:
+        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+    else:
+        return f"{minutes:02d}:{secs:02d}"
+
+
+def get_youtube_captions(url, languages=None, with_timestamps=False, search_term=None):
+    """
+    Get YouTube video captions/transcript
+
+    Args:
+        url: YouTube video URL
+        languages: List of preferred language codes
+        with_timestamps: If True, include timestamps with each line
+        search_term: If provided, only return lines containing this term (with timestamps)
+
+    Returns:
+        Formatted transcript string
+    """
+    if languages is None:
+        # Default to English and other common languages
+        languages = ['en', 'en-US', 'en-GB', 'hi', 'te', 'ta', 'mr', 'es', 'fr', 'de']
+
+    video_id = extract_video_id(url)
+
+    if not video_id:
+        print(f"Error: Could not extract video ID from URL: {url}", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        # Instantiate the API class
+        api = YouTubeTranscriptApi()
+
+        # Fetch the transcript data directly using the instance method
+        transcript_data = api.fetch(video_id, languages=languages)
+
+        # If searching for a term, filter and return with timestamps
+        if search_term:
+            search_lower = search_term.lower()
+            matches = []
+            for entry in transcript_data:
+                if search_lower in entry.text.lower():
+                    timestamp = format_timestamp(entry.start)
+                    duration = getattr(entry, 'duration', 0)
+                    end_timestamp = format_timestamp(entry.start + duration)
+                    matches.append({
+                        'timestamp': timestamp,
+                        'end_timestamp': end_timestamp,
+                        'start_seconds': entry.start,
+                        'text': entry.text
+                    })
+
+            if not matches:
+                return f"No matches found for '{search_term}' in transcript."
+
+            output = [f"\n🔍 Found {len(matches)} matches for '{search_term}':\n"]
+            for match in matches:
+                output.append(f"[{match['timestamp']}] {match['text']}")
+            output.append(f"\n📍 Suggested clip range: {matches[0]['timestamp']} - {matches[-1]['end_timestamp']}")
+            return '\n'.join(output)
+
+        # If with_timestamps, format each line with timestamp
+        if with_timestamps:
+            output = []
+            output.append(f"\n📝 Transcript with Timestamps:\n")
+            output.append("=" * 60)
+            for entry in transcript_data:
+                timestamp = format_timestamp(entry.start)
+                output.append(f"[{timestamp}] {entry.text}")
+            return '\n'.join(output)
+
+        # Default: plain text format - join all text entries
+        plain_text = ' '.join([entry.text for entry in transcript_data])
+
+        return plain_text
+
+    except Exception as e:
+        print(f"Error retrieving captions: {str(e)}", file=sys.stderr)
+        sys.exit(1)
+
+
+def find_timestamp_for_quote(url, quote, context_seconds=30):
+    """
+    Find the timestamp where a specific quote appears in the video.
+    Returns the start and end timestamps for a clip containing that quote.
+
+    Args:
+        url: YouTube video URL
+        quote: The quote to search for
+        context_seconds: How many seconds of context to include before/after
+
+    Returns:
+        Dict with start_time, end_time, and surrounding text
+    """
+    video_id = extract_video_id(url)
+    if not video_id:
+        return None
+
+    try:
+        api = YouTubeTranscriptApi()
+        # Use fetch to get the default transcript or specify languages
+        transcript_data = api.fetch(video_id)
+
+        quote_lower = quote.lower()
+
+        for i, entry in enumerate(transcript_data):
+            if quote_lower in entry.text.lower():
+                # Found the quote
+                start_time = max(0, entry.start - context_seconds)
+                end_time = entry.start + getattr(entry, 'duration', 5) + context_seconds
+
+                # Get surrounding context
+                context_entries = []
+                for j in range(max(0, i-3), min(len(transcript_data), i+4)):
+                    context_entries.append({
+                        'timestamp': format_timestamp(transcript_data[j].start),
+                        'text': transcript_data[j].text
+                    })
+
+                return {
+                    'found': True,
+                    'quote': entry.text,
+                    'timestamp': format_timestamp(entry.start),
+                    'clip_start': format_timestamp(start_time),
+                    'clip_end': format_timestamp(end_time),
+                    'context': context_entries
+                }
+
+        return {'found': False, 'message': f"Quote not found: {quote}"}
+
+    except Exception as e:
+        return {'found': False, 'message': str(e)}
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Extract captions from YouTube videos with optional timestamps.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Get plain transcript
+  python caption_reader.py --url "https://youtube.com/watch?v=xxx"
+
+  # Get transcript with timestamps
+  python caption_reader.py --url "https://youtube.com/watch?v=xxx" --timestamps
+
+  # Search for specific term and get timestamps
+  python caption_reader.py --url "https://youtube.com/watch?v=xxx" --search "electoral bonds"
+
+  # Find timestamp for a specific quote
+  python caption_reader.py --url "https://youtube.com/watch?v=xxx" --find-quote "corruption" --json
+        """
+    )
+
+    parser.add_argument("--url", required=True, help="YouTube video URL")
+    parser.add_argument("--languages", nargs="*", default=None,
+                        help="Preferred language codes (e.g., en hi te). Default: en and Indian languages")
+    parser.add_argument("--timestamps", "-t", action="store_true",
+                        help="Include timestamps with each line")
+    parser.add_argument("--search", "-s", help="Search for specific term and show timestamps")
+    parser.add_argument("--find-quote", "-f", help="Find exact timestamp for a quote")
+    parser.add_argument("--context", "-c", type=int, default=30,
+                        help="Seconds of context around found quote (default: 30)")
+    parser.add_argument("--json", "-j", action="store_true", help="Output as JSON")
+
+    args = parser.parse_args()
+
+    if args.find_quote:
+        # Find timestamp for specific quote
+        result = find_timestamp_for_quote(args.url, args.find_quote, args.context)
+        if args.json:
+            print(json.dumps(result, indent=2, ensure_ascii=False))
+        else:
+            if result.get('found'):
+                print(f"\n✅ Quote Found!")
+                print(f"   Timestamp: {result['timestamp']}")
+                print(f"   Text: {result['quote']}")
+                print(f"\n🎬 Suggested Clip:")
+                print(f"   Start: {result['clip_start']}")
+                print(f"   End: {result['clip_end']}")
+                print(f"\n📝 Context:")
+                for entry in result['context']:
+                    print(f"   [{entry['timestamp']}] {entry['text']}")
+            else:
+                print(f"❌ {result.get('message', 'Quote not found')}")
+    else:
+        # Get transcript
+        captions = get_youtube_captions(
+            args.url,
+            args.languages,
+            with_timestamps=args.timestamps,
+            search_term=args.search
+        )
+        print(captions)
+
+
+if __name__ == "__main__":
+    main()
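Likewise, a minimal sketch of using the caption reader as a library rather than via its CLI: the function names, parameters, and return keys are taken from the diff above, while the video ID and search phrase are placeholders (note that `get_youtube_captions` exits the process on errors, as shown in the diff).

```python
# Minimal sketch (assumptions noted above): video ID and phrase are placeholders.
from caption_reader import get_youtube_captions, find_timestamp_for_quote

# Plain transcript as a single string.
transcript = get_youtube_captions("https://www.youtube.com/watch?v=XXXXXXXXXXX")
print(transcript[:200])

# Locate a phrase and get a suggested clip window around it.
hit = find_timestamp_for_quote(
    "https://www.youtube.com/watch?v=XXXXXXXXXXX",
    "electoral bonds",
    context_seconds=30,
)
if hit and hit.get("found"):
    print(f"Clip range: {hit['clip_start']} - {hit['clip_end']}")
```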