videonut 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/.antigravity/config.toml +8 -0
  2. package/.claude/commands/archivist.toml +12 -0
  3. package/.claude/commands/director.toml +12 -0
  4. package/.claude/commands/eic.toml +12 -0
  5. package/.claude/commands/investigator.toml +12 -0
  6. package/.claude/commands/prompt.toml +12 -0
  7. package/.claude/commands/scavenger.toml +12 -0
  8. package/.claude/commands/scout.toml +12 -0
  9. package/.claude/commands/scriptwriter.toml +12 -0
  10. package/.claude/commands/seo.toml +12 -0
  11. package/.claude/commands/thumbnail.toml +12 -0
  12. package/.claude/commands/topic_scout.toml +12 -0
  13. package/.gemini/commands/archivist.toml +12 -0
  14. package/.gemini/commands/director.toml +12 -0
  15. package/.gemini/commands/eic.toml +12 -0
  16. package/.gemini/commands/investigator.toml +12 -0
  17. package/.gemini/commands/prompt.toml +12 -0
  18. package/.gemini/commands/scavenger.toml +12 -0
  19. package/.gemini/commands/scout.toml +12 -0
  20. package/.gemini/commands/scriptwriter.toml +12 -0
  21. package/.gemini/commands/seo.toml +12 -0
  22. package/.gemini/commands/thumbnail.toml +12 -0
  23. package/.gemini/commands/topic_scout.toml +12 -0
  24. package/.qwen/commands/archivist.toml +12 -0
  25. package/.qwen/commands/director.toml +12 -0
  26. package/.qwen/commands/eic.toml +12 -0
  27. package/.qwen/commands/investigator.toml +12 -0
  28. package/.qwen/commands/prompt.toml +12 -0
  29. package/.qwen/commands/scavenger.toml +12 -0
  30. package/.qwen/commands/scout.toml +12 -0
  31. package/.qwen/commands/scriptwriter.toml +12 -0
  32. package/.qwen/commands/seo.toml +12 -0
  33. package/.qwen/commands/thumbnail.toml +12 -0
  34. package/.qwen/commands/topic_scout.toml +12 -0
  35. package/USER_GUIDE.md +90 -0
  36. package/agents/core/eic.md +772 -0
  37. package/agents/core/prompt_agent.md +264 -0
  38. package/agents/core/self_review_protocol.md +143 -0
  39. package/agents/creative/director.md +247 -0
  40. package/agents/creative/scriptwriter.md +208 -0
  41. package/agents/creative/seo.md +316 -0
  42. package/agents/creative/thumbnail.md +285 -0
  43. package/agents/research/investigator.md +395 -0
  44. package/agents/research/topic_scout.md +419 -0
  45. package/agents/technical/archivist.md +289 -0
  46. package/agents/technical/scavenger.md +248 -0
  47. package/bin/videonut.js +389 -107
  48. package/config.yaml +62 -0
  49. package/docs/AUDIT_REPORT.md +364 -0
  50. package/docs/LIFECYCLE.md +651 -0
  51. package/docs/scriptwriter.md +43 -0
  52. package/file_validator.py +187 -0
  53. package/memory/short_term/asset_manifest.md +64 -0
  54. package/memory/short_term/investigation_dossier.md +31 -0
  55. package/memory/short_term/master_script.md +51 -0
  56. package/package.json +16 -3
  57. package/requirements.txt +9 -0
  58. package/scripts/setup.js +8 -0
  59. package/tools/check_env.py +77 -0
  60. package/tools/downloaders/__pycache__/caption_reader.cpython-312.pyc +0 -0
  61. package/tools/downloaders/__pycache__/image_grabber.cpython-312.pyc +0 -0
  62. package/tools/downloaders/__pycache__/pdf_reader.cpython-312.pyc +0 -0
  63. package/tools/downloaders/__pycache__/screenshotter.cpython-312.pyc +0 -0
  64. package/tools/downloaders/__pycache__/web_reader.cpython-312.pyc +0 -0
  65. package/tools/downloaders/article_screenshotter.py +388 -0
  66. package/tools/downloaders/caption_reader.py +238 -0
  67. package/tools/downloaders/clip_grabber.py +83 -0
  68. package/tools/downloaders/image_grabber.py +106 -0
  69. package/tools/downloaders/pdf_reader.py +163 -0
  70. package/tools/downloaders/pdf_screenshotter.py +240 -0
  71. package/tools/downloaders/screenshotter.py +58 -0
  72. package/tools/downloaders/web_reader.py +69 -0
  73. package/tools/downloaders/youtube_search.py +174 -0
  74. package/tools/logging/search_logger.py +334 -0
  75. package/tools/validators/__pycache__/archive_url.cpython-312.pyc +0 -0
  76. package/tools/validators/__pycache__/link_checker.cpython-312.pyc +0 -0
  77. package/tools/validators/archive_url.py +269 -0
  78. package/tools/validators/link_checker.py +45 -0
  79. package/workflow_orchestrator.py +337 -0
package/tools/downloaders/clip_grabber.py
@@ -0,0 +1,83 @@
+ import os
+ import sys
+ import shutil
+ import platform
+ import subprocess
+ import argparse
+
+ def download_clip(url, start_time, end_time, output_path, ffmpeg_path):
+     """
+     Downloads a specific clip from a YouTube video using yt-dlp.
+     """
+     # Ensure the output directory exists if it's not the current directory
+     dir_name = os.path.dirname(output_path)
+     if dir_name:
+         os.makedirs(dir_name, exist_ok=True)
+
+     # Construct the yt-dlp command:
+     # --download-sections "*start-end" downloads only that time range
+     # --force-keyframes-at-cuts ensures precise cutting (requires ffmpeg)
+     cmd = [
+         "yt-dlp",
+         "--verbose",
+         "--download-sections", f"*{start_time}-{end_time}",
+         "--force-keyframes-at-cuts",
+         "--ffmpeg-location", ffmpeg_path,
+         "-o", output_path,
+         url
+     ]
+
+     print(f"Executing: {' '.join(cmd)}")
+
+     try:
+         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+         print("Download successful.")
+         print(result.stdout)
+
+         # Validate that the file was created and has content
+         if os.path.exists(output_path):
+             file_size = os.path.getsize(output_path)
+             if file_size == 0:
+                 print(f"Error: Downloaded file is empty: {output_path}")
+                 sys.exit(1)
+             else:
+                 print(f"File validation: {output_path} created with size {file_size} bytes")
+         else:
+             print(f"Error: Downloaded file does not exist: {output_path}")
+             sys.exit(1)
+
+     except subprocess.CalledProcessError as e:
+         print("Error during download:")
+         print(e.stderr)
+         sys.exit(1)
+     except FileNotFoundError:
+         print("Error: yt-dlp not found. Please install it (pip install yt-dlp) and ensure it's on your PATH.")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Download a video clip.")
+     parser.add_argument("--url", required=True, help="Video URL")
+     parser.add_argument("--start", required=True, help="Start time (e.g., 00:00:10 or 10)")
+     parser.add_argument("--end", required=True, help="End time (e.g., 00:00:20 or 20)")
+     parser.add_argument("--output", required=True, help="Output file path")
+
+     # Try to find ffmpeg on the system PATH first
+     default_ffmpeg = shutil.which("ffmpeg")
+     if not default_ffmpeg:
+         # Fall back to a local bin folder relative to this script.
+         # Assumes structure: tools/downloaders/clip_grabber.py -> tools/bin/ffmpeg.exe (Windows) or tools/bin/ffmpeg (Unix)
+         base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+         # Determine the appropriate executable name for the platform
+         ffmpeg_exe = "ffmpeg.exe" if platform.system().lower() == "windows" else "ffmpeg"
+         default_ffmpeg = os.path.join(base_dir, "bin", ffmpeg_exe)
+
+         # If the fallback path doesn't exist, warn the user
+         if not os.path.exists(default_ffmpeg):
+             print(f"Warning: ffmpeg not found on PATH or at expected location: {default_ffmpeg}")
+             print("Please install ffmpeg or place it in the tools/bin/ directory.")
+
+     parser.add_argument("--ffmpeg", default=default_ffmpeg, help="Path to ffmpeg executable")
+
+     args = parser.parse_args()
+
+     download_clip(args.url, args.start, args.end, args.output, args.ffmpeg)
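
For reference, a typical invocation of this tool looks like the following; the URL, timestamps, and output path are hypothetical placeholders, not values from the package:

    python package/tools/downloaders/clip_grabber.py \
        --url "https://www.youtube.com/watch?v=VIDEO_ID" \
        --start 00:01:05 --end 00:01:20 \
        --output clips/quote.mp4

As the comments in the script note, --download-sections restricts the fetch to the requested range, while --force-keyframes-at-cuts trades a re-encode at the cut points for precise clip boundaries.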
package/tools/downloaders/image_grabber.py
@@ -0,0 +1,106 @@
+ import os
+ import sys
+ import requests
+ import argparse
+ import mimetypes
+ from urllib.parse import urlparse
+
+ def is_safe_image_type(content_type, url):
+     """
+     Check if the content type is a safe image type.
+     """
+     safe_types = {
+         'image/jpeg', 'image/jpg', 'image/png', 'image/gif',
+         'image/webp', 'image/bmp', 'image/svg+xml', 'image/tiff'
+     }
+
+     # Check the content-type header (ignore parameters such as "; charset=...")
+     if content_type and content_type.split(';')[0].strip().lower() in safe_types:
+         return True
+
+     # Fallback: check the file extension from the URL
+     parsed_url = urlparse(url)
+     file_ext = os.path.splitext(parsed_url.path)[1].lower()
+     mime_type, _ = mimetypes.guess_type(f"dummy{file_ext}")
+
+     if mime_type and mime_type in safe_types:
+         return True
+
+     return False
+
+ def get_file_size(response):
+     """
+     Get the file size from the response headers.
+     """
+     content_length = response.headers.get('content-length')
+     if content_length:
+         return int(content_length)
+     return 0
+
+ def download_image(url, output_path):
+     """
+     Downloads an image from a URL with security validation.
+     """
+     # Ensure the output directory exists if it's not the current directory
+     dir_name = os.path.dirname(output_path)
+     if dir_name:
+         os.makedirs(dir_name, exist_ok=True)
+
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+     }
+
+     try:
+         print(f"Downloading image from: {url}")
+
+         # First, make a HEAD request to check content type and size
+         head_response = requests.head(url, headers=headers, timeout=10)
+         content_type = head_response.headers.get('content-type', '').lower()
+
+         # Validate content type
+         if not is_safe_image_type(content_type, url):
+             print(f"Security Error: Content type '{content_type}' is not a safe image type.")
+             sys.exit(1)
+
+         # Check file size (limit to 50MB)
+         file_size = get_file_size(head_response)
+         if file_size > 50 * 1024 * 1024:  # 50MB
+             print(f"Security Error: File size {file_size} bytes exceeds 50MB limit.")
+             sys.exit(1)
+
+         # Actually download the file
+         response = requests.get(url, headers=headers, stream=True, timeout=10)
+         response.raise_for_status()
+
+         # Double-check content type after download
+         downloaded_content_type = response.headers.get('content-type', '').lower()
+         if not is_safe_image_type(downloaded_content_type, url):
+             print(f"Security Error: Downloaded content type '{downloaded_content_type}' is not a safe image type.")
+             sys.exit(1)
+
+         # Write the file in chunks with size validation
+         total_size = 0
+         with open(output_path, 'wb') as f:
+             for chunk in response.iter_content(chunk_size=8192):
+                 if chunk:  # Filter out keep-alive chunks
+                     total_size += len(chunk)
+                     if total_size > 50 * 1024 * 1024:  # 50MB limit
+                         print("Security Error: Downloaded file exceeds 50MB limit.")
+                         f.close()  # Close before removing so cleanup also works on Windows
+                         os.remove(output_path)  # Clean up the partial file
+                         sys.exit(1)
+                     f.write(chunk)
+
+         print(f"Successfully saved to {output_path}")
+
+     except Exception as e:
+         print(f"Failed to download image: {e}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Download an image.")
+     parser.add_argument("--url", required=True, help="Image URL")
+     parser.add_argument("--output", required=True, help="Output file path")
+
+     args = parser.parse_args()
+
+     download_image(args.url, args.output)
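
A minimal sketch of usage (the URL and output path are hypothetical):

    python package/tools/downloaders/image_grabber.py \
        --url "https://example.com/photo.jpg" \
        --output assets/photo.jpg

Note that the 50MB cap is enforced twice: once against the content-length reported by the HEAD request, and again against the bytes actually received, since servers can omit or misreport content-length.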
package/tools/downloaders/pdf_reader.py
@@ -0,0 +1,163 @@
+ import sys
+ import requests
+ import io
+ import time
+ from random import uniform
+ from pypdf import PdfReader
+ import argparse
+ import re
+
+ def read_pdf(url, search_term=None, page_number=None):
+     """
+     Read a PDF from a URL with optional search and page selection.
+
+     Args:
+         url: URL of the PDF
+         search_term: Optional term to search for in the PDF
+         page_number: Optional specific page to read (1-indexed)
+     """
+     # Add a random delay to implement rate limiting
+     delay = uniform(1, 3)
+     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+     time.sleep(delay)
+
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+     }
+
+     try:
+         response = requests.get(url, headers=headers, timeout=30)
+         response.raise_for_status()
+
+         f = io.BytesIO(response.content)
+         reader = PdfReader(f)
+         total_pages = len(reader.pages)
+
+         print(f"📄 PDF loaded: {total_pages} pages")
+
+         # If a specific page was requested
+         if page_number:
+             if 1 <= page_number <= total_pages:
+                 text = reader.pages[page_number - 1].extract_text()
+                 print(f"\n--- Page {page_number} of {total_pages} ---")
+                 print(text)
+                 return
+             else:
+                 print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
+                 sys.exit(1)
+
+         # If a search term was provided, find all occurrences
+         if search_term:
+             print(f"🔍 Searching for: '{search_term}'")
+             matches = []
+             search_lower = search_term.lower()
+
+             for i, page in enumerate(reader.pages):
+                 page_text = page.extract_text()
+                 if page_text and search_lower in page_text.lower():
+                     # Find the context around the match
+                     lines = page_text.split('\n')
+                     for j, line in enumerate(lines):
+                         if search_lower in line.lower():
+                             # Get surrounding context (2 lines before/after)
+                             context_start = max(0, j - 2)
+                             context_end = min(len(lines), j + 3)
+                             context = '\n'.join(lines[context_start:context_end])
+                             matches.append({
+                                 'page': i + 1,
+                                 'line': line.strip(),
+                                 'context': context
+                             })
+
+             if matches:
+                 print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
+                 for idx, match in enumerate(matches[:10]):  # Limit to first 10 matches
+                     print('=' * 60)
+                     print(f"📍 Match {idx+1} - Page {match['page']}")
+                     print('=' * 60)
+                     print(f"Line: {match['line']}")
+                     print("\nContext:")
+                     print(match['context'])
+                     print()
+
+                 if len(matches) > 10:
+                     print(f"... and {len(matches) - 10} more matches")
+
+                 # Suggest the best page for a screenshot
+                 best_page = matches[0]['page']
+                 print(f"\n📸 Suggested page for screenshot: Page {best_page}")
+                 print(f"   Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
+             else:
+                 print(f"❌ No matches found for '{search_term}'")
+             return
+
+         # Default: smart extraction with priority for the first and last pages
+         MAX_PAGES = 15
+         MAX_CHARS = 20000
+
+         text = ""
+         pages_to_read = []
+
+         if total_pages <= MAX_PAGES:
+             pages_to_read = list(range(total_pages))
+         else:
+             # Smart selection: first 7 + last 4
+             first_pages = list(range(min(7, total_pages)))
+             last_pages = list(range(max(0, total_pages - 4), total_pages))
+             pages_to_read = sorted(set(first_pages + last_pages))
+
+             print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")
+
+         for i in pages_to_read:
+             page_text = reader.pages[i].extract_text()
+             if page_text:
+                 text += f"\n--- Page {i+1} ---\n{page_text}"
+
+         # Smart truncation that preserves the intro and conclusion
+         if len(text) > MAX_CHARS:
+             intro = text[:8000]
+             outro = text[-8000:]
+             truncated = len(text) - len(intro) - len(outro)
+             print("--- PDF CONTENT START ---")
+             print(intro)
+             print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
+             print("--- PDF CONTENT END ---")
+             print(outro)
+         else:
+             print(text)
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading PDF: {e}")
+         sys.exit(1)
+     except Exception as e:
+         print(f"Error reading PDF: {e}")
+         sys.exit(1)
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Read and search PDF documents from URLs.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Read PDF with smart extraction (first + last pages)
+   python pdf_reader.py --url "https://example.com/report.pdf"
+
+   # Search for a specific term in the PDF
+   python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"
+
+   # Read a specific page
+   python pdf_reader.py --url "https://example.com/report.pdf" --page 5
+         """
+     )
+
+     parser.add_argument("--url", required=True, help="URL of the PDF document")
+     parser.add_argument("--search", "-s", help="Search for a specific term and show context")
+     parser.add_argument("--page", "-p", type=int, help="Read a specific page number (1-indexed)")
+
+     args = parser.parse_args()
+     read_pdf(args.url, search_term=args.search, page_number=args.page)
+
+
+ if __name__ == "__main__":
+     main()
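
To make the smart-selection branch concrete, here is the same rule evaluated for a hypothetical 30-page document (the page count is illustrative only):

    # First 7 pages + last 4 pages, 0-indexed internally
    total_pages = 30
    first_pages = list(range(min(7, total_pages)))                  # [0, 1, 2, 3, 4, 5, 6]
    last_pages = list(range(max(0, total_pages - 4), total_pages))  # [26, 27, 28, 29]
    pages_to_read = sorted(set(first_pages + last_pages))
    print([p + 1 for p in pages_to_read])  # pages 1-7 and 27-30

With MAX_CHARS = 20000 and the 8,000-character intro/outro windows, the truncation message reports len(text) - 16000 characters dropped from the middle.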
package/tools/downloaders/pdf_screenshotter.py
@@ -0,0 +1,240 @@
+ #!/usr/bin/env python3
+ """
+ PDF Page Screenshotter for VideoNut
+ Takes screenshots of specific PDF pages, optionally highlighting search terms.
+ Uses Playwright to render PDFs in a browser for high-quality screenshots.
+
+ USAGE:
+     # Screenshot a specific page from a PDF
+     python pdf_screenshotter.py --url "https://example.com/report.pdf" --page 3 --output "page3.png"
+
+     # Search for a term and screenshot the page where it's found
+     python pdf_screenshotter.py --url "https://example.com/report.pdf" --search "Prime Minister" --output "pm_quote.png"
+ """
+
+ import sys
+ import os
+ import argparse
+ import requests
+ import io
+ import time
+ from random import uniform
+ from pypdf import PdfReader
+
+ try:
+     from playwright.sync_api import sync_playwright
+ except ImportError:
+     print("Error: Playwright not installed. Install with: pip install playwright && playwright install chromium")
+     sys.exit(1)
+
+
+ def download_pdf_to_temp(url, temp_path):
+     """Download a PDF to a temporary file."""
+     delay = uniform(1, 2)
+     print(f"⏳ Rate limiting: Waiting {delay:.2f} seconds...")
+     time.sleep(delay)
+
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+     }
+
+     response = requests.get(url, headers=headers, timeout=30)
+     response.raise_for_status()
+
+     with open(temp_path, 'wb') as f:
+         f.write(response.content)
+
+     return temp_path
+
+
+ def find_page_with_term(pdf_path, search_term):
+     """Find the first page containing the search term."""
+     with open(pdf_path, 'rb') as f:
+         reader = PdfReader(f)
+         search_lower = search_term.lower()
+
+         for i, page in enumerate(reader.pages):
+             page_text = page.extract_text()
+             if page_text and search_lower in page_text.lower():
+                 return i + 1  # Return a 1-indexed page number
+
+     return None
+
+
+ def screenshot_pdf_page(pdf_path, page_number, output_path, search_term=None, width=1280, height=1600):
+     """
+     Take a screenshot of a specific PDF page using browser rendering.
+
+     Args:
+         pdf_path: Local path to the PDF file
+         page_number: Page to screenshot (1-indexed)
+         output_path: Where to save the screenshot
+         search_term: Optional term to highlight on the page
+         width: Viewport width
+         height: Viewport height
+     """
+     result = {
+         'success': False,
+         'page': page_number,
+         'search_found': False,
+         'message': '',
+         'output_path': output_path
+     }
+
+     # Verify the PDF exists and the page is valid
+     with open(pdf_path, 'rb') as f:
+         reader = PdfReader(f)
+         total_pages = len(reader.pages)
+
+         if page_number < 1 or page_number > total_pages:
+             result['message'] = f"Page {page_number} not found. PDF has {total_pages} pages."
+             return result
+
+     with sync_playwright() as p:
+         browser = p.chromium.launch(headless=True)
+         context = browser.new_context(
+             viewport={'width': width, 'height': height}
+         )
+         page = context.new_page()
+
+         try:
+             # Open the PDF in the browser using the file:// protocol
+             abs_path = os.path.abspath(pdf_path)
+             pdf_url = f"file:///{abs_path.replace(os.sep, '/')}"
+
+             print("🌐 Opening PDF in browser...")
+             page.goto(pdf_url, timeout=30000, wait_until='networkidle')
+
+             # Wait for the PDF to render
+             page.wait_for_timeout(3000)
+
+             # Navigate to the specific page;
+             # most PDF viewers honor #page=N in the URL
+             if page_number > 1:
+                 page.goto(f"{pdf_url}#page={page_number}", timeout=30000)
+                 page.wait_for_timeout(2000)
+
+             # If a search term was provided, try to find and highlight it
+             if search_term:
+                 print(f"🔍 Searching for '{search_term}'...")
+                 # Use the browser's find function (Ctrl+F simulation);
+                 # this is a workaround since direct PDF text selection is complex
+                 page.keyboard.press("Control+f")
+                 page.wait_for_timeout(500)
+                 page.keyboard.type(search_term)
+                 page.wait_for_timeout(1000)
+                 page.keyboard.press("Escape")
+                 result['search_found'] = True
+
+             # Take the screenshot
+             print(f"📸 Taking screenshot of page {page_number}...")
+             page.screenshot(path=output_path, full_page=False)
+
+             if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                 result['success'] = True
+                 result['message'] = f"Screenshot saved: {output_path}"
+                 print(f"✅ {result['message']}")
+             else:
+                 result['message'] = "Screenshot file is empty or was not created"
+
+         except Exception as e:
+             result['message'] = f"Error: {str(e)}"
+             print(f"❌ {result['message']}")
+
+         finally:
+             browser.close()
+
+     return result
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Screenshot specific pages from PDF documents.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Screenshot a specific page
+   python pdf_screenshotter.py --url "https://example.com/report.pdf" --page 3 --output "page3.png"
+
+   # Auto-find the page with a search term and screenshot it
+   python pdf_screenshotter.py --url "https://example.com/report.pdf" --search "Prime Minister" --output "pm_quote.png"
+
+   # Screenshot a local PDF file
+   python pdf_screenshotter.py --file "report.pdf" --page 5 --output "page5.png"
+         """
+     )
+
+     parser.add_argument("--url", "-u", help="URL of the PDF document")
+     parser.add_argument("--file", "-f", help="Local path to a PDF file")
+     parser.add_argument("--page", "-p", type=int, help="Page number to screenshot (1-indexed)")
+     parser.add_argument("--search", "-s", help="Search for a term and screenshot that page")
+     parser.add_argument("--output", "-o", required=True, help="Output file path for the screenshot")
+     parser.add_argument("--width", type=int, default=1280, help="Viewport width (default: 1280)")
+     parser.add_argument("--height", type=int, default=1600, help="Viewport height (default: 1600)")
+
+     args = parser.parse_args()
+
+     # Validate inputs
+     if not args.url and not args.file:
+         print("Error: Either --url or --file must be provided")
+         sys.exit(1)
+
+     if not args.page and not args.search:
+         print("Error: Either --page or --search must be provided")
+         sys.exit(1)
+
+     # Ensure the output directory exists
+     output_dir = os.path.dirname(args.output)
+     if output_dir:
+         os.makedirs(output_dir, exist_ok=True)
+
+     # Get the PDF file (download it if a URL was provided)
+     if args.url:
+         temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.temp')
+         os.makedirs(temp_dir, exist_ok=True)
+         pdf_path = os.path.join(temp_dir, 'temp_pdf.pdf')
+
+         print(f"📥 Downloading PDF from {args.url}...")
+         try:
+             download_pdf_to_temp(args.url, pdf_path)
+             print("✅ PDF downloaded")
+         except Exception as e:
+             print(f"❌ Failed to download PDF: {e}")
+             sys.exit(1)
+     else:
+         pdf_path = args.file
+         if not os.path.exists(pdf_path):
+             print(f"Error: File not found: {pdf_path}")
+             sys.exit(1)
+
+     # Determine the page number
+     page_number = args.page
+     if args.search:
+         print(f"🔍 Searching for '{args.search}' in PDF...")
+         found_page = find_page_with_term(pdf_path, args.search)
+         if found_page:
+             page_number = found_page
+             print(f"✅ Found '{args.search}' on page {page_number}")
+         else:
+             print(f"❌ '{args.search}' not found in PDF")
+             sys.exit(1)
+
+     # Take the screenshot
+     result = screenshot_pdf_page(
+         pdf_path,
+         page_number,
+         args.output,
+         search_term=args.search,
+         width=args.width,
+         height=args.height
+     )
+
+     if result['success']:
+         sys.exit(0)
+     else:
+         print(f"❌ Failed: {result['message']}")
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
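
Because screenshot_pdf_page returns a result dict rather than raising, it can also be driven directly from Python; a minimal sketch, assuming a local report.pdf (both filenames here are hypothetical):

    result = screenshot_pdf_page("report.pdf", page_number=3, output_path="page3.png")
    if not result['success']:
        print(result['message'])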
package/tools/downloaders/screenshotter.py
@@ -0,0 +1,58 @@
+ import sys
+ import os
+ import argparse
+ import time
+ from random import uniform
+ from playwright.sync_api import sync_playwright
+
+ def take_screenshot(url, output_path):
+     # Add a random delay to implement rate limiting
+     delay = uniform(1, 3)  # Random delay between 1-3 seconds
+     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+     time.sleep(delay)
+
+     with sync_playwright() as p:
+         browser = p.chromium.launch()
+         page = browser.new_page()
+
+         # Set additional headers to appear more like a real user
+         page.set_extra_http_headers({
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+             "Accept-Language": "en-US,en;q=0.5",
+             "Accept-Encoding": "gzip, deflate",
+             "Connection": "keep-alive",
+             "Upgrade-Insecure-Requests": "1",
+         })
+
+         try:
+             print(f"Navigating to {url}...")
+             page.goto(url, timeout=30000)
+             # Wait a bit for dynamic content (e.g., Twitter embeds)
+             page.wait_for_timeout(2000)
+
+             page.screenshot(path=output_path, full_page=False)
+
+             # Validate that the file was created and has content
+             if os.path.exists(output_path):
+                 file_size = os.path.getsize(output_path)
+                 if file_size == 0:
+                     print(f"Error: Screenshot file is empty: {output_path}")
+                     sys.exit(1)
+                 else:
+                     print(f"File validation: {output_path} created with size {file_size} bytes")
+             else:
+                 print(f"Error: Screenshot file does not exist: {output_path}")
+                 sys.exit(1)
+
+         except Exception as e:
+             print(f"Error taking screenshot: {e}")
+             sys.exit(1)
+         finally:
+             browser.close()
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--url", required=True)
+     parser.add_argument("--output", required=True)
+     args = parser.parse_args()
+     take_screenshot(args.url, args.output)
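
A minimal invocation, with a hypothetical URL and output path:

    python package/tools/downloaders/screenshotter.py \
        --url "https://example.com/article" \
        --output shots/article.png

Unlike image_grabber.py, this tool renders the page in headless Chromium first, so JavaScript-driven embeds get roughly two seconds to load before the viewport is captured.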