videonut 1.0.1 → 1.1.0
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
- package/.antigravity/config.toml +8 -0
- package/.claude/commands/archivist.toml +12 -0
- package/.claude/commands/director.toml +12 -0
- package/.claude/commands/eic.toml +12 -0
- package/.claude/commands/investigator.toml +12 -0
- package/.claude/commands/prompt.toml +12 -0
- package/.claude/commands/scavenger.toml +12 -0
- package/.claude/commands/scout.toml +12 -0
- package/.claude/commands/scriptwriter.toml +12 -0
- package/.claude/commands/seo.toml +12 -0
- package/.claude/commands/thumbnail.toml +12 -0
- package/.claude/commands/topic_scout.toml +12 -0
- package/.gemini/commands/archivist.toml +12 -0
- package/.gemini/commands/director.toml +12 -0
- package/.gemini/commands/eic.toml +12 -0
- package/.gemini/commands/investigator.toml +12 -0
- package/.gemini/commands/prompt.toml +12 -0
- package/.gemini/commands/scavenger.toml +12 -0
- package/.gemini/commands/scout.toml +12 -0
- package/.gemini/commands/scriptwriter.toml +12 -0
- package/.gemini/commands/seo.toml +12 -0
- package/.gemini/commands/thumbnail.toml +12 -0
- package/.gemini/commands/topic_scout.toml +12 -0
- package/.qwen/commands/archivist.toml +12 -0
- package/.qwen/commands/director.toml +12 -0
- package/.qwen/commands/eic.toml +12 -0
- package/.qwen/commands/investigator.toml +12 -0
- package/.qwen/commands/prompt.toml +12 -0
- package/.qwen/commands/scavenger.toml +12 -0
- package/.qwen/commands/scout.toml +12 -0
- package/.qwen/commands/scriptwriter.toml +12 -0
- package/.qwen/commands/seo.toml +12 -0
- package/.qwen/commands/thumbnail.toml +12 -0
- package/.qwen/commands/topic_scout.toml +12 -0
- package/USER_GUIDE.md +90 -0
- package/agents/core/eic.md +772 -0
- package/agents/core/prompt_agent.md +264 -0
- package/agents/core/self_review_protocol.md +143 -0
- package/agents/creative/director.md +247 -0
- package/agents/creative/scriptwriter.md +208 -0
- package/agents/creative/seo.md +316 -0
- package/agents/creative/thumbnail.md +285 -0
- package/agents/research/investigator.md +395 -0
- package/agents/research/topic_scout.md +419 -0
- package/agents/technical/archivist.md +289 -0
- package/agents/technical/scavenger.md +248 -0
- package/bin/videonut.js +389 -107
- package/config.yaml +62 -0
- package/docs/AUDIT_REPORT.md +364 -0
- package/docs/LIFECYCLE.md +651 -0
- package/docs/scriptwriter.md +43 -0
- package/file_validator.py +187 -0
- package/memory/short_term/asset_manifest.md +64 -0
- package/memory/short_term/investigation_dossier.md +31 -0
- package/memory/short_term/master_script.md +51 -0
- package/package.json +16 -3
- package/requirements.txt +9 -0
- package/scripts/setup.js +8 -0
- package/tools/check_env.py +77 -0
- package/tools/downloaders/__pycache__/caption_reader.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/image_grabber.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/pdf_reader.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/screenshotter.cpython-312.pyc +0 -0
- package/tools/downloaders/__pycache__/web_reader.cpython-312.pyc +0 -0
- package/tools/downloaders/article_screenshotter.py +388 -0
- package/tools/downloaders/caption_reader.py +238 -0
- package/tools/downloaders/clip_grabber.py +83 -0
- package/tools/downloaders/image_grabber.py +106 -0
- package/tools/downloaders/pdf_reader.py +163 -0
- package/tools/downloaders/pdf_screenshotter.py +240 -0
- package/tools/downloaders/screenshotter.py +58 -0
- package/tools/downloaders/web_reader.py +69 -0
- package/tools/downloaders/youtube_search.py +174 -0
- package/tools/logging/search_logger.py +334 -0
- package/tools/validators/__pycache__/archive_url.cpython-312.pyc +0 -0
- package/tools/validators/__pycache__/link_checker.cpython-312.pyc +0 -0
- package/tools/validators/archive_url.py +269 -0
- package/tools/validators/link_checker.py +45 -0
- package/workflow_orchestrator.py +337 -0
package/tools/downloaders/clip_grabber.py
@@ -0,0 +1,83 @@
+import os
+import sys
+import subprocess
+import argparse
+
+def download_clip(url, start_time, end_time, output_path, ffmpeg_path):
+    """
+    Downloads a specific clip from a YouTube video using yt-dlp.
+    """
+    # Ensure output directory exists if it's not the current directory
+    dir_name = os.path.dirname(output_path)
+    if dir_name:
+        os.makedirs(dir_name, exist_ok=True)
+
+    # Construct the yt-dlp command
+    # --download-sections "*start-end" downloads only that range
+    # --force-keyframes-at-cuts ensures precise cutting (requires ffmpeg)
+    cmd = [
+        "yt-dlp",
+        "--verbose",
+        "--download-sections", f"*{start_time}-{end_time}",
+        "--force-keyframes-at-cuts",
+        "--ffmpeg-location", ffmpeg_path,
+        "-o", output_path,
+        url
+    ]
+
+    print(f"Executing: {' '.join(cmd)}")
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        print("Download successful.")
+        print(result.stdout)
+
+        # Validate that the file was created and has content
+        if os.path.exists(output_path):
+            file_size = os.path.getsize(output_path)
+            if file_size == 0:
+                print(f"Error: Downloaded file is empty: {output_path}")
+                sys.exit(1)
+            else:
+                print(f"File validation: {output_path} created with size {file_size} bytes")
+        else:
+            print(f"Error: Downloaded file does not exist: {output_path}")
+            sys.exit(1)
+
+    except subprocess.CalledProcessError as e:
+        print("Error during download:")
+        print(e.stderr)
+        sys.exit(1)
+    except FileNotFoundError:
+        print("Error: yt-dlp not found. Please install it (pip install yt-dlp) and ensure it's in your PATH.")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download a video clip.")
+    parser.add_argument("--url", required=True, help="Video URL")
+    parser.add_argument("--start", required=True, help="Start time (e.g., 00:00:10 or 10)")
+    parser.add_argument("--end", required=True, help="End time (e.g., 00:00:20 or 20)")
+    parser.add_argument("--output", required=True, help="Output file path")
+
+    # Try to find ffmpeg in system PATH first
+    import shutil
+    import platform
+    default_ffmpeg = shutil.which("ffmpeg")
+    if not default_ffmpeg:
+        # Fallback to local bin folder relative to this script
+        # Assumes structure: tools/downloaders/clip_grabber.py -> tools/bin/ffmpeg.exe (Windows) or tools/bin/ffmpeg (Unix)
+        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        # Determine the appropriate executable name based on the platform
+        ffmpeg_exe = "ffmpeg.exe" if platform.system().lower() == "windows" else "ffmpeg"
+        default_ffmpeg = os.path.join(base_dir, "bin", ffmpeg_exe)
+
+        # If the fallback path doesn't exist, warn the user
+        if not os.path.exists(default_ffmpeg):
+            print(f"Warning: ffmpeg not found in PATH or at expected location: {default_ffmpeg}")
+            print("Please install ffmpeg or place it in the tools/bin/ directory.")
+
+    parser.add_argument("--ffmpeg", default=default_ffmpeg, help="Path to ffmpeg executable")
+
+    args = parser.parse_args()
+
+    download_clip(args.url, args.start, args.end, args.output, args.ffmpeg)
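For context, a hypothetical invocation of the new clip grabber (the URL, timestamps, and output path below are placeholders, assuming yt-dlp and ffmpeg are available):

  python tools/downloaders/clip_grabber.py --url "https://www.youtube.com/watch?v=VIDEO_ID" --start 00:01:05 --end 00:01:20 --output clips/hook.mp4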
package/tools/downloaders/image_grabber.py
@@ -0,0 +1,106 @@
+import os
+import sys
+import requests
+import argparse
+import mimetypes
+from urllib.parse import urlparse
+
+def is_safe_image_type(content_type, url):
+    """
+    Check if the content type is a safe image type.
+    """
+    safe_types = {
+        'image/jpeg', 'image/jpg', 'image/png', 'image/gif',
+        'image/webp', 'image/bmp', 'image/svg+xml', 'image/tiff'
+    }
+
+    # Check content-type header
+    if content_type and content_type.lower() in safe_types:
+        return True
+
+    # Fallback: check file extension from URL
+    parsed_url = urlparse(url)
+    file_ext = os.path.splitext(parsed_url.path)[1].lower()
+    mime_type, _ = mimetypes.guess_type(f"dummy{file_ext}")
+
+    if mime_type and mime_type in safe_types:
+        return True
+
+    return False
+
+def get_file_size(response):
+    """
+    Get the file size from the response headers.
+    """
+    content_length = response.headers.get('content-length')
+    if content_length:
+        return int(content_length)
+    return 0
+
+def download_image(url, output_path):
+    """
+    Downloads an image from a URL with security validation.
+    """
+    # Ensure output directory exists if it's not the current directory
+    dir_name = os.path.dirname(output_path)
+    if dir_name:
+        os.makedirs(dir_name, exist_ok=True)
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    }
+
+    try:
+        print(f"Downloading image from: {url}")
+
+        # First, make a HEAD request to check content type and size
+        # (follow redirects so the headers describe the final resource)
+        head_response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
+        content_type = head_response.headers.get('content-type', '').lower()
+
+        # Validate content type
+        if not is_safe_image_type(content_type, url):
+            print(f"Security Error: Content type '{content_type}' is not a safe image type.")
+            sys.exit(1)
+
+        # Check file size (limit to 50MB)
+        file_size = get_file_size(head_response)
+        if file_size > 50 * 1024 * 1024:  # 50MB
+            print(f"Security Error: File size {file_size} bytes exceeds 50MB limit.")
+            sys.exit(1)
+
+        # Actually download the file
+        response = requests.get(url, headers=headers, stream=True, timeout=10)
+        response.raise_for_status()
+
+        # Double-check content type after download
+        downloaded_content_type = response.headers.get('content-type', '').lower()
+        if not is_safe_image_type(downloaded_content_type, url):
+            print(f"Security Error: Downloaded content type '{downloaded_content_type}' is not a safe image type.")
+            sys.exit(1)
+
+        # Write file in chunks with size validation
+        total_size = 0
+        too_large = False
+        with open(output_path, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:  # Filter out keep-alive chunks
+                    total_size += len(chunk)
+                    if total_size > 50 * 1024 * 1024:  # 50MB limit
+                        too_large = True
+                        break
+                    f.write(chunk)
+
+        # Clean up the partial file only after the handle is closed
+        # (removing it while still open fails on Windows)
+        if too_large:
+            print("Security Error: Downloaded file exceeds 50MB limit.")
+            os.remove(output_path)
+            sys.exit(1)
+
+        print(f"Successfully saved to {output_path}")
+
+    except Exception as e:
+        print(f"Failed to download image: {e}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download an image.")
+    parser.add_argument("--url", required=True, help="Image URL")
+    parser.add_argument("--output", required=True, help="Output file path")
+
+    args = parser.parse_args()
+
+    download_image(args.url, args.output)
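A hypothetical invocation of the new image grabber (URL and output path are placeholders); downloads are rejected unless both the Content-Type check and the 50MB size cap pass:

  python tools/downloaders/image_grabber.py --url "https://example.com/photo.jpg" --output assets/images/photo.jpg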
package/tools/downloaders/pdf_reader.py
@@ -0,0 +1,163 @@
+import sys
+import requests
+import io
+import time
+from random import uniform
+from pypdf import PdfReader
+import argparse
+
+def read_pdf(url, search_term=None, page_number=None):
+    """
+    Read a PDF from URL with optional search and page selection.
+
+    Args:
+        url: URL of the PDF
+        search_term: Optional term to search for in the PDF
+        page_number: Optional specific page to read (1-indexed)
+    """
+    # Add random delay to implement rate limiting
+    delay = uniform(1, 3)
+    print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+    time.sleep(delay)
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    }
+
+    try:
+        response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()
+
+        f = io.BytesIO(response.content)
+        reader = PdfReader(f)
+        total_pages = len(reader.pages)
+
+        print(f"📄 PDF loaded: {total_pages} pages")
+
+        # If specific page requested
+        if page_number:
+            if 1 <= page_number <= total_pages:
+                text = reader.pages[page_number - 1].extract_text()
+                print(f"\n--- Page {page_number} of {total_pages} ---")
+                print(text)
+                return
+            else:
+                print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
+                sys.exit(1)
+
+        # If search term provided, find all occurrences
+        if search_term:
+            print(f"🔍 Searching for: '{search_term}'")
+            matches = []
+            search_lower = search_term.lower()
+
+            for i, page in enumerate(reader.pages):
+                page_text = page.extract_text()
+                if page_text and search_lower in page_text.lower():
+                    # Find the context around the match
+                    lines = page_text.split('\n')
+                    for j, line in enumerate(lines):
+                        if search_lower in line.lower():
+                            # Get surrounding context (2 lines before/after)
+                            context_start = max(0, j - 2)
+                            context_end = min(len(lines), j + 3)
+                            context = '\n'.join(lines[context_start:context_end])
+                            matches.append({
+                                'page': i + 1,
+                                'line': line.strip(),
+                                'context': context
+                            })
+
+            if matches:
+                print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
+                for idx, match in enumerate(matches[:10]):  # Limit to first 10 matches
+                    print(f"{'='*60}")
+                    print(f"📍 Match {idx+1} - Page {match['page']}")
+                    print(f"{'='*60}")
+                    print(f"Line: {match['line']}")
+                    print("\nContext:")
+                    print(match['context'])
+                    print()
+
+                if len(matches) > 10:
+                    print(f"... and {len(matches) - 10} more matches")
+
+                # Suggest best page for screenshot
+                best_page = matches[0]['page']
+                print(f"\n📸 Suggested page for screenshot: Page {best_page}")
+                print(f"   Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
+            else:
+                print(f"❌ No matches found for '{search_term}'")
+            return
+
+        # Default: Smart extraction with priority for first and last pages
+        MAX_PAGES = 15
+        MAX_CHARS = 20000
+
+        text = ""
+        pages_to_read = []
+
+        if total_pages <= MAX_PAGES:
+            pages_to_read = list(range(total_pages))
+        else:
+            # Smart selection: first 7 + last 4
+            first_pages = list(range(min(7, total_pages)))
+            last_pages = list(range(max(0, total_pages - 4), total_pages))
+            pages_to_read = sorted(set(first_pages + last_pages))
+
+            print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")
+
+        for i in pages_to_read:
+            page_text = reader.pages[i].extract_text()
+            if page_text:
+                text += f"\n--- Page {i+1} ---\n{page_text}"
+
+        # Smart truncation with intro/conclusion preservation
+        if len(text) > MAX_CHARS:
+            intro = text[:8000]
+            outro = text[-8000:]
+            truncated = len(text) - len(intro) - len(outro)
+            print("--- PDF CONTENT START ---")
+            print(intro)
+            print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
+            print(outro)
+            print("--- PDF CONTENT END ---")
+        else:
+            print(text)
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading PDF: {e}")
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error reading PDF: {e}")
+        sys.exit(1)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Read and search PDF documents from URLs.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Read PDF with smart extraction (first + last pages)
+  python pdf_reader.py --url "https://example.com/report.pdf"
+
+  # Search for specific term in PDF
+  python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"
+
+  # Read specific page
+  python pdf_reader.py --url "https://example.com/report.pdf" --page 5
+"""
+    )
+
+    parser.add_argument("--url", required=True, help="URL of the PDF document")
+    parser.add_argument("--search", "-s", help="Search for specific term and show context")
+    parser.add_argument("--page", "-p", type=int, help="Read specific page number (1-indexed)")
+
+    args = parser.parse_args()
+    read_pdf(args.url, search_term=args.search, page_number=args.page)
+
+
+if __name__ == "__main__":
+    main()
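A note on the default smart extraction above: for a PDF over the 15-page limit, the script reads the first 7 and last 4 pages. On a 30-page report, for example, pages_to_read resolves to pages 1-7 and 27-30, and the combined text is then capped at 20,000 characters, keeping 8,000 from the intro and 8,000 from the outro.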
package/tools/downloaders/pdf_screenshotter.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+"""
+PDF Page Screenshotter for VideoNut
+Takes screenshots of specific PDF pages, optionally highlighting search terms.
+Uses Playwright to render PDFs in browser for high-quality screenshots.
+
+USAGE:
+    # Screenshot specific page from PDF
+    python pdf_screenshotter.py --url "https://example.com/report.pdf" --page 3 --output "page3.png"
+
+    # Search for term and screenshot the page where it's found
+    python pdf_screenshotter.py --url "https://example.com/report.pdf" --search "Prime Minister" --output "pm_quote.png"
+"""
+
+import sys
+import os
+import argparse
+import requests
+import time
+from random import uniform
+from pypdf import PdfReader
+
+try:
+    from playwright.sync_api import sync_playwright
+except ImportError:
+    print("Error: Playwright not installed. Install with: pip install playwright && playwright install chromium")
+    sys.exit(1)
+
+
+def download_pdf_to_temp(url, temp_path):
+    """Download PDF to a temporary file."""
+    delay = uniform(1, 2)
+    print(f"⏳ Rate limiting: Waiting {delay:.2f} seconds...")
+    time.sleep(delay)
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    }
+
+    response = requests.get(url, headers=headers, timeout=30)
+    response.raise_for_status()
+
+    with open(temp_path, 'wb') as f:
+        f.write(response.content)
+
+    return temp_path
+
+
+def find_page_with_term(pdf_path, search_term):
+    """Find the first page containing the search term."""
+    with open(pdf_path, 'rb') as f:
+        reader = PdfReader(f)
+        search_lower = search_term.lower()
+
+        for i, page in enumerate(reader.pages):
+            page_text = page.extract_text()
+            if page_text and search_lower in page_text.lower():
+                return i + 1  # Return 1-indexed page number
+
+    return None
+
+
+def screenshot_pdf_page(pdf_path, page_number, output_path, search_term=None, width=1280, height=1600):
+    """
+    Take a screenshot of a specific PDF page using browser rendering.
+
+    Args:
+        pdf_path: Local path to PDF file
+        page_number: Page to screenshot (1-indexed)
+        output_path: Where to save the screenshot
+        search_term: Optional term to highlight on the page
+        width: Viewport width
+        height: Viewport height
+    """
+    result = {
+        'success': False,
+        'page': page_number,
+        'search_found': False,
+        'message': '',
+        'output_path': output_path
+    }
+
+    # Verify PDF exists and page is valid
+    with open(pdf_path, 'rb') as f:
+        reader = PdfReader(f)
+        total_pages = len(reader.pages)
+
+    if page_number < 1 or page_number > total_pages:
+        result['message'] = f"Page {page_number} not found. PDF has {total_pages} pages."
+        return result
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        context = browser.new_context(
+            viewport={'width': width, 'height': height}
+        )
+        page = context.new_page()
+
+        try:
+            # Open PDF in browser using file:// protocol
+            abs_path = os.path.abspath(pdf_path)
+            pdf_url = f"file:///{abs_path.replace(os.sep, '/')}"
+
+            print(f"🌐 Opening PDF in browser...")
+            page.goto(pdf_url, timeout=30000, wait_until='networkidle')
+
+            # Wait for PDF to render
+            page.wait_for_timeout(3000)
+
+            # Navigate to specific page
+            # Most PDF viewers use #page=N in URL
+            if page_number > 1:
+                page.goto(f"{pdf_url}#page={page_number}", timeout=30000)
+                page.wait_for_timeout(2000)
+
+            # If search term provided, try to find and highlight
+            if search_term:
+                print(f"🔍 Searching for '{search_term}'...")
+                # Use browser's find function (Ctrl+F simulation)
+                # This is a workaround since direct PDF text selection is complex
+                page.keyboard.press("Control+f")
+                page.wait_for_timeout(500)
+                page.keyboard.type(search_term)
+                page.wait_for_timeout(1000)
+                page.keyboard.press("Escape")
+                result['search_found'] = True
+
+            # Take screenshot
+            print(f"📸 Taking screenshot of page {page_number}...")
+            page.screenshot(path=output_path, full_page=False)
+
+            if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                result['success'] = True
+                result['message'] = f"Screenshot saved: {output_path}"
+                print(f"✅ {result['message']}")
+            else:
+                result['message'] = "Screenshot file is empty or not created"
+
+        except Exception as e:
+            result['message'] = f"Error: {str(e)}"
+            print(f"❌ {result['message']}")
+
+        finally:
+            browser.close()
+
+    return result
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Screenshot specific pages from PDF documents.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Screenshot specific page
+  python pdf_screenshotter.py --url "https://example.com/report.pdf" --page 3 --output "page3.png"
+
+  # Auto-find page with search term and screenshot
+  python pdf_screenshotter.py --url "https://example.com/report.pdf" --search "Prime Minister" --output "pm_quote.png"
+
+  # Screenshot local PDF file
+  python pdf_screenshotter.py --file "report.pdf" --page 5 --output "page5.png"
+"""
+    )
+
+    parser.add_argument("--url", "-u", help="URL of the PDF document")
+    parser.add_argument("--file", "-f", help="Local path to PDF file")
+    parser.add_argument("--page", "-p", type=int, help="Page number to screenshot (1-indexed)")
+    parser.add_argument("--search", "-s", help="Search for term and screenshot that page")
+    parser.add_argument("--output", "-o", required=True, help="Output file path for screenshot")
+    parser.add_argument("--width", type=int, default=1280, help="Viewport width (default: 1280)")
+    parser.add_argument("--height", type=int, default=1600, help="Viewport height (default: 1600)")
+
+    args = parser.parse_args()
+
+    # Validate inputs
+    if not args.url and not args.file:
+        print("Error: Either --url or --file must be provided")
+        sys.exit(1)
+
+    if not args.page and not args.search:
+        print("Error: Either --page or --search must be provided")
+        sys.exit(1)
+
+    # Ensure output directory exists
+    output_dir = os.path.dirname(args.output)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    # Get PDF file (download if URL provided)
+    if args.url:
+        temp_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.temp')
+        os.makedirs(temp_dir, exist_ok=True)
+        pdf_path = os.path.join(temp_dir, 'temp_pdf.pdf')
+
+        print(f"📥 Downloading PDF from {args.url}...")
+        try:
+            download_pdf_to_temp(args.url, pdf_path)
+            print(f"✅ PDF downloaded")
+        except Exception as e:
+            print(f"❌ Failed to download PDF: {e}")
+            sys.exit(1)
+    else:
+        pdf_path = args.file
+        if not os.path.exists(pdf_path):
+            print(f"Error: File not found: {pdf_path}")
+            sys.exit(1)
+
+    # Determine page number
+    page_number = args.page
+    if args.search:
+        print(f"🔍 Searching for '{args.search}' in PDF...")
+        found_page = find_page_with_term(pdf_path, args.search)
+        if found_page:
+            page_number = found_page
+            print(f"✅ Found '{args.search}' on page {page_number}")
+        else:
+            print(f"❌ '{args.search}' not found in PDF")
+            sys.exit(1)
+
+    # Take screenshot
+    result = screenshot_pdf_page(
+        pdf_path,
+        page_number,
+        args.output,
+        search_term=args.search,
+        width=args.width,
+        height=args.height
+    )
+
+    if result['success']:
+        sys.exit(0)
+    else:
+        print(f"❌ Failed: {result['message']}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
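One detail worth noting in screenshot_pdf_page: the file:// URL is built by swapping os.sep for forward slashes, so a Windows temp path such as C:\tools\downloaders\.temp\temp_pdf.pdf (a hypothetical location) becomes file:///C:/tools/downloaders/.temp/temp_pdf.pdf, which Chromium's built-in PDF viewer can open; the #page=N fragment then jumps to the requested page.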
package/tools/downloaders/screenshotter.py
@@ -0,0 +1,58 @@
+import sys
+import os
+import argparse
+import time
+from random import uniform
+from playwright.sync_api import sync_playwright
+
+def take_screenshot(url, output_path):
+    # Add random delay to implement rate limiting
+    delay = uniform(1, 3)  # Random delay between 1-3 seconds
+    print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+    time.sleep(delay)
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+
+        # Set additional headers to appear more like a real user
+        page.set_extra_http_headers({
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+        })
+
+        try:
+            print(f"Navigating to {url}...")
+            page.goto(url, timeout=30000)
+            # Wait a bit for dynamic content (e.g., Twitter embeds)
+            page.wait_for_timeout(2000)
+
+            page.screenshot(path=output_path, full_page=False)
+
+            # Validate that the file was created and has content
+            if os.path.exists(output_path):
+                file_size = os.path.getsize(output_path)
+                if file_size == 0:
+                    print(f"Error: Screenshot file is empty: {output_path}")
+                    sys.exit(1)
+                else:
+                    print(f"File validation: {output_path} created with size {file_size} bytes")
+            else:
+                print(f"Error: Screenshot file does not exist: {output_path}")
+                sys.exit(1)
+
+        except Exception as e:
+            print(f"Error taking screenshot: {e}")
+            sys.exit(1)
+        finally:
+            browser.close()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", required=True)
+    parser.add_argument("--output", required=True)
+    args = parser.parse_args()
+    take_screenshot(args.url, args.output)
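A hypothetical invocation of the page screenshotter (URL and output path are placeholders, assuming Playwright and its Chromium browser are installed):

  python tools/downloaders/screenshotter.py --url "https://example.com/article" --output assets/article.png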