videonut 1.2.8 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +272 -272
- package/USER_GUIDE.md +90 -90
- package/agents/core/eic.md +771 -771
- package/agents/creative/director.md +246 -246
- package/agents/creative/scriptwriter.md +207 -207
- package/agents/research/investigator.md +394 -394
- package/agents/technical/archivist.md +288 -288
- package/agents/technical/scavenger.md +247 -247
- package/config.yaml +61 -61
- package/docs/scriptwriter.md +42 -42
- package/file_validator.py +186 -186
- package/memory/short_term/asset_manifest.md +64 -64
- package/memory/short_term/investigation_dossier.md +31 -31
- package/memory/short_term/master_script.md +51 -51
- package/package.json +61 -64
- package/requirements.txt +8 -8
- package/tools/check_env.py +76 -76
- package/tools/downloaders/caption_reader.py +237 -237
- package/tools/downloaders/clip_grabber.py +82 -82
- package/tools/downloaders/image_grabber.py +105 -105
- package/tools/downloaders/pdf_reader.py +163 -163
- package/tools/downloaders/screenshotter.py +58 -58
- package/tools/downloaders/web_reader.py +69 -69
- package/tools/validators/link_checker.py +45 -45
- package/workflow_orchestrator.py +336 -336
- package/.claude/commands/archivist.toml +0 -12
- package/.claude/commands/director.toml +0 -12
- package/.claude/commands/eic.toml +0 -12
- package/.claude/commands/investigator.toml +0 -12
- package/.claude/commands/prompt.toml +0 -12
- package/.claude/commands/scavenger.toml +0 -12
- package/.claude/commands/scout.toml +0 -12
- package/.claude/commands/scriptwriter.toml +0 -12
- package/.claude/commands/seo.toml +0 -12
- package/.claude/commands/thumbnail.toml +0 -12
- package/.claude/commands/topic_scout.toml +0 -12
- package/.gemini/commands/archivist.toml +0 -12
- package/.gemini/commands/director.toml +0 -12
- package/.gemini/commands/eic.toml +0 -12
- package/.gemini/commands/investigator.toml +0 -12
- package/.gemini/commands/prompt.toml +0 -12
- package/.gemini/commands/scavenger.toml +0 -12
- package/.gemini/commands/scout.toml +0 -12
- package/.gemini/commands/scriptwriter.toml +0 -12
- package/.gemini/commands/seo.toml +0 -12
- package/.gemini/commands/thumbnail.toml +0 -12
- package/.gemini/commands/topic_scout.toml +0 -12
- package/.qwen/commands/archivist.toml +0 -12
- package/.qwen/commands/director.toml +0 -12
- package/.qwen/commands/eic.toml +0 -12
- package/.qwen/commands/investigator.toml +0 -12
- package/.qwen/commands/prompt.toml +0 -12
- package/.qwen/commands/scavenger.toml +0 -12
- package/.qwen/commands/scout.toml +0 -12
- package/.qwen/commands/scriptwriter.toml +0 -12
- package/.qwen/commands/seo.toml +0 -12
- package/.qwen/commands/thumbnail.toml +0 -12
- package/.qwen/commands/topic_scout.toml +0 -12
package/tools/downloaders/image_grabber.py

@@ -1,106 +1,106 @@

In this hunk, lines 1–105 are removed and re-added with identical content (line 106 is unchanged context), so the file is reproduced once below.

```python
import os
import sys
import requests
import argparse
import mimetypes
from urllib.parse import urlparse

def is_safe_image_type(content_type, url):
    """
    Check if the content type is a safe image type.
    """
    safe_types = {
        'image/jpeg', 'image/jpg', 'image/png', 'image/gif',
        'image/webp', 'image/bmp', 'image/svg+xml', 'image/tiff'
    }

    # Check content-type header
    if content_type and content_type.lower() in safe_types:
        return True

    # Fallback: check file extension from URL
    parsed_url = urlparse(url)
    file_ext = os.path.splitext(parsed_url.path)[1].lower()
    mime_type, _ = mimetypes.guess_type(f"dummy{file_ext}")

    if mime_type and mime_type in safe_types:
        return True

    return False

def get_file_size(response):
    """
    Get the file size from the response headers.
    """
    content_length = response.headers.get('content-length')
    if content_length:
        return int(content_length)
    return 0

def download_image(url, output_path):
    """
    Downloads an image from a URL with security validation.
    """
    # Ensure output directory exists if it's not the current directory
    dir_name = os.path.dirname(output_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    try:
        print(f"Downloading image from: {url}")

        # First, make a HEAD request to check content type and size
        head_response = requests.head(url, headers=headers, timeout=10)
        content_type = head_response.headers.get('content-type', '').lower()

        # Validate content type
        if not is_safe_image_type(content_type, url):
            print(f"Security Error: Content type '{content_type}' is not a safe image type.")
            sys.exit(1)

        # Check file size (limit to 50MB)
        file_size = get_file_size(head_response)
        if file_size > 50 * 1024 * 1024:  # 50MB
            print(f"Security Error: File size {file_size} bytes exceeds 50MB limit.")
            sys.exit(1)

        # Actually download the file
        response = requests.get(url, headers=headers, stream=True, timeout=10)
        response.raise_for_status()

        # Double-check content type after download
        downloaded_content_type = response.headers.get('content-type', '').lower()
        if not is_safe_image_type(downloaded_content_type, url):
            print(f"Security Error: Downloaded content type '{downloaded_content_type}' is not a safe image type.")
            sys.exit(1)

        # Write file in chunks with size validation
        total_size = 0
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # Filter out keep-alive chunks
                    total_size += len(chunk)
                    if total_size > 50 * 1024 * 1024:  # 50MB limit
                        print(f"Security Error: Downloaded file exceeds 50MB limit.")
                        os.remove(output_path)  # Clean up partial file
                        sys.exit(1)
                    f.write(chunk)

        print(f"Successfully saved to {output_path}")

    except Exception as e:
        print(f"Failed to download image: {e}")
        sys.exit(1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download an image.")
    parser.add_argument("--url", required=True, help="Image URL")
    parser.add_argument("--output", required=True, help="Output file path")

    args = parser.parse_args()

    download_image(args.url, args.output)
```
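For reference, a minimal invocation sketch of this downloader. The URL and output path below are hypothetical; only the `--url`/`--output` flags defined above and the package's own script path are assumed.

```python
# Minimal usage sketch for image_grabber.py; the URL and output path are hypothetical.
import subprocess
import sys

result = subprocess.run([
    sys.executable, "package/tools/downloaders/image_grabber.py",
    "--url", "https://example.com/chart.png",   # must resolve to an allowed image MIME type
    "--output", "assets/chart.png",             # parent directory is created if missing
])
print("exit code:", result.returncode)  # non-zero on validation or download failure
```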
package/tools/downloaders/pdf_reader.py

@@ -1,163 +1,163 @@

All 163 lines are removed and re-added with identical content, so the file is reproduced once below.

```python
import sys
import requests
import io
import time
from random import uniform
from pypdf import PdfReader
import argparse
import re

def read_pdf(url, search_term=None, page_number=None):
    """
    Read a PDF from URL with optional search and page selection.

    Args:
        url: URL of the PDF
        search_term: Optional term to search for in the PDF
        page_number: Optional specific page to read (1-indexed)
    """
    # Add random delay to implement rate limiting
    delay = uniform(1, 3)
    print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
    time.sleep(delay)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        f = io.BytesIO(response.content)
        reader = PdfReader(f)
        total_pages = len(reader.pages)

        print(f"📄 PDF loaded: {total_pages} pages")

        # If specific page requested
        if page_number:
            if 1 <= page_number <= total_pages:
                text = reader.pages[page_number - 1].extract_text()
                print(f"\n--- Page {page_number} of {total_pages} ---")
                print(text)
                return
            else:
                print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
                sys.exit(1)

        # If search term provided, find all occurrences
        if search_term:
            print(f"🔍 Searching for: '{search_term}'")
            matches = []
            search_lower = search_term.lower()

            for i, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text and search_lower in page_text.lower():
                    # Find the context around the match
                    lines = page_text.split('\n')
                    for j, line in enumerate(lines):
                        if search_lower in line.lower():
                            # Get surrounding context (2 lines before/after)
                            context_start = max(0, j - 2)
                            context_end = min(len(lines), j + 3)
                            context = '\n'.join(lines[context_start:context_end])
                            matches.append({
                                'page': i + 1,
                                'line': line.strip(),
                                'context': context
                            })

            if matches:
                print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
                for idx, match in enumerate(matches[:10]):  # Limit to first 10 matches
                    print(f"{'='*60}")
                    print(f"📍 Match {idx+1} - Page {match['page']}")
                    print(f"{'='*60}")
                    print(f"Line: {match['line']}")
                    print(f"\nContext:")
                    print(match['context'])
                    print()

                if len(matches) > 10:
                    print(f"... and {len(matches) - 10} more matches")

                # Suggest best page for screenshot
                best_page = matches[0]['page']
                print(f"\n📸 Suggested page for screenshot: Page {best_page}")
                print(f"   Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
            else:
                print(f"❌ No matches found for '{search_term}'")
            return

        # Default: Smart extraction with priority for first and last pages
        MAX_PAGES = 15
        MAX_CHARS = 20000

        text = ""
        pages_to_read = []

        if total_pages <= MAX_PAGES:
            pages_to_read = list(range(total_pages))
        else:
            # Smart selection: first 7 + last 4
            first_pages = list(range(min(7, total_pages)))
            last_pages = list(range(max(0, total_pages - 4), total_pages))
            pages_to_read = sorted(set(first_pages + last_pages))

            print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")

        for i in pages_to_read:
            page_text = reader.pages[i].extract_text()
            if page_text:
                text += f"\n--- Page {i+1} ---\n{page_text}"

        # Smart truncation with intro/conclusion preservation
        if len(text) > MAX_CHARS:
            intro = text[:8000]
            outro = text[-8000:]
            truncated = len(text) - MAX_CHARS
            print(f"--- PDF CONTENT START ---")
            print(intro)
            print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
            print(f"--- PDF CONTENT END ---")
            print(outro)
        else:
            print(text)

    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        sys.exit(1)


def main():
    parser = argparse.ArgumentParser(
        description="Read and search PDF documents from URLs.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Read PDF with smart extraction (first + last pages)
  python pdf_reader.py --url "https://example.com/report.pdf"

  # Search for specific term in PDF
  python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"

  # Read specific page
  python pdf_reader.py --url "https://example.com/report.pdf" --page 5
"""
    )

    parser.add_argument("--url", required=True, help="URL of the PDF document")
    parser.add_argument("--search", "-s", help="Search for specific term and show context")
    parser.add_argument("--page", "-p", type=int, help="Read specific page number (1-indexed)")

    args = parser.parse_args()
    read_pdf(args.url, search_term=args.search, page_number=args.page)


if __name__ == "__main__":
    main()
```
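To make the default extraction concrete, here is a small sketch of the first-7-plus-last-4 page selection for a hypothetical 30-page document, using the same expressions as `read_pdf()`.

```python
# Page selection from read_pdf() applied to a hypothetical 30-page document.
total_pages = 30
first_pages = list(range(min(7, total_pages)))                   # 0..6   -> pages 1-7
last_pages = list(range(max(0, total_pages - 4), total_pages))   # 26..29 -> pages 27-30
pages_to_read = sorted(set(first_pages + last_pages))

print([p + 1 for p in pages_to_read])  # [1, 2, 3, 4, 5, 6, 7, 27, 28, 29, 30]
```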
package/tools/downloaders/screenshotter.py

@@ -1,58 +1,58 @@

All 58 lines are removed and re-added with identical content, so the file is reproduced once below.

```python
import sys
import os
import argparse
import time
from random import uniform
from playwright.sync_api import sync_playwright

def take_screenshot(url, output_path):
    # Add random delay to implement rate limiting
    delay = uniform(1, 3)  # Random delay between 1-3 seconds
    print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
    time.sleep(delay)

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()

        # Set additional headers to appear more like a real user
        page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        })

        try:
            print(f"Navigating to {url}...")
            page.goto(url, timeout=30000)
            # Wait a bit for dynamic content (e.g., Twitter embeds)
            page.wait_for_timeout(2000)

            page.screenshot(path=output_path, full_page=False)

            # Validate that the file was created and has content
            if os.path.exists(output_path):
                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    print(f"Error: Screenshot file is empty: {output_path}")
                    sys.exit(1)
                else:
                    print(f"File validation: {output_path} created with size {file_size} bytes")
            else:
                print(f"Error: Screenshot file does not exist: {output_path}")
                sys.exit(1)

        except Exception as e:
            print(f"Error taking screenshot: {e}")
            sys.exit(1)
        finally:
            browser.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()
    take_screenshot(args.url, args.output)
```
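A similar invocation sketch for the screenshot tool. The URL and output path are hypothetical, and Playwright's Chromium build must already be installed (for example via `python -m playwright install chromium`).

```python
# Minimal usage sketch for screenshotter.py; the URL and output path are hypothetical.
import subprocess
import sys

subprocess.run([
    sys.executable, "package/tools/downloaders/screenshotter.py",
    "--url", "https://example.com/article",        # page to capture (above-the-fold only)
    "--output", "assets/article_screenshot.png",   # screenshot written here, then size-checked
], check=True)  # raises CalledProcessError if the script exits non-zero
```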