videonut 1.2.7 → 1.3.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +272 -272
- package/USER_GUIDE.md +90 -90
- package/agents/core/eic.md +771 -771
- package/agents/creative/director.md +246 -246
- package/agents/creative/scriptwriter.md +207 -207
- package/agents/research/investigator.md +394 -394
- package/agents/technical/archivist.md +288 -288
- package/agents/technical/scavenger.md +247 -247
- package/bin/videonut.js +37 -21
- package/config.yaml +61 -61
- package/docs/scriptwriter.md +42 -42
- package/file_validator.py +186 -186
- package/memory/short_term/asset_manifest.md +64 -64
- package/memory/short_term/investigation_dossier.md +31 -31
- package/memory/short_term/master_script.md +51 -51
- package/package.json +61 -64
- package/requirements.txt +8 -8
- package/setup.js +33 -15
- package/tools/check_env.py +76 -76
- package/tools/downloaders/caption_reader.py +237 -237
- package/tools/downloaders/clip_grabber.py +82 -82
- package/tools/downloaders/image_grabber.py +105 -105
- package/tools/downloaders/pdf_reader.py +163 -163
- package/tools/downloaders/screenshotter.py +58 -58
- package/tools/downloaders/web_reader.py +69 -69
- package/tools/validators/link_checker.py +45 -45
- package/workflow_orchestrator.py +336 -336
- package/.claude/commands/archivist.toml +0 -12
- package/.claude/commands/director.toml +0 -12
- package/.claude/commands/eic.toml +0 -12
- package/.claude/commands/investigator.toml +0 -12
- package/.claude/commands/prompt.toml +0 -12
- package/.claude/commands/scavenger.toml +0 -12
- package/.claude/commands/scout.toml +0 -12
- package/.claude/commands/scriptwriter.toml +0 -12
- package/.claude/commands/seo.toml +0 -12
- package/.claude/commands/thumbnail.toml +0 -12
- package/.claude/commands/topic_scout.toml +0 -12
- package/.gemini/commands/archivist.toml +0 -12
- package/.gemini/commands/director.toml +0 -12
- package/.gemini/commands/eic.toml +0 -12
- package/.gemini/commands/investigator.toml +0 -12
- package/.gemini/commands/prompt.toml +0 -12
- package/.gemini/commands/scavenger.toml +0 -12
- package/.gemini/commands/scout.toml +0 -12
- package/.gemini/commands/scriptwriter.toml +0 -12
- package/.gemini/commands/seo.toml +0 -12
- package/.gemini/commands/thumbnail.toml +0 -12
- package/.gemini/commands/topic_scout.toml +0 -12
- package/.qwen/commands/archivist.toml +0 -12
- package/.qwen/commands/director.toml +0 -12
- package/.qwen/commands/eic.toml +0 -12
- package/.qwen/commands/investigator.toml +0 -12
- package/.qwen/commands/prompt.toml +0 -12
- package/.qwen/commands/scavenger.toml +0 -12
- package/.qwen/commands/scout.toml +0 -12
- package/.qwen/commands/scriptwriter.toml +0 -12
- package/.qwen/commands/seo.toml +0 -12
- package/.qwen/commands/thumbnail.toml +0 -12
- package/.qwen/commands/topic_scout.toml +0 -12
package/tools/downloaders/pdf_reader.py

@@ -1,163 +1,163 @@
All 163 lines are removed and re-added with identical text (a whitespace or line-ending-only change), so the file content is shown once:

import sys
import requests
import io
import time
from random import uniform
from pypdf import PdfReader
import argparse
import re

def read_pdf(url, search_term=None, page_number=None):
    """
    Read a PDF from URL with optional search and page selection.

    Args:
        url: URL of the PDF
        search_term: Optional term to search for in the PDF
        page_number: Optional specific page to read (1-indexed)
    """
    # Add random delay to implement rate limiting
    delay = uniform(1, 3)
    print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
    time.sleep(delay)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        f = io.BytesIO(response.content)
        reader = PdfReader(f)
        total_pages = len(reader.pages)

        print(f"📄 PDF loaded: {total_pages} pages")

        # If specific page requested
        if page_number:
            if 1 <= page_number <= total_pages:
                text = reader.pages[page_number - 1].extract_text()
                print(f"\n--- Page {page_number} of {total_pages} ---")
                print(text)
                return
            else:
                print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
                sys.exit(1)

        # If search term provided, find all occurrences
        if search_term:
            print(f"🔍 Searching for: '{search_term}'")
            matches = []
            search_lower = search_term.lower()

            for i, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text and search_lower in page_text.lower():
                    # Find the context around the match
                    lines = page_text.split('\n')
                    for j, line in enumerate(lines):
                        if search_lower in line.lower():
                            # Get surrounding context (2 lines before/after)
                            context_start = max(0, j - 2)
                            context_end = min(len(lines), j + 3)
                            context = '\n'.join(lines[context_start:context_end])
                            matches.append({
                                'page': i + 1,
                                'line': line.strip(),
                                'context': context
                            })

            if matches:
                print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
                for idx, match in enumerate(matches[:10]):  # Limit to first 10 matches
                    print(f"{'='*60}")
                    print(f"📍 Match {idx+1} - Page {match['page']}")
                    print(f"{'='*60}")
                    print(f"Line: {match['line']}")
                    print(f"\nContext:")
                    print(match['context'])
                    print()

                if len(matches) > 10:
                    print(f"... and {len(matches) - 10} more matches")

                # Suggest best page for screenshot
                best_page = matches[0]['page']
                print(f"\n📸 Suggested page for screenshot: Page {best_page}")
                print(f" Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
            else:
                print(f"❌ No matches found for '{search_term}'")
            return

        # Default: Smart extraction with priority for first and last pages
        MAX_PAGES = 15
        MAX_CHARS = 20000

        text = ""
        pages_to_read = []

        if total_pages <= MAX_PAGES:
            pages_to_read = list(range(total_pages))
        else:
            # Smart selection: first 7 + last 4
            first_pages = list(range(min(7, total_pages)))
            last_pages = list(range(max(0, total_pages - 4), total_pages))
            pages_to_read = sorted(set(first_pages + last_pages))

            print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")

        for i in pages_to_read:
            page_text = reader.pages[i].extract_text()
            if page_text:
                text += f"\n--- Page {i+1} ---\n{page_text}"

        # Smart truncation with intro/conclusion preservation
        if len(text) > MAX_CHARS:
            intro = text[:8000]
            outro = text[-8000:]
            truncated = len(text) - MAX_CHARS
            print(f"--- PDF CONTENT START ---")
            print(intro)
            print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
            print(f"--- PDF CONTENT END ---")
            print(outro)
        else:
            print(text)

    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        sys.exit(1)


def main():
    parser = argparse.ArgumentParser(
        description="Read and search PDF documents from URLs.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Read PDF with smart extraction (first + last pages)
  python pdf_reader.py --url "https://example.com/report.pdf"

  # Search for specific term in PDF
  python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"

  # Read specific page
  python pdf_reader.py --url "https://example.com/report.pdf" --page 5
"""
    )

    parser.add_argument("--url", required=True, help="URL of the PDF document")
    parser.add_argument("--search", "-s", help="Search for specific term and show context")
    parser.add_argument("--page", "-p", type=int, help="Read specific page number (1-indexed)")

    args = parser.parse_args()
    read_pdf(args.url, search_term=args.search, page_number=args.page)


if __name__ == "__main__":
    main()
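
Both sides of the hunk carry the same logic, so the extraction behaviour is unchanged. For quick reference, the page-selection rule inside read_pdf() can be isolated as a small sketch; select_pages is a hypothetical helper written here for illustration, not part of the package:

# Sketch of read_pdf()'s page-selection rule; select_pages is hypothetical, for illustration only.
def select_pages(total_pages, max_pages=15, first=7, last=4):
    """Return 0-indexed pages to read: every page for short PDFs, else the first 7 plus the last 4."""
    if total_pages <= max_pages:
        return list(range(total_pages))
    first_pages = list(range(min(first, total_pages)))
    last_pages = list(range(max(0, total_pages - last), total_pages))
    return sorted(set(first_pages + last_pages))

print(select_pages(10))  # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] -> short PDF, every page
print(select_pages(40))  # [0, 1, 2, 3, 4, 5, 6, 36, 37, 38, 39] -> first 7 + last 4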
package/tools/downloaders/screenshotter.py

@@ -1,58 +1,58 @@
As above, all 58 lines are removed and re-added unchanged; the file content, shown once:

import sys
import os
import argparse
import time
from random import uniform
from playwright.sync_api import sync_playwright

def take_screenshot(url, output_path):
    # Add random delay to implement rate limiting
    delay = uniform(1, 3)  # Random delay between 1-3 seconds
    print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
    time.sleep(delay)

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()

        # Set additional headers to appear more like a real user
        page.set_extra_http_headers({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        })

        try:
            print(f"Navigating to {url}...")
            page.goto(url, timeout=30000)
            # Wait a bit for dynamic content (e.g., Twitter embeds)
            page.wait_for_timeout(2000)

            page.screenshot(path=output_path, full_page=False)

            # Validate that the file was created and has content
            if os.path.exists(output_path):
                file_size = os.path.getsize(output_path)
                if file_size == 0:
                    print(f"Error: Screenshot file is empty: {output_path}")
                    sys.exit(1)
                else:
                    print(f"File validation: {output_path} created with size {file_size} bytes")
            else:
                print(f"Error: Screenshot file does not exist: {output_path}")
                sys.exit(1)

        except Exception as e:
            print(f"Error taking screenshot: {e}")
            sys.exit(1)
        finally:
            browser.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()
    take_screenshot(args.url, args.output)
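
The screenshotter is still driven entirely by its two CLI flags. A minimal sketch of calling it from another Python process, assuming the package root as the working directory; the URL and output path are placeholders, not values from the package:

# Illustrative call to the unchanged screenshotter CLI (URL and paths are placeholders).
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "tools/downloaders/screenshotter.py",
     "--url", "https://example.com/article",           # placeholder target page
     "--output", "assets/article_screenshot.png"],     # placeholder output path
    capture_output=True, text=True,
)
print(result.stdout)
if result.returncode != 0:
    print("Screenshot failed:", result.stderr)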
package/tools/downloaders/web_reader.py

@@ -1,69 +1,69 @@
Likewise, all 69 lines are removed and re-added unchanged; the file content, shown once:

import sys
import argparse
import time
from random import uniform
from playwright.sync_api import sync_playwright

def read_webpage(url):
    try:
        # Add random delay to implement rate limiting
        delay = uniform(1, 3)  # Random delay between 1-3 seconds
        print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
        time.sleep(delay)

        with sync_playwright() as p:
            # Launch browser (headless by default)
            browser = p.chromium.launch()
            page = browser.new_page()

            # Set additional headers to appear more like a real user
            page.set_extra_http_headers({
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            })

            # Navigate with a reasonable timeout
            page.goto(url, timeout=30000)

            # Wait for content to load (basic heuristic)
            page.wait_for_load_state("domcontentloaded")

            # Get the text content
            # We use evaluate to get innerText which mimics what a user sees (hidden text is ignored)
            text = page.evaluate("document.body.innerText")

            # Basic cleanup: Remove excessive newlines
            clean_text = '\n'.join([line.strip() for line in text.splitlines() if line.strip()])

            # Smart truncation: Preserve intro AND conclusion (critical for research)
            MAX_TOTAL = 40000   # Increased from 25000
            INTRO_SIZE = 8000   # First portion (hook/summary)
            OUTRO_SIZE = 8000   # Last portion (conclusion/recommendations)

            if len(clean_text) > MAX_TOTAL:
                intro = clean_text[:INTRO_SIZE]
                outro = clean_text[-OUTRO_SIZE:]
                truncated_chars = len(clean_text) - MAX_TOTAL

                print(f"--- CONTENT START (First {INTRO_SIZE} chars) ---")
                print(intro)
                print(f"\n\n[... {truncated_chars:,} CHARACTERS TRUNCATED - Middle section omitted to preserve intro and conclusion ...]\n\n")
                print(f"--- CONTENT END (Last {OUTRO_SIZE} chars) ---")
                print(outro)
            else:
                print(clean_text)

            browser.close()

    except Exception as e:
        print(f"Error reading webpage: {e}")
        sys.exit(1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", required=True)
    args = parser.parse_args()
    read_webpage(args.url)
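
The intro/outro truncation in read_webpage() is the same rule pdf_reader.py applies to PDF text. Isolated as a standalone sketch; truncate_middle is a hypothetical helper for illustration, and the reported count mirrors the script's own len(text) - MAX_TOTAL figure:

# Sketch of read_webpage()'s truncation rule; truncate_middle is hypothetical, for illustration only.
def truncate_middle(text, max_total=40000, intro_size=8000, outro_size=8000):
    """Keep the first and last portions of long text, marking how much was cut."""
    if len(text) <= max_total:
        return text
    omitted = len(text) - max_total  # same figure the script reports
    return (text[:intro_size]
            + f"\n[... {omitted:,} characters truncated ...]\n"
            + text[-outro_size:])

sample = "x" * 50_000
print(len(truncate_middle(sample)))  # 16,000 kept characters plus the truncation marker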