videonut 1.2.8 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/README.md +272 -272
  2. package/USER_GUIDE.md +90 -90
  3. package/agents/core/eic.md +771 -771
  4. package/agents/creative/director.md +246 -246
  5. package/agents/creative/scriptwriter.md +207 -207
  6. package/agents/research/investigator.md +394 -394
  7. package/agents/technical/archivist.md +288 -288
  8. package/agents/technical/scavenger.md +247 -247
  9. package/config.yaml +61 -61
  10. package/docs/scriptwriter.md +42 -42
  11. package/file_validator.py +186 -186
  12. package/memory/short_term/asset_manifest.md +64 -64
  13. package/memory/short_term/investigation_dossier.md +31 -31
  14. package/memory/short_term/master_script.md +51 -51
  15. package/package.json +61 -64
  16. package/requirements.txt +8 -8
  17. package/tools/check_env.py +76 -76
  18. package/tools/downloaders/caption_reader.py +237 -237
  19. package/tools/downloaders/clip_grabber.py +82 -82
  20. package/tools/downloaders/image_grabber.py +105 -105
  21. package/tools/downloaders/pdf_reader.py +163 -163
  22. package/tools/downloaders/screenshotter.py +58 -58
  23. package/tools/downloaders/web_reader.py +69 -69
  24. package/tools/validators/link_checker.py +45 -45
  25. package/workflow_orchestrator.py +336 -336
  26. package/.claude/commands/archivist.toml +0 -12
  27. package/.claude/commands/director.toml +0 -12
  28. package/.claude/commands/eic.toml +0 -12
  29. package/.claude/commands/investigator.toml +0 -12
  30. package/.claude/commands/prompt.toml +0 -12
  31. package/.claude/commands/scavenger.toml +0 -12
  32. package/.claude/commands/scout.toml +0 -12
  33. package/.claude/commands/scriptwriter.toml +0 -12
  34. package/.claude/commands/seo.toml +0 -12
  35. package/.claude/commands/thumbnail.toml +0 -12
  36. package/.claude/commands/topic_scout.toml +0 -12
  37. package/.gemini/commands/archivist.toml +0 -12
  38. package/.gemini/commands/director.toml +0 -12
  39. package/.gemini/commands/eic.toml +0 -12
  40. package/.gemini/commands/investigator.toml +0 -12
  41. package/.gemini/commands/prompt.toml +0 -12
  42. package/.gemini/commands/scavenger.toml +0 -12
  43. package/.gemini/commands/scout.toml +0 -12
  44. package/.gemini/commands/scriptwriter.toml +0 -12
  45. package/.gemini/commands/seo.toml +0 -12
  46. package/.gemini/commands/thumbnail.toml +0 -12
  47. package/.gemini/commands/topic_scout.toml +0 -12
  48. package/.qwen/commands/archivist.toml +0 -12
  49. package/.qwen/commands/director.toml +0 -12
  50. package/.qwen/commands/eic.toml +0 -12
  51. package/.qwen/commands/investigator.toml +0 -12
  52. package/.qwen/commands/prompt.toml +0 -12
  53. package/.qwen/commands/scavenger.toml +0 -12
  54. package/.qwen/commands/scout.toml +0 -12
  55. package/.qwen/commands/scriptwriter.toml +0 -12
  56. package/.qwen/commands/seo.toml +0 -12
  57. package/.qwen/commands/thumbnail.toml +0 -12
  58. package/.qwen/commands/topic_scout.toml +0 -12
package/tools/downloaders/image_grabber.py
@@ -1,106 +1,106 @@
- import os
- import sys
- import requests
- import argparse
- import mimetypes
- from urllib.parse import urlparse
-
- def is_safe_image_type(content_type, url):
-     """
-     Check if the content type is a safe image type.
-     """
-     safe_types = {
-         'image/jpeg', 'image/jpg', 'image/png', 'image/gif',
-         'image/webp', 'image/bmp', 'image/svg+xml', 'image/tiff'
-     }
-
-     # Check content-type header
-     if content_type and content_type.lower() in safe_types:
-         return True
-
-     # Fallback: check file extension from URL
-     parsed_url = urlparse(url)
-     file_ext = os.path.splitext(parsed_url.path)[1].lower()
-     mime_type, _ = mimetypes.guess_type(f"dummy{file_ext}")
-
-     if mime_type and mime_type in safe_types:
-         return True
-
-     return False
-
- def get_file_size(response):
-     """
-     Get the file size from the response headers.
-     """
-     content_length = response.headers.get('content-length')
-     if content_length:
-         return int(content_length)
-     return 0
-
- def download_image(url, output_path):
-     """
-     Downloads an image from a URL with security validation.
-     """
-     # Ensure output directory exists if it's not the current directory
-     dir_name = os.path.dirname(output_path)
-     if dir_name:
-         os.makedirs(dir_name, exist_ok=True)
-
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-     }
-
-     try:
-         print(f"Downloading image from: {url}")
-
-         # First, make a HEAD request to check content type and size
-         head_response = requests.head(url, headers=headers, timeout=10)
-         content_type = head_response.headers.get('content-type', '').lower()
-
-         # Validate content type
-         if not is_safe_image_type(content_type, url):
-             print(f"Security Error: Content type '{content_type}' is not a safe image type.")
-             sys.exit(1)
-
-         # Check file size (limit to 50MB)
-         file_size = get_file_size(head_response)
-         if file_size > 50 * 1024 * 1024: # 50MB
-             print(f"Security Error: File size {file_size} bytes exceeds 50MB limit.")
-             sys.exit(1)
-
-         # Actually download the file
-         response = requests.get(url, headers=headers, stream=True, timeout=10)
-         response.raise_for_status()
-
-         # Double-check content type after download
-         downloaded_content_type = response.headers.get('content-type', '').lower()
-         if not is_safe_image_type(downloaded_content_type, url):
-             print(f"Security Error: Downloaded content type '{downloaded_content_type}' is not a safe image type.")
-             sys.exit(1)
-
-         # Write file in chunks with size validation
-         total_size = 0
-         with open(output_path, 'wb') as f:
-             for chunk in response.iter_content(chunk_size=8192):
-                 if chunk: # Filter out keep-alive chunks
-                     total_size += len(chunk)
-                     if total_size > 50 * 1024 * 1024: # 50MB limit
-                         print(f"Security Error: Downloaded file exceeds 50MB limit.")
-                         os.remove(output_path) # Clean up partial file
-                         sys.exit(1)
-                     f.write(chunk)
-
-         print(f"Successfully saved to {output_path}")
-
-     except Exception as e:
-         print(f"Failed to download image: {e}")
-         sys.exit(1)
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser(description="Download an image.")
-     parser.add_argument("--url", required=True, help="Image URL")
-     parser.add_argument("--output", required=True, help="Output file path")
-
-     args = parser.parse_args()
-
+ import os
+ import sys
+ import requests
+ import argparse
+ import mimetypes
+ from urllib.parse import urlparse
+
+ def is_safe_image_type(content_type, url):
+     """
+     Check if the content type is a safe image type.
+     """
+     safe_types = {
+         'image/jpeg', 'image/jpg', 'image/png', 'image/gif',
+         'image/webp', 'image/bmp', 'image/svg+xml', 'image/tiff'
+     }
+
+     # Check content-type header
+     if content_type and content_type.lower() in safe_types:
+         return True
+
+     # Fallback: check file extension from URL
+     parsed_url = urlparse(url)
+     file_ext = os.path.splitext(parsed_url.path)[1].lower()
+     mime_type, _ = mimetypes.guess_type(f"dummy{file_ext}")
+
+     if mime_type and mime_type in safe_types:
+         return True
+
+     return False
+
+ def get_file_size(response):
+     """
+     Get the file size from the response headers.
+     """
+     content_length = response.headers.get('content-length')
+     if content_length:
+         return int(content_length)
+     return 0
+
+ def download_image(url, output_path):
+     """
+     Downloads an image from a URL with security validation.
+     """
+     # Ensure output directory exists if it's not the current directory
+     dir_name = os.path.dirname(output_path)
+     if dir_name:
+         os.makedirs(dir_name, exist_ok=True)
+
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+     }
+
+     try:
+         print(f"Downloading image from: {url}")
+
+         # First, make a HEAD request to check content type and size
+         head_response = requests.head(url, headers=headers, timeout=10)
+         content_type = head_response.headers.get('content-type', '').lower()
+
+         # Validate content type
+         if not is_safe_image_type(content_type, url):
+             print(f"Security Error: Content type '{content_type}' is not a safe image type.")
+             sys.exit(1)
+
+         # Check file size (limit to 50MB)
+         file_size = get_file_size(head_response)
+         if file_size > 50 * 1024 * 1024: # 50MB
+             print(f"Security Error: File size {file_size} bytes exceeds 50MB limit.")
+             sys.exit(1)
+
+         # Actually download the file
+         response = requests.get(url, headers=headers, stream=True, timeout=10)
+         response.raise_for_status()
+
+         # Double-check content type after download
+         downloaded_content_type = response.headers.get('content-type', '').lower()
+         if not is_safe_image_type(downloaded_content_type, url):
+             print(f"Security Error: Downloaded content type '{downloaded_content_type}' is not a safe image type.")
+             sys.exit(1)
+
+         # Write file in chunks with size validation
+         total_size = 0
+         with open(output_path, 'wb') as f:
+             for chunk in response.iter_content(chunk_size=8192):
+                 if chunk: # Filter out keep-alive chunks
+                     total_size += len(chunk)
+                     if total_size > 50 * 1024 * 1024: # 50MB limit
+                         print(f"Security Error: Downloaded file exceeds 50MB limit.")
+                         os.remove(output_path) # Clean up partial file
+                         sys.exit(1)
+                     f.write(chunk)
+
+         print(f"Successfully saved to {output_path}")
+
+     except Exception as e:
+         print(f"Failed to download image: {e}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Download an image.")
+     parser.add_argument("--url", required=True, help="Image URL")
+     parser.add_argument("--output", required=True, help="Output file path")
+
+     args = parser.parse_args()
+
      download_image(args.url, args.output)
package/tools/downloaders/pdf_reader.py
@@ -1,163 +1,163 @@
- import sys
- import requests
- import io
- import time
- from random import uniform
- from pypdf import PdfReader
- import argparse
- import re
-
- def read_pdf(url, search_term=None, page_number=None):
-     """
-     Read a PDF from URL with optional search and page selection.
-
-     Args:
-         url: URL of the PDF
-         search_term: Optional term to search for in the PDF
-         page_number: Optional specific page to read (1-indexed)
-     """
-     # Add random delay to implement rate limiting
-     delay = uniform(1, 3)
-     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
-     time.sleep(delay)
-
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-     }
-
-     try:
-         response = requests.get(url, headers=headers, timeout=30)
-         response.raise_for_status()
-
-         f = io.BytesIO(response.content)
-         reader = PdfReader(f)
-         total_pages = len(reader.pages)
-
-         print(f"📄 PDF loaded: {total_pages} pages")
-
-         # If specific page requested
-         if page_number:
-             if 1 <= page_number <= total_pages:
-                 text = reader.pages[page_number - 1].extract_text()
-                 print(f"\n--- Page {page_number} of {total_pages} ---")
-                 print(text)
-                 return
-             else:
-                 print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
-                 sys.exit(1)
-
-         # If search term provided, find all occurrences
-         if search_term:
-             print(f"🔍 Searching for: '{search_term}'")
-             matches = []
-             search_lower = search_term.lower()
-
-             for i, page in enumerate(reader.pages):
-                 page_text = page.extract_text()
-                 if page_text and search_lower in page_text.lower():
-                     # Find the context around the match
-                     lines = page_text.split('\n')
-                     for j, line in enumerate(lines):
-                         if search_lower in line.lower():
-                             # Get surrounding context (2 lines before/after)
-                             context_start = max(0, j - 2)
-                             context_end = min(len(lines), j + 3)
-                             context = '\n'.join(lines[context_start:context_end])
-                             matches.append({
-                                 'page': i + 1,
-                                 'line': line.strip(),
-                                 'context': context
-                             })
-
-             if matches:
-                 print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
-                 for idx, match in enumerate(matches[:10]): # Limit to first 10 matches
-                     print(f"{'='*60}")
-                     print(f"📍 Match {idx+1} - Page {match['page']}")
-                     print(f"{'='*60}")
-                     print(f"Line: {match['line']}")
-                     print(f"\nContext:")
-                     print(match['context'])
-                     print()
-
-                 if len(matches) > 10:
-                     print(f"... and {len(matches) - 10} more matches")
-
-                 # Suggest best page for screenshot
-                 best_page = matches[0]['page']
-                 print(f"\n📸 Suggested page for screenshot: Page {best_page}")
-                 print(f" Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
-             else:
-                 print(f"❌ No matches found for '{search_term}'")
-             return
-
-         # Default: Smart extraction with priority for first and last pages
-         MAX_PAGES = 15
-         MAX_CHARS = 20000
-
-         text = ""
-         pages_to_read = []
-
-         if total_pages <= MAX_PAGES:
-             pages_to_read = list(range(total_pages))
-         else:
-             # Smart selection: first 7 + last 4
-             first_pages = list(range(min(7, total_pages)))
-             last_pages = list(range(max(0, total_pages - 4), total_pages))
-             pages_to_read = sorted(set(first_pages + last_pages))
-
-             print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")
-
-         for i in pages_to_read:
-             page_text = reader.pages[i].extract_text()
-             if page_text:
-                 text += f"\n--- Page {i+1} ---\n{page_text}"
-
-         # Smart truncation with intro/conclusion preservation
-         if len(text) > MAX_CHARS:
-             intro = text[:8000]
-             outro = text[-8000:]
-             truncated = len(text) - MAX_CHARS
-             print(f"--- PDF CONTENT START ---")
-             print(intro)
-             print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
-             print(f"--- PDF CONTENT END ---")
-             print(outro)
-         else:
-             print(text)
-
-     except requests.exceptions.RequestException as e:
-         print(f"Error downloading PDF: {e}")
-         sys.exit(1)
-     except Exception as e:
-         print(f"Error reading PDF: {e}")
-         sys.exit(1)
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Read and search PDF documents from URLs.",
-         formatter_class=argparse.RawDescriptionHelpFormatter,
-         epilog="""
- Examples:
-   # Read PDF with smart extraction (first + last pages)
-   python pdf_reader.py --url "https://example.com/report.pdf"
-
-   # Search for specific term in PDF
-   python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"
-
-   # Read specific page
-   python pdf_reader.py --url "https://example.com/report.pdf" --page 5
-         """
-     )
-
-     parser.add_argument("--url", required=True, help="URL of the PDF document")
-     parser.add_argument("--search", "-s", help="Search for specific term and show context")
-     parser.add_argument("--page", "-p", type=int, help="Read specific page number (1-indexed)")
-
-     args = parser.parse_args()
-     read_pdf(args.url, search_term=args.search, page_number=args.page)
-
-
- if __name__ == "__main__":
-     main()
+ import sys
+ import requests
+ import io
+ import time
+ from random import uniform
+ from pypdf import PdfReader
+ import argparse
+ import re
+
+ def read_pdf(url, search_term=None, page_number=None):
+     """
+     Read a PDF from URL with optional search and page selection.
+
+     Args:
+         url: URL of the PDF
+         search_term: Optional term to search for in the PDF
+         page_number: Optional specific page to read (1-indexed)
+     """
+     # Add random delay to implement rate limiting
+     delay = uniform(1, 3)
+     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+     time.sleep(delay)
+
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+     }
+
+     try:
+         response = requests.get(url, headers=headers, timeout=30)
+         response.raise_for_status()
+
+         f = io.BytesIO(response.content)
+         reader = PdfReader(f)
+         total_pages = len(reader.pages)
+
+         print(f"📄 PDF loaded: {total_pages} pages")
+
+         # If specific page requested
+         if page_number:
+             if 1 <= page_number <= total_pages:
+                 text = reader.pages[page_number - 1].extract_text()
+                 print(f"\n--- Page {page_number} of {total_pages} ---")
+                 print(text)
+                 return
+             else:
+                 print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
+                 sys.exit(1)
+
+         # If search term provided, find all occurrences
+         if search_term:
+             print(f"🔍 Searching for: '{search_term}'")
+             matches = []
+             search_lower = search_term.lower()
+
+             for i, page in enumerate(reader.pages):
+                 page_text = page.extract_text()
+                 if page_text and search_lower in page_text.lower():
+                     # Find the context around the match
+                     lines = page_text.split('\n')
+                     for j, line in enumerate(lines):
+                         if search_lower in line.lower():
+                             # Get surrounding context (2 lines before/after)
+                             context_start = max(0, j - 2)
+                             context_end = min(len(lines), j + 3)
+                             context = '\n'.join(lines[context_start:context_end])
+                             matches.append({
+                                 'page': i + 1,
+                                 'line': line.strip(),
+                                 'context': context
+                             })
+
+             if matches:
+                 print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
+                 for idx, match in enumerate(matches[:10]): # Limit to first 10 matches
+                     print(f"{'='*60}")
+                     print(f"📍 Match {idx+1} - Page {match['page']}")
+                     print(f"{'='*60}")
+                     print(f"Line: {match['line']}")
+                     print(f"\nContext:")
+                     print(match['context'])
+                     print()
+
+                 if len(matches) > 10:
+                     print(f"... and {len(matches) - 10} more matches")
+
+                 # Suggest best page for screenshot
+                 best_page = matches[0]['page']
+                 print(f"\n📸 Suggested page for screenshot: Page {best_page}")
+                 print(f" Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
+             else:
+                 print(f"❌ No matches found for '{search_term}'")
+             return
+
+         # Default: Smart extraction with priority for first and last pages
+         MAX_PAGES = 15
+         MAX_CHARS = 20000
+
+         text = ""
+         pages_to_read = []
+
+         if total_pages <= MAX_PAGES:
+             pages_to_read = list(range(total_pages))
+         else:
+             # Smart selection: first 7 + last 4
+             first_pages = list(range(min(7, total_pages)))
+             last_pages = list(range(max(0, total_pages - 4), total_pages))
+             pages_to_read = sorted(set(first_pages + last_pages))
+
+             print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")
+
+         for i in pages_to_read:
+             page_text = reader.pages[i].extract_text()
+             if page_text:
+                 text += f"\n--- Page {i+1} ---\n{page_text}"
+
+         # Smart truncation with intro/conclusion preservation
+         if len(text) > MAX_CHARS:
+             intro = text[:8000]
+             outro = text[-8000:]
+             truncated = len(text) - MAX_CHARS
+             print(f"--- PDF CONTENT START ---")
+             print(intro)
+             print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
+             print(f"--- PDF CONTENT END ---")
+             print(outro)
+         else:
+             print(text)
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading PDF: {e}")
+         sys.exit(1)
+     except Exception as e:
+         print(f"Error reading PDF: {e}")
+         sys.exit(1)
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Read and search PDF documents from URLs.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Read PDF with smart extraction (first + last pages)
+   python pdf_reader.py --url "https://example.com/report.pdf"
+
+   # Search for specific term in PDF
+   python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"
+
+   # Read specific page
+   python pdf_reader.py --url "https://example.com/report.pdf" --page 5
+         """
+     )
+
+     parser.add_argument("--url", required=True, help="URL of the PDF document")
+     parser.add_argument("--search", "-s", help="Search for specific term and show context")
+     parser.add_argument("--page", "-p", type=int, help="Read specific page number (1-indexed)")
+
+     args = parser.parse_args()
+     read_pdf(args.url, search_term=args.search, page_number=args.page)
+
+
+ if __name__ == "__main__":
+     main()
package/tools/downloaders/screenshotter.py
@@ -1,58 +1,58 @@
- import sys
- import os
- import argparse
- import time
- from random import uniform
- from playwright.sync_api import sync_playwright
-
- def take_screenshot(url, output_path):
-     # Add random delay to implement rate limiting
-     delay = uniform(1, 3) # Random delay between 1-3 seconds
-     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
-     time.sleep(delay)
-
-     with sync_playwright() as p:
-         browser = p.chromium.launch()
-         page = browser.new_page()
-
-         # Set additional headers to appear more like a real user
-         page.set_extra_http_headers({
-             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-             "Accept-Language": "en-US,en;q=0.5",
-             "Accept-Encoding": "gzip, deflate",
-             "Connection": "keep-alive",
-             "Upgrade-Insecure-Requests": "1",
-         })
-
-         try:
-             print(f"Navigating to {url}...")
-             page.goto(url, timeout=30000)
-             # Wait a bit for dynamic content (e.g., Twitter embeds)
-             page.wait_for_timeout(2000)
-
-             page.screenshot(path=output_path, full_page=False)
-
-             # Validate that the file was created and has content
-             if os.path.exists(output_path):
-                 file_size = os.path.getsize(output_path)
-                 if file_size == 0:
-                     print(f"Error: Screenshot file is empty: {output_path}")
-                     sys.exit(1)
-                 else:
-                     print(f"File validation: {output_path} created with size {file_size} bytes")
-             else:
-                 print(f"Error: Screenshot file does not exist: {output_path}")
-                 sys.exit(1)
-
-         except Exception as e:
-             print(f"Error taking screenshot: {e}")
-             sys.exit(1)
-         finally:
-             browser.close()
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--url", required=True)
-     parser.add_argument("--output", required=True)
-     args = parser.parse_args()
-     take_screenshot(args.url, args.output)
+ import sys
+ import os
+ import argparse
+ import time
+ from random import uniform
+ from playwright.sync_api import sync_playwright
+
+ def take_screenshot(url, output_path):
+     # Add random delay to implement rate limiting
+     delay = uniform(1, 3) # Random delay between 1-3 seconds
+     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+     time.sleep(delay)
+
+     with sync_playwright() as p:
+         browser = p.chromium.launch()
+         page = browser.new_page()
+
+         # Set additional headers to appear more like a real user
+         page.set_extra_http_headers({
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+             "Accept-Language": "en-US,en;q=0.5",
+             "Accept-Encoding": "gzip, deflate",
+             "Connection": "keep-alive",
+             "Upgrade-Insecure-Requests": "1",
+         })
+
+         try:
+             print(f"Navigating to {url}...")
+             page.goto(url, timeout=30000)
+             # Wait a bit for dynamic content (e.g., Twitter embeds)
+             page.wait_for_timeout(2000)
+
+             page.screenshot(path=output_path, full_page=False)
+
+             # Validate that the file was created and has content
+             if os.path.exists(output_path):
+                 file_size = os.path.getsize(output_path)
+                 if file_size == 0:
+                     print(f"Error: Screenshot file is empty: {output_path}")
+                     sys.exit(1)
+                 else:
+                     print(f"File validation: {output_path} created with size {file_size} bytes")
+             else:
+                 print(f"Error: Screenshot file does not exist: {output_path}")
+                 sys.exit(1)
+
+         except Exception as e:
+             print(f"Error taking screenshot: {e}")
+             sys.exit(1)
+         finally:
+             browser.close()
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--url", required=True)
+     parser.add_argument("--output", required=True)
+     args = parser.parse_args()
+     take_screenshot(args.url, args.output)