videonut 1.2.7 → 1.3.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (60)
  1. package/README.md +272 -272
  2. package/USER_GUIDE.md +90 -90
  3. package/agents/core/eic.md +771 -771
  4. package/agents/creative/director.md +246 -246
  5. package/agents/creative/scriptwriter.md +207 -207
  6. package/agents/research/investigator.md +394 -394
  7. package/agents/technical/archivist.md +288 -288
  8. package/agents/technical/scavenger.md +247 -247
  9. package/bin/videonut.js +37 -21
  10. package/config.yaml +61 -61
  11. package/docs/scriptwriter.md +42 -42
  12. package/file_validator.py +186 -186
  13. package/memory/short_term/asset_manifest.md +64 -64
  14. package/memory/short_term/investigation_dossier.md +31 -31
  15. package/memory/short_term/master_script.md +51 -51
  16. package/package.json +61 -64
  17. package/requirements.txt +8 -8
  18. package/setup.js +33 -15
  19. package/tools/check_env.py +76 -76
  20. package/tools/downloaders/caption_reader.py +237 -237
  21. package/tools/downloaders/clip_grabber.py +82 -82
  22. package/tools/downloaders/image_grabber.py +105 -105
  23. package/tools/downloaders/pdf_reader.py +163 -163
  24. package/tools/downloaders/screenshotter.py +58 -58
  25. package/tools/downloaders/web_reader.py +69 -69
  26. package/tools/validators/link_checker.py +45 -45
  27. package/workflow_orchestrator.py +336 -336
  28. package/.claude/commands/archivist.toml +0 -12
  29. package/.claude/commands/director.toml +0 -12
  30. package/.claude/commands/eic.toml +0 -12
  31. package/.claude/commands/investigator.toml +0 -12
  32. package/.claude/commands/prompt.toml +0 -12
  33. package/.claude/commands/scavenger.toml +0 -12
  34. package/.claude/commands/scout.toml +0 -12
  35. package/.claude/commands/scriptwriter.toml +0 -12
  36. package/.claude/commands/seo.toml +0 -12
  37. package/.claude/commands/thumbnail.toml +0 -12
  38. package/.claude/commands/topic_scout.toml +0 -12
  39. package/.gemini/commands/archivist.toml +0 -12
  40. package/.gemini/commands/director.toml +0 -12
  41. package/.gemini/commands/eic.toml +0 -12
  42. package/.gemini/commands/investigator.toml +0 -12
  43. package/.gemini/commands/prompt.toml +0 -12
  44. package/.gemini/commands/scavenger.toml +0 -12
  45. package/.gemini/commands/scout.toml +0 -12
  46. package/.gemini/commands/scriptwriter.toml +0 -12
  47. package/.gemini/commands/seo.toml +0 -12
  48. package/.gemini/commands/thumbnail.toml +0 -12
  49. package/.gemini/commands/topic_scout.toml +0 -12
  50. package/.qwen/commands/archivist.toml +0 -12
  51. package/.qwen/commands/director.toml +0 -12
  52. package/.qwen/commands/eic.toml +0 -12
  53. package/.qwen/commands/investigator.toml +0 -12
  54. package/.qwen/commands/prompt.toml +0 -12
  55. package/.qwen/commands/scavenger.toml +0 -12
  56. package/.qwen/commands/scout.toml +0 -12
  57. package/.qwen/commands/scriptwriter.toml +0 -12
  58. package/.qwen/commands/seo.toml +0 -12
  59. package/.qwen/commands/thumbnail.toml +0 -12
  60. package/.qwen/commands/topic_scout.toml +0 -12
package/tools/downloaders/pdf_reader.py
@@ -1,163 +1,163 @@
- import sys
- import requests
- import io
- import time
- from random import uniform
- from pypdf import PdfReader
- import argparse
- import re
-
- def read_pdf(url, search_term=None, page_number=None):
-     """
-     Read a PDF from URL with optional search and page selection.
-
-     Args:
-         url: URL of the PDF
-         search_term: Optional term to search for in the PDF
-         page_number: Optional specific page to read (1-indexed)
-     """
-     # Add random delay to implement rate limiting
-     delay = uniform(1, 3)
-     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
-     time.sleep(delay)
-
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-     }
-
-     try:
-         response = requests.get(url, headers=headers, timeout=30)
-         response.raise_for_status()
-
-         f = io.BytesIO(response.content)
-         reader = PdfReader(f)
-         total_pages = len(reader.pages)
-
-         print(f"📄 PDF loaded: {total_pages} pages")
-
-         # If specific page requested
-         if page_number:
-             if 1 <= page_number <= total_pages:
-                 text = reader.pages[page_number - 1].extract_text()
-                 print(f"\n--- Page {page_number} of {total_pages} ---")
-                 print(text)
-                 return
-             else:
-                 print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
-                 sys.exit(1)
-
-         # If search term provided, find all occurrences
-         if search_term:
-             print(f"🔍 Searching for: '{search_term}'")
-             matches = []
-             search_lower = search_term.lower()
-
-             for i, page in enumerate(reader.pages):
-                 page_text = page.extract_text()
-                 if page_text and search_lower in page_text.lower():
-                     # Find the context around the match
-                     lines = page_text.split('\n')
-                     for j, line in enumerate(lines):
-                         if search_lower in line.lower():
-                             # Get surrounding context (2 lines before/after)
-                             context_start = max(0, j - 2)
-                             context_end = min(len(lines), j + 3)
-                             context = '\n'.join(lines[context_start:context_end])
-                             matches.append({
-                                 'page': i + 1,
-                                 'line': line.strip(),
-                                 'context': context
-                             })
-
-             if matches:
-                 print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
-                 for idx, match in enumerate(matches[:10]):  # Limit to first 10 matches
-                     print(f"{'='*60}")
-                     print(f"📍 Match {idx+1} - Page {match['page']}")
-                     print(f"{'='*60}")
-                     print(f"Line: {match['line']}")
-                     print(f"\nContext:")
-                     print(match['context'])
-                     print()
-
-                 if len(matches) > 10:
-                     print(f"... and {len(matches) - 10} more matches")
-
-                 # Suggest best page for screenshot
-                 best_page = matches[0]['page']
-                 print(f"\n📸 Suggested page for screenshot: Page {best_page}")
-                 print(f"   Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
-             else:
-                 print(f"❌ No matches found for '{search_term}'")
-             return
-
-         # Default: Smart extraction with priority for first and last pages
-         MAX_PAGES = 15
-         MAX_CHARS = 20000
-
-         text = ""
-         pages_to_read = []
-
-         if total_pages <= MAX_PAGES:
-             pages_to_read = list(range(total_pages))
-         else:
-             # Smart selection: first 7 + last 4
-             first_pages = list(range(min(7, total_pages)))
-             last_pages = list(range(max(0, total_pages - 4), total_pages))
-             pages_to_read = sorted(set(first_pages + last_pages))
-
-             print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")
-
-         for i in pages_to_read:
-             page_text = reader.pages[i].extract_text()
-             if page_text:
-                 text += f"\n--- Page {i+1} ---\n{page_text}"
-
-         # Smart truncation with intro/conclusion preservation
-         if len(text) > MAX_CHARS:
-             intro = text[:8000]
-             outro = text[-8000:]
-             truncated = len(text) - MAX_CHARS
-             print(f"--- PDF CONTENT START ---")
-             print(intro)
-             print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
-             print(f"--- PDF CONTENT END ---")
-             print(outro)
-         else:
-             print(text)
-
-     except requests.exceptions.RequestException as e:
-         print(f"Error downloading PDF: {e}")
-         sys.exit(1)
-     except Exception as e:
-         print(f"Error reading PDF: {e}")
-         sys.exit(1)
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Read and search PDF documents from URLs.",
-         formatter_class=argparse.RawDescriptionHelpFormatter,
-         epilog="""
- Examples:
-   # Read PDF with smart extraction (first + last pages)
-   python pdf_reader.py --url "https://example.com/report.pdf"
-
-   # Search for specific term in PDF
-   python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"
-
-   # Read specific page
-   python pdf_reader.py --url "https://example.com/report.pdf" --page 5
-         """
-     )
-
-     parser.add_argument("--url", required=True, help="URL of the PDF document")
-     parser.add_argument("--search", "-s", help="Search for specific term and show context")
-     parser.add_argument("--page", "-p", type=int, help="Read specific page number (1-indexed)")
-
-     args = parser.parse_args()
-     read_pdf(args.url, search_term=args.search, page_number=args.page)
-
-
- if __name__ == "__main__":
-     main()
+ import sys
+ import requests
+ import io
+ import time
+ from random import uniform
+ from pypdf import PdfReader
+ import argparse
+ import re
+
+ def read_pdf(url, search_term=None, page_number=None):
+     """
+     Read a PDF from URL with optional search and page selection.
+
+     Args:
+         url: URL of the PDF
+         search_term: Optional term to search for in the PDF
+         page_number: Optional specific page to read (1-indexed)
+     """
+     # Add random delay to implement rate limiting
+     delay = uniform(1, 3)
+     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+     time.sleep(delay)
+
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+     }
+
+     try:
+         response = requests.get(url, headers=headers, timeout=30)
+         response.raise_for_status()
+
+         f = io.BytesIO(response.content)
+         reader = PdfReader(f)
+         total_pages = len(reader.pages)
+
+         print(f"📄 PDF loaded: {total_pages} pages")
+
+         # If specific page requested
+         if page_number:
+             if 1 <= page_number <= total_pages:
+                 text = reader.pages[page_number - 1].extract_text()
+                 print(f"\n--- Page {page_number} of {total_pages} ---")
+                 print(text)
+                 return
+             else:
+                 print(f"Error: Page {page_number} not found. PDF has {total_pages} pages.")
+                 sys.exit(1)
+
+         # If search term provided, find all occurrences
+         if search_term:
+             print(f"🔍 Searching for: '{search_term}'")
+             matches = []
+             search_lower = search_term.lower()
+
+             for i, page in enumerate(reader.pages):
+                 page_text = page.extract_text()
+                 if page_text and search_lower in page_text.lower():
+                     # Find the context around the match
+                     lines = page_text.split('\n')
+                     for j, line in enumerate(lines):
+                         if search_lower in line.lower():
+                             # Get surrounding context (2 lines before/after)
+                             context_start = max(0, j - 2)
+                             context_end = min(len(lines), j + 3)
+                             context = '\n'.join(lines[context_start:context_end])
+                             matches.append({
+                                 'page': i + 1,
+                                 'line': line.strip(),
+                                 'context': context
+                             })
+
+             if matches:
+                 print(f"\n✅ Found {len(matches)} matches for '{search_term}':\n")
+                 for idx, match in enumerate(matches[:10]):  # Limit to first 10 matches
+                     print(f"{'='*60}")
+                     print(f"📍 Match {idx+1} - Page {match['page']}")
+                     print(f"{'='*60}")
+                     print(f"Line: {match['line']}")
+                     print(f"\nContext:")
+                     print(match['context'])
+                     print()
+
+                 if len(matches) > 10:
+                     print(f"... and {len(matches) - 10} more matches")
+
+                 # Suggest best page for screenshot
+                 best_page = matches[0]['page']
+                 print(f"\n📸 Suggested page for screenshot: Page {best_page}")
+                 print(f"   Use: python pdf_reader.py --url \"{url}\" --page {best_page}")
+             else:
+                 print(f"❌ No matches found for '{search_term}'")
+             return
+
+         # Default: Smart extraction with priority for first and last pages
+         MAX_PAGES = 15
+         MAX_CHARS = 20000
+
+         text = ""
+         pages_to_read = []
+
+         if total_pages <= MAX_PAGES:
+             pages_to_read = list(range(total_pages))
+         else:
+             # Smart selection: first 7 + last 4
+             first_pages = list(range(min(7, total_pages)))
+             last_pages = list(range(max(0, total_pages - 4), total_pages))
+             pages_to_read = sorted(set(first_pages + last_pages))
+
+             print(f"📄 Document has {total_pages} pages. Reading pages: {[p+1 for p in pages_to_read]} (first + last priority)")
+
+         for i in pages_to_read:
+             page_text = reader.pages[i].extract_text()
+             if page_text:
+                 text += f"\n--- Page {i+1} ---\n{page_text}"
+
+         # Smart truncation with intro/conclusion preservation
+         if len(text) > MAX_CHARS:
+             intro = text[:8000]
+             outro = text[-8000:]
+             truncated = len(text) - MAX_CHARS
+             print(f"--- PDF CONTENT START ---")
+             print(intro)
+             print(f"\n\n[... {truncated:,} characters truncated from middle ...]\n\n")
+             print(f"--- PDF CONTENT END ---")
+             print(outro)
+         else:
+             print(text)
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading PDF: {e}")
+         sys.exit(1)
+     except Exception as e:
+         print(f"Error reading PDF: {e}")
+         sys.exit(1)
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Read and search PDF documents from URLs.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Read PDF with smart extraction (first + last pages)
+   python pdf_reader.py --url "https://example.com/report.pdf"
+
+   # Search for specific term in PDF
+   python pdf_reader.py --url "https://example.com/report.pdf" --search "Prime Minister"
+
+   # Read specific page
+   python pdf_reader.py --url "https://example.com/report.pdf" --page 5
+         """
+     )
+
+     parser.add_argument("--url", required=True, help="URL of the PDF document")
+     parser.add_argument("--search", "-s", help="Search for specific term and show context")
+     parser.add_argument("--page", "-p", type=int, help="Read specific page number (1-indexed)")
+
+     args = parser.parse_args()
+     read_pdf(args.url, search_term=args.search, page_number=args.page)
+
+
+ if __name__ == "__main__":
+     main()
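The smart-extraction default shown above only matters for long documents: once a PDF has more than MAX_PAGES (15) pages, the script keeps just the first seven and the last four. A minimal standalone sketch of that selection rule follows; the 40-page count is purely illustrative and not taken from the package.

# Sketch of the page-selection rule from pdf_reader.py, isolated for clarity.
# total_pages = 40 is an illustrative value, not something shipped in the package.
total_pages = 40
first_pages = list(range(min(7, total_pages)))                   # 0-indexed pages 0-6
last_pages = list(range(max(0, total_pages - 4), total_pages))   # 0-indexed pages 36-39
pages_to_read = sorted(set(first_pages + last_pages))
print([p + 1 for p in pages_to_read])   # [1, 2, 3, 4, 5, 6, 7, 37, 38, 39, 40]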
package/tools/downloaders/screenshotter.py
@@ -1,58 +1,58 @@
- import sys
- import os
- import argparse
- import time
- from random import uniform
- from playwright.sync_api import sync_playwright
-
- def take_screenshot(url, output_path):
-     # Add random delay to implement rate limiting
-     delay = uniform(1, 3)  # Random delay between 1-3 seconds
-     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
-     time.sleep(delay)
-
-     with sync_playwright() as p:
-         browser = p.chromium.launch()
-         page = browser.new_page()
-
-         # Set additional headers to appear more like a real user
-         page.set_extra_http_headers({
-             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-             "Accept-Language": "en-US,en;q=0.5",
-             "Accept-Encoding": "gzip, deflate",
-             "Connection": "keep-alive",
-             "Upgrade-Insecure-Requests": "1",
-         })
-
-         try:
-             print(f"Navigating to {url}...")
-             page.goto(url, timeout=30000)
-             # Wait a bit for dynamic content (e.g., Twitter embeds)
-             page.wait_for_timeout(2000)
-
-             page.screenshot(path=output_path, full_page=False)
-
-             # Validate that the file was created and has content
-             if os.path.exists(output_path):
-                 file_size = os.path.getsize(output_path)
-                 if file_size == 0:
-                     print(f"Error: Screenshot file is empty: {output_path}")
-                     sys.exit(1)
-                 else:
-                     print(f"File validation: {output_path} created with size {file_size} bytes")
-             else:
-                 print(f"Error: Screenshot file does not exist: {output_path}")
-                 sys.exit(1)
-
-         except Exception as e:
-             print(f"Error taking screenshot: {e}")
-             sys.exit(1)
-         finally:
-             browser.close()
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--url", required=True)
-     parser.add_argument("--output", required=True)
-     args = parser.parse_args()
-     take_screenshot(args.url, args.output)
+ import sys
+ import os
+ import argparse
+ import time
+ from random import uniform
+ from playwright.sync_api import sync_playwright
+
+ def take_screenshot(url, output_path):
+     # Add random delay to implement rate limiting
+     delay = uniform(1, 3)  # Random delay between 1-3 seconds
+     print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+     time.sleep(delay)
+
+     with sync_playwright() as p:
+         browser = p.chromium.launch()
+         page = browser.new_page()
+
+         # Set additional headers to appear more like a real user
+         page.set_extra_http_headers({
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+             "Accept-Language": "en-US,en;q=0.5",
+             "Accept-Encoding": "gzip, deflate",
+             "Connection": "keep-alive",
+             "Upgrade-Insecure-Requests": "1",
+         })
+
+         try:
+             print(f"Navigating to {url}...")
+             page.goto(url, timeout=30000)
+             # Wait a bit for dynamic content (e.g., Twitter embeds)
+             page.wait_for_timeout(2000)
+
+             page.screenshot(path=output_path, full_page=False)
+
+             # Validate that the file was created and has content
+             if os.path.exists(output_path):
+                 file_size = os.path.getsize(output_path)
+                 if file_size == 0:
+                     print(f"Error: Screenshot file is empty: {output_path}")
+                     sys.exit(1)
+                 else:
+                     print(f"File validation: {output_path} created with size {file_size} bytes")
+             else:
+                 print(f"Error: Screenshot file does not exist: {output_path}")
+                 sys.exit(1)
+
+         except Exception as e:
+             print(f"Error taking screenshot: {e}")
+             sys.exit(1)
+         finally:
+             browser.close()
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--url", required=True)
+     parser.add_argument("--output", required=True)
+     args = parser.parse_args()
+     take_screenshot(args.url, args.output)
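For reference, the screenshot tool above is driven entirely by its two required flags. A typical invocation might look like the line below; the URL and output filename are placeholders, and the command assumes it is run from package/tools/downloaders/:

  python screenshotter.py --url "https://example.com/article" --output screenshot.png

The script exits with a non-zero status when the output file is missing or empty, so callers can rely on the exit code for validation.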
package/tools/downloaders/web_reader.py
@@ -1,69 +1,69 @@
- import sys
- import argparse
- import time
- from random import uniform
- from playwright.sync_api import sync_playwright
-
- def read_webpage(url):
-     try:
-         # Add random delay to implement rate limiting
-         delay = uniform(1, 3)  # Random delay between 1-3 seconds
-         print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
-         time.sleep(delay)
-
-         with sync_playwright() as p:
-             # Launch browser (headless by default)
-             browser = p.chromium.launch()
-             page = browser.new_page()
-
-             # Set additional headers to appear more like a real user
-             page.set_extra_http_headers({
-                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-                 "Accept-Language": "en-US,en;q=0.5",
-                 "Accept-Encoding": "gzip, deflate",
-                 "Connection": "keep-alive",
-                 "Upgrade-Insecure-Requests": "1",
-             })
-
-             # Navigate with a reasonable timeout
-             page.goto(url, timeout=30000)
-
-             # Wait for content to load (basic heuristic)
-             page.wait_for_load_state("domcontentloaded")
-
-             # Get the text content
-             # We use evaluate to get innerText which mimics what a user sees (hidden text is ignored)
-             text = page.evaluate("document.body.innerText")
-
-             # Basic cleanup: Remove excessive newlines
-             clean_text = '\n'.join([line.strip() for line in text.splitlines() if line.strip()])
-
-             # Smart truncation: Preserve intro AND conclusion (critical for research)
-             MAX_TOTAL = 40000  # Increased from 25000
-             INTRO_SIZE = 8000  # First portion (hook/summary)
-             OUTRO_SIZE = 8000  # Last portion (conclusion/recommendations)
-
-             if len(clean_text) > MAX_TOTAL:
-                 intro = clean_text[:INTRO_SIZE]
-                 outro = clean_text[-OUTRO_SIZE:]
-                 truncated_chars = len(clean_text) - MAX_TOTAL
-
-                 print(f"--- CONTENT START (First {INTRO_SIZE} chars) ---")
-                 print(intro)
-                 print(f"\n\n[... {truncated_chars:,} CHARACTERS TRUNCATED - Middle section omitted to preserve intro and conclusion ...]\n\n")
-                 print(f"--- CONTENT END (Last {OUTRO_SIZE} chars) ---")
-                 print(outro)
-             else:
-                 print(clean_text)
-
-             browser.close()
-
-     except Exception as e:
-         print(f"Error reading webpage: {e}")
-         sys.exit(1)
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--url", required=True)
-     args = parser.parse_args()
-     read_webpage(args.url)
+ import sys
+ import argparse
+ import time
+ from random import uniform
+ from playwright.sync_api import sync_playwright
+
+ def read_webpage(url):
+     try:
+         # Add random delay to implement rate limiting
+         delay = uniform(1, 3)  # Random delay between 1-3 seconds
+         print(f"Rate limiting: Waiting {delay:.2f} seconds before accessing {url}")
+         time.sleep(delay)
+
+         with sync_playwright() as p:
+             # Launch browser (headless by default)
+             browser = p.chromium.launch()
+             page = browser.new_page()
+
+             # Set additional headers to appear more like a real user
+             page.set_extra_http_headers({
+                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                 "Accept-Language": "en-US,en;q=0.5",
+                 "Accept-Encoding": "gzip, deflate",
+                 "Connection": "keep-alive",
+                 "Upgrade-Insecure-Requests": "1",
+             })
+
+             # Navigate with a reasonable timeout
+             page.goto(url, timeout=30000)
+
+             # Wait for content to load (basic heuristic)
+             page.wait_for_load_state("domcontentloaded")
+
+             # Get the text content
+             # We use evaluate to get innerText which mimics what a user sees (hidden text is ignored)
+             text = page.evaluate("document.body.innerText")
+
+             # Basic cleanup: Remove excessive newlines
+             clean_text = '\n'.join([line.strip() for line in text.splitlines() if line.strip()])
+
+             # Smart truncation: Preserve intro AND conclusion (critical for research)
+             MAX_TOTAL = 40000  # Increased from 25000
+             INTRO_SIZE = 8000  # First portion (hook/summary)
+             OUTRO_SIZE = 8000  # Last portion (conclusion/recommendations)
+
+             if len(clean_text) > MAX_TOTAL:
+                 intro = clean_text[:INTRO_SIZE]
+                 outro = clean_text[-OUTRO_SIZE:]
+                 truncated_chars = len(clean_text) - MAX_TOTAL
+
+                 print(f"--- CONTENT START (First {INTRO_SIZE} chars) ---")
+                 print(intro)
+                 print(f"\n\n[... {truncated_chars:,} CHARACTERS TRUNCATED - Middle section omitted to preserve intro and conclusion ...]\n\n")
+                 print(f"--- CONTENT END (Last {OUTRO_SIZE} chars) ---")
+                 print(outro)
+             else:
+                 print(clean_text)
+
+             browser.close()
+
+     except Exception as e:
+         print(f"Error reading webpage: {e}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--url", required=True)
+     args = parser.parse_args()
+     read_webpage(args.url)
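Similarly, the web reader above takes a single --url flag, prints the extracted page text to stdout, and truncates the middle of anything longer than 40,000 characters while preserving the first and last 8,000. A placeholder invocation (the URL is illustrative), again assuming it is run from package/tools/downloaders/:

  python web_reader.py --url "https://example.com/long-report"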