superbrain-server 1.0.2-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/superbrain.js +196 -0
- package/package.json +23 -0
- package/payload/.dockerignore +45 -0
- package/payload/.env.example +58 -0
- package/payload/Dockerfile +73 -0
- package/payload/analyzers/__init__.py +0 -0
- package/payload/analyzers/audio_transcribe.py +225 -0
- package/payload/analyzers/caption.py +244 -0
- package/payload/analyzers/music_identifier.py +346 -0
- package/payload/analyzers/text_analyzer.py +117 -0
- package/payload/analyzers/visual_analyze.py +218 -0
- package/payload/analyzers/webpage_analyzer.py +789 -0
- package/payload/analyzers/youtube_analyzer.py +320 -0
- package/payload/api.py +1676 -0
- package/payload/config/.api_keys.example +22 -0
- package/payload/config/model_rankings.json +492 -0
- package/payload/config/openrouter_free_models.json +1364 -0
- package/payload/config/whisper_model.txt +1 -0
- package/payload/config_settings.py +185 -0
- package/payload/core/__init__.py +0 -0
- package/payload/core/category_manager.py +219 -0
- package/payload/core/database.py +811 -0
- package/payload/core/link_checker.py +300 -0
- package/payload/core/model_router.py +1253 -0
- package/payload/docker-compose.yml +120 -0
- package/payload/instagram/__init__.py +0 -0
- package/payload/instagram/instagram_downloader.py +253 -0
- package/payload/instagram/instagram_login.py +190 -0
- package/payload/main.py +912 -0
- package/payload/requirements.txt +39 -0
- package/payload/reset.py +311 -0
- package/payload/start-docker-prod.sh +125 -0
- package/payload/start-docker.sh +56 -0
- package/payload/start.py +1302 -0
- package/payload/static/favicon.ico +0 -0
- package/payload/stop-docker.sh +16 -0
- package/payload/utils/__init__.py +0 -0
- package/payload/utils/db_stats.py +108 -0
- package/payload/utils/manage_token.py +91 -0
|
@@ -0,0 +1,789 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Web Page Analyzer for SuperBrain
|
|
4
|
+
==================================
|
|
5
|
+
Multi-strategy fetcher with platform-aware content extraction.
|
|
6
|
+
|
|
7
|
+
Fetch priority chain:
|
|
8
|
+
1. Reddit → official .json API (no scraping needed)
|
|
9
|
+
2. Medium → scribe.rip → freedium.cfd proxy chain
|
|
10
|
+
3. newspaper4k → fast article parser, works on most news/blog sites
|
|
11
|
+
4. trafilatura → best-in-class boilerplate remover, handles forums
|
|
12
|
+
5. Wayback Machine → archive.org snapshot for blocked/paywalled pages
|
|
13
|
+
6. BeautifulSoup → raw HTML last-resort fallback
|
|
14
|
+
|
|
15
|
+
Thumbnail priority:
|
|
16
|
+
og:image / twitter:image → article first <img> → platform favicon URL → SVG card
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
import re
|
|
21
|
+
import base64
|
|
22
|
+
import json
|
|
23
|
+
import time
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from urllib.parse import urlparse, urljoin
|
|
26
|
+
|
|
27
|
+
# Ensure backend root is in sys.path (needed when run as a subprocess)
|
|
28
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Prompt template
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
_WEB_PROMPT_TPL = """Analyze the following web page content and generate a structured report.
|
|
36
|
+
|
|
37
|
+
URL: {url}
|
|
38
|
+
Page Title: {page_title}
|
|
39
|
+
|
|
40
|
+
CONTENT:
|
|
41
|
+
{content}
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
Generate the report in this EXACT format (use these exact emoji headers):
|
|
46
|
+
|
|
47
|
+
📌 TITLE:
|
|
48
|
+
[Clear descriptive title for this content]
|
|
49
|
+
|
|
50
|
+
📝 SUMMARY:
|
|
51
|
+
[3-5 sentence summary covering: main topic, key information, important facts,
|
|
52
|
+
any products/places/tools mentioned, and the overall purpose of the page]
|
|
53
|
+
|
|
54
|
+
🏷️ TAGS:
|
|
55
|
+
[8-12 relevant hashtags/keywords separated by spaces, e.g. #python #tutorial #beginners]
|
|
56
|
+
|
|
57
|
+
🎵 MUSIC:
|
|
58
|
+
[N/A — web page]
|
|
59
|
+
|
|
60
|
+
📂 CATEGORY:
|
|
61
|
+
[Choose exactly ONE from: product, places, recipe, software, book, tv shows, workout, film, event, other]
|
|
62
|
+
|
|
63
|
+
Be specific and factual. Extract real names, numbers, and details from the content."""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
# Browser-like headers (shared across all strategies)
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
# Desktop-Chrome-like request headers shared by every fetch strategy,
# intended to reduce naive bot blocking. Keep the User-Agent version current.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
# Platform detection helpers
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
def _netloc(url: str) -> str:
|
|
93
|
+
return urlparse(url).netloc.lower()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _is_reddit(url: str) -> bool:
    """True when *url* points at reddit.com or the redd.it shortener."""
    host = _netloc(url)
    return any(domain in host for domain in ("reddit.com", "redd.it"))
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Known custom-domain Medium publications (add more as needed).
# Checked by exact netloc match in _is_medium(); medium.com subdomains
# are matched separately there.
_MEDIUM_CUSTOM_DOMAINS = {
    "towardsdatascience.com", "bettermarketing.pub", "uxdesign.cc",
    "itnext.io", "betterprogramming.pub", "entrepreneurshandbook.co",
    "theascent.pub", "personal-growth.org", "onezero.medium.com",
    "writingcooperative.com",
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _is_medium(url: str) -> bool:
    """Detect Medium-hosted articles: medium.com subdomains plus known custom-domain publications."""
    host = _netloc(url)
    # "medium.com" substring covers safeti.medium.com, medium.com, etc.
    return "medium.com" in host or host in _MEDIUM_CUSTOM_DOMAINS
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _is_hacker_news(url: str) -> bool:
    """True for Hacker News (news.ycombinator.com) URLs."""
    host = _netloc(url)
    return "news.ycombinator.com" in host
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# Thumbnail helpers
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
def _abs_url(src: str, page_url: str) -> str:
|
|
127
|
+
"""Convert relative/protocol-relative URL to absolute."""
|
|
128
|
+
if not src:
|
|
129
|
+
return ""
|
|
130
|
+
if src.startswith("//"):
|
|
131
|
+
return "https:" + src
|
|
132
|
+
if src.startswith("/"):
|
|
133
|
+
p = urlparse(page_url)
|
|
134
|
+
return f"{p.scheme}://{p.netloc}{src}"
|
|
135
|
+
if not src.startswith("http"):
|
|
136
|
+
return urljoin(page_url, src)
|
|
137
|
+
return src
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _get_favicon_url(url: str) -> str:
|
|
141
|
+
"""
|
|
142
|
+
Return a Google-served favicon URL for the domain.
|
|
143
|
+
sz=128 returns up to 128x128 PNG — always resolves (falls back to globe icon).
|
|
144
|
+
"""
|
|
145
|
+
p = urlparse(url)
|
|
146
|
+
domain = f"{p.scheme}://{p.netloc}"
|
|
147
|
+
return f"https://www.google.com/s2/favicons?sz=128&domain_url={domain}"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# Background palette for generated SVG placeholder cards; one colour is
# picked deterministically per domain in _make_svg_placeholder().
# NOTE(review): the name says "GREY" but these are saturated colours.
_GREY_SVG_COLORS = [
    "#4F46E5", "#0891B2", "#059669", "#D97706",
    "#DC2626", "#7C3AED", "#DB2777", "#0369A1",
]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _make_svg_placeholder(domain: str) -> str:
    """Build a 480x270 data-URI SVG card showing *domain* on a per-domain colour."""
    # Stable colour: the same domain always hashes to the same palette entry.
    idx = sum(ord(ch) for ch in domain) % len(_GREY_SVG_COLORS)
    colour = _GREY_SVG_COLORS[idx]
    label = domain[:30]
    svg = (
        f'<svg xmlns="http://www.w3.org/2000/svg" width="480" height="270">'
        f'<rect width="480" height="270" fill="{colour}"/>'
        f'<text x="240" y="135" font-family="system-ui,Arial,sans-serif" '
        f'font-size="22" font-weight="bold" fill="rgba(255,255,255,0.9)" '
        f'text-anchor="middle" dominant-baseline="middle">{label}</text>'
        f'</svg>'
    )
    encoded = base64.b64encode(svg.encode()).decode()
    return "data:image/svg+xml;base64," + encoded
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _extract_og_image(soup, page_url: str) -> str:
    """
    Best content image for a page.

    Priority: og:image → twitter:image → first sizable <img> inside a
    known content container. Returns "" when nothing suitable is found.
    """
    # 1. Open Graph image meta tags.
    for prop in ("og:image", "og:image:secure_url"):
        tag = soup.find("meta", property=prop)
        if tag and tag.get("content"):
            return _abs_url(tag["content"].strip(), page_url)
    # 2. Twitter card image meta tags.
    for name in ("twitter:image", "twitter:image:src"):
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return _abs_url(tag["content"].strip(), page_url)
    # 3. First sizable <img> inside known content containers.
    for sel in ("article", "main", '[role="main"]', ".post-content", ".entry-content", ".content"):
        el = soup.select_one(sel)
        if not el:
            continue
        for img in el.find_all("img", src=True):
            src = _abs_url(img["src"].strip(), page_url)
            # Skip tracking pixels / tiny images. BUGFIX: widths like
            # "100%" or "auto" used to raise an uncaught ValueError in
            # int(); treat non-numeric widths as unknown, not as tiny.
            w = img.get("width", "")
            if w:
                try:
                    if int(str(w).rstrip("px") or 0) < 50:
                        continue
                except (ValueError, TypeError):
                    pass  # non-numeric width attribute: don't filter on it
            if src.startswith("http"):
                return src
    return ""
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _resolve_thumbnail(soup, page_url: str) -> str:
    """Pick the best thumbnail: OG/content image when available, else the domain favicon."""
    image = _extract_og_image(soup, page_url) if soup else ""
    # Favicons are recognisable icons for big platforms (Medium, Reddit, GitHub, ...).
    return image or _get_favicon_url(page_url)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
# Strategy 1 – Reddit JSON API
|
|
207
|
+
# ---------------------------------------------------------------------------
|
|
208
|
+
|
|
209
|
+
def _fetch_reddit(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    Fetch a Reddit post plus top comments via the public ``.json`` endpoint.

    Works on any reddit.com/r/.../comments/... URL.

    Returns:
        (title, text, image_url, author, post_date) or None when the
        response does not look like a post listing.
    """
    import requests
    import datetime as _dt

    # Normalise: strip query/fragment, ensure .json suffix.
    p = urlparse(url)
    clean = f"{p.scheme}://{p.netloc}{p.path.rstrip('/')}/.json"

    r = requests.get(
        clean,
        headers={**_HEADERS, "Accept": "application/json"},
        timeout=timeout,
        allow_redirects=True,
    )
    r.raise_for_status()
    data = r.json()

    # Reddit returns a list of two listings: [post_listing, comments_listing]
    if not (isinstance(data, list) and len(data) >= 1):
        return None

    post_listing = data[0].get("data", {}).get("children", [])
    if not post_listing:
        return None

    post = post_listing[0].get("data", {})
    title = post.get("title", "")
    selftext = post.get("selftext", "")          # markdown body of text posts
    url_field = post.get("url", "")              # link posts point here
    author = post.get("author", "")
    sub = post.get("subreddit_name_prefixed", "")
    score = post.get("score", 0)
    thumbnail_url = post.get("thumbnail", "")    # Reddit thumbnail
    preview = post.get("preview", {}).get("images", [])

    # Better image: use preview image > thumbnail field > favicon.
    og_image = ""
    if preview:
        try:
            # BUGFIX: preview URLs are HTML-escaped in Reddit's JSON;
            # the old replace("&", "&") was a no-op.
            og_image = preview[0]["source"]["url"].replace("&amp;", "&")
        except (KeyError, IndexError):
            pass
    if not og_image and thumbnail_url and thumbnail_url.startswith("http"):
        og_image = thumbnail_url
    if not og_image:
        og_image = _get_favicon_url(url)

    # Collect up to 10 top-level comments, skipping deleted/removed ones.
    comments: list[str] = []
    if len(data) >= 2:
        for child in data[1].get("data", {}).get("children", [])[:10]:
            body = child.get("data", {}).get("body", "").strip()
            if body and body != "[deleted]" and body != "[removed]":
                comments.append(body)

    parts = [f"TITLE: {title}", f"SUBREDDIT: {sub}", f"AUTHOR: u/{author}", f"SCORE: {score}"]
    if selftext:
        parts.append(f"\nPOST BODY:\n{selftext}")
    if url_field and url_field != url:
        parts.append(f"\nLINKED URL: {url_field}")
    if comments:
        parts.append("\nTOP COMMENTS:\n" + "\n---\n".join(comments))

    # Timezone-aware replacement for the deprecated utcfromtimestamp();
    # same YYYY-MM-DD output for the same epoch value.
    post_date = (
        _dt.datetime.fromtimestamp(post.get("created_utc", 0), tz=_dt.timezone.utc).strftime("%Y-%m-%d")
        if post.get("created_utc") else None
    )
    text = "\n".join(parts)
    return title, text, og_image, author, post_date
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# ---------------------------------------------------------------------------
|
|
285
|
+
# Strategy 2 – Medium via open proxy chain
|
|
286
|
+
# ---------------------------------------------------------------------------
|
|
287
|
+
|
|
288
|
+
# Proxies tried left-to-right by _fetch_medium(); {url} is replaced with
# the full article URL (scheme included).
_MEDIUM_PROXIES = [
    "https://scribe.rip/{url}",    # scribe mirrors the article cleanly
    "https://freedium.cfd/{url}",  # alternative (sometimes down)
]
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _parse_proxy_page(html: str, orig_url: str) -> tuple[str, str, str, str, str | None]:
    """
    Extract (title, text, thumbnail, author, date) from a Medium proxy HTML page.

    *orig_url* is the original Medium URL, used for thumbnail/favicon
    resolution. Author defaults to "" and date to None when not found.
    (Annotation fixed: this function returns a 5-tuple, not a 3-tuple.)
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, "lxml")
    thumbnail = _resolve_thumbnail(soup, orig_url)

    # Title: OG/Twitter meta first, then the <title> element.
    title = ""
    for prop in ("og:title", "twitter:title"):
        tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop})
        if tag and tag.get("content"):
            title = tag["content"].strip()
            break
    if not title and soup.title:
        title = soup.title.get_text(strip=True)

    # Remove boilerplate
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()

    content_el = (soup.select_one(".main-content") or
                  soup.select_one("article") or
                  soup.select_one('[role="main"]') or
                  soup.find("body"))
    text = (content_el or soup).get_text(separator="\n", strip=True)
    # Drop very short lines (menus, buttons, stray labels).
    lines = [ln.strip() for ln in text.splitlines() if len(ln.strip()) > 15]

    # Extract author from Medium proxy HTML (meta tags first)
    proxy_author = ""
    for _pa in [{"property": "article:author"}, {"name": "author"}, {"name": "twitter:creator"}]:
        _pt = soup.find("meta", attrs=_pa)
        if _pt and _pt.get("content") and _pt["content"].strip().lower() not in ("medium", ""):
            proxy_author = _pt["content"].strip()
            break
    if not proxy_author:
        # Byline elements as a fallback source for the author name.
        for _sel in ['a[rel="author"]', ".author", ".byline"]:
            _el = soup.select_one(_sel)
            if _el:
                proxy_author = _el.get_text(strip=True)
                break

    # Extract publish date (meta first)
    proxy_date = None
    _pdt = soup.find("meta", attrs={"property": "article:published_time"})
    if _pdt and _pdt.get("content"):
        _pm = re.search(r'\d{4}-\d{2}-\d{2}', _pdt["content"])
        if _pm:
            proxy_date = _pm.group(0)

    # Scribe.rip byline fallback: a <p> like "AuthorNameon YYYY-MM-DD" or "Author · YYYY-MM-DD"
    # (scribe.rip sometimes concatenates author+date without spacing)
    if not proxy_author or not proxy_date:
        for _bp in soup.find_all("p"):
            _bt = _bp.get_text(strip=True)
            # Pattern: <name>on <date> or <name> on <date>
            _bm = re.match(r'^(.{2,60}?)\s*on\s+(\d{4}-\d{2}-\d{2})\b', _bt, re.IGNORECASE)
            if not _bm:
                _bm = re.match(r'^(.{2,60}?)\s*[·•|]\s*(\d{4}-\d{2}-\d{2})\b', _bt)
            if _bm:
                if not proxy_author:
                    proxy_author = _bm.group(1).strip().rstrip("·•|").strip()
                if not proxy_date:
                    proxy_date = _bm.group(2)
                break

    return title, "\n".join(lines), thumbnail, proxy_author, proxy_date
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _fetch_medium(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    Try each Medium proxy in order; return first successful result.

    Returns (title, text, thumbnail, author, date) or None when every
    proxy fails or yields fewer than ~200 characters of article text.
    (Annotation fixed: this function returns a 5-tuple, not a 3-tuple.)
    """
    import requests

    for proxy_tpl in _MEDIUM_PROXIES:
        proxy_url = proxy_tpl.format(url=url)
        try:
            print(f" [medium] Trying {proxy_url[:55]}...")
            r = requests.get(proxy_url, headers=_HEADERS,
                             timeout=timeout, allow_redirects=True)
            r.raise_for_status()
            title, text, thumbnail, auth, pd = _parse_proxy_page(r.text, url)
            if len(text) > 200:
                return title, text, thumbnail, auth, pd
            print(f" [medium] {proxy_url[:40]} returned too little text")
        except Exception as e:
            # Best-effort: a failing proxy falls through to the next one.
            print(f" [medium] {proxy_url[:40]} failed: {e}")

    return None
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
# ---------------------------------------------------------------------------
|
|
386
|
+
# Strategy – Wayback Machine (emergency fallback for blocked/paywalled URLs)
|
|
387
|
+
# ---------------------------------------------------------------------------
|
|
388
|
+
|
|
389
|
+
def _fetch_wayback(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    Look up the most recent Wayback Machine snapshot for a URL and fetch it.

    Used as a last resort when all live fetch strategies are blocked (403/429).
    Returns (title, text, image, author, date) or None when no snapshot is
    available or the extracted text is under ~100 characters.
    (Annotation fixed: this function returns a 5-tuple, not a 3-tuple.)
    """
    import requests
    import trafilatura

    # Step 1: ask archive.org whether a snapshot exists.
    check = f"https://archive.org/wayback/available?url={url}"
    try:
        resp = requests.get(check, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        snapshot = data.get("archived_snapshots", {}).get("closest", {})
        if not snapshot.get("available"):
            return None
        wb_url = snapshot["url"]
        print(f" [wayback] Snapshot found: {wb_url[:70]}")
    except Exception as e:
        print(f" [wayback] Availability check failed: {e}")
        return None

    # Step 2: fetch the snapshot HTML.
    try:
        r = requests.get(wb_url, headers=_HEADERS, timeout=timeout, allow_redirects=True)
        r.raise_for_status()
        html = r.text
    except Exception as e:
        print(f" [wayback] Fetch failed: {e}")
        return None

    # Use trafilatura for clean extraction from cached HTML
    try:
        text = trafilatura.extract(html, url=url,
                                   include_comments=True, favor_recall=True) or ""
        meta = trafilatura.extract_metadata(html, default_url=url)
        title = (meta.title if meta else "") or ""
        og_image = (meta.image if meta else "") or ""
        wb_a = (meta.author if meta else "") or ""
        wb_d = (meta.date if meta else "") or ""
    except Exception:
        text = ""; title = ""; og_image = ""; wb_a = ""; wb_d = ""

    # Fall back to OG-image scraping / favicon when metadata had no image.
    if not og_image:
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "lxml")
            og_image = _resolve_thumbnail(soup, url)
        except Exception:
            og_image = _get_favicon_url(url)

    # Normalise the date to YYYY-MM-DD when present.
    wb_date = None
    if wb_d:
        _wm = re.search(r'\d{4}-\d{2}-\d{2}', str(wb_d))
        if _wm:
            wb_date = _wm.group(0)

    return (title, text, og_image, wb_a, wb_date) if len(text) > 100 else None
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
# ---------------------------------------------------------------------------
|
|
449
|
+
# Strategy 3 – newspaper4k
|
|
450
|
+
# ---------------------------------------------------------------------------
|
|
451
|
+
|
|
452
|
+
def _fetch_newspaper(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    newspaper4k (maintained fork of newspaper3k) — excellent for news articles,
    blog posts, and most standard editorial pages.

    Returns (title, text, image, author, date) or None when newspaper is
    not installed or extracted fewer than ~200 characters of article text.
    (Annotation fixed: this function returns a 5-tuple, not a 3-tuple.)
    """
    try:
        from newspaper import Article, Config
    except ImportError:
        return None

    cfg = Config()
    cfg.browser_user_agent = _HEADERS["User-Agent"]
    cfg.request_timeout = timeout
    cfg.fetch_images = False       # thumbnails are resolved separately below
    cfg.memoize_articles = False

    article = Article(url, config=cfg)
    article.download()
    article.parse()

    title = article.title or ""
    text = article.text or ""
    top_image = article.top_image or ""

    if not top_image:
        # Try to get it from meta via soup
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article.html or "", "lxml")
            top_image = _resolve_thumbnail(soup, url)
        except Exception:
            top_image = _get_favicon_url(url)

    # Extract author and publish date from newspaper4k
    np_author = ""
    if hasattr(article, 'authors') and article.authors:
        np_author = article.authors[0]
    np_date = None
    if hasattr(article, 'publish_date') and article.publish_date:
        try:
            _npd = article.publish_date
            if hasattr(_npd, 'strftime'):
                np_date = _npd.strftime("%Y-%m-%d")
            else:
                # Non-datetime value: pull an ISO date out of its string form.
                _npm = re.search(r'\d{4}-\d{2}-\d{2}', str(_npd))
                if _npm:
                    np_date = _npm.group(0)
        except Exception:
            pass

    return (title, text, top_image, np_author, np_date) if len(text) > 200 else None
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
# ---------------------------------------------------------------------------
|
|
506
|
+
# Strategy 4 – trafilatura
|
|
507
|
+
# ---------------------------------------------------------------------------
|
|
508
|
+
|
|
509
|
+
def _fetch_trafilatura(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    trafilatura — state-of-the-art main-content extractor.
    Handles forums, comment threads, Hacker News, Stack Overflow, etc.

    Returns (title, text, image, author, date) or None when trafilatura
    is not installed or extracted fewer than ~200 characters.
    (Fixes: return annotation corrected to the 5-tuple actually returned;
    unused `use_config` import removed.)
    """
    try:
        import trafilatura
    except ImportError:
        return None

    # Download with our headers
    import requests
    r = requests.get(url, headers=_HEADERS, timeout=timeout, allow_redirects=True)
    r.raise_for_status()
    html = r.text

    # Extract with trafilatura
    extracted = trafilatura.extract(
        html,
        url=url,
        include_comments=True,
        include_tables=True,
        no_fallback=False,
        favor_recall=True,  # better for forums/threads
    )

    if not extracted or len(extracted) < 200:
        return None

    # Get metadata (title + image + author/date) via trafilatura's metadata extractor
    meta = trafilatura.extract_metadata(html, default_url=url)
    title = (meta.title if meta else "") or ""
    og_image = (meta.image if meta else "") or ""
    traf_a = (meta.author if meta else "") or ""
    traf_d = (meta.date if meta else "") or ""
    if not og_image:
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "lxml")
            og_image = _resolve_thumbnail(soup, url)
        except Exception:
            og_image = _get_favicon_url(url)

    # Normalise the date to YYYY-MM-DD when present.
    traf_date = None
    if traf_d:
        _tm = re.search(r'\d{4}-\d{2}-\d{2}', str(traf_d))
        if _tm:
            traf_date = _tm.group(0)

    return title, extracted, og_image, traf_a, traf_date
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
# ---------------------------------------------------------------------------
|
|
563
|
+
# Strategy 5 – BeautifulSoup (original reliable fallback)
|
|
564
|
+
# ---------------------------------------------------------------------------
|
|
565
|
+
|
|
566
|
+
def _fetch_beautifulsoup(url: str, timeout: int) -> tuple[str, str, str, str, str | None]:
    """
    Pure BeautifulSoup fallback — always produces *something*.

    Returns (title, text, thumbnail, author, date). Author may be "" and
    date may be None. Network/HTTP errors propagate to the caller.
    (Annotation fixed: this function returns a 5-tuple, not a 3-tuple.)
    """
    import requests

    r = requests.get(url, headers=_HEADERS, timeout=timeout, allow_redirects=True)
    r.raise_for_status()
    html = r.text

    # Prefer the fast lxml parser; fall back to the stdlib html.parser.
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

    thumbnail = _resolve_thumbnail(soup, url)

    # Title: og:title → <title> → first <h1>.
    title = ""
    og = soup.find("meta", property="og:title")
    if og and og.get("content"):
        title = og["content"].strip()
    elif soup.title:
        title = soup.title.get_text(strip=True)
    elif soup.find("h1"):
        title = soup.find("h1").get_text(strip=True)

    # Strip non-content elements before text extraction.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside",
                     "form", "button", "noscript", "iframe", "svg"]):
        tag.decompose()

    # Try known content containers first; fall back to the whole <body>.
    text = ""
    for sel in ["article", "main", '[role="main"]', ".post-content",
                ".article-body", ".entry-content", ".content", "#content", ".post", "#main"]:
        el = soup.select_one(sel)
        if el:
            text = el.get_text(separator="\n", strip=True)
            break
    if not text:
        body = soup.find("body")
        text = (body or soup).get_text(separator="\n", strip=True)

    # Extract author and date from meta/JSON-LD
    bs_author = ""
    for _ba in [{"property": "article:author"}, {"name": "author"}, {"name": "dc.creator"}]:
        _bm = soup.find("meta", attrs=_ba)
        if _bm and _bm.get("content"):
            bs_author = _bm["content"].strip()
            break
    bs_date = None
    for _ba in [{"property": "article:published_time"}, {"name": "datePublished"},
                {"itemprop": "datePublished"}]:
        _bm = soup.find("meta", attrs=_ba)
        if _bm and _bm.get("content"):
            _bdm = re.search(r'\d{4}-\d{2}-\d{2}', _bm["content"])
            if _bdm:
                bs_date = _bdm.group(0)
                break
    if not bs_date:
        # <time datetime="YYYY-MM-DD..."> element fallback.
        for _bt in soup.find_all("time", attrs={"datetime": True}):
            if re.match(r'\d{4}-\d{2}-\d{2}', _bt["datetime"]):
                bs_date = _bt["datetime"][:10]
                break

    return title, text, thumbnail, bs_author, bs_date
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
# ---------------------------------------------------------------------------
|
|
633
|
+
# Public: fetch_page_text
|
|
634
|
+
# ---------------------------------------------------------------------------
|
|
635
|
+
|
|
636
|
+
def fetch_page_text(url: str, timeout: int = 20) -> tuple[str, str, str, str, str | None]:
    """
    Fetch a web page with a multi-strategy pipeline and return
    (title, text, thumbnail, author, post_date).

    Strategy order:
        Reddit JSON API → Medium proxies (scribe.rip/freedium) →
        newspaper4k → trafilatura → Wayback Machine → BeautifulSoup

    The final BeautifulSoup step is not wrapped in try/except, so a fully
    unreachable page raises to the caller.
    """

    def _clean(text: str) -> str:
        # Drop very short lines and cap content at 5000 words.
        lines = [ln.strip() for ln in text.splitlines() if len(ln.strip()) > 10]
        words = " ".join(lines).split()
        if len(words) > 5000:
            return " ".join(words[:5000]) + "\n[... content truncated ...]"
        return "\n".join(lines)

    # 1 — Reddit
    if _is_reddit(url):
        print(" [fetch] Reddit JSON API")
        try:
            result = _fetch_reddit(url, timeout)
            if result and result[1].strip():
                t, txt, thumb, auth, pd = result
                return t, _clean(txt), thumb, auth, pd
        except Exception as e:
            print(f" [fetch] Reddit failed: {e}")

    # 2 — Medium (multi-proxy)
    if _is_medium(url):
        print(" [fetch] Medium proxies (scribe.rip → freedium.cfd)")
        try:
            result = _fetch_medium(url, timeout)
            if result and result[1].strip():
                t, txt, thumb, auth, pd = result
                return t, _clean(txt), thumb, auth, pd
        except Exception as e:
            print(f" [fetch] All Medium proxies failed: {e}")

    # Remembers a 401/403-style failure so step 5 knows the site blocks us.
    blocked_error: str = ""

    # 3 — newspaper4k (best for standard articles)
    print(" [fetch] newspaper4k")
    try:
        result = _fetch_newspaper(url, timeout)
        if result and result[1].strip():
            t, txt, thumb, auth, pd = result
            return t, _clean(txt), thumb, auth, pd
    except Exception as e:
        print(f" [fetch] newspaper4k failed: {e}")
        if "403" in str(e) or "401" in str(e) or "Forbidden" in str(e):
            blocked_error = str(e)

    # 4 — trafilatura (best for forums / comment-heavy pages)
    print(" [fetch] trafilatura")
    try:
        result = _fetch_trafilatura(url, timeout)
        if result and result[1].strip():
            t, txt, thumb, auth, pd = result
            return t, _clean(txt), thumb, auth, pd
    except Exception as e:
        print(f" [fetch] trafilatura failed: {e}")
        if "403" in str(e) or "401" in str(e) or "Forbidden" in str(e):
            blocked_error = str(e)

    # 5 — Wayback Machine (when site blocks scrapers)
    # Medium URLs always get a Wayback attempt: the proxies above may
    # have failed without producing a 403.
    if blocked_error or _is_medium(url):
        print(" [fetch] Wayback Machine (site appears blocked)")
        try:
            result = _fetch_wayback(url, timeout)
            if result and result[1].strip():
                t, txt, thumb, auth, pd = result
                return t, _clean(txt), thumb, auth, pd
        except Exception as e:
            print(f" [fetch] Wayback Machine failed: {e}")

    # 6 — BeautifulSoup raw fallback
    print(" [fetch] BeautifulSoup fallback")
    t, txt, thumb, auth, pd = _fetch_beautifulsoup(url, timeout)
    return t, _clean(txt), thumb, auth, pd
|
|
716
|
+
|
|
717
|
+
|
|
718
|
+
# ---------------------------------------------------------------------------
|
|
719
|
+
# Core analyzer (public API)
|
|
720
|
+
# ---------------------------------------------------------------------------
|
|
721
|
+
|
|
722
|
+
def analyze_webpage(url: str) -> dict:
    """
    Fetch and analyze a web page via ModelRouter text models.

    Returns:
        dict: raw_output, page_title, thumbnail, author, post_date, error
        (error is None on success, a human-readable message otherwise).
    """
    print(" 🌐 Fetching web page...")

    try:
        page_title, content, thumbnail, page_author, page_date = fetch_page_text(url)
        summary_title = f"'{page_title[:70]}'" if page_title else "(no title)"
        print(f" ✓ Fetched: {summary_title}")
        # Log which thumbnail source ended up being used.
        if thumbnail.startswith("data:"):
            print(" 🖼️ Using SVG placeholder (no image found)")
        elif "google.com/s2/favicons" in thumbnail:
            print(f" 🖼️ Using platform favicon: {_netloc(url)}")
        else:
            print(f" 🖼️ Thumbnail: {thumbnail[:80]}")
    except Exception as e:
        return {"raw_output": "", "page_title": "", "thumbnail": "",
                "author": "", "post_date": None,
                "error": f"Failed to fetch page: {e}"}

    if not content.strip():
        return {"raw_output": "", "page_title": page_title, "thumbnail": thumbnail,
                "author": page_author, "post_date": page_date,
                "error": "No readable text content found on the page"}

    # Content is capped at 8000 chars to stay within model context limits.
    prompt = _WEB_PROMPT_TPL.format(
        url=url,
        page_title=page_title or "Unknown",
        content=content[:8000],
    )

    print(" 🤖 Analyzing page content with AI...")

    try:
        from core.model_router import get_router
        raw_output = get_router().generate_text(prompt)
        print(" ✓ Web page analysis complete")
        return {"raw_output": raw_output, "page_title": page_title,
                "thumbnail": thumbnail, "author": page_author,
                "post_date": page_date, "error": None}
    except Exception as e:
        return {"raw_output": "", "page_title": page_title, "thumbnail": thumbnail,
                "author": page_author, "post_date": page_date,
                "error": f"AI analysis failed: {e}"}
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
# ---------------------------------------------------------------------------
|
|
773
|
+
# CLI
|
|
774
|
+
# ---------------------------------------------------------------------------
|
|
775
|
+
|
|
776
|
+
if __name__ == "__main__":
    # CLI entry point: URL from argv[1], or prompt interactively.
    import sys  # NOTE(review): redundant — sys is already imported at module top
    url = sys.argv[1] if len(sys.argv) > 1 else input("Web page URL: ").strip()
    if url:
        result = analyze_webpage(url)
        if result["error"]:
            print(f"\n✗ Error: {result['error']}")
        else:
            print("\n" + "=" * 60)
            print(f"[thumbnail] {result['thumbnail'][:100]}")
            print("=" * 60)
            print(result["raw_output"])
|
|
788
|
+
|
|
789
|
+
|