superbrain-server 1.0.2-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/bin/superbrain.js +196 -0
  2. package/package.json +23 -0
  3. package/payload/.dockerignore +45 -0
  4. package/payload/.env.example +58 -0
  5. package/payload/Dockerfile +73 -0
  6. package/payload/analyzers/__init__.py +0 -0
  7. package/payload/analyzers/audio_transcribe.py +225 -0
  8. package/payload/analyzers/caption.py +244 -0
  9. package/payload/analyzers/music_identifier.py +346 -0
  10. package/payload/analyzers/text_analyzer.py +117 -0
  11. package/payload/analyzers/visual_analyze.py +218 -0
  12. package/payload/analyzers/webpage_analyzer.py +789 -0
  13. package/payload/analyzers/youtube_analyzer.py +320 -0
  14. package/payload/api.py +1676 -0
  15. package/payload/config/.api_keys.example +22 -0
  16. package/payload/config/model_rankings.json +492 -0
  17. package/payload/config/openrouter_free_models.json +1364 -0
  18. package/payload/config/whisper_model.txt +1 -0
  19. package/payload/config_settings.py +185 -0
  20. package/payload/core/__init__.py +0 -0
  21. package/payload/core/category_manager.py +219 -0
  22. package/payload/core/database.py +811 -0
  23. package/payload/core/link_checker.py +300 -0
  24. package/payload/core/model_router.py +1253 -0
  25. package/payload/docker-compose.yml +120 -0
  26. package/payload/instagram/__init__.py +0 -0
  27. package/payload/instagram/instagram_downloader.py +253 -0
  28. package/payload/instagram/instagram_login.py +190 -0
  29. package/payload/main.py +912 -0
  30. package/payload/requirements.txt +39 -0
  31. package/payload/reset.py +311 -0
  32. package/payload/start-docker-prod.sh +125 -0
  33. package/payload/start-docker.sh +56 -0
  34. package/payload/start.py +1302 -0
  35. package/payload/static/favicon.ico +0 -0
  36. package/payload/stop-docker.sh +16 -0
  37. package/payload/utils/__init__.py +0 -0
  38. package/payload/utils/db_stats.py +108 -0
  39. package/payload/utils/manage_token.py +91 -0
@@ -0,0 +1,789 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Web Page Analyzer for SuperBrain
4
+ ==================================
5
+ Multi-strategy fetcher with platform-aware content extraction.
6
+
7
+ Fetch priority chain:
8
+ 1. Reddit → official .json API (no scraping needed)
9
+ 2. Medium → scribe.rip → freedium.cfd proxy chain
10
+ 3. newspaper4k → fast article parser, works on most news/blog sites
11
+ 4. trafilatura → best-in-class boilerplate remover, handles forums
12
+ 5. Wayback Machine → archive.org snapshot for blocked/paywalled pages
13
+ 6. BeautifulSoup → raw HTML last-resort fallback
14
+
15
+ Thumbnail priority:
16
+ og:image / twitter:image → article first <img> → platform favicon URL → SVG card
17
+ """
18
+
19
+ import sys
20
+ import re
21
+ import base64
22
+ import json
23
+ import time
24
+ from pathlib import Path
25
+ from urllib.parse import urlparse, urljoin
26
+
27
+ # Ensure backend root is in sys.path (needed when run as a subprocess)
28
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Prompt template
33
+ # ---------------------------------------------------------------------------
34
+
35
# Prompt handed to the text model by analyze_webpage().  {url}, {page_title}
# and {content} are filled via str.format.  The emoji section headers are
# parsed by downstream consumers — keep them byte-exact.
_WEB_PROMPT_TPL = """Analyze the following web page content and generate a structured report.

URL: {url}
Page Title: {page_title}

CONTENT:
{content}

---

Generate the report in this EXACT format (use these exact emoji headers):

📌 TITLE:
[Clear descriptive title for this content]

📝 SUMMARY:
[3-5 sentence summary covering: main topic, key information, important facts,
any products/places/tools mentioned, and the overall purpose of the page]

🏷️ TAGS:
[8-12 relevant hashtags/keywords separated by spaces, e.g. #python #tutorial #beginners]

🎵 MUSIC:
[N/A — web page]

📂 CATEGORY:
[Choose exactly ONE from: product, places, recipe, software, book, tv shows, workout, film, event, other]

Be specific and factual. Extract real names, numbers, and details from the content."""
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Browser-like headers (shared across all strategies)
68
+ # ---------------------------------------------------------------------------
69
+
70
# Desktop-Chrome-like request headers shared by every HTTP strategy below,
# so fetches look like an ordinary browser rather than a script.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",  # Do Not Track
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # Sec-Fetch-* mimic a top-level browser navigation
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
}
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Platform detection helpers
90
+ # ---------------------------------------------------------------------------
91
+
92
+ def _netloc(url: str) -> str:
93
+ return urlparse(url).netloc.lower()
94
+
95
+
96
+ def _is_reddit(url: str) -> bool:
97
+ nl = _netloc(url)
98
+ return "reddit.com" in nl or "redd.it" in nl
99
+
100
+
101
# Known custom-domain Medium publications (add more as needed).
# These host Medium content on their own domains, so they cannot be detected
# from the netloc alone — _is_medium() checks membership in this set.
_MEDIUM_CUSTOM_DOMAINS = {
    "towardsdatascience.com", "bettermarketing.pub", "uxdesign.cc",
    "itnext.io", "betterprogramming.pub", "entrepreneurshandbook.co",
    "theascent.pub", "personal-growth.org", "onezero.medium.com",
    "writingcooperative.com",
}
108
+
109
+
110
def _is_medium(url: str) -> bool:
    """
    True for medium.com (any subdomain, e.g. safeti.medium.com) and known
    custom-domain Medium publications.

    Fix: the previous substring test (`"medium.com" in netloc`) also matched
    spoofed hosts such as "mymedium.com" or "medium.com.evil.io"; match the
    host exactly or by subdomain suffix instead.
    """
    nl = urlparse(url).netloc.lower()
    if nl == "medium.com" or nl.endswith(".medium.com"):
        return True
    return nl in _MEDIUM_CUSTOM_DOMAINS
116
+
117
+
118
+ def _is_hacker_news(url: str) -> bool:
119
+ return "news.ycombinator.com" in _netloc(url)
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Thumbnail helpers
124
+ # ---------------------------------------------------------------------------
125
+
126
+ def _abs_url(src: str, page_url: str) -> str:
127
+ """Convert relative/protocol-relative URL to absolute."""
128
+ if not src:
129
+ return ""
130
+ if src.startswith("//"):
131
+ return "https:" + src
132
+ if src.startswith("/"):
133
+ p = urlparse(page_url)
134
+ return f"{p.scheme}://{p.netloc}{src}"
135
+ if not src.startswith("http"):
136
+ return urljoin(page_url, src)
137
+ return src
138
+
139
+
140
+ def _get_favicon_url(url: str) -> str:
141
+ """
142
+ Return a Google-served favicon URL for the domain.
143
+ sz=128 returns up to 128x128 PNG — always resolves (falls back to globe icon).
144
+ """
145
+ p = urlparse(url)
146
+ domain = f"{p.scheme}://{p.netloc}"
147
+ return f"https://www.google.com/s2/favicons?sz=128&domain_url={domain}"
148
+
149
+
150
# Palette for generated placeholder cards; _make_svg_placeholder() picks one
# deterministically from the domain's character sum.
# NOTE(review): despite the name, these are saturated brand-style colours,
# not greys.
_GREY_SVG_COLORS = [
    "#4F46E5", "#0891B2", "#059669", "#D97706",
    "#DC2626", "#7C3AED", "#DB2777", "#0369A1",
]
154
+
155
+
156
def _make_svg_placeholder(domain: str) -> str:
    """
    Build a 480x270 data-URI SVG card showing *domain* on a colour chosen
    deterministically from the domain's character sum.

    Fix: the label is now XML-escaped so characters like '&' or '<' cannot
    produce a malformed SVG document.
    """
    from xml.sax.saxutils import escape  # local: only needed here

    colour = _GREY_SVG_COLORS[sum(ord(c) for c in domain) % len(_GREY_SVG_COLORS)]
    label = escape(domain[:30])
    svg = (
        f'<svg xmlns="http://www.w3.org/2000/svg" width="480" height="270">'
        f'<rect width="480" height="270" fill="{colour}"/>'
        f'<text x="240" y="135" font-family="system-ui,Arial,sans-serif" '
        f'font-size="22" font-weight="bold" fill="rgba(255,255,255,0.9)" '
        f'text-anchor="middle" dominant-baseline="middle">{label}</text>'
        f'</svg>'
    )
    return "data:image/svg+xml;base64," + base64.b64encode(svg.encode()).decode()
168
+
169
+
170
def _extract_og_image(soup, page_url: str) -> str:
    """
    Best content image from the parsed page *soup*:
    og:image → twitter:image → first sizable <img> in the main content.

    Returns "" when nothing suitable is found.

    Fix: the width filter previously did `int(str(w).rstrip("px") or 0)`,
    which raised ValueError for non-numeric widths such as "auto" or "100%";
    non-numeric widths are now treated as unknown and the image is kept.
    """
    for prop in ("og:image", "og:image:secure_url"):
        tag = soup.find("meta", property=prop)
        if tag and tag.get("content"):
            return _abs_url(tag["content"].strip(), page_url)
    for name in ("twitter:image", "twitter:image:src"):
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return _abs_url(tag["content"].strip(), page_url)
    # First sizable img inside known content containers
    for sel in ("article", "main", '[role="main"]', ".post-content", ".entry-content", ".content"):
        el = soup.select_one(sel)
        if el:
            for img in el.find_all("img", src=True):
                src = _abs_url(img["src"].strip(), page_url)
                # Skip tracking pixels / tiny images (numeric widths only)
                w = str(img.get("width", "")).strip().rstrip("px").strip()
                if w.isdigit() and int(w) < 50:
                    continue
                if src.startswith("http"):
                    return src
    return ""
194
+
195
+
196
def _resolve_thumbnail(soup, page_url: str) -> str:
    """
    Best-effort thumbnail for the page: the OG/twitter/article image when
    one exists, otherwise the domain's favicon (recognisable for Medium,
    Reddit, GitHub, etc.).
    """
    if soup:
        image = _extract_og_image(soup, page_url)
        if image:
            return image
    return _get_favicon_url(page_url)
203
+
204
+
205
+ # ---------------------------------------------------------------------------
206
+ # Strategy 1 – Reddit JSON API
207
+ # ---------------------------------------------------------------------------
208
+
209
def _fetch_reddit(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    Fetch a Reddit post plus top-level comments via the public .json API.
    Works on any reddit.com/r/.../comments/... URL — no HTML scraping needed.

    Returns:
        (title, text, thumbnail, author, post_date) or None when the JSON
        payload contains no post.  post_date is "YYYY-MM-DD" or None.

    Fixes: return annotation previously claimed a 3-tuple; replaced the
    deprecated datetime.utcfromtimestamp() with a timezone-aware call.
    """
    import datetime as _dt
    import requests

    # Normalise: strip query/fragment, ensure .json suffix
    p = urlparse(url)
    clean = f"{p.scheme}://{p.netloc}{p.path.rstrip('/')}/.json"

    r = requests.get(
        clean,
        headers={**_HEADERS, "Accept": "application/json"},
        timeout=timeout,
        allow_redirects=True,
    )
    r.raise_for_status()
    data = r.json()

    # Reddit returns a list of two listings: [post_listing, comments_listing]
    if not (isinstance(data, list) and len(data) >= 1):
        return None

    post_listing = data[0].get("data", {}).get("children", [])
    if not post_listing:
        return None

    post = post_listing[0].get("data", {})
    title = post.get("title", "")
    selftext = post.get("selftext", "")        # markdown body of text posts
    url_field = post.get("url", "")            # link posts point here
    author = post.get("author", "")
    sub = post.get("subreddit_name_prefixed", "")
    score = post.get("score", 0)
    thumbnail_url = post.get("thumbnail", "")  # Reddit thumbnail
    preview = post.get("preview", {}).get("images", [])

    # Better image: use preview image > thumbnail field
    og_image = ""
    if preview:
        try:
            og_image = preview[0]["source"]["url"].replace("&amp;", "&")
        except (KeyError, IndexError):
            pass
    if not og_image and thumbnail_url and thumbnail_url.startswith("http"):
        og_image = thumbnail_url
    if not og_image:
        og_image = _get_favicon_url(url)

    # Collect up to 10 top-level comments, skipping deleted/removed ones
    comments: list[str] = []
    if len(data) >= 2:
        for child in data[1].get("data", {}).get("children", [])[:10]:
            body = child.get("data", {}).get("body", "").strip()
            if body and body != "[deleted]" and body != "[removed]":
                comments.append(body)

    parts = [f"TITLE: {title}", f"SUBREDDIT: {sub}", f"AUTHOR: u/{author}", f"SCORE: {score}"]
    if selftext:
        parts.append(f"\nPOST BODY:\n{selftext}")
    if url_field and url_field != url:
        parts.append(f"\nLINKED URL: {url_field}")
    if comments:
        parts.append("\nTOP COMMENTS:\n" + "\n---\n".join(comments))

    # Timezone-aware replacement for the deprecated utcfromtimestamp()
    post_date = (
        _dt.datetime.fromtimestamp(
            post.get("created_utc", 0), tz=_dt.timezone.utc
        ).strftime("%Y-%m-%d")
        if post.get("created_utc") else None
    )
    text = "\n".join(parts)
    return title, text, og_image, author, post_date
282
+
283
+
284
+ # ---------------------------------------------------------------------------
285
+ # Strategy 2 – Medium via open proxy chain
286
+ # ---------------------------------------------------------------------------
287
+
288
# Proxies tried left-to-right by _fetch_medium(); {url} is replaced with the
# full article URL.  The first proxy returning >200 chars of text wins.
_MEDIUM_PROXIES = [
    "https://scribe.rip/{url}",   # scribe mirrors the article cleanly
    "https://freedium.cfd/{url}", # alternative (sometimes down)
]
293
+
294
+
295
def _parse_proxy_page(html: str, orig_url: str) -> tuple[str, str, str, str, str | None]:
    """
    Extract (title, text, thumbnail, author, date) from a Medium proxy
    (scribe.rip / freedium.cfd) HTML page.

    Fix: return annotation previously claimed a 3-tuple while the function
    returns five values.  date is "YYYY-MM-DD" or None.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, "lxml")
    thumbnail = _resolve_thumbnail(soup, orig_url)

    # Title: og/twitter meta first, then <title>
    title = ""
    for prop in ("og:title", "twitter:title"):
        tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop})
        if tag and tag.get("content"):
            title = tag["content"].strip()
            break
    if not title and soup.title:
        title = soup.title.get_text(strip=True)

    # Remove boilerplate
    for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
        tag.decompose()

    content_el = (soup.select_one(".main-content") or
                  soup.select_one("article") or
                  soup.select_one('[role="main"]') or
                  soup.find("body"))
    text = (content_el or soup).get_text(separator="\n", strip=True)
    # Drop very short lines (nav crumbs, labels)
    lines = [ln.strip() for ln in text.splitlines() if len(ln.strip()) > 15]

    # Extract author from Medium proxy HTML (meta tags first)
    proxy_author = ""
    for _pa in [{"property": "article:author"}, {"name": "author"}, {"name": "twitter:creator"}]:
        _pt = soup.find("meta", attrs=_pa)
        if _pt and _pt.get("content") and _pt["content"].strip().lower() not in ("medium", ""):
            proxy_author = _pt["content"].strip()
            break
    if not proxy_author:
        for _sel in ['a[rel="author"]', ".author", ".byline"]:
            _el = soup.select_one(_sel)
            if _el:
                proxy_author = _el.get_text(strip=True)
                break

    # Extract publish date (meta first)
    proxy_date = None
    _pdt = soup.find("meta", attrs={"property": "article:published_time"})
    if _pdt and _pdt.get("content"):
        _pm = re.search(r'\d{4}-\d{2}-\d{2}', _pdt["content"])
        if _pm:
            proxy_date = _pm.group(0)

    # Scribe.rip byline fallback: a <p> like "AuthorNameon YYYY-MM-DD" or
    # "Author · YYYY-MM-DD" (scribe.rip sometimes concatenates author+date
    # without spacing)
    if not proxy_author or not proxy_date:
        for _bp in soup.find_all("p"):
            _bt = _bp.get_text(strip=True)
            # Pattern: <name>on <date> or <name> on <date>
            _bm = re.match(r'^(.{2,60}?)\s*on\s+(\d{4}-\d{2}-\d{2})\b', _bt, re.IGNORECASE)
            if not _bm:
                _bm = re.match(r'^(.{2,60}?)\s*[·•|]\s*(\d{4}-\d{2}-\d{2})\b', _bt)
            if _bm:
                if not proxy_author:
                    proxy_author = _bm.group(1).strip().rstrip("·•|").strip()
                if not proxy_date:
                    proxy_date = _bm.group(2)
                break

    return title, "\n".join(lines), thumbnail, proxy_author, proxy_date
360
+
361
+
362
def _fetch_medium(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    Try each Medium proxy in _MEDIUM_PROXIES in order; return the first
    (title, text, thumbnail, author, date) with enough body text, or None
    when every proxy fails or yields too little.

    Fix: return annotation previously claimed a 3-tuple.
    """
    import requests

    for proxy_tpl in _MEDIUM_PROXIES:
        proxy_url = proxy_tpl.format(url=url)
        try:
            print(f" [medium] Trying {proxy_url[:55]}...")
            r = requests.get(proxy_url, headers=_HEADERS,
                             timeout=timeout, allow_redirects=True)
            r.raise_for_status()
            title, text, thumbnail, auth, pd = _parse_proxy_page(r.text, url)
            # Require a minimum amount of text so an error page doesn't win
            if len(text) > 200:
                return title, text, thumbnail, auth, pd
            print(f" [medium] {proxy_url[:40]} returned too little text")
        except Exception as e:
            print(f" [medium] {proxy_url[:40]} failed: {e}")

    return None
383
+
384
+
385
+ # ---------------------------------------------------------------------------
386
+ # Strategy – Wayback Machine (emergency fallback for blocked/paywalled URLs)
387
+ # ---------------------------------------------------------------------------
388
+
389
def _fetch_wayback(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    Look up the most recent Wayback Machine snapshot for a URL and fetch it.
    Used as a last resort when all live fetch strategies are blocked (403/429).

    Returns (title, text, thumbnail, author, date) or None when no usable
    snapshot exists.

    Fix: return annotation previously claimed a 3-tuple.
    """
    import requests
    import trafilatura

    check = f"https://archive.org/wayback/available?url={url}"
    try:
        resp = requests.get(check, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        snapshot = data.get("archived_snapshots", {}).get("closest", {})
        if not snapshot.get("available"):
            return None
        wb_url = snapshot["url"]
        print(f" [wayback] Snapshot found: {wb_url[:70]}")
    except Exception as e:
        print(f" [wayback] Availability check failed: {e}")
        return None

    try:
        r = requests.get(wb_url, headers=_HEADERS, timeout=timeout, allow_redirects=True)
        r.raise_for_status()
        html = r.text
    except Exception as e:
        print(f" [wayback] Fetch failed: {e}")
        return None

    # Use trafilatura for clean extraction from cached HTML
    try:
        text = trafilatura.extract(html, url=url,
                                   include_comments=True, favor_recall=True) or ""
        meta = trafilatura.extract_metadata(html, default_url=url)
        title = (meta.title if meta else "") or ""
        og_image = (meta.image if meta else "") or ""
        wb_a = (meta.author if meta else "") or ""
        wb_d = (meta.date if meta else "") or ""
    except Exception:
        text = ""; title = ""; og_image = ""; wb_a = ""; wb_d = ""

    # Thumbnail fallback: scan the snapshot HTML, then the domain favicon
    if not og_image:
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "lxml")
            og_image = _resolve_thumbnail(soup, url)
        except Exception:
            og_image = _get_favicon_url(url)

    # Normalise the metadata date to YYYY-MM-DD (or None)
    wb_date = None
    if wb_d:
        _wm = re.search(r'\d{4}-\d{2}-\d{2}', str(wb_d))
        if _wm:
            wb_date = _wm.group(0)

    # Require a minimum amount of text so a junk snapshot doesn't win
    return (title, text, og_image, wb_a, wb_date) if len(text) > 100 else None
446
+
447
+
448
+ # ---------------------------------------------------------------------------
449
+ # Strategy 3 – newspaper4k
450
+ # ---------------------------------------------------------------------------
451
+
452
def _fetch_newspaper(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    newspaper4k (maintained fork of newspaper3k) — excellent for news
    articles, blog posts, and most standard editorial pages.

    Returns (title, text, top_image, author, date) or None when the library
    is missing or the extracted text is too short.

    Fix: return annotation previously claimed a 3-tuple.
    """
    try:
        from newspaper import Article, Config
    except ImportError:
        return None

    cfg = Config()
    cfg.browser_user_agent = _HEADERS["User-Agent"]
    cfg.request_timeout = timeout
    cfg.fetch_images = False
    cfg.memoize_articles = False

    article = Article(url, config=cfg)
    article.download()
    article.parse()

    title = article.title or ""
    text = article.text or ""
    top_image = article.top_image or ""

    if not top_image:
        # Try to get it from meta via soup
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article.html or "", "lxml")
            top_image = _resolve_thumbnail(soup, url)
        except Exception:
            top_image = _get_favicon_url(url)

    # Extract author and publish date from newspaper4k
    np_author = ""
    if hasattr(article, 'authors') and article.authors:
        np_author = article.authors[0]
    np_date = None
    if hasattr(article, 'publish_date') and article.publish_date:
        try:
            _npd = article.publish_date
            if hasattr(_npd, 'strftime'):
                np_date = _npd.strftime("%Y-%m-%d")
            else:
                # publish_date may be a plain string; pull an ISO date out
                _npm = re.search(r'\d{4}-\d{2}-\d{2}', str(_npd))
                if _npm:
                    np_date = _npm.group(0)
        except Exception:
            pass

    # Reject near-empty extractions so later strategies get a chance
    return (title, text, top_image, np_author, np_date) if len(text) > 200 else None
503
+
504
+
505
+ # ---------------------------------------------------------------------------
506
+ # Strategy 4 – trafilatura
507
+ # ---------------------------------------------------------------------------
508
+
509
def _fetch_trafilatura(url: str, timeout: int) -> tuple[str, str, str, str, str | None] | None:
    """
    trafilatura — state-of-the-art main-content extractor.  Handles forums,
    comment threads, Hacker News, Stack Overflow, etc.

    Returns (title, text, image, author, date) or None when the library is
    missing or the extraction is too short.

    Fixes: return annotation previously claimed a 3-tuple; removed the
    unused `use_config` import.
    """
    try:
        import trafilatura
    except ImportError:
        return None

    # Download with our browser-like headers
    import requests
    r = requests.get(url, headers=_HEADERS, timeout=timeout, allow_redirects=True)
    r.raise_for_status()
    html = r.text

    # Extract with trafilatura
    extracted = trafilatura.extract(
        html,
        url=url,
        include_comments=True,
        include_tables=True,
        no_fallback=False,
        favor_recall=True,  # better for forums/threads
    )

    if not extracted or len(extracted) < 200:
        return None

    # Get metadata (title + image) via trafilatura's metadata extractor
    meta = trafilatura.extract_metadata(html, default_url=url)
    title = (meta.title if meta else "") or ""
    og_image = (meta.image if meta else "") or ""
    traf_a = (meta.author if meta else "") or ""
    traf_d = (meta.date if meta else "") or ""
    if not og_image:
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "lxml")
            og_image = _resolve_thumbnail(soup, url)
        except Exception:
            og_image = _get_favicon_url(url)

    # Normalise the metadata date to YYYY-MM-DD (or None)
    traf_date = None
    if traf_d:
        _tm = re.search(r'\d{4}-\d{2}-\d{2}', str(traf_d))
        if _tm:
            traf_date = _tm.group(0)

    return title, extracted, og_image, traf_a, traf_date
560
+
561
+
562
+ # ---------------------------------------------------------------------------
563
+ # Strategy 5 – BeautifulSoup (original reliable fallback)
564
+ # ---------------------------------------------------------------------------
565
+
566
def _fetch_beautifulsoup(url: str, timeout: int) -> tuple[str, str, str, str, str | None]:
    """
    Pure BeautifulSoup fallback — always produces *something*.

    Returns (title, text, thumbnail, author, date); author may be "" and
    date may be None.

    Fix: return annotation previously claimed a 3-tuple.
    """
    import requests

    r = requests.get(url, headers=_HEADERS, timeout=timeout, allow_redirects=True)
    r.raise_for_status()
    html = r.text

    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # lxml missing/broken — fall back to the stdlib parser
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

    thumbnail = _resolve_thumbnail(soup, url)

    # Title: og:title → <title> → first <h1>
    title = ""
    og = soup.find("meta", property="og:title")
    if og and og.get("content"):
        title = og["content"].strip()
    elif soup.title:
        title = soup.title.get_text(strip=True)
    elif soup.find("h1"):
        title = soup.find("h1").get_text(strip=True)

    # Strip non-content elements before text extraction
    for tag in soup(["script", "style", "nav", "header", "footer", "aside",
                     "form", "button", "noscript", "iframe", "svg"]):
        tag.decompose()

    # Prefer known content containers; whole <body> as a last resort
    text = ""
    for sel in ["article", "main", '[role="main"]', ".post-content",
                ".article-body", ".entry-content", ".content", "#content", ".post", "#main"]:
        el = soup.select_one(sel)
        if el:
            text = el.get_text(separator="\n", strip=True)
            break
    if not text:
        body = soup.find("body")
        text = (body or soup).get_text(separator="\n", strip=True)

    # Extract author and date from meta/JSON-LD
    bs_author = ""
    for _ba in [{"property": "article:author"}, {"name": "author"}, {"name": "dc.creator"}]:
        _bm = soup.find("meta", attrs=_ba)
        if _bm and _bm.get("content"):
            bs_author = _bm["content"].strip()
            break
    bs_date = None
    for _ba in [{"property": "article:published_time"}, {"name": "datePublished"},
                {"itemprop": "datePublished"}]:
        _bm = soup.find("meta", attrs=_ba)
        if _bm and _bm.get("content"):
            _bdm = re.search(r'\d{4}-\d{2}-\d{2}', _bm["content"])
            if _bdm:
                bs_date = _bdm.group(0)
                break
    if not bs_date:
        # <time datetime="..."> fallback
        for _bt in soup.find_all("time", attrs={"datetime": True}):
            if re.match(r'\d{4}-\d{2}-\d{2}', _bt["datetime"]):
                bs_date = _bt["datetime"][:10]
                break

    return title, text, thumbnail, bs_author, bs_date
630
+
631
+
632
+ # ---------------------------------------------------------------------------
633
+ # Public: fetch_page_text
634
+ # ---------------------------------------------------------------------------
635
+
636
def fetch_page_text(url: str, timeout: int = 20) -> tuple[str, str, str, str, str | None]:
    """
    Fetch a web page with a multi-strategy pipeline and return
    (title, text, thumbnail, author, post_date).

    Strategy order:
        Reddit JSON API → Medium proxies (scribe.rip/freedium) →
        newspaper4k → trafilatura → Wayback Machine → BeautifulSoup

    The first strategy that yields non-empty text wins.  The BeautifulSoup
    fallback is unconditional, so the only way out without a result is an
    exception from that final raw fetch.
    """

    def _clean(text: str) -> str:
        # Drop very short lines (menu items, labels); cap output at 5000
        # words, appending a truncation marker when the cap is hit.
        lines = [ln.strip() for ln in text.splitlines() if len(ln.strip()) > 10]
        words = " ".join(lines).split()
        if len(words) > 5000:
            return " ".join(words[:5000]) + "\n[... content truncated ...]"
        return "\n".join(lines)

    # 1 — Reddit
    if _is_reddit(url):
        print(" [fetch] Reddit JSON API")
        try:
            result = _fetch_reddit(url, timeout)
            if result and result[1].strip():
                t, txt, thumb, auth, pd = result
                return t, _clean(txt), thumb, auth, pd
        except Exception as e:
            print(f" [fetch] Reddit failed: {e}")

    # 2 — Medium (multi-proxy)
    if _is_medium(url):
        print(" [fetch] Medium proxies (scribe.rip → freedium.cfd)")
        try:
            result = _fetch_medium(url, timeout)
            if result and result[1].strip():
                t, txt, thumb, auth, pd = result
                return t, _clean(txt), thumb, auth, pd
        except Exception as e:
            print(f" [fetch] All Medium proxies failed: {e}")

    # Records an auth/blocked error message from strategies 3-4 so we know
    # to try the Wayback Machine before the raw fallback.
    blocked_error: str = ""

    # 3 — newspaper4k (best for standard articles)
    print(" [fetch] newspaper4k")
    try:
        result = _fetch_newspaper(url, timeout)
        if result and result[1].strip():
            t, txt, thumb, auth, pd = result
            return t, _clean(txt), thumb, auth, pd
    except Exception as e:
        print(f" [fetch] newspaper4k failed: {e}")
        if "403" in str(e) or "401" in str(e) or "Forbidden" in str(e):
            blocked_error = str(e)

    # 4 — trafilatura (best for forums / comment-heavy pages)
    print(" [fetch] trafilatura")
    try:
        result = _fetch_trafilatura(url, timeout)
        if result and result[1].strip():
            t, txt, thumb, auth, pd = result
            return t, _clean(txt), thumb, auth, pd
    except Exception as e:
        print(f" [fetch] trafilatura failed: {e}")
        if "403" in str(e) or "401" in str(e) or "Forbidden" in str(e):
            blocked_error = str(e)

    # 5 — Wayback Machine (when site blocks scrapers; Medium always tried
    # here because the proxies above frequently fail)
    if blocked_error or _is_medium(url):
        print(" [fetch] Wayback Machine (site appears blocked)")
        try:
            result = _fetch_wayback(url, timeout)
            if result and result[1].strip():
                t, txt, thumb, auth, pd = result
                return t, _clean(txt), thumb, auth, pd
        except Exception as e:
            print(f" [fetch] Wayback Machine failed: {e}")

    # 6 — BeautifulSoup raw fallback (no try/except: let a hard failure
    # propagate to the caller)
    print(" [fetch] BeautifulSoup fallback")
    t, txt, thumb, auth, pd = _fetch_beautifulsoup(url, timeout)
    return t, _clean(txt), thumb, auth, pd
716
+
717
+
718
+ # ---------------------------------------------------------------------------
719
+ # Core analyzer (public API)
720
+ # ---------------------------------------------------------------------------
721
+
722
def analyze_webpage(url: str) -> dict:
    """
    Fetch and analyze a web page via ModelRouter text models.

    Returns:
        dict with keys: raw_output, page_title, thumbnail, author,
        post_date, error — where error is None on success and a message
        string on failure (the other fields are then best-effort).
    """
    print(" 🌐 Fetching web page...")

    try:
        page_title, content, thumbnail, page_author, page_date = fetch_page_text(url)
        summary_title = f"'{page_title[:70]}'" if page_title else "(no title)"
        print(f" ✓ Fetched: {summary_title}")
        # Report which kind of thumbnail was resolved
        if thumbnail.startswith("data:"):
            print(" 🖼️ Using SVG placeholder (no image found)")
        elif "google.com/s2/favicons" in thumbnail:
            print(f" 🖼️ Using platform favicon: {_netloc(url)}")
        else:
            print(f" 🖼️ Thumbnail: {thumbnail[:80]}")
    except Exception as e:
        return {"raw_output": "", "page_title": "", "thumbnail": "",
                "author": "", "post_date": None,
                "error": f"Failed to fetch page: {e}"}

    if not content.strip():
        return {"raw_output": "", "page_title": page_title, "thumbnail": thumbnail,
                "author": page_author, "post_date": page_date,
                "error": "No readable text content found on the page"}

    # Truncate content so the prompt stays within model context limits
    prompt = _WEB_PROMPT_TPL.format(
        url=url,
        page_title=page_title or "Unknown",
        content=content[:8000],
    )

    print(" 🤖 Analyzing page content with AI...")

    try:
        # Imported lazily: the router pulls in heavy model dependencies
        from core.model_router import get_router
        raw_output = get_router().generate_text(prompt)
        print(" ✓ Web page analysis complete")
        return {"raw_output": raw_output, "page_title": page_title,
                "thumbnail": thumbnail, "author": page_author,
                "post_date": page_date, "error": None}
    except Exception as e:
        return {"raw_output": "", "page_title": page_title, "thumbnail": thumbnail,
                "author": page_author, "post_date": page_date,
                "error": f"AI analysis failed: {e}"}
770
+
771
+
772
+ # ---------------------------------------------------------------------------
773
+ # CLI
774
+ # ---------------------------------------------------------------------------
775
+
776
if __name__ == "__main__":
    # CLI entry point: take the URL from argv, or prompt interactively.
    # Fix: removed the redundant `import sys` — sys is already imported at
    # module level.
    url = sys.argv[1] if len(sys.argv) > 1 else input("Web page URL: ").strip()
    if url:
        result = analyze_webpage(url)
        if result["error"]:
            print(f"\n✗ Error: {result['error']}")
        else:
            print("\n" + "=" * 60)
            print(f"[thumbnail] {result['thumbnail'][:100]}")
            print("=" * 60)
            print(result["raw_output"])
789
+