superbrain-server 1.0.2-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/superbrain.js +196 -0
- package/package.json +23 -0
- package/payload/.dockerignore +45 -0
- package/payload/.env.example +58 -0
- package/payload/Dockerfile +73 -0
- package/payload/analyzers/__init__.py +0 -0
- package/payload/analyzers/audio_transcribe.py +225 -0
- package/payload/analyzers/caption.py +244 -0
- package/payload/analyzers/music_identifier.py +346 -0
- package/payload/analyzers/text_analyzer.py +117 -0
- package/payload/analyzers/visual_analyze.py +218 -0
- package/payload/analyzers/webpage_analyzer.py +789 -0
- package/payload/analyzers/youtube_analyzer.py +320 -0
- package/payload/api.py +1676 -0
- package/payload/config/.api_keys.example +22 -0
- package/payload/config/model_rankings.json +492 -0
- package/payload/config/openrouter_free_models.json +1364 -0
- package/payload/config/whisper_model.txt +1 -0
- package/payload/config_settings.py +185 -0
- package/payload/core/__init__.py +0 -0
- package/payload/core/category_manager.py +219 -0
- package/payload/core/database.py +811 -0
- package/payload/core/link_checker.py +300 -0
- package/payload/core/model_router.py +1253 -0
- package/payload/docker-compose.yml +120 -0
- package/payload/instagram/__init__.py +0 -0
- package/payload/instagram/instagram_downloader.py +253 -0
- package/payload/instagram/instagram_login.py +190 -0
- package/payload/main.py +912 -0
- package/payload/requirements.txt +39 -0
- package/payload/reset.py +311 -0
- package/payload/start-docker-prod.sh +125 -0
- package/payload/start-docker.sh +56 -0
- package/payload/start.py +1302 -0
- package/payload/static/favicon.ico +0 -0
- package/payload/stop-docker.sh +16 -0
- package/payload/utils/__init__.py +0 -0
- package/payload/utils/db_stats.py +108 -0
- package/payload/utils/manage_token.py +91 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Universal Link Validator for SuperBrain
|
|
4
|
+
========================================
|
|
5
|
+
Detects and validates Instagram, YouTube, and general web page URLs.
|
|
6
|
+
|
|
7
|
+
Returns a unified dict with:
|
|
8
|
+
content_type : 'instagram' | 'youtube' | 'webpage'
|
|
9
|
+
shortcode : DB primary key
|
|
10
|
+
Instagram → original shortcode (e.g. DUQD-t2DC1D)
|
|
11
|
+
YouTube → YT_<video_id> (e.g. YT_dQw4w9WgXcW)
|
|
12
|
+
Webpage → WP_<sha256[:16]> (e.g. WP_a1b2c3d4e5f6a7b8)
|
|
13
|
+
video_id : YouTube video ID (YouTube only, else None)
|
|
14
|
+
valid : bool
|
|
15
|
+
error : str | None
|
|
16
|
+
url : cleaned URL
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
import hashlib
|
|
21
|
+
import requests
|
|
22
|
+
from urllib.parse import urlparse, parse_qs
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
26
|
+
# Short-URL resolver
|
|
27
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
_SHORT_URL_DOMAINS = {
|
|
30
|
+
"share.google",
|
|
31
|
+
"goo.gl",
|
|
32
|
+
"bit.ly",
|
|
33
|
+
"t.co",
|
|
34
|
+
"tinyurl.com",
|
|
35
|
+
"ow.ly",
|
|
36
|
+
"buff.ly",
|
|
37
|
+
"short.gy",
|
|
38
|
+
"rb.gy",
|
|
39
|
+
"shorturl.at",
|
|
40
|
+
"is.gd",
|
|
41
|
+
"v.gd",
|
|
42
|
+
"cutt.ly",
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _is_short_url(netloc: str) -> bool:
|
|
47
|
+
"""Return True if the domain is a known URL shortener."""
|
|
48
|
+
netloc = netloc.lower().lstrip("www.")
|
|
49
|
+
return any(netloc == d or netloc.endswith("." + d) for d in _SHORT_URL_DOMAINS)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
_MOBILE_UA = (
|
|
53
|
+
"Mozilla/5.0 (Linux; Android 14; Pixel 8) "
|
|
54
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
55
|
+
"Chrome/122.0.0.0 Mobile Safari/537.36"
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _resolve_url(url: str) -> str:
    """Return the URL reached after following redirects from *url*.

    This is best-effort: if the request raises any exception, *url* is
    returned unchanged.
    """
    try:
        # stream=True defers the body download; only the redirect chain
        # (and therefore the final URL) is of interest here.
        with requests.get(
            url,
            headers={"User-Agent": _MOBILE_UA},
            stream=True,
            timeout=10,
            allow_redirects=True,
        ) as response:
            return response.url
    except Exception:
        return url
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
75
|
+
# Text → URL extractor
|
|
76
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
77
|
+
|
|
78
|
+
# Matches any bare http/https URL in free text (e.g. "Title - Site https://...")
|
|
79
|
+
_URL_IN_TEXT_RE = re.compile(
|
|
80
|
+
r'https?://[^\s"<>]+',
|
|
81
|
+
re.IGNORECASE,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
86
|
+
# Instagram
|
|
87
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
88
|
+
|
|
89
|
+
def _validate_instagram(url: str, parsed) -> dict:
|
|
90
|
+
"""Returns validate_link result for an Instagram URL, or None if not Instagram."""
|
|
91
|
+
if parsed.netloc not in (
|
|
92
|
+
"instagram.com", "www.instagram.com", "instagr.am", "www.instagr.am"
|
|
93
|
+
):
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
match = re.search(r"/(?:p|reel|reels|tv)/([A-Za-z0-9_-]+)", parsed.path)
|
|
97
|
+
if not match:
|
|
98
|
+
return {
|
|
99
|
+
"valid": False, "content_type": "instagram",
|
|
100
|
+
"shortcode": None, "video_id": None,
|
|
101
|
+
"error": "Not a valid Instagram post/reel/video URL", "url": url,
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
shortcode = match.group(1)
|
|
105
|
+
if not re.match(r"^[A-Za-z0-9_-]+$", shortcode):
|
|
106
|
+
return {
|
|
107
|
+
"valid": False, "content_type": "instagram",
|
|
108
|
+
"shortcode": None, "video_id": None,
|
|
109
|
+
"error": "Invalid Instagram shortcode format", "url": url,
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
return {
|
|
113
|
+
"valid": True, "content_type": "instagram",
|
|
114
|
+
"shortcode": shortcode, "video_id": None,
|
|
115
|
+
"error": None, "url": url,
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
120
|
+
# YouTube
|
|
121
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
122
|
+
|
|
123
|
+
_YT_DOMAINS = (
|
|
124
|
+
"youtube.com", "www.youtube.com", "m.youtube.com",
|
|
125
|
+
"youtu.be", "www.youtu.be",
|
|
126
|
+
"youtube-nocookie.com", "www.youtube-nocookie.com",
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _extract_youtube_id(url: str, parsed) -> str | None:
|
|
131
|
+
"""Extract video ID from any known YouTube URL format."""
|
|
132
|
+
netloc = parsed.netloc.lower()
|
|
133
|
+
if netloc not in _YT_DOMAINS:
|
|
134
|
+
return None
|
|
135
|
+
|
|
136
|
+
path = parsed.path
|
|
137
|
+
qs = parse_qs(parsed.query)
|
|
138
|
+
|
|
139
|
+
# youtu.be/<id>
|
|
140
|
+
if "youtu.be" in netloc:
|
|
141
|
+
m = re.match(r"^/([A-Za-z0-9_-]{11})", path)
|
|
142
|
+
return m.group(1) if m else None
|
|
143
|
+
|
|
144
|
+
# /watch?v=<id>
|
|
145
|
+
if "/watch" in path and "v" in qs:
|
|
146
|
+
return qs["v"][0]
|
|
147
|
+
|
|
148
|
+
# /shorts/<id> or /embed/<id> or /v/<id> or /live/<id>
|
|
149
|
+
m = re.match(r"^/(?:shorts|embed|v|live|e)/([A-Za-z0-9_-]{11})", path)
|
|
150
|
+
if m:
|
|
151
|
+
return m.group(1)
|
|
152
|
+
|
|
153
|
+
return None
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _validate_youtube(url: str, parsed) -> dict | None:
    """Build the validate_link result for a YouTube URL.

    Returns ``None`` when no video ID can be extracted. Otherwise returns a
    valid result whose ``url`` is rewritten to the canonical
    ``https://www.youtube.com/watch?v=<id>`` form and whose shortcode is
    ``YT_<id>``.
    """
    vid = _extract_youtube_id(url, parsed)
    if vid is not None:
        return {
            "valid": True, "content_type": "youtube",
            "shortcode": f"YT_{vid}", "video_id": vid,
            "error": None, "url": f"https://www.youtube.com/watch?v={vid}",
        }
    return None
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
171
|
+
# Generic web page
|
|
172
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
173
|
+
|
|
174
|
+
def _make_page_id(url: str) -> str:
|
|
175
|
+
"""Deterministic 16-char ID derived from the URL (sha256 hex prefix)."""
|
|
176
|
+
return hashlib.sha256(url.encode()).hexdigest()[:16]
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _validate_webpage(url: str, parsed) -> dict:
|
|
180
|
+
"""Always returns a validate_link result for any http/https URL."""
|
|
181
|
+
if parsed.scheme not in ("http", "https"):
|
|
182
|
+
return {
|
|
183
|
+
"valid": False, "content_type": "webpage",
|
|
184
|
+
"shortcode": None, "video_id": None,
|
|
185
|
+
"error": "URL must use http or https", "url": url,
|
|
186
|
+
}
|
|
187
|
+
if not parsed.netloc:
|
|
188
|
+
return {
|
|
189
|
+
"valid": False, "content_type": "webpage",
|
|
190
|
+
"shortcode": None, "video_id": None,
|
|
191
|
+
"error": "Invalid URL — no domain found", "url": url,
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
page_id = _make_page_id(url)
|
|
195
|
+
return {
|
|
196
|
+
"valid": True, "content_type": "webpage",
|
|
197
|
+
"shortcode": f"WP_{page_id}", "video_id": None,
|
|
198
|
+
"error": None, "url": url,
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
203
|
+
# Public API
|
|
204
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
205
|
+
|
|
206
|
+
def validate_link(url: str) -> dict:
    """
    Validate any URL and detect its content type.

    Handles:
      - Plain URLs: https://example.com/article
      - Short URLs: share.google/xxx, bit.ly/xxx → resolved to final URL
      - Title + URL text: "Some Title https://example.com" → URL extracted

    Malformed input never raises: a URL that ``urlparse`` rejects (for
    example an unbalanced IPv6 bracket) yields an invalid result instead.

    Returns:
      {
        'valid'        : bool,
        'content_type' : 'instagram' | 'youtube' | 'webpage',
        'shortcode'    : str | None,   # DB primary key
        'video_id'     : str | None,   # YouTube video ID only
        'error'        : str | None,
        'url'          : str,
      }
    """
    if not url or not isinstance(url, str):
        return {
            "valid": False, "content_type": "webpage",
            "shortcode": None, "video_id": None,
            "error": "Empty or invalid URL", "url": url or "",
        }

    url = url.strip()

    try:
        parsed = urlparse(url)
        # ── Step 1: If input is "Title https://url" style text, extract the URL ──
        # Only attempt extraction when the full string doesn't parse as a URL
        if parsed.scheme not in ("http", "https"):
            found = _URL_IN_TEXT_RE.search(url)
            if found:
                # Drop punctuation that commonly trails a URL inside prose.
                url = found.group(0).rstrip(".,);")
                parsed = urlparse(url)
    except ValueError as e:
        # urlparse raises ValueError on malformed netlocs; both parses sit
        # inside this try so such input yields a result instead of escaping.
        return {
            "valid": False, "content_type": "webpage",
            "shortcode": None, "video_id": None,
            "error": f"Invalid URL format: {e}", "url": url,
        }

    # ── Step 2: Resolve short / redirect URLs before further validation ──
    if _is_short_url(parsed.netloc):
        resolved = _resolve_url(url)
        if resolved != url:
            try:
                resolved_parsed = urlparse(resolved)
            except ValueError:
                # Unparseable redirect target: keep validating the short URL
                # so that url and parsed continue to describe the same thing.
                pass
            else:
                url, parsed = resolved, resolved_parsed

    # ── Step 3: First matching site-specific validator wins ──
    result = _validate_instagram(url, parsed)
    if result is not None:
        return result

    result = _validate_youtube(url, parsed)
    if result is not None:
        return result

    return _validate_webpage(url, parsed)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# Backward-compat shim for code that still calls is_valid_instagram_link()
|
|
273
|
+
def is_valid_instagram_link(url: str):
    """Legacy wrapper around validate_link(); prefer calling that directly.

    Returns a ``(valid, shortcode, error)`` tuple. Any URL that is not
    classified as Instagram yields ``(False, None, "Not an Instagram URL")``.
    """
    result = validate_link(url)
    if result["content_type"] == "instagram":
        return result["valid"], result["shortcode"], result["error"]
    return False, None, "Not an Instagram URL"
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
282
|
+
# CLI test
|
|
283
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
284
|
+
|
|
285
|
+
if __name__ == "__main__":
    # Manual smoke test: run each sample through validate_link() and print
    # one line per URL showing validity, detected type and shortcode.
    samples = (
        "https://www.instagram.com/reel/DUQD-t2DC1D/",
        "https://www.instagram.com/p/DRWKk5JiL0h/",
        "https://www.youtube.com/watch?v=dQw4w9WgXcW",
        "https://youtu.be/dQw4w9WgXcW",
        "https://www.youtube.com/shorts/ab12cd34ef5",
        "https://techcrunch.com/2024/01/01/some-article/",
        "https://www.instagram.com/username/",  # invalid IG (no post path)
        "not-a-url",
    )
    print("=" * 70)
    for u in samples:
        r = validate_link(u)
        if r["valid"]:
            icon = "✓"
        else:
            icon = "✗"
        print(f"{icon} [{r['content_type']:<9}] shortcode={str(r['shortcode']):<28} | {u}")
|