superbrain-server 1.0.2-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/bin/superbrain.js +196 -0
  2. package/package.json +23 -0
  3. package/payload/.dockerignore +45 -0
  4. package/payload/.env.example +58 -0
  5. package/payload/Dockerfile +73 -0
  6. package/payload/analyzers/__init__.py +0 -0
  7. package/payload/analyzers/audio_transcribe.py +225 -0
  8. package/payload/analyzers/caption.py +244 -0
  9. package/payload/analyzers/music_identifier.py +346 -0
  10. package/payload/analyzers/text_analyzer.py +117 -0
  11. package/payload/analyzers/visual_analyze.py +218 -0
  12. package/payload/analyzers/webpage_analyzer.py +789 -0
  13. package/payload/analyzers/youtube_analyzer.py +320 -0
  14. package/payload/api.py +1676 -0
  15. package/payload/config/.api_keys.example +22 -0
  16. package/payload/config/model_rankings.json +492 -0
  17. package/payload/config/openrouter_free_models.json +1364 -0
  18. package/payload/config/whisper_model.txt +1 -0
  19. package/payload/config_settings.py +185 -0
  20. package/payload/core/__init__.py +0 -0
  21. package/payload/core/category_manager.py +219 -0
  22. package/payload/core/database.py +811 -0
  23. package/payload/core/link_checker.py +300 -0
  24. package/payload/core/model_router.py +1253 -0
  25. package/payload/docker-compose.yml +120 -0
  26. package/payload/instagram/__init__.py +0 -0
  27. package/payload/instagram/instagram_downloader.py +253 -0
  28. package/payload/instagram/instagram_login.py +190 -0
  29. package/payload/main.py +912 -0
  30. package/payload/requirements.txt +39 -0
  31. package/payload/reset.py +311 -0
  32. package/payload/start-docker-prod.sh +125 -0
  33. package/payload/start-docker.sh +56 -0
  34. package/payload/start.py +1302 -0
  35. package/payload/static/favicon.ico +0 -0
  36. package/payload/stop-docker.sh +16 -0
  37. package/payload/utils/__init__.py +0 -0
  38. package/payload/utils/db_stats.py +108 -0
  39. package/payload/utils/manage_token.py +91 -0
@@ -0,0 +1,300 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Universal Link Validator for SuperBrain
4
+ ========================================
5
+ Detects and validates Instagram, YouTube, and general web page URLs.
6
+
7
+ Returns a unified dict with:
8
+ content_type : 'instagram' | 'youtube' | 'webpage'
9
+ shortcode : DB primary key
10
+ Instagram → original shortcode (e.g. DUQD-t2DC1D)
11
+ YouTube → YT_<video_id> (e.g. YT_dQw4w9WgXcW)
12
+ Webpage → WP_<sha256[:16]> (e.g. WP_a1b2c3d4e5f6a7b8)
13
+ video_id : YouTube video ID (YouTube only, else None)
14
+ valid : bool
15
+ error : str | None
16
+ url : cleaned URL
17
+ """
18
+
19
+ import re
20
+ import hashlib
21
+ import requests
22
+ from urllib.parse import urlparse, parse_qs
23
+
24
+
25
+ # ─────────────────────────────────────────────────────────────────────────────
26
+ # Short-URL resolver
27
+ # ─────────────────────────────────────────────────────────────────────────────
28
+
29
# Hostnames of known URL-shortening / share-link services. Links on these
# domains are resolved to their final destination before being classified.
_SHORT_URL_DOMAINS = {
    "share.google",
    "goo.gl",
    "bit.ly",
    "t.co",
    "tinyurl.com",
    "ow.ly",
    "buff.ly",
    "short.gy",
    "rb.gy",
    "shorturl.at",
    "is.gd",
    "v.gd",
    "cutt.ly",
}


def _is_short_url(netloc: str) -> bool:
    """Return True if *netloc* belongs to a known URL shortener.

    The comparison is case-insensitive, ignores a single leading ``www.``
    label, and also accepts any subdomain of a listed shortener.

    Args:
        netloc: Network-location part of a parsed URL (e.g. ``"bit.ly"``).

    Returns:
        True when the host equals, or is a subdomain of, an entry in
        ``_SHORT_URL_DOMAINS``; False otherwise.
    """
    # removeprefix() drops the literal "www." prefix only. str.lstrip("www.")
    # would strip any run of leading 'w' and '.' characters, turning e.g.
    # "wt.co" into "t.co" and wrongly reporting it as a shortener.
    host = netloc.lower().removeprefix("www.")
    return any(
        host == domain or host.endswith("." + domain)
        for domain in _SHORT_URL_DOMAINS
    )
50
+
51
+
52
# User-Agent header value sent while resolving short links: a mobile Chrome
# build on an Android Pixel 8, assembled from its three product tokens.
_MOBILE_UA = " ".join(
    [
        "Mozilla/5.0 (Linux; Android 14; Pixel 8)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/122.0.0.0 Mobile Safari/537.36",
    ]
)
57
+
58
+
59
def _resolve_url(url: str) -> str:
    """Return the URL reached after following every redirect from *url*.

    This is best effort: if the request fails for any reason, the input URL
    is returned unchanged.

    Args:
        url: The (typically shortened) URL to resolve.

    Returns:
        The final URL reported by the HTTP response, or *url* on failure.
    """
    try:
        # stream=True defers the body download; only the redirect chain is
        # of interest. Leaving the ``with`` block closes the response.
        with requests.get(
            url,
            headers={"User-Agent": _MOBILE_UA},
            allow_redirects=True,
            stream=True,
            timeout=10,
        ) as response:
            return response.url
    except Exception:
        # Deliberate catch-all: resolution must never break link validation.
        return url
72
+
73
+
74
+ # ─────────────────────────────────────────────────────────────────────────────
75
+ # Text → URL extractor
76
+ # ─────────────────────────────────────────────────────────────────────────────
77
+
78
# Finds bare http/https URLs embedded in free text, such as a shared
# "Title - Site https://..." string. A match runs until the first whitespace,
# double quote, or angle bracket; the scheme is matched case-insensitively.
_URL_IN_TEXT_RE = re.compile(r'https?://[^\s"<>]+', flags=re.IGNORECASE)
83
+
84
+
85
+ # ─────────────────────────────────────────────────────────────────────────────
86
+ # Instagram
87
+ # ─────────────────────────────────────────────────────────────────────────────
88
+
89
+ def _validate_instagram(url: str, parsed) -> dict:
90
+ """Returns validate_link result for an Instagram URL, or None if not Instagram."""
91
+ if parsed.netloc not in (
92
+ "instagram.com", "www.instagram.com", "instagr.am", "www.instagr.am"
93
+ ):
94
+ return None
95
+
96
+ match = re.search(r"/(?:p|reel|reels|tv)/([A-Za-z0-9_-]+)", parsed.path)
97
+ if not match:
98
+ return {
99
+ "valid": False, "content_type": "instagram",
100
+ "shortcode": None, "video_id": None,
101
+ "error": "Not a valid Instagram post/reel/video URL", "url": url,
102
+ }
103
+
104
+ shortcode = match.group(1)
105
+ if not re.match(r"^[A-Za-z0-9_-]+$", shortcode):
106
+ return {
107
+ "valid": False, "content_type": "instagram",
108
+ "shortcode": None, "video_id": None,
109
+ "error": "Invalid Instagram shortcode format", "url": url,
110
+ }
111
+
112
+ return {
113
+ "valid": True, "content_type": "instagram",
114
+ "shortcode": shortcode, "video_id": None,
115
+ "error": None, "url": url,
116
+ }
117
+
118
+
119
+ # ─────────────────────────────────────────────────────────────────────────────
120
+ # YouTube
121
+ # ─────────────────────────────────────────────────────────────────────────────
122
+
123
+ _YT_DOMAINS = (
124
+ "youtube.com", "www.youtube.com", "m.youtube.com",
125
+ "youtu.be", "www.youtu.be",
126
+ "youtube-nocookie.com", "www.youtube-nocookie.com",
127
+ )
128
+
129
+
130
+ def _extract_youtube_id(url: str, parsed) -> str | None:
131
+ """Extract video ID from any known YouTube URL format."""
132
+ netloc = parsed.netloc.lower()
133
+ if netloc not in _YT_DOMAINS:
134
+ return None
135
+
136
+ path = parsed.path
137
+ qs = parse_qs(parsed.query)
138
+
139
+ # youtu.be/<id>
140
+ if "youtu.be" in netloc:
141
+ m = re.match(r"^/([A-Za-z0-9_-]{11})", path)
142
+ return m.group(1) if m else None
143
+
144
+ # /watch?v=<id>
145
+ if "/watch" in path and "v" in qs:
146
+ return qs["v"][0]
147
+
148
+ # /shorts/<id> or /embed/<id> or /v/<id> or /live/<id>
149
+ m = re.match(r"^/(?:shorts|embed|v|live|e)/([A-Za-z0-9_-]{11})", path)
150
+ if m:
151
+ return m.group(1)
152
+
153
+ return None
154
+
155
+
156
def _validate_youtube(url: str, parsed) -> dict | None:
    """Build the validate_link result for a YouTube URL.

    Returns None when no video ID can be extracted (not a YouTube link).
    Otherwise returns a valid result whose ``url`` is rewritten to the
    canonical ``https://www.youtube.com/watch?v=<id>`` form and whose
    ``shortcode`` is the ID prefixed with ``YT_``.
    """
    vid = _extract_youtube_id(url, parsed)
    if vid is None:
        return None

    outcome = {
        "valid": True,
        "content_type": "youtube",
        "shortcode": f"YT_{vid}",
        "video_id": vid,
        "error": None,
        "url": f"https://www.youtube.com/watch?v={vid}",
    }
    return outcome
168
+
169
+
170
+ # ─────────────────────────────────────────────────────────────────────────────
171
+ # Generic web page
172
+ # ─────────────────────────────────────────────────────────────────────────────
173
+
174
def _make_page_id(url: str) -> str:
    """Return a deterministic 16-character ID for *url*.

    The ID is the first 16 hex digits of the SHA-256 digest of the URL's
    encoded bytes.
    """
    digest = hashlib.sha256()
    digest.update(url.encode())
    hex_digest = digest.hexdigest()
    return hex_digest[:16]
177
+
178
+
179
def _validate_webpage(url: str, parsed) -> dict:
    """Build the validate_link result for a generic web page.

    Unlike the site-specific validators this one always returns a dict: an
    invalid result when the scheme is not http/https or the URL has no
    domain, otherwise a valid result whose shortcode is ``WP_`` followed by
    the page ID derived from the URL.
    """
    # Decide up front whether the URL is rejected, and why.
    if parsed.scheme not in ("http", "https"):
        problem = "URL must use http or https"
    elif not parsed.netloc:
        problem = "Invalid URL — no domain found"
    else:
        problem = None

    if problem is not None:
        return {
            "valid": False,
            "content_type": "webpage",
            "shortcode": None,
            "video_id": None,
            "error": problem,
            "url": url,
        }

    return {
        "valid": True,
        "content_type": "webpage",
        "shortcode": f"WP_{_make_page_id(url)}",
        "video_id": None,
        "error": None,
        "url": url,
    }
200
+
201
+
202
+ # ─────────────────────────────────────────────────────────────────────────────
203
+ # Public API
204
+ # ─────────────────────────────────────────────────────────────────────────────
205
+
206
def validate_link(url: str) -> dict:
    """
    Validate any URL and detect its content type.

    Handles:
      - Plain URLs:        https://example.com/article
      - Short URLs:        share.google/xxx, bit.ly/xxx → resolved to final URL
      - Title + URL text:  "Some Title https://example.com" → URL extracted

    Args:
        url: A URL, or free text containing one. Empty or non-string input
            yields an invalid result instead of raising.

    Returns:
        {
            'valid'        : bool,
            'content_type' : 'instagram' | 'youtube' | 'webpage',
            'shortcode'    : str | None,   # DB primary key
            'video_id'     : str | None,   # YouTube video ID only
            'error'        : str | None,
            'url'          : str,
        }
    """
    if not url or not isinstance(url, str):
        return {
            "valid": False, "content_type": "webpage",
            "shortcode": None, "video_id": None,
            "error": "Empty or invalid URL", "url": url or "",
        }

    url = url.strip()

    # ── Step 1: parse; if the input is "Title https://url" style text,
    # extract the first embedded URL and parse that instead. Every urlparse
    # call sits inside the try because malformed input (e.g. an unbalanced
    # "[" in the host) raises ValueError, which must surface as an invalid
    # result rather than an exception.
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            matches = _URL_IN_TEXT_RE.findall(url)
            if matches:
                # Drop punctuation that commonly trails a URL in prose.
                url = matches[0].rstrip(".,);")
                parsed = urlparse(url)
    except ValueError as e:
        return {
            "valid": False, "content_type": "webpage",
            "shortcode": None, "video_id": None,
            "error": f"Invalid URL format: {e}", "url": url,
        }

    # ── Step 2: Resolve short / redirect URLs before further validation ──
    if _is_short_url(parsed.netloc):
        resolved = _resolve_url(url)
        if resolved != url:
            try:
                resolved_parsed = urlparse(resolved)
            except ValueError:
                # Unparseable redirect target: keep the original URL so that
                # ``url`` and ``parsed`` keep describing the same link.
                pass
            else:
                url, parsed = resolved, resolved_parsed

    # ── Step 3: first site-specific validator that recognises the host wins;
    # the generic web-page validator is the fallback and always answers.
    result = _validate_instagram(url, parsed)
    if result is not None:
        return result

    result = _validate_youtube(url, parsed)
    if result is not None:
        return result

    return _validate_webpage(url, parsed)
270
+
271
+
272
+ # Backward-compat shim for code that still calls is_valid_instagram_link()
273
def is_valid_instagram_link(url: str):
    """Legacy wrapper around validate_link(); prefer validate_link().

    Returns a ``(valid, shortcode, error)`` tuple. Links that are not
    classified as Instagram give ``(False, None, "Not an Instagram URL")``.
    """
    outcome = validate_link(url)
    if outcome["content_type"] == "instagram":
        return outcome["valid"], outcome["shortcode"], outcome["error"]
    return False, None, "Not an Instagram URL"
279
+
280
+
281
+ # ─────────────────────────────────────────────────────────────────────────────
282
+ # CLI test
283
+ # ─────────────────────────────────────────────────────────────────────────────
284
+
285
if __name__ == "__main__":
    # Manual smoke test: run a few representative inputs through
    # validate_link() and print one summary line per input.
    samples = (
        "https://www.instagram.com/reel/DUQD-t2DC1D/",
        "https://www.instagram.com/p/DRWKk5JiL0h/",
        "https://www.youtube.com/watch?v=dQw4w9WgXcW",
        "https://youtu.be/dQw4w9WgXcW",
        "https://www.youtube.com/shorts/ab12cd34ef5",
        "https://techcrunch.com/2024/01/01/some-article/",
        "https://www.instagram.com/username/",  # invalid IG (no post path)
        "not-a-url",
    )
    print("=" * 70)
    for sample in samples:
        outcome = validate_link(sample)
        mark = "✓" if outcome["valid"] else "✗"
        kind = outcome["content_type"]
        code = str(outcome["shortcode"])
        print(f"{mark} [{kind:<9}] shortcode={code:<28} | {sample}")