superbrain-server 1.0.2-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/bin/superbrain.js +196 -0
  2. package/package.json +23 -0
  3. package/payload/.dockerignore +45 -0
  4. package/payload/.env.example +58 -0
  5. package/payload/Dockerfile +73 -0
  6. package/payload/analyzers/__init__.py +0 -0
  7. package/payload/analyzers/audio_transcribe.py +225 -0
  8. package/payload/analyzers/caption.py +244 -0
  9. package/payload/analyzers/music_identifier.py +346 -0
  10. package/payload/analyzers/text_analyzer.py +117 -0
  11. package/payload/analyzers/visual_analyze.py +218 -0
  12. package/payload/analyzers/webpage_analyzer.py +789 -0
  13. package/payload/analyzers/youtube_analyzer.py +320 -0
  14. package/payload/api.py +1676 -0
  15. package/payload/config/.api_keys.example +22 -0
  16. package/payload/config/model_rankings.json +492 -0
  17. package/payload/config/openrouter_free_models.json +1364 -0
  18. package/payload/config/whisper_model.txt +1 -0
  19. package/payload/config_settings.py +185 -0
  20. package/payload/core/__init__.py +0 -0
  21. package/payload/core/category_manager.py +219 -0
  22. package/payload/core/database.py +811 -0
  23. package/payload/core/link_checker.py +300 -0
  24. package/payload/core/model_router.py +1253 -0
  25. package/payload/docker-compose.yml +120 -0
  26. package/payload/instagram/__init__.py +0 -0
  27. package/payload/instagram/instagram_downloader.py +253 -0
  28. package/payload/instagram/instagram_login.py +190 -0
  29. package/payload/main.py +912 -0
  30. package/payload/requirements.txt +39 -0
  31. package/payload/reset.py +311 -0
  32. package/payload/start-docker-prod.sh +125 -0
  33. package/payload/start-docker.sh +56 -0
  34. package/payload/start.py +1302 -0
  35. package/payload/static/favicon.ico +0 -0
  36. package/payload/stop-docker.sh +16 -0
  37. package/payload/utils/__init__.py +0 -0
  38. package/payload/utils/db_stats.py +108 -0
  39. package/payload/utils/manage_token.py +91 -0
@@ -0,0 +1,320 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ YouTube Video Analyzer for SuperBrain
4
+ =======================================
5
+ Uses Gemini's native YouTube URL understanding — no download, no audio
6
+ transcription, no frame extraction. One API call → full structured analysis.
7
+
8
+ The google.genai SDK passes the YouTube URL directly to Gemini which can
9
+ watch, listen, and read the video natively at Google's data centre.
10
+ """
11
+
12
+ import os
13
+ import re
14
+ from pathlib import Path
15
+ from urllib.parse import urlparse, parse_qs
16
+
17
+ API_KEYS_FILE = Path(__file__).resolve().parent.parent / "config" / ".api_keys"
18
+
19
+ # ── Prompt ────────────────────────────────────────────────────────────────────
20
+
21
# Prompt sent verbatim to Gemini. The emoji headers double as field anchors
# for _parse_yt_field(), which tolerates emoji/no-emoji and markdown-bold
# variants — so repairing the mojibake (U+FFFD '�') in the CHANNEL and
# SUMMARY headers is safe for downstream parsing.
YOUTUBE_PROMPT = """Watch this YouTube video carefully and write a structured analysis report.

Generate the report in this EXACT format (use these exact emoji headers):

📌 TITLE:
[The actual video title, or a clear descriptive title if you can't read it]

📺 CHANNEL:
[The YouTube channel name or creator/uploader name]

📅 DATE:
[Upload date in YYYY-MM-DD format if visible or known. Otherwise write "Unknown"]

📝 SUMMARY:
[Comprehensive 3-5 sentence summary covering: main topic/theme, key points and
information shared, any products/places/tools/tips mentioned, who this content
is for, and the overall value or takeaway]

🏷️ TAGS:
[Generate 8-12 relevant hashtags/keywords separated by spaces, e.g. #dji #drone #aerial]

🎵 MUSIC:
[Name specific background music or songs heard in the video. If there is no
identifiable background music, write "No background music". If it's voiceover
only, write "Voiceover only".]

📂 CATEGORY:
[Choose exactly ONE from: product, places, recipe, software, book, tv shows, workout, film, event, other]

Be specific, accurate, and extractive — pull out real names, numbers, and facts from the video."""
51
+
52
+
53
+ # ── Thumbnail helper ─────────────────────────────────────────────────────
54
+
55
+ def _extract_video_id(youtube_url: str) -> str:
56
+ """Extract the 11-char video ID from any YouTube URL format."""
57
+ parsed = urlparse(youtube_url)
58
+ qs = parse_qs(parsed.query)
59
+ if "youtu.be" in parsed.netloc:
60
+ return parsed.path.lstrip("/").split("/")[0]
61
+ if "v" in qs:
62
+ return qs["v"][0]
63
+ m = re.match(r"^/(?:shorts|embed|v|live|e)/([A-Za-z0-9_-]{11})", parsed.path)
64
+ return m.group(1) if m else ""
65
+
66
+
67
+ def _parse_yt_field(raw: str, label: str) -> str:
68
+ """Extract a single-line field value from YouTube Gemini output.
69
+ Handles emoji/no-emoji and markdown bold variants.
70
+ """
71
+ pattern = re.compile(
72
+ rf'(?:^|\n)\s*\S*\s*\*{{0,2}}{re.escape(label)}\*{{0,2}}:?\s*([^\n]+)',
73
+ re.IGNORECASE,
74
+ )
75
+ m = pattern.search(raw)
76
+ return m.group(1).strip().strip("*").strip() if m else ""
77
+
78
+
79
def get_youtube_channel_name(url: str, ai_raw: str = "") -> str:
    """
    Multi-stage robust YouTube channel name extractor.

    Stages (tried in order, returns first non-empty result):
      1. oEmbed API   — fast, no auth, reliable for public videos
      2. HTML scrape  — parses itemprop/JSON-LD metadata from the watch page
      3. yt-dlp       — subprocess call (if yt-dlp is installed)
      4. AI output    — value parsed from Gemini's CHANNEL field in *ai_raw*

    Returns "" when every stage fails. Each network stage is best-effort:
    any exception falls through to the next stage.
    """
    # Imported lazily so module import stays cheap when this path is unused.
    # (The original also pulled in `json as _json`, which was never used.)
    import requests
    import shutil
    import subprocess

    # ── Stage 1: oEmbed (fastest, no auth) ───────────────────────────────
    try:
        r = requests.get(
            "https://www.youtube.com/oembed",
            params={"url": url, "format": "json"},
            timeout=8,
        )
        if r.ok:
            name = r.json().get("author_name", "").strip()
            if name:
                return name
    except Exception:
        pass  # best-effort — fall through to the next stage

    # ── Stage 2: HTML meta scrape ─────────────────────────────────────────
    try:
        r = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"},
            timeout=10,
        )
        text = r.text
        # JSON-LD: "author":{"@type":"Person","name":"Channel Name"}
        m = re.search(r'"author"\s*:\s*\{[^}]*"name"\s*:\s*"([^"]+)"', text)
        if m:
            return m.group(1).strip()
        # itemprop author/name meta pair
        m = re.search(r'itemprop="author"[^>]*>\s*<[^>]*itemprop="name"[^>]*content="([^"]+)"', text)
        if m:
            return m.group(1).strip()
        # ytInitialData ownerText (embedded player JSON)
        m = re.search(r'"ownerText"\s*:\s*\{"runs"\s*:\s*\[\{"text"\s*:\s*"([^"]+)"', text)
        if m:
            return m.group(1).strip()
    except Exception:
        pass  # best-effort — fall through to the next stage

    # ── Stage 3: yt-dlp subprocess ────────────────────────────────────────
    if shutil.which("yt-dlp"):
        try:
            result = subprocess.run(
                ["yt-dlp", "--print", "channel", "--no-download", "--quiet", url],
                capture_output=True, text=True, timeout=20,
            )
            name = result.stdout.strip()
            if name:
                return name
        except Exception:
            pass  # yt-dlp hung or errored — fall through

    # ── Stage 4: AI-parsed fallback ───────────────────────────────────────
    if ai_raw:
        name = _parse_yt_field(ai_raw, "CHANNEL")
        if name:
            return name

    return ""
148
+
149
+
150
def get_youtube_upload_date(youtube_url: str) -> str | None:
    """
    Scrape the actual upload date from the YouTube watch-page HTML.

    Checks the uploadDate / publishDate JSON keys and the datePublished
    meta tag, in that order. Returns a 'YYYY-MM-DD' string, or None when
    the page cannot be fetched or carries no date marker.
    """
    date_patterns = (
        r'"uploadDate":"(\d{4}-\d{2}-\d{2})',
        r'"publishDate":"(\d{4}-\d{2}-\d{2})',
        r'<meta itemprop="datePublished" content="(\d{4}-\d{2}-\d{2})',
    )
    try:
        import requests
        page = requests.get(
            youtube_url,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"},
            timeout=10,
        )
        for date_re in date_patterns:
            found = re.search(date_re, page.text)
            if found:
                return found.group(1)
    except Exception:
        pass  # network/import failure → best-effort None
    return None
173
+
174
+
175
def get_youtube_thumbnail(youtube_url: str) -> str:
    """
    Return a direct HTTPS thumbnail URL for a YouTube video — no download.

    Prefers maxresdefault (1280x720) and falls back to hqdefault (480x360)
    when maxres is missing or cannot be verified. Returns "" for URLs with
    no recognizable video ID.
    """
    vid = _extract_video_id(youtube_url)
    if not vid:
        return ""

    maxres_url = f"https://img.youtube.com/vi/{vid}/maxresdefault.jpg"
    hq_url = f"https://img.youtube.com/vi/{vid}/hqdefault.jpg"

    # YouTube answers 200 with a tiny 120x90 stub even when maxres doesn't
    # exist, so status alone isn't enough — use content-length as a size
    # heuristic (real maxres images are well above 5 KB).
    try:
        import requests
        head = requests.head(maxres_url, timeout=5)
        size = int(head.headers.get("content-length", 0))
        if head.status_code == 200 and size > 5000:
            return maxres_url
        return hq_url
    except Exception:
        return hq_url
197
+
198
+
199
+ # ── Key loader ─────────────────────────────────────────────────────────────────
200
+
201
def _load_gemini_key() -> str:
    """Read GEMINI_API_KEY from the .api_keys file, else the environment.

    The key file holds simple KEY=VALUE lines; lines starting with '#'
    are comments. Returns "" when the key is found in neither place.
    """
    parsed: dict[str, str] = {}
    if API_KEYS_FILE.exists():
        raw = API_KEYS_FILE.read_text(encoding="utf-8", errors="ignore")
        for entry in raw.splitlines():
            entry = entry.strip()
            if entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            parsed[key.strip()] = value.strip()
    return parsed.get("GEMINI_API_KEY") or os.getenv("GEMINI_API_KEY", "")
210
+
211
+
212
# ── Model fallback chain (supports YouTube video natively) ──────────────────────

# Tried left-to-right; on 429 we parse the retry-after delay and honour it once.
# Only Gemini 2.x+ models support YouTube URL as a native video part via v1beta.
# analyze_youtube() walks this list in order and returns on the first success.
# NOTE(review): presumably ordered by preference/cost — confirm before reordering.
_GEMINI_MODELS = [
    "gemini-2.0-flash",
    "gemini-2.0-flash-lite",
    "gemini-2.5-flash-lite",
    "gemini-2.5-flash",
]
222
+
223
+
224
+ def _parse_retry_after(err_str: str) -> float:
225
+ """Extract retry delay seconds from a Gemini 429 error string."""
226
+ m = re.search(r"retryDelay['\"]?\s*:\s*['\"]?(\d+(?:\.\d+)?)", err_str)
227
+ if m:
228
+ return float(m.group(1))
229
+ m = re.search(r"retry in (\d+(?:\.\d+)?)s", err_str)
230
+ if m:
231
+ return float(m.group(1))
232
+ return 0.0
233
+
234
+
235
+ # ── Core analysis ──────────────────────────────────────────────────────────────
236
+
237
def analyze_youtube(youtube_url: str) -> dict:
    """
    Analyze a YouTube video using Gemini's native YouTube URL support.

    Tries each model in _GEMINI_MODELS in order; on a 429/quota error it
    waits the parsed retry-after period (capped at 65 s) before falling
    through to the next model. Non-quota errors also fall through.

    Args:
        youtube_url: Full YouTube watch/shorts/short-link URL.

    Returns:
        dict with keys:
            raw_output (str)       — Gemini's structured report text
            channel (str)          — channel name via multi-stage extraction
            thumbnail (str)        — direct thumbnail URL ("" if unknown)
            post_date (str | None) — 'YYYY-MM-DD' upload date if scraped
            error (str | None)     — last error message, None on success

        Fix: every return path now includes "post_date", so callers may
        index result["post_date"] without a KeyError (the original early
        error returns omitted it).
    """
    import time

    gemini_key = _load_gemini_key()
    if not gemini_key:
        return {"raw_output": "", "channel": "", "thumbnail": "", "post_date": None,
                "error": "GEMINI_API_KEY not found. Add it to backend/.api_keys"}

    try:
        from google import genai
        from google.genai import types as gtypes
    except ImportError:
        return {"raw_output": "", "channel": "", "thumbnail": "", "post_date": None,
                "error": "google-genai not installed. Run: pip install google-genai"}

    client = genai.Client(api_key=gemini_key)
    last_error = ""

    for model in _GEMINI_MODELS:
        print(f" 🎬 Trying {model} for YouTube analysis...")
        try:
            response = client.models.generate_content(
                model=model,
                contents=[
                    # Gemini fetches and watches the video server-side;
                    # nothing is downloaded locally.
                    gtypes.Part.from_uri(
                        file_uri=youtube_url,
                        mime_type="video/youtube",
                    ),
                    YOUTUBE_PROMPT,
                ],
                config=gtypes.GenerateContentConfig(
                    max_output_tokens=1500,
                    temperature=0.7,
                ),
            )
            # response.text can be None (e.g. blocked or empty candidates) —
            # guard instead of letting .strip() raise AttributeError.
            raw = (response.text or "").strip()
            thumbnail = get_youtube_thumbnail(youtube_url)
            post_date = get_youtube_upload_date(youtube_url)
            # Robust multi-stage channel extraction (oEmbed → scrape → yt-dlp → AI)
            channel = get_youtube_channel_name(youtube_url, ai_raw=raw)
            info = f" | channel: {channel}" if channel else ""
            dp = f" | date: {post_date}" if post_date else ""
            print(f" ✓ Gemini YouTube analysis complete (model: {model}){info}{dp}")
            return {"raw_output": raw, "channel": channel, "thumbnail": thumbnail,
                    "post_date": post_date, "error": None}

        except Exception as e:
            err = str(e)
            last_error = err
            if "429" in err or "RESOURCE_EXHAUSTED" in err:
                # Honour the server-suggested delay once, capped at 65 s.
                wait = min(_parse_retry_after(err), 65.0)
                if wait > 0:
                    print(f" ⏳ {model} rate-limited — waiting {wait:.0f}s before next model...")
                    time.sleep(wait)
                else:
                    print(f" ⚠️ {model} quota exhausted — trying next model")
            else:
                # Non-quota error (e.g. 404 model not found) — try next model
                print(f" ✗ {model} failed: {err[:120]}")

    print(" ✗ All Gemini models exhausted for YouTube analysis")
    return {"raw_output": "", "channel": "", "thumbnail": "", "post_date": None, "error": last_error}
307
+
308
+
309
+ # ── CLI ────────────────────────────────────────────────────────────────────────
310
+
311
if __name__ == "__main__":
    # Tiny CLI: take the URL from argv[1], or prompt interactively.
    import sys

    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = input("YouTube URL: ").strip()
    if url:
        result = analyze_youtube(url)
        if result["error"]:
            print(f"\n✗ Error: {result['error']}")
        else:
            print("\n" + "=" * 60)
            print(result["raw_output"])