superbrain-server 1.0.2-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/bin/superbrain.js +196 -0
  2. package/package.json +23 -0
  3. package/payload/.dockerignore +45 -0
  4. package/payload/.env.example +58 -0
  5. package/payload/Dockerfile +73 -0
  6. package/payload/analyzers/__init__.py +0 -0
  7. package/payload/analyzers/audio_transcribe.py +225 -0
  8. package/payload/analyzers/caption.py +244 -0
  9. package/payload/analyzers/music_identifier.py +346 -0
  10. package/payload/analyzers/text_analyzer.py +117 -0
  11. package/payload/analyzers/visual_analyze.py +218 -0
  12. package/payload/analyzers/webpage_analyzer.py +789 -0
  13. package/payload/analyzers/youtube_analyzer.py +320 -0
  14. package/payload/api.py +1676 -0
  15. package/payload/config/.api_keys.example +22 -0
  16. package/payload/config/model_rankings.json +492 -0
  17. package/payload/config/openrouter_free_models.json +1364 -0
  18. package/payload/config/whisper_model.txt +1 -0
  19. package/payload/config_settings.py +185 -0
  20. package/payload/core/__init__.py +0 -0
  21. package/payload/core/category_manager.py +219 -0
  22. package/payload/core/database.py +811 -0
  23. package/payload/core/link_checker.py +300 -0
  24. package/payload/core/model_router.py +1253 -0
  25. package/payload/docker-compose.yml +120 -0
  26. package/payload/instagram/__init__.py +0 -0
  27. package/payload/instagram/instagram_downloader.py +253 -0
  28. package/payload/instagram/instagram_login.py +190 -0
  29. package/payload/main.py +912 -0
  30. package/payload/requirements.txt +39 -0
  31. package/payload/reset.py +311 -0
  32. package/payload/start-docker-prod.sh +125 -0
  33. package/payload/start-docker.sh +56 -0
  34. package/payload/start.py +1302 -0
  35. package/payload/static/favicon.ico +0 -0
  36. package/payload/stop-docker.sh +16 -0
  37. package/payload/utils/__init__.py +0 -0
  38. package/payload/utils/db_stats.py +108 -0
  39. package/payload/utils/manage_token.py +91 -0
@@ -0,0 +1,912 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ SuperBrain - Instagram Content Analyzer
4
+ Main orchestrator that coordinates all analysis scripts
5
+ With parallel processing for better performance
6
+ """
7
+
8
+ import sys
9
+ import os
10
+ from pathlib import Path
11
+ import subprocess
12
+ import json
13
+ import re
14
+ import shutil
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ import time
17
+
18
+ # Import local modules
19
+ from core.link_checker import validate_link
20
+ from core.database import get_db
21
+ from analyzers.youtube_analyzer import analyze_youtube
22
+ from analyzers.webpage_analyzer import analyze_webpage
23
+
24
# Sentinel returned by run_*_analysis when the item has been queued for retry.
# main() translates this sentinel into exit code 2 so the calling API layer
# can distinguish "queued for retry" from a hard failure (exit code 1).
RETRY_SENTINEL = "__ENQUEUED_FOR_RETRY__"
26
+
27
+ # Keywords that indicate a retryable quota / rate-limit failure
28
+ _QUOTA_KEYWORDS = (
29
+ "resource_exhausted", "quota", "rate_limit", "rate limit",
30
+ "429", "too many requests", "daily limit", "free tier",
31
+ "insufficient_quota", "ratelimit", "all gemini models exhausted",
32
+ )
33
+
34
+ def _is_quota_error(err: str) -> bool:
35
+ """Return True when an error string looks like a recoverable quota / rate-limit."""
36
+ low = err.lower()
37
+ return any(k in low for k in _QUOTA_KEYWORDS)
38
+
39
def print_header(title):
    """Print section header"""
    bar = "=" * 80
    print(f"\n{bar}")
    print(f" {title}")
    print(f"{bar}\n")
44
+
45
def print_section(title):
    """Print subsection"""
    rule = '─' * 80
    print(f"\n{rule}")
    print(f" {title}")
    print(f"{rule}\n")
50
+
51
def generate_final_summary(results, instagram_url):
    """Generate comprehensive summary using all analysis results via ModelRouter.

    Args:
        results: dict with keys 'visual', 'audio_transcription',
            'music_identification', 'text'; each value is a list of dicts
            carrying an 'output' field (raw analyzer stdout).
        instagram_url: the post URL, embedded verbatim in the prompt.

    Returns:
        str: the LLM-generated structured report, or an error message string
        when generation fails (this function never raises).
    """
    from core.model_router import get_router

    # Collect all analysis data
    visual_summary = ""
    audio_summary = ""
    music_info = ""
    text_summary = ""

    # Extract visual analysis (truncated to 600 chars per item)
    if results['visual']:
        visual_summary = "VISUAL ANALYSIS:\n"
        for item in results['visual']:
            output = item['output']
            clean = _clean_visual(output)
            if clean:
                visual_summary += f"- {clean[:600]}\n"

    # Extract audio transcription.
    # NOTE(review): the language is parsed by naive string splitting on the
    # analyzer's 'Detected Language:' marker — brittle if that format changes.
    if results['audio_transcription']:
        audio_summary = "AUDIO TRANSCRIPTION:\n"
        for item in results['audio_transcription']:
            output = item['output']
            clean = _clean_audio(output)
            lang = output.split('Detected Language:')[1].split('(')[0].strip() if 'Detected Language:' in output else 'Unknown'
            audio_summary += f"- Language: {lang}\n"
            if clean:
                audio_summary += f"- Content: {clean[:400]}\n"

    # Extract music identification (markers come from music_identifier.py output)
    if results['music_identification']:
        music_info = "MUSIC:\n"
        for item in results['music_identification']:
            output = item['output']
            if '🎵 Song:' in output:
                song = output.split('🎵 Song:')[1].split('\n')[0].strip()
                artist = output.split('👤 Artist:')[1].split('\n')[0].strip() if '👤 Artist:' in output else 'Unknown'
                music_info += f"- {song} by {artist}\n"
            elif 'No match found' in output:
                music_info += "- No music identified (likely voiceover/no background music)\n"

    # Extract text analysis
    if results['text']:
        text_summary = "TEXT ANALYSIS:\n"
        for item in results['text']:
            clean = _clean_text(item['output'])
            if clean:
                text_summary += f"{clean[:600]}\n"

    # Combine all information
    combined_info = f"""
INSTAGRAM POST: {instagram_url}

{visual_summary}

{audio_summary}

{music_info}

{text_summary}
"""

    # Generate structured summary using LLM. The EXACT section layout below
    # (📌/📝/🏷️/🎵/📂 headings) is what parse_summary() later re-parses.
    prompt = f"""Based on the following analysis of an Instagram post, create a comprehensive structured summary.

{combined_info}

Generate a report in this EXACT format:

📌 TITLE:
[Create a clear, descriptive title]

📝 SUMMARY:
[Comprehensive 3-5 sentence summary including:
- Main content/theme
- Key information (locations, products, tips, itineraries, lists, tools, links, etc.)
- Important highlights
- Any actionable items or recommendations]

🏷️ TAGS:
[Generate 8-12 relevant hashtags/keywords]

🎵 MUSIC:
[Music/song name if found, or "No background music" or "Voiceover only"]

📂 CATEGORY:
[Choose ONE from: product, places, recipe, software, book, tv shows, workout, film, event]

Be specific, concise, and actionable. Focus on useful information."""

    try:
        print("🤖 Generating comprehensive summary with AI...")
        router = get_router()
        summary = router.generate_text(prompt)

        if not summary:
            summary = "Unable to generate comprehensive summary."

        return summary

    except Exception as e:
        # Deliberate best-effort: the caller still prints/saves whatever we return.
        return f"Error generating summary: {e}\n\nRaw data available in individual analysis sections above."
154
+
155
def _parse_field(text: str, emoji: str, label: str) -> str:
    """
    Extract a field value from AI output — handles all common AI formatting variations:
        📌 TITLE: value
        📌 **TITLE:** value
        📌 **TITLE** \\n value
    Also handles emoji variation selectors (U+FE0F) that may or may not be present.
    Returns the first non-empty content after the label, stopped at the next
    section emoji line or blank-line boundary. Returns "" when the label is
    not found under either the emoji or the U+FFFD fallback pattern.
    """
    # Strip variation selector from emoji so pattern works whether or not it's present
    emoji_base = emoji.replace('\ufe0f', '')
    pattern = re.compile(
        rf'{re.escape(emoji_base)}\ufe0f?\s*\*{{0,2}}{re.escape(label)}\*{{0,2}}:?\s*',
        re.IGNORECASE
    )
    m = pattern.search(text)
    # Fallback: model may output U+FFFD instead of the emoji (encoding mangling)
    if not m:
        pattern_fb = re.compile(
            rf'\ufffd\s*\*{{0,2}}{re.escape(label)}\*{{0,2}}:?\s*',
            re.IGNORECASE
        )
        m = pattern_fb.search(text)
    if not m:
        return ""
    after = text[m.end():]
    # Collect until next section (identified by an emoji at line start) or 2 blank lines
    lines = after.split('\n')
    content_lines = []
    for line in lines:
        stripped = line.strip()
        # Stop at next section header — match ANY emoji/symbol at line start,
        # OR a U+FFFD replacement char (model sometimes mangles lower-plane emojis).
        # The `content_lines` guard means we never stop before collecting at
        # least one (possibly empty) line.
        if content_lines and re.match(
            r'^[\U0001F000-\U0001FFFF\U00002600-\U000027BF\U00002B00-\U00002BFF\uFFFD]',
            stripped,
        ):
            break
        # Skip pure markdown bold/italic wrapper lines but keep the text:
        # inner sub strips *...* at line edges, outer sub unwraps inline bold.
        content_lines.append(re.sub(r'\*{1,3}([^*]+?)\*{1,3}', r'\1',
                                    re.sub(r'^\*{1,3}|\*{1,3}$', '', stripped)))
    # Remove leading/trailing blank lines and join into a single line
    result = ' '.join(l for l in content_lines if l).strip()
    # Strip surrounding markdown bold (**) and quotes
    result = re.sub(r'^\*+|\*+$', '', result).strip('"').strip()
    return result
202
+
203
+
204
def parse_summary(summary_text):
    """
    Parse AI-generated summary to extract structured data.
    Robust against markdown bold, missing colons, varied whitespace.

    Args:
        summary_text: raw LLM report in the 📌/📝/🏷️/🎵/📂 format emitted by
            generate_final_summary (or the YouTube/webpage analyzers).

    Returns:
        tuple: (title, summary, tags, music, category) — title/summary/music
        are strings (possibly empty), tags is a list of strings, category is
        always a valid category name (auto-detected as a fallback).
    """
    title = ""
    summary = ""
    tags = []
    music = ""
    category = ""

    try:
        title = _parse_field(summary_text, "📌", "TITLE")
        summary = _parse_field(summary_text, "📝", "SUMMARY")
        music = _parse_field(summary_text, "🎵", "MUSIC")

        # Tags: grab block then split on whitespace/commas
        raw_tags = _parse_field(summary_text, "🏷️", "TAGS")
        if not raw_tags:  # try without variation selector
            raw_tags = _parse_field(summary_text, "🏷", "TAGS")
        if not raw_tags:  # model sometimes omits emoji entirely
            _tm = re.search(r'(?:^|\n)\s*\*{0,2}TAGS\*{0,2}:?\s*([^\n]+)', summary_text, re.IGNORECASE)
            if _tm:
                raw_tags = _tm.group(1).strip()
        if raw_tags:
            tags = [t.strip() for t in re.split(r'[\s,]+', raw_tags) if t.strip()]

        # Category: grab first word/phrase that matches a known category
        raw_cat = _parse_field(summary_text, "📂", "CATEGORY").lower()
        # Strip markdown bold leftovers and pick first line
        raw_cat = re.sub(r'\*+', '', raw_cat).strip()
        raw_cat = raw_cat.split('\n')[0].strip()
        category = raw_cat

    except Exception as e:
        # Best-effort parse: partial results above are kept, category is
        # repaired by the fallback below.
        print(f"⚠️ Error parsing summary: {e}")

    # Fallback: Auto-detect category if empty or unrecognised
    valid_categories = {'product', 'places', 'recipe', 'software', 'book',
                        'tv shows', 'workout', 'film', 'event', 'other'}
    if not category or category not in valid_categories:
        category = auto_detect_category(summary_text, title, summary, tags)

    return title, summary, tags, music, category
251
+
252
def auto_detect_category(summary_text, title, summary, tags):
    """
    Auto-detect category based on content keywords.

    All four inputs are lower-cased and concatenated into one haystack;
    the category whose keyword list scores the most substring hits wins.

    Returns:
        str: Detected category, or "other" when nothing matches.
    """
    haystack = f"{title} {summary} {' '.join(tags)} {summary_text}".lower()

    # Category keywords
    category_keywords = {
        'product': ['camera', 'device', 'gadget', 'tech', 'phone', 'laptop', 'review', 'unbox', 'product', 'dji', 'osmo', 'action cam'],
        'places': ['travel', 'trip', 'visit', 'destination', 'village', 'city', 'mountain', 'beach', 'hotel', 'itinerary', 'sikkim', 'location'],
        'recipe': ['recipe', 'cooking', 'food', 'dish', 'ingredients', 'cook', 'bake', 'meal', 'cuisine'],
        'software': ['app', 'software', 'code', 'programming', 'developer', 'api', 'python', 'javascript'],
        'book': ['book', 'novel', 'author', 'read', 'literature', 'story', 'chapter'],
        'workout': ['workout', 'fitness', 'exercise', 'gym', 'training', 'muscle', 'cardio', 'yoga'],
        'film': ['movie', 'film', 'cinema', 'actor', 'actress', 'director', 'trailer', 'premiere'],
        'tv shows': ['series', 'episode', 'season', 'show', 'tv show', 'streaming', 'netflix'],
        'event': ['event', 'concert', 'festival', 'conference', 'meetup', 'workshop', 'seminar']
    }

    # One pass: count keyword hits per category
    scores = {
        name: sum(1 for kw in keywords if kw in haystack)
        for name, keywords in category_keywords.items()
    }

    # Winner takes it — unless nobody scored at all
    winner = max(scores, key=scores.get)
    return winner if scores[winner] > 0 else "other"
287
+
288
def run_script(script_name, args):
    """Run a Python script and return success status.

    Returns:
        (success, stdout, stderr): success is True iff the child exited
        with status 0; launch failures yield (False, "", error message).
    """
    try:
        # Use sys.executable to ensure same Python interpreter (virtual env)
        script_path = os.path.join(os.path.dirname(__file__), script_name)
        command = [sys.executable, script_path, *args]

        # Force UTF-8 encoding for subprocess output on every platform
        child_env = os.environ.copy()
        child_env['PYTHONIOENCODING'] = 'utf-8'

        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',
            env=child_env,
        )
        return proc.returncode == 0, proc.stdout, proc.stderr
    except Exception as exc:
        return False, "", str(exc)
302
+
303
def run_analysis_task(task_name, script_name, file_path, task_type="light"):
    """
    Run a single analysis task (for parallel execution).

    Args:
        task_name: Display name (e.g., "Visual Analysis")
        script_name: Python script to run
        file_path: Path to file to analyze
        task_type: "heavy" or "light" (for scheduling)

    Returns:
        dict with task results: task_name, file, success, output, error,
        elapsed seconds, and the task type.
    """
    display_name = Path(file_path).name
    started_at = time.time()
    print(f" ⚡ Starting {task_name}: {display_name}")

    ok, captured_out, captured_err = run_script(script_name, [str(file_path)])
    duration = time.time() - started_at

    outcome = {
        'task_name': task_name,
        'file': display_name,
        'success': ok,
        'output': captured_out if ok else '',
        'error': '' if ok else captured_err,
        'elapsed': duration,
        'type': task_type,
    }

    if ok:
        print(f" ✓ Completed {task_name}: {display_name} ({duration:.1f}s)")
    else:
        print(f" ✗ Failed {task_name}: {display_name}")

    return outcome
339
+
340
+ def _extract_section(output: str, marker: str) -> str:
341
+ """Extract the content after a section marker, stopping at the next divider."""
342
+ if marker not in output:
343
+ return output[:2000]
344
+ after = output.split(marker, 1)[1]
345
+ lines = after.split("\n")
346
+ content_lines = []
347
+ started = False
348
+ for line in lines:
349
+ stripped = line.strip('-').strip('=').strip('─').strip()
350
+ if not started:
351
+ # Skip blank lines and pure divider lines
352
+ if stripped:
353
+ started = True
354
+ content_lines.append(line)
355
+ else:
356
+ # Stop at a divider line (5+ repeated chars)
357
+ raw = line.strip()
358
+ if (raw.startswith('─' * 5) or raw.startswith('-' * 5) or
359
+ raw.startswith('=' * 5) or raw.startswith('*' * 5)):
360
+ break
361
+ content_lines.append(line)
362
+ return "\n".join(content_lines).strip()
363
+
364
+
365
def _clean_visual(output: str) -> str:
    """Return the analysis body from visual_analyze.py stdout."""
    return _extract_section(output, "📝 ANALYSIS:")
367
+
368
+
369
def _clean_audio(output: str) -> str:
    """Return the transcript body from audio_transcribe.py stdout."""
    return _extract_section(output, "📝 TRANSCRIBED TEXT:")
371
+
372
+
373
def _clean_text(output: str) -> str:
    """Return the analysis body from text_analyzer.py stdout."""
    return _extract_section(output, "🔍 ANALYSIS:")
375
+
376
+
377
def cleanup_temp_folder(folder_path):
    """Delete temp folder after successful database save.

    Returns:
        bool: True when the folder is gone (deleted now, or already absent —
        an already-missing folder counts as success so the caller does not
        print a spurious "not deleted" warning); False when deletion failed.
    """
    try:
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
            print(f"🗑️ Cleaned up temp folder: {Path(folder_path).name}")
        return True
    except Exception as e:
        # Best-effort cleanup: a locked/undeletable folder must not abort
        # the pipeline — the analysis has already been saved at this point.
        print(f"⚠ Warning: Could not delete temp folder: {e}")
        return False
387
+
388
+
389
+ def _jpg_to_thumbnail(jpg_path) -> str:
390
+ """
391
+ Read a JPEG file and return it as a base64-encoded data URI.
392
+ Downsizes to max 480 px wide using Pillow if available; otherwise
393
+ raw base64 is used (larger but still works as <img src=...>).
394
+ """
395
+ try:
396
+ from PIL import Image
397
+ import io
398
+ img = Image.open(jpg_path)
399
+ img.thumbnail((1080, 1080), Image.LANCZOS)
400
+ buf = io.BytesIO()
401
+ img.save(buf, format="JPEG", quality=90)
402
+ data = buf.getvalue()
403
+ except Exception:
404
+ # Pillow not available or failed — use raw bytes
405
+ with open(jpg_path, "rb") as f:
406
+ data = f.read()
407
+ import base64
408
+ encoded = base64.b64encode(data).decode()
409
+ return f"data:image/jpeg;base64,{encoded}"
410
+
411
+ # ─────────────────────────────────────────────────────────────────────────────
412
+ # Non-Instagram analysis flows (no download/pipeline needed)
413
+ # ─────────────────────────────────────────────────────────────────────────────
414
+
415
# Canonical emoji prefix for each section label that the Gemini YouTube
# prompt is expected to emit; used by _sanitise_yt_raw to re-attach the
# correct emoji when the model mangles or omits it.
_EMOJI_MAP = {
    "TITLE": "📌",
    "CHANNEL": "📢",
    "DATE": "📅",
    "SUMMARY": "📝",
    "TAGS": "🏷️",
    "MUSIC": "🎵",
    "CATEGORY": "📂",
}
424
+
425
def _sanitise_yt_raw(raw: str, post_date: str | None) -> str:
    """
    Normalise YouTube Gemini raw output:
      • Replace U+FFFD replacement chars that precede known section labels
        with the correct emoji (model sometimes outputs \ufffd instead of
        📢 or 📝 due to encoding/font issues in the API layer).
      • Re-attach 🏷️ when the model omitted it before TAGS:.
      • Substitute the real upload date for "Unknown" in the DATE field.
    """
    lines = raw.splitlines()
    fixed = []
    for line in lines:
        stripped = line.strip()
        # Replace \ufffd or missing emoji before a known section label.
        # The first alternative also matches a bare uppercase 'LABEL:' line
        # (empty prefix), so the elif below only catches case-variants of
        # 'TAGS:' that fail the [A-Z]+ match here.
        m = re.match(r'^(\ufffd\s*|)(\*{0,2})([A-Z]+)(\*{0,2}):(.*)$', stripped)
        if m:
            prefix, pre_bold, label, post_bold, rest = m.groups()
            if label in _EMOJI_MAP:
                # NOTE: rebuilding from `stripped` drops the line's original
                # leading whitespace — acceptable for section-header lines.
                line = f"{_EMOJI_MAP[label]} {pre_bold}{label}{post_bold}:{rest}"
        # Insert missing 🏷️ before bare 'TAGS:' (no emoji at all)
        elif re.match(r'^TAGS\s*:', stripped, re.IGNORECASE):
            line = re.sub(r'^TAGS\s*:', '🏷️ TAGS:', stripped, flags=re.IGNORECASE)
        fixed.append(line)

    result = "\n".join(fixed)

    # Substitute actual upload date for 'Unknown'
    if post_date:
        result = re.sub(
            r'(📅\s*DATE\s*:)\s*Unknown',
            rf'\1 {post_date}',
            result,
            flags=re.IGNORECASE,
        )

    return result
461
+
462
def run_youtube_analysis(url: str, shortcode: str, db):
    """Single-call YouTube analysis via Gemini native video support.

    Args:
        url: canonical YouTube URL.
        shortcode: stable ID used as the database/cache key.
        db: database handle providing queue_for_retry() and save_analysis().

    Returns:
        True on success, RETRY_SENTINEL when queued for retry on a quota
        error, None on any other failure (caller exits non-zero).
    """
    print_section("🎬 YouTube Video Analysis")
    print(f"📹 Analyzing: {url}")
    print(" (Gemini will access the video directly — no download required)")

    result = analyze_youtube(url)

    if result.get('error'):
        err = result['error']
        # Quota failures are transient — queue and signal the retry path
        if _is_quota_error(err):
            db.queue_for_retry(shortcode, url, 'youtube', 'gemini_quota', retry_hours=24)
            print("⏰ All Gemini models quota-exhausted — queued for retry in 24 hours.")
            return RETRY_SENTINEL
        print(f"❌ YouTube analysis failed: {err}")
        return

    raw = result.get('raw_output', '')
    if not raw:
        print("❌ Received empty response from Gemini.")
        return

    # Clean up encoding artefacts and patch in the real upload date
    raw = _sanitise_yt_raw(raw, result.get('post_date'))

    print_section("📋 RAW GEMINI OUTPUT")
    print(raw[:2000])

    title, summary_text, tags, music, category = parse_summary(raw)

    # Use upload date scraped directly from YouTube page (always accurate)
    yt_post_date = result.get('post_date')

    print_section("💾 Saving to Database")
    db.save_analysis(
        shortcode=shortcode,
        url=url,
        username=result.get('channel', ''),
        title=title,
        summary=summary_text,
        tags=tags,
        music=music,
        category=category,
        visual_analysis='',
        audio_transcription='',
        text_analysis=raw,  # full sanitised report kept for later inspection
        content_type='youtube',
        thumbnail=result.get('thumbnail', ''),
        post_date=yt_post_date,
    )
    print(f"✓ YouTube analysis saved ({shortcode})")
    print_header("✅ Done — YouTube Analysis Complete")
    return True
515
+
516
+
517
def run_webpage_analysis(url: str, shortcode: str, db):
    """Fetch web page text and run AI text analysis.

    Args:
        url: the page URL.
        shortcode: stable ID used as the database/cache key.
        db: database handle providing queue_for_retry() and save_analysis().

    Returns:
        True on success, RETRY_SENTINEL when queued for retry (quota error,
        or empty AI response retried after 1 hour), None on hard failure.
    """
    print_section("🌐 Web Page Analysis")
    print(f"🔗 Analyzing: {url}")

    result = analyze_webpage(url)

    if result.get('error'):
        err = result['error']
        # Quota failures are transient — queue and signal the retry path
        if _is_quota_error(err):
            db.queue_for_retry(shortcode, url, 'webpage', 'ai_quota', retry_hours=24)
            print("⏰ AI models quota-exhausted — queued for retry in 24 hours.")
            return RETRY_SENTINEL
        print(f"❌ Web page analysis failed: {err}")
        return

    raw = result.get('raw_output', '')
    page_title = result.get('page_title', '')
    if not raw:
        # Empty response is usually transient — short retry window
        db.queue_for_retry(shortcode, url, 'webpage', 'ai_empty_response', retry_hours=1)
        print("⏰ Empty AI response — queued for retry in 1 hour.")
        return RETRY_SENTINEL

    print_section("📋 RAW AI OUTPUT")
    print(raw[:2000])

    title, summary_text, tags, music, category = parse_summary(raw)
    # Use on-page title as fallback if AI did not extract one
    if not title and page_title:
        title = page_title

    print_section("💾 Saving to Database")
    db.save_analysis(
        shortcode=shortcode,
        url=url,
        username=result.get('author', ''),
        title=title,
        summary=summary_text,
        tags=tags,
        music=music,
        category=category,
        visual_analysis='',
        audio_transcription='',
        text_analysis=raw,  # full raw report kept for later inspection
        content_type='webpage',
        thumbnail=result.get('thumbnail', ''),
        post_date=result.get('post_date'),
    )
    print(f"✓ Web page analysis saved ({shortcode})")
    print_header("✅ Done — Web Page Analysis Complete")
    return True
568
+
569
+
570
+ # ─────────────────────────────────────────────────────────────────────────────
571
+
572
def main():
    """Main orchestrator.

    Flow: validate URL → cache lookup → dispatch (YouTube / webpage) or run
    the Instagram pipeline: download, locate files, run heavy analyses
    sequentially and light ones in parallel, summarise via LLM, save to the
    database, then clean up the temp folder.

    Exit codes: 0 success/cached, 1 hard failure, 2 queued for retry.
    """

    print_header("🧠 SUPERBRAIN - Content Analyzer")

    # Step 1: Get URL (argv wins; otherwise interactive prompt)
    if len(sys.argv) > 1:
        instagram_url = sys.argv[1]
        print(f"📎 Link: {instagram_url}")
    else:
        instagram_url = input("📎 Enter URL (Instagram / YouTube / web page): ").strip()

    if not instagram_url:
        print("❌ No link provided!")
        sys.exit(1)

    # Step 2: Validate link & detect type
    print_section("🔍 Step 1: Validating Link")

    validation = validate_link(instagram_url)

    if not validation['valid']:
        print(f"❌ Invalid link!")
        print(f" Error: {validation['error']}")
        sys.exit(1)

    content_type = validation['content_type']
    shortcode = validation['shortcode']
    # Normalise URL (e.g. YouTube canonical form)
    instagram_url = validation['url']

    print(f"✓ Valid {content_type} link")
    print(f" ID: {shortcode}")

    # Step 2.5: Check cache in database
    print_section("🔍 Step 2: Checking Cache")

    db = get_db()
    cached = db.check_cache(shortcode) if db.is_connected() else None

    if cached:
        print(f"✓ Found in cache! (Analyzed on {cached.get('analyzed_at', 'unknown')})")
        print(f" Returning cached result...\n")

        # Display cached summary
        print_section("📋 CACHED RESULT")
        print(f"📌 TITLE:\n{cached.get('title', 'N/A')}\n")
        print(f"📝 SUMMARY:\n{cached.get('summary', 'N/A')}\n")
        print(f"🏷️ TAGS:\n{', '.join(cached.get('tags', []))}\n")
        print(f"🎵 MUSIC:\n{cached.get('music', 'N/A')}\n")
        print(f"📂 CATEGORY:\n{cached.get('category', 'N/A')}\n")
        print("=" * 80)
        print("✅ Retrieved from cache (no AI processing needed)")
        print("=" * 80 + "\n")
        return
    else:
        print("⚡ Not in cache - will analyze and save")

    # ── Dispatch non-Instagram types ──────────────────────────────────────────
    if content_type == 'youtube':
        result = run_youtube_analysis(instagram_url, shortcode, db)
        if result == RETRY_SENTINEL:
            sys.exit(2)
        if result is None:
            # Analysis failed — exit non-zero so api.py returns a proper error
            sys.exit(1)
        return
    elif content_type == 'webpage':
        result = run_webpage_analysis(instagram_url, shortcode, db)
        if result == RETRY_SENTINEL:
            sys.exit(2)
        if result is None:
            # Analysis failed — exit non-zero so api.py returns a proper error
            sys.exit(1)
        return
    # Instagram falls through to the existing pipeline below

    # Step 3: Download content
    # NOTE(review): step numbering in the printed banners below is
    # inconsistent ("Step 3" appears twice) — cosmetic only.
    print_section("📥 Step 3: Downloading Content")

    print("Running Instagram downloader...")

    try:
        # Pass URL via stdin simulation
        # NOTE(review): contextlib/StringIO are imported but never used —
        # leftovers from an earlier stdin-redirect approach.
        import contextlib
        from io import StringIO

        # Import the downloader function
        from instagram.instagram_downloader import download_instagram_content, RetryableDownloadError

        download_result = download_instagram_content(instagram_url)

        if download_result is None:
            print("❌ Download failed!")
            sys.exit(1)

        download_folder = download_result
        print(f"\n✓ Content downloaded to: {download_folder}")

    except RetryableDownloadError as e:
        msg = str(e)
        print(f"⏰ Instagram download blocked — {msg}")
        # Login-required is NOT retryable without credentials — hard fail
        if "login required" in msg.lower():
            print("❌ Instagram now requires login for this post.")
            print(" Add INSTAGRAM_USERNAME and INSTAGRAM_PASSWORD in setup (Step 3) and retry.")
            sys.exit(1)
        db.queue_for_retry(shortcode, instagram_url, 'instagram', 'instagram_rate_limit', retry_hours=24)
        print("⏰ Queued for retry in 24 hours.")
        sys.exit(2)

    except Exception as e:
        print(f"❌ Download error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Step 4: Find downloaded files
    print_section("📂 Step 3: Locating Files")

    folder_path = Path(download_folder)

    if not folder_path.exists():
        print(f"❌ Folder not found: {download_folder}")
        sys.exit(1)

    # Find files by the naming convention the downloader uses
    mp4_files = list(folder_path.glob("*.mp4"))
    mp3_files = list(folder_path.glob("*_audio.mp3"))
    jpg_files = list(folder_path.glob("*.jpg"))
    info_files = list(folder_path.glob("info.txt"))

    print(f"📹 Videos: {len(mp4_files)}")
    print(f"🎵 Audio files: {len(mp3_files)}")
    print(f"🖼️ Images: {len(jpg_files)}")
    print(f"📄 Info files: {len(info_files)}")

    # Step 5: Run analyses with SMART PARALLEL PROCESSING
    print_section("🚀 Step 4: Running Parallel Analysis")
    print("Strategy: Heavy tasks sequential, light tasks parallel")
    print("Heavy: Visual (video processing), Audio (Whisper)")
    print("Light: Music (Shazam), Text (metadata)")

    results = {
        'visual': [],
        'audio_transcription': [],
        'music_identification': [],
        'text': []
    }

    # NOTE(review): all_tasks is never used below — candidate for removal
    all_tasks = []
    analysis_start = time.time()

    # === PHASE 1: Visual Analysis (HEAVY) - Run alone ===
    if mp4_files or jpg_files:
        print(f"\n🎬 Phase 1: Visual Analysis (Heavy Task)")

        for video in mp4_files:
            result = run_analysis_task("Visual", 'analyzers/visual_analyze.py', str(video), "heavy")
            if result['success']:
                results['visual'].append({
                    'file': result['file'],
                    'type': 'video',
                    'output': result['output']
                })
                print(_clean_visual(result['output'])[:600] + "\n")

        for img in jpg_files:
            result = run_analysis_task("Visual", 'analyzers/visual_analyze.py', str(img), "heavy")
            if result['success']:
                results['visual'].append({
                    'file': result['file'],
                    'type': 'image',
                    'output': result['output']
                })
                print(_clean_visual(result['output'])[:600] + "\n")

    # === PHASE 2: Audio Transcription (HEAVY) - Run alone ===
    if mp3_files:
        print(f"\n🎙️ Phase 2: Audio Transcription (Heavy Task)")

        for audio in mp3_files:
            result = run_analysis_task("Audio", 'analyzers/audio_transcribe.py', str(audio), "heavy")
            if result['success']:
                results['audio_transcription'].append({
                    'file': result['file'],
                    'output': result['output']
                })
                print(_clean_audio(result['output'])[:600] + "\n")

    # === PHASE 3: Light tasks in PARALLEL ===
    print(f"\n⚡ Phase 3: Light Tasks (Parallel Execution)")

    light_tasks = []

    # Add music identification tasks
    for audio in mp3_files:
        light_tasks.append(('music', 'analyzers/music_identifier.py', str(audio)))

    # Add text analysis tasks
    for info_file in info_files:
        light_tasks.append(('text', 'analyzers/text_analyzer.py', str(info_file)))

    if light_tasks:
        # Run light tasks in parallel (max 3 concurrent)
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = {}

            for task_type, script, file_path in light_tasks:
                task_name = "Music ID" if task_type == 'music' else "Text"
                future = executor.submit(run_analysis_task, task_name, script, file_path, "light")
                futures[future] = task_type

            # Collect results as they complete
            for future in as_completed(futures):
                task_type = futures[future]
                result = future.result()

                if result['success']:
                    if task_type == 'music':
                        results['music_identification'].append({
                            'file': result['file'],
                            'output': result['output']
                        })
                    else:  # text
                        results['text'].append({
                            'file': result['file'],
                            'output': result['output']
                        })

                    # Failed tasks print nothing here (run_analysis_task
                    # already printed the ✗ line)
                    if task_type == 'music':
                        print(result['output'][:400] + "\n")
                    else:
                        print(_clean_text(result['output'])[:600] + "\n")

    analysis_elapsed = time.time() - analysis_start
    print(f"\n⏱️ Total Analysis Time: {analysis_elapsed:.1f}s")

    # Final comprehensive summary
    print_header("✅ GENERATING COMPREHENSIVE SUMMARY")

    final_summary = generate_final_summary(results, instagram_url)

    print_section("📋 FINAL REPORT")
    print(final_summary)

    # Extract structured data from summary for database
    title, summary_text, tags, music, category = parse_summary(final_summary)

    # Get additional metadata from info.txt if available
    username = ""
    likes = 0
    post_date = None

    if info_files:
        try:
            with open(info_files[0], 'r', encoding='utf-8') as f:
                content = f.read()
                username_match = re.search(r'Username: @(\S+)', content)
                likes_match = re.search(r'Likes: (\d+)', content)
                date_match = re.search(r'Date: ([\d\-: ]+)', content)

                if username_match:
                    username = username_match.group(1)
                if likes_match:
                    likes = int(likes_match.group(1))
                if date_match:
                    post_date = date_match.group(1)
        except:
            # Best-effort metadata extraction — defaults above are kept
            pass

    # Save to database
    print_section("💾 Saving to Database")

    # Combine analysis texts — extract clean content (not raw stdout)
    visual_text = "\n\n".join([_clean_visual(r['output']) for r in results['visual']])
    audio_text = "\n\n".join([_clean_audio(r['output']) for r in results['audio_transcription']])
    text_text = "\n\n".join([_clean_text(r['output']) for r in results['text']])

    # — Instagram thumbnail: first downloaded jpg (converted to base64) —
    instagram_thumbnail = ""
    if jpg_files:
        print(f"🖼️ Saving thumbnail from {jpg_files[0].name}...")
        instagram_thumbnail = _jpg_to_thumbnail(jpg_files[0])
    elif mp4_files:
        # Try to extract first frame from the video using cv2
        try:
            import cv2
            import base64
            import tempfile
            cap = cv2.VideoCapture(str(mp4_files[0]))
            ret, frame = cap.read()
            cap.release()
            if ret:
                import cv2 as _cv2
                # Resize to 480 wide keeping aspect
                h, w = frame.shape[:2]
                new_w = 480
                new_h = int(h * new_w / w)
                frame = _cv2.resize(frame, (new_w, new_h))
                ok, buf = _cv2.imencode(".jpg", frame, [_cv2.IMWRITE_JPEG_QUALITY, 75])
                if ok:
                    encoded = base64.b64encode(buf.tobytes()).decode()
                    instagram_thumbnail = f"data:image/jpeg;base64,{encoded}"
                    print("🖼️ Saved first-frame thumbnail from video")
        except Exception as thumb_err:
            # Thumbnail is optional — never fail the save over it
            print(f"⚠️ Could not extract video thumbnail: {thumb_err}")

    db.save_analysis(
        shortcode=shortcode,
        url=instagram_url,
        username=username,
        title=title,
        summary=summary_text,
        tags=tags,
        music=music,
        category=category,
        visual_analysis=visual_text,
        audio_transcription=audio_text,
        text_analysis=text_text,
        likes=likes,
        post_date=post_date,
        thumbnail=instagram_thumbnail,
    )

    print(f"✓ Analysis saved to database")

    # Step 8: Cleanup temp folder
    print_section("🧹 Step 5: Cleanup")

    if cleanup_temp_folder(download_folder):
        print(f"✓ Temp folder deleted successfully")
    else:
        print(f"⚠ Temp folder not deleted (may need manual cleanup)")

    print("\n" + "=" * 80)
    print(f"📊 Analyses Complete: Visual({len(results['visual'])}), Audio({len(results['audio_transcription'])}), Music({len(results['music_identification'])}), Text({len(results['text'])})")
    print(f"⏱️ Total Time: {analysis_elapsed:.1f}s")
    print("=" * 80 + "\n")
910
+
911
# Script entry point — run the orchestrator only when invoked directly.
if __name__ == "__main__":
    main()