superbrain-server 1.0.2-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/superbrain.js +196 -0
- package/package.json +23 -0
- package/payload/.dockerignore +45 -0
- package/payload/.env.example +58 -0
- package/payload/Dockerfile +73 -0
- package/payload/analyzers/__init__.py +0 -0
- package/payload/analyzers/audio_transcribe.py +225 -0
- package/payload/analyzers/caption.py +244 -0
- package/payload/analyzers/music_identifier.py +346 -0
- package/payload/analyzers/text_analyzer.py +117 -0
- package/payload/analyzers/visual_analyze.py +218 -0
- package/payload/analyzers/webpage_analyzer.py +789 -0
- package/payload/analyzers/youtube_analyzer.py +320 -0
- package/payload/api.py +1676 -0
- package/payload/config/.api_keys.example +22 -0
- package/payload/config/model_rankings.json +492 -0
- package/payload/config/openrouter_free_models.json +1364 -0
- package/payload/config/whisper_model.txt +1 -0
- package/payload/config_settings.py +185 -0
- package/payload/core/__init__.py +0 -0
- package/payload/core/category_manager.py +219 -0
- package/payload/core/database.py +811 -0
- package/payload/core/link_checker.py +300 -0
- package/payload/core/model_router.py +1253 -0
- package/payload/docker-compose.yml +120 -0
- package/payload/instagram/__init__.py +0 -0
- package/payload/instagram/instagram_downloader.py +253 -0
- package/payload/instagram/instagram_login.py +190 -0
- package/payload/main.py +912 -0
- package/payload/requirements.txt +39 -0
- package/payload/reset.py +311 -0
- package/payload/start-docker-prod.sh +125 -0
- package/payload/start-docker.sh +56 -0
- package/payload/start.py +1302 -0
- package/payload/static/favicon.ico +0 -0
- package/payload/stop-docker.sh +16 -0
- package/payload/utils/__init__.py +0 -0
- package/payload/utils/db_stats.py +108 -0
- package/payload/utils/manage_token.py +91 -0
package/payload/main.py
ADDED
|
@@ -0,0 +1,912 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SuperBrain - Instagram Content Analyzer
|
|
4
|
+
Main orchestrator that coordinates all analysis scripts
|
|
5
|
+
With parallel processing for better performance
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import subprocess
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import shutil
|
|
15
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
16
|
+
import time
|
|
17
|
+
|
|
18
|
+
# Import local modules
|
|
19
|
+
from core.link_checker import validate_link
|
|
20
|
+
from core.database import get_db
|
|
21
|
+
from analyzers.youtube_analyzer import analyze_youtube
|
|
22
|
+
from analyzers.webpage_analyzer import analyze_webpage
|
|
23
|
+
|
|
24
|
+
# Sentinel returned by run_*_analysis when the item has been queued for retry
|
|
25
|
+
RETRY_SENTINEL = "__ENQUEUED_FOR_RETRY__"
|
|
26
|
+
|
|
27
|
+
# Keywords that indicate a retryable quota / rate-limit failure
|
|
28
|
+
_QUOTA_KEYWORDS = (
|
|
29
|
+
"resource_exhausted", "quota", "rate_limit", "rate limit",
|
|
30
|
+
"429", "too many requests", "daily limit", "free tier",
|
|
31
|
+
"insufficient_quota", "ratelimit", "all gemini models exhausted",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
def _is_quota_error(err: str) -> bool:
|
|
35
|
+
"""Return True when an error string looks like a recoverable quota / rate-limit."""
|
|
36
|
+
low = err.lower()
|
|
37
|
+
return any(k in low for k in _QUOTA_KEYWORDS)
|
|
38
|
+
|
|
39
|
+
def print_header(title):
    """Print a prominent section header framed by '=' rules."""
    bar = "=" * 80
    print(f"\n{bar}")
    print(" " + str(title))
    print(bar + "\n")
|
|
44
|
+
|
|
45
|
+
def print_section(title):
    """Print a subsection heading framed by light horizontal rules."""
    rule = "─" * 80
    print("\n" + rule)
    print(" " + str(title))
    print(rule + "\n")
|
|
50
|
+
|
|
51
|
+
def generate_final_summary(results, instagram_url):
    """Generate a comprehensive summary from all analysis results via ModelRouter.

    Args:
        results: dict with lists under 'visual', 'audio_transcription',
            'music_identification' and 'text'; each item is a dict with an
            'output' key holding the raw analyzer stdout.
        instagram_url: URL of the analyzed post, embedded in the prompt.

    Returns:
        str: the LLM-generated structured summary, a placeholder string when
        the model returned nothing, or an error message on exception.
    """
    # Imported lazily so importing this module does not pull in the router.
    from core.model_router import get_router

    # Collect all analysis data
    visual_summary = ""
    audio_summary = ""
    music_info = ""
    text_summary = ""

    # Extract visual analysis
    if results['visual']:
        visual_summary = "VISUAL ANALYSIS:\n"
        for item in results['visual']:
            output = item['output']
            clean = _clean_visual(output)
            if clean:
                # Cap each entry at 600 chars to keep the prompt bounded.
                visual_summary += f"- {clean[:600]}\n"

    # Extract audio transcription
    if results['audio_transcription']:
        audio_summary = "AUDIO TRANSCRIPTION:\n"
        for item in results['audio_transcription']:
            output = item['output']
            clean = _clean_audio(output)
            # The transcriber emits "Detected Language: <lang> (...)"; fall
            # back to 'Unknown' when that marker is absent.
            lang = output.split('Detected Language:')[1].split('(')[0].strip() if 'Detected Language:' in output else 'Unknown'
            audio_summary += f"- Language: {lang}\n"
            if clean:
                audio_summary += f"- Content: {clean[:400]}\n"

    # Extract music identification
    if results['music_identification']:
        music_info = "MUSIC:\n"
        for item in results['music_identification']:
            output = item['output']
            # The music identifier prints "🎵 Song:" / "👤 Artist:" lines on a hit.
            if '🎵 Song:' in output:
                song = output.split('🎵 Song:')[1].split('\n')[0].strip()
                artist = output.split('👤 Artist:')[1].split('\n')[0].strip() if '👤 Artist:' in output else 'Unknown'
                music_info += f"- {song} by {artist}\n"
            elif 'No match found' in output:
                music_info += "- No music identified (likely voiceover/no background music)\n"

    # Extract text analysis
    if results['text']:
        text_summary = "TEXT ANALYSIS:\n"
        for item in results['text']:
            clean = _clean_text(item['output'])
            if clean:
                text_summary += f"{clean[:600]}\n"

    # Combine all information
    combined_info = f"""
INSTAGRAM POST: {instagram_url}

{visual_summary}

{audio_summary}

{music_info}

{text_summary}
"""

    # Generate structured summary using LLM
    # NOTE: parse_summary() relies on the exact emoji/label format requested here.
    prompt = f"""Based on the following analysis of an Instagram post, create a comprehensive structured summary.

{combined_info}

Generate a report in this EXACT format:

📌 TITLE:
[Create a clear, descriptive title]

📝 SUMMARY:
[Comprehensive 3-5 sentence summary including:
- Main content/theme
- Key information (locations, products, tips, itineraries, lists, tools, links, etc.)
- Important highlights
- Any actionable items or recommendations]

🏷️ TAGS:
[Generate 8-12 relevant hashtags/keywords]

🎵 MUSIC:
[Music/song name if found, or "No background music" or "Voiceover only"]

📂 CATEGORY:
[Choose ONE from: product, places, recipe, software, book, tv shows, workout, film, event]

Be specific, concise, and actionable. Focus on useful information."""

    try:
        print("🤖 Generating comprehensive summary with AI...")
        router = get_router()
        summary = router.generate_text(prompt)

        # Guard against an empty/None model response.
        if not summary:
            summary = "Unable to generate comprehensive summary."

        return summary

    except Exception as e:
        # Best-effort: surface the error as the summary rather than crashing.
        return f"Error generating summary: {e}\n\nRaw data available in individual analysis sections above."
|
|
154
|
+
|
|
155
|
+
def _parse_field(text: str, emoji: str, label: str) -> str:
|
|
156
|
+
"""
|
|
157
|
+
Extract a field value from AI output — handles all common AI formatting variations:
|
|
158
|
+
📌 TITLE: value
|
|
159
|
+
📌 **TITLE:** value
|
|
160
|
+
📌 **TITLE** \n value
|
|
161
|
+
Also handles emoji variation selectors (U+FE0F) that may or may not be present.
|
|
162
|
+
Returns the first non-empty content after the label, stopped at the next
|
|
163
|
+
section emoji line or blank-line boundary.
|
|
164
|
+
"""
|
|
165
|
+
# Strip variation selector from emoji so pattern works whether or not it's present
|
|
166
|
+
emoji_base = emoji.replace('\ufe0f', '')
|
|
167
|
+
pattern = re.compile(
|
|
168
|
+
rf'{re.escape(emoji_base)}\ufe0f?\s*\*{{0,2}}{re.escape(label)}\*{{0,2}}:?\s*',
|
|
169
|
+
re.IGNORECASE
|
|
170
|
+
)
|
|
171
|
+
m = pattern.search(text)
|
|
172
|
+
# Fallback: model may output U+FFFD instead of the emoji (encoding mangling)
|
|
173
|
+
if not m:
|
|
174
|
+
pattern_fb = re.compile(
|
|
175
|
+
rf'\ufffd\s*\*{{0,2}}{re.escape(label)}\*{{0,2}}:?\s*',
|
|
176
|
+
re.IGNORECASE
|
|
177
|
+
)
|
|
178
|
+
m = pattern_fb.search(text)
|
|
179
|
+
if not m:
|
|
180
|
+
return ""
|
|
181
|
+
after = text[m.end():]
|
|
182
|
+
# Collect until next section (identified by an emoji at line start) or 2 blank lines
|
|
183
|
+
lines = after.split('\n')
|
|
184
|
+
content_lines = []
|
|
185
|
+
for line in lines:
|
|
186
|
+
stripped = line.strip()
|
|
187
|
+
# Stop at next section header — match ANY emoji/symbol at line start,
|
|
188
|
+
# OR a U+FFFD replacement char (model sometimes mangles lower-plane emojis)
|
|
189
|
+
if content_lines and re.match(
|
|
190
|
+
r'^[\U0001F000-\U0001FFFF\U00002600-\U000027BF\U00002B00-\U00002BFF\uFFFD]',
|
|
191
|
+
stripped,
|
|
192
|
+
):
|
|
193
|
+
break
|
|
194
|
+
# Skip pure markdown bold/italic wrapper lines but keep the text
|
|
195
|
+
content_lines.append(re.sub(r'\*{1,3}([^*]+?)\*{1,3}', r'\1',
|
|
196
|
+
re.sub(r'^\*{1,3}|\*{1,3}$', '', stripped)))
|
|
197
|
+
# Remove leading/trailing blank lines and join
|
|
198
|
+
result = ' '.join(l for l in content_lines if l).strip()
|
|
199
|
+
# Strip surrounding markdown bold (**)
|
|
200
|
+
result = re.sub(r'^\*+|\*+$', '', result).strip('"').strip()
|
|
201
|
+
return result
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def parse_summary(summary_text):
    """Parse an AI-generated summary into structured fields.

    Tolerant of markdown bold, missing colons and varied whitespace.

    Returns:
        tuple: (title, summary, tags, music, category)
    """
    title, summary, music, category = "", "", "", ""
    tags = []

    try:
        title = _parse_field(summary_text, "📌", "TITLE")
        summary = _parse_field(summary_text, "📝", "SUMMARY")
        music = _parse_field(summary_text, "🎵", "MUSIC")

        # TAGS: try with the variation selector, then without, then a bare
        # "TAGS:" label (the model sometimes drops the emoji entirely).
        tag_block = (_parse_field(summary_text, "🏷️", "TAGS")
                     or _parse_field(summary_text, "🏷", "TAGS"))
        if not tag_block:
            bare = re.search(r'(?:^|\n)\s*\*{0,2}TAGS\*{0,2}:?\s*([^\n]+)', summary_text, re.IGNORECASE)
            if bare:
                tag_block = bare.group(1).strip()
        if tag_block:
            tags = [piece.strip() for piece in re.split(r'[\s,]+', tag_block) if piece.strip()]

        # CATEGORY: lowercase, strip markdown bold leftovers, keep first line.
        cat = _parse_field(summary_text, "📂", "CATEGORY").lower()
        cat = re.sub(r'\*+', '', cat).strip()
        category = cat.split('\n')[0].strip()

    except Exception as e:
        print(f"⚠️ Error parsing summary: {e}")

    # Fallback: auto-detect the category when it is empty or unrecognised.
    valid_categories = {'product', 'places', 'recipe', 'software', 'book',
                        'tv shows', 'workout', 'film', 'event', 'other'}
    if not category or category not in valid_categories:
        category = auto_detect_category(summary_text, title, summary, tags)

    return title, summary, tags, music, category
|
|
251
|
+
|
|
252
|
+
def auto_detect_category(summary_text, title, summary, tags):
    """Guess a content category from keyword occurrences.

    Scores each known category by how many of its keywords appear in the
    combined text and returns the best-scoring one, or "other" when no
    keyword matches at all.

    Returns:
        str: Detected category name.
    """
    haystack = f"{title} {summary} {' '.join(tags)} {summary_text}".lower()

    # Category keywords
    category_keywords = {
        'product': ['camera', 'device', 'gadget', 'tech', 'phone', 'laptop', 'review', 'unbox', 'product', 'dji', 'osmo', 'action cam'],
        'places': ['travel', 'trip', 'visit', 'destination', 'village', 'city', 'mountain', 'beach', 'hotel', 'itinerary', 'sikkim', 'location'],
        'recipe': ['recipe', 'cooking', 'food', 'dish', 'ingredients', 'cook', 'bake', 'meal', 'cuisine'],
        'software': ['app', 'software', 'code', 'programming', 'developer', 'api', 'python', 'javascript'],
        'book': ['book', 'novel', 'author', 'read', 'literature', 'story', 'chapter'],
        'workout': ['workout', 'fitness', 'exercise', 'gym', 'training', 'muscle', 'cardio', 'yoga'],
        'film': ['movie', 'film', 'cinema', 'actor', 'actress', 'director', 'trailer', 'premiere'],
        'tv shows': ['series', 'episode', 'season', 'show', 'tv show', 'streaming', 'netflix'],
        'event': ['event', 'concert', 'festival', 'conference', 'meetup', 'workshop', 'seminar']
    }

    # Track the first category reaching the highest hit count (dict order
    # matches the original max() tie-breaking).
    best_category = "other"
    best_score = 0
    for name, keywords in category_keywords.items():
        hits = sum(1 for kw in keywords if kw in haystack)
        if hits > best_score:
            best_score = hits
            best_category = name

    return best_category
|
|
287
|
+
|
|
288
|
+
def run_script(script_name, args):
    """Run a sibling Python script with the current interpreter.

    Args:
        script_name: path of the script relative to this file's directory.
        args: list of command-line arguments for the script.

    Returns:
        tuple: (success: bool, stdout: str, stderr: str); on launch failure
        success is False and stderr carries the exception text.
    """
    try:
        # sys.executable keeps us inside the same virtualenv.
        script_path = os.path.join(os.path.dirname(__file__), script_name)
        cmd = [sys.executable, script_path, *args]

        # Force UTF-8 so emoji-heavy output survives on every platform.
        env = dict(os.environ)
        env['PYTHONIOENCODING'] = 'utf-8'

        proc = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8', errors='replace', env=env)
        return proc.returncode == 0, proc.stdout, proc.stderr
    except Exception as exc:
        return False, "", str(exc)
|
|
302
|
+
|
|
303
|
+
def run_analysis_task(task_name, script_name, file_path, task_type="light"):
    """Run one analysis script against one file (used for parallel fan-out).

    Args:
        task_name: Display name (e.g., "Visual Analysis").
        script_name: Python script to run.
        file_path: Path to file to analyze.
        task_type: "heavy" or "light" (for scheduling).

    Returns:
        dict with keys task_name/file/success/output/error/elapsed/type.
    """
    file_label = Path(file_path).name
    started_at = time.time()
    print(f" ⚡ Starting {task_name}: {file_label}")

    ok, out, err = run_script(script_name, [str(file_path)])
    duration = time.time() - started_at

    if ok:
        print(f" ✓ Completed {task_name}: {file_label} ({duration:.1f}s)")
    else:
        print(f" ✗ Failed {task_name}: {file_label}")

    # Only the relevant stream is kept: stdout on success, stderr on failure.
    return {
        'task_name': task_name,
        'file': file_label,
        'success': ok,
        'output': out if ok else '',
        'error': err if not ok else '',
        'elapsed': duration,
        'type': task_type,
    }
|
|
339
|
+
|
|
340
|
+
def _extract_section(output: str, marker: str) -> str:
|
|
341
|
+
"""Extract the content after a section marker, stopping at the next divider."""
|
|
342
|
+
if marker not in output:
|
|
343
|
+
return output[:2000]
|
|
344
|
+
after = output.split(marker, 1)[1]
|
|
345
|
+
lines = after.split("\n")
|
|
346
|
+
content_lines = []
|
|
347
|
+
started = False
|
|
348
|
+
for line in lines:
|
|
349
|
+
stripped = line.strip('-').strip('=').strip('─').strip()
|
|
350
|
+
if not started:
|
|
351
|
+
# Skip blank lines and pure divider lines
|
|
352
|
+
if stripped:
|
|
353
|
+
started = True
|
|
354
|
+
content_lines.append(line)
|
|
355
|
+
else:
|
|
356
|
+
# Stop at a divider line (5+ repeated chars)
|
|
357
|
+
raw = line.strip()
|
|
358
|
+
if (raw.startswith('─' * 5) or raw.startswith('-' * 5) or
|
|
359
|
+
raw.startswith('=' * 5) or raw.startswith('*' * 5)):
|
|
360
|
+
break
|
|
361
|
+
content_lines.append(line)
|
|
362
|
+
return "\n".join(content_lines).strip()
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _clean_visual(output: str) -> str:
    """Return the analysis text following the visual analyzer's section marker."""
    return _extract_section(output, "📝 ANALYSIS:")
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _clean_audio(output: str) -> str:
    """Return the transcript text following the audio transcriber's section marker."""
    return _extract_section(output, "📝 TRANSCRIBED TEXT:")
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def _clean_text(output: str) -> str:
    """Return the analysis text following the text analyzer's section marker."""
    return _extract_section(output, "🔍 ANALYSIS:")
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def cleanup_temp_folder(folder_path):
    """Delete a temp download folder after a successful database save.

    Returns:
        True when the folder was removed, False when removal failed,
        and None when the folder did not exist (callers treat both
        falsy outcomes the same).
    """
    try:
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
            print(f"🗑️ Cleaned up temp folder: {Path(folder_path).name}")
            return True
    except Exception as exc:
        # Best-effort cleanup: report but never raise.
        print(f"⚠ Warning: Could not delete temp folder: {exc}")
        return False
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _jpg_to_thumbnail(jpg_path) -> str:
|
|
390
|
+
"""
|
|
391
|
+
Read a JPEG file and return it as a base64-encoded data URI.
|
|
392
|
+
Downsizes to max 480 px wide using Pillow if available; otherwise
|
|
393
|
+
raw base64 is used (larger but still works as <img src=...>).
|
|
394
|
+
"""
|
|
395
|
+
try:
|
|
396
|
+
from PIL import Image
|
|
397
|
+
import io
|
|
398
|
+
img = Image.open(jpg_path)
|
|
399
|
+
img.thumbnail((1080, 1080), Image.LANCZOS)
|
|
400
|
+
buf = io.BytesIO()
|
|
401
|
+
img.save(buf, format="JPEG", quality=90)
|
|
402
|
+
data = buf.getvalue()
|
|
403
|
+
except Exception:
|
|
404
|
+
# Pillow not available or failed — use raw bytes
|
|
405
|
+
with open(jpg_path, "rb") as f:
|
|
406
|
+
data = f.read()
|
|
407
|
+
import base64
|
|
408
|
+
encoded = base64.b64encode(data).decode()
|
|
409
|
+
return f"data:image/jpeg;base64,{encoded}"
|
|
410
|
+
|
|
411
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
412
|
+
# Non-Instagram analysis flows (no download/pipeline needed)
|
|
413
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
414
|
+
|
|
415
|
+
_EMOJI_MAP = {
|
|
416
|
+
"TITLE": "📌",
|
|
417
|
+
"CHANNEL": "📢",
|
|
418
|
+
"DATE": "📅",
|
|
419
|
+
"SUMMARY": "📝",
|
|
420
|
+
"TAGS": "🏷️",
|
|
421
|
+
"MUSIC": "🎵",
|
|
422
|
+
"CATEGORY": "📂",
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
def _sanitise_yt_raw(raw: str, post_date: str | None) -> str:
|
|
426
|
+
"""
|
|
427
|
+
Normalise YouTube Gemini raw output:
|
|
428
|
+
• Replace U+FFFD replacement chars that precede known section labels
|
|
429
|
+
with the correct emoji (model sometimes outputs \ufffd instead of
|
|
430
|
+
📢 or 📝 due to encoding/font issues in the API layer).
|
|
431
|
+
• Re-attach 🏷️ when the model omitted it before TAGS:.
|
|
432
|
+
• Substitute the real upload date for "Unknown" in the DATE field.
|
|
433
|
+
"""
|
|
434
|
+
lines = raw.splitlines()
|
|
435
|
+
fixed = []
|
|
436
|
+
for line in lines:
|
|
437
|
+
stripped = line.strip()
|
|
438
|
+
# Replace \ufffd or missing emoji before a known section label
|
|
439
|
+
m = re.match(r'^(\ufffd\s*|)(\*{0,2})([A-Z]+)(\*{0,2}):(.*)$', stripped)
|
|
440
|
+
if m:
|
|
441
|
+
prefix, pre_bold, label, post_bold, rest = m.groups()
|
|
442
|
+
if label in _EMOJI_MAP:
|
|
443
|
+
line = f"{_EMOJI_MAP[label]} {pre_bold}{label}{post_bold}:{rest}"
|
|
444
|
+
# Insert missing 🏷️ before bare 'TAGS:' (no emoji at all)
|
|
445
|
+
elif re.match(r'^TAGS\s*:', stripped, re.IGNORECASE):
|
|
446
|
+
line = re.sub(r'^TAGS\s*:', '🏷️ TAGS:', stripped, flags=re.IGNORECASE)
|
|
447
|
+
fixed.append(line)
|
|
448
|
+
|
|
449
|
+
result = "\n".join(fixed)
|
|
450
|
+
|
|
451
|
+
# Substitute actual upload date for 'Unknown'
|
|
452
|
+
if post_date:
|
|
453
|
+
result = re.sub(
|
|
454
|
+
r'(📅\s*DATE\s*:)\s*Unknown',
|
|
455
|
+
rf'\1 {post_date}',
|
|
456
|
+
result,
|
|
457
|
+
flags=re.IGNORECASE,
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
return result
|
|
461
|
+
|
|
462
|
+
def run_youtube_analysis(url: str, shortcode: str, db):
    """Single-call YouTube analysis via Gemini native video support.

    Args:
        url: canonical YouTube URL to analyze.
        shortcode: cache/DB key for this video.
        db: database handle providing queue_for_retry() and save_analysis().

    Returns:
        RETRY_SENTINEL when the item was queued for a quota retry, True on a
        saved analysis, or None on a non-retryable failure.
    """
    print_section("🎬 YouTube Video Analysis")
    print(f"📹 Analyzing: {url}")
    print(" (Gemini will access the video directly — no download required)")

    result = analyze_youtube(url)

    if result.get('error'):
        err = result['error']
        # Quota errors are recoverable: enqueue and signal the caller.
        if _is_quota_error(err):
            db.queue_for_retry(shortcode, url, 'youtube', 'gemini_quota', retry_hours=24)
            print("⏰ All Gemini models quota-exhausted — queued for retry in 24 hours.")
            return RETRY_SENTINEL
        print(f"❌ YouTube analysis failed: {err}")
        return

    raw = result.get('raw_output', '')
    if not raw:
        print("❌ Received empty response from Gemini.")
        return

    # Clean up encoding artefacts and patch in the real upload date
    raw = _sanitise_yt_raw(raw, result.get('post_date'))

    print_section("📋 RAW GEMINI OUTPUT")
    print(raw[:2000])

    title, summary_text, tags, music, category = parse_summary(raw)

    # Use upload date scraped directly from YouTube page (always accurate)
    yt_post_date = result.get('post_date')

    print_section("💾 Saving to Database")
    db.save_analysis(
        shortcode=shortcode,
        url=url,
        username=result.get('channel', ''),
        title=title,
        summary=summary_text,
        tags=tags,
        music=music,
        category=category,
        visual_analysis='',
        audio_transcription='',
        text_analysis=raw,
        content_type='youtube',
        thumbnail=result.get('thumbnail', ''),
        post_date=yt_post_date,
    )
    print(f"✓ YouTube analysis saved ({shortcode})")
    print_header("✅ Done — YouTube Analysis Complete")
    return True
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def run_webpage_analysis(url: str, shortcode: str, db):
    """Fetch web page text and run AI text analysis.

    Args:
        url: web page URL to analyze.
        shortcode: cache/DB key for this page.
        db: database handle providing queue_for_retry() and save_analysis().

    Returns:
        RETRY_SENTINEL when queued for retry (quota hit or empty AI reply),
        True on a saved analysis, or None on a non-retryable failure.
    """
    print_section("🌐 Web Page Analysis")
    print(f"🔗 Analyzing: {url}")

    result = analyze_webpage(url)

    if result.get('error'):
        err = result['error']
        # Quota errors are recoverable: enqueue and signal the caller.
        if _is_quota_error(err):
            db.queue_for_retry(shortcode, url, 'webpage', 'ai_quota', retry_hours=24)
            print("⏰ AI models quota-exhausted — queued for retry in 24 hours.")
            return RETRY_SENTINEL
        print(f"❌ Web page analysis failed: {err}")
        return

    raw = result.get('raw_output', '')
    page_title = result.get('page_title', '')
    if not raw:
        # An empty reply is treated as transient — short retry window.
        db.queue_for_retry(shortcode, url, 'webpage', 'ai_empty_response', retry_hours=1)
        print("⏰ Empty AI response — queued for retry in 1 hour.")
        return RETRY_SENTINEL

    print_section("📋 RAW AI OUTPUT")
    print(raw[:2000])

    title, summary_text, tags, music, category = parse_summary(raw)
    # Use on-page title as fallback if AI did not extract one
    if not title and page_title:
        title = page_title

    print_section("💾 Saving to Database")
    db.save_analysis(
        shortcode=shortcode,
        url=url,
        username=result.get('author', ''),
        title=title,
        summary=summary_text,
        tags=tags,
        music=music,
        category=category,
        visual_analysis='',
        audio_transcription='',
        text_analysis=raw,
        content_type='webpage',
        thumbnail=result.get('thumbnail', ''),
        post_date=result.get('post_date'),
    )
    print(f"✓ Web page analysis saved ({shortcode})")
    print_header("✅ Done — Web Page Analysis Complete")
    return True
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
571
|
+
|
|
572
|
+
def main():
|
|
573
|
+
"""Main orchestrator"""
|
|
574
|
+
|
|
575
|
+
print_header("🧠 SUPERBRAIN - Content Analyzer")
|
|
576
|
+
|
|
577
|
+
# Step 1: Get URL
|
|
578
|
+
if len(sys.argv) > 1:
|
|
579
|
+
instagram_url = sys.argv[1]
|
|
580
|
+
print(f"📎 Link: {instagram_url}")
|
|
581
|
+
else:
|
|
582
|
+
instagram_url = input("📎 Enter URL (Instagram / YouTube / web page): ").strip()
|
|
583
|
+
|
|
584
|
+
if not instagram_url:
|
|
585
|
+
print("❌ No link provided!")
|
|
586
|
+
sys.exit(1)
|
|
587
|
+
|
|
588
|
+
# Step 2: Validate link & detect type
|
|
589
|
+
print_section("🔍 Step 1: Validating Link")
|
|
590
|
+
|
|
591
|
+
validation = validate_link(instagram_url)
|
|
592
|
+
|
|
593
|
+
if not validation['valid']:
|
|
594
|
+
print(f"❌ Invalid link!")
|
|
595
|
+
print(f" Error: {validation['error']}")
|
|
596
|
+
sys.exit(1)
|
|
597
|
+
|
|
598
|
+
content_type = validation['content_type']
|
|
599
|
+
shortcode = validation['shortcode']
|
|
600
|
+
# Normalise URL (e.g. YouTube canonical form)
|
|
601
|
+
instagram_url = validation['url']
|
|
602
|
+
|
|
603
|
+
print(f"✓ Valid {content_type} link")
|
|
604
|
+
print(f" ID: {shortcode}")
|
|
605
|
+
|
|
606
|
+
# Step 2.5: Check cache in database
|
|
607
|
+
print_section("🔍 Step 2: Checking Cache")
|
|
608
|
+
|
|
609
|
+
db = get_db()
|
|
610
|
+
cached = db.check_cache(shortcode) if db.is_connected() else None
|
|
611
|
+
|
|
612
|
+
if cached:
|
|
613
|
+
print(f"✓ Found in cache! (Analyzed on {cached.get('analyzed_at', 'unknown')})")
|
|
614
|
+
print(f" Returning cached result...\n")
|
|
615
|
+
|
|
616
|
+
# Display cached summary
|
|
617
|
+
print_section("📋 CACHED RESULT")
|
|
618
|
+
print(f"📌 TITLE:\n{cached.get('title', 'N/A')}\n")
|
|
619
|
+
print(f"📝 SUMMARY:\n{cached.get('summary', 'N/A')}\n")
|
|
620
|
+
print(f"🏷️ TAGS:\n{', '.join(cached.get('tags', []))}\n")
|
|
621
|
+
print(f"🎵 MUSIC:\n{cached.get('music', 'N/A')}\n")
|
|
622
|
+
print(f"📂 CATEGORY:\n{cached.get('category', 'N/A')}\n")
|
|
623
|
+
print("=" * 80)
|
|
624
|
+
print("✅ Retrieved from cache (no AI processing needed)")
|
|
625
|
+
print("=" * 80 + "\n")
|
|
626
|
+
return
|
|
627
|
+
else:
|
|
628
|
+
print("⚡ Not in cache - will analyze and save")
|
|
629
|
+
|
|
630
|
+
# ── Dispatch non-Instagram types ──────────────────────────────────────────
|
|
631
|
+
if content_type == 'youtube':
|
|
632
|
+
result = run_youtube_analysis(instagram_url, shortcode, db)
|
|
633
|
+
if result == RETRY_SENTINEL:
|
|
634
|
+
sys.exit(2)
|
|
635
|
+
if result is None:
|
|
636
|
+
# Analysis failed — exit non-zero so api.py returns a proper error
|
|
637
|
+
sys.exit(1)
|
|
638
|
+
return
|
|
639
|
+
elif content_type == 'webpage':
|
|
640
|
+
result = run_webpage_analysis(instagram_url, shortcode, db)
|
|
641
|
+
if result == RETRY_SENTINEL:
|
|
642
|
+
sys.exit(2)
|
|
643
|
+
if result is None:
|
|
644
|
+
# Analysis failed — exit non-zero so api.py returns a proper error
|
|
645
|
+
sys.exit(1)
|
|
646
|
+
return
|
|
647
|
+
# Instagram falls through to the existing pipeline below
|
|
648
|
+
|
|
649
|
+
# Step 3: Download content
|
|
650
|
+
print_section("📥 Step 3: Downloading Content")
|
|
651
|
+
|
|
652
|
+
print("Running Instagram downloader...")
|
|
653
|
+
|
|
654
|
+
try:
|
|
655
|
+
# Pass URL via stdin simulation
|
|
656
|
+
import contextlib
|
|
657
|
+
from io import StringIO
|
|
658
|
+
|
|
659
|
+
# Import the downloader function
|
|
660
|
+
from instagram.instagram_downloader import download_instagram_content, RetryableDownloadError
|
|
661
|
+
|
|
662
|
+
download_result = download_instagram_content(instagram_url)
|
|
663
|
+
|
|
664
|
+
if download_result is None:
|
|
665
|
+
print("❌ Download failed!")
|
|
666
|
+
sys.exit(1)
|
|
667
|
+
|
|
668
|
+
download_folder = download_result
|
|
669
|
+
print(f"\n✓ Content downloaded to: {download_folder}")
|
|
670
|
+
|
|
671
|
+
except RetryableDownloadError as e:
|
|
672
|
+
msg = str(e)
|
|
673
|
+
print(f"⏰ Instagram download blocked — {msg}")
|
|
674
|
+
if "login required" in msg.lower():
|
|
675
|
+
print("❌ Instagram now requires login for this post.")
|
|
676
|
+
print(" Add INSTAGRAM_USERNAME and INSTAGRAM_PASSWORD in setup (Step 3) and retry.")
|
|
677
|
+
sys.exit(1)
|
|
678
|
+
db.queue_for_retry(shortcode, instagram_url, 'instagram', 'instagram_rate_limit', retry_hours=24)
|
|
679
|
+
print("⏰ Queued for retry in 24 hours.")
|
|
680
|
+
sys.exit(2)
|
|
681
|
+
|
|
682
|
+
except Exception as e:
|
|
683
|
+
print(f"❌ Download error: {e}")
|
|
684
|
+
import traceback
|
|
685
|
+
traceback.print_exc()
|
|
686
|
+
sys.exit(1)
|
|
687
|
+
|
|
688
|
+
# Step 4: Find downloaded files
|
|
689
|
+
print_section("📂 Step 3: Locating Files")
|
|
690
|
+
|
|
691
|
+
folder_path = Path(download_folder)
|
|
692
|
+
|
|
693
|
+
if not folder_path.exists():
|
|
694
|
+
print(f"❌ Folder not found: {download_folder}")
|
|
695
|
+
sys.exit(1)
|
|
696
|
+
|
|
697
|
+
# Find files
|
|
698
|
+
mp4_files = list(folder_path.glob("*.mp4"))
|
|
699
|
+
mp3_files = list(folder_path.glob("*_audio.mp3"))
|
|
700
|
+
jpg_files = list(folder_path.glob("*.jpg"))
|
|
701
|
+
info_files = list(folder_path.glob("info.txt"))
|
|
702
|
+
|
|
703
|
+
print(f"📹 Videos: {len(mp4_files)}")
|
|
704
|
+
print(f"🎵 Audio files: {len(mp3_files)}")
|
|
705
|
+
print(f"🖼️ Images: {len(jpg_files)}")
|
|
706
|
+
print(f"📄 Info files: {len(info_files)}")
|
|
707
|
+
|
|
708
|
+
# Step 5: Run analyses with SMART PARALLEL PROCESSING
|
|
709
|
+
print_section("🚀 Step 4: Running Parallel Analysis")
|
|
710
|
+
print("Strategy: Heavy tasks sequential, light tasks parallel")
|
|
711
|
+
print("Heavy: Visual (video processing), Audio (Whisper)")
|
|
712
|
+
print("Light: Music (Shazam), Text (metadata)")
|
|
713
|
+
|
|
714
|
+
results = {
|
|
715
|
+
'visual': [],
|
|
716
|
+
'audio_transcription': [],
|
|
717
|
+
'music_identification': [],
|
|
718
|
+
'text': []
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
all_tasks = []
|
|
722
|
+
analysis_start = time.time()
|
|
723
|
+
|
|
724
|
+
# === PHASE 1: Visual Analysis (HEAVY) - Run alone ===
|
|
725
|
+
if mp4_files or jpg_files:
|
|
726
|
+
print(f"\n🎬 Phase 1: Visual Analysis (Heavy Task)")
|
|
727
|
+
|
|
728
|
+
for video in mp4_files:
|
|
729
|
+
result = run_analysis_task("Visual", 'analyzers/visual_analyze.py', str(video), "heavy")
|
|
730
|
+
if result['success']:
|
|
731
|
+
results['visual'].append({
|
|
732
|
+
'file': result['file'],
|
|
733
|
+
'type': 'video',
|
|
734
|
+
'output': result['output']
|
|
735
|
+
})
|
|
736
|
+
print(_clean_visual(result['output'])[:600] + "\n")
|
|
737
|
+
|
|
738
|
+
for img in jpg_files:
|
|
739
|
+
result = run_analysis_task("Visual", 'analyzers/visual_analyze.py', str(img), "heavy")
|
|
740
|
+
if result['success']:
|
|
741
|
+
results['visual'].append({
|
|
742
|
+
'file': result['file'],
|
|
743
|
+
'type': 'image',
|
|
744
|
+
'output': result['output']
|
|
745
|
+
})
|
|
746
|
+
print(_clean_visual(result['output'])[:600] + "\n")
|
|
747
|
+
|
|
748
|
+
# === PHASE 2: Audio Transcription (HEAVY) - Run alone ===
|
|
749
|
+
if mp3_files:
|
|
750
|
+
print(f"\n🎙️ Phase 2: Audio Transcription (Heavy Task)")
|
|
751
|
+
|
|
752
|
+
for audio in mp3_files:
|
|
753
|
+
result = run_analysis_task("Audio", 'analyzers/audio_transcribe.py', str(audio), "heavy")
|
|
754
|
+
if result['success']:
|
|
755
|
+
results['audio_transcription'].append({
|
|
756
|
+
'file': result['file'],
|
|
757
|
+
'output': result['output']
|
|
758
|
+
})
|
|
759
|
+
print(_clean_audio(result['output'])[:600] + "\n")
|
|
760
|
+
|
|
761
|
+
# === PHASE 3: Light tasks in PARALLEL ===
|
|
762
|
+
print(f"\n⚡ Phase 3: Light Tasks (Parallel Execution)")
|
|
763
|
+
|
|
764
|
+
light_tasks = []
|
|
765
|
+
|
|
766
|
+
# Add music identification tasks
|
|
767
|
+
for audio in mp3_files:
|
|
768
|
+
light_tasks.append(('music', 'analyzers/music_identifier.py', str(audio)))
|
|
769
|
+
|
|
770
|
+
# Add text analysis tasks
|
|
771
|
+
for info_file in info_files:
|
|
772
|
+
light_tasks.append(('text', 'analyzers/text_analyzer.py', str(info_file)))
|
|
773
|
+
|
|
774
|
+
if light_tasks:
|
|
775
|
+
# Run light tasks in parallel (max 3 concurrent)
|
|
776
|
+
with ThreadPoolExecutor(max_workers=3) as executor:
|
|
777
|
+
futures = {}
|
|
778
|
+
|
|
779
|
+
for task_type, script, file_path in light_tasks:
|
|
780
|
+
task_name = "Music ID" if task_type == 'music' else "Text"
|
|
781
|
+
future = executor.submit(run_analysis_task, task_name, script, file_path, "light")
|
|
782
|
+
futures[future] = task_type
|
|
783
|
+
|
|
784
|
+
# Collect results as they complete
|
|
785
|
+
for future in as_completed(futures):
|
|
786
|
+
task_type = futures[future]
|
|
787
|
+
result = future.result()
|
|
788
|
+
|
|
789
|
+
if result['success']:
|
|
790
|
+
if task_type == 'music':
|
|
791
|
+
results['music_identification'].append({
|
|
792
|
+
'file': result['file'],
|
|
793
|
+
'output': result['output']
|
|
794
|
+
})
|
|
795
|
+
else: # text
|
|
796
|
+
results['text'].append({
|
|
797
|
+
'file': result['file'],
|
|
798
|
+
'output': result['output']
|
|
799
|
+
})
|
|
800
|
+
|
|
801
|
+
if task_type == 'music':
|
|
802
|
+
print(result['output'][:400] + "\n")
|
|
803
|
+
else:
|
|
804
|
+
print(_clean_text(result['output'])[:600] + "\n")
|
|
805
|
+
|
|
806
|
+
analysis_elapsed = time.time() - analysis_start
|
|
807
|
+
print(f"\n⏱️ Total Analysis Time: {analysis_elapsed:.1f}s")
|
|
808
|
+
|
|
809
|
+
# Final comprehensive summary
|
|
810
|
+
print_header("✅ GENERATING COMPREHENSIVE SUMMARY")
|
|
811
|
+
|
|
812
|
+
final_summary = generate_final_summary(results, instagram_url)
|
|
813
|
+
|
|
814
|
+
print_section("📋 FINAL REPORT")
|
|
815
|
+
print(final_summary)
|
|
816
|
+
|
|
817
|
+
# Extract structured data from summary for database
|
|
818
|
+
title, summary_text, tags, music, category = parse_summary(final_summary)
|
|
819
|
+
|
|
820
|
+
# Get additional metadata from info.txt if available
|
|
821
|
+
username = ""
|
|
822
|
+
likes = 0
|
|
823
|
+
post_date = None
|
|
824
|
+
|
|
825
|
+
if info_files:
|
|
826
|
+
try:
|
|
827
|
+
with open(info_files[0], 'r', encoding='utf-8') as f:
|
|
828
|
+
content = f.read()
|
|
829
|
+
username_match = re.search(r'Username: @(\S+)', content)
|
|
830
|
+
likes_match = re.search(r'Likes: (\d+)', content)
|
|
831
|
+
date_match = re.search(r'Date: ([\d\-: ]+)', content)
|
|
832
|
+
|
|
833
|
+
if username_match:
|
|
834
|
+
username = username_match.group(1)
|
|
835
|
+
if likes_match:
|
|
836
|
+
likes = int(likes_match.group(1))
|
|
837
|
+
if date_match:
|
|
838
|
+
post_date = date_match.group(1)
|
|
839
|
+
except:
|
|
840
|
+
pass
|
|
841
|
+
|
|
842
|
+
# Save to database
|
|
843
|
+
print_section("💾 Saving to Database")
|
|
844
|
+
|
|
845
|
+
# Combine analysis texts — extract clean content (not raw stdout)
|
|
846
|
+
visual_text = "\n\n".join([_clean_visual(r['output']) for r in results['visual']])
|
|
847
|
+
audio_text = "\n\n".join([_clean_audio(r['output']) for r in results['audio_transcription']])
|
|
848
|
+
text_text = "\n\n".join([_clean_text(r['output']) for r in results['text']])
|
|
849
|
+
|
|
850
|
+
# — Instagram thumbnail: first downloaded jpg (converted to base64) —
|
|
851
|
+
instagram_thumbnail = ""
|
|
852
|
+
if jpg_files:
|
|
853
|
+
print(f"🖼️ Saving thumbnail from {jpg_files[0].name}...")
|
|
854
|
+
instagram_thumbnail = _jpg_to_thumbnail(jpg_files[0])
|
|
855
|
+
elif mp4_files:
|
|
856
|
+
# Try to extract first frame from the video using cv2
|
|
857
|
+
try:
|
|
858
|
+
import cv2
|
|
859
|
+
import base64
|
|
860
|
+
import tempfile
|
|
861
|
+
cap = cv2.VideoCapture(str(mp4_files[0]))
|
|
862
|
+
ret, frame = cap.read()
|
|
863
|
+
cap.release()
|
|
864
|
+
if ret:
|
|
865
|
+
import cv2 as _cv2
|
|
866
|
+
# Resize to 480 wide keeping aspect
|
|
867
|
+
h, w = frame.shape[:2]
|
|
868
|
+
new_w = 480
|
|
869
|
+
new_h = int(h * new_w / w)
|
|
870
|
+
frame = _cv2.resize(frame, (new_w, new_h))
|
|
871
|
+
ok, buf = _cv2.imencode(".jpg", frame, [_cv2.IMWRITE_JPEG_QUALITY, 75])
|
|
872
|
+
if ok:
|
|
873
|
+
encoded = base64.b64encode(buf.tobytes()).decode()
|
|
874
|
+
instagram_thumbnail = f"data:image/jpeg;base64,{encoded}"
|
|
875
|
+
print("🖼️ Saved first-frame thumbnail from video")
|
|
876
|
+
except Exception as thumb_err:
|
|
877
|
+
print(f"⚠️ Could not extract video thumbnail: {thumb_err}")
|
|
878
|
+
|
|
879
|
+
db.save_analysis(
|
|
880
|
+
shortcode=shortcode,
|
|
881
|
+
url=instagram_url,
|
|
882
|
+
username=username,
|
|
883
|
+
title=title,
|
|
884
|
+
summary=summary_text,
|
|
885
|
+
tags=tags,
|
|
886
|
+
music=music,
|
|
887
|
+
category=category,
|
|
888
|
+
visual_analysis=visual_text,
|
|
889
|
+
audio_transcription=audio_text,
|
|
890
|
+
text_analysis=text_text,
|
|
891
|
+
likes=likes,
|
|
892
|
+
post_date=post_date,
|
|
893
|
+
thumbnail=instagram_thumbnail,
|
|
894
|
+
)
|
|
895
|
+
|
|
896
|
+
print(f"✓ Analysis saved to database")
|
|
897
|
+
|
|
898
|
+
# Step 8: Cleanup temp folder
|
|
899
|
+
print_section("🧹 Step 5: Cleanup")
|
|
900
|
+
|
|
901
|
+
if cleanup_temp_folder(download_folder):
|
|
902
|
+
print(f"✓ Temp folder deleted successfully")
|
|
903
|
+
else:
|
|
904
|
+
print(f"⚠ Temp folder not deleted (may need manual cleanup)")
|
|
905
|
+
|
|
906
|
+
print("\n" + "=" * 80)
|
|
907
|
+
print(f"📊 Analyses Complete: Visual({len(results['visual'])}), Audio({len(results['audio_transcription'])}), Music({len(results['music_identification'])}), Text({len(results['text'])})")
|
|
908
|
+
print(f"⏱️ Total Time: {analysis_elapsed:.1f}s")
|
|
909
|
+
print("=" * 80 + "\n")
|
|
910
|
+
|
|
911
|
+
if __name__ == "__main__":
|
|
912
|
+
main()
|