npm - superbrain-server - Versions diffs - 1.0.2-beta.0 - Mend

superbrain-server 1.0.2-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/bin/superbrain.js +196 -0
package/package.json +23 -0
package/payload/.dockerignore +45 -0
package/payload/.env.example +58 -0
package/payload/Dockerfile +73 -0
package/payload/analyzers/__init__.py +0 -0
package/payload/analyzers/audio_transcribe.py +225 -0
package/payload/analyzers/caption.py +244 -0
package/payload/analyzers/music_identifier.py +346 -0
package/payload/analyzers/text_analyzer.py +117 -0
package/payload/analyzers/visual_analyze.py +218 -0
package/payload/analyzers/webpage_analyzer.py +789 -0
package/payload/analyzers/youtube_analyzer.py +320 -0
package/payload/api.py +1676 -0
package/payload/config/.api_keys.example +22 -0
package/payload/config/model_rankings.json +492 -0
package/payload/config/openrouter_free_models.json +1364 -0
package/payload/config/whisper_model.txt +1 -0
package/payload/config_settings.py +185 -0
package/payload/core/__init__.py +0 -0
package/payload/core/category_manager.py +219 -0
package/payload/core/database.py +811 -0
package/payload/core/link_checker.py +300 -0
package/payload/core/model_router.py +1253 -0
package/payload/docker-compose.yml +120 -0
package/payload/instagram/__init__.py +0 -0
package/payload/instagram/instagram_downloader.py +253 -0
package/payload/instagram/instagram_login.py +190 -0
package/payload/main.py +912 -0
package/payload/requirements.txt +39 -0
package/payload/reset.py +311 -0
package/payload/start-docker-prod.sh +125 -0
package/payload/start-docker.sh +56 -0
package/payload/start.py +1302 -0
package/payload/static/favicon.ico +0 -0
package/payload/stop-docker.sh +16 -0
package/payload/utils/__init__.py +0 -0
package/payload/utils/db_stats.py +108 -0
package/payload/utils/manage_token.py +91 -0

package/payload/analyzers/caption.py ADDED Viewed

@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Simple Instagram Caption Extractor
+Fast, reliable, no rate limiting using direct HTML parsing.
+"""
+import requests
+import re
+import sys
+import json
+import html
+import io
+# Force UTF-8 output on Windows, where the default console encoding can raise
+# UnicodeEncodeError on the emoji this script prints.
+# reconfigure() switches the encoding of the existing stream object in place.
+# Re-wrapping ``.buffer`` is kept only as a fallback, and streams that are
+# None or expose neither attribute are left untouched instead of raising
+# AttributeError at import time.
+if sys.platform == 'win32':
+    for _stream_name in ('stdout', 'stderr'):
+        _stream = getattr(sys, _stream_name, None)
+        if hasattr(_stream, 'reconfigure'):
+            _stream.reconfigure(encoding='utf-8')
+        elif hasattr(_stream, 'buffer'):
+            setattr(sys, _stream_name, io.TextIOWrapper(_stream.buffer, encoding='utf-8'))
+def is_valid_instagram_url(url):
+    """
+    Check if the URL is a valid Instagram post/reel/IGTV URL.
+    The match is anchored at the start of the string and pinned to the
+    instagram.com host (optionally preceded by a scheme and sub-domains), so
+    a link that merely embeds "instagram.com/p/..." in its path or query
+    string, or a look-alike host such as "notinstagram.com", is rejected.
+    Args:
+        url: Instagram URL to validate
+    Returns:
+        Boolean indicating if URL is valid (False for non-string input)
+    """
+    if not isinstance(url, str):
+        return False
+    pattern = (
+        r'^(?:https?://)?'          # optional scheme
+        r'(?:[A-Za-z0-9-]+\.)*'     # optional sub-domains (www., m., ...)
+        r'instagram\.com/'
+        r'(?:p|reel|tv)/'           # regular posts, reels, IGTV
+        r'[A-Za-z0-9_-]+'           # shortcode
+    )
+    return re.match(pattern, url.strip()) is not None
+def clean_caption(caption):
+    """
+    Clean the caption by removing metadata, hashtags, and decoding HTML entities.
+    Steps, in order: decode HTML entities, drop the leading
+    "<count> likes, <count> comments - user on date:" prefix, drop a trailing
+    quote, drop "- See photos and videos..." and trailing "<count> likes ..."
+    suffixes, remove hashtags, then normalise quotes, whitespace and blank or
+    dot-only lines.
+    Args:
+        caption: Raw caption text
+    Returns:
+        Cleaned caption text (falsy input is returned unchanged)
+    """
+    if not caption:
+        return caption
+    # Decode HTML entities (e.g., &quot; -> ", &#x2764; -> ❤)
+    caption = html.unescape(caption)
+    # A like/comment counter must START WITH A DIGIT. The earlier character
+    # class [\d,\.]+ also matched a lone "." or ",", so ordinary sentences such
+    # as "Nice view. Like and share" were truncated at the full stop.
+    count = r'\d[\d,.]*[KMB]?'
+    # Leading metadata: "123 likes, 45 comments - username on Date: "
+    # (also "12K likes, ..." and "1,277 likes, ...")
+    caption = re.sub(r'^\s*' + count + r'\s*(likes?|comments?)[^:]*?:\s*["\']?', '', caption, flags=re.IGNORECASE)
+    # Remove trailing quotes
+    caption = re.sub(r'["\']\.?\s*$', '', caption)
+    # Remove trailing metadata like "- See photos and videos"
+    caption = re.sub(r'\s*-\s*See\s+(photos?|videos?).*$', '', caption, flags=re.IGNORECASE)
+    # Remove "X likes, Y comments" patterns at the end
+    caption = re.sub(r'\s*' + count + r'\s*(likes?|comments?).*$', '', caption, flags=re.IGNORECASE)
+    # Remove hashtags (including the # symbol and the tag text)
+    caption = re.sub(r'#\w+', '', caption)
+    # Clean up extra quotes at the beginning and end
+    caption = caption.strip('"\'')
+    # Collapse runs of blank lines and normalise horizontal whitespace
+    caption = re.sub(r'\n\s*\n+', '\n', caption)
+    caption = re.sub(r'[ \t]+', ' ', caption)
+    caption = caption.strip()
+    # Drop lines that are empty or consist of a single dot (spacer lines)
+    stripped_lines = (line.strip() for line in caption.split('\n'))
+    return '\n'.join(line for line in stripped_lines if line and line != '.')
+def _decode_json_string(raw):
+    """Decode the inside of a JSON string literal (the text between the quotes).
+    json.loads handles \\n, \\uXXXX and surrogate pairs without touching
+    characters that are already non-ASCII. If the fragment is not a valid JSON
+    string body, it is returned unchanged.
+    """
+    try:
+        return json.loads('"' + raw + '"', strict=False)
+    except ValueError:
+        return raw
+def _extract_caption_from_html(page_html):
+    """Return the cleaned caption found in *page_html*, or None when no strategy matches.
+    Strategies are tried in order: JSON-LD blocks, meta tags, the
+    window._sharedData blob, and finally any raw "caption": "..." pair.
+    """
+    # Method 1: JSON-LD structured data (a single object or a list of objects)
+    for json_str in re.findall(r'<script type="application/ld\+json">(.*?)</script>', page_html, re.DOTALL):
+        try:
+            data = json.loads(json_str)
+        except ValueError:
+            continue
+        for item in (data if isinstance(data, list) else [data]):
+            if not isinstance(item, dict):
+                continue
+            caption = item.get('caption') or item.get('description') or item.get('articleBody')
+            if caption and isinstance(caption, str):
+                return clean_caption(caption)
+    # Method 2: Extract from meta tags
+    meta_patterns = [
+        r'<meta property="og:description" content="([^"]*)"',
+        r'<meta name="description" content="([^"]*)"',
+        r'<meta property="og:title" content="([^"]*)"',
+    ]
+    for pattern in meta_patterns:
+        match = re.search(pattern, page_html)
+        if match:
+            caption = clean_caption(match.group(1))
+            if caption and len(caption) > 10:  # Make sure it's not just metadata
+                return caption
+    # Method 3: embedded window._sharedData JSON
+    match = re.search(r'window\._sharedData\s*=\s*({.*?});', page_html)
+    if match:
+        try:
+            shared_data = json.loads(match.group(1))
+            entry_data = shared_data.get('entry_data', {})
+            if 'PostPage' in entry_data:
+                media = entry_data['PostPage'][0]['graphql']['shortcode_media']
+                caption_edges = media.get('edge_media_to_caption', {}).get('edges', [])
+                if caption_edges:
+                    return clean_caption(caption_edges[0]['node']['text'])
+        except (ValueError, KeyError, IndexError, TypeError, AttributeError):
+            # Unexpected JSON shape: fall through to the next strategy
+            pass
+    # Method 4: any "caption": "..." pair; the pattern allows escaped quotes (\")
+    matches = re.findall(r'"caption":\s*"((?:[^"\\]|\\.)*)"', page_html)
+    if matches:
+        # Get the longest caption (likely the actual post caption)
+        caption = max(matches, key=len)
+        if caption:
+            return clean_caption(_decode_json_string(caption))
+    return None
+def get_caption(url):
+    """
+    Get the caption from an Instagram post or reel by parsing HTML.
+    Failures are reported through the return value, not raised: an invalid
+    URL, a non-200 status, a network problem or an unparseable page each
+    produce a human-readable message string.
+    Args:
+        url: Instagram post or reel URL
+    Returns:
+        Caption text or error message
+    """
+    # Validate URL
+    if not is_valid_instagram_url(url):
+        return "❌ Invalid Instagram URL. Please provide a valid post or reel link."
+    # Clean URL - remove query parameters and trailing slashes
+    url = url.split('?')[0].rstrip('/') + '/'
+    # Request headers to mimic a browser. Accept-Encoding is deliberately NOT
+    # set here: requests then advertises only the encodings it can decode.
+    # Forcing "br" risks an undecodable body when brotli support is missing.
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'none',
+        'Cache-Control': 'max-age=0',
+    }
+    try:
+        response = requests.get(url, headers=headers, timeout=15)
+        if response.status_code != 200:
+            return f"❌ Error: Unable to fetch post (Status code: {response.status_code})"
+        # "is not None" keeps an empty cleaned caption distinct from "nothing found"
+        caption = _extract_caption_from_html(response.text)
+        if caption is not None:
+            return caption
+        return "ℹ️ Could not extract caption. The post may have no caption or Instagram's HTML structure has changed."
+    except requests.exceptions.Timeout:
+        return "❌ Request timed out. Please check your internet connection and try again."
+    except requests.exceptions.ConnectionError:
+        return "❌ Connection error. Please check your internet connection."
+    except requests.exceptions.RequestException as e:
+        return f"❌ Request error: {str(e)}"
+    except Exception as e:
+        return f"❌ Unexpected error: {str(e)}"
+def main():
+    """Run the caption extractor, one-shot or interactively.
+    With a URL as the first command-line argument only the caption (or error
+    message) is printed. Without arguments the user is prompted for a URL and
+    the result is shown between separator rules.
+    """
+    cli_args = sys.argv[1:]
+    if cli_args:
+        # Non-interactive use (e.g. when called from the API): caption only
+        print(get_caption(cli_args[0]))
+        return
+    # Interactive mode
+    banner = "=" * 60
+    print(banner)
+    print("📸 Instagram Caption Extractor")
+    print(banner)
+    print()
+    url = input("Enter Instagram post or reel URL: ").strip()
+    if not url:
+        print("❌ No URL provided. Exiting.")
+        return
+    print()
+    print("🔍 Fetching caption...")
+    print()
+    caption = get_caption(url)
+    rule = "-" * 60
+    print("📝 Caption:")
+    print(rule)
+    print(caption)
+    print(rule)
+if __name__ == "__main__":
+    main()

package/payload/analyzers/music_identifier.py ADDED Viewed

@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+"""
+Music Identifier – Optimized Shazam multi-segment recognition
+==============================================================
+Strategy:
+  • Quick probe  — tries a 12 s fingerprint from several evenly-spaced
+                   positions first (fast, low-bandwidth)
+  • Deep scan    — if nothing found, retries the same positions with
+                   full 20 s segments for trickier tracks
+  • Positions    — up to 5 evenly-spread offsets scaled to audio length
+                   so songs that start after speech/ambient sound are caught
+Output format is identical to the original so that main.py's parser
+requires no changes.
+"""
+import sys
+import os
+import asyncio
+import subprocess
+import tempfile
+from pathlib import Path
+# Ensure backend root is on sys.path when called as a subprocess
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+try:
+    from shazamio import Shazam
+    _HAS_SHAZAM = True
+except ImportError:
+    _HAS_SHAZAM = False
+# ─────────────────────────────────────────────────────────────────────────────
+#  Audio helpers
+# ─────────────────────────────────────────────────────────────────────────────
+def _get_duration(audio_path: str) -> float:
+    """Return audio duration in seconds via ffprobe. Falls back to 60 s.
+    The fallback is used when ffprobe cannot be started, times out, prints
+    something that is not a number, or reports a non-finite or non-positive
+    duration. A NaN would otherwise defeat every range comparison downstream.
+    """
+    import math  # function-scope import: only needed for the finiteness check
+    fallback = 60.0
+    try:
+        result = subprocess.run(
+            ["ffprobe", "-v", "error",
+             "-show_entries", "format=duration",
+             "-of", "default=noprint_wrappers=1:nokey=1",
+             audio_path],
+            capture_output=True, text=True, timeout=10,
+        )
+        seconds = float(result.stdout.strip())
+    except (OSError, subprocess.SubprocessError, ValueError, TypeError):
+        # ffprobe missing / timed out / empty or "N/A" output / bad argument
+        return fallback
+    if not math.isfinite(seconds) or seconds <= 0:
+        return fallback
+    return seconds
+def _extract_segment(audio_path: str, start_sec: float,
+                     duration: float = 20.0) -> str | None:
+    """Cut a slice from *audio_path* starting at *start_sec*.
+    Both *start_sec* and *duration* are truncated to whole seconds before
+    being passed to ffmpeg. Output of 1 KiB or less is treated as a failed
+    extraction.
+    Returns path to a temp MP3 file (caller must delete it), or None.
+    On every failure path the temp file is removed before returning, so
+    nothing is left behind when ffmpeg is missing, times out or writes no
+    usable output.
+    """
+    seg_path = None
+    try:
+        fd, seg_path = tempfile.mkstemp(suffix=".mp3")
+        os.close(fd)
+        subprocess.run(
+            ["ffmpeg", "-y",
+             "-ss", str(int(start_sec)),
+             "-t",  str(int(duration)),
+             "-i",  audio_path,
+             "-acodec", "libmp3lame", "-q:a", "3",
+             seg_path],
+            capture_output=True, timeout=30,
+        )
+        if os.path.getsize(seg_path) > 1024:
+            return seg_path
+    except Exception:
+        # Best-effort helper: any failure is reported to the caller as None
+        pass
+    if seg_path is not None:
+        try:
+            os.remove(seg_path)
+        except OSError:
+            pass
+    return None
+def _segment_positions(duration: float) -> list[float]:
+    """Plan the start offsets (in seconds) at which the audio is probed.
+    The first offset is always 0.0; the rest are fixed fractions of
+    *duration*, chosen by the first tier whose upper bound is not exceeded:
+      <= 20 s  -> [0]
+      <= 50 s  -> [0, 50%]
+      <= 90 s  -> [0, 33%, 66%]
+      <= 180 s -> [0, 25%, 50%, 75%]
+      > 180 s  -> [0, 20%, 40%, 60%, 80%]
+    """
+    tiers = (
+        (20, ()),
+        (50, (0.50,)),
+        (90, (0.33, 0.66)),
+        (180, (0.25, 0.50, 0.75)),
+    )
+    # Used when the duration exceeds every tier bound
+    fractions = (0.20, 0.40, 0.60, 0.80)
+    for upper_bound, tier_fractions in tiers:
+        if duration <= upper_bound:
+            fractions = tier_fractions
+            break
+    return [0.0] + [duration * share for share in fractions]
+# ─────────────────────────────────────────────────────────────────────────────
+#  Shazam core
+# ─────────────────────────────────────────────────────────────────────────────
+async def _shazam_recognize_file(shazam, path: str) -> dict | None:
+    """Run one recognition on *path* and return the response only if it has a "track" key.
+    Any exception raised while recognising or inspecting the response is
+    swallowed and reported as None, the same as "no match".
+    """
+    try:
+        response = await shazam.recognize(path)
+        matched = bool(response) and "track" in response
+    except Exception:
+        return None
+    return response if matched else None
+async def _shazam_multi_segment(audio_path: str) -> dict | None:
+    """Scan the audio in two passes and return the first Shazam match, or None.
+    Pass 1 probes every planned offset with a 12 s slice; offset 0 is sent as
+    the untouched original file rather than a re-encoded slice.
+    Pass 2 repeats all offsets with 20 s slices and is skipped when only one
+    offset was planned.
+    Progress is printed as the scan proceeds.
+    """
+    if not _HAS_SHAZAM:
+        return None
+    client = Shazam()
+    offsets = _segment_positions(_get_duration(audio_path))
+    total = len(offsets)
+    async def _scan_slice(offset: float, seconds: int):
+        """Cut one slice, recognise it and always delete the temp file; returns (extracted, match)."""
+        clip = _extract_segment(audio_path, offset, duration=seconds)
+        if not clip:
+            return False, None
+        try:
+            return True, await _shazam_recognize_file(client, clip)
+        finally:
+            try:
+                os.remove(clip)
+            except Exception:
+                pass
+    # ── Pass 1: quick 12 s probe ─────────────────────────────────────────────
+    print(f"   ⚡ [Pass 1 – quick probe, 12 s  x  {total} position{'s' if total > 1 else ''}]")
+    for index, offset in enumerate(offsets, start=1):
+        tag = f"@{int(offset)}s" if offset > 0 else "start"
+        print(f"   [Shazam] {index}/{total} {tag}...", end=" ", flush=True)
+        if offset == 0:
+            # The original file is tried first (no re-encoding overhead)
+            extracted, match = True, await _shazam_recognize_file(client, audio_path)
+        else:
+            extracted, match = await _scan_slice(offset, 12)
+        if not extracted:
+            print("(extract failed)")
+            continue
+        if match:
+            print("match!")
+            return match
+        print("no match")
+    # ── Pass 2: deep 20 s scan ───────────────────────────────────────────────
+    if total == 1:
+        # A single planned offset means a short clip; pass 2 adds nothing new
+        return None
+    print()
+    print(f"   [Shazam] Pass 2 – deep scan, 20 s  x  {total} positions")
+    for index, offset in enumerate(offsets, start=1):
+        tag = f"@{int(offset)}s" if offset > 0 else "start"
+        print(f"   [Shazam] {index}/{total} {tag} (20 s)...", end=" ", flush=True)
+        extracted, match = await _scan_slice(offset, 20)
+        if not extracted:
+            print("(extract failed)")
+            continue
+        if match:
+            print("match!")
+            return match
+        print("no match")
+    return None
+# ─────────────────────────────────────────────────────────────────────────────
+#  Result formatter
+# ─────────────────────────────────────────────────────────────────────────────
+def _format_shazam(result: dict) -> dict:
+    """Flatten a Shazam recognition response into the dict consumed by _print_result.
+    Only result["track"] is required. Every other field is optional: missing
+    keys, None values and empty "actions" lists yield empty strings (or
+    "Unknown" for the artist) instead of raising.
+    Artist lookup order: track subtitle, artist aliases, SONG-section
+    metadata (then the section's tabname), and finally the text before
+    " - " in the first hub action name.
+    """
+    track = result["track"]
+    hub = track.get("hub") or {}
+    song_sections = [section for section in (track.get("sections") or [])
+                     if section.get("type") == "SONG"]
+    # ── Artist ────────────────────────────────────────────────────────────────
+    artist = (track.get("subtitle") or "").strip()
+    if not artist and track.get("artists"):
+        aliases = [a.get("alias", "").replace("-", " ").title()
+                   for a in track["artists"] if a.get("alias")]
+        artist = ", ".join(aliases)
+    if not artist:
+        for section in song_sections:
+            for meta in section.get("metadata") or []:
+                if (meta.get("title") or "").lower() in ("artist", "artists"):
+                    artist = (meta.get("text") or "").strip()
+            if not artist:
+                artist = (section.get("tabname") or "").strip()
+    if not artist:
+        # "or [{}]" also covers an explicitly empty actions list
+        hub_actions = hub.get("actions") or [{}]
+        hub_text = hub_actions[0].get("name") or ""
+        if " - " in hub_text:
+            artist = hub_text.split(" - ")[0].strip()
+    # ── Metadata ──────────────────────────────────────────────────────────────
+    details = {"album": "", "released": "", "label": ""}
+    for section in song_sections:
+        for meta in section.get("metadata") or []:
+            key = (meta.get("title") or "").lower()
+            if key in details:
+                details[key] = meta.get("text", "")
+    genre = (track.get("genres") or {}).get("primary", "")
+    # ── Links ─────────────────────────────────────────────────────────────────
+    spotify = ""
+    for provider in hub.get("providers") or []:
+        if provider.get("type") == "SPOTIFY":
+            provider_actions = provider.get("actions") or [{}]
+            spotify = provider_actions[0].get("uri", "")
+    return {
+        "title":        track.get("title", ""),
+        "artist":       artist or "Unknown",
+        "album":        details["album"],
+        "released":     details["released"],
+        "label":        details["label"],
+        "genre":        genre,
+        "shazam_count": track.get("shazamcount", 0),
+        "spotify":      spotify,
+        "apple":        track.get("url", ""),
+        "source":       "Shazam",
+    }
+# ─────────────────────────────────────────────────────────────────────────────
+#  Output printer
+# ─────────────────────────────────────────────────────────────────────────────
+def _print_result(info: dict) -> None:
+    """Print the identified-track report for *info* to stdout.
+    Song and artist are always shown. Album, release date, label, genre, the
+    Shazam count and the individual links appear only when their value is
+    truthy. Counts of 1,000 or more are abbreviated with one decimal and a
+    K or M suffix.
+    """
+    rule = "=" * 70
+    print()
+    print(rule)
+    print(f"\u2705 MUSIC IDENTIFIED  [{info['source']}]")
+    print(rule)
+    print()
+    print(f"\U0001f3b5 Song: {info['title']}")
+    print(f"\U0001f464 Artist: {info['artist']}")
+    optional_rows = (
+        ("album",    "\U0001f4bf Album: "),
+        ("released", "\U0001f4c5 Released: "),
+        ("label",    "\U0001f3f7\ufe0f  Label: "),
+        ("genre",    "\U0001f3b8 Genre: "),
+    )
+    for key, prefix in optional_rows:
+        if info[key]:
+            print(f"{prefix}{info[key]}")
+    count = info["shazam_count"]
+    if count:
+        if count >= 1_000_000:
+            shown = f"{count / 1_000_000:.1f}M"
+        elif count >= 1_000:
+            shown = f"{count / 1_000:.1f}K"
+        else:
+            shown = str(count)
+        print(f"\U0001f525 Shazams: {shown}")
+    print()
+    print("\U0001f517 LINKS:")
+    for key, prefix in (("spotify", "   Spotify: "), ("apple", "   Apple Music: ")):
+        if info[key]:
+            print(f"{prefix}{info[key]}")
+    print()
+    print(rule)
+# ─────────────────────────────────────────────────────────────────────────────
+#  Public API
+# ─────────────────────────────────────────────────────────────────────────────
+async def identify_music(audio_path: str) -> None:
+    """Identify the music in *audio_path* and print the outcome.
+    Prints a banner, then stops early with a message if the file does not
+    exist, its extension is not a supported audio/video type, or shazamio is
+    not installed. Otherwise it runs the multi-segment scan and prints either
+    the formatted match or a list of likely reasons for a miss.
+    """
+    banner = "=" * 70
+    print(banner)
+    print("MUSIC IDENTIFIER  (Shazam – optimized multi-segment)")
+    print(banner)
+    print()
+    target = Path(audio_path)
+    if not target.exists():
+        print(f"File not found: {audio_path}")
+        return
+    supported = {".mp3", ".wav", ".m4a", ".ogg", ".flac",
+                 ".aac", ".mp4", ".avi", ".mov"}
+    extension = target.suffix
+    if extension.lower() not in supported:
+        print(f"Unsupported file type: {extension}")
+        return
+    if not _HAS_SHAZAM:
+        print("shazamio is not installed. Run:  pip install shazamio")
+        return
+    print(f"Analyzing: {target.name}")
+    print()
+    match = await _shazam_multi_segment(str(target))
+    if match:
+        _print_result(_format_shazam(match))
+        return
+    no_match_lines = (
+        "",
+        "No match found. The audio might be:",
+        "   - Original / unreleased / user-created music",
+        "   - Too short or poor audio quality",
+        "   - Background / ambient sound without a clear melody",
+        "   - In a niche regional catalogue not yet in Shazam's DB",
+    )
+    for line in no_match_lines:
+        print(line)
+# ─────────────────────────────────────────────────────────────────────────────
+#  CLI entry point
+# ─────────────────────────────────────────────────────────────────────────────
+def main() -> None:
+    """CLI entry point: take the file path from argv[1] or prompt for it, then run the identifier.
+    A path given on the command line has surrounding quote characters and
+    whitespace stripped. An empty path prints a notice and exits without
+    running the scan.
+    """
+    cli_args = sys.argv[1:]
+    if cli_args:
+        audio_path = cli_args[0].strip("\"'").strip()
+    else:
+        rule = "=" * 70
+        for line in (rule, "MUSIC IDENTIFIER  (Shazam – optimized multi-segment)", rule, ""):
+            print(line)
+        audio_path = input("Enter audio/video file path: ").strip()
+    if audio_path:
+        asyncio.run(identify_music(audio_path))
+    else:
+        print("No path provided!")
+if __name__ == "__main__":
+    main()