PyPI - lyrics-transcriber - Versions diffs - 0.58.0__py3-none-any.whl → 0.60.0__py3-none-any.whl - Mend

lyrics-transcriber 0.58.0__py3-none-any.whl → 0.60.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

lyrics_transcriber/frontend/web_assets/index.html CHANGED Viewed

@@ -5,7 +5,7 @@
     <link rel="icon" type="image/svg+xml" href="/vite.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Lyrics Transcriber Analyzer</title>
-    <script type="module" crossorigin src="/assets/index-fO30CduZ.js"></script>
+    <script type="module" crossorigin src="/assets/index-Bktwnsnn.js"></script>
   </head>
   <body>
     <div id="root"></div>

lyrics_transcriber/lyrics/base_lyrics_provider.py CHANGED Viewed

@@ -16,6 +16,7 @@ class LyricsProviderConfig:
     """Configuration for lyrics providers."""
     genius_api_token: Optional[str] = None
+    rapidapi_key: Optional[str] = None
     spotify_cookie: Optional[str] = None
     lyrics_file: Optional[str] = None
     cache_dir: Optional[str] = None

lyrics_transcriber/lyrics/genius.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
 import re
 from typing import Optional, Dict, Any
+import requests
 import lyricsgenius
 from lyrics_transcriber.types import LyricsData, LyricsMetadata
 from lyrics_transcriber.lyrics.base_lyrics_provider import BaseLyricsProvider, LyricsProviderConfig
@@ -12,8 +13,10 @@ class GeniusProvider(BaseLyricsProvider):
     def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
         super().__init__(config, logger)
         self.api_token = config.genius_api_token
+        self.rapidapi_key = config.rapidapi_key
         self.client = None
-        if self.api_token:
+        # Only initialize lyricsgenius client if rapidapi_key is not set
+        if self.api_token and not self.rapidapi_key:
             self.client = lyricsgenius.Genius(
                 self.api_token,
                 verbose=(logger.getEffectiveLevel() == logging.DEBUG if logger else False),
@@ -25,9 +28,17 @@ class GeniusProvider(BaseLyricsProvider):
             )
     def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
-        """Fetch raw song data from Genius API."""
+        """Fetch raw song data from Genius API or RapidAPI."""
+        # Try RapidAPI first if available
+        if self.rapidapi_key:
+            self.logger.info(f"Trying RapidAPI for {artist} - {title}")
+            result = self._fetch_from_rapidapi(artist, title)
+            if result:
+                return result
+        # Fall back to direct Genius API
         if not self.client:
-            self.logger.warning("No Genius API token provided")
+            self.logger.warning("No Genius API token provided and RapidAPI failed")
             return None
         self.logger.info(f"Searching Genius for {artist} - {title}")
@@ -40,8 +51,186 @@ class GeniusProvider(BaseLyricsProvider):
             self.logger.error(f"Error fetching from Genius: {str(e)}")
         return None
+    def _fetch_from_rapidapi(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
+        """Search the RapidAPI Genius endpoint for a song and download its lyrics.
+        Two requests are made: a search for "<artist> <title>", then a lyrics
+        lookup for the first hit whose result carries a song ID. On success the
+        return value is a dict holding the title, primary artist, lyrics text,
+        song ID, URL and display release date, tagged with ``_rapidapi_source``.
+        Returns None when nothing usable is found or when any request or
+        parsing step fails; every failure is logged rather than raised.
+        """
+        try:
+            auth_headers = {
+                "x-rapidapi-key": self.rapidapi_key,
+                "x-rapidapi-host": "genius-song-lyrics1.p.rapidapi.com"
+            }
+            # Step 1: search for the song
+            query = {"q": f"{artist} {title}", "per_page": "10", "page": "1"}
+            self.logger.debug(f"Making RapidAPI search request for '{artist} {title}'")
+            search_reply = requests.get(
+                "https://genius-song-lyrics1.p.rapidapi.com/search/",
+                headers=auth_headers,
+                params=query,
+                timeout=10,
+            )
+            search_reply.raise_for_status()
+            found = search_reply.json()
+            if not found.get("hits"):
+                self.logger.warning("No search results from RapidAPI")
+                return None
+            # Keep the first hit whose result carries a song ID
+            results = (hit.get("result", {}) for hit in found["hits"])
+            song = next((entry for entry in results if entry.get("id")), None)
+            if not song:
+                self.logger.warning("No valid song ID found in RapidAPI search results")
+                return None
+            song_id = song["id"]
+            self.logger.debug(f"Found song ID: {song_id}")
+            # Step 2: fetch the lyrics for that song ID
+            self.logger.debug(f"Making RapidAPI lyrics request for song ID {song_id}")
+            lyrics_reply = requests.get(
+                "https://genius-song-lyrics1.p.rapidapi.com/song/lyrics/",
+                headers=auth_headers,
+                params={"id": str(song_id)},
+                timeout=10,
+            )
+            lyrics_reply.raise_for_status()
+            # The lyrics sit in a nested structure that a dedicated helper unwraps
+            lyrics_text = self._extract_lyrics_from_rapidapi_response(lyrics_reply.json())
+            if not lyrics_text:
+                self.logger.warning("No lyrics found in RapidAPI response")
+                return None
+            # Build a clean payload from selected search fields only, instead of
+            # passing the whole search result through
+            payload = {
+                "title": song.get("title", ""),
+                "primary_artist": song.get("primary_artist", {}),
+                "lyrics": lyrics_text,
+                "id": song_id,
+                "url": song.get("url", ""),
+                "release_date_for_display": song.get("release_date_for_display", ""),
+                # Marker read by _convert_result_format to pick the converter
+                "_rapidapi_source": True
+            }
+            self.logger.info("Successfully fetched lyrics from RapidAPI")
+            return payload
+        except requests.exceptions.RequestException as err:
+            self.logger.error(f"RapidAPI request failed: {str(err)}")
+            return None
+        except Exception as err:
+            self.logger.error(f"Error fetching from RapidAPI: {str(err)}")
+            return None
+    def _extract_lyrics_from_rapidapi_response(self, lyrics_data: Dict[str, Any]) -> Optional[str]:
+        """Pull the lyrics text out of a RapidAPI lyrics response.
+        The payload layout is probed in a fixed order: lyrics.lyrics.body.html,
+        lyrics.lyrics (string), lyrics.body.html, lyrics (string), top-level
+        body.html, then the alternative keys text/content/plain/body under
+        lyrics. HTML values go through _clean_html_lyrics; plain strings are
+        returned unchanged.
+        Args:
+            lyrics_data: Decoded JSON body of the lyrics request.
+        Returns:
+            The lyrics text, or None when no known layout matches or the
+            payload cannot be traversed (both cases are logged).
+        """
+        def body_html(node: Any) -> Optional[str]:
+            """Return node["body"]["html"] when both levels are dicts, else None."""
+            body = node.get("body") if isinstance(node, dict) else None
+            return body.get("html") if isinstance(body, dict) else None
+        try:
+            # Log the actual response structure for debugging
+            self.logger.debug(f"RapidAPI response structure: {lyrics_data}")
+            lyrics_obj = lyrics_data.get("lyrics")
+            # Every level is type-checked before it is traversed: calling .get()
+            # on a string raises AttributeError, which previously made the
+            # plain-string layouts (lyrics as str, lyrics.body as str) unreachable.
+            if isinstance(lyrics_obj, dict):
+                nested_lyrics = lyrics_obj.get("lyrics")
+                # Structure 1: lyrics.lyrics.body.html
+                html_content = body_html(nested_lyrics)
+                if html_content:
+                    return self._clean_html_lyrics(html_content)
+                # Structure 2: lyrics.lyrics (simple string)
+                if isinstance(nested_lyrics, str):
+                    return nested_lyrics
+                # Structure 3: lyrics.body.html (HTML content)
+                html_content = body_html(lyrics_obj)
+                if html_content:
+                    return self._clean_html_lyrics(html_content)
+            elif isinstance(lyrics_obj, str):
+                # Structure 4: direct lyrics field
+                return lyrics_obj
+            # Structure 5: body.html at top level
+            html_content = body_html(lyrics_data)
+            if html_content:
+                return self._clean_html_lyrics(html_content)
+            # Structure 6: lyrics is a dict using one of the alternative keys
+            if isinstance(lyrics_obj, dict):
+                for key in ("text", "content", "plain", "body"):
+                    content = lyrics_obj.get(key)
+                    if isinstance(content, str):
+                        return content
+                    if isinstance(content, dict):
+                        if "html" in content:
+                            return self._clean_html_lyrics(content["html"])
+                        if "text" in content:
+                            return content["text"]
+            self.logger.warning(f"Unknown RapidAPI response structure: {list(lyrics_data.keys())}")
+            if "lyrics" in lyrics_data:
+                self.logger.warning(f"Lyrics object structure: {lyrics_data['lyrics']}")
+            return None
+        except Exception as e:
+            # Best-effort parsing: a payload that is not dict-shaped is reported, not raised
+            self.logger.error(f"Error extracting lyrics from RapidAPI response: {str(e)}")
+            return None
+    def _clean_html_lyrics(self, html_content: str) -> str:
+        """Convert an HTML lyrics fragment into plain text.
+        ``<br>`` tags become newlines, every other tag is dropped, character
+        references are decoded exactly once, the known section markers
+        (``[Verse N]``, ``[Pre-Chorus]``, ``[Chorus]``, ``[Refrain]``,
+        ``[Outro]``, ``[Bridge]``, ``[Intro]``) are removed, and runs of blank
+        lines are collapsed to a single blank line.
+        Args:
+            html_content: Raw HTML of the lyrics body; may be empty or None.
+        Returns:
+            The cleaned lyrics, or "" when there is nothing to clean.
+        """
+        import html
+        if not html_content:
+            return ""
+        # Turn <br> into newlines first so line structure survives tag stripping
+        text = re.sub(r'<br\s*/?>', '\n', html_content)
+        text = re.sub(r'<[^>]+>', '', text)
+        # html.unescape decodes every named and numeric reference in one pass.
+        # The chained str.replace calls it replaces decoded "&amp;quot;" twice
+        # and left "&#39;" / "&apos;" untouched. &nbsp; decodes to U+00A0, which
+        # is mapped to a plain space as before.
+        text = html.unescape(text).replace('\xa0', ' ')
+        # Remove only the bracketed section markers, keeping the lyric text around them
+        text = re.sub(r'\[(?:Verse \d+|Pre-Chorus|Chorus|Refrain|Outro|Bridge|Intro)\]', '', text)
+        # Collapse three or more consecutive line breaks into one blank line
+        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
+        return text.strip()
     def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
         """Convert Genius's raw API response to standardized format."""
+        # Payloads built by _fetch_from_rapidapi carry an explicit marker key;
+        # anything without it is handled as lyricsgenius output
+        if raw_data.get("_rapidapi_source", False):
+            return self._convert_rapidapi_format(raw_data)
+        return self._convert_lyricsgenius_format(raw_data)
+    def _convert_lyricsgenius_format(self, raw_data: Dict[str, Any]) -> LyricsData:
+        """Convert lyricsgenius format to standardized format."""
         # Clean the lyrics before processing
         lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))
@@ -74,6 +263,46 @@ class GeniusProvider(BaseLyricsProvider):
                 "verified_annotations": len(raw_data.get("verified_annotations_by", [])),
                 "verified_contributors": len(raw_data.get("verified_contributors", [])),
                 "external_urls": {"genius": raw_data.get("url")},
+                "api_source": "lyricsgenius",
+            },
+        )
+        # Create segments with words from cleaned lyrics
+        segments = self._create_segments_with_words(lyrics, is_synced=False)
+        # Create result object with segments
+        return LyricsData(source="genius", segments=segments, metadata=metadata)
+    def _convert_rapidapi_format(self, raw_data: Dict[str, Any]) -> LyricsData:
+        """Convert RapidAPI format to standardized format."""
+        # Clean the lyrics before processing
+        lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))
+        # Extract artist name from primary_artist
+        primary_artist = raw_data.get("primary_artist", {})
+        artist_name = primary_artist.get("name", "")
+        # Extract release date from release_date_for_display
+        release_date = raw_data.get("release_date_for_display")
+        # Create metadata object
+        metadata = LyricsMetadata(
+            source="genius",
+            track_name=raw_data.get("title", ""),
+            artist_names=artist_name,
+            album_name=raw_data.get("album", {}).get("name") if raw_data.get("album") else None,
+            lyrics_provider="genius",
+            lyrics_provider_id=str(raw_data.get("id")),
+            is_synced=False,  # Genius doesn't provide synced lyrics
+            provider_metadata={
+                "genius_id": raw_data.get("id"),
+                "release_date": release_date,
+                "page_url": raw_data.get("url"),
+                "annotation_count": raw_data.get("annotation_count"),
+                "lyrics_state": raw_data.get("lyrics_state"),
+                "pyongs_count": raw_data.get("pyongs_count"),
+                "external_urls": {"genius": raw_data.get("url")},
+                "api_source": "rapidapi",
             },
         )
@@ -86,6 +315,19 @@ class GeniusProvider(BaseLyricsProvider):
     def _clean_lyrics(self, lyrics: str) -> str:
         """Clean and process lyrics from Genius to remove unwanted content."""
         self.logger.debug("Starting lyrics cleaning process")
+        # Handle unexpected input types
+        if not isinstance(lyrics, str):
+            self.logger.warning(f"Expected string for lyrics, got {type(lyrics)}: {repr(lyrics)}")
+            if lyrics is None:
+                return ""
+            # Try to convert to string
+            try:
+                lyrics = str(lyrics)
+            except Exception as e:
+                self.logger.error(f"Failed to convert lyrics to string: {e}")
+                return ""
         original = lyrics
         lyrics = lyrics.replace("\\n", "\n")
@@ -123,10 +365,20 @@ class GeniusProvider(BaseLyricsProvider):
         if original != lyrics:
             self.logger.debug("Removed standalone 'Embed' text")
+        # Remove section markers but keep the lyrics content (for non-HTML lyrics)
+        # Instead of removing entire lines, just remove the square bracket markers
         original = lyrics
-        lyrics = re.sub(r".*?\[.*?\].*?", "", lyrics)
+        lyrics = re.sub(r'\[Verse \d+\]', '', lyrics)
+        lyrics = re.sub(r'\[Pre-Chorus\]', '', lyrics)
+        lyrics = re.sub(r'\[Chorus\]', '', lyrics)
+        lyrics = re.sub(r'\[Refrain\]', '', lyrics)
+        lyrics = re.sub(r'\[Outro\]', '', lyrics)
+        lyrics = re.sub(r'\[Bridge\]', '', lyrics)
+        lyrics = re.sub(r'\[Intro\]', '', lyrics)
         if original != lyrics:
-            self.logger.debug("Removed lines containing square brackets")
+            self.logger.debug("Removed section markers while preserving lyrics content")
+        # Remove common LyricsGenius page elements
         self.logger.debug("Completed lyrics cleaning process")
         return lyrics

lyrics_transcriber/lyrics/musixmatch.py ADDED Viewed

@@ -0,0 +1,156 @@
+import logging
+from typing import Optional, Dict, Any
+import requests
+from lyrics_transcriber.types import LyricsData, LyricsMetadata
+from lyrics_transcriber.lyrics.base_lyrics_provider import BaseLyricsProvider, LyricsProviderConfig
+class MusixmatchProvider(BaseLyricsProvider):
+    """Lyrics provider that queries Musixmatch through its RapidAPI endpoint.
+    Needs ``config.rapidapi_key``; without it every fetch logs a warning and
+    returns None.
+    """
+    def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
+        """Run the base-class setup, then keep the RapidAPI key from ``config``."""
+        super().__init__(config, logger)
+        self.rapidapi_key = config.rapidapi_key
+    def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
+        """Fetch the raw Musixmatch response for ``artist`` / ``title``.
+        Returns the decoded JSON document when it contains a lyrics entry, and
+        None otherwise (no API key, HTTP/network failure, unexpected response
+        shape). Failures are logged, never raised.
+        """
+        from urllib.parse import quote
+        if not self.rapidapi_key:
+            self.logger.warning("No RapidAPI key provided for Musixmatch")
+            return None
+        self.logger.info(f"Fetching lyrics from Musixmatch for {artist} - {title}")
+        try:
+            # artist and title are URL path segments: percent-encode them ("/"
+            # included) so names such as "AC/DC", or titles containing "?" or
+            # "#", cannot alter the route or truncate the request
+            url = (
+                "https://musixmatch-song-lyrics-api.p.rapidapi.com/lyrics/"
+                f"{quote(artist, safe='')}/{quote(title, safe='')}/"
+            )
+            headers = {
+                "x-rapidapi-key": self.rapidapi_key,
+                "x-rapidapi-host": "musixmatch-song-lyrics-api.p.rapidapi.com"
+            }
+            self.logger.debug(f"Making Musixmatch API request to: {url}")
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+            # Check if we got a valid response
+            if not data.get("message", {}).get("body", {}).get("macro_calls"):
+                self.logger.warning("Invalid response structure from Musixmatch API")
+                return None
+            # Check if lyrics are available
+            lyrics_data = data.get("message", {}).get("body", {}).get("macro_calls", {}).get("track.lyrics.get", {})
+            if not lyrics_data.get("message", {}).get("body", {}).get("lyrics"):
+                self.logger.warning("No lyrics found in Musixmatch response")
+                return None
+            self.logger.info("Successfully fetched lyrics from Musixmatch")
+            return data
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"Musixmatch API request failed: {str(e)}")
+            return None
+        except Exception as e:
+            # Also covers responses whose nested levels are not dicts
+            self.logger.error(f"Error fetching from Musixmatch: {str(e)}")
+            return None
+    def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
+        """Convert Musixmatch's raw API response to standardized format.
+        Track details are read from the ``matcher.track.get`` macro call and the
+        lyrics from ``track.lyrics.get``. If anything in the conversion raises,
+        an empty LyricsData is returned whose provider metadata records the
+        error text under ``conversion_error``.
+        """
+        try:
+            # Extract macro calls from the nested response
+            macro_calls = raw_data.get("message", {}).get("body", {}).get("macro_calls", {})
+            # Extract track information
+            track_data = macro_calls.get("matcher.track.get", {}).get("message", {}).get("body", {}).get("track", {})
+            # Extract lyrics information
+            lyrics_data = macro_calls.get("track.lyrics.get", {}).get("message", {}).get("body", {}).get("lyrics", {})
+            # Get the actual lyrics text
+            lyrics_text = lyrics_data.get("lyrics_body", "")
+            # Clean the lyrics
+            lyrics_text = self._clean_lyrics(lyrics_text)
+            # Create metadata object
+            metadata = LyricsMetadata(
+                source="musixmatch",
+                track_name=track_data.get("track_name", ""),
+                artist_names=track_data.get("artist_name", ""),
+                album_name=track_data.get("album_name", ""),
+                # track_length is multiplied by 1000 to get milliseconds; a missing or zero value maps to None
+                duration_ms=track_data.get("track_length", 0) * 1000 if track_data.get("track_length") else None,
+                explicit=bool(track_data.get("explicit", 0)),
+                language=lyrics_data.get("lyrics_language", ""),
+                is_synced=False,  # Musixmatch API doesn't provide sync data in this format
+                lyrics_provider="musixmatch",
+                lyrics_provider_id=str(lyrics_data.get("lyrics_id", "")),
+                provider_metadata={
+                    "musixmatch_track_id": track_data.get("track_id"),
+                    "musixmatch_lyrics_id": lyrics_data.get("lyrics_id"),
+                    "album_id": track_data.get("album_id"),
+                    "artist_id": track_data.get("artist_id"),
+                    "track_share_url": track_data.get("track_share_url"),
+                    "track_edit_url": track_data.get("track_edit_url"),
+                    "lyrics_language": lyrics_data.get("lyrics_language"),
+                    "lyrics_language_description": lyrics_data.get("lyrics_language_description"),
+                    "lyrics_copyright": lyrics_data.get("lyrics_copyright"),
+                    "track_rating": track_data.get("track_rating"),
+                    "num_favourite": track_data.get("num_favourite"),
+                    "first_release_date": track_data.get("first_release_date"),
+                    "spotify_id": track_data.get("track_spotify_id"),
+                    "isrc": track_data.get("track_isrc"),
+                    "api_source": "rapidapi_musixmatch",
+                },
+            )
+            # Create segments with words from lyrics
+            segments = self._create_segments_with_words(lyrics_text, is_synced=False)
+            # Create result object with segments
+            return LyricsData(source="musixmatch", segments=segments, metadata=metadata)
+        except Exception as e:
+            self.logger.error(f"Error converting Musixmatch response format: {str(e)}")
+            # Return empty lyrics data if conversion fails
+            return LyricsData(
+                source="musixmatch",
+                segments=[],
+                metadata=LyricsMetadata(
+                    source="musixmatch",
+                    track_name="",
+                    artist_names="",
+                    lyrics_provider="musixmatch",
+                    is_synced=False,
+                    provider_metadata={"api_source": "rapidapi_musixmatch", "conversion_error": str(e)},
+                )
+            )
+    def _clean_lyrics(self, lyrics: str) -> str:
+        """Clean and process lyrics from Musixmatch to remove unwanted content.
+        Escaped ``\\n`` sequences become real newlines, HTML tags are stripped,
+        runs of blank lines are collapsed and the result is trimmed. ``None``
+        yields ""; any other non-string value is converted with ``str()``.
+        """
+        if not isinstance(lyrics, str):
+            self.logger.warning(f"Expected string for lyrics, got {type(lyrics)}: {repr(lyrics)}")
+            if lyrics is None:
+                return ""
+            try:
+                lyrics = str(lyrics)
+            except Exception as e:
+                self.logger.error(f"Failed to convert lyrics to string: {e}")
+                return ""
+        # Replace escaped newlines with actual newlines, handling whitespace
+        import re
+        lyrics = re.sub(r'\s*\\n\s*', '\n', lyrics)
+        # Remove any HTML tags that might be present
+        lyrics = re.sub(r'<[^>]+>', '', lyrics)
+        # Clean up multiple consecutive newlines
+        lyrics = re.sub(r'\n\s*\n\s*\n+', '\n\n', lyrics)
+        # Clean up leading/trailing whitespace
+        lyrics = lyrics.strip()
+        self.logger.debug("Completed Musixmatch lyrics cleaning process")
+        return lyrics

lyrics-transcriber 0.58.0__py3-none-any.whl → 0.60.0__py3-none-any.whl

lyrics-transcriber 0.58.0__py3-none-any.whl → 0.60.0__py3-none-any.whl