lyrics-transcriber 0.58.0__py3-none-any.whl → 0.60.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@
5
5
  <link rel="icon" type="image/svg+xml" href="/vite.svg" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>Lyrics Transcriber Analyzer</title>
8
- <script type="module" crossorigin src="/assets/index-fO30CduZ.js"></script>
8
+ <script type="module" crossorigin src="/assets/index-Bktwnsnn.js"></script>
9
9
  </head>
10
10
  <body>
11
11
  <div id="root"></div>
@@ -16,6 +16,7 @@ class LyricsProviderConfig:
16
16
  """Configuration for lyrics providers."""
17
17
 
18
18
  genius_api_token: Optional[str] = None
19
+ rapidapi_key: Optional[str] = None
19
20
  spotify_cookie: Optional[str] = None
20
21
  lyrics_file: Optional[str] = None
21
22
  cache_dir: Optional[str] = None
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import re
3
3
  from typing import Optional, Dict, Any
4
+ import requests
4
5
  import lyricsgenius
5
6
  from lyrics_transcriber.types import LyricsData, LyricsMetadata
6
7
  from lyrics_transcriber.lyrics.base_lyrics_provider import BaseLyricsProvider, LyricsProviderConfig
@@ -12,8 +13,10 @@ class GeniusProvider(BaseLyricsProvider):
12
13
  def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
13
14
  super().__init__(config, logger)
14
15
  self.api_token = config.genius_api_token
16
+ self.rapidapi_key = config.rapidapi_key
15
17
  self.client = None
16
- if self.api_token:
18
+ # Only initialize lyricsgenius client if rapidapi_key is not set
19
+ if self.api_token and not self.rapidapi_key:
17
20
  self.client = lyricsgenius.Genius(
18
21
  self.api_token,
19
22
  verbose=(logger.getEffectiveLevel() == logging.DEBUG if logger else False),
@@ -25,9 +28,17 @@ class GeniusProvider(BaseLyricsProvider):
25
28
  )
26
29
 
27
30
  def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
28
- """Fetch raw song data from Genius API."""
31
+ """Fetch raw song data from Genius API or RapidAPI."""
32
+ # Try RapidAPI first if available
33
+ if self.rapidapi_key:
34
+ self.logger.info(f"Trying RapidAPI for {artist} - {title}")
35
+ result = self._fetch_from_rapidapi(artist, title)
36
+ if result:
37
+ return result
38
+
39
+ # Fall back to direct Genius API
29
40
  if not self.client:
30
- self.logger.warning("No Genius API token provided")
41
+ self.logger.warning("No Genius API token provided and RapidAPI failed")
31
42
  return None
32
43
 
33
44
  self.logger.info(f"Searching Genius for {artist} - {title}")
@@ -40,8 +51,186 @@ class GeniusProvider(BaseLyricsProvider):
40
51
  self.logger.error(f"Error fetching from Genius: {str(e)}")
41
52
  return None
42
53
 
54
def _fetch_from_rapidapi(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
    """Retrieve a song's lyrics through the Genius endpoints hosted on RapidAPI.

    A search request is issued for ``"{artist} {title}"``; the first hit whose
    ``result`` carries an ``id`` is then used for a second request that fetches
    the lyrics. Both requests use a 10 second timeout.

    Args:
        artist: Artist name, used as the first part of the search query.
        title: Song title, used as the second part of the search query.

    Returns:
        A dict holding ``title``, ``primary_artist``, ``lyrics``, ``id``,
        ``url``, ``release_date_for_display`` and ``_rapidapi_source=True``,
        or ``None`` when nothing usable was found or an error occurred
        (errors are logged, never raised).
    """
    try:
        auth_headers = {
            "x-rapidapi-key": self.rapidapi_key,
            "x-rapidapi-host": "genius-song-lyrics1.p.rapidapi.com",
        }

        # Step 1: search for candidate songs.
        self.logger.debug(f"Making RapidAPI search request for '{artist} {title}'")
        search_response = requests.get(
            "https://genius-song-lyrics1.p.rapidapi.com/search/",
            headers=auth_headers,
            params={"q": f"{artist} {title}", "per_page": "10", "page": "1"},
            timeout=10,
        )
        search_response.raise_for_status()
        search_payload = search_response.json()

        if not search_payload.get("hits"):
            self.logger.warning("No search results from RapidAPI")
            return None

        # Take the first hit that actually carries a song ID.
        candidates = (hit.get("result", {}) for hit in search_payload["hits"])
        song = next((candidate for candidate in candidates if candidate.get("id")), None)
        if not song:
            self.logger.warning("No valid song ID found in RapidAPI search results")
            return None

        song_id = song["id"]
        self.logger.debug(f"Found song ID: {song_id}")

        # Step 2: fetch the lyrics for that song ID.
        self.logger.debug(f"Making RapidAPI lyrics request for song ID {song_id}")
        lyrics_response = requests.get(
            "https://genius-song-lyrics1.p.rapidapi.com/song/lyrics/",
            headers=auth_headers,
            params={"id": str(song_id)},
            timeout=10,
        )
        lyrics_response.raise_for_status()

        lyrics_text = self._extract_lyrics_from_rapidapi_response(lyrics_response.json())
        if not lyrics_text:
            self.logger.warning("No lyrics found in RapidAPI response")
            return None

        # Build a fresh dict from selected search fields rather than passing the
        # whole search hit through; the marker lets the converter pick the
        # RapidAPI code path.
        payload = {
            "title": song.get("title", ""),
            "primary_artist": song.get("primary_artist", {}),
            "lyrics": lyrics_text,
            "id": song_id,
            "url": song.get("url", ""),
            "release_date_for_display": song.get("release_date_for_display", ""),
            "_rapidapi_source": True,
        }

        self.logger.info("Successfully fetched lyrics from RapidAPI")
        return payload

    except requests.exceptions.RequestException as e:
        self.logger.error(f"RapidAPI request failed: {str(e)}")
        return None
    except Exception as e:
        self.logger.error(f"Error fetching from RapidAPI: {str(e)}")
        return None
133
+
134
+ def _extract_lyrics_from_rapidapi_response(self, lyrics_data: Dict[str, Any]) -> Optional[str]:
135
+ """Extract lyrics text from RapidAPI response structure."""
136
+ try:
137
+ # Log the actual response structure for debugging
138
+ self.logger.debug(f"RapidAPI response structure: {lyrics_data}")
139
+
140
+ # Try different possible response structures
141
+
142
+ # Structure 1: lyrics.lyrics.body.html (the actual RapidAPI structure)
143
+ nested_lyrics = lyrics_data.get("lyrics", {}).get("lyrics", {})
144
+ if isinstance(nested_lyrics, dict):
145
+ html_content = nested_lyrics.get("body", {}).get("html")
146
+ if html_content:
147
+ return self._clean_html_lyrics(html_content)
148
+
149
+ # Structure 2: lyrics.lyrics (simple string)
150
+ if isinstance(lyrics_data.get("lyrics", {}).get("lyrics"), str):
151
+ return lyrics_data["lyrics"]["lyrics"]
152
+
153
+ # Structure 3: lyrics.body.html (HTML content)
154
+ html_content = lyrics_data.get("lyrics", {}).get("body", {}).get("html")
155
+ if html_content:
156
+ return self._clean_html_lyrics(html_content)
157
+
158
+ # Structure 4: Direct lyrics field
159
+ if isinstance(lyrics_data.get("lyrics"), str):
160
+ return lyrics_data["lyrics"]
161
+
162
+ # Structure 5: body.html at top level
163
+ if lyrics_data.get("body", {}).get("html"):
164
+ return self._clean_html_lyrics(lyrics_data["body"]["html"])
165
+
166
+ # Structure 6: Check if lyrics is a dict with other possible keys
167
+ lyrics_obj = lyrics_data.get("lyrics", {})
168
+ if isinstance(lyrics_obj, dict):
169
+ # Try common alternative keys
170
+ for key in ["text", "content", "plain", "body"]:
171
+ if key in lyrics_obj:
172
+ content = lyrics_obj[key]
173
+ if isinstance(content, str):
174
+ return content
175
+ elif isinstance(content, dict) and "html" in content:
176
+ return self._clean_html_lyrics(content["html"])
177
+ elif isinstance(content, dict) and "text" in content:
178
+ return content["text"]
179
+
180
+ self.logger.warning(f"Unknown RapidAPI response structure: {list(lyrics_data.keys())}")
181
+ if "lyrics" in lyrics_data:
182
+ self.logger.warning(f"Lyrics object structure: {lyrics_data['lyrics']}")
183
+ return None
184
+
185
+ except Exception as e:
186
+ self.logger.error(f"Error extracting lyrics from RapidAPI response: {str(e)}")
187
+ return None
188
+
189
+ def _clean_html_lyrics(self, html_content: str) -> str:
190
+ """Clean HTML content to extract plain text lyrics."""
191
+ import re
192
+
193
+ if not html_content:
194
+ return ""
195
+
196
+ # Remove HTML tags while preserving line breaks
197
+ text = re.sub(r'<br\s*/?>', '\n', html_content) # Convert <br> to newlines
198
+ text = re.sub(r'<[^>]+>', '', text) # Remove all other HTML tags
199
+
200
+ # Decode HTML entities
201
+ text = text.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
202
+ text = text.replace('&quot;', '"').replace('&#x27;', "'").replace('&nbsp;', ' ')
203
+
204
+ # Remove section markers but keep the lyrics content
205
+ # Instead of removing entire lines, just remove the square bracket markers
206
+ text = re.sub(r'\[Verse \d+\]', '', text)
207
+ text = re.sub(r'\[Pre-Chorus\]', '', text)
208
+ text = re.sub(r'\[Chorus\]', '', text)
209
+ text = re.sub(r'\[Refrain\]', '', text)
210
+ text = re.sub(r'\[Outro\]', '', text)
211
+ text = re.sub(r'\[Bridge\]', '', text)
212
+ text = re.sub(r'\[Intro\]', '', text)
213
+
214
+ # Clean up multiple consecutive newlines
215
+ text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
216
+
217
+ # Clean up leading/trailing whitespace
218
+ text = text.strip()
219
+
220
+ return text
221
+
43
222
def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
    """Convert a raw Genius payload to the standardized format.

    Payloads carrying a truthy ``_rapidapi_source`` marker are handed to
    ``_convert_rapidapi_format``; everything else goes to
    ``_convert_lyricsgenius_format``.
    """
    if raw_data.get("_rapidapi_source", False):
        return self._convert_rapidapi_format(raw_data)
    return self._convert_lyricsgenius_format(raw_data)
231
+
232
+ def _convert_lyricsgenius_format(self, raw_data: Dict[str, Any]) -> LyricsData:
233
+ """Convert lyricsgenius format to standardized format."""
45
234
  # Clean the lyrics before processing
46
235
  lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))
47
236
 
@@ -74,6 +263,46 @@ class GeniusProvider(BaseLyricsProvider):
74
263
  "verified_annotations": len(raw_data.get("verified_annotations_by", [])),
75
264
  "verified_contributors": len(raw_data.get("verified_contributors", [])),
76
265
  "external_urls": {"genius": raw_data.get("url")},
266
+ "api_source": "lyricsgenius",
267
+ },
268
+ )
269
+
270
+ # Create segments with words from cleaned lyrics
271
+ segments = self._create_segments_with_words(lyrics, is_synced=False)
272
+
273
+ # Create result object with segments
274
+ return LyricsData(source="genius", segments=segments, metadata=metadata)
275
+
276
+ def _convert_rapidapi_format(self, raw_data: Dict[str, Any]) -> LyricsData:
277
+ """Convert RapidAPI format to standardized format."""
278
+ # Clean the lyrics before processing
279
+ lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))
280
+
281
+ # Extract artist name from primary_artist
282
+ primary_artist = raw_data.get("primary_artist", {})
283
+ artist_name = primary_artist.get("name", "")
284
+
285
+ # Extract release date from release_date_for_display
286
+ release_date = raw_data.get("release_date_for_display")
287
+
288
+ # Create metadata object
289
+ metadata = LyricsMetadata(
290
+ source="genius",
291
+ track_name=raw_data.get("title", ""),
292
+ artist_names=artist_name,
293
+ album_name=raw_data.get("album", {}).get("name") if raw_data.get("album") else None,
294
+ lyrics_provider="genius",
295
+ lyrics_provider_id=str(raw_data.get("id")),
296
+ is_synced=False, # Genius doesn't provide synced lyrics
297
+ provider_metadata={
298
+ "genius_id": raw_data.get("id"),
299
+ "release_date": release_date,
300
+ "page_url": raw_data.get("url"),
301
+ "annotation_count": raw_data.get("annotation_count"),
302
+ "lyrics_state": raw_data.get("lyrics_state"),
303
+ "pyongs_count": raw_data.get("pyongs_count"),
304
+ "external_urls": {"genius": raw_data.get("url")},
305
+ "api_source": "rapidapi",
77
306
  },
78
307
  )
79
308
 
@@ -86,6 +315,19 @@ class GeniusProvider(BaseLyricsProvider):
86
315
  def _clean_lyrics(self, lyrics: str) -> str:
87
316
  """Clean and process lyrics from Genius to remove unwanted content."""
88
317
  self.logger.debug("Starting lyrics cleaning process")
318
+
319
+ # Handle unexpected input types
320
+ if not isinstance(lyrics, str):
321
+ self.logger.warning(f"Expected string for lyrics, got {type(lyrics)}: {repr(lyrics)}")
322
+ if lyrics is None:
323
+ return ""
324
+ # Try to convert to string
325
+ try:
326
+ lyrics = str(lyrics)
327
+ except Exception as e:
328
+ self.logger.error(f"Failed to convert lyrics to string: {e}")
329
+ return ""
330
+
89
331
  original = lyrics
90
332
 
91
333
  lyrics = lyrics.replace("\\n", "\n")
@@ -123,10 +365,20 @@ class GeniusProvider(BaseLyricsProvider):
123
365
  if original != lyrics:
124
366
  self.logger.debug("Removed standalone 'Embed' text")
125
367
 
368
+ # Remove section markers but keep the lyrics content (for non-HTML lyrics)
369
+ # Instead of removing entire lines, just remove the square bracket markers
126
370
  original = lyrics
127
- lyrics = re.sub(r".*?\[.*?\].*?", "", lyrics)
371
+ lyrics = re.sub(r'\[Verse \d+\]', '', lyrics)
372
+ lyrics = re.sub(r'\[Pre-Chorus\]', '', lyrics)
373
+ lyrics = re.sub(r'\[Chorus\]', '', lyrics)
374
+ lyrics = re.sub(r'\[Refrain\]', '', lyrics)
375
+ lyrics = re.sub(r'\[Outro\]', '', lyrics)
376
+ lyrics = re.sub(r'\[Bridge\]', '', lyrics)
377
+ lyrics = re.sub(r'\[Intro\]', '', lyrics)
128
378
  if original != lyrics:
129
- self.logger.debug("Removed lines containing square brackets")
379
+ self.logger.debug("Removed section markers while preserving lyrics content")
380
+
381
+ # Remove common LyricsGenius page elements
130
382
 
131
383
  self.logger.debug("Completed lyrics cleaning process")
132
384
  return lyrics
@@ -0,0 +1,156 @@
1
+ import logging
2
+ from typing import Optional, Dict, Any
3
+ import requests
4
+ from lyrics_transcriber.types import LyricsData, LyricsMetadata
5
+ from lyrics_transcriber.lyrics.base_lyrics_provider import BaseLyricsProvider, LyricsProviderConfig
6
+
7
+
8
class MusixmatchProvider(BaseLyricsProvider):
    """Handles fetching lyrics from Musixmatch via RapidAPI."""

    def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
        """Initialise the base provider and keep the RapidAPI key from ``config``."""
        super().__init__(config, logger)
        self.rapidapi_key = config.rapidapi_key

    def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
        """Fetch raw song data from Musixmatch via RapidAPI.

        Args:
            artist: Artist name; becomes a percent-encoded URL path segment.
            title: Song title; becomes a percent-encoded URL path segment.

        Returns:
            The decoded JSON response when it contains ``macro_calls`` with a
            non-empty ``track.lyrics.get`` lyrics entry; otherwise ``None``.
            Missing key, request failures and unexpected errors are logged
            and also yield ``None``.
        """
        # Local import: ``urllib.parse`` is not part of this module's import block.
        from urllib.parse import quote

        if not self.rapidapi_key:
            self.logger.warning("No RapidAPI key provided for Musixmatch")
            return None

        self.logger.info(f"Fetching lyrics from Musixmatch for {artist} - {title}")

        try:
            # Artist and title are path segments, so every reserved character
            # (including "/", "?" and "#") must be escaped; otherwise names
            # such as "AC/DC" would change the endpoint being requested.
            url = (
                "https://musixmatch-song-lyrics-api.p.rapidapi.com/lyrics/"
                f"{quote(artist, safe='')}/{quote(title, safe='')}/"
            )

            headers = {
                "x-rapidapi-key": self.rapidapi_key,
                "x-rapidapi-host": "musixmatch-song-lyrics-api.p.rapidapi.com",
            }

            self.logger.debug(f"Making Musixmatch API request to: {url}")
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            data = response.json()

            # Check if we got a valid response
            macro_calls = data.get("message", {}).get("body", {}).get("macro_calls")
            if not macro_calls:
                self.logger.warning("Invalid response structure from Musixmatch API")
                return None

            # Check if lyrics are available
            lyrics_call = macro_calls.get("track.lyrics.get", {})
            if not lyrics_call.get("message", {}).get("body", {}).get("lyrics"):
                self.logger.warning("No lyrics found in Musixmatch response")
                return None

            self.logger.info("Successfully fetched lyrics from Musixmatch")
            return data

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Musixmatch API request failed: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Error fetching from Musixmatch: {str(e)}")
            return None

    def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
        """Convert Musixmatch's raw API response to standardized format.

        Track details are read from the ``matcher.track.get`` macro call and
        lyrics details from ``track.lyrics.get``. If anything fails during
        conversion, the error is logged and an empty ``LyricsData`` (no
        segments, ``conversion_error`` in the provider metadata) is returned
        instead of raising.
        """
        try:
            # Extract macro calls from the nested response
            macro_calls = raw_data.get("message", {}).get("body", {}).get("macro_calls", {})

            # Extract track information
            track_data = macro_calls.get("matcher.track.get", {}).get("message", {}).get("body", {}).get("track", {})

            # Extract lyrics information
            lyrics_data = macro_calls.get("track.lyrics.get", {}).get("message", {}).get("body", {}).get("lyrics", {})

            # Get and clean the actual lyrics text
            lyrics_text = self._clean_lyrics(lyrics_data.get("lyrics_body", ""))

            # track_length is multiplied by 1000 to fill duration_ms;
            # a missing or zero length is reported as unknown (None).
            track_length = track_data.get("track_length")
            duration_ms = track_length * 1000 if track_length else None

            # Create metadata object
            metadata = LyricsMetadata(
                source="musixmatch",
                track_name=track_data.get("track_name", ""),
                artist_names=track_data.get("artist_name", ""),
                album_name=track_data.get("album_name", ""),
                duration_ms=duration_ms,
                explicit=bool(track_data.get("explicit", 0)),
                language=lyrics_data.get("lyrics_language", ""),
                is_synced=False,  # Musixmatch API doesn't provide sync data in this format
                lyrics_provider="musixmatch",
                lyrics_provider_id=str(lyrics_data.get("lyrics_id", "")),
                provider_metadata={
                    "musixmatch_track_id": track_data.get("track_id"),
                    "musixmatch_lyrics_id": lyrics_data.get("lyrics_id"),
                    "album_id": track_data.get("album_id"),
                    "artist_id": track_data.get("artist_id"),
                    "track_share_url": track_data.get("track_share_url"),
                    "track_edit_url": track_data.get("track_edit_url"),
                    "lyrics_language": lyrics_data.get("lyrics_language"),
                    "lyrics_language_description": lyrics_data.get("lyrics_language_description"),
                    "lyrics_copyright": lyrics_data.get("lyrics_copyright"),
                    "track_rating": track_data.get("track_rating"),
                    "num_favourite": track_data.get("num_favourite"),
                    "first_release_date": track_data.get("first_release_date"),
                    "spotify_id": track_data.get("track_spotify_id"),
                    "isrc": track_data.get("track_isrc"),
                    "api_source": "rapidapi_musixmatch",
                },
            )

            # Create segments with words from lyrics
            segments = self._create_segments_with_words(lyrics_text, is_synced=False)

            # Create result object with segments
            return LyricsData(source="musixmatch", segments=segments, metadata=metadata)

        except Exception as e:
            self.logger.error(f"Error converting Musixmatch response format: {str(e)}")
            # Return empty lyrics data if conversion fails
            return LyricsData(
                source="musixmatch",
                segments=[],
                metadata=LyricsMetadata(
                    source="musixmatch",
                    track_name="",
                    artist_names="",
                    lyrics_provider="musixmatch",
                    is_synced=False,
                    provider_metadata={"api_source": "rapidapi_musixmatch", "conversion_error": str(e)},
                ),
            )

    def _clean_lyrics(self, lyrics: str) -> str:
        """Clean and process lyrics from Musixmatch to remove unwanted content.

        Non-string input is logged; ``None`` becomes ``""`` and anything else
        is converted with ``str()``. Escaped ``\\n`` sequences (with their
        surrounding whitespace) become real newlines, HTML tags are removed,
        runs of three or more line breaks collapse to one blank line, and the
        result is stripped.
        """
        # Local import: ``re`` is not part of this module's import block.
        import re

        if not isinstance(lyrics, str):
            self.logger.warning(f"Expected string for lyrics, got {type(lyrics)}: {repr(lyrics)}")
            if lyrics is None:
                return ""
            try:
                lyrics = str(lyrics)
            except Exception as e:
                self.logger.error(f"Failed to convert lyrics to string: {e}")
                return ""

        # Replace escaped newlines with actual newlines, handling whitespace
        lyrics = re.sub(r'\s*\\n\s*', '\n', lyrics)

        # Remove any HTML tags that might be present
        lyrics = re.sub(r'<[^>]+>', '', lyrics)

        # Clean up multiple consecutive newlines
        lyrics = re.sub(r'\n\s*\n\s*\n+', '\n\n', lyrics)

        # Clean up leading/trailing whitespace
        lyrics = lyrics.strip()

        self.logger.debug("Completed Musixmatch lyrics cleaning process")
        return lyrics