lyrics-transcriber 0.30.0__py3-none-any.whl → 0.30.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
1
+ import logging
2
+ import difflib
3
+ from typing import Any, Dict, List, Optional, Set, Tuple
4
+
5
+ from ..transcribers.base_transcriber import TranscriptionData, LyricsSegment, Word, TranscriptionResult
6
+ from ..lyrics.base_lyrics_provider import LyricsData
7
+ from .base_strategy import CorrectionResult, CorrectionStrategy
8
+
9
+
10
class DiffBasedCorrector(CorrectionStrategy):
    """
    Implements word-diff based correction strategy using anchor words
    to align and correct transcribed lyrics.

    Key Features:
    - Uses reference lyrics to build a word -> replacement frequency map
    - Preserves timing information from original transcription
    - Provides detailed metadata about corrections made
    - Keeps the original word when no replacement candidate exists

    Potential Improvements:
    1. Add phonetic matching for better word alignment (e.g., Soundex or Metaphone)
    2. Implement context-aware corrections using surrounding words
    3. Use more sophisticated alignment algorithms (e.g., Smith-Waterman)
    4. Add validation using language models to ensure semantic consistency
    5. Implement word normalization (e.g., handling contractions, punctuation)
    """

    # Common English words that are never considered anchors.
    _STOP_WORDS = frozenset(
        {
            "a",
            "an",
            "and",
            "are",
            "as",
            "at",
            "be",
            "by",
            "for",
            "from",
            "has",
            "he",
            "in",
            "is",
            "it",
            "its",
            "of",
            "on",
            "that",
            "the",
            "to",
            "was",
            "were",
            "will",
            "with",
        }
    )

    def __init__(self, logger: Optional[logging.Logger] = None):
        """Store the given logger, or fall back to this module's logger."""
        self.logger = logger or logging.getLogger(__name__)

    def _find_anchor_words(self, segments: List[LyricsSegment]) -> Set[str]:
        """
        Identify potential anchor words from transcribed segments.

        Since we don't have confidence values, these heuristics are used on
        the lower-cased, stripped word text:
        1. Words of 2 characters or fewer and stop words are ignored
        2. Words of 4+ characters are anchors (more likely to be distinctive)
        3. Any remaining word becomes an anchor once it has been seen twice

        Returns:
            The set of lower-cased anchor words.
        """
        anchors: Set[str] = set()
        occurrences: Dict[str, int] = {}

        for segment in segments:
            for word in segment.words:
                word_lower = word.text.lower().strip()

                # Skip very short words and stop words
                if len(word_lower) <= 2 or word_lower in self._STOP_WORDS:
                    continue

                occurrences[word_lower] = occurrences.get(word_lower, 0) + 1

                # Repeated words and longer words (4+ chars) are good anchors
                if occurrences[word_lower] >= 2 or len(word_lower) >= 4:
                    anchors.add(word_lower)

        return anchors

    def _align_texts(self, source_text: str, target_text: str) -> List[Tuple[str, str]]:
        """
        Align two texts using difflib and return word pairs.

        Both texts are lower-cased and split on whitespace, then compared with
        ``difflib.SequenceMatcher``. Pairs are produced for:
        - "equal" runs: each word paired with its identical counterpart
        - "replace" runs: words paired positionally, up to the shorter side

        Words that exist on only one side ("insert"/"delete" runs, or the
        surplus of an uneven "replace" run) are not paired. Working from
        opcodes keeps both cursors in sync after such runs, so a single
        missing or extra word cannot shift every later pair.

        Returns:
            A list of ``(source_word, target_word)`` tuples in text order.
        """
        # Split into words and convert to lowercase for matching
        source_words = source_text.lower().split()
        target_words = target_text.lower().split()

        matcher = difflib.SequenceMatcher(None, source_words, target_words)

        alignments: List[Tuple[str, str]] = []
        for tag, src_start, src_end, tgt_start, tgt_end in matcher.get_opcodes():
            if tag in ("equal", "replace"):
                # zip() stops at the shorter side of an uneven "replace" run
                alignments.extend(zip(source_words[src_start:src_end], target_words[tgt_start:tgt_end]))

        return alignments

    def _create_correction_mapping(
        self, transcription: TranscriptionData, lyrics_results: List[LyricsData], anchor_words: Set[str]
    ) -> Dict[str, Dict[str, int]]:
        """
        Create a mapping of potential corrections based on aligned texts.

        For every lyrics source, ``transcription.text`` is aligned against
        ``lyrics.lyrics`` and each differing word pair is counted.

        Args:
            transcription: Primary transcription; only its ``text`` is read.
            lyrics_results: Reference lyrics sources to align against.
            anchor_words: Accepted for interface compatibility; not consulted
                by the current implementation.

        Returns:
            ``{transcribed_word: {replacement_word: count}}`` over all sources.
        """
        correction_counts: Dict[str, Dict[str, int]] = {}

        for lyrics in lyrics_results:
            alignments = self._align_texts(transcription.text, lyrics.lyrics)

            for trans_word, lyrics_word in alignments:
                trans_word = trans_word.strip()
                lyrics_word = lyrics_word.strip()

                # Identical words need no correction
                if trans_word == lyrics_word:
                    continue

                candidates = correction_counts.setdefault(trans_word, {})
                candidates[lyrics_word] = candidates.get(lyrics_word, 0) + 1

        return correction_counts

    def correct(
        self,
        transcription_results: List[TranscriptionResult],
        lyrics_results: List[LyricsData],
    ) -> CorrectionResult:
        """
        Apply diff-based correction algorithm.

        The transcription with the smallest ``priority`` value is the primary
        source. Each of its words that has at least one replacement candidate
        is swapped for the most frequent candidate, keeping the original
        start/end times; every other word is kept unchanged.

        Returns:
            A ``CorrectionResult`` whose ``confidence`` is
            ``1 - corrections_made / total_words`` (``1`` when there are no words).

        Raises:
            ValueError: If ``transcription_results`` is empty.
        """
        self.logger.info("Starting diff-based correction")

        # Sort transcription results by priority
        sorted_results = sorted(transcription_results, key=lambda x: x.priority)
        if not sorted_results:
            raise ValueError("No transcription results available")

        # Use highest priority transcription as primary source
        primary_transcription = sorted_results[0].result

        # Find anchor words from all transcriptions
        anchor_words = self._find_anchor_words(primary_transcription.segments)
        for result in sorted_results[1:]:
            anchor_words.update(self._find_anchor_words(result.result.segments))

        # Create correction mapping
        corrections = self._create_correction_mapping(primary_transcription, lyrics_results, anchor_words)

        # Apply corrections while preserving timing
        corrected_segments = []
        corrections_made = 0
        source_mapping = {}

        for segment in primary_transcription.segments:
            corrected_words = []

            for word in segment.words:
                word_lower = word.text.lower().strip()
                possible_corrections = corrections.get(word_lower)

                if not possible_corrections:
                    # No correction known: keep original word
                    corrected_words.append(word)
                    continue

                # Most frequent candidate; ties resolve to the first one recorded
                best_correction = max(possible_corrections, key=possible_corrections.get)

                corrected_words.append(
                    Word(
                        text=best_correction,
                        start_time=word.start_time,
                        end_time=word.end_time,
                        confidence=None,  # We don't have confidence values
                    )
                )
                corrections_made += 1
                source_mapping[best_correction] = "internet_lyrics"

            # Create new segment with corrected words, keeping segment timing
            corrected_segments.append(
                LyricsSegment(
                    text=" ".join(w.text for w in corrected_words),
                    words=corrected_words,
                    start_time=segment.start_time,
                    end_time=segment.end_time,
                )
            )

        # Since we don't have confidence values, use a simpler metric
        # based on how many corrections were needed
        total_words = sum(len(segment.words) for segment in corrected_segments)
        correction_ratio = 1 - (corrections_made / total_words if total_words > 0 else 0)

        return CorrectionResult(
            segments=corrected_segments,
            text=" ".join(segment.text for segment in corrected_segments),
            confidence=correction_ratio,  # Use correction ratio as confidence
            corrections_made=corrections_made,
            source_mapping=source_mapping,
            metadata={
                "correction_strategy": "diff_based",
                "anchor_words_count": len(anchor_words),
                "total_words": total_words,
                "correction_ratio": correction_ratio,
                "primary_source": sorted_results[0].name,
            },
        )
@@ -0,0 +1,201 @@
1
+ from dataclasses import dataclass, asdict
2
+ import logging
3
+ from typing import Optional, Dict, Any, List
4
+ import json
5
+ import hashlib
6
+ from pathlib import Path
7
+ import os
8
+ from abc import ABC, abstractmethod
9
+
10
+
11
@dataclass
class Word:
    """A single word together with its timing and optional confidence."""

    text: str
    start_time: float
    end_time: float
    confidence: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict; ``confidence`` is left out when it is None."""
        payload = asdict(self)
        if payload["confidence"] is None:
            payload.pop("confidence")
        return payload
27
+
28
+
29
@dataclass
class LyricsSegment:
    """One segment/line of lyrics: its text, its words, and its start/end times."""

    text: str
    words: List[Word]
    start_time: float
    end_time: float

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict, serializing each word via its ``to_dict``."""
        serialized_words = []
        for word in self.words:
            serialized_words.append(word.to_dict())
        return dict(
            text=self.text,
            words=serialized_words,
            start_time=self.start_time,
            end_time=self.end_time,
        )
46
+
47
+
48
@dataclass
class LyricsProviderConfig:
    """Settings shared by the lyrics providers; every field is optional."""

    # API token handed to the Genius client; without it no Genius client is created.
    genius_api_token: Optional[str] = None
    # Cookie handed to the Spotify client; without it no Spotify client is created.
    spotify_cookie: Optional[str] = None
    # Directory for cached provider responses; caching is disabled when unset.
    cache_dir: Optional[str] = None
    # Path of the audio file whose content hash is used as the cache key.
    audio_filepath: Optional[str] = None
56
+
57
+
58
@dataclass
class LyricsMetadata:
    """Standardized metadata for lyrics results."""

    source: str
    track_name: str
    artist_names: str

    # Common metadata fields
    album_name: Optional[str] = None
    duration_ms: Optional[int] = None
    explicit: Optional[bool] = None
    language: Optional[str] = None
    is_synced: bool = False

    # Lyrics provider details
    lyrics_provider: Optional[str] = None
    lyrics_provider_id: Optional[str] = None

    # Provider-specific metadata; stays None unless a provider supplies a dict
    provider_metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert metadata to dictionary for JSON serialization."""
        return asdict(self)
83
+
84
+
85
@dataclass
class LyricsData:
    """Common result shape returned by every lyrics provider."""

    lyrics: str
    segments: List[LyricsSegment]
    metadata: LyricsMetadata

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of the lyrics text, segments and metadata."""
        payload: Dict[str, Any] = {"lyrics": self.lyrics}
        payload["segments"] = [item.to_dict() for item in self.segments]
        payload["metadata"] = self.metadata.to_dict()
        return payload
96
+
97
+
98
class BaseLyricsProvider(ABC):
    """
    Base class for lyrics providers.

    Subclasses implement ``_fetch_data_from_source`` and
    ``_convert_result_format``; this class adds optional on-disk caching of
    both the raw and the converted result as JSON files in ``cache_dir``.
    """

    def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
        """Read cache settings from ``config`` and create the cache directory if one is set."""
        self.logger = logger or logging.getLogger(__name__)
        self.cache_dir = Path(config.cache_dir) if config.cache_dir else None
        self.audio_filepath = config.audio_filepath
        if self.cache_dir:
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            self.logger.debug(f"Initialized {self.__class__.__name__} with cache dir: {self.cache_dir}")

    def fetch_lyrics(self, artist: str, title: str) -> Optional[LyricsData]:
        """
        Fetch lyrics for a given artist and title, using cache if available.

        Without a cache directory the source is queried directly. Otherwise
        the raw response is looked up in (and, on a miss, written to) the
        cache. The cache key is the audio file's content hash, or the
        artist/title hash when no audio file path was configured.

        Returns:
            The converted lyrics, or None when the source yields nothing.
        """
        if not self.cache_dir:
            return self._fetch_and_convert_result(artist, title)

        # Hashing a missing path would raise, so fall back to artist/title.
        if self.audio_filepath:
            cache_key = self._get_file_hash(self.audio_filepath)
        else:
            cache_key = self._get_artist_title_hash(artist, title)
        raw_cache_path = self._get_cache_path(cache_key, "raw")

        # Try to load from cache first
        raw_data = self._load_from_cache(raw_cache_path)
        if raw_data is not None:
            self.logger.info(f"Using cached lyrics for {artist} - {title}")
            return self._save_and_convert_result(cache_key, raw_data)

        # If not in cache, fetch from source
        raw_result = self._fetch_data_from_source(artist, title)
        if raw_result:
            # Save raw API response
            self._save_to_cache(raw_cache_path, raw_result)
            return self._save_and_convert_result(cache_key, raw_result)

        return None

    def _get_file_hash(self, filepath: str) -> str:
        """Calculate MD5 hash of a file, reading it in 4096-byte chunks."""
        self.logger.debug(f"Calculating hash for file: {filepath}")
        md5_hash = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                md5_hash.update(chunk)
        hash_result = md5_hash.hexdigest()
        self.logger.debug(f"File hash: {hash_result}")
        return hash_result

    def _get_artist_title_hash(self, artist: str, title: str) -> str:
        """Calculate MD5 hash of the lower-cased ``artist_title`` string."""
        combined = f"{artist.lower()}_{title.lower()}"
        return hashlib.md5(combined.encode()).hexdigest()

    def _get_cache_path(self, cache_key: str, suffix: str) -> str:
        """Get the cache file path ``<provider>_<cache_key>_<suffix>.json`` inside the cache dir."""
        return os.path.join(self.cache_dir, f"{self.get_name().lower()}_{cache_key}_{suffix}.json")

    def _save_to_cache(self, cache_path: str, data: Dict[str, Any]) -> None:
        """Write ``data`` to ``cache_path`` as indented UTF-8 JSON."""
        self.logger.debug(f"Saving lyrics to cache: {cache_path}")
        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        self.logger.debug("Cache save completed")

    def _load_from_cache(self, cache_path: str) -> Optional[Dict[str, Any]]:
        """Load data from cache; return None when the file is missing or not valid JSON."""
        self.logger.debug(f"Attempting to load from cache: {cache_path}")
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            self.logger.debug("Lyrics loaded from cache")
            return data
        except FileNotFoundError:
            self.logger.debug("Cache file not found")
            return None
        except json.JSONDecodeError:
            self.logger.warning(f"Cache file {cache_path} is corrupted")
            return None

    def _save_and_convert_result(self, cache_key: str, raw_data: Dict[str, Any]) -> LyricsData:
        """Convert raw result to standardized format, save to cache, and return."""
        converted_cache_path = self._get_cache_path(cache_key, "converted")
        converted_result = self._convert_result_format(raw_data)
        # Convert to dictionary before saving to cache
        self._save_to_cache(converted_cache_path, converted_result.to_dict())
        return converted_result

    def _fetch_and_convert_result(self, artist: str, title: str) -> Optional[LyricsData]:
        """Fetch and convert result when caching is disabled."""
        raw_result = self._fetch_data_from_source(artist, title)
        if raw_result:
            return self._convert_result_format(raw_result)
        return None

    @abstractmethod
    def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
        """Fetch raw data from the source (implemented by subclasses)."""
        raise NotImplementedError("Subclasses must implement _fetch_data_from_source")

    @abstractmethod
    def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
        """Convert raw API response to standardized format (implemented by subclasses)."""
        raise NotImplementedError("Subclasses must implement _convert_result_format")

    def get_name(self) -> str:
        """Return the class name with every "Provider" occurrence removed."""
        return self.__class__.__name__.replace("Provider", "")
@@ -0,0 +1,70 @@
1
+ import logging
2
+ from typing import Optional, Dict, Any
3
+ import lyricsgenius
4
+ from .base_lyrics_provider import BaseLyricsProvider, LyricsMetadata, LyricsProviderConfig, LyricsData
5
+
6
+
7
class GeniusProvider(BaseLyricsProvider):
    """Handles fetching lyrics from Genius."""

    def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
        """Create a Genius client when an API token is configured; otherwise leave it unset."""
        super().__init__(config, logger)
        self.api_token = config.genius_api_token
        self.client = None
        if self.api_token:
            self.client = lyricsgenius.Genius(self.api_token)
            self.client.verbose = False
            self.client.remove_section_headers = True

    def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
        """
        Fetch raw song data from Genius API.

        Returns:
            The song as a dict, or None when there is no client, no song was
            found, or the lookup raised (the error is logged, not propagated).
        """
        if not self.client:
            self.logger.warning("No Genius API token provided")
            return None

        self.logger.info(f"Searching Genius for {artist} - {title}")
        try:
            song = self.client.search_song(title, artist)
            if song:
                self.logger.info("Found lyrics on Genius")
                return song.to_dict()
        except Exception as e:
            # Best-effort lookup: a failing provider must not abort the caller
            self.logger.error(f"Error fetching from Genius: {str(e)}")
        return None

    def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
        """
        Convert Genius's raw API response to standardized format.

        Keys that are missing or carry a null value are tolerated for
        ``album``, ``verified_annotations_by`` and ``verified_contributors``.
        The result has no segments because no timing data is read from Genius.
        """
        # Extract release date components if available
        release_date = None
        if release_components := raw_data.get("release_date_components"):
            year = release_components.get("year")
            month = release_components.get("month")
            day = release_components.get("day")
            if all(x is not None for x in (year, month, day)):
                release_date = f"{year}-{month:02d}-{day:02d}"

        # `or` also covers a key that is present with a null value, which
        # a plain .get(key, default) would pass through as None.
        album = raw_data.get("album") or {}
        verified_annotations = raw_data.get("verified_annotations_by") or []
        verified_contributors = raw_data.get("verified_contributors") or []

        # Create metadata object
        metadata = LyricsMetadata(
            source="genius",
            track_name=raw_data.get("title", ""),
            artist_names=raw_data.get("artist_names", ""),
            album_name=album.get("name"),
            lyrics_provider="genius",
            lyrics_provider_id=str(raw_data.get("id")),
            is_synced=False,  # Genius doesn't provide synced lyrics
            provider_metadata={
                "genius_id": raw_data.get("id"),
                "release_date": release_date,
                "page_url": raw_data.get("url"),
                "annotation_count": raw_data.get("annotation_count"),
                "lyrics_state": raw_data.get("lyrics_state"),
                "lyrics_owner_id": raw_data.get("lyrics_owner_id"),
                "pyongs_count": raw_data.get("pyongs_count"),
                "verified_annotations": len(verified_annotations),
                "verified_contributors": len(verified_contributors),
                "external_urls": {"genius": raw_data.get("url")},
            },
        )

        # Create result object
        return LyricsData(lyrics=raw_data.get("lyrics", ""), segments=[], metadata=metadata)  # Genius doesn't provide timestamp data
@@ -0,0 +1,82 @@
1
+ import logging
2
+ from typing import Optional, Dict, Any
3
+ import syrics.api
4
+
5
+ from lyrics_transcriber.lyrics.base_lyrics_provider import LyricsSegment, Word
6
+ from .base_lyrics_provider import BaseLyricsProvider, LyricsProviderConfig, LyricsMetadata, LyricsData
7
+
8
+
9
class SpotifyProvider(BaseLyricsProvider):
    """Handles fetching lyrics from Spotify."""

    def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger] = None):
        """Create a syrics Spotify client when a cookie is configured; otherwise leave it unset."""
        super().__init__(config, logger)
        self.cookie = config.spotify_cookie
        self.client = syrics.api.Spotify(self.cookie) if self.cookie else None

    def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
        """
        Fetch raw data from Spotify APIs using syrics library.

        Returns:
            ``{"track_data": ..., "lyrics_data": ...}``, or None when there is
            no client, no matching track, no lyrics, or the lookup raised
            (the error is logged, not propagated).
        """
        if not self.client:
            self.logger.warning("No Spotify cookie provided")
            return None

        try:
            # Search for track
            search_query = f"{title} - {artist}"
            search_results = self.client.search(search_query, type="track", limit=1)

            # An empty result list is a normal "not found", not an error
            items = search_results["tracks"]["items"]
            if not items:
                self.logger.warning(f"No Spotify track found for {artist} - {title}")
                return None

            track_data = items[0]
            self.logger.debug(
                f"Found track: {track_data['artists'][0]['name']} - {track_data['name']} " f"({track_data['external_urls']['spotify']})"
            )

            # Get lyrics data
            lyrics_data = self.client.get_lyrics(track_data["id"])
            if not lyrics_data:
                return None

            return {"track_data": track_data, "lyrics_data": lyrics_data}
        except Exception as e:
            # Best-effort lookup: a failing provider must not abort the caller
            self.logger.error(f"Error fetching from Spotify: {str(e)}")
            return None

    @staticmethod
    def _ms_to_seconds(value: Any) -> Optional[float]:
        """Convert a millisecond value to seconds; a missing value or the string "0" gives None."""
        if value is None or value == "0":
            return None
        return float(value) / 1000

    def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
        """
        Convert Spotify's raw API response to standardized format.

        Lines with empty ``words`` are skipped. Each remaining line becomes a
        segment without per-word data; a missing or "0" timestamp becomes None.
        """
        track_data = raw_data["track_data"]
        lyrics_data = raw_data["lyrics_data"]["lyrics"]

        # Convert raw lines to LyricsSegment objects
        segments = []
        for line in lyrics_data.get("lines", []):
            if not line.get("words"):
                continue

            segments.append(
                LyricsSegment(
                    text=line["words"],
                    words=[],  # TODO: Could potentially split words if needed
                    start_time=self._ms_to_seconds(line.get("startTimeMs")),
                    end_time=self._ms_to_seconds(line.get("endTimeMs")),
                )
            )

        # `or` also covers an "album" key that is present with a null value
        album = track_data.get("album") or {}

        # Create metadata object
        metadata = LyricsMetadata(
            source="spotify",
            track_name=track_data.get("name"),
            artist_names=", ".join(artist.get("name", "") for artist in track_data.get("artists", [])),
            album_name=album.get("name"),
            duration_ms=track_data.get("duration_ms"),
            explicit=track_data.get("explicit"),
            language=lyrics_data.get("language"),
            is_synced=lyrics_data.get("syncType") == "LINE_SYNCED",
            lyrics_provider=lyrics_data.get("provider"),
            lyrics_provider_id=lyrics_data.get("providerLyricsId"),
            provider_metadata={
                "spotify_id": track_data.get("id"),
                "preview_url": track_data.get("preview_url"),
                "external_urls": track_data.get("external_urls"),
                "sync_type": lyrics_data.get("syncType"),
            },
        )

        return LyricsData(lyrics="\n".join(segment.text for segment in segments), segments=segments, metadata=metadata)