lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (79)
  1. lyrics_transcriber/cli/cli_main.py +7 -0
  2. lyrics_transcriber/core/config.py +1 -0
  3. lyrics_transcriber/core/controller.py +30 -52
  4. lyrics_transcriber/correction/anchor_sequence.py +325 -150
  5. lyrics_transcriber/correction/corrector.py +224 -107
  6. lyrics_transcriber/correction/handlers/base.py +28 -10
  7. lyrics_transcriber/correction/handlers/extend_anchor.py +47 -24
  8. lyrics_transcriber/correction/handlers/levenshtein.py +75 -33
  9. lyrics_transcriber/correction/handlers/llm.py +290 -0
  10. lyrics_transcriber/correction/handlers/no_space_punct_match.py +81 -36
  11. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +46 -26
  12. lyrics_transcriber/correction/handlers/repeat.py +28 -11
  13. lyrics_transcriber/correction/handlers/sound_alike.py +68 -32
  14. lyrics_transcriber/correction/handlers/syllables_match.py +80 -30
  15. lyrics_transcriber/correction/handlers/word_count_match.py +36 -19
  16. lyrics_transcriber/correction/handlers/word_operations.py +68 -22
  17. lyrics_transcriber/correction/text_utils.py +3 -7
  18. lyrics_transcriber/frontend/.yarn/install-state.gz +0 -0
  19. lyrics_transcriber/frontend/.yarn/releases/yarn-4.6.0.cjs +934 -0
  20. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  21. lyrics_transcriber/frontend/dist/assets/{index-DKnNJHRK.js → index-coH8y7gV.js} +16284 -9032
  22. lyrics_transcriber/frontend/dist/assets/index-coH8y7gV.js.map +1 -0
  23. lyrics_transcriber/frontend/dist/index.html +1 -1
  24. lyrics_transcriber/frontend/package.json +6 -2
  25. lyrics_transcriber/frontend/src/App.tsx +18 -2
  26. lyrics_transcriber/frontend/src/api.ts +103 -6
  27. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +7 -6
  28. lyrics_transcriber/frontend/src/components/DetailsModal.tsx +86 -59
  29. lyrics_transcriber/frontend/src/components/EditModal.tsx +93 -43
  30. lyrics_transcriber/frontend/src/components/FileUpload.tsx +2 -2
  31. lyrics_transcriber/frontend/src/components/Header.tsx +251 -0
  32. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +303 -265
  33. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +117 -0
  34. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +125 -40
  35. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +129 -115
  36. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +59 -78
  37. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +40 -16
  38. lyrics_transcriber/frontend/src/components/WordEditControls.tsx +4 -10
  39. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +137 -68
  40. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +1 -1
  41. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +85 -115
  42. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  43. lyrics_transcriber/frontend/src/components/shared/types.ts +15 -7
  44. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +35 -0
  45. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  46. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +7 -7
  47. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +121 -0
  48. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  49. lyrics_transcriber/frontend/src/types.js +2 -0
  50. lyrics_transcriber/frontend/src/types.ts +70 -49
  51. lyrics_transcriber/frontend/src/validation.ts +132 -0
  52. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  53. lyrics_transcriber/frontend/yarn.lock +3752 -0
  54. lyrics_transcriber/lyrics/base_lyrics_provider.py +75 -12
  55. lyrics_transcriber/lyrics/file_provider.py +6 -5
  56. lyrics_transcriber/lyrics/genius.py +5 -2
  57. lyrics_transcriber/lyrics/spotify.py +58 -21
  58. lyrics_transcriber/output/ass/config.py +16 -5
  59. lyrics_transcriber/output/cdg.py +8 -8
  60. lyrics_transcriber/output/generator.py +29 -14
  61. lyrics_transcriber/output/plain_text.py +15 -10
  62. lyrics_transcriber/output/segment_resizer.py +16 -3
  63. lyrics_transcriber/output/subtitles.py +56 -2
  64. lyrics_transcriber/output/video.py +107 -1
  65. lyrics_transcriber/review/__init__.py +0 -1
  66. lyrics_transcriber/review/server.py +337 -164
  67. lyrics_transcriber/transcribers/audioshake.py +3 -0
  68. lyrics_transcriber/transcribers/base_transcriber.py +11 -3
  69. lyrics_transcriber/transcribers/whisper.py +11 -1
  70. lyrics_transcriber/types.py +151 -105
  71. lyrics_transcriber/utils/word_utils.py +27 -0
  72. {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/METADATA +3 -1
  73. {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/RECORD +76 -63
  74. {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/WHEEL +1 -1
  75. lyrics_transcriber/frontend/dist/assets/index-DKnNJHRK.js.map +0 -1
  76. lyrics_transcriber/frontend/package-lock.json +0 -4260
  77. lyrics_transcriber/frontend/src/components/shared/utils/initializeDataWithIds.tsx +0 -202
  78. {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/LICENSE +0 -0
  79. {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/entry_points.txt +0 -0
@@ -8,9 +8,10 @@ from pathlib import Path
 import json
 import hashlib
 
-from lyrics_transcriber.types import PhraseScore, AnchorSequence, GapSequence, ScoredAnchor
+from lyrics_transcriber.types import LyricsData, PhraseScore, AnchorSequence, GapSequence, ScoredAnchor, TranscriptionResult, Word
 from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer
 from lyrics_transcriber.correction.text_utils import clean_text
+from lyrics_transcriber.utils.word_utils import WordUtils
 
 
 class AnchorSequenceFinder:
@@ -93,38 +94,81 @@ class AnchorSequenceFinder:
                 return anchor
         return None
 
-    def _get_cache_key(self, transcribed: str, references: Dict[str, str]) -> str:
+    def _get_cache_key(self, transcribed: str, references: Dict[str, LyricsData], transcription_result: TranscriptionResult) -> str:
         """Generate a unique cache key for the input combination."""
-        # Create a string that uniquely identifies the inputs
-        input_str = f"{transcribed}|{'|'.join(f'{k}:{v}' for k,v in sorted(references.items()))}"
+        # Create a string that uniquely identifies the inputs, but only using stable content
+        # Use only the text content, not IDs or other potentially varying metadata
+        ref_texts = []
+        for source, lyrics in sorted(references.items()):
+            text = " ".join(w.text for s in lyrics.segments for w in s.words)
+            ref_texts.append(f"{source}:{text}")
+
+        input_str = f"{transcribed}|" f"{','.join(ref_texts)}"
        return hashlib.md5(input_str.encode()).hexdigest()
 
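The key change above: the cache key is now derived only from stable text content (the transcribed text plus each reference's word text), so regenerated word IDs or other varying metadata no longer invalidate the cache. A minimal sketch of that behaviour; the Word/Segment/LyricsData classes below are simplified, hypothetical stand-ins for the real types in lyrics_transcriber.types:

    import hashlib
    from dataclasses import dataclass
    from typing import Dict, List

    @dataclass
    class Word:
        id: str
        text: str

    @dataclass
    class Segment:
        words: List[Word]

    @dataclass
    class LyricsData:
        segments: List[Segment]

    def stable_cache_key(transcribed: str, references: Dict[str, LyricsData]) -> str:
        # Only word text contributes to the key; word IDs are deliberately ignored
        ref_texts = []
        for source, lyrics in sorted(references.items()):
            text = " ".join(w.text for s in lyrics.segments for w in s.words)
            ref_texts.append(f"{source}:{text}")
        return hashlib.md5(f"{transcribed}|{','.join(ref_texts)}".encode()).hexdigest()

    # Same text under different word IDs produces the same key
    a = LyricsData([Segment([Word("id-1", "hello"), Word("id-2", "world")])])
    b = LyricsData([Segment([Word("id-9", "hello"), Word("id-8", "world")])])
    assert stable_cache_key("hello world", {"genius": a}) == stable_cache_key("hello world", {"genius": b})
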
-    def _save_to_cache(self, cache_path: Path, data: Any) -> None:
+    def _save_to_cache(self, cache_path: Path, anchors: List[ScoredAnchor]) -> None:
         """Save results to cache file."""
         self.logger.debug(f"Saving to cache: {cache_path}")
+        # Convert to dictionary format that matches the expected loading format
+        cache_data = [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in anchors]
         with open(cache_path, "w") as f:
-            json.dump(data, f, indent=2)
+            json.dump(cache_data, f, indent=2)
 
-    def _load_from_cache(self, cache_path: Path) -> Optional[Any]:
+    def _load_from_cache(self, cache_path: Path) -> Optional[List[ScoredAnchor]]:
         """Load results from cache if available."""
         try:
             self.logger.debug(f"Attempting to load from cache: {cache_path}")
             with open(cache_path, "r") as f:
-                return json.load(f)
-        except (FileNotFoundError, json.JSONDecodeError):
-            self.logger.debug("Cache miss or invalid cache file")
+                cached_data = json.load(f)
+
+            self.logger.info("Loading anchors from cache")
+            try:
+                # Log the raw dictionary data instead of the object
+                # if cached_data:
+                #     self.logger.debug(f"Cached data structure: {json.dumps(cached_data[0], indent=2)}")
+
+                # Convert cached data back to ScoredAnchor objects
+                anchors = []
+                for data in cached_data:
+                    if "anchor" not in data or "phrase_score" not in data:
+                        raise KeyError("Missing required keys: anchor, phrase_score")
+
+                    anchor = AnchorSequence.from_dict(data["anchor"])
+                    phrase_score = PhraseScore.from_dict(data["phrase_score"])
+                    anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+
+                return anchors
+
+            except KeyError as e:
+                self.logger.error(f"Cache format mismatch. Missing key: {e}")
+                # Log the raw data for debugging
+                if cached_data:
+                    self.logger.error(f"First cached anchor data: {json.dumps(cached_data[0], indent=2)}")
+                self.logger.error("Expected keys: anchor, phrase_score")
+                self.logger.warning(f"Cache format mismatch: {e}. Recomputing.")
+                return None
+
+        except (FileNotFoundError, json.JSONDecodeError) as e:
+            self.logger.debug(f"Cache miss or invalid cache file: {e}")
+            return None
+        except Exception as e:
+            self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
             return None
 
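Save and load now agree on a single JSON shape, a list of {"anchor": ..., "phrase_score": ...} dictionaries, and a malformed file falls through to recomputation instead of raising. A hedged sketch of that round-trip contract, with hypothetical helper names standing in for the to_dict/from_dict pairs on the real types:

    import json
    from pathlib import Path
    from typing import List, Optional

    def save_cache(cache_path: Path, entries: List[dict]) -> None:
        # Each entry must already carry the two expected keys
        cache_path.write_text(json.dumps(entries, indent=2))

    def load_cache(cache_path: Path) -> Optional[List[dict]]:
        try:
            entries = json.loads(cache_path.read_text())
        except (FileNotFoundError, json.JSONDecodeError):
            return None  # cache miss: caller recomputes
        for entry in entries:
            if "anchor" not in entry or "phrase_score" not in entry:
                return None  # format mismatch: caller recomputes
        return entries

    path = Path("anchors_demo.json")
    save_cache(path, [{"anchor": {"id": "a1"}, "phrase_score": {"total_score": 0.9}}])
    assert load_cache(path) is not None
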
     def _process_ngram_length(
-        self, n: int, trans_words: List[str], ref_texts_clean: Dict[str, List[str]], min_sources: int
+        self,
+        n: int,
+        trans_words: List[str],
+        all_words: List[Word],
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        min_sources: int,
     ) -> List[AnchorSequence]:
         """Process a single n-gram length to find matching sequences."""
         candidate_anchors = []
         used_positions = {source: set() for source in ref_texts_clean.keys()}
         used_trans_positions = set()
 
-        # Try each position in the transcribed text multiple times
-        # to catch repeated phrases
         found_new_match = True
         while found_new_match:
             found_new_match = False
@@ -137,56 +181,108 @@ class AnchorSequenceFinder:
                 if trans_pos in used_trans_positions:
                     continue
 
+                # Get the actual words from the transcription at this position
+                actual_words = [w.text.lower().strip('.,?!"\n') for w in all_words[trans_pos : trans_pos + n]]
+                ngram_words = [w.lower() for w in ngram]
+
+                if actual_words != ngram_words:
+                    self.logger.error(f"Mismatch between ngram and actual words at position {trans_pos}:")
+                    self.logger.error(f"Ngram words: {ngram_words}")
+                    self.logger.error(f"Actual words: {actual_words}")
+                    self.logger.error(f"Full trans_words: {trans_words}")
+                    self.logger.error(f"Full all_words: {[w.text for w in all_words]}")
+                    raise AssertionError(
+                        f"Ngram words don't match actual words at position {trans_pos}. "
+                        f"This should never happen as trans_words should be derived from all_words."
+                    )
+
                 matches = self._find_matching_sources(ngram, ref_texts_clean, n)
                 if len(matches) >= min_sources:
+                    # Get Word IDs for transcribed words
+                    transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+                    # Get Word IDs for reference words
+                    reference_word_ids = {source: [w.id for w in ref_words[source][pos : pos + n]] for source, pos in matches.items()}
+
                     # Mark positions as used
                     for source, pos in matches.items():
                         used_positions[source].add(pos)
                     used_trans_positions.add(trans_pos)
 
-                    anchor = AnchorSequence(ngram, trans_pos, matches, len(matches) / len(ref_texts_clean))
+                    anchor = AnchorSequence(
+                        id=WordUtils.generate_id(),
+                        transcribed_word_ids=transcribed_word_ids,
+                        transcription_position=trans_pos,
+                        reference_positions=matches,
+                        reference_word_ids=reference_word_ids,
+                        confidence=len(matches) / len(ref_texts_clean),
+                    )
                     candidate_anchors.append(anchor)
                     found_new_match = True
-                    break  # Start over to try finding more matches
+                    break
 
         return candidate_anchors
 
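The new AssertionError guard encodes an invariant, not a recoverable condition: because trans_words is now derived token-for-token from all_words, any n-gram sliced out of trans_words must equal the cleaned text of the same slice of all_words. A small sketch of why that holds:

    # trans_words is built index-for-index from all_words, so an n-gram taken
    # from trans_words at position p always matches all_words[p : p + n].
    def clean(token: str) -> str:
        return token.lower().strip('.,?!"\n')

    all_word_texts = ["Hello,", "world!", "Hello,", "again"]  # stand-in for Word.text values
    trans_words = [clean(t) for t in all_word_texts]

    n, pos = 2, 2
    ngram = trans_words[pos : pos + n]
    assert ngram == [clean(t) for t in all_word_texts[pos : pos + n]]
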
-    def find_anchors(self, transcribed: str, references: Dict[str, str]) -> List[ScoredAnchor]:
+    def find_anchors(
+        self,
+        transcribed: str,
+        references: Dict[str, LyricsData],
+        transcription_result: TranscriptionResult,
+    ) -> List[ScoredAnchor]:
         """Find anchor sequences that appear in both transcription and references."""
-        cache_key = self._get_cache_key(transcribed, references)
+        cache_key = self._get_cache_key(transcribed, references, transcription_result)
         cache_path = self.cache_dir / f"anchors_{cache_key}.json"
 
         # Try to load from cache
         if cached_data := self._load_from_cache(cache_path):
             self.logger.info("Loading anchors from cache")
             try:
-                return [ScoredAnchor.from_dict(anchor) for anchor in cached_data]
-            except KeyError as e:
-                self.logger.warning(f"Cache format mismatch: {e}. Recomputing.")
+                # Convert cached_data to dictionary before logging
+                if cached_data:
+                    first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                return cached_data
+            except Exception as e:
+                self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
+                if cached_data:
+                    try:
+                        first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                        self.logger.error(f"First cached anchor data: {json.dumps(first_anchor, indent=2)}")
+                    except:
+                        self.logger.error("Could not serialize first cached anchor for logging")
 
         # If not in cache or cache format invalid, perform the computation
-        self.logger.info("Cache miss - computing anchors")
+        self.logger.info(f"Cache miss for key {cache_key} - computing anchors")
         self.logger.info(f"Finding anchor sequences for transcription with length {len(transcribed)}")
 
-        # Clean and split texts
-        trans_words = self._clean_text(transcribed).split()
-        ref_texts_clean = {source: self._clean_text(text).split() for source, text in references.items()}
+        # Get all words from transcription
+        all_words = []
+        for segment in transcription_result.segments:
+            all_words.extend(segment.words)
+
+        # Clean and split texts - this should match all_words exactly
+        trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]  # Changed to derive directly from all_words
+        ref_texts_clean = {
+            source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+            for source, lyrics in references.items()
+        }
+        ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
 
         max_length = min(len(trans_words), min(len(words) for words in ref_texts_clean.values()))
         n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
 
-        # Set up parallel processing
-        num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
-        self.logger.info(f"Processing {len(n_gram_lengths)} n-gram lengths using {num_processes} processes")
-
-        # Create partial function with fixed arguments
+        # Process n-gram lengths in parallel
         process_length_partial = partial(
-            self._process_ngram_length, trans_words=trans_words, ref_texts_clean=ref_texts_clean, min_sources=self.min_sources
+            self._process_ngram_length,
+            trans_words=trans_words,
+            all_words=all_words,  # Pass the Word objects
+            ref_texts_clean=ref_texts_clean,
+            ref_words=ref_words,
+            min_sources=self.min_sources,
         )
 
         # Process n-gram lengths in parallel
         candidate_anchors = []
-        with Pool(processes=num_processes) as pool:
+        with Pool(processes=max(cpu_count() - 1, 1)) as pool:
             results = list(
                 tqdm(
                     pool.imap(process_length_partial, n_gram_lengths, chunksize=1),
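The parallel layout is unchanged in spirit: fixed arguments are bound with functools.partial and the varying n-gram length is streamed through Pool.imap. A self-contained sketch of the pattern, with a toy worker rather than the library's actual one:

    from functools import partial
    from multiprocessing import Pool, cpu_count

    def process_length(n: int, words: list) -> list:
        # Toy worker: return every n-gram of length n
        return [words[i : i + n] for i in range(len(words) - n + 1)]

    if __name__ == "__main__":
        words = "the quick brown fox".split()
        worker = partial(process_length, words=words)
        lengths = range(len(words), 1, -1)  # longest first, as in find_anchors
        with Pool(processes=max(cpu_count() - 1, 1)) as pool:
            results = list(pool.imap(worker, lengths, chunksize=1))
        candidates = [ngram for batch in results for ngram in batch]
        print(len(candidates), "candidate n-grams")
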
@@ -198,13 +294,10 @@ class AnchorSequenceFinder:
             candidate_anchors.extend(anchors)
 
         self.logger.info(f"Found {len(candidate_anchors)} candidate anchors")
-        filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed)
-
-        # Before returning, save to cache with correct format
-        self._save_to_cache(
-            cache_path, [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in filtered_anchors]
-        )
+        filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
 
+        # Save to cache
+        self._save_to_cache(cache_path, filtered_anchors)
         return filtered_anchors
 
     def _score_sequence(self, words: List[str], context: str) -> PhraseScore:
@@ -212,19 +305,6 @@ class AnchorSequenceFinder:
         self.logger.debug(f"_score_sequence called for: '{' '.join(words)}'")
         return self.phrase_analyzer.score_phrase(words, context)
 
-    def _score_anchor(self, anchor: AnchorSequence, context: str) -> ScoredAnchor:
-        """Score an anchor sequence based on phrase quality and line breaks.
-
-        Args:
-            anchor: The anchor sequence to score
-            context: The original transcribed text
-        """
-        # Let phrase_analyzer handle all scoring including line breaks
-        phrase_score = self.phrase_analyzer.score_phrase(anchor.words, context)
-
-        # self.logger.debug(f"_score_anchor called for sequence: '{anchor.text}'")
-        return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
-
     def _get_sequence_priority(self, scored_anchor: ScoredAnchor) -> Tuple[float, float, float, float, int]:
         """Get priority tuple for sorting sequences.
@@ -239,7 +319,7 @@ class AnchorSequenceFinder:
         """
         # self.logger.debug(f"_get_sequence_priority called for anchor: '{scored_anchor.anchor.text}'")
         position_bonus = 1.0 if scored_anchor.anchor.transcription_position == 0 else 0.0
-        length_bonus = len(scored_anchor.anchor.words) * 0.2  # Add bonus for longer sequences
+        length_bonus = len(scored_anchor.anchor.transcribed_word_ids) * 0.2  # Changed from words to transcribed_word_ids
 
         return (
             len(scored_anchor.anchor.reference_positions),  # More sources is better
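Priority remains a plain tuple, and Python compares tuples lexicographically, so the source count is decided first and the bonus terms only break ties. A toy illustration (a simplified two-element priority, not the real five-element tuple):

    # Source count is compared first; bonuses only break ties.
    anchors = [
        {"sources": 2, "position": 0, "length": 6},
        {"sources": 3, "position": 9, "length": 2},
    ]

    def priority(a: dict) -> tuple:
        position_bonus = 1.0 if a["position"] == 0 else 0.0
        length_bonus = a["length"] * 0.2
        return (a["sources"], position_bonus + length_bonus)

    best = max(anchors, key=priority)
    assert best["sources"] == 3  # wins despite weaker bonuses
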
@@ -260,25 +340,39 @@ class AnchorSequenceFinder:
             True if sequences overlap in transcription or share any reference positions
         """
         # Check transcription overlap
-        seq1_trans_range = range(seq1.transcription_position, seq1.transcription_position + len(seq1.words))
-        seq2_trans_range = range(seq2.transcription_position, seq2.transcription_position + len(seq2.words))
+        seq1_trans_range = range(
+            seq1.transcription_position, seq1.transcription_position + len(seq1.transcribed_word_ids)
+        )  # Changed from words
+        seq2_trans_range = range(
+            seq2.transcription_position, seq2.transcription_position + len(seq2.transcribed_word_ids)
+        )  # Changed from words
         trans_overlap = bool(set(seq1_trans_range) & set(seq2_trans_range))
 
         # Check reference overlap - only consider positions in shared sources
         shared_sources = set(seq1.reference_positions.keys()) & set(seq2.reference_positions.keys())
         ref_overlap = any(seq1.reference_positions[source] == seq2.reference_positions[source] for source in shared_sources)
 
-        # self.logger.debug(f"Checking overlap between '{seq1.text}' and '{seq2.text}'")
         return trans_overlap or ref_overlap
 
-    def _remove_overlapping_sequences(self, anchors: List[AnchorSequence], context: str) -> List[ScoredAnchor]:
+    def _remove_overlapping_sequences(
+        self,
+        anchors: List[AnchorSequence],
+        context: str,
+        transcription_result: TranscriptionResult,
+    ) -> List[ScoredAnchor]:
         """Remove overlapping sequences using phrase analysis."""
         if not anchors:
             return []
 
         self.logger.info(f"Scoring {len(anchors)} anchors")
 
-        # Benchmark both approaches
+        # Create word map for scoring
+        word_map = {w.id: w for s in transcription_result.segments for w in s.words}
+
+        # Add word map to each anchor for scoring
+        for anchor in anchors:
+            anchor.transcribed_words = [word_map[word_id] for word_id in anchor.transcribed_word_ids]
+
         start_time = time.time()
 
         # Try different pool sizes
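The overlap test itself keeps its meaning: two anchors conflict when their transcription index ranges intersect; only the length now comes from transcribed_word_ids. A compact sketch of the range check:

    # Two anchors overlap when their transcription index ranges intersect.
    def overlaps(pos1: int, len1: int, pos2: int, len2: int) -> bool:
        r1 = set(range(pos1, pos1 + len1))
        r2 = set(range(pos2, pos2 + len2))
        return bool(r1 & r2)

    assert overlaps(0, 3, 2, 2)      # indices {0,1,2} and {2,3} share 2
    assert not overlaps(0, 3, 3, 2)  # {0,1,2} and {3,4} are disjoint
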
@@ -292,7 +386,7 @@ class AnchorSequenceFinder:
         with Pool(processes=num_processes) as pool:
             scored_anchors = list(
                 tqdm(
-                    pool.imap(score_anchor_partial, anchors, chunksize=50),  # Added chunksize
+                    pool.imap(score_anchor_partial, anchors, chunksize=50),
                     total=len(anchors),
                     desc="Scoring anchors (parallel)",
                 )
@@ -326,7 +420,11 @@ class AnchorSequenceFinder:
         if not hasattr(AnchorSequenceFinder._score_anchor_static, "_phrase_analyzer"):
             AnchorSequenceFinder._score_anchor_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
 
-        phrase_score = AnchorSequenceFinder._score_anchor_static._phrase_analyzer.score_phrase(anchor.words, context)
+        # Get the words from the transcribed word IDs
+        # We need to pass in the actual words for scoring
+        words = [w.text for w in anchor.transcribed_words]  # This needs to be passed in
+
+        phrase_score = AnchorSequenceFinder._score_anchor_static._phrase_analyzer.score_phrase(words, context)
         return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
 
     def _get_reference_words(self, source: str, ref_words: List[str], start_pos: Optional[int], end_pos: Optional[int]) -> List[str]:
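Caching the PhraseAnalyzer on the function object is a per-process memoization trick: each worker spawned by Pool constructs the analyzer once and reuses it for every anchor it scores. The same pattern in miniature (a dict stands in for the expensive analyzer):

    # Build an expensive helper once per process and stash it on the function.
    def score(text: str) -> int:
        if not hasattr(score, "_analyzer"):
            score._analyzer = {"builds": 0}  # imagine an expensive object here
            score._analyzer["builds"] += 1
        return len(text)

    score("one")
    score("two")
    assert score._analyzer["builds"] == 1  # constructed exactly once
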
@@ -347,125 +445,202 @@ class AnchorSequenceFinder:
             end_pos = len(ref_words)
         return ref_words[start_pos:end_pos]
 
-    def find_gaps(self, transcribed: str, anchors: List[ScoredAnchor], references: Dict[str, str]) -> List[GapSequence]:
+    def find_gaps(
+        self,
+        transcribed: str,
+        anchors: List[ScoredAnchor],
+        references: Dict[str, LyricsData],
+        transcription_result: TranscriptionResult,
+    ) -> List[GapSequence]:
         """Find gaps between anchor sequences in the transcribed text."""
-        cache_key = self._get_cache_key(transcribed, references)
-        cache_path = self.cache_dir / f"gaps_{cache_key}.json"
-
-        # Try to load from cache
-        if cached_data := self._load_from_cache(cache_path):
-            self.logger.info("Loading gaps from cache")
-            return [GapSequence.from_dict(gap) for gap in cached_data]
-
-        # If not in cache, perform the computation
-        self.logger.info("Cache miss - computing gaps")
-        words = self._clean_text(transcribed).split()
-        ref_texts_clean = {source: self._clean_text(text).split() for source, text in references.items()}
-        # Store original reference texts split into words
-        ref_texts_original = {source: text.split() for source, text in references.items()}
-
+        # Get all words from transcription
+        all_words = []
+        for segment in transcription_result.segments:
+            all_words.extend(segment.words)
+
+        # Clean and split reference texts
+        ref_texts_clean = {
+            source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+            for source, lyrics in references.items()
+        }
+        ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+        # Create gaps with Word IDs
         gaps = []
         sorted_anchors = sorted(anchors, key=lambda x: x.anchor.transcription_position)
 
         # Handle initial gap
-        if initial_gap := self._create_initial_gap(
-            words, sorted_anchors[0] if sorted_anchors else None, ref_texts_clean, ref_texts_original
-        ):
-            gaps.append(initial_gap)
+        if sorted_anchors:
+            first_anchor = sorted_anchors[0].anchor
+            first_anchor_pos = first_anchor.transcription_position
+            if first_anchor_pos > 0:
+                gap_word_ids = [w.id for w in all_words[:first_anchor_pos]]
+                if gap := self._create_initial_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=0,
+                    following_anchor_id=first_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    following_anchor=first_anchor,
+                ):
+                    gaps.append(gap)
 
         # Handle gaps between anchors
         for i in range(len(sorted_anchors) - 1):
-            if between_gap := self._create_between_gap(
-                words, sorted_anchors[i], sorted_anchors[i + 1], ref_texts_clean, ref_texts_original
-            ):
-                gaps.append(between_gap)
+            current_anchor = sorted_anchors[i].anchor
+            next_anchor = sorted_anchors[i + 1].anchor
+            gap_start = current_anchor.transcription_position + len(current_anchor.transcribed_word_ids)
+            gap_end = next_anchor.transcription_position
+
+            if gap_end > gap_start:
+                gap_word_ids = [w.id for w in all_words[gap_start:gap_end]]
+                if between_gap := self._create_between_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=gap_start,
+                    preceding_anchor_id=current_anchor.id,
+                    following_anchor_id=next_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    preceding_anchor=current_anchor,
+                    following_anchor=next_anchor,
+                ):
+                    gaps.append(between_gap)
 
         # Handle final gap
-        if sorted_anchors and (final_gap := self._create_final_gap(words, sorted_anchors[-1], ref_texts_clean, ref_texts_original)):
-            gaps.append(final_gap)
+        if sorted_anchors:
+            last_anchor = sorted_anchors[-1].anchor
+            last_pos = last_anchor.transcription_position + len(last_anchor.transcribed_word_ids)
+            if last_pos < len(all_words):
+                gap_word_ids = [w.id for w in all_words[last_pos:]]
+                if final_gap := self._create_final_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=last_pos,
+                    preceding_anchor_id=last_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    preceding_anchor=last_anchor,
+                ):
+                    gaps.append(final_gap)
 
-        # Save to cache
-        self._save_to_cache(cache_path, [gap.to_dict() for gap in gaps])
         return gaps
 
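The reworked find_gaps makes the partitioning explicit: sorted anchors split the word list into an initial gap, between-gaps, and a final gap. The same logic, reduced to positions and counts (a hypothetical anchor is just a (transcription_position, word_count) pair here):

    # Anchors partition the transcription into at most three kinds of gaps:
    # before the first anchor, between consecutive anchors, after the last.
    def gap_spans(num_words: int, anchors: list) -> list:
        spans = []
        anchors = sorted(anchors)
        if anchors and anchors[0][0] > 0:
            spans.append((0, anchors[0][0]))              # initial gap
        for (p1, n1), (p2, _) in zip(anchors, anchors[1:]):
            if p2 > p1 + n1:
                spans.append((p1 + n1, p2))               # between gap
        if anchors:
            p, n = anchors[-1]
            if p + n < num_words:
                spans.append((p + n, num_words))          # final gap
        return spans

    assert gap_spans(10, [(2, 3), (7, 2)]) == [(0, 2), (5, 7), (9, 10)]
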
     def _create_initial_gap(
         self,
-        words: List[str],
-        first_anchor: Optional[ScoredAnchor],
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        following_anchor_id: str,
         ref_texts_clean: Dict[str, List[str]],
-        ref_texts_original: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        following_anchor: AnchorSequence,
     ) -> Optional[GapSequence]:
-        """Create gap sequence before the first anchor."""
-        if not first_anchor:
-            ref_words = {source: words for source, words in ref_texts_clean.items()}
-            ref_words_original = {source: words for source, words in ref_texts_original.items()}
-            return GapSequence(words, 0, None, None, ref_words, ref_words_original)
-
-        if first_anchor.anchor.transcription_position > 0:
-            ref_words = {}
-            ref_words_original = {}
-            for source in ref_texts_clean:
-                end_pos = first_anchor.anchor.reference_positions.get(source)
-                ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], None, end_pos)
-                ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], None, end_pos)
+        """Create gap sequence before the first anchor.
+
+        The gap includes all reference words from the start of each reference
+        up to the position where the following anchor starts in that reference.
+        """
+        if transcription_position > 0:
+            # Get reference word IDs for the gap
+            reference_word_ids = {}
+            for source, words in ref_words.items():
+                if source in ref_texts_clean:
+                    # Get the position where the following anchor starts in this source
+                    if source in following_anchor.reference_positions:
+                        end_pos = following_anchor.reference_positions[source]
+                        # Include all words from start up to the anchor
+                        reference_word_ids[source] = [w.id for w in words[:end_pos]]
+                    else:
+                        # If this source doesn't contain the following anchor,
+                        # we can't determine the gap content for it
+                        reference_word_ids[source] = []
 
             return GapSequence(
-                words[: first_anchor.anchor.transcription_position], 0, None, first_anchor.anchor, ref_words, ref_words_original
+                id=id,
+                transcribed_word_ids=transcribed_word_ids,
+                transcription_position=transcription_position,
+                preceding_anchor_id=None,
+                following_anchor_id=following_anchor_id,
+                reference_word_ids=reference_word_ids,
             )
         return None
 
     def _create_between_gap(
         self,
-        words: List[str],
-        current_anchor: ScoredAnchor,
-        next_anchor: ScoredAnchor,
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        preceding_anchor_id: str,
+        following_anchor_id: str,
         ref_texts_clean: Dict[str, List[str]],
-        ref_texts_original: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        preceding_anchor: AnchorSequence,
+        following_anchor: AnchorSequence,
     ) -> Optional[GapSequence]:
-        """Create gap sequence between two anchors."""
-        gap_start = current_anchor.anchor.transcription_position + current_anchor.anchor.length
-        gap_end = next_anchor.anchor.transcription_position
-
-        if gap_end > gap_start:
-            ref_words = {}
-            ref_words_original = {}
-            shared_sources = set(current_anchor.anchor.reference_positions.keys()) & set(next_anchor.anchor.reference_positions.keys())
-
-            # Check for large position differences in next_anchor
-            if len(next_anchor.anchor.reference_positions) > 1:
-                positions = list(next_anchor.anchor.reference_positions.values())
-                max_diff = max(positions) - min(positions)
-                if max_diff > 20:
-                    earliest_source = min(next_anchor.anchor.reference_positions.items(), key=lambda x: x[1])[0]
-                    self.logger.warning(
-                        f"Large position difference ({max_diff} words) in next anchor. Using only earliest source: {earliest_source}"
-                    )
-                    shared_sources &= {earliest_source}
-
-            for source in shared_sources:
-                start_pos = current_anchor.anchor.reference_positions[source] + current_anchor.anchor.length
-                end_pos = next_anchor.anchor.reference_positions[source]
-                ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], start_pos, end_pos)
-                ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], start_pos, end_pos)
+        """Create gap sequence between two anchors.
 
-            return GapSequence(
-                words[gap_start:gap_end], gap_start, current_anchor.anchor, next_anchor.anchor, ref_words, ref_words_original
-            )
-        return None
+        For each reference source, the gap includes all words between the end of the
+        preceding anchor and the start of the following anchor in that source.
+        """
+        # Get reference word IDs for the gap
+        reference_word_ids = {}
+        for source, words in ref_words.items():
+            if source in ref_texts_clean:
+                # Only process sources that contain both anchors
+                if source in preceding_anchor.reference_positions and source in following_anchor.reference_positions:
+                    start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                    end_pos = following_anchor.reference_positions[source]
+                    # Include all words between the anchors
+                    reference_word_ids[source] = [w.id for w in words[start_pos:end_pos]]
+                else:
+                    # If this source doesn't contain both anchors,
+                    # we can't determine the gap content for it
+                    reference_word_ids[source] = []
+
+        return GapSequence(
+            id=id,
+            transcribed_word_ids=transcribed_word_ids,
+            transcription_position=transcription_position,
+            preceding_anchor_id=preceding_anchor_id,
+            following_anchor_id=following_anchor_id,
+            reference_word_ids=reference_word_ids,
+        )
 
     def _create_final_gap(
-        self, words: List[str], last_anchor: ScoredAnchor, ref_texts_clean: Dict[str, List[str]], ref_texts_original: Dict[str, List[str]]
+        self,
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        preceding_anchor_id: str,
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        preceding_anchor: AnchorSequence,
     ) -> Optional[GapSequence]:
-        """Create gap sequence after the last anchor."""
-        last_pos = last_anchor.anchor.transcription_position + last_anchor.anchor.length
-        if last_pos < len(words):
-            ref_words = {}
-            ref_words_original = {}
-            for source in ref_texts_clean:
-                if source in last_anchor.anchor.reference_positions:
-                    start_pos = last_anchor.anchor.reference_positions[source] + last_anchor.anchor.length
-                    ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], start_pos, None)
-                    ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], start_pos, None)
-
-            return GapSequence(words[last_pos:], last_pos, last_anchor.anchor, None, ref_words, ref_words_original)
-        return None
+        """Create gap sequence after the last anchor.
+
+        For each reference source, includes all words from the end of the
+        preceding anchor to the end of that reference.
+        """
+        # Get reference word IDs for the gap
+        reference_word_ids = {}
+        for source, words in ref_words.items():
+            if source in ref_texts_clean:
+                if source in preceding_anchor.reference_positions:
+                    start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                    # Include all words from end of last anchor to end of reference
+                    reference_word_ids[source] = [w.id for w in words[start_pos:]]
+                else:
+                    # If this source doesn't contain the preceding anchor,
+                    # we can't determine the gap content for it
+                    reference_word_ids[source] = []
+
+        return GapSequence(
+            id=id,
+            transcribed_word_ids=transcribed_word_ids,
+            transcription_position=transcription_position,
+            preceding_anchor_id=preceding_anchor_id,
+            following_anchor_id=None,
+            reference_word_ids=reference_word_ids,
+        )
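
All three gap builders slice each source's reference words the same way: from the end of the preceding anchor (its position plus its word count in that source) up to the start of the following anchor, or to the end of the text. A minimal illustration of the between-gap slice:

    # Reference words for a between-gap: everything strictly between the two
    # anchors' occurrences in this source.
    ref_words = ["a", "b", "c", "d", "e", "f"]
    preceding_pos, preceding_len = 0, 2   # anchor covers ["a", "b"]
    following_pos = 4                     # next anchor starts at "e"
    gap = ref_words[preceding_pos + preceding_len : following_pos]
    assert gap == ["c", "d"]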