PyPI - lyrics-transcriber - Versions diffs - 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl - Mend

lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

lyrics_transcriber/correction/handlers/relaxed_word_count_match.py CHANGED Viewed

@@ -11,53 +11,73 @@ class RelaxedWordCountMatchHandler(GapCorrectionHandler):
     def __init__(self, logger: Optional[logging.Logger] = None):
         super().__init__(logger)
-        self.logger = logger
+        self.logger = logger or logging.getLogger(__name__)
-    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
+    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
         # Must have reference words
-        if not gap.reference_words:
-            self.logger.debug("No reference words available.")
+        if not gap.reference_word_ids:
+            self.logger.debug("No reference word IDs available.")
+            return False, {}
+        if not self._validate_data(data):
             return False, {}
         # Check if any source has matching word count
-        for source, words in gap.reference_words.items():
-            if len(words) == gap.length:
+        for source, ref_word_ids in gap.reference_word_ids.items():
+            if len(ref_word_ids) == gap.length:
                 self.logger.debug(f"Source '{source}' has matching word count.")
-                return True, {}
+                return True, {
+                    "matching_source": source,
+                    "reference_word_ids": ref_word_ids,
+                    "word_map": data["word_map"],
+                    "anchor_sequences": data.get("anchor_sequences", []),
+                }
         self.logger.debug("No source with matching word count found.")
         return False, {}
     def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
-        corrections = []
+        """Handle the gap using word count matching."""
+        if not self._validate_data(data):
+            return []
-        # Find the first source that has matching word count
-        matching_source = None
-        reference_words = None
-        reference_words_original = None
-        for source, words in gap.reference_words.items():
-            if len(words) == gap.length:
-                matching_source = source
-                reference_words = words
-                reference_words_original = gap.reference_words_original[source]
-                self.logger.debug(f"Using source '{source}' for corrections.")
-                break
+        corrections = []
+        matching_source = data["matching_source"]
+        reference_word_ids = data["reference_word_ids"]
+        word_map = data["word_map"]
+        anchor_sequences = data.get("anchor_sequences", [])
-        # Use the centralized method to calculate reference positions for the matching source
-        reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])
+        # Use the centralized method to calculate reference positions
+        reference_positions = WordOperations.calculate_reference_positions(
+            gap, sources=[matching_source], anchor_sequences=anchor_sequences
+        )
         self.logger.debug(f"Calculated reference positions: {reference_positions}")
         # Since we found a source with matching word count, we can correct using that source
-        for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
-            if orig_word.lower() != ref_word.lower():
+        for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
+            # Get the actual words from the word map
+            if orig_word_id not in word_map:
+                self.logger.error(f"Original word ID {orig_word_id} not found in word_map")
+                continue
+            orig_word = word_map[orig_word_id]
+            if ref_word_id not in word_map:
+                self.logger.error(f"Reference word ID {ref_word_id} not found in word_map")
+                continue
+            ref_word = word_map[ref_word_id]
+            if orig_word.text.lower() != ref_word.text.lower():
                 correction = WordOperations.create_word_replacement_correction(
-                    original_word=orig_word,
-                    corrected_word=ref_word_original,
+                    original_word=orig_word.text,
+                    corrected_word=ref_word.text,
                     original_position=gap.transcription_position + i,
                     source=matching_source,
                     confidence=1.0,
-                    reason=f"RelaxedWordCountMatchHandler: Source '{matching_source}' had matching word count",
+                    reason=f"Source '{matching_source}' had matching word count",
                     reference_positions=reference_positions,
+                    handler="RelaxedWordCountMatchHandler",
+                    original_word_id=orig_word_id,
+                    corrected_word_id=ref_word_id,  # Use the reference word's ID
                 )
                 corrections.append(correction)
                 self.logger.debug(f"Correction made: {correction}")

lyrics_transcriber/correction/handlers/repeat.py CHANGED Viewed

@@ -9,13 +9,17 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
     """Handler that applies corrections that were previously made by other handlers."""
     def __init__(self, logger: Optional[logging.Logger] = None, confidence_threshold: float = 0.7):
+        super().__init__(logger)
         self.logger = logger or logging.getLogger(__name__)
         self.confidence_threshold = confidence_threshold
         self.previous_corrections: List[WordCorrection] = []
-    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
+    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
         """Check if any words in the gap match previous corrections."""
-        return bool(self.previous_corrections), {}
+        if not self._validate_data(data):
+            return False, {}
+        return bool(self.previous_corrections), {"word_map": data["word_map"], "anchor_sequences": data.get("anchor_sequences", [])}
     def set_previous_corrections(self, corrections: List[WordCorrection]) -> None:
         """Store corrections from previous handlers to use as reference."""
@@ -23,10 +27,14 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
     def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
         """Apply previous corrections to matching words in the current gap."""
+        if not self._validate_data(data):
+            return []
+        word_map = data["word_map"]
         corrections = []
         # Use the centralized method to calculate reference positions
-        reference_positions = WordOperations.calculate_reference_positions(gap)
+        reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
         # Build a map of original words to their corrections
         correction_map: Dict[str, List[WordCorrection]] = {}
@@ -35,8 +43,14 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
                 correction_map.setdefault(corr.original_word.lower(), []).append(corr)
         # Check each word in the gap
-        for i, word in enumerate(gap.words):
-            word_lower = word.lower()
+        for i, word_id in enumerate(gap.transcribed_word_ids):
+            if word_id not in word_map:
+                self.logger.error(f"Word ID {word_id} not found in word map")
+                continue
+            word = word_map[word_id]
+            word_lower = word.text.lower()
             if word_lower in correction_map:
                 # Get the most common correction for this word
                 prev_corrections = correction_map[word_lower]
@@ -46,13 +60,13 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
                 )
                 self.logger.debug(
-                    f"Applying previous correction: {word} -> {best_correction.corrected_word} "
+                    f"Applying previous correction: {word.text} -> {best_correction.corrected_word} "
                     f"(confidence: {best_correction.confidence:.2f})"
                 )
                 corrections.append(
                     WordCorrection(
-                        original_word=word,
+                        original_word=word.text,
                         corrected_word=best_correction.corrected_word,
                         segment_index=0,
                         original_position=gap.transcription_position + i,
@@ -61,10 +75,13 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
                         reason=f"RepeatCorrectionHandler: Matches previous correction",
                         alternatives={best_correction.corrected_word: 1},
                         is_deletion=best_correction.is_deletion,
-                        reference_positions=reference_positions,  # Add reference positions
-                        length=best_correction.length,  # Preserve length from original correction
-                        split_index=best_correction.split_index,  # Preserve split info if present
-                        split_total=best_correction.split_total,  # Preserve split info if present
+                        reference_positions=reference_positions,
+                        length=best_correction.length,
+                        split_index=best_correction.split_index,
+                        split_total=best_correction.split_total,
+                        handler="RepeatCorrectionHandler",
+                        word_id=word_id,
+                        corrected_word_id=best_correction.corrected_word_id,
                     )
                 )

lyrics_transcriber/correction/handlers/sound_alike.py CHANGED Viewed

@@ -36,54 +36,86 @@ class SoundAlikeHandler(GapCorrectionHandler):
         self.logger = logger or logging.getLogger(__name__)
         self.similarity_threshold = similarity_threshold
-    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
+    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
+        """Check if any gap word has a metaphone match with any reference word."""
+        if not self._validate_data(data):
+            return False, {}
+        word_map = data["word_map"]
         # Must have reference words
-        if not gap.reference_words:
+        if not gap.reference_word_ids:
             self.logger.debug("No reference words available")
             return False, {}
         # Gap must have words
-        if not gap.words:
+        if not gap.transcribed_word_ids:
             self.logger.debug("No gap words available")
             return False, {}
         # Check if any gap word has a metaphone match with any reference word
-        for word in gap.words:
-            word_codes = doublemetaphone(word)
-            self.logger.debug(f"Gap word '{word}' has metaphone codes: {word_codes}")
-            for ref_words in gap.reference_words.values():
-                for ref_word in ref_words:
-                    ref_codes = doublemetaphone(ref_word)
-                    self.logger.debug(f"Reference word '{ref_word}' has metaphone codes: {ref_codes}")
+        for word_id in gap.transcribed_word_ids:
+            if word_id not in word_map:
+                continue
+            word = word_map[word_id]
+            word_codes = doublemetaphone(word.text)
+            self.logger.debug(f"Gap word '{word.text}' has metaphone codes: {word_codes}")
+            for source, ref_word_ids in gap.reference_word_ids.items():
+                for ref_word_id in ref_word_ids:
+                    if ref_word_id not in word_map:
+                        continue
+                    ref_word = word_map[ref_word_id]
+                    ref_codes = doublemetaphone(ref_word.text)
+                    self.logger.debug(f"Reference word '{ref_word.text}' has metaphone codes: {ref_codes}")
                     if self._codes_match(word_codes, ref_codes):
-                        self.logger.debug(f"Found metaphone match between '{word}' and '{ref_word}'")
+                        self.logger.debug(f"Found metaphone match between '{word.text}' and '{ref_word.text}'")
                         return True, {}
         self.logger.debug("No metaphone matches found")
         return False, {}
     def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
+        """Process the gap and create corrections for sound-alike matches."""
+        if not self._validate_data(data):
+            return []
+        word_map = data["word_map"]
         corrections = []
-        # Use the centralized method to calculate reference positions for all sources
-        reference_positions = WordOperations.calculate_reference_positions(gap)
+        # Use the centralized method to calculate reference positions
+        reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
         # For each word in the gap
-        for i, word in enumerate(gap.words):
-            word_codes = doublemetaphone(word)
-            self.logger.debug(f"Processing '{word}' (codes: {word_codes})")
+        for i, word_id in enumerate(gap.transcribed_word_ids):
+            if word_id not in word_map:
+                continue
+            word = word_map[word_id]
+            word_codes = doublemetaphone(word.text)
+            self.logger.debug(f"Processing '{word.text}' (codes: {word_codes})")
             # Skip if word exactly matches any reference
-            exact_match = any(i < len(ref_words) and word.lower() == ref_words[i].lower() for ref_words in gap.reference_words.values())
+            exact_match = False
+            for source, ref_word_ids in gap.reference_word_ids.items():
+                if i < len(ref_word_ids):
+                    ref_word_id = ref_word_ids[i]
+                    if ref_word_id in word_map:
+                        ref_word = word_map[ref_word_id]
+                        if word.text.lower() == ref_word.text.lower():
+                            exact_match = True
+                            break
             if exact_match:
                 continue
             # Find sound-alike matches in references
-            matches: Dict[str, Tuple[List[str], float]] = {}
+            matches: Dict[str, Tuple[List[str], float, str]] = {}  # Added word_id to tuple
-            for source, ref_words in gap.reference_words.items():
-                ref_words_original = gap.reference_words_original[source]  # Get original formatted words
-                for j, (ref_word, ref_word_original) in enumerate(zip(ref_words, ref_words_original)):
-                    ref_codes = doublemetaphone(ref_word)
+            for source, ref_word_ids in gap.reference_word_ids.items():
+                for j, ref_word_id in enumerate(ref_word_ids):
+                    if ref_word_id not in word_map:
+                        continue
+                    ref_word = word_map[ref_word_id]
+                    ref_codes = doublemetaphone(ref_word.text)
                     match_confidence = self._get_match_confidence(word_codes, ref_codes)
                     if match_confidence >= self.similarity_threshold:
@@ -94,22 +126,23 @@ class SoundAlikeHandler(GapCorrectionHandler):
                         adjusted_confidence = match_confidence * position_multiplier
                         if adjusted_confidence >= self.similarity_threshold:
-                            if ref_word_original not in matches:  # Use original formatted word as key
-                                matches[ref_word_original] = ([], adjusted_confidence)
-                            matches[ref_word_original][0].append(source)
+                            if ref_word.text not in matches:
+                                matches[ref_word.text] = ([], adjusted_confidence, ref_word_id)
+                            matches[ref_word.text][0].append(source)
             # Create correction for best match if any found
             if matches:
-                best_match, (sources, base_confidence) = max(matches.items(), key=lambda x: (len(x[1][0]), x[1][1]))
+                best_match, (sources, base_confidence, ref_word_id) = max(matches.items(), key=lambda x: (len(x[1][0]), x[1][1]))
-                source_confidence = len(sources) / len(gap.reference_words)
+                source_confidence = len(sources) / len(gap.reference_word_ids)
                 final_confidence = base_confidence * source_confidence
-                self.logger.debug(f"Found match: {word} -> {best_match} (confidence: {final_confidence:.2f}, sources: {sources})")
+                self.logger.debug(f"Found match: {word.text} -> {best_match} " f"(confidence: {final_confidence:.2f}, sources: {sources})")
                 corrections.append(
                     WordCorrection(
-                        original_word=word,
-                        corrected_word=best_match,  # Already using original formatted word
+                        original_word=word.text,
+                        corrected_word=best_match,
                         segment_index=0,
                         original_position=gap.transcription_position + i,
                         confidence=final_confidence,
@@ -117,8 +150,11 @@ class SoundAlikeHandler(GapCorrectionHandler):
                         reason=f"SoundAlikeHandler: Phonetic match ({final_confidence:.2f} confidence)",
                         alternatives={k: len(v[0]) for k, v in matches.items()},
                         is_deletion=False,
-                        reference_positions=reference_positions,  # Add reference positions
-                        length=1,  # Single word replacement
+                        reference_positions=reference_positions,
+                        length=1,
+                        handler="SoundAlikeHandler",
+                        word_id=word_id,
+                        corrected_word_id=ref_word_id,
                     )
                 )

lyrics_transcriber/correction/handlers/syllables_match.py CHANGED Viewed

@@ -102,18 +102,44 @@ class SyllablesMatchHandler(GapCorrectionHandler):
         )
         return [spacy_count, pyphen_count, nltk_count, syllables_count]
-    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
+    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
         # Must have reference words
-        if not gap.reference_words:
-            self.logger.debug("No reference words available")
+        if not gap.reference_word_ids:
+            self.logger.debug("No reference word IDs available")
             return False, {}
+        # Get word lookup map from data
+        if not data or "word_map" not in data:
+            self.logger.error("No word_map provided in data")
+            return False, {}
+        word_map = data["word_map"]
+        # Get actual words from word IDs
+        gap_words = []
+        for word_id in gap.transcribed_word_ids:
+            if word_id not in word_map:
+                self.logger.error(f"Word ID {word_id} not found in word_map")
+                return False, {}
+            gap_words.append(word_map[word_id].text)
         # Get syllable counts for gap text using different methods
-        gap_syllables = self._count_syllables(gap.words)
+        gap_syllables = self._count_syllables(gap_words)
         # Check if any reference source has matching syllable count with any method
-        for source, words in gap.reference_words.items():
-            ref_syllables = self._count_syllables(words)
+        for source, ref_word_ids in gap.reference_word_ids.items():
+            # Get reference words from word map
+            ref_words = []
+            for word_id in ref_word_ids:
+                if word_id not in word_map:
+                    self.logger.error(f"Reference word ID {word_id} not found in word_map")
+                    continue
+                ref_words.append(word_map[word_id].text)
+            if not ref_words:
+                continue
+            ref_syllables = self._count_syllables(ref_words)
             # If any counting method matches between gap and reference, we can handle it
             if any(gap_count == ref_count for gap_count in gap_syllables for ref_count in ref_syllables):
@@ -121,81 +147,105 @@ class SyllablesMatchHandler(GapCorrectionHandler):
                 return True, {
                     "gap_syllables": gap_syllables,
                     "matching_source": source,
-                    "reference_words": words,
-                    "reference_words_original": gap.reference_words_original[source],
+                    "reference_word_ids": ref_word_ids,
+                    "word_map": word_map,
                 }
         self.logger.debug("No reference source had matching syllable count")
         return False, {}
-    def handle(self, gap: GapSequence, data: Dict[str, Any]) -> List[WordCorrection]:
+    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
+        """Handle the gap using syllable matching."""
+        if not data:
+            can_handle, data = self.can_handle(gap)
+            if not can_handle:
+                return []
         corrections = []
         matching_source = data["matching_source"]
-        reference_words = data["reference_words"]
-        reference_words_original = data["reference_words_original"]
+        reference_word_ids = data["reference_word_ids"]
+        word_map = data["word_map"]
+        # Get the actual words from word IDs
+        gap_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids]
+        ref_words = [word_map[word_id].text for word_id in reference_word_ids]
         # Use the centralized method to calculate reference positions
         reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])
         # Since we matched syllable counts for the entire gap, we should handle all words
-        if len(gap.words) > len(reference_words):
+        if len(gap_words) > len(ref_words):
             # Multiple transcribed words -> fewer reference words
             # Try to distribute the reference words across the gap words
-            words_per_ref = len(gap.words) / len(reference_words)
+            words_per_ref = len(gap_words) / len(ref_words)
-            for ref_idx, ref_word_original in enumerate(reference_words_original):
+            for ref_idx, ref_word_id in enumerate(reference_word_ids):
                 start_idx = int(ref_idx * words_per_ref)
                 end_idx = int((ref_idx + 1) * words_per_ref)
                 # Get the group of words to combine
-                words_to_combine = gap.words[start_idx:end_idx]
+                words_to_combine = gap_words[start_idx:end_idx]
+                word_ids_to_combine = gap.transcribed_word_ids[start_idx:end_idx]
                 corrections.extend(
                     WordOperations.create_word_combine_corrections(
                         original_words=words_to_combine,
-                        reference_word=ref_word_original,
+                        reference_word=word_map[ref_word_id].text,
                         original_position=gap.transcription_position + start_idx,
                         source=matching_source,
                         confidence=0.8,
-                        combine_reason="SyllablesMatchHandler: Words combined based on syllable match",
-                        delete_reason="SyllablesMatchHandler: Word removed as part of syllable match combination",
+                        combine_reason="Words combined based on syllable match",
+                        delete_reason="Word removed as part of syllable match combination",
                         reference_positions=reference_positions,
+                        handler="SyllablesMatchHandler",
+                        original_word_ids=word_ids_to_combine,
+                        corrected_word_id=ref_word_id,
                     )
                 )
-        elif len(gap.words) < len(reference_words):
+        elif len(gap_words) < len(ref_words):
             # Single transcribed word -> multiple reference words
-            words_per_gap = len(reference_words) / len(gap.words)
+            words_per_gap = len(ref_words) / len(gap_words)
-            for i, orig_word in enumerate(gap.words):
+            for i, word_id in enumerate(gap.transcribed_word_ids):
                 start_idx = int(i * words_per_gap)
                 end_idx = int((i + 1) * words_per_gap)
-                ref_words_original_for_orig = reference_words_original[start_idx:end_idx]
+                ref_word_ids_for_split = reference_word_ids[start_idx:end_idx]
+                ref_words_for_split = [word_map[ref_id].text for ref_id in ref_word_ids_for_split]
                 corrections.extend(
                     WordOperations.create_word_split_corrections(
-                        original_word=orig_word,
-                        reference_words=ref_words_original_for_orig,
+                        original_word=word_map[word_id].text,
+                        reference_words=ref_words_for_split,
                         original_position=gap.transcription_position + i,
                         source=matching_source,
                         confidence=0.8,
-                        reason="SyllablesMatchHandler: Split word based on syllable match",
+                        reason="Split word based on syllable match",
                         reference_positions=reference_positions,
+                        handler="SyllablesMatchHandler",
+                        original_word_id=word_id,
+                        corrected_word_ids=ref_word_ids_for_split,
                     )
                 )
         else:
             # One-to-one replacement
-            for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
-                if orig_word.lower() != ref_word.lower():
+            for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
+                orig_word = word_map[orig_word_id]
+                ref_word = word_map[ref_word_id]
+                if orig_word.text.lower() != ref_word.text.lower():
                     corrections.append(
                         WordOperations.create_word_replacement_correction(
-                            original_word=orig_word,
-                            corrected_word=ref_word_original,
+                            original_word=orig_word.text,
+                            corrected_word=ref_word.text,
                             original_position=gap.transcription_position + i,
                             source=matching_source,
                             confidence=0.8,
-                            reason=f"SyllablesMatchHandler: Source '{matching_source}' had matching syllable count",
+                            reason=f"Source '{matching_source}' had matching syllable count",
                             reference_positions=reference_positions,
+                            handler="SyllablesMatchHandler",
+                            original_word_id=orig_word_id,
+                            corrected_word_id=ref_word_id,
                         )
                     )

lyrics_transcriber/correction/handlers/word_count_match.py CHANGED Viewed

@@ -13,49 +13,66 @@ class WordCountMatchHandler(GapCorrectionHandler):
         super().__init__(logger)
         self.logger = logger or logging.getLogger(__name__)
-    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
+    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
         # Must have reference words
-        if not gap.reference_words:
-            self.logger.debug("No reference words available.")
+        if not gap.reference_word_ids:
+            self.logger.debug("No reference word IDs available.")
             return False, {}
-        ref_words_lists = list(gap.reference_words.values())
+        if not self._validate_data(data):
+            return False, {}
+        ref_word_lists = list(gap.reference_word_ids.values())
         # All sources must have same number of words as gap
-        if not all(len(words) == gap.length for words in ref_words_lists):
+        if not all(len(words) == gap.length for words in ref_word_lists):
             self.logger.debug("Not all sources have the same number of words as the gap.")
             return False, {}
         # If we have multiple sources, they must all agree
-        if len(ref_words_lists) > 1 and not all(words == ref_words_lists[0] for words in ref_words_lists[1:]):
+        if len(ref_word_lists) > 1 and not all(words == ref_word_lists[0] for words in ref_word_lists[1:]):
             self.logger.debug("Not all sources agree on the words.")
             return False, {}
         self.logger.debug("All sources agree and have matching word counts.")
-        return True, {}
+        return True, {"word_map": data["word_map"]}
     def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
+        if not self._validate_data(data):
+            return []
         corrections = []
-        # Get both clean and original reference words from first source
-        source = list(gap.reference_words.keys())[0]
-        reference_words = gap.reference_words[source]
-        reference_words_original = gap.reference_words_original[source]
-        sources = ", ".join(gap.reference_words.keys())
+        word_map = data["word_map"]
+        source = list(gap.reference_word_ids.keys())[0]
+        reference_word_ids = gap.reference_word_ids[source]
+        sources = ", ".join(gap.reference_word_ids.keys())
-        # Use the centralized method to calculate reference positions for all sources
         reference_positions = WordOperations.calculate_reference_positions(gap)
-        # Since we know all reference sources agree, we can correct all words in the gap
-        for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
-            if orig_word.lower() != ref_word.lower():
+        for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
+            # Get the actual words from the word map
+            if orig_word_id not in word_map:
+                self.logger.error(f"Original word ID {orig_word_id} not found in word_map")
+                continue
+            orig_word = word_map[orig_word_id]
+            if ref_word_id not in word_map:
+                self.logger.error(f"Reference word ID {ref_word_id} not found in word_map")
+                continue
+            ref_word = word_map[ref_word_id]
+            if orig_word.text.lower() != ref_word.text.lower():
                 correction = WordOperations.create_word_replacement_correction(
-                    original_word=orig_word,
-                    corrected_word=ref_word_original,
+                    original_word=orig_word.text,
+                    corrected_word=ref_word.text,
                     original_position=gap.transcription_position + i,
                     source=sources,
                     confidence=1.0,
-                    reason="WordCountMatchHandler: Reference sources had same word count as gap",
+                    reason="Reference sources had same word count as gap",
                     reference_positions=reference_positions,
+                    handler="WordCountMatchHandler",
+                    original_word_id=orig_word_id,
+                    corrected_word_id=ref_word_id,  # Use the reference word's ID
                 )
                 corrections.append(correction)
                 self.logger.debug(f"Correction made: {correction}")

lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl

lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl