PyPI - lyrics-transcriber - Versions diffs - 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl - Mend

lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

lyrics_transcriber/correction/handlers/word_operations.py CHANGED Viewed

@@ -1,38 +1,55 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any
 from lyrics_transcriber.types import WordCorrection, GapSequence
+from lyrics_transcriber.utils.word_utils import WordUtils
 class WordOperations:
     """Utility class for common word manipulation operations used by correction handlers."""
     @staticmethod
-    def calculate_reference_positions(gap: GapSequence, sources: Optional[List[str]] = None) -> Dict[str, int]:
+    def calculate_reference_positions(
+        gap: GapSequence, sources: Optional[List[str]] = None, anchor_sequences: Optional[List[Any]] = None
+    ) -> Dict[str, int]:
         """Calculate reference positions for given sources based on preceding anchor.
         Args:
-            gap: The gap sequence containing the preceding anchor
+            gap: The gap sequence containing the preceding anchor ID
             sources: Optional list of sources to calculate positions for. If None, uses all sources.
+            anchor_sequences: List of anchor sequences to look up preceding anchor
         Returns:
             Dictionary mapping source names to their reference positions
         """
         reference_positions = {}
-        if gap.preceding_anchor:
-            # If no sources specified, use all sources from reference words
-            sources_to_check = sources or list(gap.reference_words.keys())
-            for source in sources_to_check:
-                if source in gap.preceding_anchor.reference_positions:
-                    # Calculate base position from anchor
-                    anchor_pos = gap.preceding_anchor.reference_positions[source]
-                    base_ref_pos = anchor_pos + len(gap.preceding_anchor.words)
-                    # Calculate word offset within the gap
-                    word_offset = gap.words.index(gap.words[gap.transcription_position - gap.transcription_position])
-                    # Add word offset to base position
-                    ref_pos = base_ref_pos + word_offset
-                    reference_positions[source] = ref_pos
+        if not gap.preceding_anchor_id or not anchor_sequences:
+            return reference_positions
+        # Find the preceding anchor in the sequences
+        preceding_anchor = next(
+            (scored_anchor.anchor for scored_anchor in anchor_sequences if scored_anchor.anchor.id == gap.preceding_anchor_id), None
+        )
+        if not preceding_anchor:
+            return reference_positions
+        # If no sources specified, use all sources from reference words
+        sources_to_check = sources or list(gap.reference_word_ids.keys())
+        for source in sources_to_check:
+            # Get reference positions from the anchor
+            if source in preceding_anchor.reference_positions:
+                # Calculate base position from anchor
+                anchor_pos = preceding_anchor.reference_positions[source]
+                base_ref_pos = anchor_pos + len(preceding_anchor.reference_word_ids[source])
+                # Calculate word offset within the gap
+                word_offset = 0
+                # Add word offset to base position
+                ref_pos = base_ref_pos + word_offset
+                reference_positions[source] = ref_pos
         return reference_positions
     @staticmethod
@@ -43,7 +60,10 @@ class WordOperations:
         source: str,
         confidence: float,
         reason: str,
+        handler: str,
         reference_positions: Optional[Dict[str, int]] = None,
+        original_word_id: Optional[str] = None,
+        corrected_word_id: Optional[str] = None,
     ) -> WordCorrection:
         """Creates a correction for replacing a single word with another word."""
         return WordCorrection(
@@ -56,7 +76,10 @@ class WordOperations:
             reason=reason,
             alternatives={},
             reference_positions=reference_positions,
-            length=1,  # Single word replacement
+            length=1,
+            handler=handler,
+            word_id=original_word_id,
+            corrected_word_id=corrected_word_id if corrected_word_id is not None else (WordUtils.generate_id() if corrected_word else None),
         )
     @staticmethod
@@ -67,11 +90,19 @@ class WordOperations:
         source: str,
         confidence: float,
         reason: str,
+        handler: str,
         reference_positions: Optional[Dict[str, int]] = None,
+        original_word_id: Optional[str] = None,
+        corrected_word_ids: Optional[List[str]] = None,
     ) -> List[WordCorrection]:
         """Creates corrections for splitting a single word into multiple words."""
         corrections = []
-        for split_idx, ref_word in enumerate(reference_words):
+        # Generate word IDs if none provided
+        if corrected_word_ids is None:
+            corrected_word_ids = [WordUtils.generate_id() for _ in reference_words]
+        for split_idx, (ref_word, word_id) in enumerate(zip(reference_words, corrected_word_ids)):
             corrections.append(
                 WordCorrection(
                     original_word=original_word,
@@ -86,6 +117,9 @@ class WordOperations:
                     split_total=len(reference_words),
                     reference_positions=reference_positions,
                     length=1,  # Each split word is length 1
+                    handler=handler,
+                    word_id=WordUtils.generate_id(),  # Generate new ID for each split
+                    corrected_word_id=word_id,
                 )
             )
         return corrections
@@ -99,10 +133,16 @@ class WordOperations:
         confidence: float,
         combine_reason: str,
         delete_reason: str,
+        handler: str,
         reference_positions: Optional[Dict[str, int]] = None,
+        original_word_ids: Optional[List[str]] = None,
+        corrected_word_id: Optional[str] = None,
     ) -> List[WordCorrection]:
         """Creates corrections for combining multiple words into a single word."""
         corrections = []
+        word_ids = original_word_ids or [None] * len(original_words)
+        final_word_id = corrected_word_id or WordUtils.generate_id()
         # First word gets replaced
         corrections.append(
@@ -117,11 +157,14 @@ class WordOperations:
                 alternatives={},
                 reference_positions=reference_positions,
                 length=len(original_words),  # Combined word spans all original words
+                handler=handler,
+                word_id=WordUtils.generate_id(),  # Generate new ID for combined word
+                corrected_word_id=final_word_id,
             )
         )
         # Additional words get marked for deletion
-        for i, word in enumerate(original_words[1:], start=1):
+        for i, (word, word_id) in enumerate(zip(original_words[1:], word_ids[1:]), start=1):
             corrections.append(
                 WordCorrection(
                     original_word=word,
@@ -135,6 +178,9 @@ class WordOperations:
                     is_deletion=True,
                     reference_positions=reference_positions,
                     length=1,  # Deleted words are length 1
+                    handler=handler,
+                    word_id=WordUtils.generate_id(),  # Generate new ID for each deleted word
+                    corrected_word_id=None,  # Deleted words don't need a corrected ID
                 )
             )

lyrics_transcriber/correction/text_utils.py CHANGED Viewed

@@ -10,19 +10,15 @@ def clean_text(text: str) -> str:
     Returns:
         Cleaned text with:
         - All text converted to lowercase
-        - Hyphens and slashes converted to spaces
-        - All other punctuation removed
         - Multiple spaces/whitespace collapsed to single space
         - Leading/trailing whitespace removed
+        - Punctuation removed (except for internal hyphens/slashes in words)
     """
     # Convert to lowercase
     text = text.lower()
-    # Replace hyphens and slashes with spaces first
-    text = text.replace("-", " ").replace("/", " ")
-    # Remove remaining punctuation
-    text = re.sub(r"[^\w\s]", "", text)
+    # Remove punctuation except hyphens and slashes that are between word characters
+    text = re.sub(r"(?<!\w)[^\w\s]|[^\w\s](?!\w)", "", text)
     # Normalize whitespace (collapse multiple spaces, remove leading/trailing)
     text = " ".join(text.split())

lyrics_transcriber/frontend/.yarn/install-state.gz ADDED Viewed

Binary file

lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl

lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl