lyrics-transcriber 0.30.0__py3-none-any.whl → 0.32.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. lyrics_transcriber/__init__.py +2 -1
  2. lyrics_transcriber/cli/{main.py → cli_main.py} +47 -14
  3. lyrics_transcriber/core/config.py +35 -0
  4. lyrics_transcriber/core/controller.py +164 -166
  5. lyrics_transcriber/correction/anchor_sequence.py +471 -0
  6. lyrics_transcriber/correction/corrector.py +256 -0
  7. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  8. lyrics_transcriber/correction/handlers/base.py +30 -0
  9. lyrics_transcriber/correction/handlers/extend_anchor.py +91 -0
  10. lyrics_transcriber/correction/handlers/levenshtein.py +147 -0
  11. lyrics_transcriber/correction/handlers/no_space_punct_match.py +98 -0
  12. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +55 -0
  13. lyrics_transcriber/correction/handlers/repeat.py +71 -0
  14. lyrics_transcriber/correction/handlers/sound_alike.py +223 -0
  15. lyrics_transcriber/correction/handlers/syllables_match.py +182 -0
  16. lyrics_transcriber/correction/handlers/word_count_match.py +54 -0
  17. lyrics_transcriber/correction/handlers/word_operations.py +135 -0
  18. lyrics_transcriber/correction/phrase_analyzer.py +426 -0
  19. lyrics_transcriber/correction/text_utils.py +30 -0
  20. lyrics_transcriber/lyrics/base_lyrics_provider.py +125 -0
  21. lyrics_transcriber/lyrics/genius.py +73 -0
  22. lyrics_transcriber/lyrics/spotify.py +82 -0
  23. lyrics_transcriber/output/ass/__init__.py +21 -0
  24. lyrics_transcriber/output/{ass.py → ass/ass.py} +150 -690
  25. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  26. lyrics_transcriber/output/ass/config.py +37 -0
  27. lyrics_transcriber/output/ass/constants.py +23 -0
  28. lyrics_transcriber/output/ass/event.py +94 -0
  29. lyrics_transcriber/output/ass/formatters.py +132 -0
  30. lyrics_transcriber/output/ass/lyrics_line.py +219 -0
  31. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  32. lyrics_transcriber/output/ass/section_detector.py +89 -0
  33. lyrics_transcriber/output/ass/section_screen.py +106 -0
  34. lyrics_transcriber/output/ass/style.py +187 -0
  35. lyrics_transcriber/output/cdg.py +503 -0
  36. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  37. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  38. lyrics_transcriber/output/cdgmaker/composer.py +1919 -0
  39. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  40. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  41. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  42. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  43. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  44. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  45. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  46. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  47. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  48. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  49. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  50. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  51. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  52. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  53. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  54. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  55. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  56. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  57. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  58. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  59. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  60. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  61. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  62. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  63. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  64. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  65. lyrics_transcriber/output/generator.py +140 -171
  66. lyrics_transcriber/output/lyrics_file.py +102 -0
  67. lyrics_transcriber/output/plain_text.py +91 -0
  68. lyrics_transcriber/output/segment_resizer.py +416 -0
  69. lyrics_transcriber/output/subtitles.py +328 -302
  70. lyrics_transcriber/output/video.py +219 -0
  71. lyrics_transcriber/review/__init__.py +1 -0
  72. lyrics_transcriber/review/server.py +138 -0
  73. lyrics_transcriber/storage/dropbox.py +110 -134
  74. lyrics_transcriber/transcribers/audioshake.py +171 -105
  75. lyrics_transcriber/transcribers/base_transcriber.py +149 -0
  76. lyrics_transcriber/transcribers/whisper.py +267 -133
  77. lyrics_transcriber/types.py +454 -0
  78. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/METADATA +14 -3
  79. lyrics_transcriber-0.32.1.dist-info/RECORD +86 -0
  80. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/WHEEL +1 -1
  81. lyrics_transcriber-0.32.1.dist-info/entry_points.txt +4 -0
  82. lyrics_transcriber/core/corrector.py +0 -56
  83. lyrics_transcriber/core/fetcher.py +0 -143
  84. lyrics_transcriber/storage/tokens.py +0 -116
  85. lyrics_transcriber/transcribers/base.py +0 -31
  86. lyrics_transcriber-0.30.0.dist-info/RECORD +0 -22
  87. lyrics_transcriber-0.30.0.dist-info/entry_points.txt +0 -3
  88. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/LICENSE +0 -0
@@ -0,0 +1,256 @@
1
+ from typing import List, Optional, Tuple, Union
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ from lyrics_transcriber.correction.handlers.no_space_punct_match import NoSpacePunctuationMatchHandler
6
+ from lyrics_transcriber.correction.handlers.relaxed_word_count_match import RelaxedWordCountMatchHandler
7
+ from lyrics_transcriber.correction.handlers.syllables_match import SyllablesMatchHandler
8
+ from lyrics_transcriber.types import GapSequence, LyricsData, TranscriptionResult, CorrectionResult, LyricsSegment, WordCorrection, Word
9
+ from lyrics_transcriber.correction.anchor_sequence import AnchorSequenceFinder
10
+ from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
11
+ from lyrics_transcriber.correction.handlers.word_count_match import WordCountMatchHandler
12
+ from lyrics_transcriber.correction.handlers.extend_anchor import ExtendAnchorHandler
13
+ from lyrics_transcriber.correction.handlers.sound_alike import SoundAlikeHandler
14
+ from lyrics_transcriber.correction.handlers.levenshtein import LevenshteinHandler
15
+ from lyrics_transcriber.correction.handlers.repeat import RepeatCorrectionHandler
16
+
17
+
18
class LyricsCorrector:
    """Coordinates the lyrics correction process using multiple correction handlers.

    The corrector locates "anchor" sequences (high-confidence matches between the
    transcription and the reference lyrics) and the "gap" sequences between them,
    then runs a chain of handlers over each gap to correct likely transcription
    errors. Handlers are tried in order of preference; positions already
    corrected by an earlier handler are never corrected again.
    """

    def __init__(
        self,
        cache_dir: Union[str, Path],
        handlers: Optional[List[GapCorrectionHandler]] = None,
        anchor_finder: Optional[AnchorSequenceFinder] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize the corrector.

        Args:
            cache_dir: Directory the anchor finder uses to cache its results.
            handlers: Optional override of the default handler chain.
            anchor_finder: Optional pre-configured anchor sequence finder.
            logger: Optional logger; a module-level logger is used when omitted.
        """
        self.logger = logger or logging.getLogger(__name__)
        self.anchor_finder = anchor_finder or AnchorSequenceFinder(cache_dir=cache_dir, logger=self.logger)

        # Default handlers in order of preference: exact/structural matchers
        # first, fuzzier heuristics (sound-alike, edit distance) last.
        self.handlers = handlers or [
            WordCountMatchHandler(),
            RelaxedWordCountMatchHandler(),
            NoSpacePunctuationMatchHandler(),
            SyllablesMatchHandler(),
            ExtendAnchorHandler(),
            RepeatCorrectionHandler(),
            SoundAlikeHandler(),
            LevenshteinHandler(),
        ]

    def run(self, transcription_results: List[TranscriptionResult], lyrics_results: List[LyricsData]) -> CorrectionResult:
        """Execute the correction process.

        Args:
            transcription_results: Transcription candidates; the one with the
                lowest ``priority`` value is used as the primary transcription.
            lyrics_results: Reference lyrics fetched from external sources.

        Returns:
            A CorrectionResult bundling original and corrected segments, the
            corrections made, and summary metadata.

        Raises:
            ValueError: If ``transcription_results`` is empty.
        """
        if not transcription_results:
            self.logger.error("No transcription results available")
            raise ValueError("No primary transcription data available")

        # Get primary transcription (lowest priority value wins)
        primary_transcription = sorted(transcription_results, key=lambda x: x.priority)[0].result
        transcribed_text = " ".join(" ".join(w.text for w in segment.words) for segment in primary_transcription.segments)
        reference_texts = {lyrics.source: lyrics.lyrics for lyrics in lyrics_results}

        # Find anchor sequences and gaps
        self.logger.debug("Finding anchor sequences and gaps")
        anchor_sequences = self.anchor_finder.find_anchors(transcribed_text, reference_texts)
        gap_sequences = self.anchor_finder.find_gaps(transcribed_text, anchor_sequences, reference_texts)

        # Process corrections
        corrections, corrected_segments = self._process_corrections(primary_transcription.segments, gap_sequences)

        # Correction ratio doubles as an overall confidence score: the fewer
        # words we had to change, the closer it is to 1.0.
        total_words = sum(len(segment.words) for segment in corrected_segments)
        corrections_made = len(corrections)
        correction_ratio = 1 - (corrections_made / total_words if total_words > 0 else 0)

        return CorrectionResult(
            original_segments=primary_transcription.segments,
            corrected_segments=corrected_segments,
            corrected_text="\n".join(segment.text for segment in corrected_segments) + "\n",
            corrections=corrections,
            corrections_made=corrections_made,
            confidence=correction_ratio,
            transcribed_text=transcribed_text,
            reference_texts=reference_texts,
            anchor_sequences=anchor_sequences,
            resized_segments=[],
            gap_sequences=gap_sequences,
            metadata={
                "anchor_sequences_count": len(anchor_sequences),
                "gap_sequences_count": len(gap_sequences),
                "total_words": total_words,
                "correction_ratio": correction_ratio,
            },
        )

    def _preserve_formatting(self, original: str, new_word: str) -> str:
        """Preserve the original word's leading/trailing whitespace on the corrected word."""
        leading_space = " " if original != original.lstrip() else ""
        trailing_space = " " if original != original.rstrip() else ""
        return leading_space + new_word.strip() + trailing_space

    def _process_corrections(
        self, segments: List[LyricsSegment], gap_sequences: List[GapSequence]
    ) -> Tuple[List[WordCorrection], List[LyricsSegment]]:
        """Process corrections using handlers.

        The correction flow works as follows:
        1. First pass: Process all gaps
           - Iterate through each gap sequence
           - Try handlers until one can handle the gap
           - Store all corrections in the gap
        2. Second pass: Apply corrections to segments
           - Iterate through segments and words
           - Look up any corrections that apply to each word
           - Create new segments with corrected words

        This two-pass approach separates the concerns of:
        a) Finding and making corrections (gap-centric)
        b) Applying those corrections to the original text (segment-centric)
        """
        self.logger.info(f"Starting correction process with {len(gap_sequences)} gaps")

        # First pass: Process all gaps
        all_corrections = self._process_gaps(gap_sequences)

        # Second pass: Apply corrections to segments
        corrected_segments = self._apply_corrections_to_segments(segments, all_corrections)

        self.logger.info(f"Correction process complete. Made {len(all_corrections)} corrections")
        return all_corrections, corrected_segments

    def _process_gaps(self, gap_sequences: List[GapSequence]) -> List[WordCorrection]:
        """Process each gap using available handlers until all words are corrected or no handlers remain."""
        all_corrections = []

        for gap in gap_sequences:
            self.logger.debug(f"Processing gap: {gap.text}")
            corrected_positions = set()  # All positions corrected in this gap so far

            # Try each handler until gap is fully corrected
            for handler in self.handlers:
                # Positions in this gap that no handler has corrected yet
                uncorrected_positions = set(range(gap.transcription_position, gap.transcription_position + gap.length))
                uncorrected_positions -= corrected_positions

                if not uncorrected_positions:
                    self.logger.debug("All words have been corrected, skipping remaining handlers")
                    break

                self.logger.debug(f"Trying handler {handler.__class__.__name__}")

                # The repeat handler needs to see corrections made so far
                if isinstance(handler, RepeatCorrectionHandler):
                    handler.set_previous_corrections(all_corrections)

                can_handle, handler_data = handler.can_handle(gap)
                if not can_handle:
                    continue

                self.logger.debug(f"{handler.__class__.__name__} can handle gap")
                # Only pass handler_data if it's not empty
                corrections = handler.handle(gap, handler_data if handler_data else None)

                # Apply corrections, skipping positions an earlier handler already
                # corrected. BUGFIX: the applied set was previously re-derived by
                # filtering against corrected_positions, which re-admitted the
                # skipped duplicates (their positions are in corrected_positions
                # from earlier handlers); track applied corrections directly.
                new_corrections = []
                for correction in corrections:
                    if correction.original_position in corrected_positions:
                        continue
                    gap.add_correction(correction)
                    corrected_positions.add(correction.original_position)
                    new_corrections.append(correction)

                if new_corrections:
                    self.logger.debug(
                        f"{handler.__class__.__name__} made {len(new_corrections)} corrections: "
                        f"{[f'{c.original_word}->{c.corrected_word}' for c in new_corrections]}"
                    )
                    all_corrections.extend(new_corrections)

            # Log remaining uncorrected words
            if not gap.is_fully_corrected:
                uncorrected = [word for pos, word in gap.uncorrected_words if pos not in corrected_positions]
                if uncorrected:
                    self.logger.debug(f"Uncorrected words remaining: {', '.join(uncorrected)}")

            if not gap.corrections:
                self.logger.warning("No handler could handle the gap")

        return all_corrections

    def _apply_corrections_to_segments(self, segments: List[LyricsSegment], corrections: List[WordCorrection]) -> List[LyricsSegment]:
        """Apply corrections to create new segments.

        Corrections are grouped by original word position so that a single
        transcribed word can be replaced, deleted, or split into several words.
        """
        # Group corrections by original_position to handle splits
        correction_map = {}
        for c in corrections:
            correction_map.setdefault(c.original_position, []).append(c)

        corrected_segments = []
        current_word_idx = 0  # Global word index across all segments

        for segment in segments:
            corrected_words = []
            for word in segment.words:
                if current_word_idx in correction_map:
                    word_corrections = sorted(correction_map[current_word_idx], key=lambda x: x.split_index or 0)

                    # Check if any correction has a valid split_total
                    total_splits = next((c.split_total for c in word_corrections if c.split_total is not None), None)

                    if total_splits:
                        # Handle word split: distribute the original word's
                        # duration evenly across the split words.
                        split_duration = (word.end_time - word.start_time) / total_splits

                        for i, correction in enumerate(word_corrections):
                            start_time = word.start_time + (i * split_duration)
                            end_time = start_time + split_duration

                            # Update corrected_position as we create new words
                            correction.corrected_position = len(corrected_words)
                            corrected_words.append(
                                Word(
                                    text=self._preserve_formatting(correction.original_word, correction.corrected_word),
                                    start_time=start_time,
                                    end_time=end_time,
                                    confidence=correction.confidence,
                                )
                            )
                    else:
                        # Handle single word replacement (deletions emit nothing)
                        correction = word_corrections[0]
                        if not correction.is_deletion:
                            # Update corrected_position
                            correction.corrected_position = len(corrected_words)
                            corrected_words.append(
                                Word(
                                    text=self._preserve_formatting(correction.original_word, correction.corrected_word),
                                    start_time=word.start_time,
                                    end_time=word.end_time,
                                    confidence=correction.confidence,
                                )
                            )
                else:
                    corrected_words.append(word)
                current_word_idx += 1

            if corrected_words:
                corrected_segments.append(
                    LyricsSegment(
                        text=" ".join(w.text for w in corrected_words),
                        words=corrected_words,
                        start_time=segment.start_time,
                        end_time=segment.end_time,
                    )
                )

        return corrected_segments
File without changes
@@ -0,0 +1,30 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Optional, Tuple, Dict, Any
3
+
4
+ from lyrics_transcriber.types import GapSequence, WordCorrection
5
+
6
+
7
class GapCorrectionHandler(ABC):
    """Abstract interface implemented by every gap correction handler."""

    @abstractmethod
    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Report whether this handler applies to ``gap``.

        Returns:
            A ``(can_handle, data)`` pair. ``data`` carries anything computed
            here that ``handle()`` will need again; it is an empty dict when
            nothing needs to be passed along.
        """
        ...

    @abstractmethod
    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Produce corrections for ``gap``.

        Args:
            gap: The gap sequence to process.
            data: The dictionary previously returned by ``can_handle()``, if any.
        """
        ...
@@ -0,0 +1,91 @@
1
+ from typing import List, Optional, Tuple, Dict, Any
2
+
3
+ from lyrics_transcriber.types import GapSequence, WordCorrection
4
+ from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
5
+ from lyrics_transcriber.correction.handlers.word_operations import WordOperations
6
+
7
+
8
class ExtendAnchorHandler(GapCorrectionHandler):
    """Validates gap words that line up with reference words at the same position.

    A gap qualifies when at least one of its words equals (case-insensitively)
    the word at the same index in any reference source. Each matching word is
    "validated": a correction is emitted that keeps the word unchanged. Words
    with no positional match are left untouched for later handlers.

    The confidence of a validation is the fraction of reference sources that
    agree — e.g. 2 of 4 sources matching gives confidence 0.5.

    Examples:
        Gap: "hello world extra words"
        References:
            genius:  ["hello", "world"]
            spotify: ["hello", "world"]
        Result: validate "hello" (confidence=1.0) and "world" (confidence=1.0);
                leave "extra" and "words" unchanged.

        Gap: "martyr youre a"
        References:
            genius:  ["martyr"]
            spotify: ["mother"]
        Result: validate "martyr" (confidence=0.5, source="genius");
                leave "youre" and "a" unchanged.
    """

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        # Need both gap words and reference words to compare positions.
        if not gap.reference_words or not gap.words:
            return False, {}

        # Qualify as soon as any position agrees with any reference source.
        for ref_words in gap.reference_words.values():
            overlap = min(len(gap.words), len(ref_words))
            if any(gap.words[idx].lower() == ref_words[idx].lower() for idx in range(overlap)):
                return True, {}
        return False, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        corrections = []

        for position, word in enumerate(gap.words):
            # Sources whose word at this index agrees with the gap word
            agreeing_sources = [
                source
                for source, ref_words in gap.reference_words.items()
                if position < len(ref_words) and word.lower() == ref_words[position].lower()
            ]
            if not agreeing_sources:
                continue  # non-matching words are left for other handlers

            confidence = len(agreeing_sources) / len(gap.reference_words)

            # Reference positions for the sources that agreed
            reference_positions = WordOperations.calculate_reference_positions(gap, agreeing_sources)

            corrections.append(
                WordOperations.create_word_replacement_correction(
                    original_word=word,
                    corrected_word=word,  # unchanged: this is a validation
                    original_position=gap.transcription_position + position,
                    source=", ".join(agreeing_sources),
                    confidence=confidence,
                    reason="ExtendAnchorHandler: Matched reference source(s)",
                    reference_positions=reference_positions,
                )
            )

        return corrections
@@ -0,0 +1,147 @@
1
+ from typing import List, Optional, Tuple, Dict, Any
2
+ import string
3
+ import Levenshtein
4
+ import logging
5
+
6
+ from lyrics_transcriber.types import GapSequence, WordCorrection
7
+ from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
8
+ from lyrics_transcriber.correction.handlers.word_operations import WordOperations
9
+
10
+
11
class LevenshteinHandler(GapCorrectionHandler):
    """Corrects gap words by edit-distance similarity to reference words.

    Each gap word is compared against the reference word at the same position.
    The similarity score combines:
    1. The basic Levenshtein ratio
    2. A bonus when both words start with the same letter
    3. A penalty when they start with different letters
    4. A bonus for words of similar length

    Examples:
        Gap: "wold" (misspelling)
        References:
            genius:  ["world"]
            spotify: ["world"]
        Result: correct "wold" to "world" (high confidence, sources agree).

        Gap: "worde" (misspelling)
        References:
            genius:  ["world"]
            spotify: ["words"]
        Result: correct "worde" to "world" (lower confidence, sources disagree).
    """

    def __init__(self, similarity_threshold: float = 0.65, logger: Optional[logging.Logger] = None):
        # Minimum adjusted similarity for a reference word to count as a match
        self.similarity_threshold = similarity_threshold
        self.logger = logger or logging.getLogger(__name__)

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Check if we can handle this gap - we'll try if there are reference words."""
        if not gap.reference_words:
            self.logger.debug("No reference words available")
            return False, {}

        if not gap.words:
            self.logger.debug("No gap words available")
            return False, {}

        # Qualify as soon as one word clears the similarity threshold
        for position, gap_word in enumerate(gap.words):
            for ref_words in gap.reference_words.values():
                if position >= len(ref_words):
                    continue
                score = self._get_string_similarity(gap_word, ref_words[position])
                if score >= self.similarity_threshold:
                    self.logger.debug(f"Found similar word: '{gap_word}' -> '{ref_words[position]}' ({score:.2f})")
                    return True, {}

        self.logger.debug("No words meet similarity threshold")
        return False, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Try to correct words based on string similarity."""
        corrections = []

        for position, gap_word in enumerate(gap.words):
            # Ignore empty / whitespace-only tokens
            if not gap_word.strip():
                continue

            # Leave exact (case-insensitive) matches alone
            if any(position < len(ref) and gap_word.lower() == ref[position].lower() for ref in gap.reference_words.values()):
                self.logger.debug(f"Skipping exact match: '{gap_word}'")
                continue

            # candidate original-cased word -> ([sources], similarity)
            candidates = {}
            for source, ref_words in gap.reference_words.items():
                originals = gap.reference_words_original[source]  # original formatting preserved
                if position >= len(ref_words):
                    continue

                score = self._get_string_similarity(gap_word, ref_words[position])
                if score < self.similarity_threshold:
                    continue

                self.logger.debug(f"Found match: '{gap_word}' -> '{ref_words[position]}' ({score:.2f})")
                # Key on the originally-formatted word so the correction keeps formatting
                entry = candidates.setdefault(originals[position], ([], score))
                entry[0].append(source)

            if not candidates:
                continue

            # Best candidate: most supporting sources, then highest similarity
            best_word, (sources, score) = max(candidates.items(), key=lambda item: (len(item[1][0]), item[1][1]))

            confidence = score * (len(sources) / len(gap.reference_words))

            # Reference positions for the supporting sources
            reference_positions = WordOperations.calculate_reference_positions(gap, sources)

            self.logger.debug(f"Creating correction: {gap_word} -> {best_word} (confidence: {confidence})")
            corrections.append(
                WordOperations.create_word_replacement_correction(
                    original_word=gap_word,
                    corrected_word=best_word,  # originally formatted word
                    original_position=gap.transcription_position + position,
                    source=", ".join(sources),
                    confidence=confidence,
                    reason=f"LevenshteinHandler: String similarity ({confidence:.2f})",
                    reference_positions=reference_positions,
                )
            )

        return corrections

    def _clean_word(self, word: str) -> str:
        """Lowercase and strip surrounding whitespace/punctuation for comparison."""
        return word.strip().lower().strip(string.punctuation)

    def _get_string_similarity(self, word1: str, word2: str) -> float:
        """Calculate string similarity using Levenshtein ratio with adjustments."""
        first, second = self._clean_word(word1), self._clean_word(word2)
        if not first or not second:
            return 0.0

        score = Levenshtein.ratio(first, second)

        if first[0] == second[0]:
            # Same first letter: boost toward 1.0
            score = (score + 1) / 2
        else:
            # Different first letters: mild penalty
            score = score * 0.9

        # Blend in a length-similarity bonus
        length_ratio = min(len(first), len(second)) / max(len(first), len(second))
        score = (score + length_ratio) / 2

        return score
@@ -0,0 +1,98 @@
1
+ from typing import List, Optional, Tuple, Dict, Any
2
+ import re
3
+
4
+ from lyrics_transcriber.types import GapSequence, WordCorrection
5
+ from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
6
+ from lyrics_transcriber.correction.handlers.word_operations import WordOperations
7
+
8
+
9
class NoSpacePunctuationMatchHandler(GapCorrectionHandler):
    """Handles gaps whose text equals a reference once spaces and punctuation are removed."""

    def _remove_spaces_and_punct(self, words: List[str]) -> str:
        """Concatenate words, lowercase, and strip all punctuation (incl. apostrophes)."""
        joined = "".join(words).lower()
        # \w keeps letters/digits/underscore; every other character is dropped
        return re.sub(r"[^\w\s]", "", joined)

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        # A reference is required to compare against
        if not gap.reference_words:
            return False, {}

        normalized_gap = self._remove_spaces_and_punct(gap.words)

        # Qualify if any source matches after normalization
        matched = any(normalized_gap == self._remove_spaces_and_punct(ref) for ref in gap.reference_words.values())
        return matched, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        corrections = []

        # Locate the first source whose normalized text matches the gap
        # (can_handle guarantees at least one exists).
        normalized_gap = self._remove_spaces_and_punct(gap.words)
        matching_source = None
        reference_words = None
        reference_words_original = None
        for source, ref in gap.reference_words.items():
            if self._remove_spaces_and_punct(ref) == normalized_gap:
                matching_source = source
                reference_words = ref
                reference_words_original = gap.reference_words_original[source]
                break

        # Reference positions for the matching source
        reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])

        gap_count, ref_count = len(gap.words), len(reference_words)

        if gap_count > ref_count:
            # More transcribed words than reference words: combine them.
            # NOTE(review): only the first reference word is used as the
            # combined target — confirm this is intended when ref_count > 1.
            corrections.extend(
                WordOperations.create_word_combine_corrections(
                    original_words=gap.words,
                    reference_word=reference_words_original[0],
                    original_position=gap.transcription_position,
                    source=matching_source,
                    confidence=1.0,
                    combine_reason="NoSpacePunctuationMatchHandler: Words combined based on text match",
                    delete_reason="NoSpacePunctuationMatchHandler: Word removed as part of text match combination",
                    reference_positions=reference_positions,
                )
            )

        elif gap_count < ref_count:
            # Fewer transcribed words than reference words: split the first
            # gap word. NOTE(review): assumes the gap holds a single word —
            # confirm behavior when gap_count > 1.
            corrections.extend(
                WordOperations.create_word_split_corrections(
                    original_word=gap.words[0],
                    reference_words=reference_words_original,
                    original_position=gap.transcription_position,
                    source=matching_source,
                    confidence=1.0,
                    reason="NoSpacePunctuationMatchHandler: Split word based on text match",
                    reference_positions=reference_positions,
                )
            )

        else:
            # Same word count: replace each word that differs (case-insensitive)
            for offset, (orig_word, ref_word, ref_word_original) in enumerate(
                zip(gap.words, reference_words, reference_words_original)
            ):
                if orig_word.lower() == ref_word.lower():
                    continue
                corrections.append(
                    WordOperations.create_word_replacement_correction(
                        original_word=orig_word,
                        corrected_word=ref_word_original,
                        original_position=gap.transcription_position + offset,
                        source=matching_source,
                        confidence=1.0,
                        reason=f"NoSpacePunctuationMatchHandler: Source '{matching_source}' matched when spaces and punctuation removed",
                        reference_positions=reference_positions,
                    )
                )

        return corrections