lyrics-transcriber 0.30.0__py3-none-any.whl → 0.32.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/__init__.py +2 -1
- lyrics_transcriber/cli/{main.py → cli_main.py} +47 -14
- lyrics_transcriber/core/config.py +35 -0
- lyrics_transcriber/core/controller.py +164 -166
- lyrics_transcriber/correction/anchor_sequence.py +471 -0
- lyrics_transcriber/correction/corrector.py +256 -0
- lyrics_transcriber/correction/handlers/__init__.py +0 -0
- lyrics_transcriber/correction/handlers/base.py +30 -0
- lyrics_transcriber/correction/handlers/extend_anchor.py +91 -0
- lyrics_transcriber/correction/handlers/levenshtein.py +147 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +98 -0
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +55 -0
- lyrics_transcriber/correction/handlers/repeat.py +71 -0
- lyrics_transcriber/correction/handlers/sound_alike.py +223 -0
- lyrics_transcriber/correction/handlers/syllables_match.py +182 -0
- lyrics_transcriber/correction/handlers/word_count_match.py +54 -0
- lyrics_transcriber/correction/handlers/word_operations.py +135 -0
- lyrics_transcriber/correction/phrase_analyzer.py +426 -0
- lyrics_transcriber/correction/text_utils.py +30 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +125 -0
- lyrics_transcriber/lyrics/genius.py +73 -0
- lyrics_transcriber/lyrics/spotify.py +82 -0
- lyrics_transcriber/output/ass/__init__.py +21 -0
- lyrics_transcriber/output/{ass.py → ass/ass.py} +150 -690
- lyrics_transcriber/output/ass/ass_specs.txt +732 -0
- lyrics_transcriber/output/ass/config.py +37 -0
- lyrics_transcriber/output/ass/constants.py +23 -0
- lyrics_transcriber/output/ass/event.py +94 -0
- lyrics_transcriber/output/ass/formatters.py +132 -0
- lyrics_transcriber/output/ass/lyrics_line.py +219 -0
- lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
- lyrics_transcriber/output/ass/section_detector.py +89 -0
- lyrics_transcriber/output/ass/section_screen.py +106 -0
- lyrics_transcriber/output/ass/style.py +187 -0
- lyrics_transcriber/output/cdg.py +503 -0
- lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
- lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
- lyrics_transcriber/output/cdgmaker/composer.py +1919 -0
- lyrics_transcriber/output/cdgmaker/config.py +151 -0
- lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
- lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
- lyrics_transcriber/output/cdgmaker/pack.py +507 -0
- lyrics_transcriber/output/cdgmaker/render.py +346 -0
- lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
- lyrics_transcriber/output/cdgmaker/utils.py +132 -0
- lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
- lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
- lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/arial.ttf +0 -0
- lyrics_transcriber/output/fonts/georgia.ttf +0 -0
- lyrics_transcriber/output/fonts/verdana.ttf +0 -0
- lyrics_transcriber/output/generator.py +140 -171
- lyrics_transcriber/output/lyrics_file.py +102 -0
- lyrics_transcriber/output/plain_text.py +91 -0
- lyrics_transcriber/output/segment_resizer.py +416 -0
- lyrics_transcriber/output/subtitles.py +328 -302
- lyrics_transcriber/output/video.py +219 -0
- lyrics_transcriber/review/__init__.py +1 -0
- lyrics_transcriber/review/server.py +138 -0
- lyrics_transcriber/storage/dropbox.py +110 -134
- lyrics_transcriber/transcribers/audioshake.py +171 -105
- lyrics_transcriber/transcribers/base_transcriber.py +149 -0
- lyrics_transcriber/transcribers/whisper.py +267 -133
- lyrics_transcriber/types.py +454 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/METADATA +14 -3
- lyrics_transcriber-0.32.1.dist-info/RECORD +86 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/WHEEL +1 -1
- lyrics_transcriber-0.32.1.dist-info/entry_points.txt +4 -0
- lyrics_transcriber/core/corrector.py +0 -56
- lyrics_transcriber/core/fetcher.py +0 -143
- lyrics_transcriber/storage/tokens.py +0 -116
- lyrics_transcriber/transcribers/base.py +0 -31
- lyrics_transcriber-0.30.0.dist-info/RECORD +0 -22
- lyrics_transcriber-0.30.0.dist-info/entry_points.txt +0 -3
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/LICENSE +0 -0
@@ -0,0 +1,256 @@
|
|
1
|
+
from typing import List, Optional, Tuple, Union
|
2
|
+
import logging
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
from lyrics_transcriber.correction.handlers.no_space_punct_match import NoSpacePunctuationMatchHandler
|
6
|
+
from lyrics_transcriber.correction.handlers.relaxed_word_count_match import RelaxedWordCountMatchHandler
|
7
|
+
from lyrics_transcriber.correction.handlers.syllables_match import SyllablesMatchHandler
|
8
|
+
from lyrics_transcriber.types import GapSequence, LyricsData, TranscriptionResult, CorrectionResult, LyricsSegment, WordCorrection, Word
|
9
|
+
from lyrics_transcriber.correction.anchor_sequence import AnchorSequenceFinder
|
10
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
11
|
+
from lyrics_transcriber.correction.handlers.word_count_match import WordCountMatchHandler
|
12
|
+
from lyrics_transcriber.correction.handlers.extend_anchor import ExtendAnchorHandler
|
13
|
+
from lyrics_transcriber.correction.handlers.sound_alike import SoundAlikeHandler
|
14
|
+
from lyrics_transcriber.correction.handlers.levenshtein import LevenshteinHandler
|
15
|
+
from lyrics_transcriber.correction.handlers.repeat import RepeatCorrectionHandler
|
16
|
+
|
17
|
+
|
18
|
+
class LyricsCorrector:
    """
    Coordinates the lyrics correction process using multiple correction handlers.

    The primary transcription is aligned against the reference lyrics to find
    anchor sequences (spans that already agree) and gap sequences (spans that
    differ).  Each gap is then passed through an ordered chain of
    GapCorrectionHandler instances until all of its words are corrected or no
    handler remains.
    """

    def __init__(
        self,
        cache_dir: Union[str, Path],
        handlers: Optional[List[GapCorrectionHandler]] = None,
        anchor_finder: Optional[AnchorSequenceFinder] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.logger = logger or logging.getLogger(__name__)
        self.anchor_finder = anchor_finder or AnchorSequenceFinder(cache_dir=cache_dir, logger=self.logger)

        # Default handlers in order of preference: exact/structural matchers
        # first, fuzzier phonetic and edit-distance matchers last.
        self.handlers = handlers or [
            WordCountMatchHandler(),
            RelaxedWordCountMatchHandler(),
            NoSpacePunctuationMatchHandler(),
            SyllablesMatchHandler(),
            ExtendAnchorHandler(),
            RepeatCorrectionHandler(),
            SoundAlikeHandler(),
            LevenshteinHandler(),
        ]

    def run(self, transcription_results: List[TranscriptionResult], lyrics_results: List[LyricsData]) -> CorrectionResult:
        """Execute the correction process.

        Args:
            transcription_results: Transcriptions to correct; the one with the
                lowest ``priority`` value is used as the primary transcription.
            lyrics_results: Reference lyrics fetched from one or more sources.

        Returns:
            A CorrectionResult bundling original and corrected segments, the
            individual word corrections, and alignment metadata.

        Raises:
            ValueError: If ``transcription_results`` is empty.
        """
        if not transcription_results:
            self.logger.error("No transcription results available")
            raise ValueError("No primary transcription data available")

        # Get primary transcription (lowest priority value wins)
        primary_transcription = sorted(transcription_results, key=lambda x: x.priority)[0].result
        transcribed_text = " ".join(" ".join(w.text for w in segment.words) for segment in primary_transcription.segments)
        reference_texts = {lyrics.source: lyrics.lyrics for lyrics in lyrics_results}

        # Find anchor sequences and gaps
        self.logger.debug("Finding anchor sequences and gaps")
        anchor_sequences = self.anchor_finder.find_anchors(transcribed_text, reference_texts)
        gap_sequences = self.anchor_finder.find_gaps(transcribed_text, anchor_sequences, reference_texts)

        # Process corrections
        corrections, corrected_segments = self._process_corrections(primary_transcription.segments, gap_sequences)

        # The correction ratio doubles as a rough confidence measure: fewer
        # corrections relative to total words -> higher confidence.
        total_words = sum(len(segment.words) for segment in corrected_segments)
        corrections_made = len(corrections)
        correction_ratio = 1 - (corrections_made / total_words if total_words > 0 else 0)

        return CorrectionResult(
            original_segments=primary_transcription.segments,
            corrected_segments=corrected_segments,
            corrected_text="\n".join(segment.text for segment in corrected_segments) + "\n",
            corrections=corrections,
            corrections_made=corrections_made,
            confidence=correction_ratio,
            transcribed_text=transcribed_text,
            reference_texts=reference_texts,
            anchor_sequences=anchor_sequences,
            resized_segments=[],
            gap_sequences=gap_sequences,
            metadata={
                "anchor_sequences_count": len(anchor_sequences),
                "gap_sequences_count": len(gap_sequences),
                "total_words": total_words,
                "correction_ratio": correction_ratio,
            },
        )

    def _preserve_formatting(self, original: str, new_word: str) -> str:
        """Preserve the original word's leading/trailing whitespace when applying a correction.

        Any amount of leading or trailing whitespace is normalized to a single
        space, matching the single-space-separated segment text format.
        """
        leading_space = " " if original != original.lstrip() else ""
        trailing_space = " " if original != original.rstrip() else ""
        return leading_space + new_word.strip() + trailing_space

    def _process_corrections(
        self, segments: List[LyricsSegment], gap_sequences: List[GapSequence]
    ) -> Tuple[List[WordCorrection], List[LyricsSegment]]:
        """Process corrections using handlers.

        The correction flow works as follows:
        1. First pass: Process all gaps
            - Iterate through each gap sequence
            - Try handlers until one can handle the gap
            - Store all corrections in the gap
        2. Second pass: Apply corrections to segments
            - Iterate through segments and words
            - Look up any corrections that apply to each word
            - Create new segments with corrected words

        This two-pass approach separates the concerns of:
        a) Finding and making corrections (gap-centric)
        b) Applying those corrections to the original text (segment-centric)

        Returns:
            Tuple of (all corrections made, corrected segments).
        """
        self.logger.info(f"Starting correction process with {len(gap_sequences)} gaps")

        # First pass: Process all gaps
        all_corrections = self._process_gaps(gap_sequences)

        # Second pass: Apply corrections to segments
        corrected_segments = self._apply_corrections_to_segments(segments, all_corrections)

        self.logger.info(f"Correction process complete. Made {len(all_corrections)} corrections")
        return all_corrections, corrected_segments

    def _process_gaps(self, gap_sequences: List[GapSequence]) -> List[WordCorrection]:
        """Process each gap using available handlers until all words are corrected or no handlers remain.

        Handlers run in priority order; once a word position has a correction,
        later handlers cannot overwrite it.
        """
        all_corrections = []

        for gap in gap_sequences:
            self.logger.debug(f"Processing gap: {gap.text}")
            corrected_positions = set()  # Track all corrected positions for this gap

            # Try each handler until gap is fully corrected
            for handler in self.handlers:
                # Positions in this gap that no handler has corrected yet
                uncorrected_positions = set(range(gap.transcription_position, gap.transcription_position + gap.length))
                uncorrected_positions -= corrected_positions

                if not uncorrected_positions:
                    self.logger.debug("All words have been corrected, skipping remaining handlers")
                    break

                self.logger.debug(f"Trying handler {handler.__class__.__name__}")

                # Pass previous corrections to RepeatCorrectionHandler
                if isinstance(handler, RepeatCorrectionHandler):
                    handler.set_previous_corrections(all_corrections)

                can_handle, handler_data = handler.can_handle(gap)
                if can_handle:
                    self.logger.debug(f"{handler.__class__.__name__} can handle gap")
                    # Only pass handler_data if it's not empty
                    corrections = handler.handle(gap, handler_data if handler_data else None)
                    if corrections:
                        # Apply corrections, keeping only the ones that target a
                        # position no earlier handler (or earlier correction from
                        # this handler) has already claimed.  Collecting the
                        # applied corrections directly avoids the previous bug
                        # where skipped (duplicate-position) corrections were
                        # still added to the overall correction list.
                        new_corrections = []
                        for correction in corrections:
                            if correction.original_position in corrected_positions:
                                continue

                            gap.add_correction(correction)
                            corrected_positions.add(correction.original_position)
                            new_corrections.append(correction)

                        if new_corrections:
                            self.logger.debug(
                                f"{handler.__class__.__name__} made {len(new_corrections)} corrections: "
                                f"{[f'{c.original_word}->{c.corrected_word}' for c in new_corrections]}"
                            )
                            all_corrections.extend(new_corrections)

            # Log remaining uncorrected words
            if not gap.is_fully_corrected:
                uncorrected = [word for pos, word in gap.uncorrected_words if pos not in corrected_positions]
                if uncorrected:
                    self.logger.debug(f"Uncorrected words remaining: {', '.join(uncorrected)}")

            if not gap.corrections:
                self.logger.warning("No handler could handle the gap")

        return all_corrections

    def _apply_corrections_to_segments(self, segments: List[LyricsSegment], corrections: List[WordCorrection]) -> List[LyricsSegment]:
        """Apply corrections to create new segments.

        Walks every word of every segment keeping a global word index, and
        replaces, splits, or deletes words according to the corrections that
        target that index.  Also back-fills each correction's
        ``corrected_position`` as the new words are laid out.
        """
        correction_map = {}
        # Group corrections by original_position to handle splits
        for c in corrections:
            if c.original_position not in correction_map:
                correction_map[c.original_position] = []
            correction_map[c.original_position].append(c)

        corrected_segments = []
        current_word_idx = 0  # global index into the original word stream

        for segment_idx, segment in enumerate(segments):
            corrected_words = []
            for word in segment.words:
                if current_word_idx in correction_map:
                    word_corrections = sorted(correction_map[current_word_idx], key=lambda x: x.split_index or 0)

                    # Check if any correction has a valid split_total
                    total_splits = next((c.split_total for c in word_corrections if c.split_total is not None), None)

                    if total_splits:
                        # Handle word split: divide the original word's time
                        # range evenly across the new words.
                        split_duration = (word.end_time - word.start_time) / total_splits

                        for i, correction in enumerate(word_corrections):
                            start_time = word.start_time + (i * split_duration)
                            end_time = start_time + split_duration

                            # Update corrected_position as we create new words
                            correction.corrected_position = len(corrected_words)
                            corrected_words.append(
                                Word(
                                    text=self._preserve_formatting(correction.original_word, correction.corrected_word),
                                    start_time=start_time,
                                    end_time=end_time,
                                    confidence=correction.confidence,
                                )
                            )
                    else:
                        # Handle single word replacement (deletions produce no word)
                        correction = word_corrections[0]
                        if not correction.is_deletion:
                            # Update corrected_position
                            correction.corrected_position = len(corrected_words)
                            corrected_words.append(
                                Word(
                                    text=self._preserve_formatting(correction.original_word, correction.corrected_word),
                                    start_time=word.start_time,
                                    end_time=word.end_time,
                                    confidence=correction.confidence,
                                )
                            )
                else:
                    corrected_words.append(word)
                current_word_idx += 1

            # Skip segments whose every word was deleted
            if corrected_words:
                corrected_segments.append(
                    LyricsSegment(
                        text=" ".join(w.text for w in corrected_words),
                        words=corrected_words,
                        start_time=segment.start_time,
                        end_time=segment.end_time,
                    )
                )

        return corrected_segments
|
File without changes
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import List, Optional, Tuple, Dict, Any
|
3
|
+
|
4
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
5
|
+
|
6
|
+
|
7
|
+
class GapCorrectionHandler(ABC):
    """Abstract interface implemented by every gap correction handler."""

    @abstractmethod
    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Check whether this handler is able to process the given gap.

        Returns:
            A ``(can_process, data)`` pair: ``can_process`` says whether this
            handler applies to the gap, and ``data`` carries anything computed
            here that ``handle()`` will need later (an empty dict when nothing
            needs to be passed along).
        """
        ...

    @abstractmethod
    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Produce corrections for a gap.

        Args:
            gap: The gap sequence to process.
            data: Optional dictionary previously returned by ``can_handle()``.
        """
        ...
|
@@ -0,0 +1,91 @@
|
|
1
|
+
from typing import List, Optional, Tuple, Dict, Any
|
2
|
+
|
3
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
4
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
5
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
6
|
+
|
7
|
+
|
8
|
+
class ExtendAnchorHandler(GapCorrectionHandler):
    """Handles gaps where some words match reference text but there are extra words.

    Applies when one or more gap words agree (case-insensitively) with the
    reference word at the same position in at least one source.  Each agreeing
    word is "validated" — a correction is emitted that keeps the word as-is —
    while non-matching words are left untouched for later handlers.

    The validation confidence is the fraction of reference sources that agree.

    Examples:
        Gap: "hello world extra words"
        References:
            genius:  ["hello", "world"]
            spotify: ["hello", "world"]
        Result:
            - Validate "hello" (confidence=1.0)
            - Validate "world" (confidence=1.0)
            - Leave "extra" and "words" unchanged

        Gap: "martyr youre a"
        References:
            genius:  ["martyr"]
            spotify: ["mother"]
        Result:
            - Validate "martyr" (confidence=0.5, source="genius")
            - Leave "youre" and "a" unchanged
    """

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        # Nothing to compare without reference words...
        if not gap.reference_words:
            return False, {}

        # ...and nothing to validate without gap words.
        if not gap.words:
            return False, {}

        # Look for at least one index where the transcribed word agrees
        # (case-insensitively) with some reference source at that same index.
        for source_words in gap.reference_words.values():
            overlap = min(len(gap.words), len(source_words))
            for idx in range(overlap):
                if gap.words[idx].lower() == source_words[idx].lower():
                    return True, {}

        return False, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        corrections = []

        for position, gap_word in enumerate(gap.words):
            # Collect every reference source whose word at this index agrees
            # with the transcribed word (case-insensitive comparison).
            agreeing_sources = []
            for source, source_words in gap.reference_words.items():
                if position < len(source_words) and gap_word.lower() == source_words[position].lower():
                    agreeing_sources.append(source)

            # Non-matching words are deliberately skipped so other handlers
            # can still attempt to correct them.
            if not agreeing_sources:
                continue

            # Confidence reflects how many of the sources agree on this word.
            confidence = len(agreeing_sources) / len(gap.reference_words)

            # Calculate reference positions for matching sources
            reference_positions = WordOperations.calculate_reference_positions(gap, agreeing_sources)

            corrections.append(
                WordOperations.create_word_replacement_correction(
                    original_word=gap_word,
                    corrected_word=gap_word,  # validation only: the word is kept
                    original_position=gap.transcription_position + position,
                    source=", ".join(agreeing_sources),
                    confidence=confidence,
                    reason="ExtendAnchorHandler: Matched reference source(s)",
                    reference_positions=reference_positions,
                )
            )

        return corrections
|
@@ -0,0 +1,147 @@
|
|
1
|
+
from typing import List, Optional, Tuple, Dict, Any
|
2
|
+
import string
|
3
|
+
import Levenshtein
|
4
|
+
import logging
|
5
|
+
|
6
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
7
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
8
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
9
|
+
|
10
|
+
|
11
|
+
class LevenshteinHandler(GapCorrectionHandler):
    """Corrects words by edit-distance similarity to reference words at the same position.

    The similarity score combines:
    1. The basic Levenshtein ratio
    2. A bonus when both words start with the same letter
    3. A penalty when they start with different letters
    4. A bonus for words of similar length

    The final correction confidence is the similarity multiplied by the
    fraction of reference sources that agree on the candidate word.

    Examples:
        Gap: "wold" (misspelling)
        References:
            genius:  ["world"]
            spotify: ["world"]
        Result:
            - Correct "wold" to "world" (high confidence: small edit distance,
              both sources agree)

        Gap: "worde" (misspelling)
        References:
            genius:  ["world"]
            spotify: ["words"]
        Result:
            - Correct "worde" to "world" (lower confidence: sources disagree)
    """

    def __init__(self, similarity_threshold: float = 0.65, logger: Optional[logging.Logger] = None):
        # Minimum adjusted similarity for a reference word to be considered.
        self.similarity_threshold = similarity_threshold
        self.logger = logger or logging.getLogger(__name__)

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Check if we can handle this gap - we'll try if there are reference words."""
        if not gap.reference_words:
            self.logger.debug("No reference words available")
            return False, {}

        if not gap.words:
            self.logger.debug("No gap words available")
            return False, {}

        # Applicable as soon as any gap word clears the similarity threshold
        # against the reference word at the same index in any source.
        for idx, gap_word in enumerate(gap.words):
            for source_words in gap.reference_words.values():
                if idx >= len(source_words):
                    continue
                score = self._get_string_similarity(gap_word, source_words[idx])
                if score >= self.similarity_threshold:
                    self.logger.debug(f"Found similar word: '{gap_word}' -> '{source_words[idx]}' ({score:.2f})")
                    return True, {}

        self.logger.debug("No words meet similarity threshold")
        return False, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Try to correct words based on string similarity."""
        corrections = []

        for idx, gap_word in enumerate(gap.words):
            # Empty / whitespace-only tokens carry nothing to correct.
            if not gap_word.strip():
                continue

            # Leave exact (case-insensitive) positional matches untouched.
            if any(idx < len(ref) and gap_word.lower() == ref[idx].lower() for ref in gap.reference_words.values()):
                self.logger.debug(f"Skipping exact match: '{gap_word}'")
                continue

            # Candidate original-cased word -> (agreeing sources, similarity).
            # NOTE: the stored similarity is that of the first source which
            # produced the candidate; later agreeing sources only extend the
            # source list (matches the established behavior).
            candidates = {}
            for source, ref_words in gap.reference_words.items():
                originals = gap.reference_words_original[source]  # original formatting
                if idx >= len(ref_words):
                    continue

                ref_word = ref_words[idx]
                ref_word_original = originals[idx]
                score = self._get_string_similarity(gap_word, ref_word)

                if score >= self.similarity_threshold:
                    self.logger.debug(f"Found match: '{gap_word}' -> '{ref_word}' ({score:.2f})")
                    entry = candidates.setdefault(ref_word_original, ([], score))
                    entry[0].append(source)

            if not candidates:
                continue

            # Best candidate: most agreeing sources, then highest similarity.
            best_word, (sources, score) = max(candidates.items(), key=lambda kv: (len(kv[1][0]), kv[1][1]))

            # Scale similarity by the fraction of sources that agree.
            final_confidence = score * (len(sources) / len(gap.reference_words))

            # Calculate reference positions for matching sources
            reference_positions = WordOperations.calculate_reference_positions(gap, sources)

            self.logger.debug(f"Creating correction: {gap_word} -> {best_word} (confidence: {final_confidence})")
            corrections.append(
                WordOperations.create_word_replacement_correction(
                    original_word=gap_word,
                    corrected_word=best_word,  # original formatted reference word
                    original_position=gap.transcription_position + idx,
                    source=", ".join(sources),
                    confidence=final_confidence,
                    reason=f"LevenshteinHandler: String similarity ({final_confidence:.2f})",
                    reference_positions=reference_positions,
                )
            )

        return corrections

    def _clean_word(self, word: str) -> str:
        """Lowercase and strip surrounding whitespace and punctuation for comparison."""
        return word.strip().lower().strip(string.punctuation)

    def _get_string_similarity(self, word1: str, word2: str) -> float:
        """Calculate string similarity using Levenshtein ratio with adjustments."""
        a = self._clean_word(word1)
        b = self._clean_word(word2)
        if not a or not b:
            return 0.0

        score = Levenshtein.ratio(a, b)

        # A shared first letter is a strong hint; differing first letters get
        # a mild penalty.
        if a[0] == b[0]:
            score = (score + 1) / 2
        else:
            score = score * 0.9

        # Blend in how close the two word lengths are.
        length_ratio = min(len(a), len(b)) / max(len(a), len(b))
        score = (score + length_ratio) / 2

        return score
|
@@ -0,0 +1,98 @@
|
|
1
|
+
from typing import List, Optional, Tuple, Dict, Any
|
2
|
+
import re
|
3
|
+
|
4
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
5
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
6
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
7
|
+
|
8
|
+
|
9
|
+
class NoSpacePunctuationMatchHandler(GapCorrectionHandler):
    """Handles gaps where reference text matches when spaces and punctuation are removed."""

    def _remove_spaces_and_punct(self, words: List[str]) -> str:
        """Concatenate words, lowercase, and drop every non-word character."""
        squashed = "".join(words).lower()
        # Strips all punctuation, apostrophes included; \w keeps letters,
        # digits and underscores.
        return re.sub(r"[^\w\s]", "", squashed)

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        # Without reference words there is nothing to compare against.
        if not gap.reference_words:
            return False, {}

        gap_text = self._remove_spaces_and_punct(gap.words)

        # A single source matching after normalization is enough.
        matched = any(self._remove_spaces_and_punct(words) == gap_text for words in gap.reference_words.values())
        return matched, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        corrections = []

        gap_text = self._remove_spaces_and_punct(gap.words)

        # Locate the first source whose normalized text equals the gap's
        # (can_handle guarantees at least one exists).
        matching_source = None
        reference_words = None
        reference_words_original = None
        for source, words in gap.reference_words.items():
            if self._remove_spaces_and_punct(words) == gap_text:
                matching_source = source
                reference_words = words
                reference_words_original = gap.reference_words_original[source]
                break

        # Calculate reference positions for the matching source
        reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])

        if len(gap.words) > len(reference_words):
            # More transcribed words than reference words: combine them.
            corrections.extend(
                WordOperations.create_word_combine_corrections(
                    original_words=gap.words,
                    reference_word=reference_words_original[0],
                    original_position=gap.transcription_position,
                    source=matching_source,
                    confidence=1.0,
                    combine_reason="NoSpacePunctuationMatchHandler: Words combined based on text match",
                    delete_reason="NoSpacePunctuationMatchHandler: Word removed as part of text match combination",
                    reference_positions=reference_positions,
                )
            )

        elif len(gap.words) < len(reference_words):
            # Fewer transcribed words than reference words: split into several.
            # NOTE(review): only gap.words[0] is split here — presumably such
            # gaps always contain a single word; confirm for multi-word gaps.
            corrections.extend(
                WordOperations.create_word_split_corrections(
                    original_word=gap.words[0],
                    reference_words=reference_words_original,
                    original_position=gap.transcription_position,
                    source=matching_source,
                    confidence=1.0,
                    reason="NoSpacePunctuationMatchHandler: Split word based on text match",
                    reference_positions=reference_positions,
                )
            )

        else:
            # Same word count: replace each word that differs (ignoring case).
            for offset, (orig_word, ref_word, ref_word_original) in enumerate(
                zip(gap.words, reference_words, reference_words_original)
            ):
                if orig_word.lower() == ref_word.lower():
                    continue
                corrections.append(
                    WordOperations.create_word_replacement_correction(
                        original_word=orig_word,
                        corrected_word=ref_word_original,
                        original_position=gap.transcription_position + offset,
                        source=matching_source,
                        confidence=1.0,
                        reason=f"NoSpacePunctuationMatchHandler: Source '{matching_source}' matched when spaces and punctuation removed",
                        reference_positions=reference_positions,
                    )
                )

        return corrections
|