lyrics-transcriber 0.30.0__py3-none-any.whl → 0.32.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/__init__.py +2 -1
- lyrics_transcriber/cli/{main.py → cli_main.py} +47 -14
- lyrics_transcriber/core/config.py +35 -0
- lyrics_transcriber/core/controller.py +164 -166
- lyrics_transcriber/correction/anchor_sequence.py +471 -0
- lyrics_transcriber/correction/corrector.py +256 -0
- lyrics_transcriber/correction/handlers/__init__.py +0 -0
- lyrics_transcriber/correction/handlers/base.py +30 -0
- lyrics_transcriber/correction/handlers/extend_anchor.py +91 -0
- lyrics_transcriber/correction/handlers/levenshtein.py +147 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +98 -0
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +55 -0
- lyrics_transcriber/correction/handlers/repeat.py +71 -0
- lyrics_transcriber/correction/handlers/sound_alike.py +223 -0
- lyrics_transcriber/correction/handlers/syllables_match.py +182 -0
- lyrics_transcriber/correction/handlers/word_count_match.py +54 -0
- lyrics_transcriber/correction/handlers/word_operations.py +135 -0
- lyrics_transcriber/correction/phrase_analyzer.py +426 -0
- lyrics_transcriber/correction/text_utils.py +30 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +125 -0
- lyrics_transcriber/lyrics/genius.py +73 -0
- lyrics_transcriber/lyrics/spotify.py +82 -0
- lyrics_transcriber/output/ass/__init__.py +21 -0
- lyrics_transcriber/output/{ass.py → ass/ass.py} +150 -690
- lyrics_transcriber/output/ass/ass_specs.txt +732 -0
- lyrics_transcriber/output/ass/config.py +37 -0
- lyrics_transcriber/output/ass/constants.py +23 -0
- lyrics_transcriber/output/ass/event.py +94 -0
- lyrics_transcriber/output/ass/formatters.py +132 -0
- lyrics_transcriber/output/ass/lyrics_line.py +219 -0
- lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
- lyrics_transcriber/output/ass/section_detector.py +89 -0
- lyrics_transcriber/output/ass/section_screen.py +106 -0
- lyrics_transcriber/output/ass/style.py +187 -0
- lyrics_transcriber/output/cdg.py +503 -0
- lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
- lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
- lyrics_transcriber/output/cdgmaker/composer.py +1919 -0
- lyrics_transcriber/output/cdgmaker/config.py +151 -0
- lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
- lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
- lyrics_transcriber/output/cdgmaker/pack.py +507 -0
- lyrics_transcriber/output/cdgmaker/render.py +346 -0
- lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
- lyrics_transcriber/output/cdgmaker/utils.py +132 -0
- lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
- lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
- lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/arial.ttf +0 -0
- lyrics_transcriber/output/fonts/georgia.ttf +0 -0
- lyrics_transcriber/output/fonts/verdana.ttf +0 -0
- lyrics_transcriber/output/generator.py +140 -171
- lyrics_transcriber/output/lyrics_file.py +102 -0
- lyrics_transcriber/output/plain_text.py +91 -0
- lyrics_transcriber/output/segment_resizer.py +416 -0
- lyrics_transcriber/output/subtitles.py +328 -302
- lyrics_transcriber/output/video.py +219 -0
- lyrics_transcriber/review/__init__.py +1 -0
- lyrics_transcriber/review/server.py +138 -0
- lyrics_transcriber/storage/dropbox.py +110 -134
- lyrics_transcriber/transcribers/audioshake.py +171 -105
- lyrics_transcriber/transcribers/base_transcriber.py +149 -0
- lyrics_transcriber/transcribers/whisper.py +267 -133
- lyrics_transcriber/types.py +454 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/METADATA +14 -3
- lyrics_transcriber-0.32.1.dist-info/RECORD +86 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/WHEEL +1 -1
- lyrics_transcriber-0.32.1.dist-info/entry_points.txt +4 -0
- lyrics_transcriber/core/corrector.py +0 -56
- lyrics_transcriber/core/fetcher.py +0 -143
- lyrics_transcriber/storage/tokens.py +0 -116
- lyrics_transcriber/transcribers/base.py +0 -31
- lyrics_transcriber-0.30.0.dist-info/RECORD +0 -22
- lyrics_transcriber-0.30.0.dist-info/entry_points.txt +0 -3
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/LICENSE +0 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
from typing import List, Tuple, Dict, Any, Optional
|
2
|
+
|
3
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
4
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
5
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
6
|
+
|
7
|
+
|
8
|
+
class RelaxedWordCountMatchHandler(GapCorrectionHandler):
    """Handles gaps where at least one reference source has matching word count."""

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Return (True, {}) when any reference source has exactly gap.length words."""
        # Must have reference words
        if not gap.reference_words:
            return False, {}

        # Check if any source has matching word count
        for words in gap.reference_words.values():
            if len(words) == gap.length:
                return True, {}

        return False, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Correct the gap using the first reference source whose word count matches.

        Args:
            gap: The gap sequence to correct.
            data: Unused; present for interface consistency with other handlers.

        Returns:
            One replacement correction per word that differs (case-insensitively)
            from the matching source. Returns an empty list when no source has a
            matching word count (previously this crashed on zip(None, ...)).
        """
        corrections: List[WordCorrection] = []

        # Find the first source that has matching word count
        matching_source = None
        reference_words = None
        reference_words_original = None
        for source, words in gap.reference_words.items():
            if len(words) == gap.length:
                matching_source = source
                reference_words = words
                reference_words_original = gap.reference_words_original[source]
                break

        # Guard: handle() may be invoked without a successful can_handle() first.
        # Without this check the zip() below would raise TypeError on None.
        if matching_source is None:
            return corrections

        # Use the centralized method to calculate reference positions for the matching source
        reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])

        # Since we found a source with matching word count, we can correct using that source
        for i, (orig_word, ref_word, ref_word_original) in enumerate(
            zip(gap.words, reference_words, reference_words_original)
        ):
            # Only words that differ (case-insensitively) need a correction
            if orig_word.lower() != ref_word.lower():
                corrections.append(
                    WordOperations.create_word_replacement_correction(
                        original_word=orig_word,
                        corrected_word=ref_word_original,
                        original_position=gap.transcription_position + i,
                        source=matching_source,
                        confidence=1.0,
                        reason=f"RelaxedWordCountMatchHandler: Source '{matching_source}' had matching word count",
                        reference_positions=reference_positions,
                    )
                )

        return corrections
|
@@ -0,0 +1,71 @@
|
|
1
|
+
from typing import List, Dict, Optional, Tuple, Any
|
2
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
3
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
4
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
5
|
+
import logging
|
6
|
+
|
7
|
+
|
8
|
+
class RepeatCorrectionHandler(GapCorrectionHandler):
    """Handler that applies corrections that were previously made by other handlers."""

    def __init__(self, logger: Optional[logging.Logger] = None, confidence_threshold: float = 0.7):
        self.logger = logger if logger is not None else logging.getLogger(__name__)
        # Only prior corrections at or above this confidence are reused.
        self.confidence_threshold = confidence_threshold
        # Populated between handler passes via set_previous_corrections().
        self.previous_corrections: List[WordCorrection] = []

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Applicable whenever earlier handler passes have produced any corrections."""
        has_history = len(self.previous_corrections) > 0
        return has_history, {}

    def set_previous_corrections(self, corrections: List[WordCorrection]) -> None:
        """Store corrections from previous handlers to use as reference."""
        self.previous_corrections = corrections

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Re-apply confident earlier corrections to any matching words in this gap."""
        # Reference positions across all sources (centralized calculation).
        reference_positions = WordOperations.calculate_reference_positions(gap)

        # Group confident prior corrections by the lowercased word they replaced.
        by_original: Dict[str, List[WordCorrection]] = {}
        for prior in self.previous_corrections:
            if prior.confidence >= self.confidence_threshold:
                by_original.setdefault(prior.original_word.lower(), []).append(prior)

        corrections: List[WordCorrection] = []
        for position, word in enumerate(gap.words):
            candidates = by_original.get(word.lower())
            if not candidates:
                continue

            # Prefer the replacement chosen most often; break ties by confidence.
            tally: Dict[str, int] = {}
            for candidate in candidates:
                tally[candidate.corrected_word] = tally.get(candidate.corrected_word, 0) + 1
            best_correction = max(candidates, key=lambda c: (tally[c.corrected_word], c.confidence))

            self.logger.debug(
                f"Applying previous correction: {word} -> {best_correction.corrected_word} "
                f"(confidence: {best_correction.confidence:.2f})"
            )

            corrections.append(
                WordCorrection(
                    original_word=word,
                    corrected_word=best_correction.corrected_word,
                    segment_index=0,
                    original_position=gap.transcription_position + position,
                    confidence=best_correction.confidence * 0.9,  # Slightly lower confidence for repeats
                    source=best_correction.source,
                    reason="RepeatCorrectionHandler: Matches previous correction",
                    alternatives={best_correction.corrected_word: 1},
                    is_deletion=best_correction.is_deletion,
                    reference_positions=reference_positions,  # Add reference positions
                    length=best_correction.length,  # Preserve length from original correction
                    split_index=best_correction.split_index,  # Preserve split info if present
                    split_total=best_correction.split_total,  # Preserve split info if present
                )
            )

        return corrections
|
@@ -0,0 +1,223 @@
|
|
1
|
+
from typing import List, Dict, Tuple, Optional, Any
|
2
|
+
import logging
|
3
|
+
from metaphone import doublemetaphone
|
4
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
5
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
6
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
7
|
+
|
8
|
+
|
9
|
+
class SoundAlikeHandler(GapCorrectionHandler):
    """Handles gaps where words sound similar to reference words but are spelled differently.

    Uses Double Metaphone algorithm to detect sound-alike words. For each word in the gap,
    it checks if its phonetic encoding matches any reference word's encoding.

    The confidence of corrections is based on:
    1. The ratio of reference sources agreeing on the correction
    2. Whether the match was on primary (1.0) or secondary (0.8) metaphone code

    Examples:
        Gap: "shush look deep"
        References:
            genius: ["search", "look", "deep"]
            spotify: ["search", "look", "deep"]
        Result:
            - Correct "shush" to "search" (confidence based on metaphone match type)
            - Validate "look" and "deep" (exact matches)
    """

    def __init__(self, logger: Optional[logging.Logger] = None, similarity_threshold: float = 0.6):
        """Initialize the handler.

        Args:
            logger: Optional logger instance
            similarity_threshold: Minimum confidence threshold for matches (default: 0.6)
        """
        self.logger = logger or logging.getLogger(__name__)
        self.similarity_threshold = similarity_threshold

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Return (True, {}) if any gap word has a metaphone match with any reference word.

        Scans every (gap word, reference word) pair and stops at the first phonetic
        match. Note that _codes_match returns a float; any non-zero match quality
        is treated as truthy here.
        """
        # Must have reference words
        if not gap.reference_words:
            self.logger.debug("No reference words available")
            return False, {}

        # Gap must have words
        if not gap.words:
            self.logger.debug("No gap words available")
            return False, {}

        # Check if any gap word has a metaphone match with any reference word
        for word in gap.words:
            word_codes = doublemetaphone(word)
            self.logger.debug(f"Gap word '{word}' has metaphone codes: {word_codes}")
            for ref_words in gap.reference_words.values():
                for ref_word in ref_words:
                    ref_codes = doublemetaphone(ref_word)
                    self.logger.debug(f"Reference word '{ref_word}' has metaphone codes: {ref_codes}")
                    if self._codes_match(word_codes, ref_codes):
                        self.logger.debug(f"Found metaphone match between '{word}' and '{ref_word}'")
                        return True, {}
        self.logger.debug("No metaphone matches found")
        return False, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Produce at most one sound-alike replacement per non-exact-match gap word.

        For each gap word, candidate reference words passing the similarity
        threshold are collected per source; the best candidate is the one backed
        by the most sources (ties broken by confidence). Final confidence is the
        match confidence scaled by the fraction of sources that agree.
        """
        corrections = []

        # Use the centralized method to calculate reference positions for all sources
        reference_positions = WordOperations.calculate_reference_positions(gap)

        # For each word in the gap
        for i, word in enumerate(gap.words):
            word_codes = doublemetaphone(word)
            self.logger.debug(f"Processing '{word}' (codes: {word_codes})")

            # Skip if word exactly matches any reference at the same position
            exact_match = any(i < len(ref_words) and word.lower() == ref_words[i].lower() for ref_words in gap.reference_words.values())
            if exact_match:
                continue

            # Find sound-alike matches in references.
            # Keyed by the original-formatted reference word; value is
            # (list of agreeing sources, adjusted confidence of first match).
            matches: Dict[str, Tuple[List[str], float]] = {}

            for source, ref_words in gap.reference_words.items():
                ref_words_original = gap.reference_words_original[source]  # Get original formatted words
                for j, (ref_word, ref_word_original) in enumerate(zip(ref_words, ref_words_original)):
                    ref_codes = doublemetaphone(ref_word)

                    match_confidence = self._get_match_confidence(word_codes, ref_codes)
                    if match_confidence >= self.similarity_threshold:
                        # Special handling for short codes - don't apply position penalty
                        is_short_code = any(len(c) <= 2 for c in word_codes if c) or any(len(c) <= 2 for c in ref_codes if c)
                        # Penalize matches at a different gap/reference index (0.8x),
                        # unless either code is short.
                        position_multiplier = 1.0 if is_short_code or i == j else 0.8

                        adjusted_confidence = match_confidence * position_multiplier

                        if adjusted_confidence >= self.similarity_threshold:
                            if ref_word_original not in matches:  # Use original formatted word as key
                                matches[ref_word_original] = ([], adjusted_confidence)
                            matches[ref_word_original][0].append(source)

            # Create correction for best match if any found
            if matches:
                # Rank by (number of agreeing sources, confidence).
                best_match, (sources, base_confidence) = max(matches.items(), key=lambda x: (len(x[1][0]), x[1][1]))

                # Scale by the fraction of sources that agree on this replacement.
                source_confidence = len(sources) / len(gap.reference_words)
                final_confidence = base_confidence * source_confidence

                self.logger.debug(f"Found match: {word} -> {best_match} (confidence: {final_confidence:.2f}, sources: {sources})")
                corrections.append(
                    WordCorrection(
                        original_word=word,
                        corrected_word=best_match,  # Already using original formatted word
                        segment_index=0,
                        original_position=gap.transcription_position + i,
                        confidence=final_confidence,
                        source=", ".join(sources),
                        reason=f"SoundAlikeHandler: Phonetic match ({final_confidence:.2f} confidence)",
                        alternatives={k: len(v[0]) for k, v in matches.items()},
                        is_deletion=False,
                        reference_positions=reference_positions,  # Add reference positions
                        length=1,  # Single word replacement
                    )
                )

        return corrections

    def _codes_match(self, codes1: Tuple[str, str], codes2: Tuple[str, str]) -> float:
        """Check if two sets of metaphone codes match and return match quality.

        Returns 0.0 for no match, up to 1.0 for an exact code match. Compares
        every non-empty code from one tuple against every non-empty code from
        the other and keeps the best quality found. The branch order below is
        significant: each comparison path ends in `continue` so earlier,
        stronger heuristics take precedence for a given code pair.
        """
        # Get all non-empty codes
        codes1_set = {c for c in codes1 if c}
        codes2_set = {c for c in codes2 if c}

        if not codes1_set or not codes2_set:
            return 0.0

        best_match = 0.0
        for code1 in codes1_set:
            for code2 in codes2_set:
                # Special case for very short codes (like 'A' for 'you')
                if len(code1) <= 2 or len(code2) <= 2:
                    if code1 == code2:
                        best_match = max(best_match, 1.0)
                    elif code1 in code2 or code2 in code1:
                        best_match = max(best_match, 0.8)
                    elif code1[0] == code2[0]:  # Match first character
                        best_match = max(best_match, 0.7)
                    continue

                # Skip if codes are too different in length
                length_diff = abs(len(code1) - len(code2))
                if length_diff > 3:
                    continue

                # Exact match
                if code1 == code2:
                    best_match = max(best_match, 1.0)
                    continue

                # Similar codes (allow 1-2 character differences)
                if len(code1) >= 2 and len(code2) >= 2:
                    # Compare first N characters where N is min length
                    min_len = min(len(code1), len(code2))

                    # Check for shared characters in any position
                    shared_chars = sum(1 for c in code1 if c in code2)
                    if shared_chars >= min(2, min_len):  # More lenient shared character requirement
                        match_quality = 0.7 + (0.1 * shared_chars / max(len(code1), len(code2)))
                        best_match = max(best_match, match_quality)
                        continue

                    # Compare aligned characters
                    differences = sum(1 for a, b in zip(code1[:min_len], code2[:min_len]) if a != b)
                    if differences <= 2:
                        match_quality = 0.85 - (differences * 0.1)
                        best_match = max(best_match, match_quality)
                        continue

                # Common prefix/suffix match with more lenient threshold
                common_prefix_len = 0
                for a, b in zip(code1, code2):
                    if a != b:
                        break
                    common_prefix_len += 1

                common_suffix_len = 0
                for a, b in zip(code1[::-1], code2[::-1]):
                    if a != b:
                        break
                    common_suffix_len += 1

                if common_prefix_len >= 1 or common_suffix_len >= 1:  # Even more lenient prefix/suffix requirement
                    match_quality = 0.7 + (0.1 * max(common_prefix_len, common_suffix_len))
                    best_match = max(best_match, match_quality)
                    continue

                # Substring match
                if len(code1) >= 2 and len(code2) >= 2:  # More lenient length requirement
                    # Look for shared substrings of length 2 or more
                    for length in range(min(len(code1), len(code2)), 1, -1):
                        for i in range(len(code1) - length + 1):
                            substring = code1[i : i + length]
                            if substring in code2:
                                match_quality = 0.7 + (0.1 * length / max(len(code1), len(code2)))
                                best_match = max(best_match, match_quality)
                                break

        return best_match

    def _get_match_confidence(self, codes1: Tuple[str, str], codes2: Tuple[str, str]) -> float:
        """Calculate confidence score for a metaphone code match.

        Wraps _codes_match and adds a small (+0.1, capped at 1.0) boost when the
        primary codes share their first two characters.
        """
        match_quality = self._codes_match(codes1, codes2)
        if match_quality == 0:
            return 0.0

        # Get primary codes (first code of each tuple)
        code1, code2 = codes1[0], codes2[0]

        # Boost confidence for codes that share prefixes
        if code1 and code2 and len(code1) >= 2 and len(code2) >= 2:
            if code1[:2] == code2[:2]:
                match_quality = min(1.0, match_quality + 0.1)

        return match_quality
|
@@ -0,0 +1,182 @@
|
|
1
|
+
from typing import List, Tuple, Dict, Any, Optional
|
2
|
+
import spacy
|
3
|
+
import logging
|
4
|
+
import pyphen
|
5
|
+
import nltk
|
6
|
+
from nltk.corpus import cmudict
|
7
|
+
import syllables
|
8
|
+
from spacy_syllables import SpacySyllables
|
9
|
+
|
10
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
11
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
12
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
13
|
+
|
14
|
+
|
15
|
+
class SyllablesMatchHandler(GapCorrectionHandler):
    """Handles gaps where number of syllables in reference text matches number of syllables in transcription."""

    def __init__(self):
        # Marking SpacySyllables as used to prevent unused import warning
        _ = SpacySyllables
        # Load spacy model with syllables pipeline
        self.nlp = spacy.load("en_core_web_sm")
        # Add syllables component to pipeline if not already present
        if "syllables" not in self.nlp.pipe_names:
            self.nlp.add_pipe("syllables", after="tagger")
        # Initialize Pyphen for English
        self.dic = pyphen.Pyphen(lang="en_US")
        # Initialize NLTK's CMU dictionary, downloading the corpus on first use
        try:
            self.cmudict = cmudict.dict()
        except LookupError:
            nltk.download("cmudict")
            self.cmudict = cmudict.dict()
        self.logger = logging.getLogger(__name__)

    def _count_syllables_spacy(self, words: List[str]) -> int:
        """Count syllables using spacy_syllables (tokens without a count contribute 1)."""
        text = " ".join(words)
        doc = self.nlp(text)
        total_syllables = sum(token._.syllables_count or 1 for token in doc)
        return total_syllables

    def _count_syllables_pyphen(self, words: List[str]) -> int:
        """Count syllables using pyphen hyphenation points."""
        total_syllables = 0
        for word in words:
            hyphenated = self.dic.inserted(word)
            syllables_count = len(hyphenated.split("-")) if hyphenated else 1
            total_syllables += syllables_count
        return total_syllables

    def _count_syllables_nltk(self, words: List[str]) -> int:
        """Count syllables using NLTK's CMU dictionary (vowel phonemes carry a stress digit)."""
        total_syllables = 0
        for word in words:
            word = word.lower()
            if word in self.cmudict:
                syllables_count = len([ph for ph in self.cmudict[word][0] if ph[-1].isdigit()])
                total_syllables += syllables_count
            else:
                # Unknown words count as one syllable
                total_syllables += 1
        return total_syllables

    def _count_syllables_lib(self, words: List[str]) -> int:
        """Count syllables using the syllables library's estimator."""
        total_syllables = 0
        for word in words:
            total_syllables += syllables.estimate(word)
        return total_syllables

    def _count_syllables(self, words: List[str]) -> List[int]:
        """Count syllables using multiple methods.

        Returns:
            [spacy_count, pyphen_count, nltk_count, syllables_count] — the four
            estimators often disagree, so a match on any of them is accepted.
        """
        spacy_count = self._count_syllables_spacy(words)
        pyphen_count = self._count_syllables_pyphen(words)
        nltk_count = self._count_syllables_nltk(words)
        syllables_count = self._count_syllables_lib(words)

        text = " ".join(words)
        self.logger.debug(
            f"Syllable counts for '{text}': spacy={spacy_count}, pyphen={pyphen_count}, nltk={nltk_count}, syllables={syllables_count}"
        )
        return [spacy_count, pyphen_count, nltk_count, syllables_count]

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Return (True, match-data) if any reference source's syllable count matches the gap's.

        The returned dict carries the matching source and its word lists so
        handle() does not need to recompute the (expensive) syllable counts.
        """
        # Must have reference words
        if not gap.reference_words:
            self.logger.debug("No reference words available")
            return False, {}

        # Get syllable counts for gap text using different methods
        gap_syllables = self._count_syllables(gap.words)

        # Check if any reference source has matching syllable count with any method
        for source, words in gap.reference_words.items():
            ref_syllables = self._count_syllables(words)

            # If any counting method matches between gap and reference, we can handle it
            if any(gap_count == ref_count for gap_count in gap_syllables for ref_count in ref_syllables):
                self.logger.debug(f"Found matching syllable count in source '{source}'")
                return True, {
                    "gap_syllables": gap_syllables,
                    "matching_source": source,
                    "reference_words": words,
                    "reference_words_original": gap.reference_words_original[source],
                }

        self.logger.debug("No reference source had matching syllable count")
        return False, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Correct the gap against the syllable-matching source found by can_handle().

        Args:
            gap: The gap sequence to correct.
            data: Match data from can_handle(). May now be omitted (consistent
                with the other handlers' signatures); it is recomputed when None,
                and an empty list is returned if no source matches.

        Returns:
            Combine corrections (more gap words than reference words), split
            corrections (fewer), or one-to-one replacements (equal counts).
        """
        # Recompute match data when not supplied, instead of crashing on data[...].
        if data is None:
            can_handle, data = self.can_handle(gap)
            if not can_handle:
                return []

        corrections = []
        matching_source = data["matching_source"]
        reference_words = data["reference_words"]
        reference_words_original = data["reference_words_original"]

        # Use the centralized method to calculate reference positions
        reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])

        # Since we matched syllable counts for the entire gap, we should handle all words
        if len(gap.words) > len(reference_words):
            # Multiple transcribed words -> fewer reference words
            # Try to distribute the reference words across the gap words
            words_per_ref = len(gap.words) / len(reference_words)

            for ref_idx, ref_word_original in enumerate(reference_words_original):
                start_idx = int(ref_idx * words_per_ref)
                end_idx = int((ref_idx + 1) * words_per_ref)

                # Get the group of words to combine
                words_to_combine = gap.words[start_idx:end_idx]
                corrections.extend(
                    WordOperations.create_word_combine_corrections(
                        original_words=words_to_combine,
                        reference_word=ref_word_original,
                        original_position=gap.transcription_position + start_idx,
                        source=matching_source,
                        confidence=0.8,
                        combine_reason="SyllablesMatchHandler: Words combined based on syllable match",
                        delete_reason="SyllablesMatchHandler: Word removed as part of syllable match combination",
                        reference_positions=reference_positions,
                    )
                )

        elif len(gap.words) < len(reference_words):
            # Single transcribed word -> multiple reference words
            words_per_gap = len(reference_words) / len(gap.words)

            for i, orig_word in enumerate(gap.words):
                start_idx = int(i * words_per_gap)
                end_idx = int((i + 1) * words_per_gap)
                ref_words_original_for_orig = reference_words_original[start_idx:end_idx]

                corrections.extend(
                    WordOperations.create_word_split_corrections(
                        original_word=orig_word,
                        reference_words=ref_words_original_for_orig,
                        original_position=gap.transcription_position + i,
                        source=matching_source,
                        confidence=0.8,
                        reason="SyllablesMatchHandler: Split word based on syllable match",
                        reference_positions=reference_positions,
                    )
                )

        else:
            # One-to-one replacement
            for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
                if orig_word.lower() != ref_word.lower():
                    corrections.append(
                        WordOperations.create_word_replacement_correction(
                            original_word=orig_word,
                            corrected_word=ref_word_original,
                            original_position=gap.transcription_position + i,
                            source=matching_source,
                            confidence=0.8,
                            reason=f"SyllablesMatchHandler: Source '{matching_source}' had matching syllable count",
                            reference_positions=reference_positions,
                        )
                    )

        return corrections
|
@@ -0,0 +1,54 @@
|
|
1
|
+
from typing import List, Tuple, Dict, Any, Optional
|
2
|
+
|
3
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
4
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
5
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
6
|
+
|
7
|
+
|
8
|
+
class WordCountMatchHandler(GapCorrectionHandler):
    """Handles gaps where reference sources agree and have matching word counts."""

    def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
        """Eligible only when every source has exactly gap.length words and all sources agree."""
        if not gap.reference_words:
            # Nothing to compare against
            return False, {}

        word_lists = list(gap.reference_words.values())

        # Every source must supply the same number of words as the gap contains
        if any(len(lst) != gap.length for lst in word_lists):
            return False, {}

        # With several sources, all of them must produce identical word lists
        first = word_lists[0]
        if any(lst != first for lst in word_lists[1:]):
            return False, {}

        return True, {}

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Replace each differing gap word with the agreed-upon reference word."""
        # Clean and originally-formatted words are taken from the first source;
        # can_handle() guarantees all sources agree, so any one of them would do.
        first_source = next(iter(gap.reference_words))
        clean_words = gap.reference_words[first_source]
        original_words = gap.reference_words_original[first_source]
        sources = ", ".join(gap.reference_words.keys())

        # Centralized reference-position calculation across all sources
        reference_positions = WordOperations.calculate_reference_positions(gap)

        corrections: List[WordCorrection] = []
        for offset, (gap_word, clean_word, original_word) in enumerate(zip(gap.words, clean_words, original_words)):
            if gap_word.lower() == clean_word.lower():
                # Already matches the reference; nothing to replace
                continue
            corrections.append(
                WordOperations.create_word_replacement_correction(
                    original_word=gap_word,
                    corrected_word=original_word,
                    original_position=gap.transcription_position + offset,
                    source=sources,
                    confidence=1.0,
                    reason="WordCountMatchHandler: Reference sources had same word count as gap",
                    reference_positions=reference_positions,
                )
            )

        return corrections
|