lyrics-transcriber 0.41.0__py3-none-any.whl → 0.42.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/core/controller.py +30 -52
- lyrics_transcriber/correction/anchor_sequence.py +325 -150
- lyrics_transcriber/correction/corrector.py +224 -107
- lyrics_transcriber/correction/handlers/base.py +28 -10
- lyrics_transcriber/correction/handlers/extend_anchor.py +47 -24
- lyrics_transcriber/correction/handlers/levenshtein.py +75 -33
- lyrics_transcriber/correction/handlers/llm.py +290 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +81 -36
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +46 -26
- lyrics_transcriber/correction/handlers/repeat.py +28 -11
- lyrics_transcriber/correction/handlers/sound_alike.py +68 -32
- lyrics_transcriber/correction/handlers/syllables_match.py +80 -30
- lyrics_transcriber/correction/handlers/word_count_match.py +36 -19
- lyrics_transcriber/correction/handlers/word_operations.py +68 -22
- lyrics_transcriber/correction/text_utils.py +3 -7
- lyrics_transcriber/frontend/.yarn/install-state.gz +0 -0
- lyrics_transcriber/frontend/.yarn/releases/yarn-4.6.0.cjs +934 -0
- lyrics_transcriber/frontend/.yarnrc.yml +3 -0
- lyrics_transcriber/frontend/dist/assets/{index-DKnNJHRK.js → index-coH8y7gV.js} +16284 -9032
- lyrics_transcriber/frontend/dist/assets/index-coH8y7gV.js.map +1 -0
- lyrics_transcriber/frontend/dist/index.html +1 -1
- lyrics_transcriber/frontend/package.json +6 -2
- lyrics_transcriber/frontend/src/App.tsx +18 -2
- lyrics_transcriber/frontend/src/api.ts +103 -6
- lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +7 -6
- lyrics_transcriber/frontend/src/components/DetailsModal.tsx +86 -59
- lyrics_transcriber/frontend/src/components/EditModal.tsx +93 -43
- lyrics_transcriber/frontend/src/components/FileUpload.tsx +2 -2
- lyrics_transcriber/frontend/src/components/Header.tsx +251 -0
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +303 -265
- lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +117 -0
- lyrics_transcriber/frontend/src/components/ReferenceView.tsx +125 -40
- lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +129 -115
- lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +59 -78
- lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +40 -16
- lyrics_transcriber/frontend/src/components/WordEditControls.tsx +4 -10
- lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +137 -68
- lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +1 -1
- lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +85 -115
- lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
- lyrics_transcriber/frontend/src/components/shared/types.ts +15 -7
- lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +35 -0
- lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
- lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +7 -7
- lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +121 -0
- lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
- lyrics_transcriber/frontend/src/types.js +2 -0
- lyrics_transcriber/frontend/src/types.ts +70 -49
- lyrics_transcriber/frontend/src/validation.ts +132 -0
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
- lyrics_transcriber/frontend/yarn.lock +3752 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +75 -12
- lyrics_transcriber/lyrics/file_provider.py +6 -5
- lyrics_transcriber/lyrics/genius.py +5 -2
- lyrics_transcriber/lyrics/spotify.py +58 -21
- lyrics_transcriber/output/ass/config.py +16 -5
- lyrics_transcriber/output/cdg.py +1 -1
- lyrics_transcriber/output/generator.py +22 -8
- lyrics_transcriber/output/plain_text.py +15 -10
- lyrics_transcriber/output/segment_resizer.py +16 -3
- lyrics_transcriber/output/subtitles.py +27 -1
- lyrics_transcriber/output/video.py +107 -1
- lyrics_transcriber/review/__init__.py +0 -1
- lyrics_transcriber/review/server.py +337 -164
- lyrics_transcriber/transcribers/audioshake.py +3 -0
- lyrics_transcriber/transcribers/base_transcriber.py +11 -3
- lyrics_transcriber/transcribers/whisper.py +11 -1
- lyrics_transcriber/types.py +151 -105
- lyrics_transcriber/utils/word_utils.py +27 -0
- {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/METADATA +3 -1
- {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/RECORD +74 -61
- {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/WHEEL +1 -1
- lyrics_transcriber/frontend/dist/assets/index-DKnNJHRK.js.map +0 -1
- lyrics_transcriber/frontend/package-lock.json +0 -4260
- lyrics_transcriber/frontend/src/components/shared/utils/initializeDataWithIds.tsx +0 -202
- {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/entry_points.txt +0 -0
@@ -38,23 +38,38 @@ class LevenshteinHandler(GapCorrectionHandler):
|
|
38
38
|
self.similarity_threshold = similarity_threshold
|
39
39
|
self.logger = logger or logging.getLogger(__name__)
|
40
40
|
|
41
|
-
def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
|
41
|
+
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
|
42
42
|
"""Check if we can handle this gap - we'll try if there are reference words."""
|
43
|
-
if not
|
43
|
+
if not data or "word_map" not in data:
|
44
|
+
self.logger.error("No word_map provided in data")
|
45
|
+
return False, {}
|
46
|
+
|
47
|
+
word_map = data["word_map"]
|
48
|
+
|
49
|
+
if not gap.reference_word_ids:
|
44
50
|
self.logger.debug("No reference words available")
|
45
51
|
return False, {}
|
46
52
|
|
47
|
-
if not gap.
|
53
|
+
if not gap.transcribed_word_ids:
|
48
54
|
self.logger.debug("No gap words available")
|
49
55
|
return False, {}
|
50
56
|
|
51
57
|
# Check if any word has sufficient similarity to reference
|
52
|
-
for i,
|
53
|
-
|
54
|
-
|
55
|
-
|
58
|
+
for i, word_id in enumerate(gap.transcribed_word_ids):
|
59
|
+
if word_id not in word_map:
|
60
|
+
continue
|
61
|
+
word = word_map[word_id]
|
62
|
+
|
63
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
64
|
+
if i < len(ref_word_ids):
|
65
|
+
ref_word_id = ref_word_ids[i]
|
66
|
+
if ref_word_id not in word_map:
|
67
|
+
continue
|
68
|
+
ref_word = word_map[ref_word_id]
|
69
|
+
|
70
|
+
similarity = self._get_string_similarity(word.text, ref_word.text)
|
56
71
|
if similarity >= self.similarity_threshold:
|
57
|
-
self.logger.debug(f"Found similar word: '{word}' -> '{
|
72
|
+
self.logger.debug(f"Found similar word: '{word.text}' -> '{ref_word.text}' ({similarity:.2f})")
|
58
73
|
return True, {}
|
59
74
|
|
60
75
|
self.logger.debug("No words meet similarity threshold")
|
@@ -62,58 +77,85 @@ class LevenshteinHandler(GapCorrectionHandler):
|
|
62
77
|
|
63
78
|
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
|
64
79
|
"""Try to correct words based on string similarity."""
|
80
|
+
if not data or "word_map" not in data:
|
81
|
+
self.logger.error("No word_map provided in data")
|
82
|
+
return []
|
83
|
+
|
84
|
+
word_map = data["word_map"]
|
65
85
|
corrections = []
|
66
86
|
|
67
87
|
# Process each word in the gap
|
68
|
-
for i,
|
88
|
+
for i, word_id in enumerate(gap.transcribed_word_ids):
|
89
|
+
if word_id not in word_map:
|
90
|
+
continue
|
91
|
+
word = word_map[word_id]
|
92
|
+
|
69
93
|
# Skip if word is empty or just punctuation
|
70
|
-
if not word.strip():
|
94
|
+
if not word.text.strip():
|
71
95
|
continue
|
72
96
|
|
73
97
|
# Skip exact matches
|
74
|
-
|
75
|
-
|
98
|
+
exact_match = False
|
99
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
100
|
+
if i < len(ref_word_ids):
|
101
|
+
ref_word_id = ref_word_ids[i]
|
102
|
+
if ref_word_id in word_map:
|
103
|
+
ref_word = word_map[ref_word_id]
|
104
|
+
if word.text.lower() == ref_word.text.lower():
|
105
|
+
exact_match = True
|
106
|
+
break
|
107
|
+
if exact_match:
|
76
108
|
continue
|
77
109
|
|
78
110
|
# Find matching reference words at this position
|
79
|
-
matches = {} # word -> (sources, similarity)
|
80
|
-
|
81
|
-
|
82
|
-
if i >= len(
|
111
|
+
matches: Dict[str, Tuple[List[str], float, str]] = {} # word -> (sources, similarity, word_id)
|
112
|
+
|
113
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
114
|
+
if i >= len(ref_word_ids):
|
83
115
|
continue
|
84
116
|
|
85
|
-
|
86
|
-
|
87
|
-
|
117
|
+
ref_word_id = ref_word_ids[i]
|
118
|
+
if ref_word_id not in word_map:
|
119
|
+
continue
|
120
|
+
ref_word = word_map[ref_word_id]
|
121
|
+
|
122
|
+
similarity = self._get_string_similarity(word.text, ref_word.text)
|
88
123
|
|
89
124
|
if similarity >= self.similarity_threshold:
|
90
|
-
self.logger.debug(f"Found match: '{word}' -> '{ref_word}' ({similarity:.2f})")
|
91
|
-
if
|
92
|
-
matches[
|
93
|
-
matches[
|
125
|
+
self.logger.debug(f"Found match: '{word.text}' -> '{ref_word.text}' ({similarity:.2f})")
|
126
|
+
if ref_word.text not in matches:
|
127
|
+
matches[ref_word.text] = ([], similarity, ref_word_id)
|
128
|
+
matches[ref_word.text][0].append(source)
|
94
129
|
|
95
130
|
# Create correction for best match if any found
|
96
131
|
if matches:
|
97
|
-
best_match, (sources, similarity) = max(
|
132
|
+
best_match, (sources, similarity, ref_word_id) = max(
|
98
133
|
matches.items(), key=lambda x: (len(x[1][0]), x[1][1]) # Sort by number of sources, then similarity
|
99
134
|
)
|
100
135
|
|
101
|
-
source_confidence = len(sources) / len(gap.
|
136
|
+
source_confidence = len(sources) / len(gap.reference_word_ids)
|
102
137
|
final_confidence = similarity * source_confidence
|
103
138
|
|
104
|
-
# Calculate reference positions
|
105
|
-
reference_positions = WordOperations.calculate_reference_positions(gap,
|
139
|
+
# Calculate reference positions
|
140
|
+
reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
|
106
141
|
|
107
|
-
self.logger.debug(f"Creating correction: {word} -> {best_match} (confidence: {final_confidence})")
|
142
|
+
self.logger.debug(f"Creating correction: {word.text} -> {best_match} (confidence: {final_confidence})")
|
108
143
|
corrections.append(
|
109
|
-
|
110
|
-
original_word=word,
|
111
|
-
corrected_word=best_match,
|
144
|
+
WordCorrection(
|
145
|
+
original_word=word.text,
|
146
|
+
corrected_word=best_match,
|
147
|
+
segment_index=0,
|
112
148
|
original_position=gap.transcription_position + i,
|
113
|
-
source=", ".join(sources),
|
114
149
|
confidence=final_confidence,
|
115
|
-
|
150
|
+
source=", ".join(sources),
|
151
|
+
reason=f"String similarity ({final_confidence:.2f})",
|
152
|
+
alternatives={k: len(v[0]) for k, v in matches.items()},
|
153
|
+
is_deletion=False,
|
116
154
|
reference_positions=reference_positions,
|
155
|
+
length=1,
|
156
|
+
handler="LevenshteinHandler",
|
157
|
+
word_id=word_id,
|
158
|
+
corrected_word_id=ref_word_id,
|
117
159
|
)
|
118
160
|
)
|
119
161
|
|
@@ -0,0 +1,290 @@
|
|
1
|
+
from typing import List, Optional, Tuple, Dict, Any, Union
|
2
|
+
import logging
|
3
|
+
import json
|
4
|
+
from ollama import chat
|
5
|
+
from datetime import datetime
|
6
|
+
from pathlib import Path
|
7
|
+
|
8
|
+
from lyrics_transcriber.types import GapSequence, WordCorrection
|
9
|
+
from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
|
10
|
+
from lyrics_transcriber.correction.handlers.word_operations import WordOperations
|
11
|
+
|
12
|
+
|
13
|
+
class LLMHandler(GapCorrectionHandler):
    """Uses an LLM to analyze and correct gaps by comparing with reference lyrics.

    The handler renders a prompt listing the gap's transcribed words and the
    reference words (each with its ID), sends it to a local Ollama model and
    converts the JSON answer into ``WordCorrection`` objects.
    """

    def __init__(self, logger: Optional[logging.Logger] = None, cache_dir: Optional[Union[str, Path]] = None):
        """Create the handler.

        Args:
            logger: Logger to use; falls back to this module's logger.
            cache_dir: Directory under which ``llm_debug/`` files are written.
                When omitted, prompt/response debug output is skipped.
        """
        super().__init__(logger)
        self.logger = logger or logging.getLogger(__name__)
        self.model = "deepseek-r1:7b"
        self.cache_dir = Path(cache_dir) if cache_dir else None

    @staticmethod
    def _format_word_lines(word_ids: List[Any], word_map: Dict[Any, Any]) -> str:
        """Render one ``- ID: ..., Text: '...'`` prompt line per word ID present in ``word_map``."""
        return "".join(f"- ID: {word_id}, Text: '{word_map[word_id].text}'\n" for word_id in word_ids if word_id in word_map)

    def _format_prompt(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> str:
        """Format the prompt for the LLM with context about the gap and reference lyrics.

        Args:
            gap: The gap whose transcribed and reference word IDs are listed.
            data: Shared handler data; reads ``word_map``, ``metadata`` and
                ``anchor_sequences``.

        Returns:
            The prompt text, or ``""`` when no ``word_map`` is available
            (including when ``data`` is ``None``).
        """
        # Normalise first so a missing ``data`` is reported as "no word_map"
        # instead of raising AttributeError on ``None.get``.
        data = data or {}
        word_map = data.get("word_map", {})
        metadata = data.get("metadata", {})

        if not word_map:
            self.logger.error("No word_map provided in data")
            return ""

        intro = (
            "You are a lyrics correction expert. You will be given transcribed lyrics that may contain errors "
            "and reference lyrics from multiple sources. Your task is to analyze each word in the transcribed text "
            "and suggest specific corrections based on the reference lyrics.\n\n"
            "Each word has a unique ID. When suggesting corrections, you must specify the ID of the word being corrected. "
            "This ensures accuracy in applying your corrections.\n\n"
            "For each correction, specify:\n"
            "1. The word ID being corrected\n"
            "2. The correction type ('replace', 'split', 'combine', or 'delete')\n"
            "3. The corrected text\n"
            "4. Your confidence level\n"
            "5. The reason for the correction\n\n"
        )

        response_format = (
            "\nProvide corrections in the following JSON format:\n"
            "{\n"
            ' "corrections": [\n'
            " {\n"
            ' "word_id": "id_of_word_to_correct",\n'
            ' "type": "replace|split|combine|delete",\n'
            ' "corrected_text": "new text",\n'
            ' "reference_word_id": "id_from_reference_lyrics", // Optional, use when matching a specific reference word\n'
            ' "confidence": 0.9,\n'
            ' "reason": "explanation of correction"\n'
            " }\n"
            " ]\n"
            "}\n\n"
            "Important rules:\n"
            "1. Always include the word_id for each correction\n"
            "2. For 'split' type, corrected_text should contain the space-separated words\n"
            "3. For 'combine' type, word_id should be the first word to combine\n"
            "4. Include reference_word_id when the correction matches a specific reference word\n"
            "5. Only suggest corrections when you're confident they improve the lyrics\n"
            "6. Preserve any existing words that match the reference lyrics\n"
            "7. Respond ONLY with the JSON object, no other text"
        )

        parts = [intro]

        # Add song context if available
        if metadata and metadata.get("artist") and metadata.get("title"):
            parts.append(f"Song: {metadata['title']}\nArtist: {metadata['artist']}\n\n")

        # Transcribed words with their IDs
        parts.append("Transcribed words:\n")
        parts.append(self._format_word_lines(gap.transcribed_word_ids, word_map))

        parts.append("\nReference lyrics from different sources:\n")

        # Each reference source: the words inside the gap, then the full lyrics if supplied
        for source, word_ids in gap.reference_word_ids.items():
            parts.append(f"\n{source} immediate context:\n")
            parts.append(self._format_word_lines(word_ids, word_map))

            if metadata and metadata.get("full_reference_texts", {}).get(source):
                parts.append(f"\nFull {source} lyrics:\n{metadata['full_reference_texts'][source]}\n")

        # Words of the anchor immediately before the gap, when it can be found
        if gap.preceding_anchor_id:
            preceding_anchor = next(
                (a.anchor for a in data.get("anchor_sequences", []) if a.anchor.id == gap.preceding_anchor_id),
                None,
            )
            if preceding_anchor:
                parts.append("\nPreceding correct words:\n")
                parts.append(self._format_word_lines(preceding_anchor.transcribed_word_ids, word_map))

        parts.append(response_format)
        return "".join(parts)

    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
        """LLM handler can attempt to handle any gap with reference words.

        Returns:
            ``(True, {})`` when the gap has reference word IDs, else ``(False, {})``.
        """
        if not gap.reference_word_ids:
            self.logger.debug("No reference words available")
            return False, {}

        return True, {}

    def _write_debug_info(self, prompt: str, response: str, gap_index: int, audio_file_hash: Optional[str] = None) -> None:
        """Write prompt and response to a debug file under ``<cache_dir>/llm_debug``.

        This is best-effort: any filesystem error is logged and swallowed so
        that a failed debug write never affects the corrections themselves.
        """
        if not self.cache_dir:
            self.logger.warning("No cache directory provided, skipping LLM debug output")
            return

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        debug_dir = self.cache_dir / "llm_debug"
        hash_prefix = f"{audio_file_hash}_" if audio_file_hash else ""
        filename = debug_dir / f"llm_debug_{hash_prefix}{gap_index}_{timestamp}.txt"

        debug_content = f"=== LLM PROMPT ===\n{prompt}\n\n=== LLM RESPONSE ===\n{response}\n"

        try:
            # Directory creation is inside the try: it can fail just like the write.
            debug_dir.mkdir(exist_ok=True, parents=True)
            with open(filename, "w", encoding="utf-8") as f:
                f.write(debug_content)
        except IOError as e:
            self.logger.error(f"Failed to write LLM debug file: {e}")

    def _build_corrections(
        self,
        correction: Any,
        gap: GapSequence,
        word_map: Dict[Any, Any],
        reference_positions: Dict[str, Any],
    ) -> List[WordCorrection]:
        """Convert one LLM suggestion into zero or more ``WordCorrection`` objects.

        Malformed or inapplicable suggestions are logged and yield an empty
        list, so one bad entry does not discard the other suggestions.
        """
        if not isinstance(correction, dict):
            self.logger.error(f"Skipping malformed LLM correction (expected an object): {correction!r}")
            return []

        # Validate word_id exists in gap
        word_id = correction.get("word_id")
        if word_id not in gap.transcribed_word_ids:
            self.logger.error(f"LLM suggested correction for word_id {word_id} which is not in the gap")
            return []

        # Get original word from word map
        try:
            original_word = word_map[word_id]
        except KeyError:
            self.logger.error(f"LLM suggested correction for word_id {word_id} which is not in the word_map")
            return []

        word_index = gap.transcribed_word_ids.index(word_id)
        position = gap.transcription_position + word_index

        self.logger.debug(f"Processing correction: {correction}")

        correction_type = correction.get("type")
        if correction_type not in ("replace", "split", "combine", "delete"):
            self.logger.warning(f"Ignoring LLM correction with unsupported type: {correction_type!r}")
            return []

        # A deletion has no replacement text; every other type needs it.
        required_keys = ["confidence", "reason"]
        if correction_type != "delete":
            required_keys.append("corrected_text")
        missing_keys = [key for key in required_keys if key not in correction]
        if missing_keys:
            self.logger.error(f"Skipping LLM correction for word_id {word_id}: missing fields {missing_keys}")
            return []

        corrected_text = correction.get("corrected_text")
        if correction_type in ("split", "combine") and not isinstance(corrected_text, str):
            self.logger.error(f"Skipping LLM correction for word_id {word_id}: corrected_text is not a string")
            return []

        if correction_type == "replace":
            self.logger.debug(f"Creating replacement: '{original_word.text}' -> '{corrected_text}' at position {position}")
            return [
                WordOperations.create_word_replacement_correction(
                    original_word=original_word.text,
                    corrected_word=corrected_text,
                    original_position=position,
                    source="LLM",
                    confidence=correction["confidence"],
                    reason=correction["reason"],
                    handler="LLMHandler",
                    reference_positions=reference_positions,
                    original_word_id=word_id,
                    corrected_word_id=correction.get("reference_word_id"),
                )
            ]

        if correction_type == "split":
            split_words = corrected_text.split()
            self.logger.debug(f"Creating split: '{original_word.text}' -> {split_words} at position {position}")

            # Get reference word IDs if provided
            reference_word_ids = correction.get("reference_word_ids", [None] * len(split_words))

            return list(
                WordOperations.create_word_split_corrections(
                    original_word=original_word.text,
                    reference_words=split_words,
                    original_position=position,
                    source="LLM",
                    confidence=correction["confidence"],
                    reason=correction["reason"],
                    handler="LLMHandler",
                    reference_positions=reference_positions,
                    original_word_id=word_id,
                    corrected_word_ids=reference_word_ids,
                )
            )

        if correction_type == "combine":
            # NOTE(review): the number of words to merge is taken from the word
            # count of corrected_text, which is 1 for a single combined word.
            # The prompt gives the model no way to state the count - confirm
            # this is the intended contract.
            words_needed = len(corrected_text.split())

            if word_index + words_needed > len(gap.transcribed_word_ids):
                self.logger.error(f"Not enough words available to combine at position {position}")
                return []

            word_ids_to_combine = gap.transcribed_word_ids[word_index : word_index + words_needed]
            unknown_ids = [combine_id for combine_id in word_ids_to_combine if combine_id not in word_map]
            if unknown_ids:
                self.logger.error(f"Cannot combine at position {position}: word IDs {unknown_ids} not found in word_map")
                return []

            words_to_combine = [word_map[combine_id].text for combine_id in word_ids_to_combine]

            self.logger.debug(f"Creating combine: {words_to_combine} -> '{corrected_text}' at position {position}")

            return list(
                WordOperations.create_word_combine_corrections(
                    original_words=words_to_combine,
                    reference_word=corrected_text,
                    original_position=position,
                    source="LLM",
                    confidence=correction["confidence"],
                    combine_reason=correction["reason"],
                    delete_reason=f"Part of combining words: {correction['reason']}",
                    handler="LLMHandler",
                    reference_positions=reference_positions,
                    original_word_ids=word_ids_to_combine,
                    corrected_word_id=correction.get("reference_word_id"),
                )
            )

        # correction_type == "delete"
        self.logger.debug(f"Creating deletion: '{original_word.text}' at position {position}")
        return [
            WordCorrection(
                original_word=original_word.text,
                corrected_word="",
                segment_index=0,
                original_position=position,
                confidence=correction["confidence"],
                source="LLM",
                reason=correction["reason"],
                alternatives={},
                is_deletion=True,
                handler="LLMHandler",
                reference_positions=reference_positions,
                word_id=word_id,
                corrected_word_id=None,  # Deleted words don't need a corrected ID
            )
        ]

    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
        """Process the gap using the LLM and create corrections based on its response.

        Args:
            gap: The gap to correct.
            data: Shared handler data; must contain ``word_map``. Also reads
                ``anchor_sequences``, ``metadata`` and ``audio_file_hash``.

        Returns:
            The corrections derived from the model's answer. An empty list is
            returned when ``word_map`` is missing, the prompt cannot be built,
            the response is not valid JSON, no corrections are suggested, or an
            unexpected error occurs (this method does not raise).
        """
        if not data or "word_map" not in data:
            self.logger.error("No word_map provided in data")
            return []

        word_map = data["word_map"]
        transcribed_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids if word_id in word_map]

        # Calculate reference positions using the centralized method
        reference_positions = (
            WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", [])) or {}
        )  # Ensure empty dict if None

        prompt = self._format_prompt(gap, data)
        if not prompt:
            return []

        # The gap's transcription position doubles as its index in debug file names
        gap_index = gap.transcription_position

        try:
            self.logger.debug(f"Processing gap words: {transcribed_words}")
            self.logger.debug(f"Reference word IDs: {gap.reference_word_ids}")

            response = chat(model=self.model, messages=[{"role": "user", "content": prompt}], format="json")
            content = response.message.content

            # Write debug info to files
            self._write_debug_info(prompt, content, gap_index, audio_file_hash=data.get("audio_file_hash"))

            try:
                corrections_data = json.loads(content)
            except json.JSONDecodeError as e:
                self.logger.error(f"Failed to parse LLM response as JSON: {e}")
                self.logger.error(f"Raw response content: {content}")
                return []

            # Valid JSON need not be an object; anything else carries no corrections.
            suggestions = corrections_data.get("corrections") if isinstance(corrections_data, dict) else None
            if not suggestions:
                self.logger.debug("No corrections suggested by LLM")
                return []
            if not isinstance(suggestions, list):
                self.logger.error(f"LLM returned 'corrections' in an unexpected format: {suggestions!r}")
                return []

            corrections: List[WordCorrection] = []
            for correction in suggestions:
                corrections.extend(self._build_corrections(correction, gap, word_map, reference_positions))

            self.logger.debug(f"Created {len(corrections)} corrections: {[f'{c.original_word}->{c.corrected_word}' for c in corrections]}")
            return corrections

        except Exception as e:
            # Top-level boundary: the model call or response shape may fail in
            # ways not anticipated above; treat the gap as unhandled.
            self.logger.error(f"Unexpected error in LLM handler: {e}")
            return []
|
@@ -20,88 +20,133 @@ class NoSpacePunctuationMatchHandler(GapCorrectionHandler):
|
|
20
20
|
# Remove all punctuation including apostrophes
|
21
21
|
return re.sub(r"[^\w\s]", "", text)
|
22
22
|
|
23
|
-
def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
|
23
|
+
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
|
24
24
|
# Must have reference words
|
25
|
-
if not gap.
|
26
|
-
self.logger.debug("No reference
|
25
|
+
if not gap.reference_word_ids:
|
26
|
+
self.logger.debug("No reference word IDs available.")
|
27
27
|
return False, {}
|
28
28
|
|
29
|
+
# Get word lookup map from data
|
30
|
+
if not data or "word_map" not in data:
|
31
|
+
self.logger.error("No word_map provided in data")
|
32
|
+
return False, {}
|
33
|
+
|
34
|
+
word_map = data["word_map"]
|
35
|
+
|
36
|
+
# Get the actual words from word IDs
|
37
|
+
gap_words = []
|
38
|
+
for word_id in gap.transcribed_word_ids:
|
39
|
+
if word_id not in word_map:
|
40
|
+
self.logger.error(f"Word ID {word_id} not found in word_map")
|
41
|
+
return False, {}
|
42
|
+
gap_words.append(word_map[word_id].text)
|
43
|
+
|
29
44
|
# Get the gap text without spaces and punctuation
|
30
|
-
gap_text = self._remove_spaces_and_punct(
|
45
|
+
gap_text = self._remove_spaces_and_punct(gap_words)
|
31
46
|
|
32
47
|
# Check if any reference source matches when spaces and punctuation are removed
|
33
|
-
for
|
34
|
-
|
48
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
49
|
+
ref_words = []
|
50
|
+
for word_id in ref_word_ids:
|
51
|
+
if word_id not in word_map:
|
52
|
+
self.logger.error(f"Reference word ID {word_id} not found in word_map")
|
53
|
+
continue
|
54
|
+
ref_words.append(word_map[word_id].text)
|
55
|
+
|
56
|
+
if not ref_words:
|
57
|
+
continue
|
58
|
+
|
59
|
+
ref_text = self._remove_spaces_and_punct(ref_words)
|
35
60
|
if gap_text == ref_text:
|
36
61
|
self.logger.debug("Found a matching reference source with spaces and punctuation removed.")
|
37
|
-
return True, {
|
62
|
+
return True, {
|
63
|
+
"matching_source": source,
|
64
|
+
"reference_word_ids": ref_word_ids,
|
65
|
+
"word_map": word_map,
|
66
|
+
}
|
38
67
|
|
39
68
|
self.logger.debug("No matching reference source found with spaces and punctuation removed.")
|
40
69
|
return False, {}
|
41
70
|
|
42
71
|
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
|
43
|
-
|
72
|
+
"""Handle the gap using no-space punctuation matching."""
|
73
|
+
if not data:
|
74
|
+
can_handle, data = self.can_handle(gap)
|
75
|
+
if not can_handle:
|
76
|
+
return []
|
44
77
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
reference_words_original = None
|
50
|
-
for source, words in gap.reference_words.items():
|
51
|
-
if self._remove_spaces_and_punct(words) == gap_text:
|
52
|
-
matching_source = source
|
53
|
-
reference_words = words
|
54
|
-
reference_words_original = gap.reference_words_original[source]
|
55
|
-
self.logger.debug(f"Using source '{source}' for corrections.")
|
56
|
-
break
|
78
|
+
corrections = []
|
79
|
+
matching_source = data["matching_source"]
|
80
|
+
reference_word_ids = data["reference_word_ids"]
|
81
|
+
word_map = data["word_map"]
|
57
82
|
|
58
83
|
# Calculate reference positions for the matching source
|
59
84
|
reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])
|
60
85
|
|
61
86
|
# Handle cases where number of words differ
|
62
|
-
if len(gap.
|
87
|
+
if len(gap.transcribed_word_ids) > len(reference_word_ids):
|
63
88
|
# Multiple transcribed words -> fewer reference words
|
89
|
+
# Get the actual words from word IDs
|
90
|
+
gap_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids]
|
91
|
+
ref_word = word_map[reference_word_ids[0]].text
|
92
|
+
|
64
93
|
corrections.extend(
|
65
94
|
WordOperations.create_word_combine_corrections(
|
66
|
-
original_words=
|
67
|
-
reference_word=
|
95
|
+
original_words=gap_words,
|
96
|
+
reference_word=ref_word,
|
68
97
|
original_position=gap.transcription_position,
|
69
98
|
source=matching_source,
|
70
99
|
confidence=1.0,
|
71
|
-
combine_reason="
|
72
|
-
delete_reason="
|
100
|
+
combine_reason="Words combined based on text match",
|
101
|
+
delete_reason="Word removed as part of text match combination",
|
73
102
|
reference_positions=reference_positions,
|
103
|
+
handler="NoSpacePunctuationMatchHandler",
|
104
|
+
original_word_ids=gap.transcribed_word_ids,
|
105
|
+
corrected_word_id=reference_word_ids[0], # Use the reference word's ID
|
74
106
|
)
|
75
107
|
)
|
76
|
-
self.logger.debug(f"Combined words into '{
|
108
|
+
self.logger.debug(f"Combined words into '{ref_word}'.")
|
77
109
|
|
78
|
-
elif len(gap.
|
110
|
+
elif len(gap.transcribed_word_ids) < len(reference_word_ids):
|
79
111
|
# Single transcribed word -> multiple reference words
|
112
|
+
# Get the actual words
|
113
|
+
gap_word = word_map[gap.transcribed_word_ids[0]].text
|
114
|
+
ref_words = [word_map[word_id].text for word_id in reference_word_ids]
|
115
|
+
|
80
116
|
corrections.extend(
|
81
117
|
WordOperations.create_word_split_corrections(
|
82
|
-
original_word=
|
83
|
-
reference_words=
|
118
|
+
original_word=gap_word,
|
119
|
+
reference_words=ref_words,
|
84
120
|
original_position=gap.transcription_position,
|
85
121
|
source=matching_source,
|
86
122
|
confidence=1.0,
|
87
|
-
reason="
|
123
|
+
reason="Split word based on text match",
|
88
124
|
reference_positions=reference_positions,
|
125
|
+
handler="NoSpacePunctuationMatchHandler",
|
126
|
+
original_word_id=gap.transcribed_word_ids[0],
|
127
|
+
corrected_word_ids=reference_word_ids, # Use the reference word IDs
|
89
128
|
)
|
90
129
|
)
|
91
|
-
self.logger.debug(f"Split word '{
|
130
|
+
self.logger.debug(f"Split word '{gap_word}' into {ref_words}.")
|
92
131
|
|
93
132
|
else:
|
94
133
|
# One-to-one replacement
|
95
|
-
for i, (
|
96
|
-
|
134
|
+
for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
|
135
|
+
orig_word = word_map[orig_word_id]
|
136
|
+
ref_word = word_map[ref_word_id]
|
137
|
+
|
138
|
+
if orig_word.text.lower() != ref_word.text.lower():
|
97
139
|
correction = WordOperations.create_word_replacement_correction(
|
98
|
-
original_word=orig_word,
|
99
|
-
corrected_word=
|
140
|
+
original_word=orig_word.text,
|
141
|
+
corrected_word=ref_word.text,
|
100
142
|
original_position=gap.transcription_position + i,
|
101
143
|
source=matching_source,
|
102
144
|
confidence=1.0,
|
103
|
-
reason=f"
|
145
|
+
reason=f"Source '{matching_source}' matched when spaces and punctuation removed",
|
104
146
|
reference_positions=reference_positions,
|
147
|
+
handler="NoSpacePunctuationMatchHandler",
|
148
|
+
original_word_id=orig_word_id,
|
149
|
+
corrected_word_id=ref_word_id,
|
105
150
|
)
|
106
151
|
corrections.append(correction)
|
107
152
|
self.logger.debug(f"Correction made: {correction}")
|