lyrics-transcriber 0.41.0__py3-none-any.whl → 0.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78):
  1. lyrics_transcriber/core/controller.py +30 -52
  2. lyrics_transcriber/correction/anchor_sequence.py +325 -150
  3. lyrics_transcriber/correction/corrector.py +224 -107
  4. lyrics_transcriber/correction/handlers/base.py +28 -10
  5. lyrics_transcriber/correction/handlers/extend_anchor.py +47 -24
  6. lyrics_transcriber/correction/handlers/levenshtein.py +75 -33
  7. lyrics_transcriber/correction/handlers/llm.py +290 -0
  8. lyrics_transcriber/correction/handlers/no_space_punct_match.py +81 -36
  9. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +46 -26
  10. lyrics_transcriber/correction/handlers/repeat.py +28 -11
  11. lyrics_transcriber/correction/handlers/sound_alike.py +68 -32
  12. lyrics_transcriber/correction/handlers/syllables_match.py +80 -30
  13. lyrics_transcriber/correction/handlers/word_count_match.py +36 -19
  14. lyrics_transcriber/correction/handlers/word_operations.py +68 -22
  15. lyrics_transcriber/correction/text_utils.py +3 -7
  16. lyrics_transcriber/frontend/.yarn/install-state.gz +0 -0
  17. lyrics_transcriber/frontend/.yarn/releases/yarn-4.6.0.cjs +934 -0
  18. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  19. lyrics_transcriber/frontend/dist/assets/{index-DKnNJHRK.js → index-D0Gr3Ep7.js} +16509 -9038
  20. lyrics_transcriber/frontend/dist/assets/index-D0Gr3Ep7.js.map +1 -0
  21. lyrics_transcriber/frontend/dist/index.html +1 -1
  22. lyrics_transcriber/frontend/package.json +6 -2
  23. lyrics_transcriber/frontend/src/App.tsx +18 -2
  24. lyrics_transcriber/frontend/src/api.ts +103 -6
  25. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +14 -6
  26. lyrics_transcriber/frontend/src/components/DetailsModal.tsx +86 -59
  27. lyrics_transcriber/frontend/src/components/EditModal.tsx +281 -63
  28. lyrics_transcriber/frontend/src/components/FileUpload.tsx +2 -2
  29. lyrics_transcriber/frontend/src/components/Header.tsx +249 -0
  30. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +320 -266
  31. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +120 -0
  32. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +174 -52
  33. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +158 -114
  34. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +59 -78
  35. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +39 -16
  36. lyrics_transcriber/frontend/src/components/WordEditControls.tsx +4 -10
  37. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +134 -68
  38. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +1 -1
  39. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +85 -115
  40. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  41. lyrics_transcriber/frontend/src/components/shared/types.ts +15 -7
  42. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +67 -0
  43. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  44. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +7 -7
  45. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +121 -0
  46. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  47. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  48. lyrics_transcriber/frontend/src/types.js +2 -0
  49. lyrics_transcriber/frontend/src/types.ts +70 -49
  50. lyrics_transcriber/frontend/src/validation.ts +132 -0
  51. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  52. lyrics_transcriber/frontend/yarn.lock +3752 -0
  53. lyrics_transcriber/lyrics/base_lyrics_provider.py +75 -12
  54. lyrics_transcriber/lyrics/file_provider.py +6 -5
  55. lyrics_transcriber/lyrics/genius.py +5 -2
  56. lyrics_transcriber/lyrics/spotify.py +58 -21
  57. lyrics_transcriber/output/ass/config.py +16 -5
  58. lyrics_transcriber/output/cdg.py +1 -1
  59. lyrics_transcriber/output/generator.py +22 -8
  60. lyrics_transcriber/output/plain_text.py +15 -10
  61. lyrics_transcriber/output/segment_resizer.py +16 -3
  62. lyrics_transcriber/output/subtitles.py +27 -1
  63. lyrics_transcriber/output/video.py +107 -1
  64. lyrics_transcriber/review/__init__.py +0 -1
  65. lyrics_transcriber/review/server.py +337 -164
  66. lyrics_transcriber/transcribers/audioshake.py +3 -0
  67. lyrics_transcriber/transcribers/base_transcriber.py +11 -3
  68. lyrics_transcriber/transcribers/whisper.py +11 -1
  69. lyrics_transcriber/types.py +151 -105
  70. lyrics_transcriber/utils/word_utils.py +27 -0
  71. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/METADATA +3 -1
  72. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/RECORD +75 -61
  73. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/WHEEL +1 -1
  74. lyrics_transcriber/frontend/dist/assets/index-DKnNJHRK.js.map +0 -1
  75. lyrics_transcriber/frontend/package-lock.json +0 -4260
  76. lyrics_transcriber/frontend/src/components/shared/utils/initializeDataWithIds.tsx +0 -202
  77. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/LICENSE +0 -0
  78. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/entry_points.txt +0 -0
@@ -38,23 +38,38 @@ class LevenshteinHandler(GapCorrectionHandler):
38
38
  self.similarity_threshold = similarity_threshold
39
39
  self.logger = logger or logging.getLogger(__name__)
40
40
 
41
- def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
41
+ def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
42
42
  """Check if we can handle this gap - we'll try if there are reference words."""
43
- if not gap.reference_words:
43
+ if not data or "word_map" not in data:
44
+ self.logger.error("No word_map provided in data")
45
+ return False, {}
46
+
47
+ word_map = data["word_map"]
48
+
49
+ if not gap.reference_word_ids:
44
50
  self.logger.debug("No reference words available")
45
51
  return False, {}
46
52
 
47
- if not gap.words:
53
+ if not gap.transcribed_word_ids:
48
54
  self.logger.debug("No gap words available")
49
55
  return False, {}
50
56
 
51
57
  # Check if any word has sufficient similarity to reference
52
- for i, word in enumerate(gap.words):
53
- for ref_words in gap.reference_words.values():
54
- if i < len(ref_words):
55
- similarity = self._get_string_similarity(word, ref_words[i])
58
+ for i, word_id in enumerate(gap.transcribed_word_ids):
59
+ if word_id not in word_map:
60
+ continue
61
+ word = word_map[word_id]
62
+
63
+ for source, ref_word_ids in gap.reference_word_ids.items():
64
+ if i < len(ref_word_ids):
65
+ ref_word_id = ref_word_ids[i]
66
+ if ref_word_id not in word_map:
67
+ continue
68
+ ref_word = word_map[ref_word_id]
69
+
70
+ similarity = self._get_string_similarity(word.text, ref_word.text)
56
71
  if similarity >= self.similarity_threshold:
57
- self.logger.debug(f"Found similar word: '{word}' -> '{ref_words[i]}' ({similarity:.2f})")
72
+ self.logger.debug(f"Found similar word: '{word.text}' -> '{ref_word.text}' ({similarity:.2f})")
58
73
  return True, {}
59
74
 
60
75
  self.logger.debug("No words meet similarity threshold")
@@ -62,58 +77,85 @@ class LevenshteinHandler(GapCorrectionHandler):
62
77
 
63
78
  def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
64
79
  """Try to correct words based on string similarity."""
80
+ if not data or "word_map" not in data:
81
+ self.logger.error("No word_map provided in data")
82
+ return []
83
+
84
+ word_map = data["word_map"]
65
85
  corrections = []
66
86
 
67
87
  # Process each word in the gap
68
- for i, word in enumerate(gap.words):
88
+ for i, word_id in enumerate(gap.transcribed_word_ids):
89
+ if word_id not in word_map:
90
+ continue
91
+ word = word_map[word_id]
92
+
69
93
  # Skip if word is empty or just punctuation
70
- if not word.strip():
94
+ if not word.text.strip():
71
95
  continue
72
96
 
73
97
  # Skip exact matches
74
- if any(i < len(ref_words) and word.lower() == ref_words[i].lower() for ref_words in gap.reference_words.values()):
75
- self.logger.debug(f"Skipping exact match: '{word}'")
98
+ exact_match = False
99
+ for source, ref_word_ids in gap.reference_word_ids.items():
100
+ if i < len(ref_word_ids):
101
+ ref_word_id = ref_word_ids[i]
102
+ if ref_word_id in word_map:
103
+ ref_word = word_map[ref_word_id]
104
+ if word.text.lower() == ref_word.text.lower():
105
+ exact_match = True
106
+ break
107
+ if exact_match:
76
108
  continue
77
109
 
78
110
  # Find matching reference words at this position
79
- matches = {} # word -> (sources, similarity)
80
- for source, ref_words in gap.reference_words.items():
81
- ref_words_original = gap.reference_words_original[source] # Get original formatted words
82
- if i >= len(ref_words):
111
+ matches: Dict[str, Tuple[List[str], float, str]] = {} # word -> (sources, similarity, word_id)
112
+
113
+ for source, ref_word_ids in gap.reference_word_ids.items():
114
+ if i >= len(ref_word_ids):
83
115
  continue
84
116
 
85
- ref_word = ref_words[i]
86
- ref_word_original = ref_words_original[i] # Get original formatted word
87
- similarity = self._get_string_similarity(word, ref_word)
117
+ ref_word_id = ref_word_ids[i]
118
+ if ref_word_id not in word_map:
119
+ continue
120
+ ref_word = word_map[ref_word_id]
121
+
122
+ similarity = self._get_string_similarity(word.text, ref_word.text)
88
123
 
89
124
  if similarity >= self.similarity_threshold:
90
- self.logger.debug(f"Found match: '{word}' -> '{ref_word}' ({similarity:.2f})")
91
- if ref_word_original not in matches: # Use original formatted word as key
92
- matches[ref_word_original] = ([], similarity)
93
- matches[ref_word_original][0].append(source)
125
+ self.logger.debug(f"Found match: '{word.text}' -> '{ref_word.text}' ({similarity:.2f})")
126
+ if ref_word.text not in matches:
127
+ matches[ref_word.text] = ([], similarity, ref_word_id)
128
+ matches[ref_word.text][0].append(source)
94
129
 
95
130
  # Create correction for best match if any found
96
131
  if matches:
97
- best_match, (sources, similarity) = max(
132
+ best_match, (sources, similarity, ref_word_id) = max(
98
133
  matches.items(), key=lambda x: (len(x[1][0]), x[1][1]) # Sort by number of sources, then similarity
99
134
  )
100
135
 
101
- source_confidence = len(sources) / len(gap.reference_words)
136
+ source_confidence = len(sources) / len(gap.reference_word_ids)
102
137
  final_confidence = similarity * source_confidence
103
138
 
104
- # Calculate reference positions for matching sources
105
- reference_positions = WordOperations.calculate_reference_positions(gap, sources)
139
+ # Calculate reference positions
140
+ reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
106
141
 
107
- self.logger.debug(f"Creating correction: {word} -> {best_match} (confidence: {final_confidence})")
142
+ self.logger.debug(f"Creating correction: {word.text} -> {best_match} (confidence: {final_confidence})")
108
143
  corrections.append(
109
- WordOperations.create_word_replacement_correction(
110
- original_word=word,
111
- corrected_word=best_match, # Using original formatted word
144
+ WordCorrection(
145
+ original_word=word.text,
146
+ corrected_word=best_match,
147
+ segment_index=0,
112
148
  original_position=gap.transcription_position + i,
113
- source=", ".join(sources),
114
149
  confidence=final_confidence,
115
- reason=f"LevenshteinHandler: String similarity ({final_confidence:.2f})",
150
+ source=", ".join(sources),
151
+ reason=f"String similarity ({final_confidence:.2f})",
152
+ alternatives={k: len(v[0]) for k, v in matches.items()},
153
+ is_deletion=False,
116
154
  reference_positions=reference_positions,
155
+ length=1,
156
+ handler="LevenshteinHandler",
157
+ word_id=word_id,
158
+ corrected_word_id=ref_word_id,
117
159
  )
118
160
  )
119
161
 
@@ -0,0 +1,290 @@
1
+ from typing import List, Optional, Tuple, Dict, Any, Union
2
+ import logging
3
+ import json
4
+ from ollama import chat
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+
8
+ from lyrics_transcriber.types import GapSequence, WordCorrection
9
+ from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
10
+ from lyrics_transcriber.correction.handlers.word_operations import WordOperations
11
+
12
+
13
+ class LLMHandler(GapCorrectionHandler):
14
+ """Uses an LLM to analyze and correct gaps by comparing with reference lyrics."""
15
+
16
def __init__(
    self,
    logger: Optional[logging.Logger] = None,
    cache_dir: Optional[Union[str, Path]] = None,
    model: str = "deepseek-r1:7b",
):
    """Set up the LLM gap handler.

    Args:
        logger: Logger to use; this module's logger is used when omitted.
        cache_dir: Base directory for LLM debug dumps. Stored as a ``Path``;
            ``None`` (or any falsy value) is stored as ``None``.
        model: Model name passed to the Ollama ``chat`` call. Defaults to the
            value that used to be hard-coded, so existing callers are
            unaffected.
    """
    super().__init__(logger)
    self.logger = logger or logging.getLogger(__name__)
    self.model = model
    self.cache_dir = Path(cache_dir) if cache_dir else None
21
+
22
def _format_prompt(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> str:
    """Build the LLM prompt describing a gap and its reference lyrics.

    Args:
        gap: Gap to describe. Reads ``transcribed_word_ids``,
            ``reference_word_ids`` (source -> list of word IDs) and
            ``preceding_anchor_id``.
        data: Handler context. ``"word_map"`` (word ID -> object with a
            ``.text`` attribute) is required; ``"metadata"`` (``artist``,
            ``title``, ``full_reference_texts``) and ``"anchor_sequences"``
            are used when present.

    Returns:
        The prompt text, or ``""`` (after logging an error) when ``data`` is
        missing or carries no word map.
    """
    # ``data`` defaults to None; normalise it once so every lookup below is
    # safe instead of raising AttributeError on ``None.get``.
    data = data or {}
    word_map = data.get("word_map", {})
    metadata = data.get("metadata", {})

    if not word_map:
        self.logger.error("No word_map provided in data")
        return ""

    def word_lines(word_ids):
        # One bullet per word; IDs that are not in the word map are skipped.
        return [f"- ID: {word_id}, Text: '{word_map[word_id].text}'\n" for word_id in word_ids if word_id in word_map]

    # Collect fragments and join once at the end rather than growing a string.
    parts = [
        "You are a lyrics correction expert. You will be given transcribed lyrics that may contain errors "
        "and reference lyrics from multiple sources. Your task is to analyze each word in the transcribed text "
        "and suggest specific corrections based on the reference lyrics.\n\n"
        "Each word has a unique ID. When suggesting corrections, you must specify the ID of the word being corrected. "
        "This ensures accuracy in applying your corrections.\n\n"
        "For each correction, specify:\n"
        "1. The word ID being corrected\n"
        "2. The correction type ('replace', 'split', 'combine', or 'delete')\n"
        "3. The corrected text\n"
        "4. Your confidence level\n"
        "5. The reason for the correction\n\n"
    ]

    # Song context is only added when both artist and title are known.
    if metadata and metadata.get("artist") and metadata.get("title"):
        parts.append(f"Song: {metadata['title']}\nArtist: {metadata['artist']}\n\n")

    parts.append("Transcribed words:\n")
    parts.extend(word_lines(gap.transcribed_word_ids))

    parts.append("\nReference lyrics from different sources:\n")
    for source, word_ids in gap.reference_word_ids.items():
        parts.append(f"\n{source} immediate context:\n")
        parts.extend(word_lines(word_ids))

        # Full lyrics for this source, when the metadata carries them.
        if metadata and metadata.get("full_reference_texts", {}).get(source):
            parts.append(f"\nFull {source} lyrics:\n{metadata['full_reference_texts'][source]}\n")

    # Words of the anchor that precedes the gap, when it can be found.
    if gap.preceding_anchor_id:
        preceding_anchor = next(
            (a.anchor for a in data.get("anchor_sequences", []) if a.anchor.id == gap.preceding_anchor_id),
            None,
        )
        if preceding_anchor:
            parts.append("\nPreceding correct words:\n")
            parts.extend(word_lines(preceding_anchor.transcribed_word_ids))

    parts.append(
        "\nProvide corrections in the following JSON format:\n"
        "{\n"
        ' "corrections": [\n'
        " {\n"
        ' "word_id": "id_of_word_to_correct",\n'
        ' "type": "replace|split|combine|delete",\n'
        ' "corrected_text": "new text",\n'
        ' "reference_word_id": "id_from_reference_lyrics", // Optional, use when matching a specific reference word\n'
        ' "confidence": 0.9,\n'
        ' "reason": "explanation of correction"\n'
        " }\n"
        " ]\n"
        "}\n\n"
        "Important rules:\n"
        "1. Always include the word_id for each correction\n"
        "2. For 'split' type, corrected_text should contain the space-separated words\n"
        "3. For 'combine' type, word_id should be the first word to combine\n"
        "4. Include reference_word_id when the correction matches a specific reference word\n"
        "5. Only suggest corrections when you're confident they improve the lyrics\n"
        "6. Preserve any existing words that match the reference lyrics\n"
        "7. Respond ONLY with the JSON object, no other text"
    )

    return "".join(parts)
108
+
109
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
    """Report whether this handler will attempt the gap.

    The only requirement is that the gap carries reference word IDs;
    ``data`` is accepted for interface compatibility and is not read.

    Returns:
        ``(True, {})`` when reference word IDs exist, otherwise ``(False, {})``.
    """
    has_reference_words = bool(gap.reference_word_ids)
    if has_reference_words:
        return True, {}

    self.logger.debug("No reference words available")
    return False, {}
116
+
117
def _write_debug_info(self, prompt: str, response: str, gap_index: int, audio_file_hash: Optional[str] = None) -> None:
    """Dump one prompt/response pair to a text file under ``cache_dir/llm_debug``.

    This is best-effort: filesystem failures are logged and never raised.

    Args:
        prompt: Prompt text that was sent to the model.
        response: Raw response text received from the model.
        gap_index: Number embedded in the file name to identify the gap.
        audio_file_hash: Optional prefix for the file name.
    """
    if not self.cache_dir:
        self.logger.warning("No cache directory provided, skipping LLM debug output")
        return

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    debug_dir = self.cache_dir / "llm_debug"

    hash_prefix = f"{audio_file_hash}_" if audio_file_hash else ""
    filename = debug_dir / f"llm_debug_{hash_prefix}{gap_index}_{timestamp}.txt"

    debug_content = f"=== LLM PROMPT ===\n{prompt}\n\n=== LLM RESPONSE ===\n{response}\n"

    try:
        # Directory creation is inside the guarded region too: an unwritable
        # cache directory must not raise out of a debug-only helper.
        debug_dir.mkdir(exist_ok=True, parents=True)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(debug_content)
    except OSError as e:  # IOError is an alias of OSError on Python 3
        self.logger.error(f"Failed to write LLM debug file: {e}")
137
+
138
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
    """Ask the LLM to correct a gap and convert its reply into corrections.

    Args:
        gap: Gap to correct. Reads ``transcribed_word_ids``,
            ``reference_word_ids`` and ``transcription_position``.
        data: Handler context. ``"word_map"`` (word ID -> object with a
            ``.text`` attribute) is required; ``"anchor_sequences"`` and
            ``"audio_file_hash"`` are used when present.

    Returns:
        The corrections built from the reply. An empty list is returned when
        there is no word map, no prompt could be built, the reply is not a
        usable JSON object, the model suggested nothing, or the model call
        fails. Individual malformed entries are logged and skipped without
        discarding the remaining entries.
    """
    if not data or "word_map" not in data:
        self.logger.error("No word_map provided in data")
        return []

    word_map = data["word_map"]

    # Build the prompt first: without one there is nothing to send, so the
    # reference-position calculation below would be wasted work.
    prompt = self._format_prompt(gap, data)
    if not prompt:
        return []

    transcribed_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids if word_id in word_map]

    # Fall back to an empty dict when the helper returns nothing.
    reference_positions = (
        WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", [])) or {}
    )

    # The gap's transcription position doubles as its index in debug file names.
    gap_index = gap.transcription_position

    try:
        self.logger.debug(f"Processing gap words: {transcribed_words}")
        self.logger.debug(f"Reference word IDs: {gap.reference_word_ids}")

        response = chat(model=self.model, messages=[{"role": "user", "content": prompt}], format="json")
        content = response.message.content

        self._write_debug_info(prompt, content, gap_index, audio_file_hash=data.get("audio_file_hash"))

        try:
            corrections_data = json.loads(content)
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse LLM response as JSON: {e}")
            self.logger.error(f"Raw response content: {content}")
            return []

        # Valid JSON is not necessarily the object we asked for.
        if not isinstance(corrections_data, dict):
            self.logger.error(f"LLM response is not a JSON object: {content}")
            return []

        suggested = corrections_data.get("corrections")
        if not suggested:
            self.logger.debug("No corrections suggested by LLM")
            return []
        if not isinstance(suggested, list):
            self.logger.error(f"LLM 'corrections' value is not a list: {suggested!r}")
            return []

        corrections: List[WordCorrection] = []
        for item in suggested:
            # Each entry is validated on its own so that one bad entry does
            # not throw away the rest of the reply.
            corrections.extend(self._corrections_from_llm_item(item, gap, word_map, reference_positions))

        self.logger.debug(f"Created {len(corrections)} corrections: {[f'{c.original_word}->{c.corrected_word}' for c in corrections]}")
        return corrections

    except Exception as e:
        # Best-effort boundary, as before: any failure while calling the model
        # or building corrections is logged (now with traceback) and yields
        # no corrections.
        self.logger.error(f"Unexpected error in LLM handler: {e}", exc_info=True)
        return []

def _corrections_from_llm_item(
    self,
    item: Any,
    gap: GapSequence,
    word_map: Dict[str, Any],
    reference_positions: Dict[str, Any],
) -> List[WordCorrection]:
    """Turn one entry of the LLM's ``corrections`` list into WordCorrections.

    Returns an empty list (after logging) when the entry is malformed, refers
    to a word outside the gap or the word map, or uses an unsupported type.
    """
    if not isinstance(item, dict):
        self.logger.error(f"Ignoring malformed LLM correction (not an object): {item!r}")
        return []

    word_id = item.get("word_id")
    if word_id not in gap.transcribed_word_ids:
        self.logger.error(f"LLM suggested correction for word_id {word_id} which is not in the gap")
        return []
    if word_id not in word_map:
        self.logger.error(f"LLM suggested correction for word_id {word_id} which is not in the word map")
        return []

    kind = item.get("type")
    if kind not in ("replace", "split", "combine", "delete"):
        self.logger.warning(f"Ignoring LLM correction with unsupported type {kind!r} for word_id {word_id}")
        return []

    required = ["confidence", "reason"]
    if kind != "delete":
        required.append("corrected_text")
    missing = [key for key in required if key not in item]
    if missing:
        self.logger.error(f"Ignoring LLM correction for word_id {word_id}: missing fields {missing}")
        return []

    corrected_text = item.get("corrected_text")
    if kind != "delete" and not isinstance(corrected_text, str):
        self.logger.error(f"Ignoring LLM correction for word_id {word_id}: corrected_text is not a string")
        return []

    original_word = word_map[word_id]
    word_index = gap.transcribed_word_ids.index(word_id)
    position = gap.transcription_position + word_index

    self.logger.debug(f"Processing correction: {item}")

    if kind == "replace":
        self.logger.debug(f"Creating replacement: '{original_word.text}' -> '{corrected_text}' at position {position}")
        return [
            WordOperations.create_word_replacement_correction(
                original_word=original_word.text,
                corrected_word=corrected_text,
                original_position=position,
                source="LLM",
                confidence=item["confidence"],
                reason=item["reason"],
                handler="LLMHandler",
                reference_positions=reference_positions,
                original_word_id=word_id,
                corrected_word_id=item.get("reference_word_id"),
            )
        ]

    if kind == "split":
        split_words = corrected_text.split()
        self.logger.debug(f"Creating split: '{original_word.text}' -> {split_words} at position {position}")

        # Reference IDs are optional; default to one placeholder per new word
        # (also when the model sends an explicit null or empty list).
        reference_word_ids = item.get("reference_word_ids") or [None] * len(split_words)

        return list(
            WordOperations.create_word_split_corrections(
                original_word=original_word.text,
                reference_words=split_words,
                original_position=position,
                source="LLM",
                confidence=item["confidence"],
                reason=item["reason"],
                handler="LLMHandler",
                reference_positions=reference_positions,
                original_word_id=word_id,
                corrected_word_ids=reference_word_ids,
            )
        )

    if kind == "combine":
        # NOTE(review): the number of transcribed words to merge is taken from
        # the word count of corrected_text, exactly as before; confirm this
        # matches what the model is expected to return for a combine.
        words_needed = len(corrected_text.split())
        if word_index + words_needed > len(gap.transcribed_word_ids):
            self.logger.error(f"Not enough words available to combine at position {position}")
            return []

        word_ids_to_combine = gap.transcribed_word_ids[word_index : word_index + words_needed]
        if not word_ids_to_combine or any(wid not in word_map for wid in word_ids_to_combine):
            self.logger.error(f"Cannot resolve the words to combine at position {position}")
            return []

        words_to_combine = [word_map[wid].text for wid in word_ids_to_combine]
        self.logger.debug(f"Creating combine: {words_to_combine} -> '{corrected_text}' at position {position}")

        return list(
            WordOperations.create_word_combine_corrections(
                original_words=words_to_combine,
                reference_word=corrected_text,
                original_position=position,
                source="LLM",
                confidence=item["confidence"],
                combine_reason=item["reason"],
                delete_reason=f"Part of combining words: {item['reason']}",
                handler="LLMHandler",
                reference_positions=reference_positions,
                original_word_ids=word_ids_to_combine,
                corrected_word_id=item.get("reference_word_id"),
            )
        )

    # kind == "delete"
    self.logger.debug(f"Creating deletion: '{original_word.text}' at position {position}")
    return [
        WordCorrection(
            original_word=original_word.text,
            corrected_word="",
            segment_index=0,
            original_position=position,
            confidence=item["confidence"],
            source="LLM",
            reason=item["reason"],
            alternatives={},
            is_deletion=True,
            handler="LLMHandler",
            reference_positions=reference_positions,
            word_id=word_id,
            corrected_word_id=None,  # Deleted words don't need a corrected ID
        )
    ]
@@ -20,88 +20,133 @@ class NoSpacePunctuationMatchHandler(GapCorrectionHandler):
20
20
  # Remove all punctuation including apostrophes
21
21
  return re.sub(r"[^\w\s]", "", text)
22
22
 
23
- def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
23
+ def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
24
24
  # Must have reference words
25
- if not gap.reference_words:
26
- self.logger.debug("No reference words available.")
25
+ if not gap.reference_word_ids:
26
+ self.logger.debug("No reference word IDs available.")
27
27
  return False, {}
28
28
 
29
+ # Get word lookup map from data
30
+ if not data or "word_map" not in data:
31
+ self.logger.error("No word_map provided in data")
32
+ return False, {}
33
+
34
+ word_map = data["word_map"]
35
+
36
+ # Get the actual words from word IDs
37
+ gap_words = []
38
+ for word_id in gap.transcribed_word_ids:
39
+ if word_id not in word_map:
40
+ self.logger.error(f"Word ID {word_id} not found in word_map")
41
+ return False, {}
42
+ gap_words.append(word_map[word_id].text)
43
+
29
44
  # Get the gap text without spaces and punctuation
30
- gap_text = self._remove_spaces_and_punct(gap.words)
45
+ gap_text = self._remove_spaces_and_punct(gap_words)
31
46
 
32
47
  # Check if any reference source matches when spaces and punctuation are removed
33
- for words in gap.reference_words.values():
34
- ref_text = self._remove_spaces_and_punct(words)
48
+ for source, ref_word_ids in gap.reference_word_ids.items():
49
+ ref_words = []
50
+ for word_id in ref_word_ids:
51
+ if word_id not in word_map:
52
+ self.logger.error(f"Reference word ID {word_id} not found in word_map")
53
+ continue
54
+ ref_words.append(word_map[word_id].text)
55
+
56
+ if not ref_words:
57
+ continue
58
+
59
+ ref_text = self._remove_spaces_and_punct(ref_words)
35
60
  if gap_text == ref_text:
36
61
  self.logger.debug("Found a matching reference source with spaces and punctuation removed.")
37
- return True, {}
62
+ return True, {
63
+ "matching_source": source,
64
+ "reference_word_ids": ref_word_ids,
65
+ "word_map": word_map,
66
+ }
38
67
 
39
68
  self.logger.debug("No matching reference source found with spaces and punctuation removed.")
40
69
  return False, {}
41
70
 
42
71
  def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
43
- corrections = []
72
+ """Handle the gap using no-space punctuation matching."""
73
+ if not data:
74
+ can_handle, data = self.can_handle(gap)
75
+ if not can_handle:
76
+ return []
44
77
 
45
- # Find the matching source (we know there is at least one from can_handle)
46
- gap_text = self._remove_spaces_and_punct(gap.words)
47
- matching_source = None
48
- reference_words = None
49
- reference_words_original = None
50
- for source, words in gap.reference_words.items():
51
- if self._remove_spaces_and_punct(words) == gap_text:
52
- matching_source = source
53
- reference_words = words
54
- reference_words_original = gap.reference_words_original[source]
55
- self.logger.debug(f"Using source '{source}' for corrections.")
56
- break
78
+ corrections = []
79
+ matching_source = data["matching_source"]
80
+ reference_word_ids = data["reference_word_ids"]
81
+ word_map = data["word_map"]
57
82
 
58
83
  # Calculate reference positions for the matching source
59
84
  reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])
60
85
 
61
86
  # Handle cases where number of words differ
62
- if len(gap.words) > len(reference_words):
87
+ if len(gap.transcribed_word_ids) > len(reference_word_ids):
63
88
  # Multiple transcribed words -> fewer reference words
89
+ # Get the actual words from word IDs
90
+ gap_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids]
91
+ ref_word = word_map[reference_word_ids[0]].text
92
+
64
93
  corrections.extend(
65
94
  WordOperations.create_word_combine_corrections(
66
- original_words=gap.words,
67
- reference_word=reference_words_original[0],
95
+ original_words=gap_words,
96
+ reference_word=ref_word,
68
97
  original_position=gap.transcription_position,
69
98
  source=matching_source,
70
99
  confidence=1.0,
71
- combine_reason="NoSpacePunctuationMatchHandler: Words combined based on text match",
72
- delete_reason="NoSpacePunctuationMatchHandler: Word removed as part of text match combination",
100
+ combine_reason="Words combined based on text match",
101
+ delete_reason="Word removed as part of text match combination",
73
102
  reference_positions=reference_positions,
103
+ handler="NoSpacePunctuationMatchHandler",
104
+ original_word_ids=gap.transcribed_word_ids,
105
+ corrected_word_id=reference_word_ids[0], # Use the reference word's ID
74
106
  )
75
107
  )
76
- self.logger.debug(f"Combined words into '{reference_words_original[0]}'.")
108
+ self.logger.debug(f"Combined words into '{ref_word}'.")
77
109
 
78
- elif len(gap.words) < len(reference_words):
110
+ elif len(gap.transcribed_word_ids) < len(reference_word_ids):
79
111
  # Single transcribed word -> multiple reference words
112
+ # Get the actual words
113
+ gap_word = word_map[gap.transcribed_word_ids[0]].text
114
+ ref_words = [word_map[word_id].text for word_id in reference_word_ids]
115
+
80
116
  corrections.extend(
81
117
  WordOperations.create_word_split_corrections(
82
- original_word=gap.words[0],
83
- reference_words=reference_words_original,
118
+ original_word=gap_word,
119
+ reference_words=ref_words,
84
120
  original_position=gap.transcription_position,
85
121
  source=matching_source,
86
122
  confidence=1.0,
87
- reason="NoSpacePunctuationMatchHandler: Split word based on text match",
123
+ reason="Split word based on text match",
88
124
  reference_positions=reference_positions,
125
+ handler="NoSpacePunctuationMatchHandler",
126
+ original_word_id=gap.transcribed_word_ids[0],
127
+ corrected_word_ids=reference_word_ids, # Use the reference word IDs
89
128
  )
90
129
  )
91
- self.logger.debug(f"Split word '{gap.words[0]}' into {reference_words_original}.")
130
+ self.logger.debug(f"Split word '{gap_word}' into {ref_words}.")
92
131
 
93
132
  else:
94
133
  # One-to-one replacement
95
- for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
96
- if orig_word.lower() != ref_word.lower():
134
+ for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
135
+ orig_word = word_map[orig_word_id]
136
+ ref_word = word_map[ref_word_id]
137
+
138
+ if orig_word.text.lower() != ref_word.text.lower():
97
139
  correction = WordOperations.create_word_replacement_correction(
98
- original_word=orig_word,
99
- corrected_word=ref_word_original,
140
+ original_word=orig_word.text,
141
+ corrected_word=ref_word.text,
100
142
  original_position=gap.transcription_position + i,
101
143
  source=matching_source,
102
144
  confidence=1.0,
103
- reason=f"NoSpacePunctuationMatchHandler: Source '{matching_source}' matched when spaces and punctuation removed",
145
+ reason=f"Source '{matching_source}' matched when spaces and punctuation removed",
104
146
  reference_positions=reference_positions,
147
+ handler="NoSpacePunctuationMatchHandler",
148
+ original_word_id=orig_word_id,
149
+ corrected_word_id=ref_word_id,
105
150
  )
106
151
  corrections.append(correction)
107
152
  self.logger.debug(f"Correction made: {correction}")