lyrics-transcriber 0.41.0__py3-none-any.whl → 0.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. lyrics_transcriber/core/controller.py +30 -52
  2. lyrics_transcriber/correction/anchor_sequence.py +325 -150
  3. lyrics_transcriber/correction/corrector.py +224 -107
  4. lyrics_transcriber/correction/handlers/base.py +28 -10
  5. lyrics_transcriber/correction/handlers/extend_anchor.py +47 -24
  6. lyrics_transcriber/correction/handlers/levenshtein.py +75 -33
  7. lyrics_transcriber/correction/handlers/llm.py +290 -0
  8. lyrics_transcriber/correction/handlers/no_space_punct_match.py +81 -36
  9. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +46 -26
  10. lyrics_transcriber/correction/handlers/repeat.py +28 -11
  11. lyrics_transcriber/correction/handlers/sound_alike.py +68 -32
  12. lyrics_transcriber/correction/handlers/syllables_match.py +80 -30
  13. lyrics_transcriber/correction/handlers/word_count_match.py +36 -19
  14. lyrics_transcriber/correction/handlers/word_operations.py +68 -22
  15. lyrics_transcriber/correction/text_utils.py +3 -7
  16. lyrics_transcriber/frontend/.yarn/install-state.gz +0 -0
  17. lyrics_transcriber/frontend/.yarn/releases/yarn-4.6.0.cjs +934 -0
  18. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  19. lyrics_transcriber/frontend/dist/assets/{index-DKnNJHRK.js → index-D0Gr3Ep7.js} +16509 -9038
  20. lyrics_transcriber/frontend/dist/assets/index-D0Gr3Ep7.js.map +1 -0
  21. lyrics_transcriber/frontend/dist/index.html +1 -1
  22. lyrics_transcriber/frontend/package.json +6 -2
  23. lyrics_transcriber/frontend/src/App.tsx +18 -2
  24. lyrics_transcriber/frontend/src/api.ts +103 -6
  25. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +14 -6
  26. lyrics_transcriber/frontend/src/components/DetailsModal.tsx +86 -59
  27. lyrics_transcriber/frontend/src/components/EditModal.tsx +281 -63
  28. lyrics_transcriber/frontend/src/components/FileUpload.tsx +2 -2
  29. lyrics_transcriber/frontend/src/components/Header.tsx +249 -0
  30. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +320 -266
  31. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +120 -0
  32. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +174 -52
  33. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +158 -114
  34. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +59 -78
  35. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +39 -16
  36. lyrics_transcriber/frontend/src/components/WordEditControls.tsx +4 -10
  37. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +134 -68
  38. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +1 -1
  39. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +85 -115
  40. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  41. lyrics_transcriber/frontend/src/components/shared/types.ts +15 -7
  42. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +67 -0
  43. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  44. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +7 -7
  45. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +121 -0
  46. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  47. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  48. lyrics_transcriber/frontend/src/types.js +2 -0
  49. lyrics_transcriber/frontend/src/types.ts +70 -49
  50. lyrics_transcriber/frontend/src/validation.ts +132 -0
  51. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  52. lyrics_transcriber/frontend/yarn.lock +3752 -0
  53. lyrics_transcriber/lyrics/base_lyrics_provider.py +75 -12
  54. lyrics_transcriber/lyrics/file_provider.py +6 -5
  55. lyrics_transcriber/lyrics/genius.py +5 -2
  56. lyrics_transcriber/lyrics/spotify.py +58 -21
  57. lyrics_transcriber/output/ass/config.py +16 -5
  58. lyrics_transcriber/output/cdg.py +1 -1
  59. lyrics_transcriber/output/generator.py +22 -8
  60. lyrics_transcriber/output/plain_text.py +15 -10
  61. lyrics_transcriber/output/segment_resizer.py +16 -3
  62. lyrics_transcriber/output/subtitles.py +27 -1
  63. lyrics_transcriber/output/video.py +107 -1
  64. lyrics_transcriber/review/__init__.py +0 -1
  65. lyrics_transcriber/review/server.py +337 -164
  66. lyrics_transcriber/transcribers/audioshake.py +3 -0
  67. lyrics_transcriber/transcribers/base_transcriber.py +11 -3
  68. lyrics_transcriber/transcribers/whisper.py +11 -1
  69. lyrics_transcriber/types.py +151 -105
  70. lyrics_transcriber/utils/word_utils.py +27 -0
  71. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/METADATA +3 -1
  72. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/RECORD +75 -61
  73. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/WHEEL +1 -1
  74. lyrics_transcriber/frontend/dist/assets/index-DKnNJHRK.js.map +0 -1
  75. lyrics_transcriber/frontend/package-lock.json +0 -4260
  76. lyrics_transcriber/frontend/src/components/shared/utils/initializeDataWithIds.tsx +0 -202
  77. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/LICENSE +0 -0
  78. {lyrics_transcriber-0.41.0.dist-info → lyrics_transcriber-0.43.0.dist-info}/entry_points.txt +0 -0
@@ -11,53 +11,73 @@ class RelaxedWordCountMatchHandler(GapCorrectionHandler):
11
11
 
12
12
  def __init__(self, logger: Optional[logging.Logger] = None):
13
13
  super().__init__(logger)
14
- self.logger = logger
14
+ self.logger = logger or logging.getLogger(__name__)
15
15
 
16
- def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
16
+ def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
17
17
  # Must have reference words
18
- if not gap.reference_words:
19
- self.logger.debug("No reference words available.")
18
+ if not gap.reference_word_ids:
19
+ self.logger.debug("No reference word IDs available.")
20
+ return False, {}
21
+
22
+ if not self._validate_data(data):
20
23
  return False, {}
21
24
 
22
25
  # Check if any source has matching word count
23
- for source, words in gap.reference_words.items():
24
- if len(words) == gap.length:
26
+ for source, ref_word_ids in gap.reference_word_ids.items():
27
+ if len(ref_word_ids) == gap.length:
25
28
  self.logger.debug(f"Source '{source}' has matching word count.")
26
- return True, {}
29
+ return True, {
30
+ "matching_source": source,
31
+ "reference_word_ids": ref_word_ids,
32
+ "word_map": data["word_map"],
33
+ "anchor_sequences": data.get("anchor_sequences", []),
34
+ }
27
35
 
28
36
  self.logger.debug("No source with matching word count found.")
29
37
  return False, {}
30
38
 
31
39
  def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
32
- corrections = []
40
+ """Handle the gap using word count matching."""
41
+ if not self._validate_data(data):
42
+ return []
33
43
 
34
- # Find the first source that has matching word count
35
- matching_source = None
36
- reference_words = None
37
- reference_words_original = None
38
- for source, words in gap.reference_words.items():
39
- if len(words) == gap.length:
40
- matching_source = source
41
- reference_words = words
42
- reference_words_original = gap.reference_words_original[source]
43
- self.logger.debug(f"Using source '{source}' for corrections.")
44
- break
44
+ corrections = []
45
+ matching_source = data["matching_source"]
46
+ reference_word_ids = data["reference_word_ids"]
47
+ word_map = data["word_map"]
48
+ anchor_sequences = data.get("anchor_sequences", [])
45
49
 
46
- # Use the centralized method to calculate reference positions for the matching source
47
- reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])
50
+ # Use the centralized method to calculate reference positions
51
+ reference_positions = WordOperations.calculate_reference_positions(
52
+ gap, sources=[matching_source], anchor_sequences=anchor_sequences
53
+ )
48
54
  self.logger.debug(f"Calculated reference positions: {reference_positions}")
49
55
 
50
56
  # Since we found a source with matching word count, we can correct using that source
51
- for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
52
- if orig_word.lower() != ref_word.lower():
57
+ for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
58
+ # Get the actual words from the word map
59
+ if orig_word_id not in word_map:
60
+ self.logger.error(f"Original word ID {orig_word_id} not found in word_map")
61
+ continue
62
+ orig_word = word_map[orig_word_id]
63
+
64
+ if ref_word_id not in word_map:
65
+ self.logger.error(f"Reference word ID {ref_word_id} not found in word_map")
66
+ continue
67
+ ref_word = word_map[ref_word_id]
68
+
69
+ if orig_word.text.lower() != ref_word.text.lower():
53
70
  correction = WordOperations.create_word_replacement_correction(
54
- original_word=orig_word,
55
- corrected_word=ref_word_original,
71
+ original_word=orig_word.text,
72
+ corrected_word=ref_word.text,
56
73
  original_position=gap.transcription_position + i,
57
74
  source=matching_source,
58
75
  confidence=1.0,
59
- reason=f"RelaxedWordCountMatchHandler: Source '{matching_source}' had matching word count",
76
+ reason=f"Source '{matching_source}' had matching word count",
60
77
  reference_positions=reference_positions,
78
+ handler="RelaxedWordCountMatchHandler",
79
+ original_word_id=orig_word_id,
80
+ corrected_word_id=ref_word_id, # Use the reference word's ID
61
81
  )
62
82
  corrections.append(correction)
63
83
  self.logger.debug(f"Correction made: {correction}")
@@ -9,13 +9,17 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
9
9
  """Handler that applies corrections that were previously made by other handlers."""
10
10
 
11
11
  def __init__(self, logger: Optional[logging.Logger] = None, confidence_threshold: float = 0.7):
12
+ super().__init__(logger)
12
13
  self.logger = logger or logging.getLogger(__name__)
13
14
  self.confidence_threshold = confidence_threshold
14
15
  self.previous_corrections: List[WordCorrection] = []
15
16
 
16
- def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
17
+ def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
17
18
  """Check if any words in the gap match previous corrections."""
18
- return bool(self.previous_corrections), {}
19
+ if not self._validate_data(data):
20
+ return False, {}
21
+
22
+ return bool(self.previous_corrections), {"word_map": data["word_map"], "anchor_sequences": data.get("anchor_sequences", [])}
19
23
 
20
24
  def set_previous_corrections(self, corrections: List[WordCorrection]) -> None:
21
25
  """Store corrections from previous handlers to use as reference."""
@@ -23,10 +27,14 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
23
27
 
24
28
  def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
25
29
  """Apply previous corrections to matching words in the current gap."""
30
+ if not self._validate_data(data):
31
+ return []
32
+
33
+ word_map = data["word_map"]
26
34
  corrections = []
27
35
 
28
36
  # Use the centralized method to calculate reference positions
29
- reference_positions = WordOperations.calculate_reference_positions(gap)
37
+ reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
30
38
 
31
39
  # Build a map of original words to their corrections
32
40
  correction_map: Dict[str, List[WordCorrection]] = {}
@@ -35,8 +43,14 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
35
43
  correction_map.setdefault(corr.original_word.lower(), []).append(corr)
36
44
 
37
45
  # Check each word in the gap
38
- for i, word in enumerate(gap.words):
39
- word_lower = word.lower()
46
+ for i, word_id in enumerate(gap.transcribed_word_ids):
47
+ if word_id not in word_map:
48
+ self.logger.error(f"Word ID {word_id} not found in word map")
49
+ continue
50
+
51
+ word = word_map[word_id]
52
+ word_lower = word.text.lower()
53
+
40
54
  if word_lower in correction_map:
41
55
  # Get the most common correction for this word
42
56
  prev_corrections = correction_map[word_lower]
@@ -46,13 +60,13 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
46
60
  )
47
61
 
48
62
  self.logger.debug(
49
- f"Applying previous correction: {word} -> {best_correction.corrected_word} "
63
+ f"Applying previous correction: {word.text} -> {best_correction.corrected_word} "
50
64
  f"(confidence: {best_correction.confidence:.2f})"
51
65
  )
52
66
 
53
67
  corrections.append(
54
68
  WordCorrection(
55
- original_word=word,
69
+ original_word=word.text,
56
70
  corrected_word=best_correction.corrected_word,
57
71
  segment_index=0,
58
72
  original_position=gap.transcription_position + i,
@@ -61,10 +75,13 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
61
75
  reason=f"RepeatCorrectionHandler: Matches previous correction",
62
76
  alternatives={best_correction.corrected_word: 1},
63
77
  is_deletion=best_correction.is_deletion,
64
- reference_positions=reference_positions, # Add reference positions
65
- length=best_correction.length, # Preserve length from original correction
66
- split_index=best_correction.split_index, # Preserve split info if present
67
- split_total=best_correction.split_total, # Preserve split info if present
78
+ reference_positions=reference_positions,
79
+ length=best_correction.length,
80
+ split_index=best_correction.split_index,
81
+ split_total=best_correction.split_total,
82
+ handler="RepeatCorrectionHandler",
83
+ word_id=word_id,
84
+ corrected_word_id=best_correction.corrected_word_id,
68
85
  )
69
86
  )
70
87
 
@@ -36,54 +36,86 @@ class SoundAlikeHandler(GapCorrectionHandler):
36
36
  self.logger = logger or logging.getLogger(__name__)
37
37
  self.similarity_threshold = similarity_threshold
38
38
 
39
- def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
39
+ def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
40
+ """Check if any gap word has a metaphone match with any reference word."""
41
+ if not self._validate_data(data):
42
+ return False, {}
43
+
44
+ word_map = data["word_map"]
45
+
40
46
  # Must have reference words
41
- if not gap.reference_words:
47
+ if not gap.reference_word_ids:
42
48
  self.logger.debug("No reference words available")
43
49
  return False, {}
44
50
 
45
51
  # Gap must have words
46
- if not gap.words:
52
+ if not gap.transcribed_word_ids:
47
53
  self.logger.debug("No gap words available")
48
54
  return False, {}
49
55
 
50
56
  # Check if any gap word has a metaphone match with any reference word
51
- for word in gap.words:
52
- word_codes = doublemetaphone(word)
53
- self.logger.debug(f"Gap word '{word}' has metaphone codes: {word_codes}")
54
- for ref_words in gap.reference_words.values():
55
- for ref_word in ref_words:
56
- ref_codes = doublemetaphone(ref_word)
57
- self.logger.debug(f"Reference word '{ref_word}' has metaphone codes: {ref_codes}")
57
+ for word_id in gap.transcribed_word_ids:
58
+ if word_id not in word_map:
59
+ continue
60
+ word = word_map[word_id]
61
+ word_codes = doublemetaphone(word.text)
62
+ self.logger.debug(f"Gap word '{word.text}' has metaphone codes: {word_codes}")
63
+
64
+ for source, ref_word_ids in gap.reference_word_ids.items():
65
+ for ref_word_id in ref_word_ids:
66
+ if ref_word_id not in word_map:
67
+ continue
68
+ ref_word = word_map[ref_word_id]
69
+ ref_codes = doublemetaphone(ref_word.text)
70
+ self.logger.debug(f"Reference word '{ref_word.text}' has metaphone codes: {ref_codes}")
58
71
  if self._codes_match(word_codes, ref_codes):
59
- self.logger.debug(f"Found metaphone match between '{word}' and '{ref_word}'")
72
+ self.logger.debug(f"Found metaphone match between '{word.text}' and '{ref_word.text}'")
60
73
  return True, {}
74
+
61
75
  self.logger.debug("No metaphone matches found")
62
76
  return False, {}
63
77
 
64
78
  def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
79
+ """Process the gap and create corrections for sound-alike matches."""
80
+ if not self._validate_data(data):
81
+ return []
82
+
83
+ word_map = data["word_map"]
65
84
  corrections = []
66
85
 
67
- # Use the centralized method to calculate reference positions for all sources
68
- reference_positions = WordOperations.calculate_reference_positions(gap)
86
+ # Use the centralized method to calculate reference positions
87
+ reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
69
88
 
70
89
  # For each word in the gap
71
- for i, word in enumerate(gap.words):
72
- word_codes = doublemetaphone(word)
73
- self.logger.debug(f"Processing '{word}' (codes: {word_codes})")
90
+ for i, word_id in enumerate(gap.transcribed_word_ids):
91
+ if word_id not in word_map:
92
+ continue
93
+ word = word_map[word_id]
94
+ word_codes = doublemetaphone(word.text)
95
+ self.logger.debug(f"Processing '{word.text}' (codes: {word_codes})")
74
96
 
75
97
  # Skip if word exactly matches any reference
76
- exact_match = any(i < len(ref_words) and word.lower() == ref_words[i].lower() for ref_words in gap.reference_words.values())
98
+ exact_match = False
99
+ for source, ref_word_ids in gap.reference_word_ids.items():
100
+ if i < len(ref_word_ids):
101
+ ref_word_id = ref_word_ids[i]
102
+ if ref_word_id in word_map:
103
+ ref_word = word_map[ref_word_id]
104
+ if word.text.lower() == ref_word.text.lower():
105
+ exact_match = True
106
+ break
77
107
  if exact_match:
78
108
  continue
79
109
 
80
110
  # Find sound-alike matches in references
81
- matches: Dict[str, Tuple[List[str], float]] = {}
111
+ matches: Dict[str, Tuple[List[str], float, str]] = {} # Added word_id to tuple
82
112
 
83
- for source, ref_words in gap.reference_words.items():
84
- ref_words_original = gap.reference_words_original[source] # Get original formatted words
85
- for j, (ref_word, ref_word_original) in enumerate(zip(ref_words, ref_words_original)):
86
- ref_codes = doublemetaphone(ref_word)
113
+ for source, ref_word_ids in gap.reference_word_ids.items():
114
+ for j, ref_word_id in enumerate(ref_word_ids):
115
+ if ref_word_id not in word_map:
116
+ continue
117
+ ref_word = word_map[ref_word_id]
118
+ ref_codes = doublemetaphone(ref_word.text)
87
119
 
88
120
  match_confidence = self._get_match_confidence(word_codes, ref_codes)
89
121
  if match_confidence >= self.similarity_threshold:
@@ -94,22 +126,23 @@ class SoundAlikeHandler(GapCorrectionHandler):
94
126
  adjusted_confidence = match_confidence * position_multiplier
95
127
 
96
128
  if adjusted_confidence >= self.similarity_threshold:
97
- if ref_word_original not in matches: # Use original formatted word as key
98
- matches[ref_word_original] = ([], adjusted_confidence)
99
- matches[ref_word_original][0].append(source)
129
+ if ref_word.text not in matches:
130
+ matches[ref_word.text] = ([], adjusted_confidence, ref_word_id)
131
+ matches[ref_word.text][0].append(source)
100
132
 
101
133
  # Create correction for best match if any found
102
134
  if matches:
103
- best_match, (sources, base_confidence) = max(matches.items(), key=lambda x: (len(x[1][0]), x[1][1]))
135
+ best_match, (sources, base_confidence, ref_word_id) = max(matches.items(), key=lambda x: (len(x[1][0]), x[1][1]))
104
136
 
105
- source_confidence = len(sources) / len(gap.reference_words)
137
+ source_confidence = len(sources) / len(gap.reference_word_ids)
106
138
  final_confidence = base_confidence * source_confidence
107
139
 
108
- self.logger.debug(f"Found match: {word} -> {best_match} (confidence: {final_confidence:.2f}, sources: {sources})")
140
+ self.logger.debug(f"Found match: {word.text} -> {best_match} " f"(confidence: {final_confidence:.2f}, sources: {sources})")
141
+
109
142
  corrections.append(
110
143
  WordCorrection(
111
- original_word=word,
112
- corrected_word=best_match, # Already using original formatted word
144
+ original_word=word.text,
145
+ corrected_word=best_match,
113
146
  segment_index=0,
114
147
  original_position=gap.transcription_position + i,
115
148
  confidence=final_confidence,
@@ -117,8 +150,11 @@ class SoundAlikeHandler(GapCorrectionHandler):
117
150
  reason=f"SoundAlikeHandler: Phonetic match ({final_confidence:.2f} confidence)",
118
151
  alternatives={k: len(v[0]) for k, v in matches.items()},
119
152
  is_deletion=False,
120
- reference_positions=reference_positions, # Add reference positions
121
- length=1, # Single word replacement
153
+ reference_positions=reference_positions,
154
+ length=1,
155
+ handler="SoundAlikeHandler",
156
+ word_id=word_id,
157
+ corrected_word_id=ref_word_id,
122
158
  )
123
159
  )
124
160
 
@@ -102,18 +102,44 @@ class SyllablesMatchHandler(GapCorrectionHandler):
102
102
  )
103
103
  return [spacy_count, pyphen_count, nltk_count, syllables_count]
104
104
 
105
- def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
105
+ def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
106
106
  # Must have reference words
107
- if not gap.reference_words:
108
- self.logger.debug("No reference words available")
107
+ if not gap.reference_word_ids:
108
+ self.logger.debug("No reference word IDs available")
109
109
  return False, {}
110
110
 
111
+ # Get word lookup map from data
112
+ if not data or "word_map" not in data:
113
+ self.logger.error("No word_map provided in data")
114
+ return False, {}
115
+
116
+ word_map = data["word_map"]
117
+
118
+ # Get actual words from word IDs
119
+ gap_words = []
120
+ for word_id in gap.transcribed_word_ids:
121
+ if word_id not in word_map:
122
+ self.logger.error(f"Word ID {word_id} not found in word_map")
123
+ return False, {}
124
+ gap_words.append(word_map[word_id].text)
125
+
111
126
  # Get syllable counts for gap text using different methods
112
- gap_syllables = self._count_syllables(gap.words)
127
+ gap_syllables = self._count_syllables(gap_words)
113
128
 
114
129
  # Check if any reference source has matching syllable count with any method
115
- for source, words in gap.reference_words.items():
116
- ref_syllables = self._count_syllables(words)
130
+ for source, ref_word_ids in gap.reference_word_ids.items():
131
+ # Get reference words from word map
132
+ ref_words = []
133
+ for word_id in ref_word_ids:
134
+ if word_id not in word_map:
135
+ self.logger.error(f"Reference word ID {word_id} not found in word_map")
136
+ continue
137
+ ref_words.append(word_map[word_id].text)
138
+
139
+ if not ref_words:
140
+ continue
141
+
142
+ ref_syllables = self._count_syllables(ref_words)
117
143
 
118
144
  # If any counting method matches between gap and reference, we can handle it
119
145
  if any(gap_count == ref_count for gap_count in gap_syllables for ref_count in ref_syllables):
@@ -121,81 +147,105 @@ class SyllablesMatchHandler(GapCorrectionHandler):
121
147
  return True, {
122
148
  "gap_syllables": gap_syllables,
123
149
  "matching_source": source,
124
- "reference_words": words,
125
- "reference_words_original": gap.reference_words_original[source],
150
+ "reference_word_ids": ref_word_ids,
151
+ "word_map": word_map,
126
152
  }
127
153
 
128
154
  self.logger.debug("No reference source had matching syllable count")
129
155
  return False, {}
130
156
 
131
- def handle(self, gap: GapSequence, data: Dict[str, Any]) -> List[WordCorrection]:
157
+ def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
158
+ """Handle the gap using syllable matching."""
159
+ if not data:
160
+ can_handle, data = self.can_handle(gap)
161
+ if not can_handle:
162
+ return []
163
+
132
164
  corrections = []
133
165
  matching_source = data["matching_source"]
134
- reference_words = data["reference_words"]
135
- reference_words_original = data["reference_words_original"]
166
+ reference_word_ids = data["reference_word_ids"]
167
+ word_map = data["word_map"]
168
+
169
+ # Get the actual words from word IDs
170
+ gap_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids]
171
+ ref_words = [word_map[word_id].text for word_id in reference_word_ids]
136
172
 
137
173
  # Use the centralized method to calculate reference positions
138
174
  reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])
139
175
 
140
176
  # Since we matched syllable counts for the entire gap, we should handle all words
141
- if len(gap.words) > len(reference_words):
177
+ if len(gap_words) > len(ref_words):
142
178
  # Multiple transcribed words -> fewer reference words
143
179
  # Try to distribute the reference words across the gap words
144
- words_per_ref = len(gap.words) / len(reference_words)
180
+ words_per_ref = len(gap_words) / len(ref_words)
145
181
 
146
- for ref_idx, ref_word_original in enumerate(reference_words_original):
182
+ for ref_idx, ref_word_id in enumerate(reference_word_ids):
147
183
  start_idx = int(ref_idx * words_per_ref)
148
184
  end_idx = int((ref_idx + 1) * words_per_ref)
149
185
 
150
186
  # Get the group of words to combine
151
- words_to_combine = gap.words[start_idx:end_idx]
187
+ words_to_combine = gap_words[start_idx:end_idx]
188
+ word_ids_to_combine = gap.transcribed_word_ids[start_idx:end_idx]
152
189
  corrections.extend(
153
190
  WordOperations.create_word_combine_corrections(
154
191
  original_words=words_to_combine,
155
- reference_word=ref_word_original,
192
+ reference_word=word_map[ref_word_id].text,
156
193
  original_position=gap.transcription_position + start_idx,
157
194
  source=matching_source,
158
195
  confidence=0.8,
159
- combine_reason="SyllablesMatchHandler: Words combined based on syllable match",
160
- delete_reason="SyllablesMatchHandler: Word removed as part of syllable match combination",
196
+ combine_reason="Words combined based on syllable match",
197
+ delete_reason="Word removed as part of syllable match combination",
161
198
  reference_positions=reference_positions,
199
+ handler="SyllablesMatchHandler",
200
+ original_word_ids=word_ids_to_combine,
201
+ corrected_word_id=ref_word_id,
162
202
  )
163
203
  )
164
204
 
165
- elif len(gap.words) < len(reference_words):
205
+ elif len(gap_words) < len(ref_words):
166
206
  # Single transcribed word -> multiple reference words
167
- words_per_gap = len(reference_words) / len(gap.words)
207
+ words_per_gap = len(ref_words) / len(gap_words)
168
208
 
169
- for i, orig_word in enumerate(gap.words):
209
+ for i, word_id in enumerate(gap.transcribed_word_ids):
170
210
  start_idx = int(i * words_per_gap)
171
211
  end_idx = int((i + 1) * words_per_gap)
172
- ref_words_original_for_orig = reference_words_original[start_idx:end_idx]
212
+ ref_word_ids_for_split = reference_word_ids[start_idx:end_idx]
213
+ ref_words_for_split = [word_map[ref_id].text for ref_id in ref_word_ids_for_split]
173
214
 
174
215
  corrections.extend(
175
216
  WordOperations.create_word_split_corrections(
176
- original_word=orig_word,
177
- reference_words=ref_words_original_for_orig,
217
+ original_word=word_map[word_id].text,
218
+ reference_words=ref_words_for_split,
178
219
  original_position=gap.transcription_position + i,
179
220
  source=matching_source,
180
221
  confidence=0.8,
181
- reason="SyllablesMatchHandler: Split word based on syllable match",
222
+ reason="Split word based on syllable match",
182
223
  reference_positions=reference_positions,
224
+ handler="SyllablesMatchHandler",
225
+ original_word_id=word_id,
226
+ corrected_word_ids=ref_word_ids_for_split,
183
227
  )
184
228
  )
185
229
 
186
230
  else:
187
231
  # One-to-one replacement
188
- for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
189
- if orig_word.lower() != ref_word.lower():
232
+ for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
233
+ orig_word = word_map[orig_word_id]
234
+ ref_word = word_map[ref_word_id]
235
+
236
+ if orig_word.text.lower() != ref_word.text.lower():
190
237
  corrections.append(
191
238
  WordOperations.create_word_replacement_correction(
192
- original_word=orig_word,
193
- corrected_word=ref_word_original,
239
+ original_word=orig_word.text,
240
+ corrected_word=ref_word.text,
194
241
  original_position=gap.transcription_position + i,
195
242
  source=matching_source,
196
243
  confidence=0.8,
197
- reason=f"SyllablesMatchHandler: Source '{matching_source}' had matching syllable count",
244
+ reason=f"Source '{matching_source}' had matching syllable count",
198
245
  reference_positions=reference_positions,
246
+ handler="SyllablesMatchHandler",
247
+ original_word_id=orig_word_id,
248
+ corrected_word_id=ref_word_id,
199
249
  )
200
250
  )
201
251
 
@@ -13,49 +13,66 @@ class WordCountMatchHandler(GapCorrectionHandler):
13
13
  super().__init__(logger)
14
14
  self.logger = logger or logging.getLogger(__name__)
15
15
 
16
- def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
16
+ def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
17
17
  # Must have reference words
18
- if not gap.reference_words:
19
- self.logger.debug("No reference words available.")
18
+ if not gap.reference_word_ids:
19
+ self.logger.debug("No reference word IDs available.")
20
20
  return False, {}
21
21
 
22
- ref_words_lists = list(gap.reference_words.values())
22
+ if not self._validate_data(data):
23
+ return False, {}
24
+
25
+ ref_word_lists = list(gap.reference_word_ids.values())
23
26
 
24
27
  # All sources must have same number of words as gap
25
- if not all(len(words) == gap.length for words in ref_words_lists):
28
+ if not all(len(words) == gap.length for words in ref_word_lists):
26
29
  self.logger.debug("Not all sources have the same number of words as the gap.")
27
30
  return False, {}
28
31
 
29
32
  # If we have multiple sources, they must all agree
30
- if len(ref_words_lists) > 1 and not all(words == ref_words_lists[0] for words in ref_words_lists[1:]):
33
+ if len(ref_word_lists) > 1 and not all(words == ref_word_lists[0] for words in ref_word_lists[1:]):
31
34
  self.logger.debug("Not all sources agree on the words.")
32
35
  return False, {}
33
36
 
34
37
  self.logger.debug("All sources agree and have matching word counts.")
35
- return True, {}
38
+ return True, {"word_map": data["word_map"]}
36
39
 
37
40
  def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
41
+ if not self._validate_data(data):
42
+ return []
43
+
38
44
  corrections = []
39
- # Get both clean and original reference words from first source
40
- source = list(gap.reference_words.keys())[0]
41
- reference_words = gap.reference_words[source]
42
- reference_words_original = gap.reference_words_original[source]
43
- sources = ", ".join(gap.reference_words.keys())
45
+ word_map = data["word_map"]
46
+ source = list(gap.reference_word_ids.keys())[0]
47
+ reference_word_ids = gap.reference_word_ids[source]
48
+ sources = ", ".join(gap.reference_word_ids.keys())
44
49
 
45
- # Use the centralized method to calculate reference positions for all sources
46
50
  reference_positions = WordOperations.calculate_reference_positions(gap)
47
51
 
48
- # Since we know all reference sources agree, we can correct all words in the gap
49
- for i, (orig_word, ref_word, ref_word_original) in enumerate(zip(gap.words, reference_words, reference_words_original)):
50
- if orig_word.lower() != ref_word.lower():
52
+ for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
53
+ # Get the actual words from the word map
54
+ if orig_word_id not in word_map:
55
+ self.logger.error(f"Original word ID {orig_word_id} not found in word_map")
56
+ continue
57
+ orig_word = word_map[orig_word_id]
58
+
59
+ if ref_word_id not in word_map:
60
+ self.logger.error(f"Reference word ID {ref_word_id} not found in word_map")
61
+ continue
62
+ ref_word = word_map[ref_word_id]
63
+
64
+ if orig_word.text.lower() != ref_word.text.lower():
51
65
  correction = WordOperations.create_word_replacement_correction(
52
- original_word=orig_word,
53
- corrected_word=ref_word_original,
66
+ original_word=orig_word.text,
67
+ corrected_word=ref_word.text,
54
68
  original_position=gap.transcription_position + i,
55
69
  source=sources,
56
70
  confidence=1.0,
57
- reason="WordCountMatchHandler: Reference sources had same word count as gap",
71
+ reason="Reference sources had same word count as gap",
58
72
  reference_positions=reference_positions,
73
+ handler="WordCountMatchHandler",
74
+ original_word_id=orig_word_id,
75
+ corrected_word_id=ref_word_id, # Use the reference word's ID
59
76
  )
60
77
  corrections.append(correction)
61
78
  self.logger.debug(f"Correction made: {correction}")