lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/cli/cli_main.py +7 -0
- lyrics_transcriber/core/config.py +1 -0
- lyrics_transcriber/core/controller.py +30 -52
- lyrics_transcriber/correction/anchor_sequence.py +325 -150
- lyrics_transcriber/correction/corrector.py +224 -107
- lyrics_transcriber/correction/handlers/base.py +28 -10
- lyrics_transcriber/correction/handlers/extend_anchor.py +47 -24
- lyrics_transcriber/correction/handlers/levenshtein.py +75 -33
- lyrics_transcriber/correction/handlers/llm.py +290 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +81 -36
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +46 -26
- lyrics_transcriber/correction/handlers/repeat.py +28 -11
- lyrics_transcriber/correction/handlers/sound_alike.py +68 -32
- lyrics_transcriber/correction/handlers/syllables_match.py +80 -30
- lyrics_transcriber/correction/handlers/word_count_match.py +36 -19
- lyrics_transcriber/correction/handlers/word_operations.py +68 -22
- lyrics_transcriber/correction/text_utils.py +3 -7
- lyrics_transcriber/frontend/.yarn/install-state.gz +0 -0
- lyrics_transcriber/frontend/.yarn/releases/yarn-4.6.0.cjs +934 -0
- lyrics_transcriber/frontend/.yarnrc.yml +3 -0
- lyrics_transcriber/frontend/dist/assets/{index-DKnNJHRK.js → index-coH8y7gV.js} +16284 -9032
- lyrics_transcriber/frontend/dist/assets/index-coH8y7gV.js.map +1 -0
- lyrics_transcriber/frontend/dist/index.html +1 -1
- lyrics_transcriber/frontend/package.json +6 -2
- lyrics_transcriber/frontend/src/App.tsx +18 -2
- lyrics_transcriber/frontend/src/api.ts +103 -6
- lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +7 -6
- lyrics_transcriber/frontend/src/components/DetailsModal.tsx +86 -59
- lyrics_transcriber/frontend/src/components/EditModal.tsx +93 -43
- lyrics_transcriber/frontend/src/components/FileUpload.tsx +2 -2
- lyrics_transcriber/frontend/src/components/Header.tsx +251 -0
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +303 -265
- lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +117 -0
- lyrics_transcriber/frontend/src/components/ReferenceView.tsx +125 -40
- lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +129 -115
- lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +59 -78
- lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +40 -16
- lyrics_transcriber/frontend/src/components/WordEditControls.tsx +4 -10
- lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +137 -68
- lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +1 -1
- lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +85 -115
- lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
- lyrics_transcriber/frontend/src/components/shared/types.ts +15 -7
- lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +35 -0
- lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
- lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +7 -7
- lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +121 -0
- lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
- lyrics_transcriber/frontend/src/types.js +2 -0
- lyrics_transcriber/frontend/src/types.ts +70 -49
- lyrics_transcriber/frontend/src/validation.ts +132 -0
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
- lyrics_transcriber/frontend/yarn.lock +3752 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +75 -12
- lyrics_transcriber/lyrics/file_provider.py +6 -5
- lyrics_transcriber/lyrics/genius.py +5 -2
- lyrics_transcriber/lyrics/spotify.py +58 -21
- lyrics_transcriber/output/ass/config.py +16 -5
- lyrics_transcriber/output/cdg.py +8 -8
- lyrics_transcriber/output/generator.py +29 -14
- lyrics_transcriber/output/plain_text.py +15 -10
- lyrics_transcriber/output/segment_resizer.py +16 -3
- lyrics_transcriber/output/subtitles.py +56 -2
- lyrics_transcriber/output/video.py +107 -1
- lyrics_transcriber/review/__init__.py +0 -1
- lyrics_transcriber/review/server.py +337 -164
- lyrics_transcriber/transcribers/audioshake.py +3 -0
- lyrics_transcriber/transcribers/base_transcriber.py +11 -3
- lyrics_transcriber/transcribers/whisper.py +11 -1
- lyrics_transcriber/types.py +151 -105
- lyrics_transcriber/utils/word_utils.py +27 -0
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/METADATA +3 -1
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/RECORD +76 -63
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/WHEEL +1 -1
- lyrics_transcriber/frontend/dist/assets/index-DKnNJHRK.js.map +0 -1
- lyrics_transcriber/frontend/package-lock.json +0 -4260
- lyrics_transcriber/frontend/src/components/shared/utils/initializeDataWithIds.tsx +0 -202
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/entry_points.txt +0 -0
@@ -11,53 +11,73 @@ class RelaxedWordCountMatchHandler(GapCorrectionHandler):
|
|
11
11
|
|
12
12
|
def __init__(self, logger: Optional[logging.Logger] = None):
|
13
13
|
super().__init__(logger)
|
14
|
-
self.logger = logger
|
14
|
+
self.logger = logger or logging.getLogger(__name__)
|
15
15
|
|
16
|
-
def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
|
16
|
+
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
|
17
17
|
# Must have reference words
|
18
|
-
if not gap.
|
19
|
-
self.logger.debug("No reference
|
18
|
+
if not gap.reference_word_ids:
|
19
|
+
self.logger.debug("No reference word IDs available.")
|
20
|
+
return False, {}
|
21
|
+
|
22
|
+
if not self._validate_data(data):
|
20
23
|
return False, {}
|
21
24
|
|
22
25
|
# Check if any source has matching word count
|
23
|
-
for source,
|
24
|
-
if len(
|
26
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
27
|
+
if len(ref_word_ids) == gap.length:
|
25
28
|
self.logger.debug(f"Source '{source}' has matching word count.")
|
26
|
-
return True, {
|
29
|
+
return True, {
|
30
|
+
"matching_source": source,
|
31
|
+
"reference_word_ids": ref_word_ids,
|
32
|
+
"word_map": data["word_map"],
|
33
|
+
"anchor_sequences": data.get("anchor_sequences", []),
|
34
|
+
}
|
27
35
|
|
28
36
|
self.logger.debug("No source with matching word count found.")
|
29
37
|
return False, {}
|
30
38
|
|
31
39
|
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
|
32
|
-
|
40
|
+
"""Handle the gap using word count matching."""
|
41
|
+
if not self._validate_data(data):
|
42
|
+
return []
|
33
43
|
|
34
|
-
|
35
|
-
matching_source =
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
if len(words) == gap.length:
|
40
|
-
matching_source = source
|
41
|
-
reference_words = words
|
42
|
-
reference_words_original = gap.reference_words_original[source]
|
43
|
-
self.logger.debug(f"Using source '{source}' for corrections.")
|
44
|
-
break
|
44
|
+
corrections = []
|
45
|
+
matching_source = data["matching_source"]
|
46
|
+
reference_word_ids = data["reference_word_ids"]
|
47
|
+
word_map = data["word_map"]
|
48
|
+
anchor_sequences = data.get("anchor_sequences", [])
|
45
49
|
|
46
|
-
# Use the centralized method to calculate reference positions
|
47
|
-
reference_positions = WordOperations.calculate_reference_positions(
|
50
|
+
# Use the centralized method to calculate reference positions
|
51
|
+
reference_positions = WordOperations.calculate_reference_positions(
|
52
|
+
gap, sources=[matching_source], anchor_sequences=anchor_sequences
|
53
|
+
)
|
48
54
|
self.logger.debug(f"Calculated reference positions: {reference_positions}")
|
49
55
|
|
50
56
|
# Since we found a source with matching word count, we can correct using that source
|
51
|
-
for i, (
|
52
|
-
|
57
|
+
for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
|
58
|
+
# Get the actual words from the word map
|
59
|
+
if orig_word_id not in word_map:
|
60
|
+
self.logger.error(f"Original word ID {orig_word_id} not found in word_map")
|
61
|
+
continue
|
62
|
+
orig_word = word_map[orig_word_id]
|
63
|
+
|
64
|
+
if ref_word_id not in word_map:
|
65
|
+
self.logger.error(f"Reference word ID {ref_word_id} not found in word_map")
|
66
|
+
continue
|
67
|
+
ref_word = word_map[ref_word_id]
|
68
|
+
|
69
|
+
if orig_word.text.lower() != ref_word.text.lower():
|
53
70
|
correction = WordOperations.create_word_replacement_correction(
|
54
|
-
original_word=orig_word,
|
55
|
-
corrected_word=
|
71
|
+
original_word=orig_word.text,
|
72
|
+
corrected_word=ref_word.text,
|
56
73
|
original_position=gap.transcription_position + i,
|
57
74
|
source=matching_source,
|
58
75
|
confidence=1.0,
|
59
|
-
reason=f"
|
76
|
+
reason=f"Source '{matching_source}' had matching word count",
|
60
77
|
reference_positions=reference_positions,
|
78
|
+
handler="RelaxedWordCountMatchHandler",
|
79
|
+
original_word_id=orig_word_id,
|
80
|
+
corrected_word_id=ref_word_id, # Use the reference word's ID
|
61
81
|
)
|
62
82
|
corrections.append(correction)
|
63
83
|
self.logger.debug(f"Correction made: {correction}")
|
@@ -9,13 +9,17 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
|
|
9
9
|
"""Handler that applies corrections that were previously made by other handlers."""
|
10
10
|
|
11
11
|
def __init__(self, logger: Optional[logging.Logger] = None, confidence_threshold: float = 0.7):
|
12
|
+
super().__init__(logger)
|
12
13
|
self.logger = logger or logging.getLogger(__name__)
|
13
14
|
self.confidence_threshold = confidence_threshold
|
14
15
|
self.previous_corrections: List[WordCorrection] = []
|
15
16
|
|
16
|
-
def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
|
17
|
+
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
|
17
18
|
"""Check if any words in the gap match previous corrections."""
|
18
|
-
|
19
|
+
if not self._validate_data(data):
|
20
|
+
return False, {}
|
21
|
+
|
22
|
+
return bool(self.previous_corrections), {"word_map": data["word_map"], "anchor_sequences": data.get("anchor_sequences", [])}
|
19
23
|
|
20
24
|
def set_previous_corrections(self, corrections: List[WordCorrection]) -> None:
|
21
25
|
"""Store corrections from previous handlers to use as reference."""
|
@@ -23,10 +27,14 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
|
|
23
27
|
|
24
28
|
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
|
25
29
|
"""Apply previous corrections to matching words in the current gap."""
|
30
|
+
if not self._validate_data(data):
|
31
|
+
return []
|
32
|
+
|
33
|
+
word_map = data["word_map"]
|
26
34
|
corrections = []
|
27
35
|
|
28
36
|
# Use the centralized method to calculate reference positions
|
29
|
-
reference_positions = WordOperations.calculate_reference_positions(gap)
|
37
|
+
reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
|
30
38
|
|
31
39
|
# Build a map of original words to their corrections
|
32
40
|
correction_map: Dict[str, List[WordCorrection]] = {}
|
@@ -35,8 +43,14 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
|
|
35
43
|
correction_map.setdefault(corr.original_word.lower(), []).append(corr)
|
36
44
|
|
37
45
|
# Check each word in the gap
|
38
|
-
for i,
|
39
|
-
|
46
|
+
for i, word_id in enumerate(gap.transcribed_word_ids):
|
47
|
+
if word_id not in word_map:
|
48
|
+
self.logger.error(f"Word ID {word_id} not found in word map")
|
49
|
+
continue
|
50
|
+
|
51
|
+
word = word_map[word_id]
|
52
|
+
word_lower = word.text.lower()
|
53
|
+
|
40
54
|
if word_lower in correction_map:
|
41
55
|
# Get the most common correction for this word
|
42
56
|
prev_corrections = correction_map[word_lower]
|
@@ -46,13 +60,13 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
|
|
46
60
|
)
|
47
61
|
|
48
62
|
self.logger.debug(
|
49
|
-
f"Applying previous correction: {word} -> {best_correction.corrected_word} "
|
63
|
+
f"Applying previous correction: {word.text} -> {best_correction.corrected_word} "
|
50
64
|
f"(confidence: {best_correction.confidence:.2f})"
|
51
65
|
)
|
52
66
|
|
53
67
|
corrections.append(
|
54
68
|
WordCorrection(
|
55
|
-
original_word=word,
|
69
|
+
original_word=word.text,
|
56
70
|
corrected_word=best_correction.corrected_word,
|
57
71
|
segment_index=0,
|
58
72
|
original_position=gap.transcription_position + i,
|
@@ -61,10 +75,13 @@ class RepeatCorrectionHandler(GapCorrectionHandler):
|
|
61
75
|
reason=f"RepeatCorrectionHandler: Matches previous correction",
|
62
76
|
alternatives={best_correction.corrected_word: 1},
|
63
77
|
is_deletion=best_correction.is_deletion,
|
64
|
-
reference_positions=reference_positions,
|
65
|
-
length=best_correction.length,
|
66
|
-
split_index=best_correction.split_index,
|
67
|
-
split_total=best_correction.split_total,
|
78
|
+
reference_positions=reference_positions,
|
79
|
+
length=best_correction.length,
|
80
|
+
split_index=best_correction.split_index,
|
81
|
+
split_total=best_correction.split_total,
|
82
|
+
handler="RepeatCorrectionHandler",
|
83
|
+
word_id=word_id,
|
84
|
+
corrected_word_id=best_correction.corrected_word_id,
|
68
85
|
)
|
69
86
|
)
|
70
87
|
|
@@ -36,54 +36,86 @@ class SoundAlikeHandler(GapCorrectionHandler):
|
|
36
36
|
self.logger = logger or logging.getLogger(__name__)
|
37
37
|
self.similarity_threshold = similarity_threshold
|
38
38
|
|
39
|
-
def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
|
39
|
+
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
|
40
|
+
"""Check if any gap word has a metaphone match with any reference word."""
|
41
|
+
if not self._validate_data(data):
|
42
|
+
return False, {}
|
43
|
+
|
44
|
+
word_map = data["word_map"]
|
45
|
+
|
40
46
|
# Must have reference words
|
41
|
-
if not gap.
|
47
|
+
if not gap.reference_word_ids:
|
42
48
|
self.logger.debug("No reference words available")
|
43
49
|
return False, {}
|
44
50
|
|
45
51
|
# Gap must have words
|
46
|
-
if not gap.
|
52
|
+
if not gap.transcribed_word_ids:
|
47
53
|
self.logger.debug("No gap words available")
|
48
54
|
return False, {}
|
49
55
|
|
50
56
|
# Check if any gap word has a metaphone match with any reference word
|
51
|
-
for
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
57
|
+
for word_id in gap.transcribed_word_ids:
|
58
|
+
if word_id not in word_map:
|
59
|
+
continue
|
60
|
+
word = word_map[word_id]
|
61
|
+
word_codes = doublemetaphone(word.text)
|
62
|
+
self.logger.debug(f"Gap word '{word.text}' has metaphone codes: {word_codes}")
|
63
|
+
|
64
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
65
|
+
for ref_word_id in ref_word_ids:
|
66
|
+
if ref_word_id not in word_map:
|
67
|
+
continue
|
68
|
+
ref_word = word_map[ref_word_id]
|
69
|
+
ref_codes = doublemetaphone(ref_word.text)
|
70
|
+
self.logger.debug(f"Reference word '{ref_word.text}' has metaphone codes: {ref_codes}")
|
58
71
|
if self._codes_match(word_codes, ref_codes):
|
59
|
-
self.logger.debug(f"Found metaphone match between '{word}' and '{ref_word}'")
|
72
|
+
self.logger.debug(f"Found metaphone match between '{word.text}' and '{ref_word.text}'")
|
60
73
|
return True, {}
|
74
|
+
|
61
75
|
self.logger.debug("No metaphone matches found")
|
62
76
|
return False, {}
|
63
77
|
|
64
78
|
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
|
79
|
+
"""Process the gap and create corrections for sound-alike matches."""
|
80
|
+
if not self._validate_data(data):
|
81
|
+
return []
|
82
|
+
|
83
|
+
word_map = data["word_map"]
|
65
84
|
corrections = []
|
66
85
|
|
67
|
-
# Use the centralized method to calculate reference positions
|
68
|
-
reference_positions = WordOperations.calculate_reference_positions(gap)
|
86
|
+
# Use the centralized method to calculate reference positions
|
87
|
+
reference_positions = WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", []))
|
69
88
|
|
70
89
|
# For each word in the gap
|
71
|
-
for i,
|
72
|
-
|
73
|
-
|
90
|
+
for i, word_id in enumerate(gap.transcribed_word_ids):
|
91
|
+
if word_id not in word_map:
|
92
|
+
continue
|
93
|
+
word = word_map[word_id]
|
94
|
+
word_codes = doublemetaphone(word.text)
|
95
|
+
self.logger.debug(f"Processing '{word.text}' (codes: {word_codes})")
|
74
96
|
|
75
97
|
# Skip if word exactly matches any reference
|
76
|
-
exact_match =
|
98
|
+
exact_match = False
|
99
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
100
|
+
if i < len(ref_word_ids):
|
101
|
+
ref_word_id = ref_word_ids[i]
|
102
|
+
if ref_word_id in word_map:
|
103
|
+
ref_word = word_map[ref_word_id]
|
104
|
+
if word.text.lower() == ref_word.text.lower():
|
105
|
+
exact_match = True
|
106
|
+
break
|
77
107
|
if exact_match:
|
78
108
|
continue
|
79
109
|
|
80
110
|
# Find sound-alike matches in references
|
81
|
-
matches: Dict[str, Tuple[List[str], float]] = {}
|
111
|
+
matches: Dict[str, Tuple[List[str], float, str]] = {} # Added word_id to tuple
|
82
112
|
|
83
|
-
for source,
|
84
|
-
|
85
|
-
|
86
|
-
|
113
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
114
|
+
for j, ref_word_id in enumerate(ref_word_ids):
|
115
|
+
if ref_word_id not in word_map:
|
116
|
+
continue
|
117
|
+
ref_word = word_map[ref_word_id]
|
118
|
+
ref_codes = doublemetaphone(ref_word.text)
|
87
119
|
|
88
120
|
match_confidence = self._get_match_confidence(word_codes, ref_codes)
|
89
121
|
if match_confidence >= self.similarity_threshold:
|
@@ -94,22 +126,23 @@ class SoundAlikeHandler(GapCorrectionHandler):
|
|
94
126
|
adjusted_confidence = match_confidence * position_multiplier
|
95
127
|
|
96
128
|
if adjusted_confidence >= self.similarity_threshold:
|
97
|
-
if
|
98
|
-
matches[
|
99
|
-
matches[
|
129
|
+
if ref_word.text not in matches:
|
130
|
+
matches[ref_word.text] = ([], adjusted_confidence, ref_word_id)
|
131
|
+
matches[ref_word.text][0].append(source)
|
100
132
|
|
101
133
|
# Create correction for best match if any found
|
102
134
|
if matches:
|
103
|
-
best_match, (sources, base_confidence) = max(matches.items(), key=lambda x: (len(x[1][0]), x[1][1]))
|
135
|
+
best_match, (sources, base_confidence, ref_word_id) = max(matches.items(), key=lambda x: (len(x[1][0]), x[1][1]))
|
104
136
|
|
105
|
-
source_confidence = len(sources) / len(gap.
|
137
|
+
source_confidence = len(sources) / len(gap.reference_word_ids)
|
106
138
|
final_confidence = base_confidence * source_confidence
|
107
139
|
|
108
|
-
self.logger.debug(f"Found match: {word} -> {best_match} (confidence: {final_confidence:.2f}, sources: {sources})")
|
140
|
+
self.logger.debug(f"Found match: {word.text} -> {best_match} " f"(confidence: {final_confidence:.2f}, sources: {sources})")
|
141
|
+
|
109
142
|
corrections.append(
|
110
143
|
WordCorrection(
|
111
|
-
original_word=word,
|
112
|
-
corrected_word=best_match,
|
144
|
+
original_word=word.text,
|
145
|
+
corrected_word=best_match,
|
113
146
|
segment_index=0,
|
114
147
|
original_position=gap.transcription_position + i,
|
115
148
|
confidence=final_confidence,
|
@@ -117,8 +150,11 @@ class SoundAlikeHandler(GapCorrectionHandler):
|
|
117
150
|
reason=f"SoundAlikeHandler: Phonetic match ({final_confidence:.2f} confidence)",
|
118
151
|
alternatives={k: len(v[0]) for k, v in matches.items()},
|
119
152
|
is_deletion=False,
|
120
|
-
reference_positions=reference_positions,
|
121
|
-
length=1,
|
153
|
+
reference_positions=reference_positions,
|
154
|
+
length=1,
|
155
|
+
handler="SoundAlikeHandler",
|
156
|
+
word_id=word_id,
|
157
|
+
corrected_word_id=ref_word_id,
|
122
158
|
)
|
123
159
|
)
|
124
160
|
|
@@ -102,18 +102,44 @@ class SyllablesMatchHandler(GapCorrectionHandler):
|
|
102
102
|
)
|
103
103
|
return [spacy_count, pyphen_count, nltk_count, syllables_count]
|
104
104
|
|
105
|
-
def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
|
105
|
+
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
|
106
106
|
# Must have reference words
|
107
|
-
if not gap.
|
108
|
-
self.logger.debug("No reference
|
107
|
+
if not gap.reference_word_ids:
|
108
|
+
self.logger.debug("No reference word IDs available")
|
109
109
|
return False, {}
|
110
110
|
|
111
|
+
# Get word lookup map from data
|
112
|
+
if not data or "word_map" not in data:
|
113
|
+
self.logger.error("No word_map provided in data")
|
114
|
+
return False, {}
|
115
|
+
|
116
|
+
word_map = data["word_map"]
|
117
|
+
|
118
|
+
# Get actual words from word IDs
|
119
|
+
gap_words = []
|
120
|
+
for word_id in gap.transcribed_word_ids:
|
121
|
+
if word_id not in word_map:
|
122
|
+
self.logger.error(f"Word ID {word_id} not found in word_map")
|
123
|
+
return False, {}
|
124
|
+
gap_words.append(word_map[word_id].text)
|
125
|
+
|
111
126
|
# Get syllable counts for gap text using different methods
|
112
|
-
gap_syllables = self._count_syllables(
|
127
|
+
gap_syllables = self._count_syllables(gap_words)
|
113
128
|
|
114
129
|
# Check if any reference source has matching syllable count with any method
|
115
|
-
for source,
|
116
|
-
|
130
|
+
for source, ref_word_ids in gap.reference_word_ids.items():
|
131
|
+
# Get reference words from word map
|
132
|
+
ref_words = []
|
133
|
+
for word_id in ref_word_ids:
|
134
|
+
if word_id not in word_map:
|
135
|
+
self.logger.error(f"Reference word ID {word_id} not found in word_map")
|
136
|
+
continue
|
137
|
+
ref_words.append(word_map[word_id].text)
|
138
|
+
|
139
|
+
if not ref_words:
|
140
|
+
continue
|
141
|
+
|
142
|
+
ref_syllables = self._count_syllables(ref_words)
|
117
143
|
|
118
144
|
# If any counting method matches between gap and reference, we can handle it
|
119
145
|
if any(gap_count == ref_count for gap_count in gap_syllables for ref_count in ref_syllables):
|
@@ -121,81 +147,105 @@ class SyllablesMatchHandler(GapCorrectionHandler):
|
|
121
147
|
return True, {
|
122
148
|
"gap_syllables": gap_syllables,
|
123
149
|
"matching_source": source,
|
124
|
-
"
|
125
|
-
"
|
150
|
+
"reference_word_ids": ref_word_ids,
|
151
|
+
"word_map": word_map,
|
126
152
|
}
|
127
153
|
|
128
154
|
self.logger.debug("No reference source had matching syllable count")
|
129
155
|
return False, {}
|
130
156
|
|
131
|
-
def handle(self, gap: GapSequence, data: Dict[str, Any]) -> List[WordCorrection]:
|
157
|
+
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
|
158
|
+
"""Handle the gap using syllable matching."""
|
159
|
+
if not data:
|
160
|
+
can_handle, data = self.can_handle(gap)
|
161
|
+
if not can_handle:
|
162
|
+
return []
|
163
|
+
|
132
164
|
corrections = []
|
133
165
|
matching_source = data["matching_source"]
|
134
|
-
|
135
|
-
|
166
|
+
reference_word_ids = data["reference_word_ids"]
|
167
|
+
word_map = data["word_map"]
|
168
|
+
|
169
|
+
# Get the actual words from word IDs
|
170
|
+
gap_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids]
|
171
|
+
ref_words = [word_map[word_id].text for word_id in reference_word_ids]
|
136
172
|
|
137
173
|
# Use the centralized method to calculate reference positions
|
138
174
|
reference_positions = WordOperations.calculate_reference_positions(gap, [matching_source])
|
139
175
|
|
140
176
|
# Since we matched syllable counts for the entire gap, we should handle all words
|
141
|
-
if len(
|
177
|
+
if len(gap_words) > len(ref_words):
|
142
178
|
# Multiple transcribed words -> fewer reference words
|
143
179
|
# Try to distribute the reference words across the gap words
|
144
|
-
words_per_ref = len(
|
180
|
+
words_per_ref = len(gap_words) / len(ref_words)
|
145
181
|
|
146
|
-
for ref_idx,
|
182
|
+
for ref_idx, ref_word_id in enumerate(reference_word_ids):
|
147
183
|
start_idx = int(ref_idx * words_per_ref)
|
148
184
|
end_idx = int((ref_idx + 1) * words_per_ref)
|
149
185
|
|
150
186
|
# Get the group of words to combine
|
151
|
-
words_to_combine =
|
187
|
+
words_to_combine = gap_words[start_idx:end_idx]
|
188
|
+
word_ids_to_combine = gap.transcribed_word_ids[start_idx:end_idx]
|
152
189
|
corrections.extend(
|
153
190
|
WordOperations.create_word_combine_corrections(
|
154
191
|
original_words=words_to_combine,
|
155
|
-
reference_word=
|
192
|
+
reference_word=word_map[ref_word_id].text,
|
156
193
|
original_position=gap.transcription_position + start_idx,
|
157
194
|
source=matching_source,
|
158
195
|
confidence=0.8,
|
159
|
-
combine_reason="
|
160
|
-
delete_reason="
|
196
|
+
combine_reason="Words combined based on syllable match",
|
197
|
+
delete_reason="Word removed as part of syllable match combination",
|
161
198
|
reference_positions=reference_positions,
|
199
|
+
handler="SyllablesMatchHandler",
|
200
|
+
original_word_ids=word_ids_to_combine,
|
201
|
+
corrected_word_id=ref_word_id,
|
162
202
|
)
|
163
203
|
)
|
164
204
|
|
165
|
-
elif len(
|
205
|
+
elif len(gap_words) < len(ref_words):
|
166
206
|
# Single transcribed word -> multiple reference words
|
167
|
-
words_per_gap = len(
|
207
|
+
words_per_gap = len(ref_words) / len(gap_words)
|
168
208
|
|
169
|
-
for i,
|
209
|
+
for i, word_id in enumerate(gap.transcribed_word_ids):
|
170
210
|
start_idx = int(i * words_per_gap)
|
171
211
|
end_idx = int((i + 1) * words_per_gap)
|
172
|
-
|
212
|
+
ref_word_ids_for_split = reference_word_ids[start_idx:end_idx]
|
213
|
+
ref_words_for_split = [word_map[ref_id].text for ref_id in ref_word_ids_for_split]
|
173
214
|
|
174
215
|
corrections.extend(
|
175
216
|
WordOperations.create_word_split_corrections(
|
176
|
-
original_word=
|
177
|
-
reference_words=
|
217
|
+
original_word=word_map[word_id].text,
|
218
|
+
reference_words=ref_words_for_split,
|
178
219
|
original_position=gap.transcription_position + i,
|
179
220
|
source=matching_source,
|
180
221
|
confidence=0.8,
|
181
|
-
reason="
|
222
|
+
reason="Split word based on syllable match",
|
182
223
|
reference_positions=reference_positions,
|
224
|
+
handler="SyllablesMatchHandler",
|
225
|
+
original_word_id=word_id,
|
226
|
+
corrected_word_ids=ref_word_ids_for_split,
|
183
227
|
)
|
184
228
|
)
|
185
229
|
|
186
230
|
else:
|
187
231
|
# One-to-one replacement
|
188
|
-
for i, (
|
189
|
-
|
232
|
+
for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
|
233
|
+
orig_word = word_map[orig_word_id]
|
234
|
+
ref_word = word_map[ref_word_id]
|
235
|
+
|
236
|
+
if orig_word.text.lower() != ref_word.text.lower():
|
190
237
|
corrections.append(
|
191
238
|
WordOperations.create_word_replacement_correction(
|
192
|
-
original_word=orig_word,
|
193
|
-
corrected_word=
|
239
|
+
original_word=orig_word.text,
|
240
|
+
corrected_word=ref_word.text,
|
194
241
|
original_position=gap.transcription_position + i,
|
195
242
|
source=matching_source,
|
196
243
|
confidence=0.8,
|
197
|
-
reason=f"
|
244
|
+
reason=f"Source '{matching_source}' had matching syllable count",
|
198
245
|
reference_positions=reference_positions,
|
246
|
+
handler="SyllablesMatchHandler",
|
247
|
+
original_word_id=orig_word_id,
|
248
|
+
corrected_word_id=ref_word_id,
|
199
249
|
)
|
200
250
|
)
|
201
251
|
|
@@ -13,49 +13,66 @@ class WordCountMatchHandler(GapCorrectionHandler):
|
|
13
13
|
super().__init__(logger)
|
14
14
|
self.logger = logger or logging.getLogger(__name__)
|
15
15
|
|
16
|
-
def can_handle(self, gap: GapSequence) -> Tuple[bool, Dict[str, Any]]:
|
16
|
+
def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
|
17
17
|
# Must have reference words
|
18
|
-
if not gap.
|
19
|
-
self.logger.debug("No reference
|
18
|
+
if not gap.reference_word_ids:
|
19
|
+
self.logger.debug("No reference word IDs available.")
|
20
20
|
return False, {}
|
21
21
|
|
22
|
-
|
22
|
+
if not self._validate_data(data):
|
23
|
+
return False, {}
|
24
|
+
|
25
|
+
ref_word_lists = list(gap.reference_word_ids.values())
|
23
26
|
|
24
27
|
# All sources must have same number of words as gap
|
25
|
-
if not all(len(words) == gap.length for words in
|
28
|
+
if not all(len(words) == gap.length for words in ref_word_lists):
|
26
29
|
self.logger.debug("Not all sources have the same number of words as the gap.")
|
27
30
|
return False, {}
|
28
31
|
|
29
32
|
# If we have multiple sources, they must all agree
|
30
|
-
if len(
|
33
|
+
if len(ref_word_lists) > 1 and not all(words == ref_word_lists[0] for words in ref_word_lists[1:]):
|
31
34
|
self.logger.debug("Not all sources agree on the words.")
|
32
35
|
return False, {}
|
33
36
|
|
34
37
|
self.logger.debug("All sources agree and have matching word counts.")
|
35
|
-
return True, {}
|
38
|
+
return True, {"word_map": data["word_map"]}
|
36
39
|
|
37
40
|
def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
|
41
|
+
if not self._validate_data(data):
|
42
|
+
return []
|
43
|
+
|
38
44
|
corrections = []
|
39
|
-
|
40
|
-
source = list(gap.
|
41
|
-
|
42
|
-
|
43
|
-
sources = ", ".join(gap.reference_words.keys())
|
45
|
+
word_map = data["word_map"]
|
46
|
+
source = list(gap.reference_word_ids.keys())[0]
|
47
|
+
reference_word_ids = gap.reference_word_ids[source]
|
48
|
+
sources = ", ".join(gap.reference_word_ids.keys())
|
44
49
|
|
45
|
-
# Use the centralized method to calculate reference positions for all sources
|
46
50
|
reference_positions = WordOperations.calculate_reference_positions(gap)
|
47
51
|
|
48
|
-
|
49
|
-
|
50
|
-
if
|
52
|
+
for i, (orig_word_id, ref_word_id) in enumerate(zip(gap.transcribed_word_ids, reference_word_ids)):
|
53
|
+
# Get the actual words from the word map
|
54
|
+
if orig_word_id not in word_map:
|
55
|
+
self.logger.error(f"Original word ID {orig_word_id} not found in word_map")
|
56
|
+
continue
|
57
|
+
orig_word = word_map[orig_word_id]
|
58
|
+
|
59
|
+
if ref_word_id not in word_map:
|
60
|
+
self.logger.error(f"Reference word ID {ref_word_id} not found in word_map")
|
61
|
+
continue
|
62
|
+
ref_word = word_map[ref_word_id]
|
63
|
+
|
64
|
+
if orig_word.text.lower() != ref_word.text.lower():
|
51
65
|
correction = WordOperations.create_word_replacement_correction(
|
52
|
-
original_word=orig_word,
|
53
|
-
corrected_word=
|
66
|
+
original_word=orig_word.text,
|
67
|
+
corrected_word=ref_word.text,
|
54
68
|
original_position=gap.transcription_position + i,
|
55
69
|
source=sources,
|
56
70
|
confidence=1.0,
|
57
|
-
reason="
|
71
|
+
reason="Reference sources had same word count as gap",
|
58
72
|
reference_positions=reference_positions,
|
73
|
+
handler="WordCountMatchHandler",
|
74
|
+
original_word_id=orig_word_id,
|
75
|
+
corrected_word_id=ref_word_id, # Use the reference word's ID
|
59
76
|
)
|
60
77
|
corrections.append(correction)
|
61
78
|
self.logger.debug(f"Correction made: {correction}")
|