lyrics-transcriber 0.40.0__py3-none-any.whl → 0.42.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/cli/cli_main.py +7 -0
- lyrics_transcriber/core/config.py +1 -0
- lyrics_transcriber/core/controller.py +30 -52
- lyrics_transcriber/correction/anchor_sequence.py +325 -150
- lyrics_transcriber/correction/corrector.py +224 -107
- lyrics_transcriber/correction/handlers/base.py +28 -10
- lyrics_transcriber/correction/handlers/extend_anchor.py +47 -24
- lyrics_transcriber/correction/handlers/levenshtein.py +75 -33
- lyrics_transcriber/correction/handlers/llm.py +290 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +81 -36
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +46 -26
- lyrics_transcriber/correction/handlers/repeat.py +28 -11
- lyrics_transcriber/correction/handlers/sound_alike.py +68 -32
- lyrics_transcriber/correction/handlers/syllables_match.py +80 -30
- lyrics_transcriber/correction/handlers/word_count_match.py +36 -19
- lyrics_transcriber/correction/handlers/word_operations.py +68 -22
- lyrics_transcriber/correction/text_utils.py +3 -7
- lyrics_transcriber/frontend/.yarn/install-state.gz +0 -0
- lyrics_transcriber/frontend/.yarn/releases/yarn-4.6.0.cjs +934 -0
- lyrics_transcriber/frontend/.yarnrc.yml +3 -0
- lyrics_transcriber/frontend/dist/assets/{index-DKnNJHRK.js → index-coH8y7gV.js} +16284 -9032
- lyrics_transcriber/frontend/dist/assets/index-coH8y7gV.js.map +1 -0
- lyrics_transcriber/frontend/dist/index.html +1 -1
- lyrics_transcriber/frontend/package.json +6 -2
- lyrics_transcriber/frontend/src/App.tsx +18 -2
- lyrics_transcriber/frontend/src/api.ts +103 -6
- lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +7 -6
- lyrics_transcriber/frontend/src/components/DetailsModal.tsx +86 -59
- lyrics_transcriber/frontend/src/components/EditModal.tsx +93 -43
- lyrics_transcriber/frontend/src/components/FileUpload.tsx +2 -2
- lyrics_transcriber/frontend/src/components/Header.tsx +251 -0
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +303 -265
- lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +117 -0
- lyrics_transcriber/frontend/src/components/ReferenceView.tsx +125 -40
- lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +129 -115
- lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +59 -78
- lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +40 -16
- lyrics_transcriber/frontend/src/components/WordEditControls.tsx +4 -10
- lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +137 -68
- lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +1 -1
- lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +85 -115
- lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
- lyrics_transcriber/frontend/src/components/shared/types.ts +15 -7
- lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +35 -0
- lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
- lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +7 -7
- lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +121 -0
- lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
- lyrics_transcriber/frontend/src/types.js +2 -0
- lyrics_transcriber/frontend/src/types.ts +70 -49
- lyrics_transcriber/frontend/src/validation.ts +132 -0
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
- lyrics_transcriber/frontend/yarn.lock +3752 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +75 -12
- lyrics_transcriber/lyrics/file_provider.py +6 -5
- lyrics_transcriber/lyrics/genius.py +5 -2
- lyrics_transcriber/lyrics/spotify.py +58 -21
- lyrics_transcriber/output/ass/config.py +16 -5
- lyrics_transcriber/output/cdg.py +8 -8
- lyrics_transcriber/output/generator.py +29 -14
- lyrics_transcriber/output/plain_text.py +15 -10
- lyrics_transcriber/output/segment_resizer.py +16 -3
- lyrics_transcriber/output/subtitles.py +56 -2
- lyrics_transcriber/output/video.py +107 -1
- lyrics_transcriber/review/__init__.py +0 -1
- lyrics_transcriber/review/server.py +337 -164
- lyrics_transcriber/transcribers/audioshake.py +3 -0
- lyrics_transcriber/transcribers/base_transcriber.py +11 -3
- lyrics_transcriber/transcribers/whisper.py +11 -1
- lyrics_transcriber/types.py +151 -105
- lyrics_transcriber/utils/word_utils.py +27 -0
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/METADATA +3 -1
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/RECORD +76 -63
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/WHEEL +1 -1
- lyrics_transcriber/frontend/dist/assets/index-DKnNJHRK.js.map +0 -1
- lyrics_transcriber/frontend/package-lock.json +0 -4260
- lyrics_transcriber/frontend/src/components/shared/utils/initializeDataWithIds.tsx +0 -202
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.40.0.dist-info → lyrics_transcriber-0.42.0.dist-info}/entry_points.txt +0 -0
lyrics_transcriber/correction/anchor_sequence.py

@@ -8,9 +8,10 @@ from pathlib import Path
 import json
 import hashlib
 
-from lyrics_transcriber.types import PhraseScore, AnchorSequence, GapSequence, ScoredAnchor
+from lyrics_transcriber.types import LyricsData, PhraseScore, AnchorSequence, GapSequence, ScoredAnchor, TranscriptionResult, Word
 from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer
 from lyrics_transcriber.correction.text_utils import clean_text
+from lyrics_transcriber.utils.word_utils import WordUtils
 
 
 class AnchorSequenceFinder:
@@ -93,38 +94,81 @@ class AnchorSequenceFinder:
             return anchor
         return None
 
-    def _get_cache_key(self, transcribed: str, references: Dict[str,
+    def _get_cache_key(self, transcribed: str, references: Dict[str, LyricsData], transcription_result: TranscriptionResult) -> str:
         """Generate a unique cache key for the input combination."""
-        # Create a string that uniquely identifies the inputs
-
+        # Create a string that uniquely identifies the inputs, but only using stable content
+        # Use only the text content, not IDs or other potentially varying metadata
+        ref_texts = []
+        for source, lyrics in sorted(references.items()):
+            text = " ".join(w.text for s in lyrics.segments for w in s.words)
+            ref_texts.append(f"{source}:{text}")
+
+        input_str = f"{transcribed}|" f"{','.join(ref_texts)}"
         return hashlib.md5(input_str.encode()).hexdigest()
 
-    def _save_to_cache(self, cache_path: Path,
+    def _save_to_cache(self, cache_path: Path, anchors: List[ScoredAnchor]) -> None:
         """Save results to cache file."""
         self.logger.debug(f"Saving to cache: {cache_path}")
+        # Convert to dictionary format that matches the expected loading format
+        cache_data = [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in anchors]
         with open(cache_path, "w") as f:
-            json.dump(
+            json.dump(cache_data, f, indent=2)
 
-    def _load_from_cache(self, cache_path: Path) -> Optional[
+    def _load_from_cache(self, cache_path: Path) -> Optional[List[ScoredAnchor]]:
         """Load results from cache if available."""
         try:
             self.logger.debug(f"Attempting to load from cache: {cache_path}")
             with open(cache_path, "r") as f:
-
-
-                self.logger.
+                cached_data = json.load(f)
+
+            self.logger.info("Loading anchors from cache")
+            try:
+                # Log the raw dictionary data instead of the object
+                # if cached_data:
+                #     self.logger.debug(f"Cached data structure: {json.dumps(cached_data[0], indent=2)}")
+
+                # Convert cached data back to ScoredAnchor objects
+                anchors = []
+                for data in cached_data:
+                    if "anchor" not in data or "phrase_score" not in data:
+                        raise KeyError("Missing required keys: anchor, phrase_score")
+
+                    anchor = AnchorSequence.from_dict(data["anchor"])
+                    phrase_score = PhraseScore.from_dict(data["phrase_score"])
+                    anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+
+                return anchors
+
+            except KeyError as e:
+                self.logger.error(f"Cache format mismatch. Missing key: {e}")
+                # Log the raw data for debugging
+                if cached_data:
+                    self.logger.error(f"First cached anchor data: {json.dumps(cached_data[0], indent=2)}")
+                self.logger.error("Expected keys: anchor, phrase_score")
+                self.logger.warning(f"Cache format mismatch: {e}. Recomputing.")
+                return None
+
+        except (FileNotFoundError, json.JSONDecodeError) as e:
+            self.logger.debug(f"Cache miss or invalid cache file: {e}")
+            return None
+        except Exception as e:
+            self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
             return None
 
     def _process_ngram_length(
-        self,
+        self,
+        n: int,
+        trans_words: List[str],
+        all_words: List[Word],
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        min_sources: int,
     ) -> List[AnchorSequence]:
         """Process a single n-gram length to find matching sequences."""
         candidate_anchors = []
         used_positions = {source: set() for source in ref_texts_clean.keys()}
         used_trans_positions = set()
 
-        # Try each position in the transcribed text multiple times
-        # to catch repeated phrases
        found_new_match = True
         while found_new_match:
             found_new_match = False
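The `_get_cache_key` rewrite above is the substantive change in this hunk: the key is now derived only from text content, so per-run word IDs no longer invalidate the cache. A minimal sketch of that idea, using simplified stand-in types (`FakeWord`/`FakeLyrics` are hypothetical, not the package's real `Word`/`LyricsData`):

```python
import hashlib
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class FakeWord:
    text: str
    id: str = ""  # regenerated every run; deliberately excluded from the key


@dataclass
class FakeLyrics:
    words: List[FakeWord]


def stable_cache_key(transcribed: str, references: Dict[str, FakeLyrics]) -> str:
    """Hash only the text content, never the per-run word IDs."""
    ref_texts = []
    for source, lyrics in sorted(references.items()):  # sorted => source-order independent
        text = " ".join(w.text for w in lyrics.words)
        ref_texts.append(f"{source}:{text}")
    return hashlib.md5(f"{transcribed}|{','.join(ref_texts)}".encode()).hexdigest()


# Same text with different word IDs yields the same key, so the cache still hits:
run1 = {"genius": FakeLyrics([FakeWord("hello", id="a1"), FakeWord("world", id="b2")])}
run2 = {"genius": FakeLyrics([FakeWord("hello", id="x9"), FakeWord("world", id="y8")])}
assert stable_cache_key("hello world", run1) == stable_cache_key("hello world", run2)
```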
@@ -137,56 +181,108 @@ class AnchorSequenceFinder:
                 if trans_pos in used_trans_positions:
                     continue
 
+                # Get the actual words from the transcription at this position
+                actual_words = [w.text.lower().strip('.,?!"\n') for w in all_words[trans_pos : trans_pos + n]]
+                ngram_words = [w.lower() for w in ngram]
+
+                if actual_words != ngram_words:
+                    self.logger.error(f"Mismatch between ngram and actual words at position {trans_pos}:")
+                    self.logger.error(f"Ngram words: {ngram_words}")
+                    self.logger.error(f"Actual words: {actual_words}")
+                    self.logger.error(f"Full trans_words: {trans_words}")
+                    self.logger.error(f"Full all_words: {[w.text for w in all_words]}")
+                    raise AssertionError(
+                        f"Ngram words don't match actual words at position {trans_pos}. "
+                        f"This should never happen as trans_words should be derived from all_words."
+                    )
+
                 matches = self._find_matching_sources(ngram, ref_texts_clean, n)
                 if len(matches) >= min_sources:
+                    # Get Word IDs for transcribed words
+                    transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+                    # Get Word IDs for reference words
+                    reference_word_ids = {source: [w.id for w in ref_words[source][pos : pos + n]] for source, pos in matches.items()}
+
                     # Mark positions as used
                     for source, pos in matches.items():
                         used_positions[source].add(pos)
                     used_trans_positions.add(trans_pos)
 
-                    anchor = AnchorSequence(
+                    anchor = AnchorSequence(
+                        id=WordUtils.generate_id(),
+                        transcribed_word_ids=transcribed_word_ids,
+                        transcription_position=trans_pos,
+                        reference_positions=matches,
+                        reference_word_ids=reference_word_ids,
+                        confidence=len(matches) / len(ref_texts_clean),
+                    )
                     candidate_anchors.append(anchor)
                     found_new_match = True
-                    break
+                    break
 
         return candidate_anchors
 
-    def find_anchors(
+    def find_anchors(
+        self,
+        transcribed: str,
+        references: Dict[str, LyricsData],
+        transcription_result: TranscriptionResult,
+    ) -> List[ScoredAnchor]:
         """Find anchor sequences that appear in both transcription and references."""
-        cache_key = self._get_cache_key(transcribed, references)
+        cache_key = self._get_cache_key(transcribed, references, transcription_result)
         cache_path = self.cache_dir / f"anchors_{cache_key}.json"
 
         # Try to load from cache
         if cached_data := self._load_from_cache(cache_path):
             self.logger.info("Loading anchors from cache")
             try:
-
-
-
+                # Convert cached_data to dictionary before logging
+                if cached_data:
+                    first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                return cached_data
+            except Exception as e:
+                self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
+                if cached_data:
+                    try:
+                        first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                        self.logger.error(f"First cached anchor data: {json.dumps(first_anchor, indent=2)}")
+                    except:
+                        self.logger.error("Could not serialize first cached anchor for logging")
 
         # If not in cache or cache format invalid, perform the computation
-        self.logger.info("Cache miss - computing anchors")
+        self.logger.info(f"Cache miss for key {cache_key} - computing anchors")
         self.logger.info(f"Finding anchor sequences for transcription with length {len(transcribed)}")
 
-        #
-
-
+        # Get all words from transcription
+        all_words = []
+        for segment in transcription_result.segments:
+            all_words.extend(segment.words)
+
+        # Clean and split texts - this should match all_words exactly
+        trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]  # Changed to derive directly from all_words
+        ref_texts_clean = {
+            source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+            for source, lyrics in references.items()
+        }
+        ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
 
         max_length = min(len(trans_words), min(len(words) for words in ref_texts_clean.values()))
         n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
 
-        #
-        num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
-        self.logger.info(f"Processing {len(n_gram_lengths)} n-gram lengths using {num_processes} processes")
-
-        # Create partial function with fixed arguments
+        # Process n-gram lengths in parallel
         process_length_partial = partial(
-            self._process_ngram_length,
+            self._process_ngram_length,
+            trans_words=trans_words,
+            all_words=all_words,  # Pass the Word objects
+            ref_texts_clean=ref_texts_clean,
+            ref_words=ref_words,
+            min_sources=self.min_sources,
         )
 
         # Process n-gram lengths in parallel
         candidate_anchors = []
-        with Pool(processes=
+        with Pool(processes=max(cpu_count() - 1, 1)) as pool:
             results = list(
                 tqdm(
                     pool.imap(process_length_partial, n_gram_lengths, chunksize=1),
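The parallel fan-out in `find_anchors` — shared arguments bound with `functools.partial`, one task per n-gram length streamed through `Pool.imap` under a `tqdm` progress bar — reduces to this sketch. The worker here is a trivial stand-in for `_process_ngram_length`, and `tqdm` is assumed to be installed (the package already depends on it):

```python
from functools import partial
from multiprocessing import Pool, cpu_count

from tqdm import tqdm


def process_ngram_length(n: int, trans_words: list) -> list:
    """Stand-in worker: return every n-gram of the given length."""
    return [trans_words[i : i + n] for i in range(len(trans_words) - n + 1)]


if __name__ == "__main__":
    trans_words = "you can hold my hand when I am sleeping".split()
    n_gram_lengths = range(len(trans_words), 2, -1)  # longest first, like the real code

    # Bind the fixed arguments so imap only has to pass the varying n.
    worker = partial(process_ngram_length, trans_words=trans_words)

    candidates = []
    with Pool(processes=max(cpu_count() - 1, 1)) as pool:
        for ngrams in tqdm(pool.imap(worker, n_gram_lengths, chunksize=1),
                           total=len(n_gram_lengths), desc="n-gram lengths"):
            candidates.extend(ngrams)

    print(len(candidates))
```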
@@ -198,13 +294,10 @@ class AnchorSequenceFinder:
             candidate_anchors.extend(anchors)
 
         self.logger.info(f"Found {len(candidate_anchors)} candidate anchors")
-        filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed)
-
-        # Before returning, save to cache with correct format
-        self._save_to_cache(
-            cache_path, [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in filtered_anchors]
-        )
+        filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
 
+        # Save to cache
+        self._save_to_cache(cache_path, filtered_anchors)
         return filtered_anchors
 
     def _score_sequence(self, words: List[str], context: str) -> PhraseScore:
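With this hunk, `_save_to_cache` and `_load_from_cache` agree on a single on-disk shape — a JSON list of `{"anchor": ..., "phrase_score": ...}` objects — instead of the caller pre-serializing the list itself. A toy round-trip under that contract, with a stand-in `Anchor` in place of the real `AnchorSequence`:

```python
import json
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import List, Optional


@dataclass
class Anchor:  # stand-in for AnchorSequence
    id: str
    transcribed_word_ids: List[str]

    def to_dict(self) -> dict:
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> "Anchor":
        return cls(**data)


def save(path: Path, anchors: List[Anchor], scores: List[dict]) -> None:
    # One object per anchor, matching what the loader expects.
    data = [{"anchor": a.to_dict(), "phrase_score": s} for a, s in zip(anchors, scores)]
    path.write_text(json.dumps(data, indent=2))


def load(path: Path) -> Optional[List[Anchor]]:
    try:
        data = json.loads(path.read_text())
        if any("anchor" not in d or "phrase_score" not in d for d in data):
            raise KeyError("anchor, phrase_score")
        return [Anchor.from_dict(d["anchor"]) for d in data]
    except (FileNotFoundError, json.JSONDecodeError, KeyError):
        return None  # treat as a cache miss and recompute
```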
@@ -212,19 +305,6 @@ class AnchorSequenceFinder:
         self.logger.debug(f"_score_sequence called for: '{' '.join(words)}'")
         return self.phrase_analyzer.score_phrase(words, context)
 
-    def _score_anchor(self, anchor: AnchorSequence, context: str) -> ScoredAnchor:
-        """Score an anchor sequence based on phrase quality and line breaks.
-
-        Args:
-            anchor: The anchor sequence to score
-            context: The original transcribed text
-        """
-        # Let phrase_analyzer handle all scoring including line breaks
-        phrase_score = self.phrase_analyzer.score_phrase(anchor.words, context)
-
-        # self.logger.debug(f"_score_anchor called for sequence: '{anchor.text}'")
-        return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
-
     def _get_sequence_priority(self, scored_anchor: ScoredAnchor) -> Tuple[float, float, float, float, int]:
         """Get priority tuple for sorting sequences.
 
@@ -239,7 +319,7 @@ class AnchorSequenceFinder:
         """
         # self.logger.debug(f"_get_sequence_priority called for anchor: '{scored_anchor.anchor.text}'")
         position_bonus = 1.0 if scored_anchor.anchor.transcription_position == 0 else 0.0
-        length_bonus = len(scored_anchor.anchor.
+        length_bonus = len(scored_anchor.anchor.transcribed_word_ids) * 0.2  # Changed from words to transcribed_word_ids
 
         return (
             len(scored_anchor.anchor.reference_positions),  # More sources is better
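Because `_get_sequence_priority` returns a tuple, ranking relies on Python's lexicographic tuple comparison: the source count dominates outright, and the length bonus (0.2 per entry in `transcribed_word_ids`) only decides among anchors that tie on earlier elements. A toy illustration with hypothetical two-element priorities:

```python
# Hypothetical (source_count, combined_score) priorities for three anchors:
priorities = {
    "short, 3 sources": (3, 0.9 + 2 * 0.2),  # (3, 1.3)
    "long, 2 sources":  (2, 0.9 + 6 * 0.2),  # (2, 2.1)
    "long, 3 sources":  (3, 0.7 + 6 * 0.2),  # (3, 1.9)
}
# Tuples compare element-wise: any 3-source anchor beats any 2-source anchor,
# and only then does the combined score break the tie.
best = max(priorities, key=priorities.get)
print(best)  # long, 3 sources — (3, 1.9) > (3, 1.3) > (2, 2.1)
```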
@@ -260,25 +340,39 @@ class AnchorSequenceFinder:
             True if sequences overlap in transcription or share any reference positions
         """
         # Check transcription overlap
-        seq1_trans_range = range(
-
+        seq1_trans_range = range(
+            seq1.transcription_position, seq1.transcription_position + len(seq1.transcribed_word_ids)
+        )  # Changed from words
+        seq2_trans_range = range(
+            seq2.transcription_position, seq2.transcription_position + len(seq2.transcribed_word_ids)
+        )  # Changed from words
         trans_overlap = bool(set(seq1_trans_range) & set(seq2_trans_range))
 
         # Check reference overlap - only consider positions in shared sources
         shared_sources = set(seq1.reference_positions.keys()) & set(seq2.reference_positions.keys())
         ref_overlap = any(seq1.reference_positions[source] == seq2.reference_positions[source] for source in shared_sources)
 
-        # self.logger.debug(f"Checking overlap between '{seq1.text}' and '{seq2.text}'")
         return trans_overlap or ref_overlap
 
-    def _remove_overlapping_sequences(
+    def _remove_overlapping_sequences(
+        self,
+        anchors: List[AnchorSequence],
+        context: str,
+        transcription_result: TranscriptionResult,
+    ) -> List[ScoredAnchor]:
         """Remove overlapping sequences using phrase analysis."""
         if not anchors:
             return []
 
         self.logger.info(f"Scoring {len(anchors)} anchors")
 
-        #
+        # Create word map for scoring
+        word_map = {w.id: w for s in transcription_result.segments for w in s.words}
+
+        # Add word map to each anchor for scoring
+        for anchor in anchors:
+            anchor.transcribed_words = [word_map[word_id] for word_id in anchor.transcribed_word_ids]
+
         start_time = time.time()
 
         # Try different pool sizes
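The overlap test above is pure index arithmetic plus a shared-source comparison; here is the same logic isolated, with a hypothetical `Seq` stand-in carrying only the fields the check reads:

```python
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Seq:  # stand-in for AnchorSequence, fields trimmed to what the check needs
    transcription_position: int
    transcribed_word_ids: List[str]
    reference_positions: Dict[str, int]


def sequences_overlap(seq1: Seq, seq2: Seq) -> bool:
    """True if the sequences overlap in the transcription or pin the same reference position."""
    r1 = range(seq1.transcription_position, seq1.transcription_position + len(seq1.transcribed_word_ids))
    r2 = range(seq2.transcription_position, seq2.transcription_position + len(seq2.transcribed_word_ids))
    trans_overlap = bool(set(r1) & set(r2))

    shared = set(seq1.reference_positions) & set(seq2.reference_positions)
    ref_overlap = any(seq1.reference_positions[s] == seq2.reference_positions[s] for s in shared)
    return trans_overlap or ref_overlap


a = Seq(0, ["w1", "w2", "w3"], {"genius": 0})
b = Seq(2, ["w4", "w5"], {"genius": 5})
assert sequences_overlap(a, b)                            # indices {0,1,2} and {2,3} share 2
assert not sequences_overlap(a, Seq(3, ["w6"], {"genius": 9}))
```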
@@ -292,7 +386,7 @@ class AnchorSequenceFinder:
         with Pool(processes=num_processes) as pool:
             scored_anchors = list(
                 tqdm(
-                    pool.imap(score_anchor_partial, anchors, chunksize=50),
+                    pool.imap(score_anchor_partial, anchors, chunksize=50),
                     total=len(anchors),
                     desc="Scoring anchors (parallel)",
                 )
@@ -326,7 +420,11 @@ class AnchorSequenceFinder:
         if not hasattr(AnchorSequenceFinder._score_anchor_static, "_phrase_analyzer"):
             AnchorSequenceFinder._score_anchor_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
 
-
+        # Get the words from the transcribed word IDs
+        # We need to pass in the actual words for scoring
+        words = [w.text for w in anchor.transcribed_words]  # This needs to be passed in
+
+        phrase_score = AnchorSequenceFinder._score_anchor_static._phrase_analyzer.score_phrase(words, context)
         return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
 
     def _get_reference_words(self, source: str, ref_words: List[str], start_pos: Optional[int], end_pos: Optional[int]) -> List[str]:
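`_score_anchor_static` lazily caches its `PhraseAnalyzer` as an attribute on the function itself, so each `multiprocessing` worker constructs it once rather than once per anchor. The same pattern in isolation, with a hypothetical `ExpensiveModel` standing in for `PhraseAnalyzer`:

```python
from multiprocessing import Pool


class ExpensiveModel:
    """Stand-in for a costly-to-construct resource like PhraseAnalyzer."""

    def __init__(self):
        print("loading model...")  # should appear at most once per worker process

    def score(self, words):
        return len(words)


def score_static(words):
    # Lazily attach the model to the function object itself: it persists
    # across calls within one process, and each Pool worker gets its own copy.
    if not hasattr(score_static, "_model"):
        score_static._model = ExpensiveModel()
    return score_static._model.score(words)


if __name__ == "__main__":
    batches = [["a", "b"], ["c"], ["d", "e", "f"]]
    with Pool(processes=2) as pool:
        print(pool.map(score_static, batches))  # [2, 1, 3]; "loading model..." prints at most twice
```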
@@ -347,125 +445,202 @@ class AnchorSequenceFinder:
             end_pos = len(ref_words)
         return ref_words[start_pos:end_pos]
 
-    def find_gaps(
+    def find_gaps(
+        self,
+        transcribed: str,
+        anchors: List[ScoredAnchor],
+        references: Dict[str, LyricsData],
+        transcription_result: TranscriptionResult,
+    ) -> List[GapSequence]:
         """Find gaps between anchor sequences in the transcribed text."""
-
-
-
-
-
-
-
-
-
-
-
-
-        #
-        ref_texts_original = {source: text.split() for source, text in references.items()}
-
+        # Get all words from transcription
+        all_words = []
+        for segment in transcription_result.segments:
+            all_words.extend(segment.words)
+
+        # Clean and split reference texts
+        ref_texts_clean = {
+            source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+            for source, lyrics in references.items()
+        }
+        ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+        # Create gaps with Word IDs
         gaps = []
         sorted_anchors = sorted(anchors, key=lambda x: x.anchor.transcription_position)
 
         # Handle initial gap
-        if
-
-
-
+        if sorted_anchors:
+            first_anchor = sorted_anchors[0].anchor
+            first_anchor_pos = first_anchor.transcription_position
+            if first_anchor_pos > 0:
+                gap_word_ids = [w.id for w in all_words[:first_anchor_pos]]
+                if gap := self._create_initial_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=0,
+                    following_anchor_id=first_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    following_anchor=first_anchor,
+                ):
+                    gaps.append(gap)
 
         # Handle gaps between anchors
         for i in range(len(sorted_anchors) - 1):
-
-
-            )
-
+            current_anchor = sorted_anchors[i].anchor
+            next_anchor = sorted_anchors[i + 1].anchor
+            gap_start = current_anchor.transcription_position + len(current_anchor.transcribed_word_ids)
+            gap_end = next_anchor.transcription_position
+
+            if gap_end > gap_start:
+                gap_word_ids = [w.id for w in all_words[gap_start:gap_end]]
+                if between_gap := self._create_between_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=gap_start,
+                    preceding_anchor_id=current_anchor.id,
+                    following_anchor_id=next_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    preceding_anchor=current_anchor,
+                    following_anchor=next_anchor,
+                ):
+                    gaps.append(between_gap)
 
         # Handle final gap
-        if sorted_anchors
-
+        if sorted_anchors:
+            last_anchor = sorted_anchors[-1].anchor
+            last_pos = last_anchor.transcription_position + len(last_anchor.transcribed_word_ids)
+            if last_pos < len(all_words):
+                gap_word_ids = [w.id for w in all_words[last_pos:]]
+                if final_gap := self._create_final_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=last_pos,
+                    preceding_anchor_id=last_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    preceding_anchor=last_anchor,
+                ):
+                    gaps.append(final_gap)
 
-        # Save to cache
-        self._save_to_cache(cache_path, [gap.to_dict() for gap in gaps])
         return gaps
 
     def _create_initial_gap(
         self,
-
-
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        following_anchor_id: str,
         ref_texts_clean: Dict[str, List[str]],
-
+        ref_words: Dict[str, List[Word]],
+        following_anchor: AnchorSequence,
     ) -> Optional[GapSequence]:
-        """Create gap sequence before the first anchor.
-
-
-
-
-
-
-
-
-
-
-
-
+        """Create gap sequence before the first anchor.
+
+        The gap includes all reference words from the start of each reference
+        up to the position where the following anchor starts in that reference.
+        """
+        if transcription_position > 0:
+            # Get reference word IDs for the gap
+            reference_word_ids = {}
+            for source, words in ref_words.items():
+                if source in ref_texts_clean:
+                    # Get the position where the following anchor starts in this source
+                    if source in following_anchor.reference_positions:
+                        end_pos = following_anchor.reference_positions[source]
+                        # Include all words from start up to the anchor
+                        reference_word_ids[source] = [w.id for w in words[:end_pos]]
+                    else:
+                        # If this source doesn't contain the following anchor,
+                        # we can't determine the gap content for it
+                        reference_word_ids[source] = []
 
             return GapSequence(
-
+                id=id,
+                transcribed_word_ids=transcribed_word_ids,
+                transcription_position=transcription_position,
+                preceding_anchor_id=None,
+                following_anchor_id=following_anchor_id,
+                reference_word_ids=reference_word_ids,
             )
         return None
 
     def _create_between_gap(
         self,
-
-
-
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        preceding_anchor_id: str,
+        following_anchor_id: str,
         ref_texts_clean: Dict[str, List[str]],
-
+        ref_words: Dict[str, List[Word]],
+        preceding_anchor: AnchorSequence,
+        following_anchor: AnchorSequence,
     ) -> Optional[GapSequence]:
-        """Create gap sequence between two anchors.
-        gap_start = current_anchor.anchor.transcription_position + current_anchor.anchor.length
-        gap_end = next_anchor.anchor.transcription_position
-
-        if gap_end > gap_start:
-            ref_words = {}
-            ref_words_original = {}
-            shared_sources = set(current_anchor.anchor.reference_positions.keys()) & set(next_anchor.anchor.reference_positions.keys())
-
-            # Check for large position differences in next_anchor
-            if len(next_anchor.anchor.reference_positions) > 1:
-                positions = list(next_anchor.anchor.reference_positions.values())
-                max_diff = max(positions) - min(positions)
-                if max_diff > 20:
-                    earliest_source = min(next_anchor.anchor.reference_positions.items(), key=lambda x: x[1])[0]
-                    self.logger.warning(
-                        f"Large position difference ({max_diff} words) in next anchor. Using only earliest source: {earliest_source}"
-                    )
-                    shared_sources &= {earliest_source}
-
-            for source in shared_sources:
-                start_pos = current_anchor.anchor.reference_positions[source] + current_anchor.anchor.length
-                end_pos = next_anchor.anchor.reference_positions[source]
-                ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], start_pos, end_pos)
-                ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], start_pos, end_pos)
+        """Create gap sequence between two anchors.
 
-
-
-
+        For each reference source, the gap includes all words between the end of the
+        preceding anchor and the start of the following anchor in that source.
+        """
+        # Get reference word IDs for the gap
+        reference_word_ids = {}
+        for source, words in ref_words.items():
+            if source in ref_texts_clean:
+                # Only process sources that contain both anchors
+                if source in preceding_anchor.reference_positions and source in following_anchor.reference_positions:
+                    start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                    end_pos = following_anchor.reference_positions[source]
+                    # Include all words between the anchors
+                    reference_word_ids[source] = [w.id for w in words[start_pos:end_pos]]
+                else:
+                    # If this source doesn't contain both anchors,
+                    # we can't determine the gap content for it
+                    reference_word_ids[source] = []
+
+        return GapSequence(
+            id=id,
+            transcribed_word_ids=transcribed_word_ids,
+            transcription_position=transcription_position,
+            preceding_anchor_id=preceding_anchor_id,
+            following_anchor_id=following_anchor_id,
+            reference_word_ids=reference_word_ids,
+        )
 
     def _create_final_gap(
-        self,
+        self,
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        preceding_anchor_id: str,
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        preceding_anchor: AnchorSequence,
    ) -> Optional[GapSequence]:
-        """Create gap sequence after the last anchor.
-
-
-
-
-
-
-
-
-
-
-
-
+        """Create gap sequence after the last anchor.
+
+        For each reference source, includes all words from the end of the
+        preceding anchor to the end of that reference.
+        """
+        # Get reference word IDs for the gap
+        reference_word_ids = {}
+        for source, words in ref_words.items():
+            if source in ref_texts_clean:
+                if source in preceding_anchor.reference_positions:
+                    start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                    # Include all words from end of last anchor to end of reference
+                    reference_word_ids[source] = [w.id for w in words[start_pos:]]
+                else:
+                    # If this source doesn't contain the preceding anchor,
+                    # we can't determine the gap content for it
+                    reference_word_ids[source] = []
+
+        return GapSequence(
+            id=id,
+            transcribed_word_ids=transcribed_word_ids,
+            transcription_position=transcription_position,
+            preceding_anchor_id=preceding_anchor_id,
+            following_anchor_id=None,
+            reference_word_ids=reference_word_ids,
+        )