lyrics-transcriber 0.30.0-py3-none-any.whl → 0.32.1-py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- lyrics_transcriber/__init__.py +2 -1
- lyrics_transcriber/cli/{main.py → cli_main.py} +47 -14
- lyrics_transcriber/core/config.py +35 -0
- lyrics_transcriber/core/controller.py +164 -166
- lyrics_transcriber/correction/anchor_sequence.py +471 -0
- lyrics_transcriber/correction/corrector.py +256 -0
- lyrics_transcriber/correction/handlers/__init__.py +0 -0
- lyrics_transcriber/correction/handlers/base.py +30 -0
- lyrics_transcriber/correction/handlers/extend_anchor.py +91 -0
- lyrics_transcriber/correction/handlers/levenshtein.py +147 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +98 -0
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +55 -0
- lyrics_transcriber/correction/handlers/repeat.py +71 -0
- lyrics_transcriber/correction/handlers/sound_alike.py +223 -0
- lyrics_transcriber/correction/handlers/syllables_match.py +182 -0
- lyrics_transcriber/correction/handlers/word_count_match.py +54 -0
- lyrics_transcriber/correction/handlers/word_operations.py +135 -0
- lyrics_transcriber/correction/phrase_analyzer.py +426 -0
- lyrics_transcriber/correction/text_utils.py +30 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +125 -0
- lyrics_transcriber/lyrics/genius.py +73 -0
- lyrics_transcriber/lyrics/spotify.py +82 -0
- lyrics_transcriber/output/ass/__init__.py +21 -0
- lyrics_transcriber/output/{ass.py → ass/ass.py} +150 -690
- lyrics_transcriber/output/ass/ass_specs.txt +732 -0
- lyrics_transcriber/output/ass/config.py +37 -0
- lyrics_transcriber/output/ass/constants.py +23 -0
- lyrics_transcriber/output/ass/event.py +94 -0
- lyrics_transcriber/output/ass/formatters.py +132 -0
- lyrics_transcriber/output/ass/lyrics_line.py +219 -0
- lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
- lyrics_transcriber/output/ass/section_detector.py +89 -0
- lyrics_transcriber/output/ass/section_screen.py +106 -0
- lyrics_transcriber/output/ass/style.py +187 -0
- lyrics_transcriber/output/cdg.py +503 -0
- lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
- lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
- lyrics_transcriber/output/cdgmaker/composer.py +1919 -0
- lyrics_transcriber/output/cdgmaker/config.py +151 -0
- lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
- lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
- lyrics_transcriber/output/cdgmaker/pack.py +507 -0
- lyrics_transcriber/output/cdgmaker/render.py +346 -0
- lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
- lyrics_transcriber/output/cdgmaker/utils.py +132 -0
- lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
- lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
- lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/arial.ttf +0 -0
- lyrics_transcriber/output/fonts/georgia.ttf +0 -0
- lyrics_transcriber/output/fonts/verdana.ttf +0 -0
- lyrics_transcriber/output/generator.py +140 -171
- lyrics_transcriber/output/lyrics_file.py +102 -0
- lyrics_transcriber/output/plain_text.py +91 -0
- lyrics_transcriber/output/segment_resizer.py +416 -0
- lyrics_transcriber/output/subtitles.py +328 -302
- lyrics_transcriber/output/video.py +219 -0
- lyrics_transcriber/review/__init__.py +1 -0
- lyrics_transcriber/review/server.py +138 -0
- lyrics_transcriber/storage/dropbox.py +110 -134
- lyrics_transcriber/transcribers/audioshake.py +171 -105
- lyrics_transcriber/transcribers/base_transcriber.py +149 -0
- lyrics_transcriber/transcribers/whisper.py +267 -133
- lyrics_transcriber/types.py +454 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/METADATA +14 -3
- lyrics_transcriber-0.32.1.dist-info/RECORD +86 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/WHEEL +1 -1
- lyrics_transcriber-0.32.1.dist-info/entry_points.txt +4 -0
- lyrics_transcriber/core/corrector.py +0 -56
- lyrics_transcriber/core/fetcher.py +0 -143
- lyrics_transcriber/storage/tokens.py +0 -116
- lyrics_transcriber/transcribers/base.py +0 -31
- lyrics_transcriber-0.30.0.dist-info/RECORD +0 -22
- lyrics_transcriber-0.30.0.dist-info/entry_points.txt +0 -3
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/LICENSE +0 -0
lyrics_transcriber/correction/anchor_sequence.py (new file)
@@ -0,0 +1,471 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+import logging
+from tqdm import tqdm
+from multiprocessing import Pool, cpu_count
+from functools import partial
+import time
+from pathlib import Path
+import json
+import hashlib
+
+from lyrics_transcriber.types import PhraseScore, AnchorSequence, GapSequence, ScoredAnchor
+from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer
+from lyrics_transcriber.correction.text_utils import clean_text
+
+
+class AnchorSequenceFinder:
+    """Identifies and manages anchor sequences between transcribed and reference lyrics."""
+
+    def __init__(
+        self,
+        cache_dir: Union[str, Path],
+        min_sequence_length: int = 3,
+        min_sources: int = 1,
+        logger: Optional[logging.Logger] = None,
+    ):
+        self.min_sequence_length = min_sequence_length
+        self.min_sources = min_sources
+        self.logger = logger or logging.getLogger(__name__)
+        self.phrase_analyzer = PhraseAnalyzer(logger=self.logger)
+        self.used_positions = {}
+
+        # Initialize cache directory
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.logger.debug(f"Initialized AnchorSequenceFinder with cache dir: {self.cache_dir}")
+
+    def _clean_text(self, text: str) -> str:
+        """Clean text by removing punctuation and normalizing whitespace."""
+        # self.logger.debug(f"_clean_text called with text length: {len(text)}")
+        return clean_text(text)
+
+    def _find_ngrams(self, words: List[str], n: int) -> List[Tuple[List[str], int]]:
+        """Generate n-grams with their starting positions."""
+        # self.logger.debug(f"_find_ngrams called with {len(words)} words, n={n}")
+        return [(words[i : i + n], i) for i in range(len(words) - n + 1)]
+
+    def _find_matching_sources(self, ngram: List[str], references: Dict[str, List[str]], n: int) -> Dict[str, int]:
+        """Find which sources contain the given n-gram and at what positions."""
+        # self.logger.debug(f"_find_matching_sources called for ngram: '{' '.join(ngram)}'")
+        matches = {}
+        all_positions = {source: [] for source in references}
+
+        # First, find all positions in each source
+        for source, words in references.items():
+            for i in range(len(words) - n + 1):
+                if words[i : i + n] == ngram:
+                    all_positions[source].append(i)
+
+        # Then, try to find an unused position for each source
+        for source, positions in all_positions.items():
+            used = self.used_positions.get(source, set())
+            # Try each position in order
+            for pos in positions:
+                if pos not in used:
+                    matches[source] = pos
+                    break
+
+        return matches
+
+    def _filter_used_positions(self, matches: Dict[str, int]) -> Dict[str, int]:
+        """Filter out positions that have already been used.
+
+        Args:
+            matches: Dict mapping source IDs to positions
+
+        Returns:
+            Dict mapping source IDs to unused positions
+        """
+        self.logger.debug(f"_filter_used_positions called with {len(matches)} matches")
+        return {source: pos for source, pos in matches.items() if pos not in self.used_positions.get(source, set())}
+
+    def _create_anchor(
+        self, ngram: List[str], trans_pos: int, matching_sources: Dict[str, int], total_sources: int
+    ) -> Optional[AnchorSequence]:
+        """Create an anchor sequence if it meets the minimum sources requirement."""
+        self.logger.debug(f"_create_anchor called for ngram: '{' '.join(ngram)}' at position {trans_pos}")
+        if len(matching_sources) >= self.min_sources:
+            confidence = len(matching_sources) / total_sources
+            anchor = AnchorSequence(
+                words=ngram, transcription_position=trans_pos, reference_positions=matching_sources, confidence=confidence
+            )
+            self.logger.debug(f"Found anchor sequence: '{' '.join(ngram)}' (confidence: {confidence:.2f})")
+            return anchor
+        return None
+
+    def _get_cache_key(self, transcribed: str, references: Dict[str, str]) -> str:
+        """Generate a unique cache key for the input combination."""
+        # Create a string that uniquely identifies the inputs
+        input_str = f"{transcribed}|{'|'.join(f'{k}:{v}' for k,v in sorted(references.items()))}"
+        return hashlib.md5(input_str.encode()).hexdigest()
+
+    def _save_to_cache(self, cache_path: Path, data: Any) -> None:
+        """Save results to cache file."""
+        self.logger.debug(f"Saving to cache: {cache_path}")
+        with open(cache_path, "w") as f:
+            json.dump(data, f, indent=2)
+
+    def _load_from_cache(self, cache_path: Path) -> Optional[Any]:
+        """Load results from cache if available."""
+        try:
+            self.logger.debug(f"Attempting to load from cache: {cache_path}")
+            with open(cache_path, "r") as f:
+                return json.load(f)
+        except (FileNotFoundError, json.JSONDecodeError):
+            self.logger.debug("Cache miss or invalid cache file")
+            return None
+
+    def _process_ngram_length(
+        self, n: int, trans_words: List[str], ref_texts_clean: Dict[str, List[str]], min_sources: int
+    ) -> List[AnchorSequence]:
+        """Process a single n-gram length to find matching sequences."""
+        candidate_anchors = []
+        used_positions = {source: set() for source in ref_texts_clean.keys()}
+        used_trans_positions = set()
+
+        # Try each position in the transcribed text multiple times
+        # to catch repeated phrases
+        found_new_match = True
+        while found_new_match:
+            found_new_match = False
+
+            # Generate n-grams from transcribed text
+            trans_ngrams = self._find_ngrams(trans_words, n)
+
+            for ngram, trans_pos in trans_ngrams:
+                # Skip if we've already used this transcription position
+                if trans_pos in used_trans_positions:
+                    continue
+
+                matches = self._find_matching_sources(ngram, ref_texts_clean, n)
+                if len(matches) >= min_sources:
+                    # Mark positions as used
+                    for source, pos in matches.items():
+                        used_positions[source].add(pos)
+                    used_trans_positions.add(trans_pos)
+
+                    anchor = AnchorSequence(ngram, trans_pos, matches, len(matches) / len(ref_texts_clean))
+                    candidate_anchors.append(anchor)
+                    found_new_match = True
+                    break  # Start over to try finding more matches
+
+        return candidate_anchors
+
+    def find_anchors(self, transcribed: str, references: Dict[str, str]) -> List[ScoredAnchor]:
+        """Find anchor sequences that appear in both transcription and references."""
+        cache_key = self._get_cache_key(transcribed, references)
+        cache_path = self.cache_dir / f"anchors_{cache_key}.json"
+
+        # Try to load from cache
+        if cached_data := self._load_from_cache(cache_path):
+            self.logger.info("Loading anchors from cache")
+            try:
+                return [ScoredAnchor.from_dict(anchor) for anchor in cached_data]
+            except KeyError as e:
+                self.logger.warning(f"Cache format mismatch: {e}. Recomputing.")
+
+        # If not in cache or cache format invalid, perform the computation
+        self.logger.info("Cache miss - computing anchors")
+        self.logger.info(f"Finding anchor sequences for transcription with length {len(transcribed)}")
+
+        # Clean and split texts
+        trans_words = self._clean_text(transcribed).split()
+        ref_texts_clean = {source: self._clean_text(text).split() for source, text in references.items()}
+
+        max_length = min(len(trans_words), min(len(words) for words in ref_texts_clean.values()))
+        n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
+
+        # Set up parallel processing
+        num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
+        self.logger.info(f"Processing {len(n_gram_lengths)} n-gram lengths using {num_processes} processes")
+
+        # Create partial function with fixed arguments
+        process_length_partial = partial(
+            self._process_ngram_length, trans_words=trans_words, ref_texts_clean=ref_texts_clean, min_sources=self.min_sources
+        )
+
+        # Process n-gram lengths in parallel
+        candidate_anchors = []
+        with Pool(processes=num_processes) as pool:
+            results = list(
+                tqdm(
+                    pool.imap(process_length_partial, n_gram_lengths, chunksize=1),
+                    total=len(n_gram_lengths),
+                    desc="Processing n-gram lengths",
+                )
+            )
+            for anchors in results:
+                candidate_anchors.extend(anchors)
+
+        self.logger.info(f"Found {len(candidate_anchors)} candidate anchors")
+        filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed)
+
+        # Before returning, save to cache with correct format
+        self._save_to_cache(
+            cache_path, [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in filtered_anchors]
+        )
+
+        return filtered_anchors
+
+    def _score_sequence(self, words: List[str], context: str) -> PhraseScore:
+        """Score a sequence based on its phrase quality"""
+        self.logger.debug(f"_score_sequence called for: '{' '.join(words)}'")
+        return self.phrase_analyzer.score_phrase(words, context)
+
+    def _score_anchor(self, anchor: AnchorSequence, context: str) -> ScoredAnchor:
+        """Score an anchor sequence based on phrase quality and line breaks.
+
+        Args:
+            anchor: The anchor sequence to score
+            context: The original transcribed text
+        """
+        # Let phrase_analyzer handle all scoring including line breaks
+        phrase_score = self.phrase_analyzer.score_phrase(anchor.words, context)
+
+        # self.logger.debug(f"_score_anchor called for sequence: '{anchor.text}'")
+        return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
+
+    def _get_sequence_priority(self, scored_anchor: ScoredAnchor) -> Tuple[float, float, float, float, int]:
+        """Get priority tuple for sorting sequences.
+
+        Returns tuple of:
+        - Number of sources matched (higher is better)
+        - Length bonus (length * 0.2) to favor longer sequences
+        - Break score (higher is better)
+        - Total score (higher is better)
+        - Negative position (earlier is better)
+
+        Position bonus: Add 1.0 to total score for sequences at position 0
+        """
+        # self.logger.debug(f"_get_sequence_priority called for anchor: '{scored_anchor.anchor.text}'")
+        position_bonus = 1.0 if scored_anchor.anchor.transcription_position == 0 else 0.0
+        length_bonus = len(scored_anchor.anchor.words) * 0.2  # Add bonus for longer sequences
+
+        return (
+            len(scored_anchor.anchor.reference_positions),  # More sources is better
+            length_bonus,  # Longer sequences preferred
+            scored_anchor.phrase_score.natural_break_score,  # Better breaks preferred
+            scored_anchor.phrase_score.total_score + position_bonus,  # Add bonus for position 0
+            -scored_anchor.anchor.transcription_position,  # Earlier positions preferred
+        )
+
+    def _sequences_overlap(self, seq1: AnchorSequence, seq2: AnchorSequence) -> bool:
+        """Check if two sequences overlap in either transcription or references.
+
+        Args:
+            seq1: First sequence
+            seq2: Second sequence
+
+        Returns:
+            True if sequences overlap in transcription or share any reference positions
+        """
+        # Check transcription overlap
+        seq1_trans_range = range(seq1.transcription_position, seq1.transcription_position + len(seq1.words))
+        seq2_trans_range = range(seq2.transcription_position, seq2.transcription_position + len(seq2.words))
+        trans_overlap = bool(set(seq1_trans_range) & set(seq2_trans_range))
+
+        # Check reference overlap - only consider positions in shared sources
+        shared_sources = set(seq1.reference_positions.keys()) & set(seq2.reference_positions.keys())
+        ref_overlap = any(seq1.reference_positions[source] == seq2.reference_positions[source] for source in shared_sources)
+
+        # self.logger.debug(f"Checking overlap between '{seq1.text}' and '{seq2.text}'")
+        return trans_overlap or ref_overlap
+
+    def _remove_overlapping_sequences(self, anchors: List[AnchorSequence], context: str) -> List[ScoredAnchor]:
+        """Remove overlapping sequences using phrase analysis."""
+        if not anchors:
+            return []
+
+        self.logger.info(f"Scoring {len(anchors)} anchors")
+
+        # Benchmark both approaches
+        start_time = time.time()
+
+        # Try different pool sizes
+        num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
+        self.logger.info(f"Using {num_processes} processes")
+
+        # Create a partial function with the context parameter fixed
+        score_anchor_partial = partial(self._score_anchor_static, context=context)
+
+        # Use multiprocessing to score anchors in parallel
+        with Pool(processes=num_processes) as pool:
+            scored_anchors = list(
+                tqdm(
+                    pool.imap(score_anchor_partial, anchors, chunksize=50),  # Added chunksize
+                    total=len(anchors),
+                    desc="Scoring anchors (parallel)",
+                )
+            )
+
+        parallel_time = time.time() - start_time
+        self.logger.info(f"Parallel scoring took {parallel_time:.2f} seconds")
+
+        # Sort and filter as before
+        scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
+
+        self.logger.info(f"Filtering {len(scored_anchors)} overlapping sequences")
+        filtered_scored = []
+        for scored_anchor in tqdm(scored_anchors, desc="Filtering overlaps"):
+            overlaps = False
+            for existing in filtered_scored:
+                if self._sequences_overlap(scored_anchor.anchor, existing.anchor):
+                    overlaps = True
+                    break
+
+            if not overlaps:
+                filtered_scored.append(scored_anchor)
+
+        self.logger.info(f"Filtered down to {len(filtered_scored)} non-overlapping anchors")
+        return filtered_scored
+
+    @staticmethod
+    def _score_anchor_static(anchor: AnchorSequence, context: str) -> ScoredAnchor:
+        """Static version of _score_anchor for multiprocessing compatibility."""
+        # Create analyzer only once per process
+        if not hasattr(AnchorSequenceFinder._score_anchor_static, "_phrase_analyzer"):
+            AnchorSequenceFinder._score_anchor_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
+
+        phrase_score = AnchorSequenceFinder._score_anchor_static._phrase_analyzer.score_phrase(anchor.words, context)
+        return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
+
+    def _get_reference_words(self, source: str, ref_words: List[str], start_pos: Optional[int], end_pos: Optional[int]) -> List[str]:
+        """Get words from reference text between two positions.
+
+        Args:
+            source: Reference source identifier
+            ref_words: List of words from the reference text
+            start_pos: Starting position (None for beginning)
+            end_pos: Ending position (None for end)
+
+        Returns:
+            List of words between the positions
+        """
+        if start_pos is None:
+            start_pos = 0
+        if end_pos is None:
+            end_pos = len(ref_words)
+        return ref_words[start_pos:end_pos]
+
+    def find_gaps(self, transcribed: str, anchors: List[ScoredAnchor], references: Dict[str, str]) -> List[GapSequence]:
+        """Find gaps between anchor sequences in the transcribed text."""
+        cache_key = self._get_cache_key(transcribed, references)
+        cache_path = self.cache_dir / f"gaps_{cache_key}.json"
+
+        # Try to load from cache
+        if cached_data := self._load_from_cache(cache_path):
+            self.logger.info("Loading gaps from cache")
+            return [GapSequence.from_dict(gap) for gap in cached_data]
+
+        # If not in cache, perform the computation
+        self.logger.info("Cache miss - computing gaps")
+        words = self._clean_text(transcribed).split()
+        ref_texts_clean = {source: self._clean_text(text).split() for source, text in references.items()}
+        # Store original reference texts split into words
+        ref_texts_original = {source: text.split() for source, text in references.items()}
+
+        gaps = []
+        sorted_anchors = sorted(anchors, key=lambda x: x.anchor.transcription_position)
+
+        # Handle initial gap
+        if initial_gap := self._create_initial_gap(
+            words, sorted_anchors[0] if sorted_anchors else None, ref_texts_clean, ref_texts_original
+        ):
+            gaps.append(initial_gap)
+
+        # Handle gaps between anchors
+        for i in range(len(sorted_anchors) - 1):
+            if between_gap := self._create_between_gap(
+                words, sorted_anchors[i], sorted_anchors[i + 1], ref_texts_clean, ref_texts_original
+            ):
+                gaps.append(between_gap)
+
+        # Handle final gap
+        if sorted_anchors and (final_gap := self._create_final_gap(words, sorted_anchors[-1], ref_texts_clean, ref_texts_original)):
+            gaps.append(final_gap)
+
+        # Save to cache
+        self._save_to_cache(cache_path, [gap.to_dict() for gap in gaps])
+        return gaps
+
+    def _create_initial_gap(
+        self,
+        words: List[str],
+        first_anchor: Optional[ScoredAnchor],
+        ref_texts_clean: Dict[str, List[str]],
+        ref_texts_original: Dict[str, List[str]],
+    ) -> Optional[GapSequence]:
+        """Create gap sequence before the first anchor."""
+        if not first_anchor:
+            ref_words = {source: words for source, words in ref_texts_clean.items()}
+            ref_words_original = {source: words for source, words in ref_texts_original.items()}
+            return GapSequence(words, 0, None, None, ref_words, ref_words_original)
+
+        if first_anchor.anchor.transcription_position > 0:
+            ref_words = {}
+            ref_words_original = {}
+            for source in ref_texts_clean:
+                end_pos = first_anchor.anchor.reference_positions.get(source)
+                ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], None, end_pos)
+                ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], None, end_pos)
+
+            return GapSequence(
+                words[: first_anchor.anchor.transcription_position], 0, None, first_anchor.anchor, ref_words, ref_words_original
+            )
+        return None
+
+    def _create_between_gap(
+        self,
+        words: List[str],
+        current_anchor: ScoredAnchor,
+        next_anchor: ScoredAnchor,
+        ref_texts_clean: Dict[str, List[str]],
+        ref_texts_original: Dict[str, List[str]],
+    ) -> Optional[GapSequence]:
+        """Create gap sequence between two anchors."""
+        gap_start = current_anchor.anchor.transcription_position + current_anchor.anchor.length
+        gap_end = next_anchor.anchor.transcription_position
+
+        if gap_end > gap_start:
+            ref_words = {}
+            ref_words_original = {}
+            shared_sources = set(current_anchor.anchor.reference_positions.keys()) & set(next_anchor.anchor.reference_positions.keys())
+
+            # Check for large position differences in next_anchor
+            if len(next_anchor.anchor.reference_positions) > 1:
+                positions = list(next_anchor.anchor.reference_positions.values())
+                max_diff = max(positions) - min(positions)
+                if max_diff > 20:
+                    earliest_source = min(next_anchor.anchor.reference_positions.items(), key=lambda x: x[1])[0]
+                    self.logger.warning(
+                        f"Large position difference ({max_diff} words) in next anchor. Using only earliest source: {earliest_source}"
+                    )
+                    shared_sources &= {earliest_source}
+
+            for source in shared_sources:
+                start_pos = current_anchor.anchor.reference_positions[source] + current_anchor.anchor.length
+                end_pos = next_anchor.anchor.reference_positions[source]
+                ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], start_pos, end_pos)
+                ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], start_pos, end_pos)
+
+            return GapSequence(
+                words[gap_start:gap_end], gap_start, current_anchor.anchor, next_anchor.anchor, ref_words, ref_words_original
+            )
+        return None
+
+    def _create_final_gap(
+        self, words: List[str], last_anchor: ScoredAnchor, ref_texts_clean: Dict[str, List[str]], ref_texts_original: Dict[str, List[str]]
+    ) -> Optional[GapSequence]:
+        """Create gap sequence after the last anchor."""
+        last_pos = last_anchor.anchor.transcription_position + last_anchor.anchor.length
+        if last_pos < len(words):
+            ref_words = {}
+            ref_words_original = {}
+            for source in ref_texts_clean:
+                if source in last_anchor.anchor.reference_positions:
+                    start_pos = last_anchor.anchor.reference_positions[source] + last_anchor.anchor.length
+                    ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], start_pos, None)
+                    ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], start_pos, None)
+
+            return GapSequence(words[last_pos:], last_pos, last_anchor.anchor, None, ref_words, ref_words_original)
+        return None
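
For orientation, here is a minimal usage sketch of the new AnchorSequenceFinder added in this release. It is not part of the diff above: the input strings, source names, and cache directory are illustrative placeholders (in the released package this class is presumably driven by the new lyrics_transcriber/correction/corrector.py), but the constructor arguments and the find_anchors/find_gaps calls follow the code shown in the hunk.

import logging
from lyrics_transcriber.correction.anchor_sequence import AnchorSequenceFinder


def main():
    logging.basicConfig(level=logging.INFO)

    # Illustrative inputs only: a transcription string and reference lyrics keyed by source name.
    transcribed = "hello world this is a song about the sea"
    references = {
        "genius": "Hello world, this is a song about the sea",
        "spotify": "Hello world / this is a song about the sea",
    }

    # cache_dir is any writable directory; anchors and gaps are cached there as JSON.
    finder = AnchorSequenceFinder(cache_dir="/tmp/lyrics_cache", min_sequence_length=3, min_sources=1)

    anchors = finder.find_anchors(transcribed, references)     # List[ScoredAnchor]
    gaps = finder.find_gaps(transcribed, anchors, references)  # List[GapSequence]

    for scored in anchors:
        print(scored.anchor.words, scored.anchor.transcription_position)
    print(f"{len(gaps)} gap sequences between anchors")


if __name__ == "__main__":
    # find_anchors() uses multiprocessing.Pool, so a __main__ guard is needed on spawn-based platforms.
    main()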