lyrics-transcriber 0.30.0__py3-none-any.whl → 0.32.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (88)
  1. lyrics_transcriber/__init__.py +2 -1
  2. lyrics_transcriber/cli/{main.py → cli_main.py} +47 -14
  3. lyrics_transcriber/core/config.py +35 -0
  4. lyrics_transcriber/core/controller.py +164 -166
  5. lyrics_transcriber/correction/anchor_sequence.py +471 -0
  6. lyrics_transcriber/correction/corrector.py +256 -0
  7. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  8. lyrics_transcriber/correction/handlers/base.py +30 -0
  9. lyrics_transcriber/correction/handlers/extend_anchor.py +91 -0
  10. lyrics_transcriber/correction/handlers/levenshtein.py +147 -0
  11. lyrics_transcriber/correction/handlers/no_space_punct_match.py +98 -0
  12. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +55 -0
  13. lyrics_transcriber/correction/handlers/repeat.py +71 -0
  14. lyrics_transcriber/correction/handlers/sound_alike.py +223 -0
  15. lyrics_transcriber/correction/handlers/syllables_match.py +182 -0
  16. lyrics_transcriber/correction/handlers/word_count_match.py +54 -0
  17. lyrics_transcriber/correction/handlers/word_operations.py +135 -0
  18. lyrics_transcriber/correction/phrase_analyzer.py +426 -0
  19. lyrics_transcriber/correction/text_utils.py +30 -0
  20. lyrics_transcriber/lyrics/base_lyrics_provider.py +125 -0
  21. lyrics_transcriber/lyrics/genius.py +73 -0
  22. lyrics_transcriber/lyrics/spotify.py +82 -0
  23. lyrics_transcriber/output/ass/__init__.py +21 -0
  24. lyrics_transcriber/output/{ass.py → ass/ass.py} +150 -690
  25. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  26. lyrics_transcriber/output/ass/config.py +37 -0
  27. lyrics_transcriber/output/ass/constants.py +23 -0
  28. lyrics_transcriber/output/ass/event.py +94 -0
  29. lyrics_transcriber/output/ass/formatters.py +132 -0
  30. lyrics_transcriber/output/ass/lyrics_line.py +219 -0
  31. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  32. lyrics_transcriber/output/ass/section_detector.py +89 -0
  33. lyrics_transcriber/output/ass/section_screen.py +106 -0
  34. lyrics_transcriber/output/ass/style.py +187 -0
  35. lyrics_transcriber/output/cdg.py +503 -0
  36. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  37. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  38. lyrics_transcriber/output/cdgmaker/composer.py +1919 -0
  39. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  40. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  41. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  42. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  43. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  44. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  45. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  46. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  47. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  48. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  49. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  50. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  51. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  52. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  53. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  54. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  55. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  56. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  57. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  58. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  59. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  60. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  61. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  62. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  63. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  64. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  65. lyrics_transcriber/output/generator.py +140 -171
  66. lyrics_transcriber/output/lyrics_file.py +102 -0
  67. lyrics_transcriber/output/plain_text.py +91 -0
  68. lyrics_transcriber/output/segment_resizer.py +416 -0
  69. lyrics_transcriber/output/subtitles.py +328 -302
  70. lyrics_transcriber/output/video.py +219 -0
  71. lyrics_transcriber/review/__init__.py +1 -0
  72. lyrics_transcriber/review/server.py +138 -0
  73. lyrics_transcriber/storage/dropbox.py +110 -134
  74. lyrics_transcriber/transcribers/audioshake.py +171 -105
  75. lyrics_transcriber/transcribers/base_transcriber.py +149 -0
  76. lyrics_transcriber/transcribers/whisper.py +267 -133
  77. lyrics_transcriber/types.py +454 -0
  78. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/METADATA +14 -3
  79. lyrics_transcriber-0.32.1.dist-info/RECORD +86 -0
  80. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/WHEEL +1 -1
  81. lyrics_transcriber-0.32.1.dist-info/entry_points.txt +4 -0
  82. lyrics_transcriber/core/corrector.py +0 -56
  83. lyrics_transcriber/core/fetcher.py +0 -143
  84. lyrics_transcriber/storage/tokens.py +0 -116
  85. lyrics_transcriber/transcribers/base.py +0 -31
  86. lyrics_transcriber-0.30.0.dist-info/RECORD +0 -22
  87. lyrics_transcriber-0.30.0.dist-info/entry_points.txt +0 -3
  88. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/LICENSE +0 -0
lyrics_transcriber/correction/anchor_sequence.py
@@ -0,0 +1,471 @@
+ from typing import Any, Dict, List, Optional, Tuple, Union
+ import logging
+ from tqdm import tqdm
+ from multiprocessing import Pool, cpu_count
+ from functools import partial
+ import time
+ from pathlib import Path
+ import json
+ import hashlib
+
+ from lyrics_transcriber.types import PhraseScore, AnchorSequence, GapSequence, ScoredAnchor
+ from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer
+ from lyrics_transcriber.correction.text_utils import clean_text
+
+
+ class AnchorSequenceFinder:
+     """Identifies and manages anchor sequences between transcribed and reference lyrics."""
+
+     def __init__(
+         self,
+         cache_dir: Union[str, Path],
+         min_sequence_length: int = 3,
+         min_sources: int = 1,
+         logger: Optional[logging.Logger] = None,
+     ):
+         self.min_sequence_length = min_sequence_length
+         self.min_sources = min_sources
+         self.logger = logger or logging.getLogger(__name__)
+         self.phrase_analyzer = PhraseAnalyzer(logger=self.logger)
+         self.used_positions = {}
+
+         # Initialize cache directory
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.logger.debug(f"Initialized AnchorSequenceFinder with cache dir: {self.cache_dir}")
+
+     def _clean_text(self, text: str) -> str:
+         """Clean text by removing punctuation and normalizing whitespace."""
+         # self.logger.debug(f"_clean_text called with text length: {len(text)}")
+         return clean_text(text)
+
+     def _find_ngrams(self, words: List[str], n: int) -> List[Tuple[List[str], int]]:
+         """Generate n-grams with their starting positions."""
+         # self.logger.debug(f"_find_ngrams called with {len(words)} words, n={n}")
+         return [(words[i : i + n], i) for i in range(len(words) - n + 1)]
+
+     def _find_matching_sources(self, ngram: List[str], references: Dict[str, List[str]], n: int) -> Dict[str, int]:
+         """Find which sources contain the given n-gram and at what positions."""
+         # self.logger.debug(f"_find_matching_sources called for ngram: '{' '.join(ngram)}'")
+         matches = {}
+         all_positions = {source: [] for source in references}
+
+         # First, find all positions in each source
+         for source, words in references.items():
+             for i in range(len(words) - n + 1):
+                 if words[i : i + n] == ngram:
+                     all_positions[source].append(i)
+
+         # Then, try to find an unused position for each source
+         for source, positions in all_positions.items():
+             used = self.used_positions.get(source, set())
+             # Try each position in order
+             for pos in positions:
+                 if pos not in used:
+                     matches[source] = pos
+                     break
+
+         return matches
+
+     def _filter_used_positions(self, matches: Dict[str, int]) -> Dict[str, int]:
+         """Filter out positions that have already been used.
+
+         Args:
+             matches: Dict mapping source IDs to positions
+
+         Returns:
+             Dict mapping source IDs to unused positions
+         """
+         self.logger.debug(f"_filter_used_positions called with {len(matches)} matches")
+         return {source: pos for source, pos in matches.items() if pos not in self.used_positions.get(source, set())}
+
+     def _create_anchor(
+         self, ngram: List[str], trans_pos: int, matching_sources: Dict[str, int], total_sources: int
+     ) -> Optional[AnchorSequence]:
+         """Create an anchor sequence if it meets the minimum sources requirement."""
+         self.logger.debug(f"_create_anchor called for ngram: '{' '.join(ngram)}' at position {trans_pos}")
+         if len(matching_sources) >= self.min_sources:
+             confidence = len(matching_sources) / total_sources
+             anchor = AnchorSequence(
+                 words=ngram, transcription_position=trans_pos, reference_positions=matching_sources, confidence=confidence
+             )
+             self.logger.debug(f"Found anchor sequence: '{' '.join(ngram)}' (confidence: {confidence:.2f})")
+             return anchor
+         return None
+
+     def _get_cache_key(self, transcribed: str, references: Dict[str, str]) -> str:
+         """Generate a unique cache key for the input combination."""
+         # Create a string that uniquely identifies the inputs
+         input_str = f"{transcribed}|{'|'.join(f'{k}:{v}' for k,v in sorted(references.items()))}"
+         return hashlib.md5(input_str.encode()).hexdigest()
+
+     def _save_to_cache(self, cache_path: Path, data: Any) -> None:
+         """Save results to cache file."""
+         self.logger.debug(f"Saving to cache: {cache_path}")
+         with open(cache_path, "w") as f:
+             json.dump(data, f, indent=2)
+
+     def _load_from_cache(self, cache_path: Path) -> Optional[Any]:
+         """Load results from cache if available."""
+         try:
+             self.logger.debug(f"Attempting to load from cache: {cache_path}")
+             with open(cache_path, "r") as f:
+                 return json.load(f)
+         except (FileNotFoundError, json.JSONDecodeError):
+             self.logger.debug("Cache miss or invalid cache file")
+             return None
+
+     def _process_ngram_length(
+         self, n: int, trans_words: List[str], ref_texts_clean: Dict[str, List[str]], min_sources: int
+     ) -> List[AnchorSequence]:
+         """Process a single n-gram length to find matching sequences."""
+         candidate_anchors = []
+         used_positions = {source: set() for source in ref_texts_clean.keys()}
+         used_trans_positions = set()
+
+         # Try each position in the transcribed text multiple times
+         # to catch repeated phrases
+         found_new_match = True
+         while found_new_match:
+             found_new_match = False
+
+             # Generate n-grams from transcribed text
+             trans_ngrams = self._find_ngrams(trans_words, n)
+
+             for ngram, trans_pos in trans_ngrams:
+                 # Skip if we've already used this transcription position
+                 if trans_pos in used_trans_positions:
+                     continue
+
+                 matches = self._find_matching_sources(ngram, ref_texts_clean, n)
+                 if len(matches) >= min_sources:
+                     # Mark positions as used
+                     for source, pos in matches.items():
+                         used_positions[source].add(pos)
+                     used_trans_positions.add(trans_pos)
+
+                     anchor = AnchorSequence(ngram, trans_pos, matches, len(matches) / len(ref_texts_clean))
+                     candidate_anchors.append(anchor)
+                     found_new_match = True
+                     break  # Start over to try finding more matches
+
+         return candidate_anchors
+
+     def find_anchors(self, transcribed: str, references: Dict[str, str]) -> List[ScoredAnchor]:
+         """Find anchor sequences that appear in both transcription and references."""
+         cache_key = self._get_cache_key(transcribed, references)
+         cache_path = self.cache_dir / f"anchors_{cache_key}.json"
+
+         # Try to load from cache
+         if cached_data := self._load_from_cache(cache_path):
+             self.logger.info("Loading anchors from cache")
+             try:
+                 return [ScoredAnchor.from_dict(anchor) for anchor in cached_data]
+             except KeyError as e:
+                 self.logger.warning(f"Cache format mismatch: {e}. Recomputing.")
+
+         # If not in cache or cache format invalid, perform the computation
+         self.logger.info("Cache miss - computing anchors")
+         self.logger.info(f"Finding anchor sequences for transcription with length {len(transcribed)}")
+
+         # Clean and split texts
+         trans_words = self._clean_text(transcribed).split()
+         ref_texts_clean = {source: self._clean_text(text).split() for source, text in references.items()}
+
+         max_length = min(len(trans_words), min(len(words) for words in ref_texts_clean.values()))
+         n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
+
+         # Set up parallel processing
+         num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
+         self.logger.info(f"Processing {len(n_gram_lengths)} n-gram lengths using {num_processes} processes")
+
+         # Create partial function with fixed arguments
+         process_length_partial = partial(
+             self._process_ngram_length, trans_words=trans_words, ref_texts_clean=ref_texts_clean, min_sources=self.min_sources
+         )
+
+         # Process n-gram lengths in parallel
+         candidate_anchors = []
+         with Pool(processes=num_processes) as pool:
+             results = list(
+                 tqdm(
+                     pool.imap(process_length_partial, n_gram_lengths, chunksize=1),
+                     total=len(n_gram_lengths),
+                     desc="Processing n-gram lengths",
+                 )
+             )
+             for anchors in results:
+                 candidate_anchors.extend(anchors)
+
+         self.logger.info(f"Found {len(candidate_anchors)} candidate anchors")
+         filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed)
+
+         # Before returning, save to cache with correct format
+         self._save_to_cache(
+             cache_path, [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in filtered_anchors]
+         )
+
+         return filtered_anchors
+
+     def _score_sequence(self, words: List[str], context: str) -> PhraseScore:
+         """Score a sequence based on its phrase quality"""
+         self.logger.debug(f"_score_sequence called for: '{' '.join(words)}'")
+         return self.phrase_analyzer.score_phrase(words, context)
+
+     def _score_anchor(self, anchor: AnchorSequence, context: str) -> ScoredAnchor:
+         """Score an anchor sequence based on phrase quality and line breaks.
+
+         Args:
+             anchor: The anchor sequence to score
+             context: The original transcribed text
+         """
+         # Let phrase_analyzer handle all scoring including line breaks
+         phrase_score = self.phrase_analyzer.score_phrase(anchor.words, context)
+
+         # self.logger.debug(f"_score_anchor called for sequence: '{anchor.text}'")
+         return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
+
+     def _get_sequence_priority(self, scored_anchor: ScoredAnchor) -> Tuple[float, float, float, float, int]:
+         """Get priority tuple for sorting sequences.
+
+         Returns tuple of:
+         - Number of sources matched (higher is better)
+         - Length bonus (length * 0.2) to favor longer sequences
+         - Break score (higher is better)
+         - Total score (higher is better)
+         - Negative position (earlier is better)
+
+         Position bonus: Add 1.0 to total score for sequences at position 0
+         """
+         # self.logger.debug(f"_get_sequence_priority called for anchor: '{scored_anchor.anchor.text}'")
+         position_bonus = 1.0 if scored_anchor.anchor.transcription_position == 0 else 0.0
+         length_bonus = len(scored_anchor.anchor.words) * 0.2  # Add bonus for longer sequences
+
+         return (
+             len(scored_anchor.anchor.reference_positions),  # More sources is better
+             length_bonus,  # Longer sequences preferred
+             scored_anchor.phrase_score.natural_break_score,  # Better breaks preferred
+             scored_anchor.phrase_score.total_score + position_bonus,  # Add bonus for position 0
+             -scored_anchor.anchor.transcription_position,  # Earlier positions preferred
+         )
+
+     def _sequences_overlap(self, seq1: AnchorSequence, seq2: AnchorSequence) -> bool:
+         """Check if two sequences overlap in either transcription or references.
+
+         Args:
+             seq1: First sequence
+             seq2: Second sequence
+
+         Returns:
+             True if sequences overlap in transcription or share any reference positions
+         """
+         # Check transcription overlap
+         seq1_trans_range = range(seq1.transcription_position, seq1.transcription_position + len(seq1.words))
+         seq2_trans_range = range(seq2.transcription_position, seq2.transcription_position + len(seq2.words))
+         trans_overlap = bool(set(seq1_trans_range) & set(seq2_trans_range))
+
+         # Check reference overlap - only consider positions in shared sources
+         shared_sources = set(seq1.reference_positions.keys()) & set(seq2.reference_positions.keys())
+         ref_overlap = any(seq1.reference_positions[source] == seq2.reference_positions[source] for source in shared_sources)
+
+         # self.logger.debug(f"Checking overlap between '{seq1.text}' and '{seq2.text}'")
+         return trans_overlap or ref_overlap
+
+     def _remove_overlapping_sequences(self, anchors: List[AnchorSequence], context: str) -> List[ScoredAnchor]:
+         """Remove overlapping sequences using phrase analysis."""
+         if not anchors:
+             return []
+
+         self.logger.info(f"Scoring {len(anchors)} anchors")
+
+         # Benchmark both approaches
+         start_time = time.time()
+
+         # Try different pool sizes
+         num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
+         self.logger.info(f"Using {num_processes} processes")
+
+         # Create a partial function with the context parameter fixed
+         score_anchor_partial = partial(self._score_anchor_static, context=context)
+
+         # Use multiprocessing to score anchors in parallel
+         with Pool(processes=num_processes) as pool:
+             scored_anchors = list(
+                 tqdm(
+                     pool.imap(score_anchor_partial, anchors, chunksize=50),  # Added chunksize
+                     total=len(anchors),
+                     desc="Scoring anchors (parallel)",
+                 )
+             )
+
+         parallel_time = time.time() - start_time
+         self.logger.info(f"Parallel scoring took {parallel_time:.2f} seconds")
+
+         # Sort and filter as before
+         scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
+
+         self.logger.info(f"Filtering {len(scored_anchors)} overlapping sequences")
+         filtered_scored = []
+         for scored_anchor in tqdm(scored_anchors, desc="Filtering overlaps"):
+             overlaps = False
+             for existing in filtered_scored:
+                 if self._sequences_overlap(scored_anchor.anchor, existing.anchor):
+                     overlaps = True
+                     break
+
+             if not overlaps:
+                 filtered_scored.append(scored_anchor)
+
+         self.logger.info(f"Filtered down to {len(filtered_scored)} non-overlapping anchors")
+         return filtered_scored
+
+     @staticmethod
+     def _score_anchor_static(anchor: AnchorSequence, context: str) -> ScoredAnchor:
+         """Static version of _score_anchor for multiprocessing compatibility."""
+         # Create analyzer only once per process
+         if not hasattr(AnchorSequenceFinder._score_anchor_static, "_phrase_analyzer"):
+             AnchorSequenceFinder._score_anchor_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
+
+         phrase_score = AnchorSequenceFinder._score_anchor_static._phrase_analyzer.score_phrase(anchor.words, context)
+         return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
+
+     def _get_reference_words(self, source: str, ref_words: List[str], start_pos: Optional[int], end_pos: Optional[int]) -> List[str]:
+         """Get words from reference text between two positions.
+
+         Args:
+             source: Reference source identifier
+             ref_words: List of words from the reference text
+             start_pos: Starting position (None for beginning)
+             end_pos: Ending position (None for end)
+
+         Returns:
+             List of words between the positions
+         """
+         if start_pos is None:
+             start_pos = 0
+         if end_pos is None:
+             end_pos = len(ref_words)
+         return ref_words[start_pos:end_pos]
+
+     def find_gaps(self, transcribed: str, anchors: List[ScoredAnchor], references: Dict[str, str]) -> List[GapSequence]:
+         """Find gaps between anchor sequences in the transcribed text."""
+         cache_key = self._get_cache_key(transcribed, references)
+         cache_path = self.cache_dir / f"gaps_{cache_key}.json"
+
+         # Try to load from cache
+         if cached_data := self._load_from_cache(cache_path):
+             self.logger.info("Loading gaps from cache")
+             return [GapSequence.from_dict(gap) for gap in cached_data]
+
+         # If not in cache, perform the computation
+         self.logger.info("Cache miss - computing gaps")
+         words = self._clean_text(transcribed).split()
+         ref_texts_clean = {source: self._clean_text(text).split() for source, text in references.items()}
+         # Store original reference texts split into words
+         ref_texts_original = {source: text.split() for source, text in references.items()}
+
+         gaps = []
+         sorted_anchors = sorted(anchors, key=lambda x: x.anchor.transcription_position)
+
+         # Handle initial gap
+         if initial_gap := self._create_initial_gap(
+             words, sorted_anchors[0] if sorted_anchors else None, ref_texts_clean, ref_texts_original
+         ):
+             gaps.append(initial_gap)
+
+         # Handle gaps between anchors
+         for i in range(len(sorted_anchors) - 1):
+             if between_gap := self._create_between_gap(
+                 words, sorted_anchors[i], sorted_anchors[i + 1], ref_texts_clean, ref_texts_original
+             ):
+                 gaps.append(between_gap)
+
+         # Handle final gap
+         if sorted_anchors and (final_gap := self._create_final_gap(words, sorted_anchors[-1], ref_texts_clean, ref_texts_original)):
+             gaps.append(final_gap)
+
+         # Save to cache
+         self._save_to_cache(cache_path, [gap.to_dict() for gap in gaps])
+         return gaps
+
+     def _create_initial_gap(
+         self,
+         words: List[str],
+         first_anchor: Optional[ScoredAnchor],
+         ref_texts_clean: Dict[str, List[str]],
+         ref_texts_original: Dict[str, List[str]],
+     ) -> Optional[GapSequence]:
+         """Create gap sequence before the first anchor."""
+         if not first_anchor:
+             ref_words = {source: words for source, words in ref_texts_clean.items()}
+             ref_words_original = {source: words for source, words in ref_texts_original.items()}
+             return GapSequence(words, 0, None, None, ref_words, ref_words_original)
+
+         if first_anchor.anchor.transcription_position > 0:
+             ref_words = {}
+             ref_words_original = {}
+             for source in ref_texts_clean:
+                 end_pos = first_anchor.anchor.reference_positions.get(source)
+                 ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], None, end_pos)
+                 ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], None, end_pos)
+
+             return GapSequence(
+                 words[: first_anchor.anchor.transcription_position], 0, None, first_anchor.anchor, ref_words, ref_words_original
+             )
+         return None
+
+     def _create_between_gap(
+         self,
+         words: List[str],
+         current_anchor: ScoredAnchor,
+         next_anchor: ScoredAnchor,
+         ref_texts_clean: Dict[str, List[str]],
+         ref_texts_original: Dict[str, List[str]],
+     ) -> Optional[GapSequence]:
+         """Create gap sequence between two anchors."""
+         gap_start = current_anchor.anchor.transcription_position + current_anchor.anchor.length
+         gap_end = next_anchor.anchor.transcription_position
+
+         if gap_end > gap_start:
+             ref_words = {}
+             ref_words_original = {}
+             shared_sources = set(current_anchor.anchor.reference_positions.keys()) & set(next_anchor.anchor.reference_positions.keys())
+
+             # Check for large position differences in next_anchor
+             if len(next_anchor.anchor.reference_positions) > 1:
+                 positions = list(next_anchor.anchor.reference_positions.values())
+                 max_diff = max(positions) - min(positions)
+                 if max_diff > 20:
+                     earliest_source = min(next_anchor.anchor.reference_positions.items(), key=lambda x: x[1])[0]
+                     self.logger.warning(
+                         f"Large position difference ({max_diff} words) in next anchor. Using only earliest source: {earliest_source}"
+                     )
+                     shared_sources &= {earliest_source}
+
+             for source in shared_sources:
+                 start_pos = current_anchor.anchor.reference_positions[source] + current_anchor.anchor.length
+                 end_pos = next_anchor.anchor.reference_positions[source]
+                 ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], start_pos, end_pos)
+                 ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], start_pos, end_pos)
+
+             return GapSequence(
+                 words[gap_start:gap_end], gap_start, current_anchor.anchor, next_anchor.anchor, ref_words, ref_words_original
+             )
+         return None
+
+     def _create_final_gap(
+         self, words: List[str], last_anchor: ScoredAnchor, ref_texts_clean: Dict[str, List[str]], ref_texts_original: Dict[str, List[str]]
+     ) -> Optional[GapSequence]:
+         """Create gap sequence after the last anchor."""
+         last_pos = last_anchor.anchor.transcription_position + last_anchor.anchor.length
+         if last_pos < len(words):
+             ref_words = {}
+             ref_words_original = {}
+             for source in ref_texts_clean:
+                 if source in last_anchor.anchor.reference_positions:
+                     start_pos = last_anchor.anchor.reference_positions[source] + last_anchor.anchor.length
+                     ref_words[source] = self._get_reference_words(source, ref_texts_clean[source], start_pos, None)
+                     ref_words_original[source] = self._get_reference_words(source, ref_texts_original[source], start_pos, None)
+
+             return GapSequence(words[last_pos:], last_pos, last_anchor.anchor, None, ref_words, ref_words_original)
+         return None
+
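For orientation, here is a minimal usage sketch of the AnchorSequenceFinder class added in this file. It is based only on the signatures visible in the diff above (__init__, find_anchors, find_gaps); the cache directory, source names, and lyric strings are made-up placeholders, not data from the package.

# Hypothetical usage sketch; all inputs below are placeholders, not package data.
import logging

from lyrics_transcriber.correction.anchor_sequence import AnchorSequenceFinder

logging.basicConfig(level=logging.INFO)

finder = AnchorSequenceFinder(
    cache_dir="/tmp/lyrics_cache",  # anchor/gap results are cached here as JSON, keyed by an MD5 of the inputs
    min_sequence_length=3,          # shortest n-gram that can become an anchor
    min_sources=1,                  # how many reference sources must contain the n-gram
)

transcribed = "you are my sunshine my only sunshine"  # placeholder transcription text
references = {
    "genius": "You are my sunshine, my only sunshine",   # placeholder reference lyrics
    "spotify": "You are my sunshine my only sunshine",
}

# Anchors: n-grams shared by the transcription and enough reference sources,
# scored with PhraseAnalyzer and de-overlapped in parallel worker processes.
anchors = finder.find_anchors(transcribed, references)

# Gaps: the transcribed word ranges before, between, and after those anchors,
# paired with the corresponding reference words for later correction handling.
gaps = finder.find_gaps(transcribed, anchors, references)

Both calls write their results to JSON files under cache_dir, so repeated runs on the same transcription and references should hit the cache instead of recomputing.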