karaoke-gen 0.57.0__py3-none-any.whl → 0.71.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. karaoke_gen/audio_fetcher.py +461 -0
  2. karaoke_gen/audio_processor.py +407 -30
  3. karaoke_gen/config.py +62 -113
  4. karaoke_gen/file_handler.py +32 -59
  5. karaoke_gen/karaoke_finalise/karaoke_finalise.py +148 -67
  6. karaoke_gen/karaoke_gen.py +270 -61
  7. karaoke_gen/lyrics_processor.py +13 -1
  8. karaoke_gen/metadata.py +78 -73
  9. karaoke_gen/pipeline/__init__.py +87 -0
  10. karaoke_gen/pipeline/base.py +215 -0
  11. karaoke_gen/pipeline/context.py +230 -0
  12. karaoke_gen/pipeline/executors/__init__.py +21 -0
  13. karaoke_gen/pipeline/executors/local.py +159 -0
  14. karaoke_gen/pipeline/executors/remote.py +257 -0
  15. karaoke_gen/pipeline/stages/__init__.py +27 -0
  16. karaoke_gen/pipeline/stages/finalize.py +202 -0
  17. karaoke_gen/pipeline/stages/render.py +165 -0
  18. karaoke_gen/pipeline/stages/screens.py +139 -0
  19. karaoke_gen/pipeline/stages/separation.py +191 -0
  20. karaoke_gen/pipeline/stages/transcription.py +191 -0
  21. karaoke_gen/style_loader.py +531 -0
  22. karaoke_gen/utils/bulk_cli.py +6 -0
  23. karaoke_gen/utils/cli_args.py +424 -0
  24. karaoke_gen/utils/gen_cli.py +26 -261
  25. karaoke_gen/utils/remote_cli.py +1815 -0
  26. karaoke_gen/video_background_processor.py +351 -0
  27. karaoke_gen-0.71.23.dist-info/METADATA +610 -0
  28. karaoke_gen-0.71.23.dist-info/RECORD +275 -0
  29. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.23.dist-info}/WHEEL +1 -1
  30. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.23.dist-info}/entry_points.txt +1 -0
  31. lyrics_transcriber/__init__.py +10 -0
  32. lyrics_transcriber/cli/__init__.py +0 -0
  33. lyrics_transcriber/cli/cli_main.py +285 -0
  34. lyrics_transcriber/core/__init__.py +0 -0
  35. lyrics_transcriber/core/config.py +50 -0
  36. lyrics_transcriber/core/controller.py +520 -0
  37. lyrics_transcriber/correction/__init__.py +0 -0
  38. lyrics_transcriber/correction/agentic/__init__.py +9 -0
  39. lyrics_transcriber/correction/agentic/adapter.py +71 -0
  40. lyrics_transcriber/correction/agentic/agent.py +313 -0
  41. lyrics_transcriber/correction/agentic/feedback/aggregator.py +12 -0
  42. lyrics_transcriber/correction/agentic/feedback/collector.py +17 -0
  43. lyrics_transcriber/correction/agentic/feedback/retention.py +24 -0
  44. lyrics_transcriber/correction/agentic/feedback/store.py +76 -0
  45. lyrics_transcriber/correction/agentic/handlers/__init__.py +24 -0
  46. lyrics_transcriber/correction/agentic/handlers/ambiguous.py +44 -0
  47. lyrics_transcriber/correction/agentic/handlers/background_vocals.py +68 -0
  48. lyrics_transcriber/correction/agentic/handlers/base.py +51 -0
  49. lyrics_transcriber/correction/agentic/handlers/complex_multi_error.py +46 -0
  50. lyrics_transcriber/correction/agentic/handlers/extra_words.py +74 -0
  51. lyrics_transcriber/correction/agentic/handlers/no_error.py +42 -0
  52. lyrics_transcriber/correction/agentic/handlers/punctuation.py +44 -0
  53. lyrics_transcriber/correction/agentic/handlers/registry.py +60 -0
  54. lyrics_transcriber/correction/agentic/handlers/repeated_section.py +44 -0
  55. lyrics_transcriber/correction/agentic/handlers/sound_alike.py +126 -0
  56. lyrics_transcriber/correction/agentic/models/__init__.py +5 -0
  57. lyrics_transcriber/correction/agentic/models/ai_correction.py +31 -0
  58. lyrics_transcriber/correction/agentic/models/correction_session.py +30 -0
  59. lyrics_transcriber/correction/agentic/models/enums.py +38 -0
  60. lyrics_transcriber/correction/agentic/models/human_feedback.py +30 -0
  61. lyrics_transcriber/correction/agentic/models/learning_data.py +26 -0
  62. lyrics_transcriber/correction/agentic/models/observability_metrics.py +28 -0
  63. lyrics_transcriber/correction/agentic/models/schemas.py +46 -0
  64. lyrics_transcriber/correction/agentic/models/utils.py +19 -0
  65. lyrics_transcriber/correction/agentic/observability/__init__.py +5 -0
  66. lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +35 -0
  67. lyrics_transcriber/correction/agentic/observability/metrics.py +46 -0
  68. lyrics_transcriber/correction/agentic/observability/performance.py +19 -0
  69. lyrics_transcriber/correction/agentic/prompts/__init__.py +2 -0
  70. lyrics_transcriber/correction/agentic/prompts/classifier.py +227 -0
  71. lyrics_transcriber/correction/agentic/providers/__init__.py +6 -0
  72. lyrics_transcriber/correction/agentic/providers/base.py +36 -0
  73. lyrics_transcriber/correction/agentic/providers/circuit_breaker.py +145 -0
  74. lyrics_transcriber/correction/agentic/providers/config.py +73 -0
  75. lyrics_transcriber/correction/agentic/providers/constants.py +24 -0
  76. lyrics_transcriber/correction/agentic/providers/health.py +28 -0
  77. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +212 -0
  78. lyrics_transcriber/correction/agentic/providers/model_factory.py +209 -0
  79. lyrics_transcriber/correction/agentic/providers/response_cache.py +218 -0
  80. lyrics_transcriber/correction/agentic/providers/response_parser.py +111 -0
  81. lyrics_transcriber/correction/agentic/providers/retry_executor.py +127 -0
  82. lyrics_transcriber/correction/agentic/router.py +35 -0
  83. lyrics_transcriber/correction/agentic/workflows/__init__.py +5 -0
  84. lyrics_transcriber/correction/agentic/workflows/consensus_workflow.py +24 -0
  85. lyrics_transcriber/correction/agentic/workflows/correction_graph.py +59 -0
  86. lyrics_transcriber/correction/agentic/workflows/feedback_workflow.py +24 -0
  87. lyrics_transcriber/correction/anchor_sequence.py +1043 -0
  88. lyrics_transcriber/correction/corrector.py +760 -0
  89. lyrics_transcriber/correction/feedback/__init__.py +2 -0
  90. lyrics_transcriber/correction/feedback/schemas.py +107 -0
  91. lyrics_transcriber/correction/feedback/store.py +236 -0
  92. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  93. lyrics_transcriber/correction/handlers/base.py +52 -0
  94. lyrics_transcriber/correction/handlers/extend_anchor.py +149 -0
  95. lyrics_transcriber/correction/handlers/levenshtein.py +189 -0
  96. lyrics_transcriber/correction/handlers/llm.py +293 -0
  97. lyrics_transcriber/correction/handlers/llm_providers.py +60 -0
  98. lyrics_transcriber/correction/handlers/no_space_punct_match.py +154 -0
  99. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +85 -0
  100. lyrics_transcriber/correction/handlers/repeat.py +88 -0
  101. lyrics_transcriber/correction/handlers/sound_alike.py +259 -0
  102. lyrics_transcriber/correction/handlers/syllables_match.py +252 -0
  103. lyrics_transcriber/correction/handlers/word_count_match.py +80 -0
  104. lyrics_transcriber/correction/handlers/word_operations.py +187 -0
  105. lyrics_transcriber/correction/operations.py +352 -0
  106. lyrics_transcriber/correction/phrase_analyzer.py +435 -0
  107. lyrics_transcriber/correction/text_utils.py +30 -0
  108. lyrics_transcriber/frontend/.gitignore +23 -0
  109. lyrics_transcriber/frontend/.yarn/releases/yarn-4.7.0.cjs +935 -0
  110. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  111. lyrics_transcriber/frontend/README.md +50 -0
  112. lyrics_transcriber/frontend/REPLACE_ALL_FUNCTIONALITY.md +210 -0
  113. lyrics_transcriber/frontend/__init__.py +25 -0
  114. lyrics_transcriber/frontend/eslint.config.js +28 -0
  115. lyrics_transcriber/frontend/index.html +18 -0
  116. lyrics_transcriber/frontend/package.json +42 -0
  117. lyrics_transcriber/frontend/public/android-chrome-192x192.png +0 -0
  118. lyrics_transcriber/frontend/public/android-chrome-512x512.png +0 -0
  119. lyrics_transcriber/frontend/public/apple-touch-icon.png +0 -0
  120. lyrics_transcriber/frontend/public/favicon-16x16.png +0 -0
  121. lyrics_transcriber/frontend/public/favicon-32x32.png +0 -0
  122. lyrics_transcriber/frontend/public/favicon.ico +0 -0
  123. lyrics_transcriber/frontend/public/nomad-karaoke-logo.png +0 -0
  124. lyrics_transcriber/frontend/src/App.tsx +212 -0
  125. lyrics_transcriber/frontend/src/api.ts +239 -0
  126. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +77 -0
  127. lyrics_transcriber/frontend/src/components/AddLyricsModal.tsx +114 -0
  128. lyrics_transcriber/frontend/src/components/AgenticCorrectionMetrics.tsx +204 -0
  129. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +180 -0
  130. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +167 -0
  131. lyrics_transcriber/frontend/src/components/CorrectionAnnotationModal.tsx +359 -0
  132. lyrics_transcriber/frontend/src/components/CorrectionDetailCard.tsx +281 -0
  133. lyrics_transcriber/frontend/src/components/CorrectionMetrics.tsx +162 -0
  134. lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +257 -0
  135. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +68 -0
  136. lyrics_transcriber/frontend/src/components/EditModal.tsx +702 -0
  137. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +496 -0
  138. lyrics_transcriber/frontend/src/components/EditWordList.tsx +379 -0
  139. lyrics_transcriber/frontend/src/components/FileUpload.tsx +77 -0
  140. lyrics_transcriber/frontend/src/components/FindReplaceModal.tsx +467 -0
  141. lyrics_transcriber/frontend/src/components/Header.tsx +387 -0
  142. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +1373 -0
  143. lyrics_transcriber/frontend/src/components/MetricsDashboard.tsx +51 -0
  144. lyrics_transcriber/frontend/src/components/ModeSelector.tsx +67 -0
  145. lyrics_transcriber/frontend/src/components/ModelSelector.tsx +23 -0
  146. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +144 -0
  147. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +268 -0
  148. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +688 -0
  149. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +354 -0
  150. lyrics_transcriber/frontend/src/components/SegmentDetailsModal.tsx +64 -0
  151. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +376 -0
  152. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +131 -0
  153. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +256 -0
  154. lyrics_transcriber/frontend/src/components/WordDivider.tsx +187 -0
  155. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +379 -0
  156. lyrics_transcriber/frontend/src/components/shared/components/SourceSelector.tsx +56 -0
  157. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +87 -0
  158. lyrics_transcriber/frontend/src/components/shared/constants.ts +20 -0
  159. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +180 -0
  160. lyrics_transcriber/frontend/src/components/shared/styles.ts +13 -0
  161. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  162. lyrics_transcriber/frontend/src/components/shared/types.ts +129 -0
  163. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +177 -0
  164. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  165. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +75 -0
  166. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +360 -0
  167. lyrics_transcriber/frontend/src/components/shared/utils/timingUtils.ts +110 -0
  168. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  169. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +435 -0
  170. lyrics_transcriber/frontend/src/main.tsx +17 -0
  171. lyrics_transcriber/frontend/src/theme.ts +177 -0
  172. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  173. lyrics_transcriber/frontend/src/types.js +2 -0
  174. lyrics_transcriber/frontend/src/types.ts +199 -0
  175. lyrics_transcriber/frontend/src/validation.ts +132 -0
  176. lyrics_transcriber/frontend/src/vite-env.d.ts +1 -0
  177. lyrics_transcriber/frontend/tsconfig.app.json +26 -0
  178. lyrics_transcriber/frontend/tsconfig.json +25 -0
  179. lyrics_transcriber/frontend/tsconfig.node.json +23 -0
  180. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -0
  181. lyrics_transcriber/frontend/update_version.js +11 -0
  182. lyrics_transcriber/frontend/vite.config.d.ts +2 -0
  183. lyrics_transcriber/frontend/vite.config.js +10 -0
  184. lyrics_transcriber/frontend/vite.config.ts +11 -0
  185. lyrics_transcriber/frontend/web_assets/android-chrome-192x192.png +0 -0
  186. lyrics_transcriber/frontend/web_assets/android-chrome-512x512.png +0 -0
  187. lyrics_transcriber/frontend/web_assets/apple-touch-icon.png +0 -0
  188. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js +42039 -0
  189. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +1 -0
  190. lyrics_transcriber/frontend/web_assets/favicon-16x16.png +0 -0
  191. lyrics_transcriber/frontend/web_assets/favicon-32x32.png +0 -0
  192. lyrics_transcriber/frontend/web_assets/favicon.ico +0 -0
  193. lyrics_transcriber/frontend/web_assets/index.html +18 -0
  194. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.png +0 -0
  195. lyrics_transcriber/frontend/yarn.lock +3752 -0
  196. lyrics_transcriber/lyrics/__init__.py +0 -0
  197. lyrics_transcriber/lyrics/base_lyrics_provider.py +211 -0
  198. lyrics_transcriber/lyrics/file_provider.py +95 -0
  199. lyrics_transcriber/lyrics/genius.py +384 -0
  200. lyrics_transcriber/lyrics/lrclib.py +231 -0
  201. lyrics_transcriber/lyrics/musixmatch.py +156 -0
  202. lyrics_transcriber/lyrics/spotify.py +290 -0
  203. lyrics_transcriber/lyrics/user_input_provider.py +44 -0
  204. lyrics_transcriber/output/__init__.py +0 -0
  205. lyrics_transcriber/output/ass/__init__.py +21 -0
  206. lyrics_transcriber/output/ass/ass.py +2088 -0
  207. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  208. lyrics_transcriber/output/ass/config.py +180 -0
  209. lyrics_transcriber/output/ass/constants.py +23 -0
  210. lyrics_transcriber/output/ass/event.py +94 -0
  211. lyrics_transcriber/output/ass/formatters.py +132 -0
  212. lyrics_transcriber/output/ass/lyrics_line.py +265 -0
  213. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  214. lyrics_transcriber/output/ass/section_detector.py +89 -0
  215. lyrics_transcriber/output/ass/section_screen.py +106 -0
  216. lyrics_transcriber/output/ass/style.py +187 -0
  217. lyrics_transcriber/output/cdg.py +619 -0
  218. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  219. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  220. lyrics_transcriber/output/cdgmaker/composer.py +2260 -0
  221. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  222. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  223. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  224. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  225. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  226. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  227. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  228. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  229. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  230. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  231. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  232. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  233. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  234. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  235. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  236. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  237. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  238. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  239. lyrics_transcriber/output/countdown_processor.py +267 -0
  240. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  241. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  242. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  243. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  244. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  245. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  246. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  247. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  248. lyrics_transcriber/output/generator.py +257 -0
  249. lyrics_transcriber/output/lrc_to_cdg.py +61 -0
  250. lyrics_transcriber/output/lyrics_file.py +102 -0
  251. lyrics_transcriber/output/plain_text.py +96 -0
  252. lyrics_transcriber/output/segment_resizer.py +431 -0
  253. lyrics_transcriber/output/subtitles.py +397 -0
  254. lyrics_transcriber/output/video.py +544 -0
  255. lyrics_transcriber/review/__init__.py +0 -0
  256. lyrics_transcriber/review/server.py +676 -0
  257. lyrics_transcriber/storage/__init__.py +0 -0
  258. lyrics_transcriber/storage/dropbox.py +225 -0
  259. lyrics_transcriber/transcribers/__init__.py +0 -0
  260. lyrics_transcriber/transcribers/audioshake.py +290 -0
  261. lyrics_transcriber/transcribers/base_transcriber.py +157 -0
  262. lyrics_transcriber/transcribers/whisper.py +330 -0
  263. lyrics_transcriber/types.py +648 -0
  264. lyrics_transcriber/utils/__init__.py +0 -0
  265. lyrics_transcriber/utils/word_utils.py +27 -0
  266. karaoke_gen-0.57.0.dist-info/METADATA +0 -167
  267. karaoke_gen-0.57.0.dist-info/RECORD +0 -23
  268. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.23.dist-info/licenses}/LICENSE +0 -0
lyrics_transcriber/correction/anchor_sequence.py
@@ -0,0 +1,1043 @@
+ import threading
+ import time
+ from typing import Any, Dict, List, Optional, Tuple, Union
+ import logging
+ from tqdm import tqdm
+ from multiprocessing import Pool, cpu_count
+ from functools import partial
+ from pathlib import Path
+ import json
+ import hashlib
+
+ from lyrics_transcriber.types import LyricsData, PhraseScore, PhraseType, AnchorSequence, GapSequence, ScoredAnchor, TranscriptionResult, Word
+ from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer
+ from lyrics_transcriber.correction.text_utils import clean_text
+ from lyrics_transcriber.utils.word_utils import WordUtils
+
+
+ class AnchorSequenceTimeoutError(Exception):
+     """Raised when anchor sequence computation exceeds timeout."""
+     pass
+
+
+ class AnchorSequenceFinder:
+     """Identifies and manages anchor sequences between transcribed and reference lyrics."""
+
+     def __init__(
+         self,
+         cache_dir: Union[str, Path],
+         min_sequence_length: int = 3,
+         min_sources: int = 1,
+         timeout_seconds: int = 600,  # 10 minutes default timeout
+         max_iterations_per_ngram: int = 1000,  # Maximum iterations for while loop
+         progress_check_interval: int = 50,  # Check progress every N iterations
+         logger: Optional[logging.Logger] = None,
+     ):
+         self.min_sequence_length = min_sequence_length
+         self.min_sources = min_sources
+         self.timeout_seconds = timeout_seconds
+         self.max_iterations_per_ngram = max_iterations_per_ngram
+         self.progress_check_interval = progress_check_interval
+         self.logger = logger or logging.getLogger(__name__)
+         self.phrase_analyzer = PhraseAnalyzer(logger=self.logger)
+         self.used_positions = {}
+
+         # Initialize cache directory
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.logger.info(f"Initialized AnchorSequenceFinder with cache dir: {self.cache_dir}, timeout: {timeout_seconds}s")
+
+     def _check_timeout(self, start_time: float, operation_name: str = "operation"):
+         """Check if timeout has occurred and raise exception if so."""
+         if self.timeout_seconds > 0:
+             elapsed_time = time.time() - start_time
+             if elapsed_time > self.timeout_seconds:
+                 raise AnchorSequenceTimeoutError(f"{operation_name} exceeded {self.timeout_seconds} seconds (elapsed: {elapsed_time:.1f}s)")
+
+     def _clean_text(self, text: str) -> str:
+         """Clean text by removing punctuation and normalizing whitespace."""
+         # self.logger.debug(f"_clean_text called with text length: {len(text)}")
+         return clean_text(text)
+
+     def _find_ngrams(self, words: List[str], n: int) -> List[Tuple[List[str], int]]:
+         """Generate n-grams with their starting positions."""
+         # self.logger.debug(f"_find_ngrams called with {len(words)} words, n={n}")
+         return [(words[i : i + n], i) for i in range(len(words) - n + 1)]
+
+     def _find_matching_sources(self, ngram: List[str], references: Dict[str, List[str]], n: int) -> Dict[str, int]:
+         """Find which sources contain the given n-gram and at what positions."""
+         # self.logger.debug(f"_find_matching_sources called for ngram: '{' '.join(ngram)}'")
+         matches = {}
+         all_positions = {source: [] for source in references}
+
+         # First, find all positions in each source
+         for source, words in references.items():
+             for i in range(len(words) - n + 1):
+                 if words[i : i + n] == ngram:
+                     all_positions[source].append(i)
+
+         # Then, try to find an unused position for each source
+         for source, positions in all_positions.items():
+             used = self.used_positions.get(source, set())
+             # Try each position in order
+             for pos in positions:
+                 if pos not in used:
+                     matches[source] = pos
+                     break
+
+         return matches
+
+     def _filter_used_positions(self, matches: Dict[str, int]) -> Dict[str, int]:
+         """Filter out positions that have already been used.
+
+         Args:
+             matches: Dict mapping source IDs to positions
+
+         Returns:
+             Dict mapping source IDs to unused positions
+         """
+         self.logger.debug(f"_filter_used_positions called with {len(matches)} matches")
+         return {source: pos for source, pos in matches.items() if pos not in self.used_positions.get(source, set())}
+
+     def _create_anchor(
+         self, ngram: List[str], trans_pos: int, matching_sources: Dict[str, int], total_sources: int
+     ) -> Optional[AnchorSequence]:
+         """Create an anchor sequence if it meets the minimum sources requirement."""
+         self.logger.debug(f"_create_anchor called for ngram: '{' '.join(ngram)}' at position {trans_pos}")
+         if len(matching_sources) >= self.min_sources:
+             confidence = len(matching_sources) / total_sources
+             # Use new API to avoid setting _words field
+             anchor = AnchorSequence(
+                 id=WordUtils.generate_id(),
+                 transcribed_word_ids=[WordUtils.generate_id() for _ in ngram],
+                 transcription_position=trans_pos,
+                 reference_positions=matching_sources,
+                 reference_word_ids={source: [WordUtils.generate_id() for _ in ngram]
+                                     for source in matching_sources.keys()},
+                 confidence=confidence
+             )
+             self.logger.debug(f"Found anchor sequence: '{' '.join(ngram)}' (confidence: {confidence:.2f})")
+             return anchor
+         return None
+
+     def _get_cache_key(self, transcribed: str, references: Dict[str, LyricsData], transcription_result: TranscriptionResult) -> str:
+         """Generate a unique cache key for the input combination."""
+         # Create a string that uniquely identifies the inputs, including word IDs
+         ref_texts = []
+         for source, lyrics in sorted(references.items()):
+             # Include both text and ID for each word to ensure cache uniqueness
+             words_with_ids = [f"{w.text}:{w.id}" for s in lyrics.segments for w in s.words]
+             ref_texts.append(f"{source}:{','.join(words_with_ids)}")
+
+         # Also include transcription word IDs to ensure complete matching
+         trans_words_with_ids = [f"{w.text}:{w.id}" for s in transcription_result.result.segments for w in s.words]
+
+         input_str = f"{transcribed}|" f"{','.join(trans_words_with_ids)}|" f"{','.join(ref_texts)}"
+         return hashlib.md5(input_str.encode()).hexdigest()
+
+     def _save_to_cache(self, cache_path: Path, anchors: List[ScoredAnchor]) -> None:
+         """Save results to cache file."""
+         self.logger.debug(f"Saving to cache: {cache_path}")
+         # Convert to dictionary format that matches the expected loading format
+         cache_data = [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in anchors]
+         with open(cache_path, "w") as f:
+             json.dump(cache_data, f, indent=2)
+
+     def _load_from_cache(self, cache_path: Path) -> Optional[List[ScoredAnchor]]:
+         """Load results from cache if available."""
+         try:
+             self.logger.debug(f"Attempting to load from cache: {cache_path}")
+             with open(cache_path, "r") as f:
+                 cached_data = json.load(f)
+
+             self.logger.info("Loading anchors from cache")
+             try:
+                 # Log the raw dictionary data instead of the object
+                 # if cached_data:
+                 # self.logger.debug(f"Cached data structure: {json.dumps(cached_data[0], indent=2)}")
+
+                 # Convert cached data back to ScoredAnchor objects
+                 anchors = []
+                 for data in cached_data:
+                     if "anchor" not in data or "phrase_score" not in data:
+                         raise KeyError("Missing required keys: anchor, phrase_score")
+
+                     anchor = AnchorSequence.from_dict(data["anchor"])
+                     phrase_score = PhraseScore.from_dict(data["phrase_score"])
+                     anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+
+                 return anchors
+
+             except KeyError as e:
+                 self.logger.error(f"Cache format mismatch. Missing key: {e}")
+                 # Log the raw data for debugging
+                 if cached_data:
+                     self.logger.error(f"First cached anchor data: {json.dumps(cached_data[0], indent=2)}")
+                     self.logger.error("Expected keys: anchor, phrase_score")
+                 self.logger.warning(f"Cache format mismatch: {e}. Recomputing.")
+                 return None
+
+         except (FileNotFoundError, json.JSONDecodeError) as e:
+             self.logger.debug(f"Cache miss or invalid cache file: {e}")
+             return None
+         except Exception as e:
+             self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
+             return None
+
+     def _process_ngram_length(
+         self,
+         n: int,
+         trans_words: List[str],
+         all_words: List[Word],
+         ref_texts_clean: Dict[str, List[str]],
+         ref_words: Dict[str, List[Word]],
+         min_sources: int,
+     ) -> List[AnchorSequence]:
+         """Process a single n-gram length to find matching sequences with timeout and early termination."""
+         self.logger.debug(f"🔍 N-GRAM {n}: Starting processing with {len(trans_words)} transcription words")
+         self.logger.debug(f"🔍 N-GRAM {n}: Reference sources: {list(ref_texts_clean.keys())}")
+         self.logger.debug(f"🔍 N-GRAM {n}: Max iterations limit: {self.max_iterations_per_ngram}")
+
+         candidate_anchors = []
+         used_positions = {source: set() for source in ref_texts_clean.keys()}
+         used_trans_positions = set()
+
+         iteration_count = 0
+         last_progress_check = 0
+         last_anchor_count = 0
+         stagnation_count = 0
+
+         self.logger.debug(f"🔍 N-GRAM {n}: Processing n-gram length {n} with max {self.max_iterations_per_ngram} iterations")
+
+         # Generate n-grams from transcribed text once
+         trans_ngrams = self._find_ngrams(trans_words, n)
+         self.logger.debug(f"🔍 N-GRAM {n}: Generated {len(trans_ngrams)} n-grams for processing")
+
+         # Process all n-grams efficiently in multiple passes
+         found_new_match = True
+         while found_new_match and iteration_count < self.max_iterations_per_ngram:
+             found_new_match = False
+             iteration_count += 1
+             anchors_found_this_iteration = 0
+
+             # Log every 10th iteration to track progress
+             if iteration_count % 10 == 0:
+                 self.logger.debug(f"🔍 N-GRAM {n}: Iteration {iteration_count}, anchors found: {len(candidate_anchors)}")
+
+             # Check for progress stagnation every N iterations
+             if iteration_count - last_progress_check >= self.progress_check_interval:
+                 current_anchor_count = len(candidate_anchors)
+                 if current_anchor_count == last_anchor_count:
+                     stagnation_count += 1
+                     self.logger.debug(f"🔍 N-GRAM {n}: Stagnation check {stagnation_count}/3 at iteration {iteration_count}")
+                     if stagnation_count >= 3:  # No progress for 3 consecutive checks
+                         self.logger.debug(f"🔍 N-GRAM {n}: ⏹️ Early termination due to stagnation after {iteration_count} iterations")
+                         break
+                 else:
+                     stagnation_count = 0  # Reset stagnation counter
+
+                 last_anchor_count = current_anchor_count
+                 last_progress_check = iteration_count
+
+                 self.logger.debug(f"🔍 N-GRAM {n}: iteration {iteration_count}, anchors: {current_anchor_count}, stagnation: {stagnation_count}")
+
+             # Process all n-grams in this iteration
+             for ngram, trans_pos in trans_ngrams:
+                 # Skip if we've already used this transcription position
+                 if trans_pos in used_trans_positions:
+                     continue
+
+                 # Get the actual words from the transcription at this position
+                 actual_words = [w.text.lower().strip('.,?!"\n') for w in all_words[trans_pos : trans_pos + n]]
+                 ngram_words = [w.lower() for w in ngram]
+
+                 if actual_words != ngram_words:
+                     self.logger.error(f"🔍 N-GRAM {n}: ❌ Mismatch between ngram and actual words at position {trans_pos}:")
+                     self.logger.error(f"🔍 N-GRAM {n}: Ngram words: {ngram_words}")
+                     self.logger.error(f"🔍 N-GRAM {n}: Actual words: {actual_words}")
+                     self.logger.error(f"🔍 N-GRAM {n}: Full trans_words: {trans_words}")
+                     self.logger.error(f"🔍 N-GRAM {n}: Full all_words: {[w.text for w in all_words]}")
+                     raise AssertionError(
+                         f"Ngram words don't match actual words at position {trans_pos}. "
+                         f"This should never happen as trans_words should be derived from all_words."
+                     )
+
+                 matches = self._find_matching_sources(ngram, ref_texts_clean, n)
+                 if len(matches) >= min_sources:
+                     # Log successful match
+                     if len(candidate_anchors) < 5:  # Only log first few matches to avoid spam
+                         self.logger.debug(f"🔍 N-GRAM {n}: ✅ Found match: '{' '.join(ngram)}' at pos {trans_pos} with {len(matches)} sources")
+
+                     # Get Word IDs for transcribed words
+                     transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+                     # Get Word IDs for reference words
+                     reference_word_ids = {source: [w.id for w in ref_words[source][pos : pos + n]] for source, pos in matches.items()}
+
+                     # Mark positions as used
+                     for source, pos in matches.items():
+                         used_positions[source].add(pos)
+                     used_trans_positions.add(trans_pos)
+
+                     anchor = AnchorSequence(
+                         id=WordUtils.generate_id(),
+                         transcribed_word_ids=transcribed_word_ids,
+                         transcription_position=trans_pos,
+                         reference_positions=matches,
+                         reference_word_ids=reference_word_ids,
+                         confidence=len(matches) / len(ref_texts_clean),
+                     )
+                     candidate_anchors.append(anchor)
+                     anchors_found_this_iteration += 1
+                     found_new_match = True
+
+                     # For efficiency, if we have very low iteration limits, find one match per iteration
+                     if self.max_iterations_per_ngram <= 10:
+                         break
+
+             # Log progress for this iteration
+             if anchors_found_this_iteration > 0:
+                 self.logger.debug(f"🔍 N-GRAM {n}: Found {anchors_found_this_iteration} anchors in iteration {iteration_count}")
+
+             # Early termination if we've found enough anchors or processed all positions
+             if len(used_trans_positions) >= len(trans_ngrams) or len(candidate_anchors) >= len(trans_ngrams):
+                 self.logger.debug(f"🔍 N-GRAM {n}: ⏹️ Early termination - processed all positions after {iteration_count} iterations")
+                 break
+
+         if iteration_count >= self.max_iterations_per_ngram:
+             self.logger.debug(f"🔍 N-GRAM {n}: ⏰ Processing terminated after reaching max iterations ({self.max_iterations_per_ngram})")
+
+         self.logger.debug(f"🔍 N-GRAM {n}: ✅ Completed processing after {iteration_count} iterations, found {len(candidate_anchors)} anchors")
+         return candidate_anchors
+
+     def find_anchors(
+         self,
+         transcribed: str,
+         references: Dict[str, LyricsData],
+         transcription_result: TranscriptionResult,
+     ) -> List[ScoredAnchor]:
+         """Find anchor sequences that appear in both transcription and references with timeout protection."""
+         start_time = time.time()
+
+         try:
+             self.logger.info(f"🔍 ANCHOR SEARCH: Starting anchor search (timeout: {self.timeout_seconds}s, sources: {list(references.keys())})")
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Transcribed text length: {len(transcribed)}")
+
+             cache_key = self._get_cache_key(transcribed, references, transcription_result)
+             cache_path = self.cache_dir / f"anchors_{cache_key}.json"
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Cache key: {cache_key}")
+
+             # Try to load from cache
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Checking cache at {cache_path}")
+             if cached_data := self._load_from_cache(cache_path):
+                 self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit - loading anchors from cache")
+                 try:
+                     # Convert cached_data to dictionary before logging
+                     if cached_data:
+                         first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                     return cached_data
+                 except Exception as e:
+                     self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Error loading cache: {type(e).__name__}: {e}")
+                     if cached_data:
+                         try:
+                             first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                             self.logger.error(f"First cached anchor data: {json.dumps(first_anchor, indent=2)}")
+                         except:
+                             self.logger.error("Could not serialize first cached anchor for logging")
+
+             # If not in cache or cache format invalid, perform the computation
+             self.logger.info(f"🔍 ANCHOR SEARCH: Cache miss - computing anchors")
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Finding anchor sequences for transcription with length {len(transcribed)}")
+
+             # Check timeout before starting computation
+             self._check_timeout(start_time, "anchor computation initialization")
+             self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - initialization")
+
+             # Get all words from transcription
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Extracting words from transcription result...")
+             all_words = []
+             for segment in transcription_result.result.segments:
+                 all_words.extend(segment.words)
+             self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Extracted {len(all_words)} words from transcription")
+
+             # Clean and split texts
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Cleaning transcription words...")
+             trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]
+             self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Cleaned {len(trans_words)} transcription words")
+
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Processing reference sources...")
+             ref_texts_clean = {
+                 source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+                 for source, lyrics in references.items()
+             }
+             ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+             for source, words in ref_texts_clean.items():
+                 self.logger.debug(f"🔍 ANCHOR SEARCH: Reference '{source}': {len(words)} words")
+
+             # Check timeout after preprocessing
+             self._check_timeout(start_time, "anchor computation preprocessing")
+             self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - preprocessing")
+
+             # Filter out very short reference sources for n-gram length calculation
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Calculating n-gram lengths...")
+             valid_ref_lengths = [
+                 len(words) for words in ref_texts_clean.values()
+                 if len(words) >= self.min_sequence_length
+             ]
+
+             if not valid_ref_lengths:
+                 self.logger.warning("🔍 ANCHOR SEARCH: ❌ No reference sources long enough for anchor detection")
+                 return []
+
+             # Calculate max length using only valid reference sources
+             max_length = min(len(trans_words), min(valid_ref_lengths))
+             n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
+             self.logger.debug(f"🔍 ANCHOR SEARCH: N-gram lengths to process: {list(n_gram_lengths)} (max_length: {max_length})")
+
+             # Process n-gram lengths in parallel with timeout
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Setting up parallel processing...")
+             process_length_partial = partial(
+                 self._process_ngram_length,
+                 trans_words=trans_words,
+                 all_words=all_words,  # Pass the Word objects
+                 ref_texts_clean=ref_texts_clean,
+                 ref_words=ref_words,
+                 min_sources=self.min_sources,
+             )
+
+             # Process n-gram lengths in parallel with timeout
+             candidate_anchors = []
+             pool_timeout = max(60, self.timeout_seconds // 2) if self.timeout_seconds > 0 else 300  # Use half the total timeout for pool operations
+
+             # Check timeout before parallel processing
+             self._check_timeout(start_time, "parallel processing start")
+             self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - about to start parallel processing")
+
+             pool = None
+             try:
+                 num_processes = max(cpu_count() - 1, 1)
+                 self.logger.info(f"🔍 ANCHOR SEARCH: 🚀 Starting parallel processing ({num_processes} processes, {len(n_gram_lengths)} n-gram lengths)")
+                 pool = Pool(processes=num_processes)
+                 self.logger.debug(f"🔍 ANCHOR SEARCH: Pool created successfully")
+                 results = []
+
+                 # Submit all jobs first
+                 self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting {len(n_gram_lengths)} n-gram processing jobs...")
+                 async_results = []
+                 for i, n in enumerate(n_gram_lengths):
+                     self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting job {i+1}/{len(n_gram_lengths)} for n-gram length {n}")
+                     async_result = pool.apply_async(process_length_partial, (n,))
+                     async_results.append(async_result)
+
+                 self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ All {len(async_results)} jobs submitted")
+
+                 # Collect results with individual timeouts
+                 batch_results = []
+                 batch_size = 10
+
+                 for i, async_result in enumerate(async_results):
+                     n_gram_length = n_gram_lengths[i]
+                     try:
+                         # Check remaining time for pool timeout (more lenient than overall timeout)
+                         elapsed_time = time.time() - start_time
+                         remaining_time = max(10, self.timeout_seconds - elapsed_time) if self.timeout_seconds > 0 else pool_timeout
+
+                         self.logger.debug(f"🔍 ANCHOR SEARCH: Remaining time for n-gram {n_gram_length}: {remaining_time}s")
+
+                         # Use a more lenient timeout for individual results to allow fallback
+                         individual_timeout = min(pool_timeout, remaining_time) if self.timeout_seconds > 0 else pool_timeout
+
+                         result = async_result.get(timeout=individual_timeout)
+                         results.append(result)
+
+                         # Batch logging - collect info for batched logging
+                         batch_results.append((n_gram_length, len(result)))
+
+                         # Log progress every batch_size results or on the last result (at DEBUG level)
+                         if (i + 1) % batch_size == 0 or (i + 1) == len(async_results):
+                             total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
+                             n_gram_ranges = [str(ng) for ng, _ in batch_results]
+                             range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
+                             self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} ({i+1-len(batch_results)+1}-{i+1}/{len(async_results)}) - found {total_anchors_in_batch} anchors")
+                             batch_results = []  # Reset batch
+
+                     except Exception as e:
+                         self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n_gram_length} failed or timed out: {str(e)}")
+                         results.append([])  # Add empty result to maintain order
+
+                         # Add failed result to batch for logging
+                         batch_results.append((n_gram_length, 0))
+
+                         # If we're running short on time, trigger fallback early
+                         if self.timeout_seconds > 0 and (time.time() - start_time) > (self.timeout_seconds * 0.8):
+                             self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Approaching timeout limit, triggering early fallback")
+                             # Raise exception to trigger fallback to sequential processing
+                             raise Exception("Parallel processing timeout, triggering fallback")
+
+                 self.logger.debug(f"🔍 ANCHOR SEARCH: Parallel processing completed, combining results...")
+                 for anchors in results:
+                     candidate_anchors.extend(anchors)
+
+                 # Explicitly cleanup pool to avoid hangs in containerized environments
+                 self.logger.debug(f"🔍 ANCHOR SEARCH: 🧹 Cleaning up pool...")
+                 pool.close()
+                 pool.terminate()
+                 self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Pool cleanup completed")
+
+             except AnchorSequenceTimeoutError:
+                 self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing timed out")
+                 # Re-raise timeout errors
+                 raise
+             except Exception as e:
+                 self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing failed: {str(e)}")
+                 # Fall back to sequential processing with timeout checks
+                 self.logger.info("🔍 ANCHOR SEARCH: Falling back to sequential processing")
+                 for n in n_gram_lengths:
+                     try:
+                         # Check timeout more leniently during sequential processing
+                         if self.timeout_seconds > 0:
+                             elapsed_time = time.time() - start_time
+                             # Allow more time for sequential processing (up to 2x the original timeout)
+                             if elapsed_time > (self.timeout_seconds * 2.0):
+                                 self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Sequential processing timeout for n-gram {n}")
+                                 break
+
+                         self.logger.debug(f"🔍 ANCHOR SEARCH: Sequential processing n-gram length {n}")
+
+                         anchors = self._process_ngram_length(
+                             n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+                         )
+                         candidate_anchors.extend(anchors)
+                         self.logger.debug(f"🔍 ANCHOR SEARCH: Sequential n-gram {n} completed - found {len(anchors)} anchors")
+                     except Exception as e:
+                         self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Sequential processing failed for n-gram length {n}: {str(e)}")
+                         continue
+             finally:
+                 # Always ensure pool is cleaned up to avoid hangs in containerized environments
+                 if pool is not None:
+                     try:
+                         self.logger.debug(f"🔍 ANCHOR SEARCH: 🧹 Final pool cleanup...")
+                         pool.terminate()
+                         pool.join(timeout=5)  # Wait max 5 seconds for workers to terminate
+                         self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Final pool cleanup completed")
+                     except Exception as cleanup_error:
+                         self.logger.debug(f"🔍 ANCHOR SEARCH: ⚠️ Pool cleanup error (ignored): {cleanup_error}")
+
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
+
+             # Check timeout before expensive filtering operation
+             self._check_timeout(start_time, "overlap filtering start")
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Starting overlap filtering...")
+
+             filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Filtering completed - {len(filtered_anchors)} final anchors")
+
+             # Save to cache
+             self.logger.debug(f"🔍 ANCHOR SEARCH: Saving results to cache...")
+             self._save_to_cache(cache_path, filtered_anchors)
+
+             total_time = time.time() - start_time
+             self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Completed in {total_time:.1f}s - found {len(filtered_anchors)} anchors")
+
+             return filtered_anchors
+
+         except AnchorSequenceTimeoutError:
+             elapsed_time = time.time() - start_time
+             self.logger.error(f"🔍 ANCHOR SEARCH: ⏰ TIMEOUT after {elapsed_time:.1f}s (limit: {self.timeout_seconds}s)")
+             raise
+         except Exception as e:
+             elapsed_time = time.time() - start_time
+             self.logger.error(f"🔍 ANCHOR SEARCH: ❌ FAILED after {elapsed_time:.1f}s: {str(e)}")
+             self.logger.error(f"🔍 ANCHOR SEARCH: Exception type: {type(e).__name__}")
+             import traceback
+             self.logger.error(f"🔍 ANCHOR SEARCH: Traceback: {traceback.format_exc()}")
+             raise
+         finally:
+             # No cleanup needed for time-based timeout checks
+             pass
+
+     def _score_sequence(self, words: List[str], context: str) -> PhraseScore:
+         """Score a sequence based on its phrase quality"""
+         self.logger.debug(f"_score_sequence called for: '{' '.join(words)}'")
+         return self.phrase_analyzer.score_phrase(words, context)
+
+     def _get_sequence_priority(self, scored_anchor: ScoredAnchor) -> Tuple[float, float, float, float, int]:
+         """Get priority tuple for sorting sequences.
+
+         Returns tuple of:
+         - Number of sources matched (higher is better)
+         - Length bonus (length * 0.2) to favor longer sequences
+         - Break score (higher is better)
+         - Total score (higher is better)
+         - Negative position (earlier is better)
+
+         Position bonus: Add 1.0 to total score for sequences at position 0
+         """
+         # self.logger.debug(f"_get_sequence_priority called for anchor: '{scored_anchor.anchor.text}'")
+         position_bonus = 1.0 if scored_anchor.anchor.transcription_position == 0 else 0.0
+         length_bonus = len(scored_anchor.anchor.transcribed_word_ids) * 0.2  # Changed from words to transcribed_word_ids
+
+         return (
+             len(scored_anchor.anchor.reference_positions),  # More sources is better
+             length_bonus,  # Longer sequences preferred
+             scored_anchor.phrase_score.natural_break_score,  # Better breaks preferred
+             scored_anchor.phrase_score.total_score + position_bonus,  # Add bonus for position 0
+             -scored_anchor.anchor.transcription_position,  # Earlier positions preferred
+         )
+
+     def _sequences_overlap(self, seq1: AnchorSequence, seq2: AnchorSequence) -> bool:
+         """Check if two sequences overlap in either transcription or references.
+
+         Args:
+             seq1: First sequence
+             seq2: Second sequence
+
+         Returns:
+             True if sequences overlap in transcription or share any reference positions
+         """
+         # Check transcription overlap
+         seq1_trans_range = range(
+             seq1.transcription_position, seq1.transcription_position + len(seq1.transcribed_word_ids)
+         )  # Changed from words
+         seq2_trans_range = range(
+             seq2.transcription_position, seq2.transcription_position + len(seq2.transcribed_word_ids)
+         )  # Changed from words
+         trans_overlap = bool(set(seq1_trans_range) & set(seq2_trans_range))
+
+         # Check reference overlap - only consider positions in shared sources
+         shared_sources = set(seq1.reference_positions.keys()) & set(seq2.reference_positions.keys())
+         ref_overlap = any(seq1.reference_positions[source] == seq2.reference_positions[source] for source in shared_sources)
+
+         return trans_overlap or ref_overlap
+
+     def _remove_overlapping_sequences(
+         self,
+         anchors: List[AnchorSequence],
+         context: str,
+         transcription_result: TranscriptionResult,
+     ) -> List[ScoredAnchor]:
+         """Remove overlapping sequences using phrase analysis with timeout protection."""
+         self.logger.debug(f"🔍 FILTERING: Starting overlap removal for {len(anchors)} anchors")
+
+         if not anchors:
+             self.logger.debug(f"🔍 FILTERING: No anchors to process")
+             return []
+
+         self.logger.debug(f"🔍 FILTERING: Scoring {len(anchors)} anchors")
+
+         # Create word map for scoring
+         word_map = {w.id: w for s in transcription_result.result.segments for w in s.words}
+         self.logger.debug(f"🔍 FILTERING: Created word map with {len(word_map)} words")
+
+         # Add word map to each anchor for scoring
+         for i, anchor in enumerate(anchors):
+             # For backwards compatibility, only add transcribed_words if all IDs exist in word_map
+             try:
+                 anchor.transcribed_words = [word_map[word_id] for word_id in anchor.transcribed_word_ids]
+                 # Also set _words for backwards compatibility with text display
+                 anchor._words = [word_map[word_id].text for word_id in anchor.transcribed_word_ids]
+             except KeyError:
+                 # This can happen in tests using backwards compatible constructors
+                 # Create dummy Word objects with the text from _words if available
+                 if hasattr(anchor, '_words') and anchor._words is not None:
+                     from lyrics_transcriber.types import Word
+                     from lyrics_transcriber.utils.word_utils import WordUtils
+                     anchor.transcribed_words = [
+                         Word(
+                             id=word_id,
+                             text=text,
+                             start_time=i * 1.0,
+                             end_time=(i + 1) * 1.0,
+                             confidence=1.0
+                         )
+                         for i, (word_id, text) in enumerate(zip(anchor.transcribed_word_ids, anchor._words))
+                     ]
+                 else:
+                     # Create generic word objects for scoring
+                     from lyrics_transcriber.types import Word
+                     anchor.transcribed_words = [
+                         Word(
+                             id=word_id,
+                             text=f"word_{i}",
+                             start_time=i * 1.0,
+                             end_time=(i + 1) * 1.0,
+                             confidence=1.0
+                         )
+                         for i, word_id in enumerate(anchor.transcribed_word_ids)
+                     ]
+
+         start_time = time.time()
+
+         # Try different pool sizes with timeout
+         num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
+         self.logger.info(f"🔍 FILTERING: Using {num_processes} processes for scoring")
+
+         # Create a partial function with the context parameter fixed
+         score_anchor_partial = partial(self._score_anchor_static, context=context)
+
+         # Use multiprocessing to score anchors in parallel with timeout
+         scored_anchors = []
+         pool_timeout = 300  # 5 minutes for scoring phase
+
+         scoring_pool = None
+         try:
+             self.logger.debug(f"🔍 FILTERING: Starting parallel scoring with timeout {pool_timeout}s")
+             scoring_pool = Pool(processes=num_processes)
+             # Submit scoring jobs with timeout
+             async_results = []
+             batch_size = 50
+
+             self.logger.debug(f"🔍 FILTERING: Splitting {len(anchors)} anchors into batches of {batch_size}")
+             for i in range(0, len(anchors), batch_size):
+                 batch = anchors[i:i + batch_size]
+                 async_result = scoring_pool.apply_async(self._score_batch_static, (batch, context))
+                 async_results.append(async_result)
+
+             self.logger.debug(f"🔍 FILTERING: Submitted {len(async_results)} scoring batches")
+
+             # Collect results with timeout
+             for i, async_result in enumerate(async_results):
+                 try:
+                     self.logger.debug(f"🔍 FILTERING: Collecting batch {i+1}/{len(async_results)}")
+                     batch_results = async_result.get(timeout=pool_timeout)
+                     scored_anchors.extend(batch_results)
+                     self.logger.debug(f"🔍 FILTERING: Completed scoring batch {i+1}/{len(async_results)}")
+                 except Exception as e:
+                     self.logger.warning(f"🔍 FILTERING: ⚠️ Scoring batch {i+1} failed or timed out: {str(e)}")
+                     # Add basic scores for failed batch
+                     start_idx = i * batch_size
+                     end_idx = min((i + 1) * batch_size, len(anchors))
+                     for j in range(start_idx, end_idx):
+                         if j < len(anchors):
+                             try:
+                                 phrase_score = PhraseScore(
+                                     total_score=1.0,
+                                     natural_break_score=1.0,
+                                     phrase_type=PhraseType.COMPLETE
+                                 )
+                                 scored_anchors.append(ScoredAnchor(anchor=anchors[j], phrase_score=phrase_score))
+                             except:
+                                 continue
+
+             # Explicitly cleanup pool to avoid hangs in containerized environments
+             self.logger.debug(f"🔍 FILTERING: Cleaning up scoring pool...")
+             scoring_pool.close()
+             scoring_pool.terminate()
+             self.logger.debug(f"🔍 FILTERING: Scoring pool cleanup completed")
+
+         except Exception as e:
+             self.logger.warning(f"🔍 FILTERING: ❌ Parallel scoring failed: {str(e)}, falling back to basic scoring")
+             # Fall back to basic scoring
+             for anchor in anchors:
+                 try:
+                     phrase_score = PhraseScore(
+                         total_score=1.0,
+                         natural_break_score=1.0,
+                         phrase_type=PhraseType.COMPLETE
+                     )
+                     scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+                 except:
+                     continue
+         finally:
+             # Always ensure scoring pool is cleaned up to avoid hangs
+             if scoring_pool is not None:
+                 try:
+                     self.logger.debug(f"🔍 FILTERING: Final scoring pool cleanup...")
+                     scoring_pool.terminate()
+                     scoring_pool.join(timeout=5)  # Wait max 5 seconds for workers to terminate
+                     self.logger.debug(f"🔍 FILTERING: Final scoring pool cleanup completed")
+                 except Exception as cleanup_error:
+                     self.logger.debug(f"🔍 FILTERING: Scoring pool cleanup error (ignored): {cleanup_error}")
+
+         parallel_time = time.time() - start_time
+         self.logger.debug(f"🔍 FILTERING: Parallel scoring completed in {parallel_time:.2f}s, scored {len(scored_anchors)} anchors")
+
+         # Sort and filter as before
+         self.logger.debug(f"🔍 FILTERING: Sorting anchors by priority...")
+         scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
+         self.logger.debug(f"🔍 FILTERING: Sorting completed")
+
+         self.logger.debug(f"🔍 FILTERING: Filtering {len(scored_anchors)} overlapping sequences")
+         filtered_scored = []
+
+         for i, scored_anchor in enumerate(scored_anchors):
+             # Check timeout every 100 anchors using our timeout mechanism (more lenient)
+             if i % 100 == 0 and i > 0:
+                 # Only check timeout if we're significantly over the limit
+                 if self.timeout_seconds > 0:
+                     elapsed_time = time.time() - start_time
+                     # Use a more lenient timeout for filtering (allow 50% more time)
+                     if elapsed_time > (self.timeout_seconds * 1.5):
+                         self.logger.warning(f"🔍 FILTERING: ⏰ Filtering timed out, returning {len(filtered_scored)} anchors out of {len(scored_anchors)}")
+                         break
+
+                 self.logger.debug(f"🔍 FILTERING: Progress: {i}/{len(scored_anchors)} processed, {len(filtered_scored)} kept")
+
+             overlaps = False
+             for existing in filtered_scored:
+                 if self._sequences_overlap(scored_anchor.anchor, existing.anchor):
+                     overlaps = True
+                     break
+
+             if not overlaps:
+                 filtered_scored.append(scored_anchor)
+
+         self.logger.debug(f"🔍 FILTERING: Filtering completed - kept {len(filtered_scored)} non-overlapping anchors out of {len(scored_anchors)}")
+         return filtered_scored
+
+     @staticmethod
+     def _score_anchor_static(anchor: AnchorSequence, context: str) -> ScoredAnchor:
+         """Static version of _score_anchor for multiprocessing compatibility."""
+         # Create analyzer only once per process
+         if not hasattr(AnchorSequenceFinder._score_anchor_static, "_phrase_analyzer"):
+             AnchorSequenceFinder._score_anchor_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
+
+         # Get the words from the transcribed word IDs
+         # We need to pass in the actual words for scoring
+         words = [w.text for w in anchor.transcribed_words]  # This needs to be passed in
+
+         phrase_score = AnchorSequenceFinder._score_anchor_static._phrase_analyzer.score_phrase(words, context)
+         return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
+
+     @staticmethod
+     def _score_batch_static(anchors: List[AnchorSequence], context: str) -> List[ScoredAnchor]:
+         """Score a batch of anchors for better timeout handling."""
+         # Create analyzer only once per process
+         if not hasattr(AnchorSequenceFinder._score_batch_static, "_phrase_analyzer"):
+             AnchorSequenceFinder._score_batch_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
+
+         scored_anchors = []
+         for anchor in anchors:
+             try:
+                 words = [w.text for w in anchor.transcribed_words]
+                 phrase_score = AnchorSequenceFinder._score_batch_static._phrase_analyzer.score_phrase(words, context)
+                 scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+             except Exception:
+                 # Add basic score for failed anchor
+                 phrase_score = PhraseScore(
+                     total_score=1.0,
+                     natural_break_score=1.0,
+                     phrase_type=PhraseType.COMPLETE
+                 )
+                 scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+
+         return scored_anchors
+
827
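Because these scorers are static (and therefore picklable), they can be handed to worker processes. The following is a minimal sketch of how batches might be fanned out with concurrent.futures; the score_in_batches helper, the batch size, and the pool size are illustrative assumptions rather than the call site actually used by this module:

    from concurrent.futures import ProcessPoolExecutor

    def score_in_batches(anchors, context, batch_size=50, max_workers=4):
        # Splitting into batches means a slow or failing batch costs at most
        # batch_size anchors, which is what makes timeout handling easier.
        batches = [anchors[i:i + batch_size] for i in range(0, len(anchors), batch_size)]
        scored = []
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(AnchorSequenceFinder._score_batch_static, batch, context) for batch in batches]
            for future in futures:
                scored.extend(future.result())  # each batch returns a List[ScoredAnchor]
        return scored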
+    def _get_reference_words(self, source: str, ref_words: List[str], start_pos: Optional[int], end_pos: Optional[int]) -> List[str]:
+        """Get words from reference text between two positions.
+
+        Args:
+            source: Reference source identifier
+            ref_words: List of words from the reference text
+            start_pos: Starting position (None for beginning)
+            end_pos: Ending position (None for end)
+
+        Returns:
+            List of words between the positions
+        """
+        if start_pos is None:
+            start_pos = 0
+        if end_pos is None:
+            end_pos = len(ref_words)
+        return ref_words[start_pos:end_pos]
+
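Both bounds fall back to the ends of the list when None, and the source argument does not affect the slicing. A quick illustration, with made-up words and an assumed AnchorSequenceFinder instance named finder:

    words = ["let", "it", "be", "tonight"]
    finder._get_reference_words("genius", words, None, 2)     # ["let", "it"]
    finder._get_reference_words("genius", words, 2, None)     # ["be", "tonight"]
    finder._get_reference_words("genius", words, None, None)  # the whole list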
+    def find_gaps(
+        self,
+        transcribed: str,
+        anchors: List[ScoredAnchor],
+        references: Dict[str, LyricsData],
+        transcription_result: TranscriptionResult,
+    ) -> List[GapSequence]:
+        """Find gaps between anchor sequences in the transcribed text."""
+        # Get all words from transcription
+        all_words = []
+        for segment in transcription_result.result.segments:
+            all_words.extend(segment.words)
+
+        # Clean and split reference texts
+        ref_texts_clean = {
+            source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+            for source, lyrics in references.items()
+        }
+        ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+        # Create gaps with Word IDs
+        gaps = []
+        sorted_anchors = sorted(anchors, key=lambda x: x.anchor.transcription_position)
+
+        # Handle initial gap
+        if sorted_anchors:
+            first_anchor = sorted_anchors[0].anchor
+            first_anchor_pos = first_anchor.transcription_position
+            if first_anchor_pos > 0:
+                gap_word_ids = [w.id for w in all_words[:first_anchor_pos]]
+                if gap := self._create_initial_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=0,
+                    following_anchor_id=first_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    following_anchor=first_anchor,
+                ):
+                    gaps.append(gap)
+
+        # Handle gaps between anchors
+        for i in range(len(sorted_anchors) - 1):
+            current_anchor = sorted_anchors[i].anchor
+            next_anchor = sorted_anchors[i + 1].anchor
+            gap_start = current_anchor.transcription_position + len(current_anchor.transcribed_word_ids)
+            gap_end = next_anchor.transcription_position
+
+            if gap_end > gap_start:
+                gap_word_ids = [w.id for w in all_words[gap_start:gap_end]]
+                if between_gap := self._create_between_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=gap_start,
+                    preceding_anchor_id=current_anchor.id,
+                    following_anchor_id=next_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    preceding_anchor=current_anchor,
+                    following_anchor=next_anchor,
+                ):
+                    gaps.append(between_gap)
+
+        # Handle final gap
+        if sorted_anchors:
+            last_anchor = sorted_anchors[-1].anchor
+            last_pos = last_anchor.transcription_position + len(last_anchor.transcribed_word_ids)
+            if last_pos < len(all_words):
+                gap_word_ids = [w.id for w in all_words[last_pos:]]
+                if final_gap := self._create_final_gap(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=gap_word_ids,
+                    transcription_position=last_pos,
+                    preceding_anchor_id=last_anchor.id,
+                    ref_texts_clean=ref_texts_clean,
+                    ref_words=ref_words,
+                    preceding_anchor=last_anchor,
+                ):
+                    gaps.append(final_gap)
+
+        return gaps
+
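A worked example of the positional bookkeeping, assuming 0-based word positions as the slicing above implies: for a 14-word transcription with anchors covering positions 3-5 and 8-10, find_gaps produces an initial gap over words 0-2 (transcription_position 0), a between gap over words 6-7 (transcription_position 6, since 3 + 3 = 6), and a final gap over words 11-13 (transcription_position 11). An anchor that starts at word 0, anchors that sit back-to-back, or an anchor that runs through the last word produce no corresponding gap.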
+    def _create_initial_gap(
+        self,
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        following_anchor_id: str,
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        following_anchor: AnchorSequence,
+    ) -> Optional[GapSequence]:
+        """Create gap sequence before the first anchor.
+
+        The gap includes all reference words from the start of each reference
+        up to the position where the following anchor starts in that reference.
+        """
+        if transcription_position > 0:
+            # Get reference word IDs for the gap
+            reference_word_ids = {}
+            for source, words in ref_words.items():
+                if source in ref_texts_clean:
+                    # Get the position where the following anchor starts in this source
+                    if source in following_anchor.reference_positions:
+                        end_pos = following_anchor.reference_positions[source]
+                        # Include all words from start up to the anchor
+                        reference_word_ids[source] = [w.id for w in words[:end_pos]]
+                    else:
+                        # If this source doesn't contain the following anchor,
+                        # we can't determine the gap content for it
+                        reference_word_ids[source] = []
+
+            return GapSequence(
+                id=id,
+                transcribed_word_ids=transcribed_word_ids,
+                transcription_position=transcription_position,
+                preceding_anchor_id=None,
+                following_anchor_id=following_anchor_id,
+                reference_word_ids=reference_word_ids,
+            )
+        return None
+
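For a source that does contain the following anchor, the initial gap simply takes every reference word before that anchor's start: with hypothetical numbers, reference_positions["genius"] == 4 yields the IDs of words[:4], while a source missing the anchor contributes an empty list. Note that find_gaps above calls this helper with transcription_position=0, so the transcription_position > 0 guard can only be satisfied by a caller that passes a non-zero position.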
+    def _create_between_gap(
+        self,
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        preceding_anchor_id: str,
+        following_anchor_id: str,
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        preceding_anchor: AnchorSequence,
+        following_anchor: AnchorSequence,
+    ) -> Optional[GapSequence]:
+        """Create gap sequence between two anchors.
+
+        For each reference source, the gap includes all words between the end of the
+        preceding anchor and the start of the following anchor in that source.
+        """
+        # Get reference word IDs for the gap
+        reference_word_ids = {}
+        for source, words in ref_words.items():
+            if source in ref_texts_clean:
+                # Only process sources that contain both anchors
+                if source in preceding_anchor.reference_positions and source in following_anchor.reference_positions:
+                    start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                    end_pos = following_anchor.reference_positions[source]
+                    # Include all words between the anchors
+                    reference_word_ids[source] = [w.id for w in words[start_pos:end_pos]]
+                else:
+                    # If this source doesn't contain both anchors,
+                    # we can't determine the gap content for it
+                    reference_word_ids[source] = []
+
+        return GapSequence(
+            id=id,
+            transcribed_word_ids=transcribed_word_ids,
+            transcription_position=transcription_position,
+            preceding_anchor_id=preceding_anchor_id,
+            following_anchor_id=following_anchor_id,
+            reference_word_ids=reference_word_ids,
+        )
+
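The reference-side arithmetic mirrors the transcription-side arithmetic in find_gaps. With hypothetical numbers: if the preceding anchor sits at reference_positions["genius"] == 2 and spans 3 reference words, start_pos is 5; if the following anchor sits at position 9, the between gap's reference_word_ids["genius"] holds the IDs of words[5:9]. The final-gap helper below performs the same calculation with the slice left open-ended (words[start_pos:]).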
+    def _create_final_gap(
+        self,
+        id: str,
+        transcribed_word_ids: List[str],
+        transcription_position: int,
+        preceding_anchor_id: str,
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        preceding_anchor: AnchorSequence,
+    ) -> Optional[GapSequence]:
+        """Create gap sequence after the last anchor.
+
+        For each reference source, includes all words from the end of the
+        preceding anchor to the end of that reference.
+        """
+        # Get reference word IDs for the gap
+        reference_word_ids = {}
+        for source, words in ref_words.items():
+            if source in ref_texts_clean:
+                if source in preceding_anchor.reference_positions:
+                    start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                    # Include all words from end of last anchor to end of reference
+                    reference_word_ids[source] = [w.id for w in words[start_pos:]]
+                else:
+                    # If this source doesn't contain the preceding anchor,
+                    # we can't determine the gap content for it
+                    reference_word_ids[source] = []
+
+        return GapSequence(
+            id=id,
+            transcribed_word_ids=transcribed_word_ids,
+            transcription_position=transcription_position,
+            preceding_anchor_id=preceding_anchor_id,
+            following_anchor_id=None,
+            reference_word_ids=reference_word_ids,
+        )