karaoke-gen 0.75.54 (karaoke_gen-0.75.54-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of karaoke-gen might be problematic.

Files changed (287)
  1. karaoke_gen/__init__.py +38 -0
  2. karaoke_gen/audio_fetcher.py +1614 -0
  3. karaoke_gen/audio_processor.py +790 -0
  4. karaoke_gen/config.py +83 -0
  5. karaoke_gen/file_handler.py +387 -0
  6. karaoke_gen/instrumental_review/__init__.py +45 -0
  7. karaoke_gen/instrumental_review/analyzer.py +408 -0
  8. karaoke_gen/instrumental_review/editor.py +322 -0
  9. karaoke_gen/instrumental_review/models.py +171 -0
  10. karaoke_gen/instrumental_review/server.py +475 -0
  11. karaoke_gen/instrumental_review/static/index.html +1529 -0
  12. karaoke_gen/instrumental_review/waveform.py +409 -0
  13. karaoke_gen/karaoke_finalise/__init__.py +1 -0
  14. karaoke_gen/karaoke_finalise/karaoke_finalise.py +1833 -0
  15. karaoke_gen/karaoke_gen.py +1026 -0
  16. karaoke_gen/lyrics_processor.py +474 -0
  17. karaoke_gen/metadata.py +160 -0
  18. karaoke_gen/pipeline/__init__.py +87 -0
  19. karaoke_gen/pipeline/base.py +215 -0
  20. karaoke_gen/pipeline/context.py +230 -0
  21. karaoke_gen/pipeline/executors/__init__.py +21 -0
  22. karaoke_gen/pipeline/executors/local.py +159 -0
  23. karaoke_gen/pipeline/executors/remote.py +257 -0
  24. karaoke_gen/pipeline/stages/__init__.py +27 -0
  25. karaoke_gen/pipeline/stages/finalize.py +202 -0
  26. karaoke_gen/pipeline/stages/render.py +165 -0
  27. karaoke_gen/pipeline/stages/screens.py +139 -0
  28. karaoke_gen/pipeline/stages/separation.py +191 -0
  29. karaoke_gen/pipeline/stages/transcription.py +191 -0
  30. karaoke_gen/resources/AvenirNext-Bold.ttf +0 -0
  31. karaoke_gen/resources/Montserrat-Bold.ttf +0 -0
  32. karaoke_gen/resources/Oswald-Bold.ttf +0 -0
  33. karaoke_gen/resources/Oswald-SemiBold.ttf +0 -0
  34. karaoke_gen/resources/Zurich_Cn_BT_Bold.ttf +0 -0
  35. karaoke_gen/style_loader.py +531 -0
  36. karaoke_gen/utils/__init__.py +18 -0
  37. karaoke_gen/utils/bulk_cli.py +492 -0
  38. karaoke_gen/utils/cli_args.py +432 -0
  39. karaoke_gen/utils/gen_cli.py +978 -0
  40. karaoke_gen/utils/remote_cli.py +3268 -0
  41. karaoke_gen/video_background_processor.py +351 -0
  42. karaoke_gen/video_generator.py +424 -0
  43. karaoke_gen-0.75.54.dist-info/METADATA +718 -0
  44. karaoke_gen-0.75.54.dist-info/RECORD +287 -0
  45. karaoke_gen-0.75.54.dist-info/WHEEL +4 -0
  46. karaoke_gen-0.75.54.dist-info/entry_points.txt +5 -0
  47. karaoke_gen-0.75.54.dist-info/licenses/LICENSE +21 -0
  48. lyrics_transcriber/__init__.py +10 -0
  49. lyrics_transcriber/cli/__init__.py +0 -0
  50. lyrics_transcriber/cli/cli_main.py +285 -0
  51. lyrics_transcriber/core/__init__.py +0 -0
  52. lyrics_transcriber/core/config.py +50 -0
  53. lyrics_transcriber/core/controller.py +594 -0
  54. lyrics_transcriber/correction/__init__.py +0 -0
  55. lyrics_transcriber/correction/agentic/__init__.py +9 -0
  56. lyrics_transcriber/correction/agentic/adapter.py +71 -0
  57. lyrics_transcriber/correction/agentic/agent.py +313 -0
  58. lyrics_transcriber/correction/agentic/feedback/aggregator.py +12 -0
  59. lyrics_transcriber/correction/agentic/feedback/collector.py +17 -0
  60. lyrics_transcriber/correction/agentic/feedback/retention.py +24 -0
  61. lyrics_transcriber/correction/agentic/feedback/store.py +76 -0
  62. lyrics_transcriber/correction/agentic/handlers/__init__.py +24 -0
  63. lyrics_transcriber/correction/agentic/handlers/ambiguous.py +44 -0
  64. lyrics_transcriber/correction/agentic/handlers/background_vocals.py +68 -0
  65. lyrics_transcriber/correction/agentic/handlers/base.py +51 -0
  66. lyrics_transcriber/correction/agentic/handlers/complex_multi_error.py +46 -0
  67. lyrics_transcriber/correction/agentic/handlers/extra_words.py +74 -0
  68. lyrics_transcriber/correction/agentic/handlers/no_error.py +42 -0
  69. lyrics_transcriber/correction/agentic/handlers/punctuation.py +44 -0
  70. lyrics_transcriber/correction/agentic/handlers/registry.py +60 -0
  71. lyrics_transcriber/correction/agentic/handlers/repeated_section.py +44 -0
  72. lyrics_transcriber/correction/agentic/handlers/sound_alike.py +126 -0
  73. lyrics_transcriber/correction/agentic/models/__init__.py +5 -0
  74. lyrics_transcriber/correction/agentic/models/ai_correction.py +31 -0
  75. lyrics_transcriber/correction/agentic/models/correction_session.py +30 -0
  76. lyrics_transcriber/correction/agentic/models/enums.py +38 -0
  77. lyrics_transcriber/correction/agentic/models/human_feedback.py +30 -0
  78. lyrics_transcriber/correction/agentic/models/learning_data.py +26 -0
  79. lyrics_transcriber/correction/agentic/models/observability_metrics.py +28 -0
  80. lyrics_transcriber/correction/agentic/models/schemas.py +46 -0
  81. lyrics_transcriber/correction/agentic/models/utils.py +19 -0
  82. lyrics_transcriber/correction/agentic/observability/__init__.py +5 -0
  83. lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +35 -0
  84. lyrics_transcriber/correction/agentic/observability/metrics.py +46 -0
  85. lyrics_transcriber/correction/agentic/observability/performance.py +19 -0
  86. lyrics_transcriber/correction/agentic/prompts/__init__.py +2 -0
  87. lyrics_transcriber/correction/agentic/prompts/classifier.py +227 -0
  88. lyrics_transcriber/correction/agentic/providers/__init__.py +6 -0
  89. lyrics_transcriber/correction/agentic/providers/base.py +36 -0
  90. lyrics_transcriber/correction/agentic/providers/circuit_breaker.py +145 -0
  91. lyrics_transcriber/correction/agentic/providers/config.py +73 -0
  92. lyrics_transcriber/correction/agentic/providers/constants.py +24 -0
  93. lyrics_transcriber/correction/agentic/providers/health.py +28 -0
  94. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +212 -0
  95. lyrics_transcriber/correction/agentic/providers/model_factory.py +209 -0
  96. lyrics_transcriber/correction/agentic/providers/response_cache.py +218 -0
  97. lyrics_transcriber/correction/agentic/providers/response_parser.py +111 -0
  98. lyrics_transcriber/correction/agentic/providers/retry_executor.py +127 -0
  99. lyrics_transcriber/correction/agentic/router.py +35 -0
  100. lyrics_transcriber/correction/agentic/workflows/__init__.py +5 -0
  101. lyrics_transcriber/correction/agentic/workflows/consensus_workflow.py +24 -0
  102. lyrics_transcriber/correction/agentic/workflows/correction_graph.py +59 -0
  103. lyrics_transcriber/correction/agentic/workflows/feedback_workflow.py +24 -0
  104. lyrics_transcriber/correction/anchor_sequence.py +919 -0
  105. lyrics_transcriber/correction/corrector.py +760 -0
  106. lyrics_transcriber/correction/feedback/__init__.py +2 -0
  107. lyrics_transcriber/correction/feedback/schemas.py +107 -0
  108. lyrics_transcriber/correction/feedback/store.py +236 -0
  109. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  110. lyrics_transcriber/correction/handlers/base.py +52 -0
  111. lyrics_transcriber/correction/handlers/extend_anchor.py +149 -0
  112. lyrics_transcriber/correction/handlers/levenshtein.py +189 -0
  113. lyrics_transcriber/correction/handlers/llm.py +293 -0
  114. lyrics_transcriber/correction/handlers/llm_providers.py +60 -0
  115. lyrics_transcriber/correction/handlers/no_space_punct_match.py +154 -0
  116. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +85 -0
  117. lyrics_transcriber/correction/handlers/repeat.py +88 -0
  118. lyrics_transcriber/correction/handlers/sound_alike.py +259 -0
  119. lyrics_transcriber/correction/handlers/syllables_match.py +252 -0
  120. lyrics_transcriber/correction/handlers/word_count_match.py +80 -0
  121. lyrics_transcriber/correction/handlers/word_operations.py +187 -0
  122. lyrics_transcriber/correction/operations.py +352 -0
  123. lyrics_transcriber/correction/phrase_analyzer.py +435 -0
  124. lyrics_transcriber/correction/text_utils.py +30 -0
  125. lyrics_transcriber/frontend/.gitignore +23 -0
  126. lyrics_transcriber/frontend/.yarn/releases/yarn-4.7.0.cjs +935 -0
  127. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  128. lyrics_transcriber/frontend/README.md +50 -0
  129. lyrics_transcriber/frontend/REPLACE_ALL_FUNCTIONALITY.md +210 -0
  130. lyrics_transcriber/frontend/__init__.py +25 -0
  131. lyrics_transcriber/frontend/eslint.config.js +28 -0
  132. lyrics_transcriber/frontend/index.html +18 -0
  133. lyrics_transcriber/frontend/package.json +42 -0
  134. lyrics_transcriber/frontend/public/android-chrome-192x192.png +0 -0
  135. lyrics_transcriber/frontend/public/android-chrome-512x512.png +0 -0
  136. lyrics_transcriber/frontend/public/apple-touch-icon.png +0 -0
  137. lyrics_transcriber/frontend/public/favicon-16x16.png +0 -0
  138. lyrics_transcriber/frontend/public/favicon-32x32.png +0 -0
  139. lyrics_transcriber/frontend/public/favicon.ico +0 -0
  140. lyrics_transcriber/frontend/public/nomad-karaoke-logo.png +0 -0
  141. lyrics_transcriber/frontend/src/App.tsx +214 -0
  142. lyrics_transcriber/frontend/src/api.ts +254 -0
  143. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +77 -0
  144. lyrics_transcriber/frontend/src/components/AddLyricsModal.tsx +114 -0
  145. lyrics_transcriber/frontend/src/components/AgenticCorrectionMetrics.tsx +204 -0
  146. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +180 -0
  147. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +167 -0
  148. lyrics_transcriber/frontend/src/components/CorrectionAnnotationModal.tsx +359 -0
  149. lyrics_transcriber/frontend/src/components/CorrectionDetailCard.tsx +281 -0
  150. lyrics_transcriber/frontend/src/components/CorrectionMetrics.tsx +162 -0
  151. lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +257 -0
  152. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +68 -0
  153. lyrics_transcriber/frontend/src/components/EditModal.tsx +702 -0
  154. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +496 -0
  155. lyrics_transcriber/frontend/src/components/EditWordList.tsx +379 -0
  156. lyrics_transcriber/frontend/src/components/FileUpload.tsx +77 -0
  157. lyrics_transcriber/frontend/src/components/FindReplaceModal.tsx +467 -0
  158. lyrics_transcriber/frontend/src/components/Header.tsx +413 -0
  159. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +1387 -0
  160. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
  161. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
  162. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
  163. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
  164. lyrics_transcriber/frontend/src/components/MetricsDashboard.tsx +51 -0
  165. lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
  166. lyrics_transcriber/frontend/src/components/ModeSelector.tsx +67 -0
  167. lyrics_transcriber/frontend/src/components/ModelSelector.tsx +23 -0
  168. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +144 -0
  169. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +268 -0
  170. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +336 -0
  171. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +354 -0
  172. lyrics_transcriber/frontend/src/components/SegmentDetailsModal.tsx +64 -0
  173. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +376 -0
  174. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +131 -0
  175. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +256 -0
  176. lyrics_transcriber/frontend/src/components/WordDivider.tsx +187 -0
  177. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +379 -0
  178. lyrics_transcriber/frontend/src/components/shared/components/SourceSelector.tsx +56 -0
  179. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +87 -0
  180. lyrics_transcriber/frontend/src/components/shared/constants.ts +20 -0
  181. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +180 -0
  182. lyrics_transcriber/frontend/src/components/shared/styles.ts +13 -0
  183. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  184. lyrics_transcriber/frontend/src/components/shared/types.ts +129 -0
  185. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +177 -0
  186. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  187. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +75 -0
  188. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +360 -0
  189. lyrics_transcriber/frontend/src/components/shared/utils/timingUtils.ts +110 -0
  190. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  191. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +435 -0
  192. lyrics_transcriber/frontend/src/main.tsx +17 -0
  193. lyrics_transcriber/frontend/src/theme.ts +177 -0
  194. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  195. lyrics_transcriber/frontend/src/types.js +2 -0
  196. lyrics_transcriber/frontend/src/types.ts +199 -0
  197. lyrics_transcriber/frontend/src/validation.ts +132 -0
  198. lyrics_transcriber/frontend/src/vite-env.d.ts +1 -0
  199. lyrics_transcriber/frontend/tsconfig.app.json +26 -0
  200. lyrics_transcriber/frontend/tsconfig.json +25 -0
  201. lyrics_transcriber/frontend/tsconfig.node.json +23 -0
  202. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -0
  203. lyrics_transcriber/frontend/update_version.js +11 -0
  204. lyrics_transcriber/frontend/vite.config.d.ts +2 -0
  205. lyrics_transcriber/frontend/vite.config.js +10 -0
  206. lyrics_transcriber/frontend/vite.config.ts +11 -0
  207. lyrics_transcriber/frontend/web_assets/android-chrome-192x192.png +0 -0
  208. lyrics_transcriber/frontend/web_assets/android-chrome-512x512.png +0 -0
  209. lyrics_transcriber/frontend/web_assets/apple-touch-icon.png +0 -0
  210. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js +43288 -0
  211. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +1 -0
  212. lyrics_transcriber/frontend/web_assets/favicon-16x16.png +0 -0
  213. lyrics_transcriber/frontend/web_assets/favicon-32x32.png +0 -0
  214. lyrics_transcriber/frontend/web_assets/favicon.ico +0 -0
  215. lyrics_transcriber/frontend/web_assets/index.html +18 -0
  216. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.png +0 -0
  217. lyrics_transcriber/frontend/yarn.lock +3752 -0
  218. lyrics_transcriber/lyrics/__init__.py +0 -0
  219. lyrics_transcriber/lyrics/base_lyrics_provider.py +211 -0
  220. lyrics_transcriber/lyrics/file_provider.py +95 -0
  221. lyrics_transcriber/lyrics/genius.py +384 -0
  222. lyrics_transcriber/lyrics/lrclib.py +231 -0
  223. lyrics_transcriber/lyrics/musixmatch.py +156 -0
  224. lyrics_transcriber/lyrics/spotify.py +290 -0
  225. lyrics_transcriber/lyrics/user_input_provider.py +44 -0
  226. lyrics_transcriber/output/__init__.py +0 -0
  227. lyrics_transcriber/output/ass/__init__.py +21 -0
  228. lyrics_transcriber/output/ass/ass.py +2088 -0
  229. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  230. lyrics_transcriber/output/ass/config.py +180 -0
  231. lyrics_transcriber/output/ass/constants.py +23 -0
  232. lyrics_transcriber/output/ass/event.py +94 -0
  233. lyrics_transcriber/output/ass/formatters.py +132 -0
  234. lyrics_transcriber/output/ass/lyrics_line.py +265 -0
  235. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  236. lyrics_transcriber/output/ass/section_detector.py +89 -0
  237. lyrics_transcriber/output/ass/section_screen.py +106 -0
  238. lyrics_transcriber/output/ass/style.py +187 -0
  239. lyrics_transcriber/output/cdg.py +619 -0
  240. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  241. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  242. lyrics_transcriber/output/cdgmaker/composer.py +2260 -0
  243. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  244. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  245. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  246. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  247. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  248. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  249. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  250. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  251. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  252. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  253. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  254. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  255. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  256. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  257. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  258. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  259. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  260. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  261. lyrics_transcriber/output/countdown_processor.py +306 -0
  262. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  263. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  264. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  265. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  266. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  267. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  268. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  269. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  270. lyrics_transcriber/output/generator.py +257 -0
  271. lyrics_transcriber/output/lrc_to_cdg.py +61 -0
  272. lyrics_transcriber/output/lyrics_file.py +102 -0
  273. lyrics_transcriber/output/plain_text.py +96 -0
  274. lyrics_transcriber/output/segment_resizer.py +431 -0
  275. lyrics_transcriber/output/subtitles.py +397 -0
  276. lyrics_transcriber/output/video.py +544 -0
  277. lyrics_transcriber/review/__init__.py +0 -0
  278. lyrics_transcriber/review/server.py +676 -0
  279. lyrics_transcriber/storage/__init__.py +0 -0
  280. lyrics_transcriber/storage/dropbox.py +225 -0
  281. lyrics_transcriber/transcribers/__init__.py +0 -0
  282. lyrics_transcriber/transcribers/audioshake.py +379 -0
  283. lyrics_transcriber/transcribers/base_transcriber.py +157 -0
  284. lyrics_transcriber/transcribers/whisper.py +330 -0
  285. lyrics_transcriber/types.py +650 -0
  286. lyrics_transcriber/utils/__init__.py +0 -0
  287. lyrics_transcriber/utils/word_utils.py +27 -0
lyrics_transcriber/correction/anchor_sequence.py
@@ -0,0 +1,919 @@
+ import threading
+ import time
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
+ import logging
+ from tqdm import tqdm
+ from functools import partial
+ from pathlib import Path
+ import json
+ import hashlib
+
+ from lyrics_transcriber.types import LyricsData, PhraseScore, PhraseType, AnchorSequence, GapSequence, ScoredAnchor, TranscriptionResult, Word
+ from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer
+ from lyrics_transcriber.correction.text_utils import clean_text
+ from lyrics_transcriber.utils.word_utils import WordUtils
+
+
+ class AnchorSequenceTimeoutError(Exception):
+     """Raised when anchor sequence computation exceeds timeout."""
+     pass
+
+
+ class AnchorSequenceFinder:
+     """Identifies and manages anchor sequences between transcribed and reference lyrics."""
+
+     def __init__(
+         self,
+         cache_dir: Union[str, Path],
+         min_sequence_length: int = 3,
+         min_sources: int = 1,
+         timeout_seconds: int = 600,  # 10 minutes default timeout
+         max_iterations_per_ngram: int = 1000,  # Maximum iterations for while loop
+         progress_check_interval: int = 50,  # Check progress every N iterations
+         logger: Optional[logging.Logger] = None,
+     ):
+         self.min_sequence_length = min_sequence_length
+         self.min_sources = min_sources
+         self.timeout_seconds = timeout_seconds
+         self.max_iterations_per_ngram = max_iterations_per_ngram
+         self.progress_check_interval = progress_check_interval
+         self.logger = logger or logging.getLogger(__name__)
+         self.phrase_analyzer = PhraseAnalyzer(logger=self.logger)
+         self.used_positions = {}
+
+         # Initialize cache directory
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self.logger.info(f"Initialized AnchorSequenceFinder with cache dir: {self.cache_dir}, timeout: {timeout_seconds}s")
+
+     def _check_timeout(self, start_time: float, operation_name: str = "operation"):
+         """Check if timeout has occurred and raise exception if so."""
+         if self.timeout_seconds > 0:
+             elapsed_time = time.time() - start_time
+             if elapsed_time > self.timeout_seconds:
+                 raise AnchorSequenceTimeoutError(f"{operation_name} exceeded {self.timeout_seconds} seconds (elapsed: {elapsed_time:.1f}s)")
+
+     def _clean_text(self, text: str) -> str:
+         """Clean text by removing punctuation and normalizing whitespace."""
+         # self.logger.debug(f"_clean_text called with text length: {len(text)}")
+         return clean_text(text)
+
+     def _find_ngrams(self, words: List[str], n: int) -> List[Tuple[List[str], int]]:
+         """Generate n-grams with their starting positions."""
+         # self.logger.debug(f"_find_ngrams called with {len(words)} words, n={n}")
+         return [(words[i : i + n], i) for i in range(len(words) - n + 1)]
+
+     def _build_ngram_index(
+         self,
+         references: Dict[str, List[str]],
+         n: int
+     ) -> Dict[Tuple[str, ...], Dict[str, List[int]]]:
+         """
+         Build a hash-based index mapping n-grams to their positions in each reference.
+
+         Args:
+             references: Dict mapping source names to lists of cleaned words
+             n: The n-gram length to index
+
+         Returns:
+             Dict mapping n-gram tuples to {source: [positions]} dicts
+         """
+         index: Dict[Tuple[str, ...], Dict[str, List[int]]] = {}
+
+         for source, words in references.items():
+             for i in range(len(words) - n + 1):
+                 ngram_tuple = tuple(words[i:i + n])
+                 if ngram_tuple not in index:
+                     index[ngram_tuple] = {}
+                 if source not in index[ngram_tuple]:
+                     index[ngram_tuple][source] = []
+                 index[ngram_tuple][source].append(i)
+
+         return index
+
+     def _find_matching_sources_indexed(
+         self,
+         ngram: List[str],
+         ngram_index: Dict[Tuple[str, ...], Dict[str, List[int]]]
+     ) -> Dict[str, int]:
+         """
+         Find which sources contain the given n-gram using pre-built index (O(1) lookup).
+
+         Args:
+             ngram: List of words to find
+             ngram_index: Pre-built index from _build_ngram_index()
+
+         Returns:
+             Dict mapping source names to first unused position
+         """
+         matches = {}
+         ngram_tuple = tuple(ngram)
+
+         # O(1) lookup in the index
+         if ngram_tuple not in ngram_index:
+             return matches
+
+         source_positions = ngram_index[ngram_tuple]
+
+         # For each source that contains this n-gram, find first unused position
+         for source, positions in source_positions.items():
+             used = self.used_positions.get(source, set())
+             for pos in positions:
+                 if pos not in used:
+                     matches[source] = pos
+                     break
+
+         return matches
+
+     def _find_matching_sources(self, ngram: List[str], references: Dict[str, List[str]], n: int) -> Dict[str, int]:
+         """Find which sources contain the given n-gram and at what positions (legacy O(n) method)."""
+         # self.logger.debug(f"_find_matching_sources called for ngram: '{' '.join(ngram)}'")
+         matches = {}
+         all_positions = {source: [] for source in references}
+
+         # First, find all positions in each source
+         for source, words in references.items():
+             for i in range(len(words) - n + 1):
+                 if words[i : i + n] == ngram:
+                     all_positions[source].append(i)
+
+         # Then, try to find an unused position for each source
+         for source, positions in all_positions.items():
+             used = self.used_positions.get(source, set())
+             # Try each position in order
+             for pos in positions:
+                 if pos not in used:
+                     matches[source] = pos
+                     break
+
+         return matches
+
+     def _filter_used_positions(self, matches: Dict[str, int]) -> Dict[str, int]:
+         """Filter out positions that have already been used.
+
+         Args:
+             matches: Dict mapping source IDs to positions
+
+         Returns:
+             Dict mapping source IDs to unused positions
+         """
+         self.logger.debug(f"_filter_used_positions called with {len(matches)} matches")
+         return {source: pos for source, pos in matches.items() if pos not in self.used_positions.get(source, set())}
+
+     def _create_anchor(
+         self, ngram: List[str], trans_pos: int, matching_sources: Dict[str, int], total_sources: int
+     ) -> Optional[AnchorSequence]:
+         """Create an anchor sequence if it meets the minimum sources requirement."""
+         self.logger.debug(f"_create_anchor called for ngram: '{' '.join(ngram)}' at position {trans_pos}")
+         if len(matching_sources) >= self.min_sources:
+             confidence = len(matching_sources) / total_sources
+             # Use new API to avoid setting _words field
+             anchor = AnchorSequence(
+                 id=WordUtils.generate_id(),
+                 transcribed_word_ids=[WordUtils.generate_id() for _ in ngram],
+                 transcription_position=trans_pos,
+                 reference_positions=matching_sources,
+                 reference_word_ids={source: [WordUtils.generate_id() for _ in ngram]
+                                     for source in matching_sources.keys()},
+                 confidence=confidence
+             )
+             self.logger.debug(f"Found anchor sequence: '{' '.join(ngram)}' (confidence: {confidence:.2f})")
+             return anchor
+         return None
+
+     def _get_cache_key(self, transcribed: str, references: Dict[str, LyricsData], transcription_result: TranscriptionResult) -> str:
+         """Generate a unique cache key for the input combination."""
+         # Create a string that uniquely identifies the inputs, including word IDs
+         ref_texts = []
+         for source, lyrics in sorted(references.items()):
+             # Include both text and ID for each word to ensure cache uniqueness
+             words_with_ids = [f"{w.text}:{w.id}" for s in lyrics.segments for w in s.words]
+             ref_texts.append(f"{source}:{','.join(words_with_ids)}")
+
+         # Also include transcription word IDs to ensure complete matching
+         trans_words_with_ids = [f"{w.text}:{w.id}" for s in transcription_result.result.segments for w in s.words]
+
+         input_str = f"{transcribed}|" f"{','.join(trans_words_with_ids)}|" f"{','.join(ref_texts)}"
+         return hashlib.md5(input_str.encode()).hexdigest()
+
+     def _save_to_cache(self, cache_path: Path, anchors: List[ScoredAnchor]) -> None:
+         """Save results to cache file."""
+         self.logger.debug(f"Saving to cache: {cache_path}")
+         # Convert to dictionary format that matches the expected loading format
+         cache_data = [{"anchor": anchor.anchor.to_dict(), "phrase_score": anchor.phrase_score.to_dict()} for anchor in anchors]
+         with open(cache_path, "w") as f:
+             json.dump(cache_data, f, indent=2)
+
+     def _load_from_cache(self, cache_path: Path) -> Optional[List[ScoredAnchor]]:
+         """Load results from cache if available."""
+         try:
+             self.logger.debug(f"Attempting to load from cache: {cache_path}")
+             with open(cache_path, "r") as f:
+                 cached_data = json.load(f)
+
+             self.logger.info("Loading anchors from cache")
+             try:
+                 # Log the raw dictionary data instead of the object
+                 # if cached_data:
+                 #     self.logger.debug(f"Cached data structure: {json.dumps(cached_data[0], indent=2)}")
+
+                 # Convert cached data back to ScoredAnchor objects
+                 anchors = []
+                 for data in cached_data:
+                     if "anchor" not in data or "phrase_score" not in data:
+                         raise KeyError("Missing required keys: anchor, phrase_score")
+
+                     anchor = AnchorSequence.from_dict(data["anchor"])
+                     phrase_score = PhraseScore.from_dict(data["phrase_score"])
+                     anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+
+                 return anchors
+
+             except KeyError as e:
+                 self.logger.error(f"Cache format mismatch. Missing key: {e}")
+                 # Log the raw data for debugging
+                 if cached_data:
+                     self.logger.error(f"First cached anchor data: {json.dumps(cached_data[0], indent=2)}")
+                 self.logger.error("Expected keys: anchor, phrase_score")
+                 self.logger.warning(f"Cache format mismatch: {e}. Recomputing.")
+                 return None
+
+         except (FileNotFoundError, json.JSONDecodeError) as e:
+             self.logger.debug(f"Cache miss or invalid cache file: {e}")
+             return None
+         except Exception as e:
+             self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
+             return None
+
+     def _process_ngram_length(
+         self,
+         n: int,
+         trans_words: List[str],
+         all_words: List[Word],
+         ref_texts_clean: Dict[str, List[str]],
+         ref_words: Dict[str, List[Word]],
+         min_sources: int,
+     ) -> List[AnchorSequence]:
+         """Process a single n-gram length to find matching sequences using hash-based index."""
+         self.logger.debug(f"🔍 N-GRAM {n}: Starting processing with {len(trans_words)} transcription words")
+
+         candidate_anchors = []
+         used_trans_positions: Set[int] = set()
+
+         # Build hash-based index for O(1) lookups
+         ngram_index = self._build_ngram_index(ref_texts_clean, n)
+         self.logger.debug(f"🔍 N-GRAM {n}: Built index with {len(ngram_index)} unique n-grams")
+
+         # Generate n-grams from transcribed text
+         trans_ngrams = self._find_ngrams(trans_words, n)
+         self.logger.debug(f"🔍 N-GRAM {n}: Processing {len(trans_ngrams)} transcription n-grams")
+
+         # Single pass through all transcription n-grams
+         for ngram, trans_pos in trans_ngrams:
+             # Skip if we've already used this transcription position
+             if trans_pos in used_trans_positions:
+                 continue
+
+             # Use indexed lookup (O(1) instead of O(n))
+             matches = self._find_matching_sources_indexed(ngram, ngram_index)
+
+             if len(matches) >= min_sources:
+                 # Get Word IDs for transcribed words
+                 transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+                 # Get Word IDs for reference words
+                 reference_word_ids = {
+                     source: [w.id for w in ref_words[source][pos : pos + n]]
+                     for source, pos in matches.items()
+                 }
+
+                 # Mark transcription position as used
+                 used_trans_positions.add(trans_pos)
+
+                 # Mark reference positions as used
+                 for source, pos in matches.items():
+                     if source not in self.used_positions:
+                         self.used_positions[source] = set()
+                     self.used_positions[source].add(pos)
+
+                 anchor = AnchorSequence(
+                     id=WordUtils.generate_id(),
+                     transcribed_word_ids=transcribed_word_ids,
+                     transcription_position=trans_pos,
+                     reference_positions=matches,
+                     reference_word_ids=reference_word_ids,
+                     confidence=len(matches) / len(ref_texts_clean),
+                 )
+                 candidate_anchors.append(anchor)
+
+         self.logger.debug(f"🔍 N-GRAM {n}: Found {len(candidate_anchors)} anchors")
+         return candidate_anchors
+
+     def find_anchors(
+         self,
+         transcribed: str,
+         references: Dict[str, LyricsData],
+         transcription_result: TranscriptionResult,
+     ) -> List[ScoredAnchor]:
+         """Find anchor sequences that appear in both transcription and references with timeout protection."""
+         start_time = time.time()
+
+         try:
+             self.logger.info(f"🔍 ANCHOR SEARCH: Starting find_anchors with timeout {self.timeout_seconds}s")
+             self.logger.info(f"🔍 ANCHOR SEARCH: Transcribed text length: {len(transcribed)}")
+             self.logger.info(f"🔍 ANCHOR SEARCH: Reference sources: {list(references.keys())}")
+
+             cache_key = self._get_cache_key(transcribed, references, transcription_result)
+             cache_path = self.cache_dir / f"anchors_{cache_key}.json"
+             self.logger.info(f"🔍 ANCHOR SEARCH: Cache key: {cache_key}")
+
+             # Try to load from cache
+             self.logger.info(f"🔍 ANCHOR SEARCH: Checking cache at {cache_path}")
+             if cached_data := self._load_from_cache(cache_path):
+                 self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit! Loading anchors from cache")
+                 try:
+                     # Convert cached_data to dictionary before logging
+                     if cached_data:
+                         first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                     return cached_data
+                 except Exception as e:
+                     self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Error loading cache: {type(e).__name__}: {e}")
+                     if cached_data:
+                         try:
+                             first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
+                             self.logger.error(f"First cached anchor data: {json.dumps(first_anchor, indent=2)}")
+                         except:
+                             self.logger.error("Could not serialize first cached anchor for logging")
+
+             # If not in cache or cache format invalid, perform the computation
+             self.logger.info(f"🔍 ANCHOR SEARCH: Cache miss - computing anchors")
+
+             # Reset used positions for fresh computation
+             self.used_positions = {}
+
+             # Check timeout before starting computation
+             self._check_timeout(start_time, "anchor computation initialization")
+             self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - initialization")
+
+             # Get all words from transcription
+             self.logger.info(f"🔍 ANCHOR SEARCH: Extracting words from transcription result...")
+             all_words = []
+             for segment in transcription_result.result.segments:
+                 all_words.extend(segment.words)
+             self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Extracted {len(all_words)} words from transcription")
+
+             # Clean and split texts
+             self.logger.info(f"🔍 ANCHOR SEARCH: Cleaning transcription words...")
+             trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]
+             self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Cleaned {len(trans_words)} transcription words")
+
+             self.logger.info(f"🔍 ANCHOR SEARCH: Processing reference sources...")
+             ref_texts_clean = {
+                 source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+                 for source, lyrics in references.items()
+             }
+             ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+             for source, words in ref_texts_clean.items():
+                 self.logger.info(f"🔍 ANCHOR SEARCH: Reference '{source}': {len(words)} words")
+
+             # Check timeout after preprocessing
+             self._check_timeout(start_time, "anchor computation preprocessing")
+             self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - preprocessing")
+
+             # Filter out very short reference sources for n-gram length calculation
+             self.logger.info(f"🔍 ANCHOR SEARCH: Calculating n-gram lengths...")
+             valid_ref_lengths = [
+                 len(words) for words in ref_texts_clean.values()
+                 if len(words) >= self.min_sequence_length
+             ]
+
+             if not valid_ref_lengths:
+                 self.logger.warning("🔍 ANCHOR SEARCH: ❌ No reference sources long enough for anchor detection")
+                 return []
+
+             # Calculate max length using only valid reference sources
+             max_length = min(len(trans_words), min(valid_ref_lengths))
+             n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
+             self.logger.info(f"🔍 ANCHOR SEARCH: N-gram lengths to process: {list(n_gram_lengths)} (max_length: {max_length})")
+
+             # Process n-gram lengths in parallel with timeout
+             self.logger.info(f"🔍 ANCHOR SEARCH: Setting up parallel processing...")
+             process_length_partial = partial(
+                 self._process_ngram_length,
+                 trans_words=trans_words,
+                 all_words=all_words,  # Pass the Word objects
+                 ref_texts_clean=ref_texts_clean,
+                 ref_words=ref_words,
+                 min_sources=self.min_sources,
+             )
+
+             # Process n-gram lengths sequentially (single-threaded for cloud compatibility)
+             candidate_anchors = []
+
+             # Check timeout before processing
+             self._check_timeout(start_time, "n-gram processing start")
+             self.logger.info(f"🔍 ANCHOR SEARCH: Starting sequential n-gram processing ({len(n_gram_lengths)} lengths)")
+
+             batch_size = 10
+             batch_results = []
+
+             for i, n in enumerate(n_gram_lengths):
+                 try:
+                     # Check timeout periodically
+                     if self.timeout_seconds > 0:
+                         elapsed_time = time.time() - start_time
+                         if elapsed_time > self.timeout_seconds:
+                             self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached at n-gram {n}, stopping")
+                             break
+
+                     anchors = self._process_ngram_length(
+                         n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+                     )
+                     candidate_anchors.extend(anchors)
+
+                     # Batch logging
+                     batch_results.append((n, len(anchors)))
+
+                     # Log progress every batch_size results or on the last result
+                     if (i + 1) % batch_size == 0 or (i + 1) == len(n_gram_lengths):
+                         total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
+                         n_gram_ranges = [str(ng) for ng, _ in batch_results]
+                         range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
+                         self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} - found {total_anchors_in_batch} anchors")
+                         batch_results = []
+
+                 except Exception as e:
+                     self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
+                     batch_results.append((n, 0))
+                     continue
+
+             self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
+
+             # Check timeout before expensive filtering operation
+             self._check_timeout(start_time, "overlap filtering start")
+             self.logger.info(f"🔍 ANCHOR SEARCH: 🔄 Starting overlap filtering...")
+
+             filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
+             self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Filtering completed - {len(filtered_anchors)} final anchors")
+
+             # Save to cache
+             self.logger.info(f"🔍 ANCHOR SEARCH: 💾 Saving results to cache...")
+             self._save_to_cache(cache_path, filtered_anchors)
+
+             total_time = time.time() - start_time
+             self.logger.info(f"🔍 ANCHOR SEARCH: 🎉 Anchor sequence computation completed successfully in {total_time:.1f}s")
+
+             return filtered_anchors
+
+         except AnchorSequenceTimeoutError:
+             elapsed_time = time.time() - start_time
+             self.logger.error(f"🔍 ANCHOR SEARCH: ⏰ TIMEOUT after {elapsed_time:.1f}s (limit: {self.timeout_seconds}s)")
+             raise
+         except Exception as e:
+             elapsed_time = time.time() - start_time
+             self.logger.error(f"🔍 ANCHOR SEARCH: ❌ FAILED after {elapsed_time:.1f}s: {str(e)}")
+             self.logger.error(f"🔍 ANCHOR SEARCH: Exception type: {type(e).__name__}")
+             import traceback
+             self.logger.error(f"🔍 ANCHOR SEARCH: Traceback: {traceback.format_exc()}")
+             raise
+         finally:
+             # No cleanup needed for time-based timeout checks
+             pass
+
+     def _score_sequence(self, words: List[str], context: str) -> PhraseScore:
+         """Score a sequence based on its phrase quality"""
+         self.logger.debug(f"_score_sequence called for: '{' '.join(words)}'")
+         return self.phrase_analyzer.score_phrase(words, context)
+
+     def _get_sequence_priority(self, scored_anchor: ScoredAnchor) -> Tuple[float, float, float, float, int]:
+         """Get priority tuple for sorting sequences.
+
+         Returns tuple of:
+         - Number of sources matched (higher is better)
+         - Length bonus (length * 0.2) to favor longer sequences
+         - Break score (higher is better)
+         - Total score (higher is better)
+         - Negative position (earlier is better)
+
+         Position bonus: Add 1.0 to total score for sequences at position 0
+         """
+         # self.logger.debug(f"_get_sequence_priority called for anchor: '{scored_anchor.anchor.text}'")
+         position_bonus = 1.0 if scored_anchor.anchor.transcription_position == 0 else 0.0
+         length_bonus = len(scored_anchor.anchor.transcribed_word_ids) * 0.2  # Changed from words to transcribed_word_ids
+
+         return (
+             len(scored_anchor.anchor.reference_positions),  # More sources is better
+             length_bonus,  # Longer sequences preferred
+             scored_anchor.phrase_score.natural_break_score,  # Better breaks preferred
+             scored_anchor.phrase_score.total_score + position_bonus,  # Add bonus for position 0
+             -scored_anchor.anchor.transcription_position,  # Earlier positions preferred
+         )
+
+     def _sequences_overlap(self, seq1: AnchorSequence, seq2: AnchorSequence) -> bool:
+         """Check if two sequences overlap in either transcription or references.
+
+         Args:
+             seq1: First sequence
+             seq2: Second sequence
+
+         Returns:
+             True if sequences overlap in transcription or share any reference positions
+         """
+         # Check transcription overlap
+         seq1_trans_range = range(
+             seq1.transcription_position, seq1.transcription_position + len(seq1.transcribed_word_ids)
+         )  # Changed from words
+         seq2_trans_range = range(
+             seq2.transcription_position, seq2.transcription_position + len(seq2.transcribed_word_ids)
+         )  # Changed from words
+         trans_overlap = bool(set(seq1_trans_range) & set(seq2_trans_range))
+
+         # Check reference overlap - only consider positions in shared sources
+         shared_sources = set(seq1.reference_positions.keys()) & set(seq2.reference_positions.keys())
+         ref_overlap = any(seq1.reference_positions[source] == seq2.reference_positions[source] for source in shared_sources)
+
+         return trans_overlap or ref_overlap
+
+     def _remove_overlapping_sequences(
+         self,
+         anchors: List[AnchorSequence],
+         context: str,
+         transcription_result: TranscriptionResult,
+     ) -> List[ScoredAnchor]:
+         """Remove overlapping sequences using phrase analysis with timeout protection."""
+         self.logger.info(f"🔍 FILTERING: Starting overlap removal for {len(anchors)} anchors")
+
+         if not anchors:
+             self.logger.info(f"🔍 FILTERING: No anchors to process")
+             return []
+
+         self.logger.info(f"🔍 FILTERING: Scoring {len(anchors)} anchors")
+
+         # Create word map for scoring
+         word_map = {w.id: w for s in transcription_result.result.segments for w in s.words}
+         self.logger.debug(f"🔍 FILTERING: Created word map with {len(word_map)} words")
+
+         # Add word map to each anchor for scoring
+         for i, anchor in enumerate(anchors):
+             # For backwards compatibility, only add transcribed_words if all IDs exist in word_map
+             try:
+                 anchor.transcribed_words = [word_map[word_id] for word_id in anchor.transcribed_word_ids]
+                 # Also set _words for backwards compatibility with text display
+                 anchor._words = [word_map[word_id].text for word_id in anchor.transcribed_word_ids]
+             except KeyError:
+                 # This can happen in tests using backwards compatible constructors
+                 # Create dummy Word objects with the text from _words if available
+                 if hasattr(anchor, '_words') and anchor._words is not None:
+                     from lyrics_transcriber.types import Word
+                     from lyrics_transcriber.utils.word_utils import WordUtils
+                     anchor.transcribed_words = [
+                         Word(
+                             id=word_id,
+                             text=text,
+                             start_time=i * 1.0,
+                             end_time=(i + 1) * 1.0,
+                             confidence=1.0
+                         )
+                         for i, (word_id, text) in enumerate(zip(anchor.transcribed_word_ids, anchor._words))
+                     ]
+                 else:
+                     # Create generic word objects for scoring
+                     from lyrics_transcriber.types import Word
+                     anchor.transcribed_words = [
+                         Word(
+                             id=word_id,
+                             text=f"word_{i}",
+                             start_time=i * 1.0,
+                             end_time=(i + 1) * 1.0,
+                             confidence=1.0
+                         )
+                         for i, word_id in enumerate(anchor.transcribed_word_ids)
+                     ]
+
+         start_time = time.time()
+
+         # Score anchors sequentially using simple rule-based scoring
+         # (Avoids expensive spaCy NLP and works in cloud environments)
+         scored_anchors = []
+         self.logger.debug(f"🔍 FILTERING: Scoring {len(anchors)} anchors sequentially")
+
+         for i, anchor in enumerate(anchors):
+             try:
+                 # Simple rule-based scoring based on anchor properties
+                 phrase_score = self._simple_score_anchor(anchor)
+                 scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+             except Exception as e:
+                 # Fallback to default score on error
+                 self.logger.debug(f"🔍 FILTERING: Scoring failed for anchor {i}: {e}")
+                 phrase_score = PhraseScore(
+                     phrase_type=PhraseType.COMPLETE,
+                     natural_break_score=1.0,
+                     length_score=1.0
+                 )
+                 scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+
+         scoring_time = time.time() - start_time
+         self.logger.debug(f"🔍 FILTERING: Scoring completed in {scoring_time:.2f}s, scored {len(scored_anchors)} anchors")
+
+         # Sort anchors by priority (highest first)
+         self.logger.debug(f"🔍 FILTERING: Sorting anchors by priority...")
+         scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
+
+         # O(N) overlap filtering using covered positions set
+         self.logger.debug(f"🔍 FILTERING: Filtering {len(scored_anchors)} overlapping sequences")
+         filtered_scored = []
+         covered_positions: Set[int] = set()
+
+         for scored_anchor in scored_anchors:
+             anchor = scored_anchor.anchor
+             start_pos = anchor.transcription_position
+             end_pos = start_pos + anchor.length
+
+             # Check if any position in this anchor's range is already covered
+             anchor_positions = set(range(start_pos, end_pos))
+             if not anchor_positions & covered_positions:  # No overlap with covered
+                 filtered_scored.append(scored_anchor)
+                 covered_positions.update(anchor_positions)
+
+         self.logger.debug(f"🔍 FILTERING: Kept {len(filtered_scored)} non-overlapping anchors out of {len(scored_anchors)}")
+         return filtered_scored
+
+     def _simple_score_anchor(self, anchor: AnchorSequence) -> PhraseScore:
+         """
+         Simple rule-based scoring for anchors without expensive NLP.
+
+         Scoring criteria:
+         - Longer sequences are preferred (length_score)
+         - Sequences matching more reference sources are preferred (natural_break_score)
+         - All sequences treated as COMPLETE type for simplicity
+         """
+         # Length score: normalize to 0-1 range (3-15 words typical)
+         length = anchor.length
+         length_score = min(1.0, (length - 2) / 10.0)  # 3 words = 0.1, 12 words = 1.0
+
+         # Source match score: more sources = higher score
+         num_sources = len(anchor.reference_positions)
+         natural_break_score = min(1.0, num_sources / 3.0)  # 1 source = 0.33, 3+ sources = 1.0
+
+         return PhraseScore(
+             phrase_type=PhraseType.COMPLETE,
+             natural_break_score=natural_break_score,
+             length_score=length_score
+         )
+
+     @staticmethod
+     def _score_anchor_static(anchor: AnchorSequence, context: str) -> ScoredAnchor:
+         """Static version of _score_anchor for multiprocessing compatibility."""
+         # Create analyzer only once per process
+         if not hasattr(AnchorSequenceFinder._score_anchor_static, "_phrase_analyzer"):
+             AnchorSequenceFinder._score_anchor_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
+
+         # Get the words from the transcribed word IDs
+         # We need to pass in the actual words for scoring
+         words = [w.text for w in anchor.transcribed_words]  # This needs to be passed in
+
+         phrase_score = AnchorSequenceFinder._score_anchor_static._phrase_analyzer.score_phrase(words, context)
+         return ScoredAnchor(anchor=anchor, phrase_score=phrase_score)
+
+     @staticmethod
+     def _score_batch_static(anchors: List[AnchorSequence], context: str) -> List[ScoredAnchor]:
+         """Score a batch of anchors for better timeout handling."""
+         # Create analyzer only once per process
+         if not hasattr(AnchorSequenceFinder._score_batch_static, "_phrase_analyzer"):
+             AnchorSequenceFinder._score_batch_static._phrase_analyzer = PhraseAnalyzer(logger=logging.getLogger(__name__))
+
+         scored_anchors = []
+         for anchor in anchors:
+             try:
+                 words = [w.text for w in anchor.transcribed_words]
+                 phrase_score = AnchorSequenceFinder._score_batch_static._phrase_analyzer.score_phrase(words, context)
+                 scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+             except Exception:
+                 # Add basic score for failed anchor
+                 phrase_score = PhraseScore(
+                     phrase_type=PhraseType.COMPLETE,
+                     natural_break_score=1.0,
+                     length_score=1.0
+                 )
+                 scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+
+         return scored_anchors
+
+     def _get_reference_words(self, source: str, ref_words: List[str], start_pos: Optional[int], end_pos: Optional[int]) -> List[str]:
+         """Get words from reference text between two positions.
+
+         Args:
+             source: Reference source identifier
+             ref_words: List of words from the reference text
+             start_pos: Starting position (None for beginning)
+             end_pos: Ending position (None for end)
+
+         Returns:
+             List of words between the positions
+         """
+         if start_pos is None:
+             start_pos = 0
+         if end_pos is None:
+             end_pos = len(ref_words)
+         return ref_words[start_pos:end_pos]
+
+     def find_gaps(
+         self,
+         transcribed: str,
+         anchors: List[ScoredAnchor],
+         references: Dict[str, LyricsData],
+         transcription_result: TranscriptionResult,
+     ) -> List[GapSequence]:
+         """Find gaps between anchor sequences in the transcribed text."""
+         # Get all words from transcription
+         all_words = []
+         for segment in transcription_result.result.segments:
+             all_words.extend(segment.words)
+
+         # Clean and split reference texts
+         ref_texts_clean = {
+             source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
+             for source, lyrics in references.items()
+         }
+         ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+         # Create gaps with Word IDs
+         gaps = []
+         sorted_anchors = sorted(anchors, key=lambda x: x.anchor.transcription_position)
+
+         # Handle initial gap
+         if sorted_anchors:
+             first_anchor = sorted_anchors[0].anchor
+             first_anchor_pos = first_anchor.transcription_position
+             if first_anchor_pos > 0:
+                 gap_word_ids = [w.id for w in all_words[:first_anchor_pos]]
+                 if gap := self._create_initial_gap(
+                     id=WordUtils.generate_id(),
+                     transcribed_word_ids=gap_word_ids,
+                     transcription_position=0,
+                     following_anchor_id=first_anchor.id,
+                     ref_texts_clean=ref_texts_clean,
+                     ref_words=ref_words,
+                     following_anchor=first_anchor,
+                 ):
+                     gaps.append(gap)
+
+         # Handle gaps between anchors
+         for i in range(len(sorted_anchors) - 1):
+             current_anchor = sorted_anchors[i].anchor
+             next_anchor = sorted_anchors[i + 1].anchor
+             gap_start = current_anchor.transcription_position + len(current_anchor.transcribed_word_ids)
+             gap_end = next_anchor.transcription_position
+
+             if gap_end > gap_start:
+                 gap_word_ids = [w.id for w in all_words[gap_start:gap_end]]
+                 if between_gap := self._create_between_gap(
+                     id=WordUtils.generate_id(),
+                     transcribed_word_ids=gap_word_ids,
+                     transcription_position=gap_start,
+                     preceding_anchor_id=current_anchor.id,
+                     following_anchor_id=next_anchor.id,
+                     ref_texts_clean=ref_texts_clean,
+                     ref_words=ref_words,
+                     preceding_anchor=current_anchor,
+                     following_anchor=next_anchor,
+                 ):
+                     gaps.append(between_gap)
+
+         # Handle final gap
+         if sorted_anchors:
+             last_anchor = sorted_anchors[-1].anchor
+             last_pos = last_anchor.transcription_position + len(last_anchor.transcribed_word_ids)
+             if last_pos < len(all_words):
+                 gap_word_ids = [w.id for w in all_words[last_pos:]]
+                 if final_gap := self._create_final_gap(
+                     id=WordUtils.generate_id(),
+                     transcribed_word_ids=gap_word_ids,
+                     transcription_position=last_pos,
+                     preceding_anchor_id=last_anchor.id,
+                     ref_texts_clean=ref_texts_clean,
+                     ref_words=ref_words,
+                     preceding_anchor=last_anchor,
+                 ):
+                     gaps.append(final_gap)
+
+         return gaps
+
+     def _create_initial_gap(
+         self,
+         id: str,
+         transcribed_word_ids: List[str],
+         transcription_position: int,
+         following_anchor_id: str,
+         ref_texts_clean: Dict[str, List[str]],
+         ref_words: Dict[str, List[Word]],
+         following_anchor: AnchorSequence,
+     ) -> Optional[GapSequence]:
+         """Create gap sequence before the first anchor.
+
+         The gap includes all reference words from the start of each reference
+         up to the position where the following anchor starts in that reference.
+         """
+         if transcription_position > 0:
+             # Get reference word IDs for the gap
+             reference_word_ids = {}
+             for source, words in ref_words.items():
+                 if source in ref_texts_clean:
+                     # Get the position where the following anchor starts in this source
+                     if source in following_anchor.reference_positions:
+                         end_pos = following_anchor.reference_positions[source]
+                         # Include all words from start up to the anchor
+                         reference_word_ids[source] = [w.id for w in words[:end_pos]]
+                     else:
+                         # If this source doesn't contain the following anchor,
+                         # we can't determine the gap content for it
+                         reference_word_ids[source] = []
+
+             return GapSequence(
+                 id=id,
+                 transcribed_word_ids=transcribed_word_ids,
+                 transcription_position=transcription_position,
+                 preceding_anchor_id=None,
+                 following_anchor_id=following_anchor_id,
+                 reference_word_ids=reference_word_ids,
+             )
+         return None
+
+     def _create_between_gap(
+         self,
+         id: str,
+         transcribed_word_ids: List[str],
+         transcription_position: int,
+         preceding_anchor_id: str,
+         following_anchor_id: str,
+         ref_texts_clean: Dict[str, List[str]],
+         ref_words: Dict[str, List[Word]],
+         preceding_anchor: AnchorSequence,
+         following_anchor: AnchorSequence,
+     ) -> Optional[GapSequence]:
+         """Create gap sequence between two anchors.
+
+         For each reference source, the gap includes all words between the end of the
+         preceding anchor and the start of the following anchor in that source.
+         """
+         # Get reference word IDs for the gap
+         reference_word_ids = {}
+         for source, words in ref_words.items():
+             if source in ref_texts_clean:
+                 # Only process sources that contain both anchors
+                 if source in preceding_anchor.reference_positions and source in following_anchor.reference_positions:
+                     start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                     end_pos = following_anchor.reference_positions[source]
+                     # Include all words between the anchors
+                     reference_word_ids[source] = [w.id for w in words[start_pos:end_pos]]
+                 else:
+                     # If this source doesn't contain both anchors,
+                     # we can't determine the gap content for it
+                     reference_word_ids[source] = []
+
+         return GapSequence(
+             id=id,
+             transcribed_word_ids=transcribed_word_ids,
+             transcription_position=transcription_position,
+             preceding_anchor_id=preceding_anchor_id,
+             following_anchor_id=following_anchor_id,
+             reference_word_ids=reference_word_ids,
+         )
+
+     def _create_final_gap(
+         self,
+         id: str,
+         transcribed_word_ids: List[str],
+         transcription_position: int,
+         preceding_anchor_id: str,
+         ref_texts_clean: Dict[str, List[str]],
+         ref_words: Dict[str, List[Word]],
+         preceding_anchor: AnchorSequence,
+     ) -> Optional[GapSequence]:
+         """Create gap sequence after the last anchor.
+
+         For each reference source, includes all words from the end of the
+         preceding anchor to the end of that reference.
+         """
+         # Get reference word IDs for the gap
+         reference_word_ids = {}
+         for source, words in ref_words.items():
+             if source in ref_texts_clean:
+                 if source in preceding_anchor.reference_positions:
+                     start_pos = preceding_anchor.reference_positions[source] + len(preceding_anchor.reference_word_ids[source])
+                     # Include all words from end of last anchor to end of reference
+                     reference_word_ids[source] = [w.id for w in words[start_pos:]]
+                 else:
+                     # If this source doesn't contain the preceding anchor,
+                     # we can't determine the gap content for it
+                     reference_word_ids[source] = []
+
+         return GapSequence(
+             id=id,
+             transcribed_word_ids=transcribed_word_ids,
+             transcription_position=transcription_position,
+             preceding_anchor_id=preceding_anchor_id,
+             following_anchor_id=None,
+             reference_word_ids=reference_word_ids,
+         )
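
For reviewers skimming the diff, the core lookup structure in anchor_sequence.py is the hash-based n-gram index built by _build_ngram_index and queried by _find_matching_sources_indexed. The following is a minimal, self-contained sketch of the same idea (simplified names, toy lyrics, and no used-position bookkeeping); it is an illustration, not the package's API:

from typing import Dict, List, Tuple

def build_ngram_index(references: Dict[str, List[str]], n: int) -> Dict[Tuple[str, ...], Dict[str, List[int]]]:
    # Map each n-gram tuple to {source: [start positions]} so later lookups are O(1).
    index: Dict[Tuple[str, ...], Dict[str, List[int]]] = {}
    for source, words in references.items():
        for i in range(len(words) - n + 1):
            index.setdefault(tuple(words[i:i + n]), {}).setdefault(source, []).append(i)
    return index

refs = {
    "genius": "you are my sunshine my only sunshine".split(),
    "lrclib": "you are my sunshine my only sunshine dear".split(),
}
index = build_ngram_index(refs, 3)
# One dict lookup returns every source containing the trigram, with positions:
print(index[("my", "only", "sunshine")])  # {'genius': [4], 'lrclib': [4]}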
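
Competing anchors are then resolved by _remove_overlapping_sequences with a single greedy pass: sort candidates best-first, then keep an anchor only if none of its transcription positions are already covered. A hedged sketch of that pass, with (position, length) pairs standing in for real AnchorSequence objects:

from typing import List, Set, Tuple

def filter_overlaps(ranked: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    # ranked is (position, length) pairs, best first; keep greedily.
    kept: List[Tuple[int, int]] = []
    covered: Set[int] = set()
    for pos, length in ranked:
        span = set(range(pos, pos + length))
        if not span & covered:  # claim only positions no earlier anchor touched
            kept.append((pos, length))
            covered |= span
    return kept

# The 5-word anchor starting at 0 wins; the overlapping anchor at 3 is dropped.
print(filter_overlaps([(0, 5), (3, 2), (6, 3)]))  # [(0, 5), (6, 3)]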
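
Finally, results are cached under an MD5 fingerprint of the transcription plus every word's text:id pair, so changing even one word ID invalidates the cache. Roughly, using a hypothetical helper rather than the module's actual _get_cache_key signature:

import hashlib
from typing import List

def cache_key(transcribed: str, words_with_ids: List[str]) -> str:
    # Hypothetical sketch of the fingerprinting idea; any text or ID change yields a new key.
    return hashlib.md5(f"{transcribed}|{','.join(words_with_ids)}".encode()).hexdigest()

print(cache_key("you are my sunshine", ["you:w1", "are:w2", "my:w3", "sunshine:w4"]))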