karaoke-gen 0.57.0__py3-none-any.whl → 0.71.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. karaoke_gen/audio_fetcher.py +461 -0
  2. karaoke_gen/audio_processor.py +407 -30
  3. karaoke_gen/config.py +62 -113
  4. karaoke_gen/file_handler.py +32 -59
  5. karaoke_gen/karaoke_finalise/karaoke_finalise.py +148 -67
  6. karaoke_gen/karaoke_gen.py +270 -61
  7. karaoke_gen/lyrics_processor.py +13 -1
  8. karaoke_gen/metadata.py +78 -73
  9. karaoke_gen/pipeline/__init__.py +87 -0
  10. karaoke_gen/pipeline/base.py +215 -0
  11. karaoke_gen/pipeline/context.py +230 -0
  12. karaoke_gen/pipeline/executors/__init__.py +21 -0
  13. karaoke_gen/pipeline/executors/local.py +159 -0
  14. karaoke_gen/pipeline/executors/remote.py +257 -0
  15. karaoke_gen/pipeline/stages/__init__.py +27 -0
  16. karaoke_gen/pipeline/stages/finalize.py +202 -0
  17. karaoke_gen/pipeline/stages/render.py +165 -0
  18. karaoke_gen/pipeline/stages/screens.py +139 -0
  19. karaoke_gen/pipeline/stages/separation.py +191 -0
  20. karaoke_gen/pipeline/stages/transcription.py +191 -0
  21. karaoke_gen/style_loader.py +531 -0
  22. karaoke_gen/utils/bulk_cli.py +6 -0
  23. karaoke_gen/utils/cli_args.py +424 -0
  24. karaoke_gen/utils/gen_cli.py +26 -261
  25. karaoke_gen/utils/remote_cli.py +1815 -0
  26. karaoke_gen/video_background_processor.py +351 -0
  27. karaoke_gen-0.71.23.dist-info/METADATA +610 -0
  28. karaoke_gen-0.71.23.dist-info/RECORD +275 -0
  29. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.23.dist-info}/WHEEL +1 -1
  30. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.23.dist-info}/entry_points.txt +1 -0
  31. lyrics_transcriber/__init__.py +10 -0
  32. lyrics_transcriber/cli/__init__.py +0 -0
  33. lyrics_transcriber/cli/cli_main.py +285 -0
  34. lyrics_transcriber/core/__init__.py +0 -0
  35. lyrics_transcriber/core/config.py +50 -0
  36. lyrics_transcriber/core/controller.py +520 -0
  37. lyrics_transcriber/correction/__init__.py +0 -0
  38. lyrics_transcriber/correction/agentic/__init__.py +9 -0
  39. lyrics_transcriber/correction/agentic/adapter.py +71 -0
  40. lyrics_transcriber/correction/agentic/agent.py +313 -0
  41. lyrics_transcriber/correction/agentic/feedback/aggregator.py +12 -0
  42. lyrics_transcriber/correction/agentic/feedback/collector.py +17 -0
  43. lyrics_transcriber/correction/agentic/feedback/retention.py +24 -0
  44. lyrics_transcriber/correction/agentic/feedback/store.py +76 -0
  45. lyrics_transcriber/correction/agentic/handlers/__init__.py +24 -0
  46. lyrics_transcriber/correction/agentic/handlers/ambiguous.py +44 -0
  47. lyrics_transcriber/correction/agentic/handlers/background_vocals.py +68 -0
  48. lyrics_transcriber/correction/agentic/handlers/base.py +51 -0
  49. lyrics_transcriber/correction/agentic/handlers/complex_multi_error.py +46 -0
  50. lyrics_transcriber/correction/agentic/handlers/extra_words.py +74 -0
  51. lyrics_transcriber/correction/agentic/handlers/no_error.py +42 -0
  52. lyrics_transcriber/correction/agentic/handlers/punctuation.py +44 -0
  53. lyrics_transcriber/correction/agentic/handlers/registry.py +60 -0
  54. lyrics_transcriber/correction/agentic/handlers/repeated_section.py +44 -0
  55. lyrics_transcriber/correction/agentic/handlers/sound_alike.py +126 -0
  56. lyrics_transcriber/correction/agentic/models/__init__.py +5 -0
  57. lyrics_transcriber/correction/agentic/models/ai_correction.py +31 -0
  58. lyrics_transcriber/correction/agentic/models/correction_session.py +30 -0
  59. lyrics_transcriber/correction/agentic/models/enums.py +38 -0
  60. lyrics_transcriber/correction/agentic/models/human_feedback.py +30 -0
  61. lyrics_transcriber/correction/agentic/models/learning_data.py +26 -0
  62. lyrics_transcriber/correction/agentic/models/observability_metrics.py +28 -0
  63. lyrics_transcriber/correction/agentic/models/schemas.py +46 -0
  64. lyrics_transcriber/correction/agentic/models/utils.py +19 -0
  65. lyrics_transcriber/correction/agentic/observability/__init__.py +5 -0
  66. lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +35 -0
  67. lyrics_transcriber/correction/agentic/observability/metrics.py +46 -0
  68. lyrics_transcriber/correction/agentic/observability/performance.py +19 -0
  69. lyrics_transcriber/correction/agentic/prompts/__init__.py +2 -0
  70. lyrics_transcriber/correction/agentic/prompts/classifier.py +227 -0
  71. lyrics_transcriber/correction/agentic/providers/__init__.py +6 -0
  72. lyrics_transcriber/correction/agentic/providers/base.py +36 -0
  73. lyrics_transcriber/correction/agentic/providers/circuit_breaker.py +145 -0
  74. lyrics_transcriber/correction/agentic/providers/config.py +73 -0
  75. lyrics_transcriber/correction/agentic/providers/constants.py +24 -0
  76. lyrics_transcriber/correction/agentic/providers/health.py +28 -0
  77. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +212 -0
  78. lyrics_transcriber/correction/agentic/providers/model_factory.py +209 -0
  79. lyrics_transcriber/correction/agentic/providers/response_cache.py +218 -0
  80. lyrics_transcriber/correction/agentic/providers/response_parser.py +111 -0
  81. lyrics_transcriber/correction/agentic/providers/retry_executor.py +127 -0
  82. lyrics_transcriber/correction/agentic/router.py +35 -0
  83. lyrics_transcriber/correction/agentic/workflows/__init__.py +5 -0
  84. lyrics_transcriber/correction/agentic/workflows/consensus_workflow.py +24 -0
  85. lyrics_transcriber/correction/agentic/workflows/correction_graph.py +59 -0
  86. lyrics_transcriber/correction/agentic/workflows/feedback_workflow.py +24 -0
  87. lyrics_transcriber/correction/anchor_sequence.py +1043 -0
  88. lyrics_transcriber/correction/corrector.py +760 -0
  89. lyrics_transcriber/correction/feedback/__init__.py +2 -0
  90. lyrics_transcriber/correction/feedback/schemas.py +107 -0
  91. lyrics_transcriber/correction/feedback/store.py +236 -0
  92. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  93. lyrics_transcriber/correction/handlers/base.py +52 -0
  94. lyrics_transcriber/correction/handlers/extend_anchor.py +149 -0
  95. lyrics_transcriber/correction/handlers/levenshtein.py +189 -0
  96. lyrics_transcriber/correction/handlers/llm.py +293 -0
  97. lyrics_transcriber/correction/handlers/llm_providers.py +60 -0
  98. lyrics_transcriber/correction/handlers/no_space_punct_match.py +154 -0
  99. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +85 -0
  100. lyrics_transcriber/correction/handlers/repeat.py +88 -0
  101. lyrics_transcriber/correction/handlers/sound_alike.py +259 -0
  102. lyrics_transcriber/correction/handlers/syllables_match.py +252 -0
  103. lyrics_transcriber/correction/handlers/word_count_match.py +80 -0
  104. lyrics_transcriber/correction/handlers/word_operations.py +187 -0
  105. lyrics_transcriber/correction/operations.py +352 -0
  106. lyrics_transcriber/correction/phrase_analyzer.py +435 -0
  107. lyrics_transcriber/correction/text_utils.py +30 -0
  108. lyrics_transcriber/frontend/.gitignore +23 -0
  109. lyrics_transcriber/frontend/.yarn/releases/yarn-4.7.0.cjs +935 -0
  110. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  111. lyrics_transcriber/frontend/README.md +50 -0
  112. lyrics_transcriber/frontend/REPLACE_ALL_FUNCTIONALITY.md +210 -0
  113. lyrics_transcriber/frontend/__init__.py +25 -0
  114. lyrics_transcriber/frontend/eslint.config.js +28 -0
  115. lyrics_transcriber/frontend/index.html +18 -0
  116. lyrics_transcriber/frontend/package.json +42 -0
  117. lyrics_transcriber/frontend/public/android-chrome-192x192.png +0 -0
  118. lyrics_transcriber/frontend/public/android-chrome-512x512.png +0 -0
  119. lyrics_transcriber/frontend/public/apple-touch-icon.png +0 -0
  120. lyrics_transcriber/frontend/public/favicon-16x16.png +0 -0
  121. lyrics_transcriber/frontend/public/favicon-32x32.png +0 -0
  122. lyrics_transcriber/frontend/public/favicon.ico +0 -0
  123. lyrics_transcriber/frontend/public/nomad-karaoke-logo.png +0 -0
  124. lyrics_transcriber/frontend/src/App.tsx +212 -0
  125. lyrics_transcriber/frontend/src/api.ts +239 -0
  126. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +77 -0
  127. lyrics_transcriber/frontend/src/components/AddLyricsModal.tsx +114 -0
  128. lyrics_transcriber/frontend/src/components/AgenticCorrectionMetrics.tsx +204 -0
  129. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +180 -0
  130. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +167 -0
  131. lyrics_transcriber/frontend/src/components/CorrectionAnnotationModal.tsx +359 -0
  132. lyrics_transcriber/frontend/src/components/CorrectionDetailCard.tsx +281 -0
  133. lyrics_transcriber/frontend/src/components/CorrectionMetrics.tsx +162 -0
  134. lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +257 -0
  135. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +68 -0
  136. lyrics_transcriber/frontend/src/components/EditModal.tsx +702 -0
  137. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +496 -0
  138. lyrics_transcriber/frontend/src/components/EditWordList.tsx +379 -0
  139. lyrics_transcriber/frontend/src/components/FileUpload.tsx +77 -0
  140. lyrics_transcriber/frontend/src/components/FindReplaceModal.tsx +467 -0
  141. lyrics_transcriber/frontend/src/components/Header.tsx +387 -0
  142. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +1373 -0
  143. lyrics_transcriber/frontend/src/components/MetricsDashboard.tsx +51 -0
  144. lyrics_transcriber/frontend/src/components/ModeSelector.tsx +67 -0
  145. lyrics_transcriber/frontend/src/components/ModelSelector.tsx +23 -0
  146. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +144 -0
  147. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +268 -0
  148. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +688 -0
  149. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +354 -0
  150. lyrics_transcriber/frontend/src/components/SegmentDetailsModal.tsx +64 -0
  151. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +376 -0
  152. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +131 -0
  153. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +256 -0
  154. lyrics_transcriber/frontend/src/components/WordDivider.tsx +187 -0
  155. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +379 -0
  156. lyrics_transcriber/frontend/src/components/shared/components/SourceSelector.tsx +56 -0
  157. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +87 -0
  158. lyrics_transcriber/frontend/src/components/shared/constants.ts +20 -0
  159. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +180 -0
  160. lyrics_transcriber/frontend/src/components/shared/styles.ts +13 -0
  161. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  162. lyrics_transcriber/frontend/src/components/shared/types.ts +129 -0
  163. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +177 -0
  164. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  165. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +75 -0
  166. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +360 -0
  167. lyrics_transcriber/frontend/src/components/shared/utils/timingUtils.ts +110 -0
  168. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  169. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +435 -0
  170. lyrics_transcriber/frontend/src/main.tsx +17 -0
  171. lyrics_transcriber/frontend/src/theme.ts +177 -0
  172. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  173. lyrics_transcriber/frontend/src/types.js +2 -0
  174. lyrics_transcriber/frontend/src/types.ts +199 -0
  175. lyrics_transcriber/frontend/src/validation.ts +132 -0
  176. lyrics_transcriber/frontend/src/vite-env.d.ts +1 -0
  177. lyrics_transcriber/frontend/tsconfig.app.json +26 -0
  178. lyrics_transcriber/frontend/tsconfig.json +25 -0
  179. lyrics_transcriber/frontend/tsconfig.node.json +23 -0
  180. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -0
  181. lyrics_transcriber/frontend/update_version.js +11 -0
  182. lyrics_transcriber/frontend/vite.config.d.ts +2 -0
  183. lyrics_transcriber/frontend/vite.config.js +10 -0
  184. lyrics_transcriber/frontend/vite.config.ts +11 -0
  185. lyrics_transcriber/frontend/web_assets/android-chrome-192x192.png +0 -0
  186. lyrics_transcriber/frontend/web_assets/android-chrome-512x512.png +0 -0
  187. lyrics_transcriber/frontend/web_assets/apple-touch-icon.png +0 -0
  188. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js +42039 -0
  189. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +1 -0
  190. lyrics_transcriber/frontend/web_assets/favicon-16x16.png +0 -0
  191. lyrics_transcriber/frontend/web_assets/favicon-32x32.png +0 -0
  192. lyrics_transcriber/frontend/web_assets/favicon.ico +0 -0
  193. lyrics_transcriber/frontend/web_assets/index.html +18 -0
  194. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.png +0 -0
  195. lyrics_transcriber/frontend/yarn.lock +3752 -0
  196. lyrics_transcriber/lyrics/__init__.py +0 -0
  197. lyrics_transcriber/lyrics/base_lyrics_provider.py +211 -0
  198. lyrics_transcriber/lyrics/file_provider.py +95 -0
  199. lyrics_transcriber/lyrics/genius.py +384 -0
  200. lyrics_transcriber/lyrics/lrclib.py +231 -0
  201. lyrics_transcriber/lyrics/musixmatch.py +156 -0
  202. lyrics_transcriber/lyrics/spotify.py +290 -0
  203. lyrics_transcriber/lyrics/user_input_provider.py +44 -0
  204. lyrics_transcriber/output/__init__.py +0 -0
  205. lyrics_transcriber/output/ass/__init__.py +21 -0
  206. lyrics_transcriber/output/ass/ass.py +2088 -0
  207. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  208. lyrics_transcriber/output/ass/config.py +180 -0
  209. lyrics_transcriber/output/ass/constants.py +23 -0
  210. lyrics_transcriber/output/ass/event.py +94 -0
  211. lyrics_transcriber/output/ass/formatters.py +132 -0
  212. lyrics_transcriber/output/ass/lyrics_line.py +265 -0
  213. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  214. lyrics_transcriber/output/ass/section_detector.py +89 -0
  215. lyrics_transcriber/output/ass/section_screen.py +106 -0
  216. lyrics_transcriber/output/ass/style.py +187 -0
  217. lyrics_transcriber/output/cdg.py +619 -0
  218. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  219. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  220. lyrics_transcriber/output/cdgmaker/composer.py +2260 -0
  221. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  222. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  223. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  224. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  225. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  226. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  227. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  228. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  229. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  230. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  231. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  232. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  233. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  234. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  235. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  236. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  237. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  238. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  239. lyrics_transcriber/output/countdown_processor.py +267 -0
  240. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  241. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  242. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  243. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  244. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  245. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  246. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  247. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  248. lyrics_transcriber/output/generator.py +257 -0
  249. lyrics_transcriber/output/lrc_to_cdg.py +61 -0
  250. lyrics_transcriber/output/lyrics_file.py +102 -0
  251. lyrics_transcriber/output/plain_text.py +96 -0
  252. lyrics_transcriber/output/segment_resizer.py +431 -0
  253. lyrics_transcriber/output/subtitles.py +397 -0
  254. lyrics_transcriber/output/video.py +544 -0
  255. lyrics_transcriber/review/__init__.py +0 -0
  256. lyrics_transcriber/review/server.py +676 -0
  257. lyrics_transcriber/storage/__init__.py +0 -0
  258. lyrics_transcriber/storage/dropbox.py +225 -0
  259. lyrics_transcriber/transcribers/__init__.py +0 -0
  260. lyrics_transcriber/transcribers/audioshake.py +290 -0
  261. lyrics_transcriber/transcribers/base_transcriber.py +157 -0
  262. lyrics_transcriber/transcribers/whisper.py +330 -0
  263. lyrics_transcriber/types.py +648 -0
  264. lyrics_transcriber/utils/__init__.py +0 -0
  265. lyrics_transcriber/utils/word_utils.py +27 -0
  266. karaoke_gen-0.57.0.dist-info/METADATA +0 -167
  267. karaoke_gen-0.57.0.dist-info/RECORD +0 -23
  268. {karaoke_gen-0.57.0.dist-info → karaoke_gen-0.71.23.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,165 @@
1
+ """
2
+ Video rendering pipeline stage.
3
+
4
+ This stage handles:
5
+ - Rendering karaoke video with synchronized lyrics
6
+ - Using the OutputGenerator from lyrics_transcriber
7
+ - Combining audio, video, and synchronized lyrics
8
+
9
+ This stage runs after transcription is complete and corrections
10
+ have been applied.
11
+ """
12
+ import logging
13
+ import os
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from karaoke_gen.pipeline.base import PipelineStage, StageResult, StageStatus
17
+ from karaoke_gen.pipeline.context import PipelineContext
18
+
19
+
20
class RenderStage(PipelineStage):
    """
    Video rendering stage.

    Renders the karaoke video with synchronized lyrics overlay.
    Uses OutputGenerator from lyrics_transcriber. The stage reads the
    ``corrections_result`` entry of the ``transcription`` stage output and
    publishes the paths of the rendered video and the LRC/ASS files.
    """

    # Keys of the "karaoke" style section that are copied onto the output
    # config when they carry a truthy value. Add more style mappings here
    # as needed.
    _KARAOKE_STYLE_KEYS = ("background_image", "font_path")

    def __init__(
        self,
        logger: Optional[logging.Logger] = None,
        render_bounding_boxes: bool = False,
    ):
        """
        Initialize the render stage.

        Args:
            logger: Logger instance; falls back to this module's logger.
            render_bounding_boxes: If True, render debug bounding boxes
        """
        self.logger = logger or logging.getLogger(__name__)
        # NOTE(review): this flag is stored but never read by this class;
        # confirm whether it should be forwarded to the output config.
        self.render_bounding_boxes = render_bounding_boxes

    @property
    def name(self) -> str:
        """Stage identifier, also used as the label for progress updates."""
        return "render"

    @property
    def required_inputs(self) -> List[str]:
        """Names of stage outputs this stage requires."""
        # Requires transcription output
        return ["transcription"]

    @property
    def optional_inputs(self) -> List[str]:
        """Names of stage outputs this stage declares as optional."""
        return ["separation"]

    @property
    def output_keys(self) -> List[str]:
        """Keys this stage may place in its ``StageResult.outputs``."""
        return [
            "with_vocals_video_path",  # Path to rendered video with vocals
            "lrc_path",  # Path to LRC file
            "ass_path",  # Path to ASS subtitle file
        ]

    async def execute(self, context: PipelineContext) -> StageResult:
        """
        Execute video rendering.

        Args:
            context: Pipeline context with transcription outputs

        Returns:
            StageResult with rendered video path. The status is SKIPPED when
            no corrections result is available, COMPLETED after the generator
            has run, and FAILED (with the exception type recorded in
            ``error_details``) when any exception is raised.
        """
        import time

        start_time = time.time()

        try:
            context.update_progress(self.name, 0, "Starting video rendering")
            context.log("INFO", f"Rendering video for: {context.artist} - {context.title}")

            # Get transcription outputs. `or {}` (instead of a .get default)
            # also covers an entry that is present but None, so that case is
            # reported as SKIPPED below rather than failing with an
            # AttributeError.
            transcription = context.stage_outputs.get("transcription") or {}
            corrections_result = transcription.get("corrections_result")

            if not corrections_result:
                context.log("WARNING", "No corrections result available for rendering")
                return StageResult(
                    status=StageStatus.SKIPPED,
                    outputs={},
                    error_message="No corrections result available",
                )

            # Import OutputGenerator
            from lyrics_transcriber import OutputGenerator, OutputConfig

            # Build output config
            output_config = OutputConfig(
                output_dir=context.output_dir,
                cache_dir=os.path.join(context.output_dir, "cache"),
                video_resolution="4k",  # Default to 4K
            )

            # Apply style params if available
            if context.style_params:
                output_config = self._apply_style_params(output_config, context.style_params)

            context.update_progress(self.name, 20, "Initializing video generator")

            # Create OutputGenerator
            generator = OutputGenerator(
                config=output_config,
                logger=self.logger,
            )

            context.update_progress(self.name, 40, "Rendering video with lyrics")

            # Generate video
            result = generator.generate_video(
                result=corrections_result,
                output_prefix=context.base_name,
                audio_file=context.input_audio_path,
            )

            outputs = {}

            # A falsy generator result leaves `outputs` empty; the stage is
            # still reported as COMPLETED in that case.
            if result:
                outputs["with_vocals_video_path"] = result.get("video_path")
                outputs["lrc_path"] = result.get("lrc_path")
                outputs["ass_path"] = result.get("ass_path")

            context.update_progress(self.name, 100, "Video rendering complete")

            duration = time.time() - start_time
            context.log("INFO", f"Video rendering completed in {duration:.1f}s")

            return StageResult(
                status=StageStatus.COMPLETED,
                outputs=outputs,
                duration_seconds=duration,
            )

        except Exception as e:
            # Stage boundary: convert any failure into a FAILED result
            # instead of propagating the exception to the caller.
            duration = time.time() - start_time
            context.log("ERROR", f"Video rendering failed: {str(e)}")
            return StageResult(
                status=StageStatus.FAILED,
                error_message=str(e),
                error_details={"exception_type": type(e).__name__},
                duration_seconds=duration,
            )

    def _apply_style_params(self, config, style_params: Dict[str, Any]):
        """
        Apply style parameters to output config.

        Args:
            config: Output config object; mutated in place.
            style_params: Style parameters; only the "karaoke" section is read.

        Returns:
            The same ``config`` object that was passed in.
        """
        # Apply karaoke style settings if present. `or {}` treats a section
        # that is present but None the same as a missing one.
        karaoke_params = style_params.get("karaoke") or {}

        for key in self._KARAOKE_STYLE_KEYS:
            value = karaoke_params.get(key)
            if value:
                setattr(config, key, value)

        return config
@@ -0,0 +1,139 @@
1
+ """
2
+ Title and end screen generation pipeline stage.
3
+
4
+ This stage handles the generation of:
5
+ - Title screen video (intro)
6
+ - End screen video (outro)
7
+ - Corresponding PNG/JPG images
8
+
9
+ These are generated using the video_generator module.
10
+ """
11
+ import logging
12
+ import os
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ from karaoke_gen.pipeline.base import PipelineStage, StageResult, StageStatus
16
+ from karaoke_gen.pipeline.context import PipelineContext
17
+
18
+
19
class ScreensStage(PipelineStage):
    """
    Pipeline stage that produces the title and end screens.

    Both screens are generated through ``VideoGenerator`` using the artist,
    title, output directory and style parameters taken from the pipeline
    context.
    """

    def __init__(
        self,
        output_png: bool = True,
        output_jpg: bool = True,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Set up the screens stage.

        Args:
            output_png: Generate PNG format images
            output_jpg: Generate JPG format images
            logger: Logger instance; falls back to this module's logger
        """
        self.logger = logger or logging.getLogger(__name__)
        self.output_png = output_png
        self.output_jpg = output_jpg

    @property
    def name(self) -> str:
        """Stage identifier, also used as the label for progress updates."""
        return "screens"

    @property
    def required_inputs(self) -> List[str]:
        """This stage declares no required inputs from other stages."""
        return []

    @property
    def output_keys(self) -> List[str]:
        """Keys this stage may place in its ``StageResult.outputs``."""
        title_keys = [
            "title_video_path",  # Path to title screen video
            "title_png_path",  # Path to title screen PNG
            "title_jpg_path",  # Path to title screen JPG
        ]
        end_keys = [
            "end_video_path",  # Path to end screen video
            "end_png_path",  # Path to end screen PNG
            "end_jpg_path",  # Path to end screen JPG
        ]
        return title_keys + end_keys

    @staticmethod
    def _paths_for(prefix: str, screen: Dict[str, Any]) -> Dict[str, Any]:
        """Map a generator result onto ``<prefix>_{video,png,jpg}_path`` keys."""
        return {
            f"{prefix}_{kind}_path": screen.get(f"{kind}_path")
            for kind in ("video", "png", "jpg")
        }

    async def execute(self, context: PipelineContext) -> StageResult:
        """
        Generate the title screen, then the end screen.

        Args:
            context: Pipeline context with style params

        Returns:
            StageResult with screen file paths on success, or a FAILED
            result carrying the exception message and type on any error.
        """
        import time

        began = time.time()

        try:
            context.update_progress(self.name, 0, "Generating title and end screens")
            context.log("INFO", f"Generating screens for: {context.artist} - {context.title}")

            from karaoke_gen.video_generator import VideoGenerator

            screen_maker = VideoGenerator(
                artist=context.artist,
                title=context.title,
                output_dir=context.output_dir,
                style_params=context.style_params,
                logger=self.logger,
            )

            collected = {}
            image_options = {
                "output_png": self.output_png,
                "output_jpg": self.output_jpg,
            }

            # (output key prefix, progress percent, progress message, generator method)
            plan = (
                ("title", 25, "Generating title screen", "generate_title_screen"),
                ("end", 75, "Generating end screen", "generate_end_screen"),
            )
            for prefix, percent, message, method_name in plan:
                context.update_progress(self.name, percent, message)
                # Look the method up only now, so each step happens in the
                # same order as a straight-line sequence of calls would.
                screen = getattr(screen_maker, method_name)(**image_options)
                if screen:
                    collected.update(self._paths_for(prefix, screen))

            context.update_progress(self.name, 100, "Screen generation complete")

            elapsed = time.time() - began
            context.log("INFO", f"Screen generation completed in {elapsed:.1f}s")

            return StageResult(
                status=StageStatus.COMPLETED,
                outputs=collected,
                duration_seconds=elapsed,
            )

        except Exception as exc:
            elapsed = time.time() - began
            context.log("ERROR", f"Screen generation failed: {str(exc)}")
            return StageResult(
                status=StageStatus.FAILED,
                error_message=str(exc),
                error_details={"exception_type": type(exc).__name__},
                duration_seconds=elapsed,
            )
@@ -0,0 +1,191 @@
1
+ """
2
+ Audio separation pipeline stage.
3
+
4
+ This stage handles the separation of audio into stems:
5
+ - Clean instrumental (vocals removed)
6
+ - Vocals
7
+ - Backing vocals and lead vocals (optional)
8
+ - Other stems (drums, bass, guitar, etc.)
9
+ - Combined instrumental with backing vocals
10
+
11
+ The stage delegates to AudioProcessor but provides a consistent
12
+ pipeline interface.
13
+ """
14
+ import logging
15
+ import os
16
+ from typing import Any, Dict, List, Optional
17
+
18
+ from karaoke_gen.pipeline.base import PipelineStage, StageResult, StageStatus
19
+ from karaoke_gen.pipeline.context import PipelineContext
20
+
21
+
22
class SeparationStage(PipelineStage):
    """
    Audio separation stage.

    Separates audio into stems using configured models. The actual work is
    delegated to ``AudioProcessor``; this class only adapts it to the
    pipeline stage interface. (The original author notes that both local
    processing and a remote API are supported; that choice is not made in
    this class.)
    """

    # ffmpeg invocation prefix handed to every AudioProcessor this stage builds.
    _FFMPEG_BASE_COMMAND = "ffmpeg -y -hide_banner -nostats -loglevel error"

    def __init__(
        self,
        model_file_dir: str = "/tmp/audio-separator-models/",
        lossless_output_format: str = "flac",
        clean_instrumental_model: str = "model_bs_roformer_ep_317_sdr_12.9755.ckpt",
        backing_vocals_models: Optional[List[str]] = None,
        other_stems_models: Optional[List[str]] = None,
        logger: Optional[logging.Logger] = None,
        log_level: int = logging.INFO,
        skip_separation: bool = False,
    ):
        """
        Initialize the separation stage.

        Args:
            model_file_dir: Directory for model files
            lossless_output_format: Output format (flac, wav, etc.)
            clean_instrumental_model: Model for clean instrumental separation
            backing_vocals_models: Models for backing vocals separation;
                a falsy value selects the built-in default list
            other_stems_models: Models for other stems (drums, bass, etc.);
                a falsy value selects the built-in default list
            logger: Logger instance; falls back to this module's logger
            log_level: Logging level
            skip_separation: If True, skip separation (for testing)
        """
        self.model_file_dir = model_file_dir
        self.lossless_output_format = lossless_output_format
        self.clean_instrumental_model = clean_instrumental_model
        self.backing_vocals_models = backing_vocals_models or [
            "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt"
        ]
        self.other_stems_models = other_stems_models or ["htdemucs_6s.yaml"]
        self.logger = logger or logging.getLogger(__name__)
        self.log_level = log_level
        self.skip_separation = skip_separation

    @property
    def name(self) -> str:
        """Stage identifier, also used as the label for progress updates."""
        return "separation"

    @property
    def required_inputs(self) -> List[str]:
        """Names of stage outputs this stage requires (none)."""
        # No required inputs from other stages - uses context.input_audio_path
        return []

    @property
    def output_keys(self) -> List[str]:
        """Keys this stage declares for its ``StageResult.outputs``."""
        return [
            "clean_instrumental",  # Dict with 'instrumental' and 'vocals' paths
            "other_stems",  # Dict mapping model -> stems dict
            "backing_vocals",  # Dict mapping model -> backing/lead vocals
            "combined_instrumentals",  # Dict mapping model -> combined path
        ]

    def _create_processor(self):
        """Build an AudioProcessor configured from this stage's settings."""
        # Imported lazily so the dependency is only loaded when a processor
        # is actually needed (e.g. not when separation is skipped).
        from karaoke_gen.audio_processor import AudioProcessor

        return AudioProcessor(
            logger=self.logger,
            log_level=self.log_level,
            log_formatter=None,
            model_file_dir=self.model_file_dir,
            lossless_output_format=self.lossless_output_format,
            clean_instrumental_model=self.clean_instrumental_model,
            backing_vocals_models=self.backing_vocals_models,
            other_stems_models=self.other_stems_models,
            ffmpeg_base_command=self._FFMPEG_BASE_COMMAND,
        )

    async def execute(self, context: PipelineContext) -> StageResult:
        """
        Execute audio separation.

        Args:
            context: Pipeline context with input audio path

        Returns:
            StageResult with separated stem paths. The status is SKIPPED when
            ``skip_separation`` is set, COMPLETED with the processor's result
            as outputs otherwise, and FAILED (with the exception type in
            ``error_details``) when any exception is raised.
        """
        import time

        start_time = time.time()

        try:
            context.update_progress(self.name, 0, "Starting audio separation")
            context.log("INFO", f"Separating audio: {context.input_audio_path}")

            if self.skip_separation:
                context.log("INFO", "Skipping audio separation (skip_separation=True)")
                return StageResult(
                    status=StageStatus.SKIPPED,
                    outputs={},
                )

            # Create AudioProcessor instance
            processor = self._create_processor()

            context.update_progress(self.name, 10, "Processing audio separation")

            # Run the separation
            result = processor.process_audio_separation(
                audio_file=context.input_audio_path,
                artist_title=context.base_name,
                track_output_dir=context.output_dir,
            )

            # NOTE(review): progress is reported as 90, not 100, on
            # completion - confirm that something else finalises it.
            context.update_progress(self.name, 90, "Audio separation complete")

            duration = time.time() - start_time
            context.log("INFO", f"Audio separation completed in {duration:.1f}s")

            return StageResult(
                status=StageStatus.COMPLETED,
                outputs=result,
                duration_seconds=duration,
            )

        except Exception as e:
            # Stage boundary: convert any failure into a FAILED result
            # instead of propagating the exception to the caller.
            duration = time.time() - start_time
            context.log("ERROR", f"Audio separation failed: {str(e)}")
            return StageResult(
                status=StageStatus.FAILED,
                error_message=str(e),
                error_details={"exception_type": type(e).__name__},
                duration_seconds=duration,
            )

    def apply_countdown_padding(
        self,
        context: PipelineContext,
        separation_result: Dict[str, Any],
        padding_seconds: float,
    ) -> Dict[str, Any]:
        """
        Apply countdown padding to instrumental files.

        This is called after transcription determines the padding amount
        needed to synchronize with padded vocals. Unlike ``execute``, this
        method does not catch exceptions; errors raised by the processor
        propagate to the caller.

        Args:
            context: Pipeline context
            separation_result: Original separation result
            padding_seconds: Amount of padding to apply

        Returns:
            Updated separation result with padded file paths
        """
        processor = self._create_processor()

        return processor.apply_countdown_padding_to_instrumentals(
            separation_result=separation_result,
            padding_seconds=padding_seconds,
            artist_title=context.base_name,
            track_output_dir=context.output_dir,
        )
@@ -0,0 +1,191 @@
1
+ """
2
+ Lyrics transcription pipeline stage.
3
+
4
+ This stage handles:
5
+ - Transcription of lyrics from audio (using AudioShake API)
6
+ - Fetching lyrics from online sources (Genius, Spotify, etc.)
7
+ - Synchronization of lyrics with audio timing
8
+ - Generation of LRC, ASS, and corrected text files
9
+
10
+ Note: Video rendering is handled by the RenderStage, not here.
11
+ """
12
+ import logging
13
+ import os
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from karaoke_gen.pipeline.base import PipelineStage, StageResult, StageStatus
17
+ from karaoke_gen.pipeline.context import PipelineContext
18
+
19
+
20
class TranscriptionStage(PipelineStage):
    """
    Lyrics transcription stage.

    Transcribes and synchronizes lyrics from audio.
    Does NOT render video - that's handled by RenderStage.
    """

    # Keys copied verbatim from the LyricsProcessor result into the stage
    # outputs whenever they carry a truthy value.
    _RESULT_PATH_KEYS = ("lrc_filepath", "ass_filepath", "corrected_txt_path")

    def __init__(
        self,
        style_params_json: Optional[str] = None,
        lyrics_file: Optional[str] = None,
        skip_transcription: bool = False,
        skip_transcription_review: bool = False,
        subtitle_offset_ms: int = 0,
        lyrics_artist: Optional[str] = None,
        lyrics_title: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the transcription stage.

        Args:
            style_params_json: Path to style parameters JSON file
            lyrics_file: Path to existing lyrics file (optional)
            skip_transcription: If True, skip automatic transcription
            skip_transcription_review: If True, skip interactive review
            subtitle_offset_ms: Offset for subtitle timing in milliseconds
            lyrics_artist: Override artist name for lyrics search
            lyrics_title: Override title for lyrics search
            logger: Logger instance; defaults to this module's logger
        """
        self.style_params_json = style_params_json
        self.lyrics_file = lyrics_file
        self.skip_transcription = skip_transcription
        self.skip_transcription_review = skip_transcription_review
        self.subtitle_offset_ms = subtitle_offset_ms
        self.lyrics_artist = lyrics_artist
        self.lyrics_title = lyrics_title
        self.logger = logger or logging.getLogger(__name__)

    @property
    def name(self) -> str:
        """Stage identifier, also used as the progress-reporting label."""
        return "transcription"

    @property
    def required_inputs(self) -> List[str]:
        """Outputs of other stages that must exist before this stage runs."""
        # No required inputs from other stages - uses context.input_audio_path
        return []

    @property
    def optional_inputs(self) -> List[str]:
        """Outputs of other stages this stage can use when present."""
        # Can use separation output for vocals-only transcription
        return ["separation"]

    @property
    def output_keys(self) -> List[str]:
        """Keys that may appear in the ``outputs`` of a completed result."""
        return [
            "lrc_filepath",  # Path to LRC lyrics file
            "ass_filepath",  # Path to ASS subtitle file
            "corrected_txt_path",  # Path to corrected text file
            "corrections_result",  # Full corrections JSON data
            "countdown_padding_seconds",  # Countdown padding applied (if any)
        ]

    async def execute(self, context: PipelineContext) -> StageResult:
        """
        Execute lyrics transcription.

        Any exception raised while transcribing is caught, logged through the
        context and reported as a FAILED result instead of propagating.

        Args:
            context: Pipeline context with input audio path

        Returns:
            StageResult with lyrics file paths (COMPLETED), an empty SKIPPED
            result when ``skip_transcription`` is set, or a FAILED result
            carrying the error message and exception type
        """
        import time
        start_time = time.time()

        try:
            context.update_progress(self.name, 0, "Starting lyrics transcription")
            context.log("INFO", f"Transcribing lyrics for: {context.artist} - {context.title}")

            if self.skip_transcription:
                context.log("INFO", "Skipping transcription (skip_transcription=True)")
                return StageResult(
                    status=StageStatus.SKIPPED,
                    outputs={},
                )

            style_params_json = self._resolve_style_params_json(context)

            # Create LyricsProcessor instance
            # Note: render_video=False because we handle rendering in RenderStage
            from karaoke_gen.lyrics_processor import LyricsProcessor

            processor = LyricsProcessor(
                logger=self.logger,
                style_params_json=style_params_json,
                lyrics_file=self.lyrics_file,
                skip_transcription=self.skip_transcription,
                skip_transcription_review=self.skip_transcription_review,
                render_video=False,  # Don't render video here
                subtitle_offset_ms=self.subtitle_offset_ms,
            )

            context.update_progress(self.name, 20, "Running transcription")

            # NOTE: this is a plain synchronous call inside the coroutine.
            result = processor.transcribe_lyrics(
                input_audio_wav=context.input_audio_path,
                artist=context.artist,
                title=context.title,
                track_output_dir=context.output_dir,
                lyrics_artist=self.lyrics_artist or context.artist,
                lyrics_title=self.lyrics_title or context.title,
            )

            # Build output dictionary from whichever paths were produced
            outputs: Dict[str, Any] = {}
            for key in self._RESULT_PATH_KEYS:
                if result.get(key):
                    outputs[key] = result[key]

            # Get corrections data if available
            if hasattr(processor, "corrections_result"):
                outputs["corrections_result"] = processor.corrections_result

            # Check for countdown padding
            padding_seconds = self._read_countdown_padding(context)
            if padding_seconds is not None:
                outputs["countdown_padding_seconds"] = padding_seconds

            context.update_progress(self.name, 100, "Transcription complete")

            duration = time.time() - start_time
            context.log("INFO", f"Lyrics transcription completed in {duration:.1f}s")

            return StageResult(
                status=StageStatus.COMPLETED,
                outputs=outputs,
                duration_seconds=duration,
            )

        except Exception as e:
            duration = time.time() - start_time
            context.log("ERROR", f"Lyrics transcription failed: {str(e)}")
            return StageResult(
                status=StageStatus.FAILED,
                error_message=str(e),
                error_details={"exception_type": type(e).__name__},
                duration_seconds=duration,
            )

    def _resolve_style_params_json(self, context: PipelineContext) -> Optional[str]:
        """Return the style-params JSON path, materialising context.style_params if needed.

        The path given to the constructor wins. Otherwise, when the context
        carries style parameters, they are dumped to a temporary ``.json``
        file that is registered with the context via ``add_temp_path``.
        Returns the falsy constructor value unchanged when neither is set.
        """
        if self.style_params_json or not context.style_params:
            return self.style_params_json

        import json
        import tempfile

        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            # Register before writing so the file is still tracked for
            # cleanup if serialising the style parameters fails.
            context.add_temp_path(f.name)
            json.dump(context.style_params, f)
            return f.name

    def _read_countdown_padding(self, context: PipelineContext) -> Optional[float]:
        """Read ``lyrics/countdown_padding_seconds.txt`` from the output dir, if present.

        Returns the parsed number of seconds, or None when the file does not
        exist or does not contain a valid float. An unparseable file is
        logged as a warning rather than silently ignored, because downstream
        padding of the instrumentals depends on this value.
        """
        lyrics_dir = os.path.join(context.output_dir, "lyrics")
        countdown_file = os.path.join(lyrics_dir, "countdown_padding_seconds.txt")
        if not os.path.exists(countdown_file):
            return None

        with open(countdown_file, "r", encoding="utf-8") as f:
            raw_value = f.read().strip()

        try:
            return float(raw_value)
        except ValueError:
            self.logger.warning(
                "Ignoring invalid countdown padding value %r in %s",
                raw_value,
                countdown_file,
            )
            return None