karaoke-gen 0.76.20__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. karaoke_gen/instrumental_review/static/index.html +179 -16
  2. karaoke_gen/karaoke_gen.py +5 -4
  3. karaoke_gen/lyrics_processor.py +25 -6
  4. {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/METADATA +79 -3
  5. {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/RECORD +33 -31
  6. lyrics_transcriber/core/config.py +8 -0
  7. lyrics_transcriber/core/controller.py +43 -1
  8. lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +178 -5
  9. lyrics_transcriber/correction/agentic/prompts/__init__.py +23 -0
  10. lyrics_transcriber/correction/agentic/prompts/classifier.py +66 -6
  11. lyrics_transcriber/correction/agentic/prompts/langfuse_prompts.py +298 -0
  12. lyrics_transcriber/correction/agentic/providers/config.py +7 -0
  13. lyrics_transcriber/correction/agentic/providers/constants.py +1 -1
  14. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +22 -7
  15. lyrics_transcriber/correction/agentic/providers/model_factory.py +28 -13
  16. lyrics_transcriber/correction/agentic/router.py +18 -13
  17. lyrics_transcriber/correction/corrector.py +1 -45
  18. lyrics_transcriber/frontend/.gitignore +1 -0
  19. lyrics_transcriber/frontend/e2e/agentic-corrections.spec.ts +207 -0
  20. lyrics_transcriber/frontend/e2e/fixtures/agentic-correction-data.json +226 -0
  21. lyrics_transcriber/frontend/package.json +4 -1
  22. lyrics_transcriber/frontend/playwright.config.ts +1 -1
  23. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +34 -30
  24. lyrics_transcriber/frontend/src/components/Header.tsx +141 -34
  25. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +120 -3
  26. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +11 -1
  27. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +122 -35
  28. lyrics_transcriber/frontend/src/components/shared/types.ts +6 -0
  29. lyrics_transcriber/output/generator.py +50 -3
  30. lyrics_transcriber/transcribers/local_whisper.py +260 -0
  31. lyrics_transcriber/correction/handlers/llm.py +0 -293
  32. lyrics_transcriber/correction/handlers/llm_providers.py +0 -60
  33. {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/WHEEL +0 -0
  34. {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/entry_points.txt +0 -0
  35. {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/licenses/LICENSE +0 -0
@@ -60,8 +60,16 @@
60
60
  .logo {
61
61
  font-size: 1.25rem;
62
62
  font-weight: 600;
63
+ display: inline-flex;
64
+ align-items: center;
65
+ gap: 8px;
63
66
  }
64
-
67
+
68
+ .logo-img {
69
+ height: 40px;
70
+ width: auto;
71
+ }
72
+
65
73
  .track-info {
66
74
  font-size: 0.9rem;
67
75
  color: var(--text-muted);
@@ -568,6 +576,143 @@
568
576
  font-size: 1.5rem;
569
577
  color: var(--success);
570
578
  }
579
+
580
+ /* Mobile responsiveness */
581
+ @media (max-width: 768px) {
582
+ .app {
583
+ padding: 12px;
584
+ gap: 8px;
585
+ height: auto;
586
+ min-height: 100vh;
587
+ overflow-y: auto;
588
+ }
589
+
590
+ body {
591
+ overflow: auto;
592
+ }
593
+
594
+ .header {
595
+ flex-direction: column;
596
+ align-items: flex-start;
597
+ gap: 8px;
598
+ }
599
+
600
+ .header-left {
601
+ width: 100%;
602
+ }
603
+
604
+ .header-right {
605
+ width: 100%;
606
+ justify-content: flex-start;
607
+ flex-wrap: wrap;
608
+ }
609
+
610
+ .logo {
611
+ font-size: 1rem;
612
+ }
613
+
614
+ .logo-img {
615
+ height: 32px;
616
+ }
617
+
618
+ .waveform-player {
619
+ flex: none;
620
+ min-height: 200px;
621
+ }
622
+
623
+ .waveform-toolbar {
624
+ flex-wrap: wrap;
625
+ padding: 8px 12px;
626
+ gap: 8px;
627
+ }
628
+
629
+ .toolbar-left,
630
+ .toolbar-center,
631
+ .toolbar-right {
632
+ flex-wrap: wrap;
633
+ }
634
+
635
+ .audio-toggle-group {
636
+ order: 10;
637
+ width: 100%;
638
+ justify-content: center;
639
+ }
640
+
641
+ .bottom-section {
642
+ flex-direction: column;
643
+ }
644
+
645
+ .mute-panel {
646
+ max-height: none;
647
+ }
648
+
649
+ .selection-panel {
650
+ width: 100%;
651
+ }
652
+
653
+ .selection-option {
654
+ padding: 12px;
655
+ }
656
+
657
+ .btn {
658
+ min-height: 44px;
659
+ padding: 8px 12px;
660
+ }
661
+
662
+ .btn-icon {
663
+ width: 44px;
664
+ height: 44px;
665
+ }
666
+
667
+ .audio-toggle {
668
+ padding: 8px 12px;
669
+ min-height: 40px;
670
+ }
671
+
672
+ .zoom-btn {
673
+ width: 40px;
674
+ height: 40px;
675
+ }
676
+
677
+ .time-display {
678
+ font-size: 0.9rem;
679
+ }
680
+ }
681
+
682
+ @media (max-width: 480px) {
683
+ .app {
684
+ padding: 8px;
685
+ }
686
+
687
+ .header-left {
688
+ flex-wrap: wrap;
689
+ }
690
+
691
+ .track-info {
692
+ width: 100%;
693
+ margin-top: 4px;
694
+ }
695
+
696
+ .waveform-toolbar {
697
+ padding: 6px 8px;
698
+ }
699
+
700
+ .toolbar-center {
701
+ width: 100%;
702
+ justify-content: center;
703
+ order: -1;
704
+ }
705
+
706
+ .toolbar-left {
707
+ order: 1;
708
+ }
709
+
710
+ .toolbar-right {
711
+ order: 2;
712
+ width: 100%;
713
+ justify-content: space-between;
714
+ }
715
+ }
571
716
  </style>
572
717
  </head>
573
718
  <body>
@@ -641,7 +786,8 @@
641
786
 
642
787
  if (waveformRes.ok) {
643
788
  waveformData = await waveformRes.json();
644
- duration = waveformData.duration;
789
+ // API returns duration_seconds, not duration
790
+ duration = waveformData.duration_seconds || 0;
645
791
  }
646
792
 
647
793
  // Set initial selection based on recommendation
@@ -679,7 +825,7 @@
679
825
  app.innerHTML = `
680
826
  <div class="header">
681
827
  <div class="header-left">
682
- <span class="logo">🎤 Instrumental Review</span>
828
+ <span class="logo"><img src="https://gen.nomadkaraoke.com/nomad-karaoke-logo.svg" alt="Nomad Karaoke" class="logo-img" onerror="this.style.display='none'"> Instrumental Review</span>
683
829
  <span class="track-info">${escapeHtml(analysisData.artist) || ''} ${analysisData.artist && analysisData.title ? '–' : ''} ${escapeHtml(analysisData.title) || ''}</span>
684
830
  </div>
685
831
  <div class="header-right">
@@ -969,8 +1115,14 @@
969
1115
  canvas.onmousedown = (e) => {
970
1116
  const rect = canvas.getBoundingClientRect();
971
1117
  const x = e.clientX - rect.left;
1118
+
1119
+ // Guard against invalid duration
1120
+ if (!Number.isFinite(duration) || duration <= 0 || !Number.isFinite(rect.width) || rect.width <= 0) {
1121
+ return;
1122
+ }
1123
+
972
1124
  const time = (x / rect.width) * duration;
973
-
1125
+
974
1126
  // Shift+drag to select mute region
975
1127
  if (e.shiftKey) {
976
1128
  isDragging = true;
@@ -993,18 +1145,26 @@
993
1145
 
994
1146
  const endDrag = (e) => {
995
1147
  if (!isDragging) return;
996
-
1148
+
997
1149
  const rect = canvas.getBoundingClientRect();
998
1150
  const x = e.clientX - rect.left;
1151
+
1152
+ // Guard against invalid duration
1153
+ if (!Number.isFinite(duration) || duration <= 0 || !Number.isFinite(rect.width) || rect.width <= 0) {
1154
+ isDragging = false;
1155
+ showSelectionOverlay(false);
1156
+ return;
1157
+ }
1158
+
999
1159
  const time = (x / rect.width) * duration;
1000
-
1160
+
1001
1161
  const start = Math.min(dragStartTime, time);
1002
1162
  const end = Math.max(dragStartTime, time);
1003
-
1163
+
1004
1164
  if (end - start > 0.5) {
1005
1165
  addRegion(start, end);
1006
1166
  }
1007
-
1167
+
1008
1168
  isDragging = false;
1009
1169
  showSelectionOverlay(false);
1010
1170
  };
@@ -1090,14 +1250,15 @@
1090
1250
 
1091
1251
  function seekTo(time, autoPlay = true) {
1092
1252
  const audio = document.getElementById('audio-player');
1093
- if (audio) {
1094
- audio.currentTime = time;
1095
- currentTime = time;
1096
- updatePlayhead();
1097
- // Auto-play when seeking via click (if not already playing)
1098
- if (autoPlay && !isPlaying) {
1099
- audio.play();
1100
- }
1253
+ // Guard against non-finite time values (NaN, Infinity)
1254
+ if (!audio || !Number.isFinite(time)) return;
1255
+
1256
+ audio.currentTime = time;
1257
+ currentTime = time;
1258
+ updatePlayhead();
1259
+ // Auto-play when seeking via click (if not already playing)
1260
+ if (autoPlay && !isPlaying) {
1261
+ audio.play();
1101
1262
  }
1102
1263
  }
1103
1264
 
@@ -1155,6 +1316,8 @@
1155
1316
  }
1156
1317
 
1157
1318
  function formatTime(seconds) {
1319
+ // Guard against NaN/Infinity
1320
+ if (!Number.isFinite(seconds)) return '0:00';
1158
1321
  const mins = Math.floor(seconds / 60);
1159
1322
  const secs = Math.floor(seconds % 60);
1160
1323
  return `${mins}:${secs.toString().padStart(2, '0')}`;
@@ -796,21 +796,22 @@ class KaraokePrep:
796
796
 
797
797
  outputs = output_generator.generate_outputs(
798
798
  transcription_corrected=correction_result,
799
+ lyrics_results={}, # Lyrics already written during transcription phase
799
800
  audio_filepath=audio_path,
800
801
  output_prefix=output_prefix,
801
802
  )
802
803
 
803
804
  # Copy video to expected location in parent directory
804
- if outputs and outputs.get("video_filepath"):
805
- source_video = outputs["video_filepath"]
805
+ if outputs and outputs.video:
806
+ source_video = outputs.video
806
807
  dest_video = os.path.join(track_output_dir, f"{artist_title} (With Vocals).mkv")
807
808
  shutil.copy2(source_video, dest_video)
808
809
  self.logger.info(f"Video rendered successfully: {dest_video}")
809
810
  processed_track["with_vocals_video"] = dest_video
810
811
 
811
812
  # Update ASS filepath for video background processing
812
- if outputs.get("ass_filepath"):
813
- processed_track["ass_filepath"] = outputs["ass_filepath"]
813
+ if outputs.ass:
814
+ processed_track["ass_filepath"] = outputs.ass
814
815
  else:
815
816
  self.logger.warning("Video rendering did not produce expected output")
816
817
  else:
@@ -170,15 +170,15 @@ class LyricsProcessor:
170
170
  def _check_transcription_providers(self) -> dict:
171
171
  """
172
172
  Check which transcription providers are configured and return their status.
173
-
173
+
174
174
  Returns:
175
175
  dict with 'configured' (list of provider names) and 'missing' (list of missing configs)
176
176
  """
177
177
  load_dotenv()
178
-
178
+
179
179
  configured = []
180
180
  missing = []
181
-
181
+
182
182
  # Check AudioShake
183
183
  audioshake_token = os.getenv("AUDIOSHAKE_API_TOKEN")
184
184
  if audioshake_token:
@@ -187,7 +187,7 @@ class LyricsProcessor:
187
187
  else:
188
188
  missing.append("AudioShake (AUDIOSHAKE_API_TOKEN)")
189
189
  self.logger.debug("AudioShake transcription provider: not configured (missing AUDIOSHAKE_API_TOKEN)")
190
-
190
+
191
191
  # Check Whisper via RunPod
192
192
  runpod_key = os.getenv("RUNPOD_API_KEY")
193
193
  whisper_id = os.getenv("WHISPER_RUNPOD_ID")
@@ -203,7 +203,16 @@ class LyricsProcessor:
203
203
  else:
204
204
  missing.append("Whisper (RUNPOD_API_KEY + WHISPER_RUNPOD_ID)")
205
205
  self.logger.debug("Whisper transcription provider: not configured")
206
-
206
+
207
+ # Check Local Whisper (whisper-timestamped)
208
+ try:
209
+ import whisper_timestamped
210
+ configured.append("Local Whisper")
211
+ self.logger.debug("Local Whisper transcription provider: configured (whisper-timestamped installed)")
212
+ except ImportError:
213
+ missing.append("Local Whisper (pip install karaoke-gen[local-whisper])")
214
+ self.logger.debug("Local Whisper transcription provider: not configured (whisper-timestamped not installed)")
215
+
207
216
  return {"configured": configured, "missing": missing}
208
217
 
209
218
  def _build_transcription_provider_error_message(self, missing_providers: list) -> str:
@@ -221,12 +230,18 @@ class LyricsProcessor:
221
230
  " - Set environment variable: AUDIOSHAKE_API_TOKEN=your_token\n"
222
231
  " - Get an API key at: https://www.audioshake.ai/\n"
223
232
  "\n"
224
- "2. Whisper via RunPod (Open-source alternative)\n"
233
+ "2. Whisper via RunPod (Cloud-based open-source)\n"
225
234
  " - Set environment variables:\n"
226
235
  " RUNPOD_API_KEY=your_key\n"
227
236
  " WHISPER_RUNPOD_ID=your_endpoint_id\n"
228
237
  " - Set up a Whisper endpoint at: https://www.runpod.io/\n"
229
238
  "\n"
239
+ "3. Local Whisper (No cloud required - runs on your machine)\n"
240
+ " - Install with: pip install karaoke-gen[local-whisper]\n"
241
+ " - For CPU-only: pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu\n"
242
+ " pip install karaoke-gen[local-whisper]\n"
243
+ " - Requires 2-10GB RAM depending on model size\n"
244
+ "\n"
230
245
  "ALTERNATIVES:\n"
231
246
  "\n"
232
247
  "- Use --skip-lyrics flag to generate instrumental-only karaoke (no synchronized lyrics)\n"
@@ -348,6 +363,10 @@ class LyricsProcessor:
348
363
  # Create config objects for LyricsTranscriber
349
364
  transcriber_config = TranscriberConfig(
350
365
  audioshake_api_token=env_config.get("audioshake_api_token"),
366
+ runpod_api_key=env_config.get("runpod_api_key"),
367
+ whisper_runpod_id=env_config.get("whisper_runpod_id"),
368
+ # Local Whisper is enabled by default as a fallback when no cloud providers are configured
369
+ enable_local_whisper=True,
351
370
  )
352
371
 
353
372
  lyrics_config = LyricsConfig(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: karaoke-gen
3
- Version: 0.76.20
3
+ Version: 0.82.0
4
4
  Summary: Generate karaoke videos with synchronized lyrics. Handles the entire process from downloading audio and lyrics to creating the final video with title screens.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -13,12 +13,14 @@ Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: Programming Language :: Python :: 3.11
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: 3.13
16
+ Provides-Extra: local-whisper
16
17
  Requires-Dist: argparse (>=1.4.0)
17
18
  Requires-Dist: attrs (>=24.2.0)
18
19
  Requires-Dist: audio-separator[cpu] (>=0.34.0)
19
20
  Requires-Dist: beautifulsoup4 (>=4)
20
21
  Requires-Dist: cattrs (>=24.1.2)
21
22
  Requires-Dist: dropbox (>=12)
23
+ Requires-Dist: email-validator (>=2.0.0)
22
24
  Requires-Dist: fastapi (>=0.104.0)
23
25
  Requires-Dist: fetch-lyrics-from-genius (>=0.1)
24
26
  Requires-Dist: ffmpeg-python (>=0.2.0,<0.3.0)
@@ -40,6 +42,7 @@ Requires-Dist: kbputils (>=0.0.16,<0.0.17)
40
42
  Requires-Dist: langchain (>=0.3.0)
41
43
  Requires-Dist: langchain-anthropic (>=0.2.0)
42
44
  Requires-Dist: langchain-core (>=0.3.0)
45
+ Requires-Dist: langchain-google-vertexai (>=3.1.1)
43
46
  Requires-Dist: langchain-ollama (>=0.2.0)
44
47
  Requires-Dist: langchain-openai (>=0.2.0)
45
48
  Requires-Dist: langfuse (>=3.0.0)
@@ -74,10 +77,12 @@ Requires-Dist: python-levenshtein (>=0.26)
74
77
  Requires-Dist: python-multipart (>=0.0.20,<0.0.21)
75
78
  Requires-Dist: python-slugify (>=8)
76
79
  Requires-Dist: requests (>=2)
80
+ Requires-Dist: sendgrid (>=6.10.0)
77
81
  Requires-Dist: shortuuid (>=1.0.13)
78
82
  Requires-Dist: spacy (>=3.8.7)
79
83
  Requires-Dist: spacy-syllables (>=3)
80
84
  Requires-Dist: srsly (>=2.5.1)
85
+ Requires-Dist: stripe (>=7.0.0)
81
86
  Requires-Dist: syllables (>=1)
82
87
  Requires-Dist: syrics (>=0)
83
88
  Requires-Dist: thefuzz (>=0.22)
@@ -86,6 +91,7 @@ Requires-Dist: torch (>=2.7)
86
91
  Requires-Dist: tqdm (>=4.67)
87
92
  Requires-Dist: transformers (>=4.47)
88
93
  Requires-Dist: uvicorn[standard] (>=0.24.0)
94
+ Requires-Dist: whisper-timestamped (>=1.15.0) ; extra == "local-whisper"
89
95
  Requires-Dist: yt-dlp (>=2024.0.0)
90
96
  Project-URL: Documentation, https://github.com/nomadkaraoke/karaoke-gen/blob/main/README.md
91
97
  Project-URL: Homepage, https://github.com/nomadkaraoke/karaoke-gen
@@ -165,8 +171,40 @@ export AUDIOSHAKE_API_TOKEN="your_audioshake_token"
165
171
 
166
172
  Get an API key at [https://www.audioshake.ai/](https://www.audioshake.ai/) (available to business customers only at the time of writing).
167
173
 
168
- #### Option 2: Whisper via RunPod
169
- Open-source alternative using OpenAI's Whisper model on RunPod infrastructure.
174
+ #### Option 2: Local Whisper (No Cloud Required)
175
+ Run Whisper directly on your local machine using whisper-timestamped. Works on CPU, NVIDIA GPU (CUDA), or Apple Silicon.
176
+
177
+ ```bash
178
+ # Install with local Whisper support
179
+ pip install "karaoke-gen[local-whisper]"
180
+
181
+ # Optional: Configure model size (tiny, base, small, medium, large)
182
+ export WHISPER_MODEL_SIZE="medium"
183
+
184
+ # Optional: Force specific device (cpu, cuda, mps)
185
+ export WHISPER_DEVICE="cpu"
186
+ ```
187
+
188
+ **Model Size Guide:**
189
+ | Model | VRAM | Speed | Quality |
190
+ |-------|------|-------|---------|
191
+ | tiny | ~1GB | Fast | Lower |
192
+ | base | ~1GB | Fast | Basic |
193
+ | small | ~2GB | Medium | Good |
194
+ | medium | ~5GB | Slower | Better |
195
+ | large | ~10GB | Slowest | Best |
196
+
197
+ **CPU-Only Installation** (no GPU required):
198
+ ```bash
199
+ # Pre-install CPU-only PyTorch first
200
+ pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
201
+ pip install "karaoke-gen[local-whisper]"
202
+ ```
203
+
204
+ Local Whisper runs automatically as a fallback when no cloud transcription services are configured.
205
+
206
+ #### Option 3: Whisper via RunPod
207
+ Cloud-based alternative using OpenAI's Whisper model on RunPod infrastructure.
170
208
 
171
209
  ```bash
172
210
  export RUNPOD_API_KEY="your_runpod_key"
@@ -668,6 +706,44 @@ If the output video has quality problems:
668
706
  - Check available codecs: `ffmpeg -codecs`
669
707
  - For 4K output, ensure sufficient disk space (10GB+ per track)
670
708
 
709
+ ### Local Whisper Issues
710
+
711
+ #### GPU Out of Memory
712
+ If you get CUDA out of memory errors:
713
+ ```bash
714
+ # Use a smaller model
715
+ export WHISPER_MODEL_SIZE="small" # or "tiny"
716
+
717
+ # Or force CPU mode
718
+ export WHISPER_DEVICE="cpu"
719
+ ```
720
+
721
+ #### Slow Transcription on CPU
722
+ CPU transcription is significantly slower than GPU. For faster processing:
723
+ - Use a smaller model (`tiny` or `base`)
724
+ - Consider using cloud transcription (AudioShake or RunPod)
725
+ - On Apple Silicon, the `small` model offers a good speed/quality balance
726
+
727
+ #### Model Download Issues
728
+ Whisper models are downloaded on first use (~1-3GB depending on size). If downloads fail:
729
+ - Check your internet connection
730
+ - Set a custom cache directory: `export WHISPER_CACHE_DIR="/path/with/space"`
731
+ - Models are cached in `~/.cache/whisper/` by default
732
+
733
+ #### whisper-timestamped Not Found
734
+ If you get "whisper-timestamped is not installed":
735
+ ```bash
736
+ pip install "karaoke-gen[local-whisper]"
737
+ # Or install directly:
738
+ pip install whisper-timestamped
739
+ ```
740
+
741
+ #### Disabling Local Whisper
742
+ If you want to disable local Whisper (e.g., to force cloud transcription):
743
+ ```bash
744
+ export ENABLE_LOCAL_WHISPER="false"
745
+ ```
746
+
671
747
  ---
672
748
 
673
749
  ## 🧪 Development