karaoke-gen 0.75.16__py3-none-any.whl → 0.76.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. karaoke_gen/audio_fetcher.py +984 -33
  2. karaoke_gen/audio_processor.py +4 -0
  3. karaoke_gen/instrumental_review/static/index.html +37 -14
  4. karaoke_gen/karaoke_finalise/karaoke_finalise.py +25 -1
  5. karaoke_gen/karaoke_gen.py +208 -39
  6. karaoke_gen/lyrics_processor.py +111 -31
  7. karaoke_gen/utils/__init__.py +26 -0
  8. karaoke_gen/utils/cli_args.py +15 -6
  9. karaoke_gen/utils/gen_cli.py +30 -5
  10. karaoke_gen/utils/remote_cli.py +301 -20
  11. {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/METADATA +107 -5
  12. {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/RECORD +47 -43
  13. lyrics_transcriber/core/controller.py +76 -2
  14. lyrics_transcriber/frontend/index.html +5 -1
  15. lyrics_transcriber/frontend/package-lock.json +4553 -0
  16. lyrics_transcriber/frontend/package.json +4 -1
  17. lyrics_transcriber/frontend/playwright.config.ts +69 -0
  18. lyrics_transcriber/frontend/public/nomad-karaoke-logo.svg +5 -0
  19. lyrics_transcriber/frontend/src/App.tsx +94 -63
  20. lyrics_transcriber/frontend/src/api.ts +25 -10
  21. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +55 -21
  22. lyrics_transcriber/frontend/src/components/AppHeader.tsx +65 -0
  23. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +5 -5
  24. lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +9 -9
  25. lyrics_transcriber/frontend/src/components/EditModal.tsx +1 -1
  26. lyrics_transcriber/frontend/src/components/EditWordList.tsx +1 -1
  27. lyrics_transcriber/frontend/src/components/Header.tsx +34 -48
  28. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +22 -21
  29. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +1 -1
  30. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +1 -1
  31. lyrics_transcriber/frontend/src/components/WordDivider.tsx +3 -3
  32. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +2 -2
  33. lyrics_transcriber/frontend/src/components/shared/constants.ts +15 -5
  34. lyrics_transcriber/frontend/src/main.tsx +1 -7
  35. lyrics_transcriber/frontend/src/theme.ts +337 -135
  36. lyrics_transcriber/frontend/vite.config.ts +5 -0
  37. lyrics_transcriber/frontend/web_assets/assets/{index-COYImAcx.js → index-BECn1o8Q.js} +38 -22
  38. lyrics_transcriber/frontend/web_assets/assets/{index-COYImAcx.js.map → index-BECn1o8Q.js.map} +1 -1
  39. lyrics_transcriber/frontend/web_assets/index.html +1 -1
  40. lyrics_transcriber/frontend/yarn.lock +1005 -1046
  41. lyrics_transcriber/output/countdown_processor.py +39 -0
  42. lyrics_transcriber/review/server.py +1 -1
  43. lyrics_transcriber/transcribers/audioshake.py +96 -7
  44. lyrics_transcriber/types.py +14 -12
  45. {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/WHEEL +0 -0
  46. {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/entry_points.txt +0 -0
  47. {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/licenses/LICENSE +0 -0
@@ -27,10 +27,10 @@ class LyricsProcessor:
27
27
 
28
28
  def _detect_countdown_padding_from_lrc(self, lrc_filepath):
29
29
  """
30
- Detect if countdown padding was applied by checking the first lyric timestamp in the LRC file.
30
+ Detect if countdown padding was applied by checking for countdown text in the LRC file.
31
31
 
32
- LRC format timestamps look like: [mm:ss.xx] or [mm:ss.xxx]
33
- If the first lyric timestamp is >= 3.0 seconds, countdown padding was likely applied.
32
+ The countdown segment has the text "3... 2... 1..." at timestamp 0.1-2.9s.
33
+ We detect this by looking for the countdown text pattern.
34
34
 
35
35
  Args:
36
36
  lrc_filepath: Path to the LRC file
@@ -42,7 +42,15 @@ class LyricsProcessor:
42
42
  with open(lrc_filepath, 'r', encoding='utf-8') as f:
43
43
  content = f.read()
44
44
 
45
- # Find all timestamp patterns in the LRC file
45
+ # Method 1: Check for countdown text pattern "3... 2... 1..."
46
+ # This is the most reliable detection method since the countdown text is unique
47
+ countdown_text = "3... 2... 1..."
48
+ if countdown_text in content:
49
+ self.logger.info(f"Detected countdown padding from LRC: found countdown text '{countdown_text}'")
50
+ return (True, self.COUNTDOWN_PADDING_SECONDS)
51
+
52
+ # Method 2 (fallback): Check if first lyric timestamp is >= 3 seconds
53
+ # This handles cases where countdown text format might differ
46
54
  # LRC timestamps: [mm:ss.xx] or [mm:ss.xxx]
47
55
  timestamp_pattern = r'\[(\d{1,2}):(\d{2})\.(\d{2,3})\]'
48
56
  matches = re.findall(timestamp_pattern, content)
@@ -51,8 +59,7 @@ class LyricsProcessor:
51
59
  self.logger.debug("No timestamps found in LRC file")
52
60
  return (False, 0.0)
53
61
 
54
- # Find the first non-metadata timestamp (metadata like [ar:Artist] doesn't have decimal)
55
- # We already filtered for decimal timestamps in our pattern
62
+ # Parse the first timestamp
56
63
  first_timestamp = matches[0]
57
64
  minutes = int(first_timestamp[0])
58
65
  seconds = int(first_timestamp[1])
@@ -160,6 +167,76 @@ class LyricsProcessor:
160
167
 
161
168
  return processed_lines
162
169
 
170
+ def _check_transcription_providers(self) -> dict:
171
+ """
172
+ Check which transcription providers are configured and return their status.
173
+
174
+ Returns:
175
+ dict with 'configured' (list of provider names) and 'missing' (list of missing configs)
176
+ """
177
+ load_dotenv()
178
+
179
+ configured = []
180
+ missing = []
181
+
182
+ # Check AudioShake
183
+ audioshake_token = os.getenv("AUDIOSHAKE_API_TOKEN")
184
+ if audioshake_token:
185
+ configured.append("AudioShake")
186
+ self.logger.debug("AudioShake transcription provider: configured")
187
+ else:
188
+ missing.append("AudioShake (AUDIOSHAKE_API_TOKEN)")
189
+ self.logger.debug("AudioShake transcription provider: not configured (missing AUDIOSHAKE_API_TOKEN)")
190
+
191
+ # Check Whisper via RunPod
192
+ runpod_key = os.getenv("RUNPOD_API_KEY")
193
+ whisper_id = os.getenv("WHISPER_RUNPOD_ID")
194
+ if runpod_key and whisper_id:
195
+ configured.append("Whisper (RunPod)")
196
+ self.logger.debug("Whisper transcription provider: configured")
197
+ elif runpod_key:
198
+ missing.append("Whisper (missing WHISPER_RUNPOD_ID)")
199
+ self.logger.debug("Whisper transcription provider: partially configured (missing WHISPER_RUNPOD_ID)")
200
+ elif whisper_id:
201
+ missing.append("Whisper (missing RUNPOD_API_KEY)")
202
+ self.logger.debug("Whisper transcription provider: partially configured (missing RUNPOD_API_KEY)")
203
+ else:
204
+ missing.append("Whisper (RUNPOD_API_KEY + WHISPER_RUNPOD_ID)")
205
+ self.logger.debug("Whisper transcription provider: not configured")
206
+
207
+ return {"configured": configured, "missing": missing}
208
+
209
+ def _build_transcription_provider_error_message(self, missing_providers: list) -> str:
210
+ """Build a helpful error message when no transcription providers are configured."""
211
+ return (
212
+ "No transcription providers configured!\n"
213
+ "\n"
214
+ "Karaoke video generation requires at least one transcription provider to create "
215
+ "synchronized lyrics. Without a transcription provider, the system cannot generate "
216
+ "the word-level timing data needed for the karaoke video.\n"
217
+ "\n"
218
+ "AVAILABLE TRANSCRIPTION PROVIDERS:\n"
219
+ "\n"
220
+ "1. AudioShake (Recommended - Commercial, high-quality)\n"
221
+ " - Set environment variable: AUDIOSHAKE_API_TOKEN=your_token\n"
222
+ " - Get an API key at: https://www.audioshake.ai/\n"
223
+ "\n"
224
+ "2. Whisper via RunPod (Open-source alternative)\n"
225
+ " - Set environment variables:\n"
226
+ " RUNPOD_API_KEY=your_key\n"
227
+ " WHISPER_RUNPOD_ID=your_endpoint_id\n"
228
+ " - Set up a Whisper endpoint at: https://www.runpod.io/\n"
229
+ "\n"
230
+ "ALTERNATIVES:\n"
231
+ "\n"
232
+ "- Use --skip-lyrics flag to generate instrumental-only karaoke (no synchronized lyrics)\n"
233
+ "- Use --lyrics_file to provide pre-timed lyrics (still needs transcription for timing)\n"
234
+ "\n"
235
+ f"Missing provider configurations: {', '.join(missing_providers)}\n"
236
+ "\n"
237
+ "See README.md 'Transcription Providers' section for detailed setup instructions."
238
+ )
239
+
163
240
  def transcribe_lyrics(self, input_audio_wav, artist, title, track_output_dir, lyrics_artist=None, lyrics_title=None):
164
241
  """
165
242
  Transcribe lyrics for a track.
@@ -171,6 +248,9 @@ class LyricsProcessor:
171
248
  track_output_dir: Output directory path
172
249
  lyrics_artist: Artist name for lyrics processing (defaults to artist if None)
173
250
  lyrics_title: Title for lyrics processing (defaults to title if None)
251
+
252
+ Raises:
253
+ ValueError: If transcription is enabled but no providers are configured
174
254
  """
175
255
  # Use original artist/title for filename generation
176
256
  filename_artist = artist
@@ -234,6 +314,17 @@ class LyricsProcessor:
234
314
  "padded_audio_filepath": None, # Original padded audio may not exist
235
315
  }
236
316
 
317
+ # Check transcription provider configuration if transcription is not being skipped
318
+ # Do this AFTER checking for existing files, since existing files don't need transcription
319
+ if not self.skip_transcription:
320
+ provider_status = self._check_transcription_providers()
321
+
322
+ if provider_status["configured"]:
323
+ self.logger.info(f"Transcription providers configured: {', '.join(provider_status['configured'])}")
324
+ else:
325
+ error_msg = self._build_transcription_provider_error_message(provider_status["missing"])
326
+ raise ValueError(error_msg)
327
+
237
328
  # Create lyrics directory if it doesn't exist
238
329
  os.makedirs(lyrics_dir, exist_ok=True)
239
330
  self.logger.info(f"Created lyrics directory: {lyrics_dir}")
@@ -273,41 +364,30 @@ class LyricsProcessor:
273
364
  self.logger.info(f" rapidapi_key: {env_config.get('rapidapi_key')[:3] + '...' if env_config.get('rapidapi_key') else 'None'}")
274
365
  self.logger.info(f" lyrics_file: {self.lyrics_file}")
275
366
 
276
- # Detect if we're running in a serverless environment (Modal)
277
- # Modal sets specific environment variables we can check for
278
- is_serverless = (
279
- os.getenv("MODAL_TASK_ID") is not None or
280
- os.getenv("MODAL_FUNCTION_NAME") is not None or
281
- os.path.exists("/.modal") # Modal creates this directory in containers
282
- )
283
-
284
- # In serverless environment, disable interactive review even if skip_transcription_review=False
285
- # This preserves CLI behavior while fixing serverless hanging
286
- enable_review_setting = not self.skip_transcription_review and not is_serverless
287
-
288
- if is_serverless and not self.skip_transcription_review:
289
- self.logger.info("Detected serverless environment - disabling interactive review to prevent hanging")
290
-
291
- # In serverless environment, disable video generation during Phase 1 to save compute
292
- # Video will be generated in Phase 2 after human review
293
- serverless_render_video = render_video and not is_serverless
294
-
295
- if is_serverless and render_video:
296
- self.logger.info("Detected serverless environment - deferring video generation until after review")
297
-
367
+ # Always defer countdown and video rendering to a later phase.
368
+ # This ensures the review UI (both local and cloud) shows original timing
369
+ # without the 3-second countdown shift. The caller is responsible for:
370
+ # - Local CLI: karaoke_gen.py adds countdown and renders video after transcription
371
+ # - Cloud backend: render_video_worker.py adds countdown and renders video
372
+ #
373
+ # This design ensures consistent behavior regardless of environment,
374
+ # and the review UI always shows accurate, unshifted timestamps.
375
+ self.logger.info("Deferring countdown and video rendering to post-review phase")
376
+
298
377
  output_config = OutputConfig(
299
378
  output_styles_json=self.style_params_json,
300
379
  output_dir=lyrics_dir,
301
- render_video=serverless_render_video, # Disable video in serverless Phase 1
380
+ render_video=False, # Always defer - caller handles video rendering after countdown
302
381
  fetch_lyrics=True,
303
382
  run_transcription=not self.skip_transcription,
304
383
  run_correction=True,
305
384
  generate_plain_text=True,
306
385
  generate_lrc=True,
307
- generate_cdg=False, # Also defer CDG generation to Phase 2
386
+ generate_cdg=False, # CDG generation disabled (not currently supported)
308
387
  video_resolution="4k",
309
- enable_review=enable_review_setting,
388
+ enable_review=not self.skip_transcription_review, # Honor the caller's setting
310
389
  subtitle_offset_ms=self.subtitle_offset_ms,
390
+ add_countdown=False, # Always defer - caller handles countdown after review
311
391
  )
312
392
 
313
393
  # Add this log entry to debug the OutputConfig
@@ -1,9 +1,35 @@
1
1
  import re
2
2
 
3
+ # Unicode character replacements for ASCII-safe filenames
4
+ # These characters cause issues with HTTP headers (latin-1 encoding) and some filesystems
5
+ UNICODE_REPLACEMENTS = {
6
+ # Curly/smart quotes -> straight quotes
7
+ "\u2018": "'", # LEFT SINGLE QUOTATION MARK
8
+ "\u2019": "'", # RIGHT SINGLE QUOTATION MARK (the one causing the bug)
9
+ "\u201A": "'", # SINGLE LOW-9 QUOTATION MARK
10
+ "\u201B": "'", # SINGLE HIGH-REVERSED-9 QUOTATION MARK
11
+ "\u201C": '"', # LEFT DOUBLE QUOTATION MARK
12
+ "\u201D": '"', # RIGHT DOUBLE QUOTATION MARK
13
+ "\u201E": '"', # DOUBLE LOW-9 QUOTATION MARK
14
+ "\u201F": '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
15
+ # Other common problematic characters
16
+ "\u2013": "-", # EN DASH
17
+ "\u2014": "-", # EM DASH
18
+ "\u2026": "...", # HORIZONTAL ELLIPSIS
19
+ "\u00A0": " ", # NON-BREAKING SPACE
20
+ }
21
+
22
+
3
23
  def sanitize_filename(filename):
4
24
  """Replace or remove characters that are unsafe for filenames."""
5
25
  if filename is None:
6
26
  return None
27
+
28
+ # First, normalize Unicode characters that cause HTTP header encoding issues
29
+ # (e.g., curly quotes from macOS/Word that can't be encoded in latin-1)
30
+ for unicode_char, ascii_replacement in UNICODE_REPLACEMENTS.items():
31
+ filename = filename.replace(unicode_char, ascii_replacement)
32
+
7
33
  # Replace problematic characters with underscores
8
34
  for char in ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]:
9
35
  filename = filename.replace(char, "_")
@@ -242,9 +242,17 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
242
242
 
243
243
  # Style Configuration
244
244
  style_group = parser.add_argument_group("Style Configuration")
245
+ style_group.add_argument(
246
+ "--theme",
247
+ help="Optional: Theme ID for pre-made styles stored in GCS (e.g., 'nomad', 'default'). "
248
+ "When using a theme, CDG/TXT are enabled by default. "
249
+ "Example: --theme=nomad",
250
+ )
245
251
  style_group.add_argument(
246
252
  "--style_params_json",
247
- help="Optional: Path to JSON file containing style configuration. Example: --style_params_json='/path/to/style_params.json'",
253
+ help="Optional: Path to JSON file containing style configuration. "
254
+ "Takes precedence over --theme if both are provided. "
255
+ "Example: --style_params_json='/path/to/style_params.json'",
248
256
  )
249
257
  style_group.add_argument(
250
258
  "--style_override",
@@ -258,8 +266,8 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
258
266
  style_group.add_argument(
259
267
  "--background_video_darkness",
260
268
  type=int,
261
- default=0,
262
- help="Optional: Darkness overlay percentage (0-100) for video background (default: %(default)s). Example: --background_video_darkness=50",
269
+ default=50,
270
+ help="Optional: Darkness overlay percentage (0-100) for video background (default: %(default)s). Example: --background_video_darkness=20",
263
271
  )
264
272
 
265
273
  # Finalisation Configuration
@@ -352,9 +360,10 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
352
360
  )
353
361
  remote_group.add_argument(
354
362
  "--review-ui-url",
355
- default=os.environ.get('REVIEW_UI_URL', os.environ.get('LYRICS_REVIEW_UI_URL', 'https://lyrics.nomadkaraoke.com')),
356
- help="Lyrics review UI URL. Default: 'https://lyrics.nomadkaraoke.com'. "
357
- "Use 'http://localhost:5173' for Vite dev server during development. "
363
+ default=os.environ.get('REVIEW_UI_URL', os.environ.get('LYRICS_REVIEW_UI_URL', 'https://gen.nomadkaraoke.com/lyrics')),
364
+ help="Lyrics review UI URL. For remote mode: defaults to 'https://gen.nomadkaraoke.com/lyrics'. "
365
+ "For local mode: defaults to bundled frontend (from lyrics_transcriber/frontend/). "
366
+ "Use 'http://localhost:5173' to develop against Vite dev server. "
358
367
  "(env: REVIEW_UI_URL or LYRICS_REVIEW_UI_URL)",
359
368
  )
360
369
  remote_group.add_argument(
@@ -313,9 +313,18 @@ async def async_main():
313
313
  args = parser.parse_args()
314
314
 
315
315
  # Set review UI URL environment variable for the lyrics transcriber review server
316
- # This allows development against a local frontend dev server (e.g., http://localhost:5173)
316
+ # Only set this if the user explicitly wants to use a dev server (e.g., http://localhost:5173)
317
+ # By default, let the ReviewServer use its bundled local frontend (served from lyrics_transcriber/frontend/)
318
+ # This enables local iteration on the frontend without redeploying
317
319
  if hasattr(args, 'review_ui_url') and args.review_ui_url:
318
- os.environ['LYRICS_REVIEW_UI_URL'] = args.review_ui_url
320
+ # Check if user provided a custom value (not the default hosted URL)
321
+ default_hosted_urls = [
322
+ 'https://gen.nomadkaraoke.com/lyrics',
323
+ 'https://gen.nomadkaraoke.com/lyrics/'
324
+ ]
325
+ if args.review_ui_url.rstrip('/') not in [url.rstrip('/') for url in default_hosted_urls]:
326
+ # User explicitly wants a specific URL (e.g., Vite dev server)
327
+ os.environ['LYRICS_REVIEW_UI_URL'] = args.review_ui_url
319
328
 
320
329
  # Process style overrides
321
330
  try:
@@ -746,7 +755,7 @@ async def async_main():
746
755
  except UserCancelledError:
747
756
  logger.info("Operation cancelled by user")
748
757
  return
749
- except KeyboardInterrupt:
758
+ except (KeyboardInterrupt, asyncio.CancelledError):
750
759
  logger.info("Operation cancelled by user (Ctrl+C)")
751
760
  return
752
761
 
@@ -775,12 +784,28 @@ async def async_main():
775
784
  logger.info(f"Changing to directory: {track_dir}")
776
785
  os.chdir(track_dir)
777
786
 
778
- # Select instrumental file - either via web UI or auto-selection
787
+ # Select instrumental file - either via web UI, auto-selection, or custom instrumental
779
788
  # This ALWAYS produces a selected file - no silent fallback to legacy code
780
789
  selected_instrumental_file = None
781
790
  skip_review = getattr(args, 'skip_instrumental_review', False)
782
791
 
783
- if skip_review:
792
+ # Check if a custom instrumental was provided (via --existing_instrumental)
793
+ # In this case, the instrumental is already chosen - skip review entirely
794
+ separated_audio = track.get("separated_audio", {})
795
+ custom_instrumental = separated_audio.get("Custom", {}).get("instrumental")
796
+
797
+ if custom_instrumental:
798
+ # Custom instrumental was provided - use it directly, no review needed
799
+ resolved_path = _resolve_path_for_cwd(custom_instrumental, track_dir)
800
+ if os.path.exists(resolved_path):
801
+ logger.info(f"Using custom instrumental (--existing_instrumental): {resolved_path}")
802
+ selected_instrumental_file = resolved_path
803
+ else:
804
+ logger.error(f"Custom instrumental file not found: {resolved_path}")
805
+ logger.error("The file may have been moved or deleted after preparation.")
806
+ sys.exit(1)
807
+ return # Explicit return for testing
808
+ elif skip_review:
784
809
  # Auto-select instrumental when review is skipped (non-interactive mode)
785
810
  logger.info("Instrumental review skipped (--skip_instrumental_review), auto-selecting instrumental file...")
786
811
  try: