karaoke-gen 0.71.42__py3-none-any.whl → 0.75.53__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. karaoke_gen/__init__.py +32 -1
  2. karaoke_gen/audio_fetcher.py +1220 -67
  3. karaoke_gen/audio_processor.py +15 -3
  4. karaoke_gen/instrumental_review/server.py +154 -860
  5. karaoke_gen/instrumental_review/static/index.html +1529 -0
  6. karaoke_gen/karaoke_finalise/karaoke_finalise.py +87 -2
  7. karaoke_gen/karaoke_gen.py +131 -14
  8. karaoke_gen/lyrics_processor.py +172 -4
  9. karaoke_gen/utils/bulk_cli.py +3 -0
  10. karaoke_gen/utils/cli_args.py +7 -4
  11. karaoke_gen/utils/gen_cli.py +221 -5
  12. karaoke_gen/utils/remote_cli.py +786 -43
  13. {karaoke_gen-0.71.42.dist-info → karaoke_gen-0.75.53.dist-info}/METADATA +109 -4
  14. {karaoke_gen-0.71.42.dist-info → karaoke_gen-0.75.53.dist-info}/RECORD +37 -31
  15. lyrics_transcriber/core/controller.py +76 -2
  16. lyrics_transcriber/frontend/package.json +1 -1
  17. lyrics_transcriber/frontend/src/App.tsx +6 -4
  18. lyrics_transcriber/frontend/src/api.ts +25 -10
  19. lyrics_transcriber/frontend/src/components/Header.tsx +38 -12
  20. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +17 -3
  21. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
  22. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
  23. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
  24. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
  25. lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
  26. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +190 -542
  27. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  28. lyrics_transcriber/frontend/web_assets/assets/{index-DdJTDWH3.js → index-BECn1o8Q.js} +1802 -553
  29. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +1 -0
  30. lyrics_transcriber/frontend/web_assets/index.html +1 -1
  31. lyrics_transcriber/output/countdown_processor.py +39 -0
  32. lyrics_transcriber/review/server.py +5 -5
  33. lyrics_transcriber/transcribers/audioshake.py +96 -7
  34. lyrics_transcriber/types.py +14 -12
  35. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +0 -1
  36. {karaoke_gen-0.71.42.dist-info → karaoke_gen-0.75.53.dist-info}/WHEEL +0 -0
  37. {karaoke_gen-0.71.42.dist-info → karaoke_gen-0.75.53.dist-info}/entry_points.txt +0 -0
  38. {karaoke_gen-0.71.42.dist-info → karaoke_gen-0.75.53.dist-info}/licenses/LICENSE +0 -0
@@ -10,7 +10,7 @@
10
10
  <link rel="icon" type="image/png" sizes="512x512" href="/android-chrome-512x512.png" />
11
11
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
12
12
  <title>Nomad Karaoke: Lyrics Review</title>
13
- <script type="module" crossorigin src="/assets/index-DdJTDWH3.js"></script>
13
+ <script type="module" crossorigin src="/assets/index-BECn1o8Q.js"></script>
14
14
  </head>
15
15
  <body>
16
16
  <div id="root"></div>
@@ -265,3 +265,42 @@ class CountdownProcessor:
265
265
 
266
266
  return countdown_segment
267
267
 
268
+ def has_countdown(self, correction_result: CorrectionResult) -> bool:
269
+ """
270
+ Check if a CorrectionResult already has a countdown segment.
271
+
272
+ This is used to detect if countdown padding was applied to corrections
273
+ that were loaded from a saved JSON file (where the padding state is not
274
+ explicitly stored).
275
+
276
+ Args:
277
+ correction_result: The correction result to check
278
+
279
+ Returns:
280
+ True if the first segment is a countdown, False otherwise
281
+ """
282
+ if not correction_result.corrected_segments:
283
+ return False
284
+
285
+ first_segment = correction_result.corrected_segments[0]
286
+ return first_segment.text == self.COUNTDOWN_TEXT
287
+
288
+ def create_padded_audio_only(self, audio_filepath: str) -> str:
289
+ """
290
+ Create a padded audio file without modifying the correction result.
291
+
292
+ This is used when loading existing corrections that already have countdown
293
+ timestamps, but we need to create the padded audio file for video rendering.
294
+
295
+ Args:
296
+ audio_filepath: Path to original audio file
297
+
298
+ Returns:
299
+ Path to padded audio file
300
+
301
+ Raises:
302
+ FileNotFoundError: If input audio file doesn't exist
303
+ RuntimeError: If ffmpeg command fails
304
+ """
305
+ return self._create_padded_audio(audio_filepath)
306
+
@@ -629,14 +629,14 @@ class ReviewServer:
629
629
  else ""
630
630
  )
631
631
 
632
- # Use hosted review UI by default, can be overridden with LYRICS_REVIEW_UI_URL env var
633
- # Set to "local" to use the bundled local frontend instead
634
- review_ui_url = os.environ.get("LYRICS_REVIEW_UI_URL", "https://lyrics.nomadkaraoke.com")
632
+ # Use bundled local frontend by default for local karaoke-gen runs
633
+ # Can override with LYRICS_REVIEW_UI_URL env var (e.g., http://localhost:5173 for Vite dev)
634
+ review_ui_url = os.environ.get("LYRICS_REVIEW_UI_URL", "local")
635
635
  if review_ui_url.lower() == "local":
636
- # Use the bundled local frontend
636
+ # Use the bundled local frontend served by this FastAPI server
637
637
  browser_url = f"http://localhost:8000?baseApiUrl={encoded_api_url}{audio_hash_param}"
638
638
  else:
639
- # Use the hosted/external review UI
639
+ # Use an external review UI (dev server or hosted)
640
640
  browser_url = f"{review_ui_url}?baseApiUrl={encoded_api_url}{audio_hash_param}"
641
641
 
642
642
  self.logger.info(f"Opening review UI: {browser_url}")
@@ -2,12 +2,92 @@ from dataclasses import dataclass
2
2
  import requests
3
3
  import time
4
4
  import os
5
- from typing import Dict, Optional, Any, Union
5
+ import tempfile
6
+ from typing import Dict, Optional, Any, Union, Tuple
6
7
  from pathlib import Path
8
+ from pydub import AudioSegment
7
9
  from lyrics_transcriber.types import TranscriptionData, LyricsSegment, Word
8
10
  from lyrics_transcriber.transcribers.base_transcriber import BaseTranscriber, TranscriptionError
9
11
  from lyrics_transcriber.utils.word_utils import WordUtils
10
12
 
13
+ # Lossy formats that should be uploaded directly (transcoding would cause quality loss)
14
+ LOSSY_FORMATS = {'.mp3', '.aac', '.ogg', '.m4a', '.wma', '.opus'}
15
+ # Lossless formats that are already compressed and can be uploaded directly
16
+ LOSSLESS_COMPRESSED_FORMATS = {'.flac', '.alac'}
17
+ # Uncompressed formats that should be converted to FLAC for efficient upload
18
+ UNCOMPRESSED_FORMATS = {'.wav', '.aiff', '.aif', '.pcm'}
19
+
20
+
21
+ class AudioUploadOptimizer:
22
+ """Optimizes audio files for upload by converting uncompressed formats to FLAC."""
23
+
24
+ def __init__(self, logger):
25
+ self.logger = logger
26
+
27
+ def prepare_for_upload(self, filepath: str) -> Tuple[str, Optional[str]]:
28
+ """
29
+ Prepare audio file for optimal upload.
30
+
31
+ Returns:
32
+ Tuple of (filepath_to_upload, temp_file_to_cleanup)
33
+ - If no conversion needed, returns (original_filepath, None)
34
+ - If converted, returns (temp_flac_filepath, temp_flac_filepath)
35
+ """
36
+ ext = os.path.splitext(filepath)[1].lower()
37
+
38
+ # Lossy formats: upload directly (transcoding would lose quality)
39
+ if ext in LOSSY_FORMATS:
40
+ self.logger.info(f"Uploading lossy format ({ext}) directly to preserve quality")
41
+ return filepath, None
42
+
43
+ # Already compressed lossless: upload directly
44
+ if ext in LOSSLESS_COMPRESSED_FORMATS:
45
+ self.logger.info(f"Uploading lossless compressed format ({ext}) directly")
46
+ return filepath, None
47
+
48
+ # Uncompressed formats: convert to FLAC for smaller upload
49
+ if ext in UNCOMPRESSED_FORMATS:
50
+ self.logger.info(f"Converting uncompressed format ({ext}) to FLAC for efficient upload")
51
+ return self._convert_to_flac(filepath)
52
+
53
+ # Unknown format: try to upload directly
54
+ self.logger.warning(f"Unknown audio format ({ext}), uploading directly")
55
+ return filepath, None
56
+
57
+ def _convert_to_flac(self, filepath: str) -> Tuple[str, str]:
58
+ """Convert audio file to FLAC format."""
59
+ ext = os.path.splitext(filepath)[1].lower()
60
+
61
+ # Load audio based on format
62
+ if ext == '.wav':
63
+ audio = AudioSegment.from_wav(filepath)
64
+ elif ext in {'.aiff', '.aif'}:
65
+ audio = AudioSegment.from_file(filepath, format='aiff')
66
+ else:
67
+ audio = AudioSegment.from_file(filepath)
68
+
69
+ # Create temp file for FLAC output
70
+ with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as temp_flac:
71
+ flac_path = temp_flac.name
72
+ audio.export(flac_path, format="flac")
73
+
74
+ # Log size reduction
75
+ original_size = os.path.getsize(filepath)
76
+ flac_size = os.path.getsize(flac_path)
77
+ reduction_pct = (1 - flac_size / original_size) * 100
78
+ self.logger.info(f"Converted to FLAC: {original_size / 1024 / 1024:.1f}MB → {flac_size / 1024 / 1024:.1f}MB ({reduction_pct:.0f}% smaller)")
79
+
80
+ return flac_path, flac_path
81
+
82
+ def cleanup(self, temp_filepath: Optional[str]) -> None:
83
+ """Clean up temporary file if it exists."""
84
+ if temp_filepath and os.path.exists(temp_filepath):
85
+ try:
86
+ os.unlink(temp_filepath)
87
+ self.logger.debug(f"Cleaned up temporary file: {temp_filepath}")
88
+ except OSError as e:
89
+ self.logger.warning(f"Failed to clean up temporary file {temp_filepath}: {e}")
90
+
11
91
 
12
92
  @dataclass
13
93
  class AudioShakeConfig:
@@ -162,11 +242,13 @@ class AudioShakeTranscriber(BaseTranscriber):
162
242
  config: Optional[AudioShakeConfig] = None,
163
243
  logger: Optional[Any] = None,
164
244
  api_client: Optional[AudioShakeAPI] = None,
245
+ upload_optimizer: Optional[AudioUploadOptimizer] = None,
165
246
  ):
166
247
  """Initialize AudioShake transcriber."""
167
248
  super().__init__(cache_dir=cache_dir, logger=logger)
168
249
  self.config = config or AudioShakeConfig(api_token=os.getenv("AUDIOSHAKE_API_TOKEN"))
169
250
  self.api = api_client or AudioShakeAPI(self.config, self.logger)
251
+ self.upload_optimizer = upload_optimizer or AudioUploadOptimizer(self.logger)
170
252
 
171
253
  def get_name(self) -> str:
172
254
  return "AudioShake"
@@ -195,14 +277,21 @@ class AudioShakeTranscriber(BaseTranscriber):
195
277
  """Starts the transcription task and returns the task ID."""
196
278
  self.logger.debug(f"Entering start_transcription() for {audio_filepath}")
197
279
 
198
- # Upload file and create task
199
- file_url = self.api.upload_file(audio_filepath)
200
- self.logger.debug(f"File uploaded successfully. File URL: {file_url}")
280
+ # Optimize file format for upload (convert WAV to FLAC, etc.)
281
+ upload_filepath, temp_filepath = self.upload_optimizer.prepare_for_upload(audio_filepath)
282
+
283
+ try:
284
+ # Upload file and create task
285
+ file_url = self.api.upload_file(upload_filepath)
286
+ self.logger.debug(f"File uploaded successfully. File URL: {file_url}")
201
287
 
202
- task_id = self.api.create_task(file_url)
203
- self.logger.debug(f"Task created successfully. Task ID: {task_id}")
288
+ task_id = self.api.create_task(file_url)
289
+ self.logger.debug(f"Task created successfully. Task ID: {task_id}")
204
290
 
205
- return task_id
291
+ return task_id
292
+ finally:
293
+ # Clean up any temporary file created during optimization
294
+ self.upload_optimizer.cleanup(temp_filepath)
206
295
 
207
296
  def get_transcription_result(self, task_id: str) -> Dict[str, Any]:
208
297
  """Gets the raw results for a previously started task."""
@@ -363,30 +363,32 @@ class AnchorSequence:
363
363
  def from_dict(cls, data: Dict[str, Any]) -> "AnchorSequence":
364
364
  """Create AnchorSequence from dictionary."""
365
365
  # Handle both old and new dictionary formats
366
- if "words" in data:
367
- # Old format - convert to new format without setting _words
368
- # This ensures to_dict() always returns the new format
369
- words = data["words"]
366
+ # Check for new format keys FIRST (they take priority even if old keys also present)
367
+ if "transcribed_word_ids" in data:
368
+ # New format - use existing IDs
370
369
  return cls(
371
370
  id=data.get("id", WordUtils.generate_id()),
372
- transcribed_word_ids=[WordUtils.generate_id() for _ in words],
371
+ transcribed_word_ids=data["transcribed_word_ids"],
373
372
  transcription_position=data["transcription_position"],
374
373
  reference_positions=data["reference_positions"],
375
- reference_word_ids={source: [WordUtils.generate_id() for _ in words]
376
- for source in data["reference_positions"].keys()},
374
+ reference_word_ids=data["reference_word_ids"],
377
375
  confidence=data["confidence"],
378
- # Don't set _words - this ensures we always use the new format
379
376
  )
380
- else:
381
- # New format
377
+ elif "words" in data:
378
+ # Old format only - convert to new format by generating IDs
379
+ # This ensures to_dict() always returns the new format
380
+ words = data["words"]
382
381
  return cls(
383
382
  id=data.get("id", WordUtils.generate_id()),
384
- transcribed_word_ids=data["transcribed_word_ids"],
383
+ transcribed_word_ids=[WordUtils.generate_id() for _ in words],
385
384
  transcription_position=data["transcription_position"],
386
385
  reference_positions=data["reference_positions"],
387
- reference_word_ids=data["reference_word_ids"],
386
+ reference_word_ids={source: [WordUtils.generate_id() for _ in words]
387
+ for source in data["reference_positions"].keys()},
388
388
  confidence=data["confidence"],
389
389
  )
390
+ else:
391
+ raise ValueError("AnchorSequence.from_dict requires either 'transcribed_word_ids' or 'words' key")
390
392
 
391
393
 
392
394
  @dataclass