PyPI - karaoke-gen - Versions diffs - 0.75.16__py3-none-any.whl → 0.76.20__py3-none-any.whl - Mend

karaoke-gen 0.75.16__py3-none-any.whl → 0.76.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

karaoke_gen/audio_fetcher.py +984 -33
karaoke_gen/audio_processor.py +4 -0
karaoke_gen/instrumental_review/static/index.html +37 -14
karaoke_gen/karaoke_finalise/karaoke_finalise.py +25 -1
karaoke_gen/karaoke_gen.py +208 -39
karaoke_gen/lyrics_processor.py +111 -31
karaoke_gen/utils/__init__.py +26 -0
karaoke_gen/utils/cli_args.py +15 -6
karaoke_gen/utils/gen_cli.py +30 -5
karaoke_gen/utils/remote_cli.py +301 -20
{karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/METADATA +107 -5
{karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/RECORD +47 -43
lyrics_transcriber/core/controller.py +76 -2
lyrics_transcriber/frontend/index.html +5 -1
lyrics_transcriber/frontend/package-lock.json +4553 -0
lyrics_transcriber/frontend/package.json +4 -1
lyrics_transcriber/frontend/playwright.config.ts +69 -0
lyrics_transcriber/frontend/public/nomad-karaoke-logo.svg +5 -0
lyrics_transcriber/frontend/src/App.tsx +94 -63
lyrics_transcriber/frontend/src/api.ts +25 -10
lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +55 -21
lyrics_transcriber/frontend/src/components/AppHeader.tsx +65 -0
lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +5 -5
lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +9 -9
lyrics_transcriber/frontend/src/components/EditModal.tsx +1 -1
lyrics_transcriber/frontend/src/components/EditWordList.tsx +1 -1
lyrics_transcriber/frontend/src/components/Header.tsx +34 -48
lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +22 -21
lyrics_transcriber/frontend/src/components/ReferenceView.tsx +1 -1
lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +1 -1
lyrics_transcriber/frontend/src/components/WordDivider.tsx +3 -3
lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +2 -2
lyrics_transcriber/frontend/src/components/shared/constants.ts +15 -5
lyrics_transcriber/frontend/src/main.tsx +1 -7
lyrics_transcriber/frontend/src/theme.ts +337 -135
lyrics_transcriber/frontend/vite.config.ts +5 -0
lyrics_transcriber/frontend/web_assets/assets/{index-COYImAcx.js → index-BECn1o8Q.js} +38 -22
lyrics_transcriber/frontend/web_assets/assets/{index-COYImAcx.js.map → index-BECn1o8Q.js.map} +1 -1
lyrics_transcriber/frontend/web_assets/index.html +1 -1
lyrics_transcriber/frontend/yarn.lock +1005 -1046
lyrics_transcriber/output/countdown_processor.py +39 -0
lyrics_transcriber/review/server.py +1 -1
lyrics_transcriber/transcribers/audioshake.py +96 -7
lyrics_transcriber/types.py +14 -12
{karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/WHEEL +0 -0
{karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/entry_points.txt +0 -0
{karaoke_gen-0.75.16.dist-info → karaoke_gen-0.76.20.dist-info}/licenses/LICENSE +0 -0

karaoke_gen/lyrics_processor.py CHANGED Viewed

@@ -27,10 +27,10 @@ class LyricsProcessor:
     def _detect_countdown_padding_from_lrc(self, lrc_filepath):
         """
-        Detect if countdown padding was applied by checking the first lyric timestamp in the LRC file.
+        Detect if countdown padding was applied by checking for countdown text in the LRC file.
-        LRC format timestamps look like: [mm:ss.xx] or [mm:ss.xxx]
-        If the first lyric timestamp is >= 3.0 seconds, countdown padding was likely applied.
+        The countdown segment has the text "3... 2... 1..." at timestamp 0.1-2.9s.
+        We detect this by looking for the countdown text pattern.
         Args:
             lrc_filepath: Path to the LRC file
@@ -42,7 +42,15 @@ class LyricsProcessor:
             with open(lrc_filepath, 'r', encoding='utf-8') as f:
                 content = f.read()
-            # Find all timestamp patterns in the LRC file
+            # Method 1: Check for countdown text pattern "3... 2... 1..."
+            # This is the most reliable detection method since the countdown text is unique
+            countdown_text = "3... 2... 1..."
+            if countdown_text in content:
+                self.logger.info(f"Detected countdown padding from LRC: found countdown text '{countdown_text}'")
+                return (True, self.COUNTDOWN_PADDING_SECONDS)
+            # Method 2 (fallback): Check if first lyric timestamp is >= 3 seconds
+            # This handles cases where countdown text format might differ
             # LRC timestamps: [mm:ss.xx] or [mm:ss.xxx]
             timestamp_pattern = r'\[(\d{1,2}):(\d{2})\.(\d{2,3})\]'
             matches = re.findall(timestamp_pattern, content)
@@ -51,8 +59,7 @@ class LyricsProcessor:
                 self.logger.debug("No timestamps found in LRC file")
                 return (False, 0.0)
-            # Find the first non-metadata timestamp (metadata like [ar:Artist] doesn't have decimal)
-            # We already filtered for decimal timestamps in our pattern
+            # Parse the first timestamp
             first_timestamp = matches[0]
             minutes = int(first_timestamp[0])
             seconds = int(first_timestamp[1])
@@ -160,6 +167,76 @@ class LyricsProcessor:
         return processed_lines
+    def _check_transcription_providers(self) -> dict:
+        """
+        Check which transcription providers are configured and return their status.
+        Returns:
+            dict with 'configured' (list of provider names) and 'missing' (list of missing configs)
+        """
+        load_dotenv()
+        configured = []
+        missing = []
+        # Check AudioShake
+        audioshake_token = os.getenv("AUDIOSHAKE_API_TOKEN")
+        if audioshake_token:
+            configured.append("AudioShake")
+            self.logger.debug("AudioShake transcription provider: configured")
+        else:
+            missing.append("AudioShake (AUDIOSHAKE_API_TOKEN)")
+            self.logger.debug("AudioShake transcription provider: not configured (missing AUDIOSHAKE_API_TOKEN)")
+        # Check Whisper via RunPod
+        runpod_key = os.getenv("RUNPOD_API_KEY")
+        whisper_id = os.getenv("WHISPER_RUNPOD_ID")
+        if runpod_key and whisper_id:
+            configured.append("Whisper (RunPod)")
+            self.logger.debug("Whisper transcription provider: configured")
+        elif runpod_key:
+            missing.append("Whisper (missing WHISPER_RUNPOD_ID)")
+            self.logger.debug("Whisper transcription provider: partially configured (missing WHISPER_RUNPOD_ID)")
+        elif whisper_id:
+            missing.append("Whisper (missing RUNPOD_API_KEY)")
+            self.logger.debug("Whisper transcription provider: partially configured (missing RUNPOD_API_KEY)")
+        else:
+            missing.append("Whisper (RUNPOD_API_KEY + WHISPER_RUNPOD_ID)")
+            self.logger.debug("Whisper transcription provider: not configured")
+        return {"configured": configured, "missing": missing}
+    def _build_transcription_provider_error_message(self, missing_providers: list) -> str:
+        """Build a helpful error message when no transcription providers are configured."""
+        return (
+            "No transcription providers configured!\n"
+            "\n"
+            "Karaoke video generation requires at least one transcription provider to create "
+            "synchronized lyrics. Without a transcription provider, the system cannot generate "
+            "the word-level timing data needed for the karaoke video.\n"
+            "\n"
+            "AVAILABLE TRANSCRIPTION PROVIDERS:\n"
+            "\n"
+            "1. AudioShake (Recommended - Commercial, high-quality)\n"
+            "   - Set environment variable: AUDIOSHAKE_API_TOKEN=your_token\n"
+            "   - Get an API key at: https://www.audioshake.ai/\n"
+            "\n"
+            "2. Whisper via RunPod (Open-source alternative)\n"
+            "   - Set environment variables:\n"
+            "     RUNPOD_API_KEY=your_key\n"
+            "     WHISPER_RUNPOD_ID=your_endpoint_id\n"
+            "   - Set up a Whisper endpoint at: https://www.runpod.io/\n"
+            "\n"
+            "ALTERNATIVES:\n"
+            "\n"
+            "- Use --skip-lyrics flag to generate instrumental-only karaoke (no synchronized lyrics)\n"
+            "- Use --lyrics_file to provide pre-timed lyrics (still needs transcription for timing)\n"
+            "\n"
+            f"Missing provider configurations: {', '.join(missing_providers)}\n"
+            "\n"
+            "See README.md 'Transcription Providers' section for detailed setup instructions."
+        )
     def transcribe_lyrics(self, input_audio_wav, artist, title, track_output_dir, lyrics_artist=None, lyrics_title=None):
         """
         Transcribe lyrics for a track.
@@ -171,6 +248,9 @@ class LyricsProcessor:
             track_output_dir: Output directory path
             lyrics_artist: Artist name for lyrics processing (defaults to artist if None)
             lyrics_title: Title for lyrics processing (defaults to title if None)
+        Raises:
+            ValueError: If transcription is enabled but no providers are configured
         """
         # Use original artist/title for filename generation
         filename_artist = artist
@@ -234,6 +314,17 @@ class LyricsProcessor:
                 "padded_audio_filepath": None,  # Original padded audio may not exist
             }
+        # Check transcription provider configuration if transcription is not being skipped
+        # Do this AFTER checking for existing files, since existing files don't need transcription
+        if not self.skip_transcription:
+            provider_status = self._check_transcription_providers()
+            if provider_status["configured"]:
+                self.logger.info(f"Transcription providers configured: {', '.join(provider_status['configured'])}")
+            else:
+                error_msg = self._build_transcription_provider_error_message(provider_status["missing"])
+                raise ValueError(error_msg)
         # Create lyrics directory if it doesn't exist
         os.makedirs(lyrics_dir, exist_ok=True)
         self.logger.info(f"Created lyrics directory: {lyrics_dir}")
@@ -273,41 +364,30 @@ class LyricsProcessor:
         self.logger.info(f"  rapidapi_key: {env_config.get('rapidapi_key')[:3] + '...' if env_config.get('rapidapi_key') else 'None'}")
         self.logger.info(f"  lyrics_file: {self.lyrics_file}")
-        # Detect if we're running in a serverless environment (Modal)
-        # Modal sets specific environment variables we can check for
-        is_serverless = (
-            os.getenv("MODAL_TASK_ID") is not None or
-            os.getenv("MODAL_FUNCTION_NAME") is not None or
-            os.path.exists("/.modal")  # Modal creates this directory in containers
-        )
-        # In serverless environment, disable interactive review even if skip_transcription_review=False
-        # This preserves CLI behavior while fixing serverless hanging
-        enable_review_setting = not self.skip_transcription_review and not is_serverless
-        if is_serverless and not self.skip_transcription_review:
-            self.logger.info("Detected serverless environment - disabling interactive review to prevent hanging")
-        # In serverless environment, disable video generation during Phase 1 to save compute
-        # Video will be generated in Phase 2 after human review
-        serverless_render_video = render_video and not is_serverless
-        if is_serverless and render_video:
-            self.logger.info("Detected serverless environment - deferring video generation until after review")
+        # Always defer countdown and video rendering to a later phase.
+        # This ensures the review UI (both local and cloud) shows original timing
+        # without the 3-second countdown shift. The caller is responsible for:
+        # - Local CLI: karaoke_gen.py adds countdown and renders video after transcription
+        # - Cloud backend: render_video_worker.py adds countdown and renders video
+        #
+        # This design ensures consistent behavior regardless of environment,
+        # and the review UI always shows accurate, unshifted timestamps.
+        self.logger.info("Deferring countdown and video rendering to post-review phase")
         output_config = OutputConfig(
             output_styles_json=self.style_params_json,
             output_dir=lyrics_dir,
-            render_video=serverless_render_video,  # Disable video in serverless Phase 1
+            render_video=False,  # Always defer - caller handles video rendering after countdown
             fetch_lyrics=True,
             run_transcription=not self.skip_transcription,
             run_correction=True,
             generate_plain_text=True,
             generate_lrc=True,
-            generate_cdg=False,  # Also defer CDG generation to Phase 2
+            generate_cdg=False,  # CDG generation disabled (not currently supported)
             video_resolution="4k",
-            enable_review=enable_review_setting,
+            enable_review=not self.skip_transcription_review,  # Honor the caller's setting
             subtitle_offset_ms=self.subtitle_offset_ms,
+            add_countdown=False,  # Always defer - caller handles countdown after review
         )
         # Add this log entry to debug the OutputConfig

karaoke_gen/utils/__init__.py CHANGED Viewed

@@ -1,9 +1,35 @@
 import re
+# Unicode character replacements for ASCII-safe filenames
+# These characters cause issues with HTTP headers (latin-1 encoding) and some filesystems
+UNICODE_REPLACEMENTS = {
+    # Curly/smart quotes -> straight quotes
+    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
+    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK (the one causing the bug)
+    "\u201A": "'",  # SINGLE LOW-9 QUOTATION MARK
+    "\u201B": "'",  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    "\u201C": '"',  # LEFT DOUBLE QUOTATION MARK
+    "\u201D": '"',  # RIGHT DOUBLE QUOTATION MARK
+    "\u201E": '"',  # DOUBLE LOW-9 QUOTATION MARK
+    "\u201F": '"',  # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+    # Other common problematic characters
+    "\u2013": "-",  # EN DASH
+    "\u2014": "-",  # EM DASH
+    "\u2026": "...",  # HORIZONTAL ELLIPSIS
+    "\u00A0": " ",  # NON-BREAKING SPACE
+}
 def sanitize_filename(filename):
     """Replace or remove characters that are unsafe for filenames."""
     if filename is None:
         return None
+    # First, normalize Unicode characters that cause HTTP header encoding issues
+    # (e.g., curly quotes from macOS/Word that can't be encoded in latin-1)
+    for unicode_char, ascii_replacement in UNICODE_REPLACEMENTS.items():
+        filename = filename.replace(unicode_char, ascii_replacement)
     # Replace problematic characters with underscores
     for char in ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]:
         filename = filename.replace(char, "_")

karaoke_gen/utils/cli_args.py CHANGED Viewed

@@ -242,9 +242,17 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
     # Style Configuration
     style_group = parser.add_argument_group("Style Configuration")
+    style_group.add_argument(
+        "--theme",
+        help="Optional: Theme ID for pre-made styles stored in GCS (e.g., 'nomad', 'default'). "
+             "When using a theme, CDG/TXT are enabled by default. "
+             "Example: --theme=nomad",
+    )
     style_group.add_argument(
         "--style_params_json",
-        help="Optional: Path to JSON file containing style configuration. Example: --style_params_json='/path/to/style_params.json'",
+        help="Optional: Path to JSON file containing style configuration. "
+             "Takes precedence over --theme if both are provided. "
+             "Example: --style_params_json='/path/to/style_params.json'",
     )
     style_group.add_argument(
         "--style_override",
@@ -258,8 +266,8 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
     style_group.add_argument(
         "--background_video_darkness",
         type=int,
-        default=0,
-        help="Optional: Darkness overlay percentage (0-100) for video background (default: %(default)s). Example: --background_video_darkness=50",
+        default=50,
+        help="Optional: Darkness overlay percentage (0-100) for video background (default: %(default)s). Example: --background_video_darkness=20",
     )
     # Finalisation Configuration
@@ -352,9 +360,10 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
     )
     remote_group.add_argument(
         "--review-ui-url",
-        default=os.environ.get('REVIEW_UI_URL', os.environ.get('LYRICS_REVIEW_UI_URL', 'https://lyrics.nomadkaraoke.com')),
-        help="Lyrics review UI URL. Default: 'https://lyrics.nomadkaraoke.com'. "
-             "Use 'http://localhost:5173' for Vite dev server during development. "
+        default=os.environ.get('REVIEW_UI_URL', os.environ.get('LYRICS_REVIEW_UI_URL', 'https://gen.nomadkaraoke.com/lyrics')),
+        help="Lyrics review UI URL. For remote mode: defaults to 'https://gen.nomadkaraoke.com/lyrics'. "
+             "For local mode: defaults to bundled frontend (from lyrics_transcriber/frontend/). "
+             "Use 'http://localhost:5173' to develop against Vite dev server. "
              "(env: REVIEW_UI_URL or LYRICS_REVIEW_UI_URL)",
     )
     remote_group.add_argument(

karaoke_gen/utils/gen_cli.py CHANGED Viewed

@@ -313,9 +313,18 @@ async def async_main():
     args = parser.parse_args()
     # Set review UI URL environment variable for the lyrics transcriber review server
-    # This allows development against a local frontend dev server (e.g., http://localhost:5173)
+    # Only set this if the user explicitly wants to use a dev server (e.g., http://localhost:5173)
+    # By default, let the ReviewServer use its bundled local frontend (served from lyrics_transcriber/frontend/)
+    # This enables local iteration on the frontend without redeploying
     if hasattr(args, 'review_ui_url') and args.review_ui_url:
-        os.environ['LYRICS_REVIEW_UI_URL'] = args.review_ui_url
+        # Check if user provided a custom value (not the default hosted URL)
+        default_hosted_urls = [
+            'https://gen.nomadkaraoke.com/lyrics',
+            'https://gen.nomadkaraoke.com/lyrics/'
+        ]
+        if args.review_ui_url.rstrip('/') not in [url.rstrip('/') for url in default_hosted_urls]:
+            # User explicitly wants a specific URL (e.g., Vite dev server)
+            os.environ['LYRICS_REVIEW_UI_URL'] = args.review_ui_url
     # Process style overrides
     try:
@@ -746,7 +755,7 @@ async def async_main():
     except UserCancelledError:
         logger.info("Operation cancelled by user")
         return
-    except KeyboardInterrupt:
+    except (KeyboardInterrupt, asyncio.CancelledError):
         logger.info("Operation cancelled by user (Ctrl+C)")
         return
@@ -775,12 +784,28 @@ async def async_main():
         logger.info(f"Changing to directory: {track_dir}")
         os.chdir(track_dir)
-        # Select instrumental file - either via web UI or auto-selection
+        # Select instrumental file - either via web UI, auto-selection, or custom instrumental
         # This ALWAYS produces a selected file - no silent fallback to legacy code
         selected_instrumental_file = None
         skip_review = getattr(args, 'skip_instrumental_review', False)
-        if skip_review:
+        # Check if a custom instrumental was provided (via --existing_instrumental)
+        # In this case, the instrumental is already chosen - skip review entirely
+        separated_audio = track.get("separated_audio", {})
+        custom_instrumental = separated_audio.get("Custom", {}).get("instrumental")
+        if custom_instrumental:
+            # Custom instrumental was provided - use it directly, no review needed
+            resolved_path = _resolve_path_for_cwd(custom_instrumental, track_dir)
+            if os.path.exists(resolved_path):
+                logger.info(f"Using custom instrumental (--existing_instrumental): {resolved_path}")
+                selected_instrumental_file = resolved_path
+            else:
+                logger.error(f"Custom instrumental file not found: {resolved_path}")
+                logger.error("The file may have been moved or deleted after preparation.")
+                sys.exit(1)
+                return  # Explicit return for testing
+        elif skip_review:
             # Auto-select instrumental when review is skipped (non-interactive mode)
             logger.info("Instrumental review skipped (--skip_instrumental_review), auto-selecting instrumental file...")
             try:

karaoke-gen 0.75.16__py3-none-any.whl → 0.76.20__py3-none-any.whl

karaoke-gen 0.75.16__py3-none-any.whl → 0.76.20__py3-none-any.whl