karaoke-gen 0.71.27-py3-none-any.whl → 0.75.16-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- karaoke_gen/__init__.py +32 -1
- karaoke_gen/audio_fetcher.py +476 -56
- karaoke_gen/audio_processor.py +11 -3
- karaoke_gen/file_handler.py +192 -0
- karaoke_gen/instrumental_review/__init__.py +45 -0
- karaoke_gen/instrumental_review/analyzer.py +408 -0
- karaoke_gen/instrumental_review/editor.py +322 -0
- karaoke_gen/instrumental_review/models.py +171 -0
- karaoke_gen/instrumental_review/server.py +475 -0
- karaoke_gen/instrumental_review/static/index.html +1506 -0
- karaoke_gen/instrumental_review/waveform.py +409 -0
- karaoke_gen/karaoke_finalise/karaoke_finalise.py +62 -1
- karaoke_gen/karaoke_gen.py +114 -1
- karaoke_gen/lyrics_processor.py +81 -4
- karaoke_gen/utils/bulk_cli.py +3 -0
- karaoke_gen/utils/cli_args.py +9 -2
- karaoke_gen/utils/gen_cli.py +379 -2
- karaoke_gen/utils/remote_cli.py +1126 -77
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/METADATA +7 -1
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/RECORD +38 -26
- lyrics_transcriber/correction/anchor_sequence.py +226 -350
- lyrics_transcriber/frontend/package.json +1 -1
- lyrics_transcriber/frontend/src/components/Header.tsx +38 -12
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +17 -3
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
- lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
- lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +190 -542
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
- lyrics_transcriber/frontend/web_assets/assets/{index-DdJTDWH3.js → index-COYImAcx.js} +1722 -489
- lyrics_transcriber/frontend/web_assets/assets/index-COYImAcx.js.map +1 -0
- lyrics_transcriber/frontend/web_assets/index.html +1 -1
- lyrics_transcriber/review/server.py +5 -5
- lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +0 -1
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/licenses/LICENSE +0 -0
karaoke_gen/instrumental_review/waveform.py
ADDED
@@ -0,0 +1,409 @@
+"""
+Waveform visualization generator for audio files.
+
+This module provides the WaveformGenerator class which creates waveform
+images suitable for display in the instrumental review UI.
+"""
+
+import logging
+import math
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+import matplotlib.pyplot as plt
+import numpy as np
+from pydub import AudioSegment
+
+from .models import AudibleSegment, MuteRegion
+
+
+logger = logging.getLogger(__name__)
+
+
+class WaveformGenerator:
+    """
+    Generates waveform visualization images from audio files.
+
+    This class creates PNG images showing the amplitude envelope of an
+    audio file over time. It can highlight detected audible segments
+    and mute regions with different colors.
+
+    The generated images are suitable for display in web UIs and can
+    be used for interactive seeking (click-to-seek) functionality.
+
+    Attributes:
+        width: Width of the output image in pixels (default: 1200)
+        height: Height of the output image in pixels (default: 200)
+        background_color: Background color (default: "#1a1a2e")
+        waveform_color: Main waveform color (default: "#4a90d9")
+        segment_color: Color for audible segments (default: "#e94560")
+        mute_color: Color for mute regions (default: "#ff6b6b")
+        time_axis_color: Color for time axis (default: "#ffffff")
+
+    Example:
+        >>> generator = WaveformGenerator(width=1200, height=200)
+        >>> generator.generate(
+        ...     audio_path="/path/to/backing_vocals.flac",
+        ...     output_path="/path/to/waveform.png",
+        ...     segments=analysis_result.audible_segments
+        ... )
+    """
+
+    def __init__(
+        self,
+        width: int = 1200,
+        height: int = 200,
+        background_color: str = "#1a1a2e",
+        waveform_color: str = "#4a90d9",
+        segment_color: str = "#e94560",
+        mute_color: str = "#ff6b6b",
+        time_axis_color: str = "#ffffff",
+        dpi: int = 100,
+    ):
+        """
+        Initialize the waveform generator.
+
+        Args:
+            width: Width of the output image in pixels
+            height: Height of the output image in pixels
+            background_color: Background color (hex or named color)
+            waveform_color: Main waveform color
+            segment_color: Color for highlighting audible segments
+            mute_color: Color for highlighting mute regions
+            time_axis_color: Color for time axis labels
+            dpi: DPI for the output image
+        """
+        self.width = width
+        self.height = height
+        self.background_color = background_color
+        self.waveform_color = waveform_color
+        self.segment_color = segment_color
+        self.mute_color = mute_color
+        self.time_axis_color = time_axis_color
+        self.dpi = dpi
+
+    def generate(
+        self,
+        audio_path: str,
+        output_path: str,
+        segments: Optional[List[AudibleSegment]] = None,
+        mute_regions: Optional[List[MuteRegion]] = None,
+        show_time_axis: bool = True,
+        silence_threshold_db: float = -40.0,
+    ) -> str:
+        """
+        Generate a waveform image from an audio file.
+
+        Args:
+            audio_path: Path to the audio file
+            output_path: Path where the PNG image will be saved
+            segments: Optional list of audible segments to highlight
+            mute_regions: Optional list of mute regions to highlight
+            show_time_axis: Whether to show time axis labels
+            silence_threshold_db: Threshold for visual reference line
+
+        Returns:
+            Path to the generated image file
+
+        Raises:
+            FileNotFoundError: If the audio file doesn't exist
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        logger.info(f"Generating waveform for: {audio_path}")
+
+        # Load audio
+        audio = AudioSegment.from_file(audio_path)
+        duration_seconds = len(audio) / 1000.0
+
+        # Convert to mono if needed
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        # Get amplitude envelope
+        envelope = self._get_envelope(audio)
+
+        # Create the figure
+        fig, ax = self._create_figure(duration_seconds, show_time_axis)
+
+        # Draw waveform
+        self._draw_waveform(ax, envelope, duration_seconds)
+
+        # Highlight mute regions (if any) - draw first so waveform is on top
+        if mute_regions:
+            self._draw_mute_regions(ax, mute_regions, duration_seconds)
+
+        # Highlight audible segments (if any)
+        if segments:
+            self._draw_segments(ax, segments, envelope, duration_seconds)
+
+        # Draw silence threshold reference line
+        self._draw_threshold_line(ax, silence_threshold_db, duration_seconds)
+
+        # Save the figure
+        output_dir = Path(output_path).parent
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        fig.savefig(
+            output_path,
+            facecolor=self.background_color,
+            edgecolor='none',
+            bbox_inches='tight',
+            pad_inches=0.1,
+        )
+        plt.close(fig)
+
+        logger.info(f"Waveform saved to: {output_path}")
+        return output_path
+
+    def generate_data_only(
+        self,
+        audio_path: str,
+        num_points: int = 500,
+    ) -> Tuple[List[float], float]:
+        """
+        Generate waveform data without creating an image.
+
+        This is useful for sending data to a frontend that will
+        render the waveform itself (e.g., using Canvas or SVG).
+
+        Args:
+            audio_path: Path to the audio file
+            num_points: Number of data points to return
+
+        Returns:
+            Tuple of (amplitude_values, duration_seconds)
+            Amplitude values are normalized to 0.0-1.0 range.
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        audio = AudioSegment.from_file(audio_path)
+        duration_seconds = len(audio) / 1000.0
+
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        # Calculate window size to get desired number of points
+        duration_ms = len(audio)
+        window_ms = max(1, duration_ms // num_points)
+
+        amplitudes = []
+        for start_ms in range(0, duration_ms, window_ms):
+            end_ms = min(start_ms + window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            if window.rms > 0:
+                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+            else:
+                db = -100.0
+
+            # Normalize to 0-1 range (mapping -60dB to 0dB -> 0 to 1)
+            normalized = max(0.0, min(1.0, (db + 60) / 60))
+            amplitudes.append(normalized)
+
+        return amplitudes, duration_seconds
+
+    def _get_envelope(
+        self,
+        audio: AudioSegment,
+        window_ms: int = 50,
+    ) -> np.ndarray:
+        """
+        Extract amplitude envelope from audio.
+
+        Returns array of amplitude values in dB.
+        """
+        duration_ms = len(audio)
+        amplitudes = []
+
+        for start_ms in range(0, duration_ms, window_ms):
+            end_ms = min(start_ms + window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            if window.rms > 0:
+                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+            else:
+                db = -100.0
+
+            amplitudes.append(db)
+
+        return np.array(amplitudes)
+
+    def _create_figure(
+        self,
+        duration_seconds: float,
+        show_time_axis: bool,
+    ) -> Tuple[plt.Figure, plt.Axes]:
+        """
+        Create matplotlib figure and axes.
+        """
+        fig_width = self.width / self.dpi
+        fig_height = self.height / self.dpi
+
+        fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=self.dpi)
+
+        # Set background
+        fig.patch.set_facecolor(self.background_color)
+        ax.set_facecolor(self.background_color)
+
+        # Configure axes
+        ax.set_xlim(0, duration_seconds)
+        ax.set_ylim(-60, 0)  # dB range
+
+        # Remove spines
+        for spine in ax.spines.values():
+            spine.set_visible(False)
+
+        # Configure ticks
+        if show_time_axis:
+            ax.tick_params(
+                axis='x',
+                colors=self.time_axis_color,
+                labelsize=8,
+            )
+            ax.tick_params(axis='y', left=False, labelleft=False)
+
+            # Set time axis ticks
+            self._set_time_ticks(ax, duration_seconds)
+        else:
+            ax.tick_params(
+                axis='both',
+                left=False,
+                bottom=False,
+                labelleft=False,
+                labelbottom=False,
+            )
+
+        return fig, ax
+
+    def _set_time_ticks(self, ax: plt.Axes, duration_seconds: float):
+        """
+        Set appropriate time axis tick marks.
+        """
+        if duration_seconds <= 60:
+            # Under 1 minute: tick every 10 seconds
+            tick_interval = 10
+        elif duration_seconds <= 300:
+            # Under 5 minutes: tick every 30 seconds
+            tick_interval = 30
+        else:
+            # Over 5 minutes: tick every minute
+            tick_interval = 60
+
+        ticks = np.arange(0, duration_seconds + 1, tick_interval)
+        ax.set_xticks(ticks)
+
+        # Format tick labels as MM:SS
+        labels = []
+        for t in ticks:
+            minutes = int(t // 60)
+            seconds = int(t % 60)
+            labels.append(f"{minutes}:{seconds:02d}")
+        ax.set_xticklabels(labels)
+
+    def _draw_waveform(
+        self,
+        ax: plt.Axes,
+        envelope: np.ndarray,
+        duration_seconds: float,
+    ):
+        """
+        Draw the main waveform.
+        """
+        num_points = len(envelope)
+        time_points = np.linspace(0, duration_seconds, num_points)
+
+        # Draw as filled area
+        ax.fill_between(
+            time_points,
+            envelope,
+            -60,  # Bottom of range
+            color=self.waveform_color,
+            alpha=0.7,
+        )
+
+        # Draw outline
+        ax.plot(
+            time_points,
+            envelope,
+            color=self.waveform_color,
+            linewidth=0.5,
+            alpha=0.9,
+        )
+
+    def _draw_segments(
+        self,
+        ax: plt.Axes,
+        segments: List[AudibleSegment],
+        envelope: np.ndarray,
+        duration_seconds: float,
+    ):
+        """
+        Highlight audible segments on the waveform.
+        """
+        num_points = len(envelope)
+        time_points = np.linspace(0, duration_seconds, num_points)
+
+        for segment in segments:
+            # Find indices corresponding to this segment
+            start_idx = int(segment.start_seconds / duration_seconds * num_points)
+            end_idx = int(segment.end_seconds / duration_seconds * num_points)
+
+            start_idx = max(0, min(start_idx, num_points - 1))
+            end_idx = max(0, min(end_idx, num_points))
+
+            if start_idx >= end_idx:
+                continue
+
+            segment_time = time_points[start_idx:end_idx]
+            segment_envelope = envelope[start_idx:end_idx]
+
+            # Highlight this segment with a different color
+            ax.fill_between(
+                segment_time,
+                segment_envelope,
+                -60,
+                color=self.segment_color,
+                alpha=0.6,
+            )

+    def _draw_mute_regions(
+        self,
+        ax: plt.Axes,
+        mute_regions: List[MuteRegion],
+        duration_seconds: float,
+    ):
+        """
+        Draw mute region overlays.
+        """
+        for region in mute_regions:
+            ax.axvspan(
+                region.start_seconds,
+                region.end_seconds,
+                color=self.mute_color,
+                alpha=0.3,
+                zorder=0,
+            )
+
+    def _draw_threshold_line(
+        self,
+        ax: plt.Axes,
+        threshold_db: float,
+        duration_seconds: float,
+    ):
+        """
+        Draw a reference line at the silence threshold.
+        """
+        ax.axhline(
+            y=threshold_db,
+            color=self.time_axis_color,
+            linestyle='--',
+            linewidth=0.5,
+            alpha=0.3,
+        )
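A note on the envelope math above: `generate_data_only` converts each window's RMS level to dBFS (`20 * log10(rms / max_possible_amplitude)`) and then maps the -60 dB to 0 dB range onto 0.0-1.0. A standalone sketch of that arithmetic, using a made-up RMS value (the full-scale value matches pydub's `max_possible_amplitude` for 16-bit samples):

```python
import math

# Hypothetical window: 16-bit audio with RMS at 10% of full scale.
max_possible_amplitude = 32768.0  # 2 ** 15, pydub's full scale for 2-byte samples
rms = 3276.8                      # made-up example value (10% of full scale)

db = 20 * math.log10(rms / max_possible_amplitude)  # -> -20.0 dBFS
normalized = max(0.0, min(1.0, (db + 60) / 60))     # -> 0.667 on the 0-1 scale

print(f"{db:.1f} dB -> {normalized:.3f}")  # anything at or below -60 dB clamps to 0.0
```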
karaoke_gen/karaoke_finalise/karaoke_finalise.py
CHANGED
@@ -47,6 +47,7 @@ class KaraokeFinalise:
         user_youtube_credentials=None,  # Add support for pre-stored credentials
         server_side_mode=False,  # New parameter for server-side deployment
         selected_instrumental_file=None,  # Add support for pre-selected instrumental file
+        countdown_padding_seconds=None,  # Padding applied to vocals; instrumental must match
     ):
         self.log_level = log_level
         self.log_formatter = log_formatter
@@ -54,6 +55,9 @@ class KaraokeFinalise:
         if logger is None:
             self.logger = logging.getLogger(__name__)
             self.logger.setLevel(log_level)
+            # Prevent log propagation to root logger to avoid duplicate logs
+            # when external packages (like lyrics_converter) configure root logger handlers
+            self.logger.propagate = False
 
         self.log_handler = logging.StreamHandler()
 
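The `propagate = False` change above is easy to demonstrate outside the package; a minimal sketch of the duplicate-log problem it avoids (plain stdlib `logging`, nothing karaoke-gen specific):

```python
import logging

# Simulate an external package configuring a handler on the root logger.
logging.basicConfig(level=logging.INFO)

log = logging.getLogger("demo")
log.setLevel(logging.INFO)
log.addHandler(logging.StreamHandler())

log.info("hello")        # emitted twice: once by our handler, once via the root handler

log.propagate = False
log.info("hello again")  # emitted once: the record no longer reaches the root handler
```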
@@ -105,6 +109,7 @@ class KaraokeFinalise:
         self.user_youtube_credentials = user_youtube_credentials
         self.server_side_mode = server_side_mode
         self.selected_instrumental_file = selected_instrumental_file
+        self.countdown_padding_seconds = countdown_padding_seconds
 
         self.suffixes = {
             "title_mov": " (Title).mov",
@@ -421,6 +426,15 @@ class KaraokeFinalise:
         # Check if any videos were found
         if "items" in response and len(response["items"]) > 0:
             for item in response["items"]:
+                # YouTube search API sometimes returns results from other channels even with channelId filter
+                # Verify the video actually belongs to our channel
+                result_channel_id = item["snippet"]["channelId"]
+                if result_channel_id != channel_id:
+                    self.logger.debug(
+                        f"Skipping video from different channel: {item['snippet']['title']} (channel: {result_channel_id})"
+                    )
+                    continue
+
                 found_title = item["snippet"]["title"]
 
                 # In server-side mode, require an exact match to avoid false positives.
@@ -720,6 +734,32 @@ class KaraokeFinalise:
         artist, title = base_name.split(" - ", 1)
         return base_name, artist, title
 
+    def _pad_audio_file(self, input_audio, output_audio, padding_seconds):
+        """
+        Pad an audio file by prepending silence at the beginning.
+
+        Uses the same ffmpeg approach as LyricsTranscriber's CountdownProcessor
+        to ensure consistent padding behavior.
+
+        Args:
+            input_audio: Path to input audio file
+            output_audio: Path for the padded output file
+            padding_seconds: Amount of silence to prepend (in seconds)
+        """
+        self.logger.info(f"Padding audio file with {padding_seconds}s of silence")
+
+        # Use ffmpeg to prepend silence - this matches the approach in audio_processor.py
+        # adelay filter adds delay in milliseconds
+        delay_ms = int(padding_seconds * 1000)
+
+        ffmpeg_command = (
+            f'{self.ffmpeg_base_command} -i "{input_audio}" '
+            f'-af "adelay={delay_ms}|{delay_ms}" '
+            f'"{output_audio}"'
+        )
+
+        self.execute_command(ffmpeg_command, f"Padding audio with {padding_seconds}s silence")
+
     def execute_command(self, command, description):
         """Execute a shell command and log the output. For general commands (rclone, etc.)"""
         self.logger.info(f"{description}")
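For concreteness, here is roughly what `_pad_audio_file` builds for a 3-second pad (file names are hypothetical, and the real command starts with `self.ffmpeg_base_command` rather than bare `ffmpeg`; `adelay` takes one millisecond value per channel, hence the repeated `3000` for stereo):

```python
padding_seconds = 3.0
delay_ms = int(padding_seconds * 1000)  # adelay works in milliseconds -> 3000

# Hypothetical expansion of the f-string above, for a stereo file:
command = (
    f'ffmpeg -i "Artist - Title (Instrumental model).flac" '
    f'-af "adelay={delay_ms}|{delay_ms}" '
    f'"Artist - Title (Instrumental model) (Padded).flac"'
)
print(command)  # ... -af "adelay=3000|3000" ...
```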
@@ -764,11 +804,32 @@ class KaraokeFinalise:
 
     def remux_with_instrumental(self, with_vocals_file, instrumental_audio, output_file):
         """Remux the video with instrumental audio to create karaoke version"""
+        # Safety net: If countdown padding was applied to vocals, ensure instrumental is padded too
+        actual_instrumental = instrumental_audio
+        if self.countdown_padding_seconds and self.countdown_padding_seconds > 0:
+            # Check if the instrumental file is already padded (has "(Padded)" in name)
+            if "(Padded)" not in instrumental_audio:
+                self.logger.warning(
+                    f"Countdown padding ({self.countdown_padding_seconds}s) was applied to vocals, "
+                    f"but instrumental doesn't appear to be padded. Creating padded version..."
+                )
+                # Create a padded version of the instrumental
+                base, ext = os.path.splitext(instrumental_audio)
+                padded_instrumental = f"{base} (Padded){ext}"
+
+                if not os.path.exists(padded_instrumental):
+                    self._pad_audio_file(instrumental_audio, padded_instrumental, self.countdown_padding_seconds)
+                    self.logger.info(f"Created padded instrumental: {padded_instrumental}")
+
+                actual_instrumental = padded_instrumental
+            else:
+                self.logger.info(f"Using already-padded instrumental: {instrumental_audio}")
+
         # This operation is primarily I/O bound (remuxing), so hardware acceleration doesn't provide significant benefit
         # Keep the existing approach but use the new execute method
         ffmpeg_command = (
             f'{self.ffmpeg_base_command} -an -i "{with_vocals_file}" '
-            f'-vn -i "{
+            f'-vn -i "{actual_instrumental}" -c:v copy -c:a pcm_s16le "{output_file}"'
         )
         self.execute_command(ffmpeg_command, "Remuxing video with instrumental audio")
 
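Expanded with hypothetical file names, the new remux command reads as follows: `-an` before the first `-i` drops that input's audio and `-vn` before the second drops its video, so the output takes the video track from the with-vocals file (stream-copied) and the audio from the instrumental (re-encoded to 16-bit PCM):

```python
# Hypothetical expansion of the remux command above (names are placeholders):
with_vocals_file = "Artist - Title (With Vocals).mov"
actual_instrumental = "Artist - Title (Instrumental model) (Padded).flac"
output_file = "Artist - Title (Karaoke).mov"

command = (
    f'ffmpeg -an -i "{with_vocals_file}" '
    f'-vn -i "{actual_instrumental}" -c:v copy -c:a pcm_s16le "{output_file}"'
)
```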
karaoke_gen/karaoke_gen.py
CHANGED
@@ -29,7 +29,7 @@ from .audio_processor import AudioProcessor
 from .lyrics_processor import LyricsProcessor
 from .video_generator import VideoGenerator
 from .video_background_processor import VideoBackgroundProcessor
-from .audio_fetcher import create_audio_fetcher, AudioFetcherError, NoResultsError
+from .audio_fetcher import create_audio_fetcher, AudioFetcherError, NoResultsError, UserCancelledError
 
 
 class KaraokePrep:
@@ -84,6 +84,9 @@ class KaraokePrep:
         if logger is None:
             self.logger = logging.getLogger(__name__)
             self.logger.setLevel(log_level)
+            # Prevent log propagation to root logger to avoid duplicate logs
+            # when external packages (like lyrics_converter) configure root logger handlers
+            self.logger.propagate = False
 
         self.log_handler = logging.StreamHandler()
 
@@ -256,6 +259,101 @@ class KaraokePrep:
         self.artist = metadata_result["artist"]
         self.title = metadata_result["title"]
 
+    def _scan_directory_for_instrumentals(self, track_output_dir, artist_title):
+        """
+        Scan the directory for existing instrumental files and build a separated_audio structure.
+
+        This is used when transcription was skipped (existing files found) but we need to
+        pad instrumentals due to countdown padding.
+
+        Args:
+            track_output_dir: The track output directory to scan
+            artist_title: The "{artist} - {title}" string for matching files
+
+        Returns:
+            Dictionary with separated_audio structure containing found instrumental paths
+        """
+        self.logger.info(f"Scanning directory for existing instrumentals: {track_output_dir}")
+
+        separated_audio = {
+            "clean_instrumental": {},
+            "backing_vocals": {},
+            "other_stems": {},
+            "combined_instrumentals": {},
+        }
+
+        # Search patterns for instrumental files
+        # Files are named like: "{artist} - {title} (Instrumental {model}).flac"
+        # Or with backing vocals: "{artist} - {title} (Instrumental +BV {model}).flac"
+
+        # Look for files in the track output directory
+        search_dir = track_output_dir
+
+        # Find all instrumental files (not padded ones - we want the originals)
+        instrumental_pattern = os.path.join(search_dir, f"{artist_title} (Instrumental*.flac")
+        instrumental_files = glob.glob(instrumental_pattern)
+
+        # Also check for wav files
+        instrumental_pattern_wav = os.path.join(search_dir, f"{artist_title} (Instrumental*.wav")
+        instrumental_files.extend(glob.glob(instrumental_pattern_wav))
+
+        self.logger.debug(f"Found {len(instrumental_files)} instrumental files")
+
+        for filepath in instrumental_files:
+            filename = os.path.basename(filepath)
+
+            # Skip already padded files
+            if "(Padded)" in filename:
+                self.logger.debug(f"Skipping already padded file: {filename}")
+                continue
+
+            # Determine if it's a combined instrumental (+BV) or clean instrumental
+            if "+BV" in filename or "+bv" in filename.lower():
+                # Combined instrumental with backing vocals
+                # Extract model name from filename
+                # Pattern: "(Instrumental +BV {model}).flac"
+                model_match = re.search(r'\(Instrumental \+BV ([^)]+)\)', filename)
+                if model_match:
+                    model_name = model_match.group(1).strip()
+                    separated_audio["combined_instrumentals"][model_name] = filepath
+                    self.logger.info(f"Found combined instrumental: {filename}")
+            else:
+                # Clean instrumental (no backing vocals)
+                # Pattern: "(Instrumental {model}).flac"
+                model_match = re.search(r'\(Instrumental ([^)]+)\)', filename)
+                if model_match:
+                    # Use as clean instrumental if we don't have one yet
+                    if not separated_audio["clean_instrumental"].get("instrumental"):
+                        separated_audio["clean_instrumental"]["instrumental"] = filepath
+                        self.logger.info(f"Found clean instrumental: {filename}")
+                    else:
+                        # Additional clean instrumentals go to combined_instrumentals for padding
+                        model_name = model_match.group(1).strip()
+                        separated_audio["combined_instrumentals"][model_name] = filepath
+                        self.logger.info(f"Found additional instrumental: {filename}")
+
+        # Also look for backing vocals files
+        backing_vocals_pattern = os.path.join(search_dir, f"{artist_title} (Backing Vocals*.flac")
+        backing_vocals_files = glob.glob(backing_vocals_pattern)
+        backing_vocals_pattern_wav = os.path.join(search_dir, f"{artist_title} (Backing Vocals*.wav")
+        backing_vocals_files.extend(glob.glob(backing_vocals_pattern_wav))
+
+        for filepath in backing_vocals_files:
+            filename = os.path.basename(filepath)
+            model_match = re.search(r'\(Backing Vocals ([^)]+)\)', filename)
+            if model_match:
+                model_name = model_match.group(1).strip()
+                if model_name not in separated_audio["backing_vocals"]:
+                    separated_audio["backing_vocals"][model_name] = {"backing_vocals": filepath}
+                    self.logger.info(f"Found backing vocals: {filename}")
+
+        # Log summary
+        clean_count = 1 if separated_audio["clean_instrumental"].get("instrumental") else 0
+        combined_count = len(separated_audio["combined_instrumentals"])
+        self.logger.info(f"Directory scan complete: {clean_count} clean instrumental, {combined_count} combined instrumentals")
+
+        return separated_audio
+
     async def prep_single_track(self):
         # Add signal handler at the start
         loop = asyncio.get_running_loop()
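The scan above leans entirely on the stem-file naming scheme; a quick illustration of how the three regexes classify file names (the names here are invented, the patterns are the ones used in `_scan_directory_for_instrumentals`):

```python
import re

# Invented file names following the documented "{artist} - {title} (...)" scheme.
filenames = [
    "Artist - Title (Instrumental model_bs_roformer).flac",
    "Artist - Title (Instrumental +BV htdemucs).flac",
    "Artist - Title (Backing Vocals htdemucs).flac",
]

for name in filenames:
    # Check the +BV pattern first, since the plain pattern would also match it.
    if m := re.search(r'\(Instrumental \+BV ([^)]+)\)', name):
        print("combined instrumental, model:", m.group(1).strip())
    elif m := re.search(r'\(Instrumental ([^)]+)\)', name):
        print("clean instrumental, model:", m.group(1).strip())
    elif m := re.search(r'\(Backing Vocals ([^)]+)\)', name):
        print("backing vocals, model:", m.group(1).strip())
```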
@@ -419,6 +517,9 @@ class KaraokePrep:
            # No still image for audio-only downloads
            processed_track["input_still_image"] = None
 
+        except UserCancelledError:
+            # User cancelled - propagate up to CLI for graceful exit
+            raise
         except NoResultsError as e:
             self.logger.error(f"No audio found: {e}")
             return None
@@ -761,6 +862,18 @@ class KaraokePrep:
             f"Applying {padding_seconds}s padding to all instrumental files to sync with vocal countdown"
         )
 
+        # If separated_audio is empty (e.g., transcription was skipped but existing files have countdown),
+        # scan the directory for existing instrumental files
+        has_instrumentals = (
+            processed_track["separated_audio"].get("clean_instrumental", {}).get("instrumental") or
+            processed_track["separated_audio"].get("combined_instrumentals")
+        )
+        if not has_instrumentals:
+            self.logger.info("No instrumentals in separated_audio, scanning directory for existing files...")
+            processed_track["separated_audio"] = self._scan_directory_for_instrumentals(
+                track_output_dir, artist_title
+            )
+
         # Apply padding using AudioProcessor
         padded_separation_result = self.audio_processor.apply_countdown_padding_to_instrumentals(
             separation_result=processed_track["separated_audio"],