karaoke-gen 0.71.23__py3-none-any.whl → 0.71.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,409 @@
1
+ """
2
+ Waveform visualization generator for audio files.
3
+
4
+ This module provides the WaveformGenerator class which creates waveform
5
+ images suitable for display in the instrumental review UI.
6
+ """
7
+
8
+ import logging
9
+ import math
10
+ from pathlib import Path
11
+ from typing import List, Optional, Tuple
12
+
13
+ import matplotlib
14
+ matplotlib.use('Agg') # Use non-interactive backend
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+ from pydub import AudioSegment
18
+
19
+ from .models import AudibleSegment, MuteRegion
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class WaveformGenerator:
    """
    Generates waveform visualization images from audio files.

    This class creates PNG images showing the amplitude envelope of an
    audio file over time. It can highlight detected audible segments
    and mute regions with different colors.

    The generated images are suitable for display in web UIs and can
    be used for interactive seeking (click-to-seek) functionality.

    Attributes:
        width: Width of the output image in pixels (default: 1200)
        height: Height of the output image in pixels (default: 200)
        background_color: Background color (default: "#1a1a2e")
        waveform_color: Main waveform color (default: "#4a90d9")
        segment_color: Color for audible segments (default: "#e94560")
        mute_color: Color for mute regions (default: "#ff6b6b")
        time_axis_color: Color for time axis (default: "#ffffff")

    Example:
        >>> generator = WaveformGenerator(width=1200, height=200)
        >>> generator.generate(
        ...     audio_path="/path/to/backing_vocals.flac",
        ...     output_path="/path/to/waveform.png",
        ...     segments=analysis_result.audible_segments
        ... )
    """

    def __init__(
        self,
        width: int = 1200,
        height: int = 200,
        background_color: str = "#1a1a2e",
        waveform_color: str = "#4a90d9",
        segment_color: str = "#e94560",
        mute_color: str = "#ff6b6b",
        time_axis_color: str = "#ffffff",
        dpi: int = 100,
    ):
        """
        Initialize the waveform generator.

        Args:
            width: Width of the output image in pixels
            height: Height of the output image in pixels
            background_color: Background color (hex or named color)
            waveform_color: Main waveform color
            segment_color: Color for highlighting audible segments
            mute_color: Color for highlighting mute regions
            time_axis_color: Color for time axis labels
            dpi: DPI for the output image
        """
        self.width = width
        self.height = height
        self.background_color = background_color
        self.waveform_color = waveform_color
        self.segment_color = segment_color
        self.mute_color = mute_color
        self.time_axis_color = time_axis_color
        self.dpi = dpi

    def generate(
        self,
        audio_path: str,
        output_path: str,
        segments: Optional[List[AudibleSegment]] = None,
        mute_regions: Optional[List[MuteRegion]] = None,
        show_time_axis: bool = True,
        silence_threshold_db: float = -40.0,
        audible_segments: Optional[List[AudibleSegment]] = None,
    ) -> str:
        """
        Generate a waveform image from an audio file.

        Args:
            audio_path: Path to the audio file
            output_path: Path where the PNG image will be saved
            segments: Optional list of audible segments to highlight
            mute_regions: Optional list of mute regions to highlight
            show_time_axis: Whether to show time axis labels
            silence_threshold_db: Threshold for visual reference line
            audible_segments: Backward-compatible alias for ``segments``.
                Only used when ``segments`` is not provided. Existing
                callers pass this keyword, which previously raised
                TypeError because the parameter did not exist.

        Returns:
            Path to the generated image file

        Raises:
            FileNotFoundError: If the audio file doesn't exist
        """
        # Accept the alias keyword used by callers (see run_instrumental_review).
        if segments is None and audible_segments is not None:
            segments = audible_segments

        path = Path(audio_path)
        if not path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        logger.info(f"Generating waveform for: {audio_path}")

        # Load audio; duration comes from pydub's millisecond length.
        audio = AudioSegment.from_file(audio_path)
        duration_seconds = len(audio) / 1000.0

        # Down-mix to mono so the envelope is a single channel.
        if audio.channels > 1:
            audio = audio.set_channels(1)

        # Per-window amplitude envelope in dBFS.
        envelope = self._get_envelope(audio)

        fig, ax = self._create_figure(duration_seconds, show_time_axis)

        self._draw_waveform(ax, envelope, duration_seconds)

        # Mute regions render with zorder=0, so the waveform and segment
        # highlights stay visually on top of them.
        if mute_regions:
            self._draw_mute_regions(ax, mute_regions, duration_seconds)

        if segments:
            self._draw_segments(ax, segments, envelope, duration_seconds)

        # Dashed reference line at the silence threshold.
        self._draw_threshold_line(ax, silence_threshold_db, duration_seconds)

        # Ensure the destination directory exists before saving.
        output_dir = Path(output_path).parent
        output_dir.mkdir(parents=True, exist_ok=True)

        fig.savefig(
            output_path,
            facecolor=self.background_color,
            edgecolor='none',
            bbox_inches='tight',
            pad_inches=0.1,
        )
        # Close explicitly: the Agg backend keeps figures alive otherwise.
        plt.close(fig)

        logger.info(f"Waveform saved to: {output_path}")
        return output_path

    def generate_data_only(
        self,
        audio_path: str,
        num_points: int = 500,
    ) -> Tuple[List[float], float]:
        """
        Generate waveform data without creating an image.

        This is useful for sending data to a frontend that will
        render the waveform itself (e.g., using Canvas or SVG).

        Args:
            audio_path: Path to the audio file
            num_points: Approximate number of data points to return
                (the actual count can be slightly higher because the
                window size is rounded down to a whole millisecond)

        Returns:
            Tuple of (amplitude_values, duration_seconds).
            Amplitude values are normalized to the 0.0-1.0 range,
            mapping -60 dBFS -> 0.0 and 0 dBFS -> 1.0.

        Raises:
            FileNotFoundError: If the audio file doesn't exist
        """
        path = Path(audio_path)
        if not path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        audio = AudioSegment.from_file(audio_path)
        duration_seconds = len(audio) / 1000.0

        if audio.channels > 1:
            audio = audio.set_channels(1)

        # Window size chosen so roughly num_points windows cover the file.
        window_ms = max(1, len(audio) // num_points)

        # Reuse the shared envelope extraction, then normalize dB -> 0..1.
        envelope = self._get_envelope(audio, window_ms=window_ms)
        amplitudes = np.clip((envelope + 60.0) / 60.0, 0.0, 1.0).tolist()

        return amplitudes, duration_seconds

    def _get_envelope(
        self,
        audio: AudioSegment,
        window_ms: int = 50,
    ) -> np.ndarray:
        """
        Extract the amplitude envelope from audio.

        The audio is scanned in windows of ``window_ms`` milliseconds and
        each window's RMS level is converted to dBFS. Digitally-silent
        windows (RMS == 0) get a -100.0 dB floor so log10 is never
        called on zero.

        Args:
            audio: Audio segment to analyze (callers pass mono audio)
            window_ms: Analysis window length in milliseconds

        Returns:
            Array of per-window amplitude values in dBFS.
        """
        duration_ms = len(audio)
        amplitudes = []

        for start_ms in range(0, duration_ms, window_ms):
            end_ms = min(start_ms + window_ms, duration_ms)
            window = audio[start_ms:end_ms]

            if window.rms > 0:
                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
            else:
                db = -100.0  # floor for digitally-silent windows

            amplitudes.append(db)

        return np.array(amplitudes)

    def _create_figure(
        self,
        duration_seconds: float,
        show_time_axis: bool,
    ) -> Tuple[plt.Figure, plt.Axes]:
        """
        Create the matplotlib figure and axes for the waveform.

        The x axis spans the audio duration in seconds; the y axis is a
        fixed -60..0 dB range matching the envelope values.
        """
        # Convert the requested pixel size to matplotlib's inch-based API.
        fig_width = self.width / self.dpi
        fig_height = self.height / self.dpi

        fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=self.dpi)

        # Set background
        fig.patch.set_facecolor(self.background_color)
        ax.set_facecolor(self.background_color)

        # Configure axes
        ax.set_xlim(0, duration_seconds)
        ax.set_ylim(-60, 0)  # dB range

        # Remove spines for a clean, borderless look
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Configure ticks
        if show_time_axis:
            ax.tick_params(
                axis='x',
                colors=self.time_axis_color,
                labelsize=8,
            )
            # The dB axis is internal detail; hide it from the UI.
            ax.tick_params(axis='y', left=False, labelleft=False)

            self._set_time_ticks(ax, duration_seconds)
        else:
            ax.tick_params(
                axis='both',
                left=False,
                bottom=False,
                labelleft=False,
                labelbottom=False,
            )

        return fig, ax

    def _set_time_ticks(self, ax: plt.Axes, duration_seconds: float):
        """
        Set time-axis tick marks at an interval suited to the duration,
        labelled as M:SS.
        """
        if duration_seconds <= 60:
            # Under 1 minute: tick every 10 seconds
            tick_interval = 10
        elif duration_seconds <= 300:
            # Under 5 minutes: tick every 30 seconds
            tick_interval = 30
        else:
            # Over 5 minutes: tick every minute
            tick_interval = 60

        ticks = np.arange(0, duration_seconds + 1, tick_interval)
        ax.set_xticks(ticks)

        # Format tick labels as M:SS
        labels = []
        for t in ticks:
            minutes = int(t // 60)
            seconds = int(t % 60)
            labels.append(f"{minutes}:{seconds:02d}")
        ax.set_xticklabels(labels)

    def _draw_waveform(
        self,
        ax: plt.Axes,
        envelope: np.ndarray,
        duration_seconds: float,
    ):
        """
        Draw the main waveform as a filled area plus a thin outline.
        """
        num_points = len(envelope)
        time_points = np.linspace(0, duration_seconds, num_points)

        # Filled area from the envelope down to the bottom of the dB range.
        ax.fill_between(
            time_points,
            envelope,
            -60,  # Bottom of range
            color=self.waveform_color,
            alpha=0.7,
        )

        # Outline on top of the fill for definition.
        ax.plot(
            time_points,
            envelope,
            color=self.waveform_color,
            linewidth=0.5,
            alpha=0.9,
        )

    def _draw_segments(
        self,
        ax: plt.Axes,
        segments: List[AudibleSegment],
        envelope: np.ndarray,
        duration_seconds: float,
    ):
        """
        Highlight audible segments by re-filling their portion of the
        envelope in the segment color.
        """
        num_points = len(envelope)
        time_points = np.linspace(0, duration_seconds, num_points)

        for segment in segments:
            # Map segment times onto envelope array indices.
            start_idx = int(segment.start_seconds / duration_seconds * num_points)
            end_idx = int(segment.end_seconds / duration_seconds * num_points)

            # Clamp to valid bounds; end may equal num_points (exclusive slice).
            start_idx = max(0, min(start_idx, num_points - 1))
            end_idx = max(0, min(end_idx, num_points))

            # Skip degenerate (empty) segments.
            if start_idx >= end_idx:
                continue

            segment_time = time_points[start_idx:end_idx]
            segment_envelope = envelope[start_idx:end_idx]

            # Highlight this segment with a different color
            ax.fill_between(
                segment_time,
                segment_envelope,
                -60,
                color=self.segment_color,
                alpha=0.6,
            )

    def _draw_mute_regions(
        self,
        ax: plt.Axes,
        mute_regions: List[MuteRegion],
        duration_seconds: float,
    ):
        """
        Draw mute regions as translucent full-height vertical bands.
        """
        for region in mute_regions:
            ax.axvspan(
                region.start_seconds,
                region.end_seconds,
                color=self.mute_color,
                alpha=0.3,
                zorder=0,  # behind the waveform fill/outline
            )

    def _draw_threshold_line(
        self,
        ax: plt.Axes,
        threshold_db: float,
        duration_seconds: float,
    ):
        """
        Draw a faint dashed horizontal reference line at the silence
        threshold level.
        """
        ax.axhline(
            y=threshold_db,
            color=self.time_axis_color,
            linestyle='--',
            linewidth=0.5,
            alpha=0.3,
        )
@@ -208,6 +208,11 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
208
208
  default="flac",
209
209
  help="Optional: format / file extension for instrumental track to use for remux (default: %(default)s). Example: --instrumental_format=mp3",
210
210
  )
211
+ audio_group.add_argument(
212
+ "--skip_instrumental_review",
213
+ action="store_true",
214
+ help="Optional: Skip the interactive instrumental review UI and use the old numeric selection. Example: --skip_instrumental_review",
215
+ )
211
216
 
212
217
  # Lyrics Configuration
213
218
  lyrics_group = parser.add_argument_group("Lyrics Configuration")
@@ -14,12 +14,189 @@ import sys
14
14
  import json
15
15
  import asyncio
16
16
  import time
17
+ import glob
17
18
  import pyperclip
18
19
  from karaoke_gen import KaraokePrep
19
20
  from karaoke_gen.karaoke_finalise import KaraokeFinalise
21
+ from karaoke_gen.instrumental_review import (
22
+ AudioAnalyzer,
23
+ WaveformGenerator,
24
+ InstrumentalReviewServer,
25
+ )
20
26
  from .cli_args import create_parser, process_style_overrides, is_url, is_file
21
27
 
22
28
 
29
+ def _resolve_path_for_cwd(path: str, track_dir: str) -> str:
30
+ """
31
+ Resolve a path that may have been created relative to the original working directory.
32
+
33
+ After os.chdir(track_dir), paths like './TrackDir/stems/file.flac' become invalid.
34
+ This function converts such paths to work from the new current directory.
35
+
36
+ Args:
37
+ path: The path to resolve (may be relative or absolute)
38
+ track_dir: The track directory we've chdir'd into
39
+
40
+ Returns:
41
+ A path that's valid from the current working directory
42
+ """
43
+ if os.path.isabs(path):
44
+ return path
45
+
46
+ # Normalize both paths for comparison
47
+ norm_path = os.path.normpath(path)
48
+ norm_track_dir = os.path.normpath(track_dir)
49
+
50
+ # If path starts with track_dir, strip it to get the relative path from within track_dir
51
+ # e.g., './Four Lanes Male Choir - The White Rose/stems/file.flac' -> 'stems/file.flac'
52
+ if norm_path.startswith(norm_track_dir + os.sep):
53
+ return norm_path[len(norm_track_dir) + 1:]
54
+ elif norm_path.startswith(norm_track_dir):
55
+ return norm_path[len(norm_track_dir):].lstrip(os.sep) or '.'
56
+
57
+ # If path doesn't start with track_dir, it might already be relative to track_dir
58
+ # or it's a path that doesn't need transformation
59
+ return path
60
+
61
+
62
+ def run_instrumental_review(track: dict, logger: logging.Logger) -> str | None:
63
+ """
64
+ Run the instrumental review UI to let user select the best instrumental track.
65
+
66
+ This analyzes the backing vocals, generates a waveform, and opens a browser
67
+ with an interactive UI for reviewing and selecting the instrumental.
68
+
69
+ Args:
70
+ track: The track dictionary from KaraokePrep containing separated audio info
71
+ logger: Logger instance
72
+
73
+ Returns:
74
+ Path to the selected instrumental file, or None to use the old numeric selection
75
+ """
76
+ track_dir = track.get("track_output_dir", ".")
77
+ artist = track.get("artist", "")
78
+ title = track.get("title", "")
79
+ base_name = f"{artist} - {title}"
80
+
81
+ # Get separation results
82
+ separated = track.get("separated_audio", {})
83
+ if not separated:
84
+ logger.info("No separated audio found, skipping instrumental review UI")
85
+ return None
86
+
87
+ # Find the backing vocals file
88
+ # Note: Paths in separated_audio may be relative to the original working directory,
89
+ # but we've already chdir'd into track_dir. Use _resolve_path_for_cwd to fix paths.
90
+ backing_vocals_path = None
91
+ backing_vocals_result = separated.get("backing_vocals", {})
92
+ for model, paths in backing_vocals_result.items():
93
+ if paths.get("backing_vocals"):
94
+ backing_vocals_path = _resolve_path_for_cwd(paths["backing_vocals"], track_dir)
95
+ break
96
+
97
+ if not backing_vocals_path or not os.path.exists(backing_vocals_path):
98
+ logger.info("No backing vocals file found, skipping instrumental review UI")
99
+ return None
100
+
101
+ # Find the clean instrumental file
102
+ clean_result = separated.get("clean_instrumental", {})
103
+ raw_clean_path = clean_result.get("instrumental")
104
+ clean_instrumental_path = _resolve_path_for_cwd(raw_clean_path, track_dir) if raw_clean_path else None
105
+
106
+ if not clean_instrumental_path or not os.path.exists(clean_instrumental_path):
107
+ logger.info("No clean instrumental file found, skipping instrumental review UI")
108
+ return None
109
+
110
+ # Find the combined instrumental (with backing vocals) file - these have "(Padded)" suffix if padded
111
+ combined_result = separated.get("combined_instrumentals", {})
112
+ with_backing_path = None
113
+ for model, path in combined_result.items():
114
+ resolved_path = _resolve_path_for_cwd(path, track_dir) if path else None
115
+ if resolved_path and os.path.exists(resolved_path):
116
+ with_backing_path = resolved_path
117
+ break
118
+
119
+ try:
120
+ logger.info("=== Starting Instrumental Review ===")
121
+ logger.info(f"Analyzing backing vocals: {backing_vocals_path}")
122
+
123
+ # Analyze backing vocals
124
+ analyzer = AudioAnalyzer()
125
+ analysis = analyzer.analyze(backing_vocals_path)
126
+
127
+ logger.info(f"Analysis complete:")
128
+ logger.info(f" Has audible content: {analysis.has_audible_content}")
129
+ logger.info(f" Total duration: {analysis.total_duration_seconds:.1f}s")
130
+ logger.info(f" Audible segments: {len(analysis.audible_segments)}")
131
+ logger.info(f" Recommendation: {analysis.recommended_selection.value}")
132
+
133
+ # Generate waveform
134
+ # Note: We're already in track_dir after chdir, so use current directory
135
+ logger.info("Generating waveform visualization...")
136
+ waveform_generator = WaveformGenerator()
137
+ waveform_path = f"{base_name} (Backing Vocals Waveform).png"
138
+ waveform_generator.generate(
139
+ audio_path=backing_vocals_path,
140
+ output_path=waveform_path,
141
+ audible_segments=analysis.audible_segments,
142
+ )
143
+
144
+ # Start the review server
145
+ # Note: We're already in track_dir after chdir, so output_dir is "."
146
+ logger.info("Starting instrumental review UI...")
147
+ server = InstrumentalReviewServer(
148
+ output_dir=".",
149
+ base_name=base_name,
150
+ analysis=analysis,
151
+ waveform_path=waveform_path,
152
+ backing_vocals_path=backing_vocals_path,
153
+ clean_instrumental_path=clean_instrumental_path,
154
+ with_backing_path=with_backing_path,
155
+ )
156
+
157
+ # Start server and open browser, wait for selection
158
+ server.start_and_open_browser()
159
+
160
+ logger.info("Waiting for instrumental selection in browser...")
161
+ logger.info("(Close the browser tab or press Ctrl+C to cancel)")
162
+
163
+ try:
164
+ # Wait for user selection (blocking)
165
+ server._selection_event.wait()
166
+ selection = server.get_selection()
167
+
168
+ logger.info(f"User selected: {selection}")
169
+
170
+ # Stop the server
171
+ server.stop()
172
+
173
+ # Return the selected instrumental path
174
+ if selection == "clean":
175
+ return clean_instrumental_path
176
+ elif selection == "with_backing":
177
+ return with_backing_path
178
+ elif selection == "custom":
179
+ custom_path = server.get_custom_instrumental_path()
180
+ if custom_path and os.path.exists(custom_path):
181
+ return custom_path
182
+ else:
183
+ logger.warning("Custom instrumental not found, falling back to clean")
184
+ return clean_instrumental_path
185
+ else:
186
+ logger.warning(f"Unknown selection: {selection}, falling back to numeric selection")
187
+ return None
188
+
189
+ except KeyboardInterrupt:
190
+ logger.info("Instrumental review cancelled by user")
191
+ server.stop()
192
+ return None
193
+
194
+ except Exception as e:
195
+ logger.error(f"Error during instrumental review: {e}")
196
+ logger.info("Falling back to numeric selection")
197
+ return None
198
+
199
+
23
200
  async def async_main():
24
201
  logger = logging.getLogger(__name__)
25
202
  log_handler = logging.StreamHandler()
@@ -461,6 +638,14 @@ async def async_main():
461
638
  logger.info(f"Changing to directory: {track_dir}")
462
639
  os.chdir(track_dir)
463
640
 
641
+ # Run instrumental review UI if not skipped
642
+ selected_instrumental_file = None
643
+ if not getattr(args, 'skip_instrumental_review', False):
644
+ selected_instrumental_file = run_instrumental_review(
645
+ track=track,
646
+ logger=logger,
647
+ )
648
+
464
649
  # Load CDG styles if CDG generation is enabled
465
650
  cdg_styles = None
466
651
  if args.enable_cdg:
@@ -504,6 +689,7 @@ async def async_main():
504
689
  cdg_styles=cdg_styles,
505
690
  keep_brand_code=getattr(args, 'keep_brand_code', False),
506
691
  non_interactive=args.yes,
692
+ selected_instrumental_file=selected_instrumental_file,
507
693
  )
508
694
 
509
695
  try: