karaoke-gen 0.76.20__py3-none-any.whl → 0.82.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- karaoke_gen/instrumental_review/static/index.html +179 -16
- karaoke_gen/karaoke_gen.py +5 -4
- karaoke_gen/lyrics_processor.py +25 -6
- {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/METADATA +79 -3
- {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/RECORD +33 -31
- lyrics_transcriber/core/config.py +8 -0
- lyrics_transcriber/core/controller.py +43 -1
- lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +178 -5
- lyrics_transcriber/correction/agentic/prompts/__init__.py +23 -0
- lyrics_transcriber/correction/agentic/prompts/classifier.py +66 -6
- lyrics_transcriber/correction/agentic/prompts/langfuse_prompts.py +298 -0
- lyrics_transcriber/correction/agentic/providers/config.py +7 -0
- lyrics_transcriber/correction/agentic/providers/constants.py +1 -1
- lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +22 -7
- lyrics_transcriber/correction/agentic/providers/model_factory.py +28 -13
- lyrics_transcriber/correction/agentic/router.py +18 -13
- lyrics_transcriber/correction/corrector.py +1 -45
- lyrics_transcriber/frontend/.gitignore +1 -0
- lyrics_transcriber/frontend/e2e/agentic-corrections.spec.ts +207 -0
- lyrics_transcriber/frontend/e2e/fixtures/agentic-correction-data.json +226 -0
- lyrics_transcriber/frontend/package.json +4 -1
- lyrics_transcriber/frontend/playwright.config.ts +1 -1
- lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +34 -30
- lyrics_transcriber/frontend/src/components/Header.tsx +141 -34
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +120 -3
- lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +11 -1
- lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +122 -35
- lyrics_transcriber/frontend/src/components/shared/types.ts +6 -0
- lyrics_transcriber/output/generator.py +50 -3
- lyrics_transcriber/transcribers/local_whisper.py +260 -0
- lyrics_transcriber/correction/handlers/llm.py +0 -293
- lyrics_transcriber/correction/handlers/llm_providers.py +0 -60
- {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.76.20.dist-info → karaoke_gen-0.82.0.dist-info}/licenses/LICENSE +0 -0
--- a/lyrics_transcriber/output/generator.py
+++ b/lyrics_transcriber/output/generator.py
@@ -52,7 +52,7 @@ class OutputGenerator:
 
         self.logger.info(f"Initializing OutputGenerator with config: {self.config}")
 
-        # Load output styles from JSON if provided
+        # Load output styles from JSON if provided, otherwise use defaults
         if self.config.output_styles_json and os.path.exists(self.config.output_styles_json):
            try:
                 with open(self.config.output_styles_json, "r") as f:
@@ -67,9 +67,10 @@ class OutputGenerator:
                 self.logger.warning(f"Failed to load output styles file: {str(e)}")
                 self.config.styles = {}
         else:
-            # No styles file provided or doesn't exist
+            # No styles file provided or doesn't exist - use defaults
             if self.config.render_video or self.config.generate_cdg:
-
+                self.logger.info("No output styles file provided, using default karaoke styles")
+                self.config.styles = self._get_default_styles()
             else:
                 self.config.styles = {}
 
@@ -242,6 +243,52 @@ class OutputGenerator:
 
         return resolution_dims, font_size, line_height
 
+    def _get_default_styles(self) -> dict:
+        """Get default styles for video/CDG generation when no styles file is provided."""
+        return {
+            "karaoke": {
+                # Video background
+                "background_color": "#000000",
+                "background_image": None,
+                # Font settings
+                "font": "Arial",
+                "font_path": "",  # Must be string, not None (for ASS generator)
+                "ass_name": "Default",
+                # Colors in "R, G, B, A" format (required by ASS)
+                "primary_color": "112, 112, 247, 255",
+                "secondary_color": "255, 255, 255, 255",
+                "outline_color": "26, 58, 235, 255",
+                "back_color": "0, 0, 0, 0",
+                # Boolean style options
+                "bold": False,
+                "italic": False,
+                "underline": False,
+                "strike_out": False,
+                # Numeric style options (all required for ASS)
+                "scale_x": 100,
+                "scale_y": 100,
+                "spacing": 0,
+                "angle": 0.0,
+                "border_style": 1,
+                "outline": 1,
+                "shadow": 0,
+                "margin_l": 0,
+                "margin_r": 0,
+                "margin_v": 0,
+                "encoding": 0,
+                # Layout settings
+                "max_line_length": 40,
+                "top_padding": 200,
+                "font_size": 100,
+            },
+            "cdg": {
+                "font_path": None,
+                "instrumental_background": None,
+                "title_screen_background": None,
+                "outro_background": None,
+            },
+        }
+
     def write_corrections_data(self, correction_result: CorrectionResult, output_prefix: str) -> str:
         """Write corrections data to JSON file."""
         self.logger.info("Writing corrections data JSON")
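The net effect of the three generator.py hunks above: when no output styles JSON is supplied, `OutputGenerator` now falls back to `_get_default_styles()` whenever video or CDG rendering is requested, instead of leaving `config.styles` empty. A minimal standalone sketch of that decision path; the helper function and its parameter names are illustrative (the real logic lives in `OutputGenerator.__init__` as shown above):

```python
import json
import os

# Illustrative sketch of the fallback behavior added in 0.82.0;
# resolve_styles is a hypothetical helper, not part of the package.
def resolve_styles(styles_json_path, render_video, generate_cdg, default_styles):
    """Prefer the user's styles JSON; fall back to defaults only when rendering."""
    if styles_json_path and os.path.exists(styles_json_path):
        with open(styles_json_path, "r") as f:
            return json.load(f)
    if render_video or generate_cdg:
        # New behavior: built-in karaoke styles instead of an empty dict
        return default_styles
    return {}

# No styles file, video rendering requested -> defaults are used
styles = resolve_styles(None, render_video=True, generate_cdg=False,
                        default_styles={"karaoke": {"font": "Arial"}})
print(styles["karaoke"]["font"])  # Arial
```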
--- /dev/null
+++ b/lyrics_transcriber/transcribers/local_whisper.py
@@ -0,0 +1,260 @@
+"""Local Whisper transcription service using whisper-timestamped for word-level timestamps."""
+
+from dataclasses import dataclass
+import os
+import logging
+from typing import Optional, Dict, Any, Union
+from pathlib import Path
+
+from lyrics_transcriber.types import TranscriptionData, LyricsSegment, Word
+from lyrics_transcriber.transcribers.base_transcriber import BaseTranscriber, TranscriptionError
+from lyrics_transcriber.utils.word_utils import WordUtils
+
+
+@dataclass
+class LocalWhisperConfig:
+    """Configuration for local Whisper transcription service."""
+
+    model_size: str = "medium"  # tiny, base, small, medium, large, large-v2, large-v3
+    device: Optional[str] = None  # None for auto-detect, or "cpu", "cuda", "mps"
+    cache_dir: Optional[str] = None  # Directory for model downloads (~/.cache/whisper by default)
+    language: Optional[str] = None  # Language code for transcription, None for auto-detect
+    compute_type: str = "auto"  # float16, float32, int8, auto
+
+
+class LocalWhisperTranscriber(BaseTranscriber):
+    """
+    Transcription service using local Whisper inference via whisper-timestamped.
+
+    This transcriber runs Whisper models locally on your machine, supporting
+    CPU, CUDA GPU, and Apple Silicon MPS acceleration. It uses the
+    whisper-timestamped library to get accurate word-level timestamps.
+
+    Requirements:
+        pip install karaoke-gen[local-whisper]
+
+    Configuration:
+        Set environment variables to customize behavior:
+        - WHISPER_MODEL_SIZE: Model size (tiny, base, small, medium, large)
+        - WHISPER_DEVICE: Device to use (cpu, cuda, mps, or auto)
+        - WHISPER_CACHE_DIR: Directory for model downloads
+        - WHISPER_LANGUAGE: Language code (en, es, fr, etc.) or auto-detect
+    """
+
+    def __init__(
+        self,
+        cache_dir: Union[str, Path],
+        config: Optional[LocalWhisperConfig] = None,
+        logger: Optional[logging.Logger] = None,
+    ):
+        """
+        Initialize local Whisper transcriber.
+
+        Args:
+            cache_dir: Directory for caching transcription results
+            config: Configuration options for the transcriber
+            logger: Logger instance to use
+        """
+        super().__init__(cache_dir=cache_dir, logger=logger)
+
+        # Initialize configuration from env vars or defaults
+        self.config = config or LocalWhisperConfig(
+            model_size=os.getenv("WHISPER_MODEL_SIZE", "medium"),
+            device=os.getenv("WHISPER_DEVICE"),  # None for auto-detect
+            cache_dir=os.getenv("WHISPER_CACHE_DIR"),
+            language=os.getenv("WHISPER_LANGUAGE"),  # None for auto-detect
+        )
+
+        # Lazy-loaded model instance (loaded on first use)
+        self._model = None
+        self._whisper_module = None
+
+        self.logger.debug(
+            f"LocalWhisperTranscriber initialized with model_size={self.config.model_size}, "
+            f"device={self.config.device or 'auto'}, language={self.config.language or 'auto-detect'}"
+        )
+
+    def get_name(self) -> str:
+        """Return the name of this transcription service."""
+        return "LocalWhisper"
+
+    def _check_dependencies(self) -> None:
+        """Check that required dependencies are installed."""
+        try:
+            import whisper_timestamped  # noqa: F401
+        except ImportError:
+            raise TranscriptionError(
+                "whisper-timestamped is not installed. "
+                "Install it with: pip install karaoke-gen[local-whisper] "
+                "or: pip install whisper-timestamped"
+            )
+
+    def _get_device(self) -> str:
+        """Determine the best device to use for inference."""
+        if self.config.device:
+            return self.config.device
+
+        # Auto-detect best available device
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                self.logger.info("Using CUDA GPU for Whisper inference")
+                return "cuda"
+            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+                self.logger.info("Using Apple Silicon MPS for Whisper inference")
+                return "cpu"  # whisper-timestamped works better with CPU on MPS
+            else:
+                self.logger.info("Using CPU for Whisper inference (no GPU detected)")
+                return "cpu"
+        except ImportError:
+            self.logger.warning("PyTorch not available, defaulting to CPU")
+            return "cpu"
+
+    def _load_model(self):
+        """Load the Whisper model (lazy loading on first use)."""
+        if self._model is not None:
+            return self._model
+
+        self._check_dependencies()
+        import whisper_timestamped as whisper
+
+        self._whisper_module = whisper
+
+        device = self._get_device()
+        self.logger.info(f"Loading Whisper model '{self.config.model_size}' on device '{device}'...")
+
+        try:
+            # Load model with optional custom cache directory
+            download_root = self.config.cache_dir
+            self._model = whisper.load_model(
+                self.config.model_size,
+                device=device,
+                download_root=download_root,
+            )
+            self.logger.info(f"Whisper model '{self.config.model_size}' loaded successfully")
+            return self._model
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower() or "CUDA" in str(e):
+                raise TranscriptionError(
+                    f"GPU out of memory loading model '{self.config.model_size}'. "
+                    "Try using a smaller model (set WHISPER_MODEL_SIZE=small or tiny) "
+                    "or force CPU mode (set WHISPER_DEVICE=cpu)"
+                ) from e
+            raise TranscriptionError(f"Failed to load Whisper model: {e}") from e
+        except Exception as e:
+            raise TranscriptionError(f"Failed to load Whisper model: {e}") from e
+
+    def _perform_transcription(self, audio_filepath: str) -> Dict[str, Any]:
+        """
+        Perform local Whisper transcription with word-level timestamps.
+
+        Args:
+            audio_filepath: Path to the audio file to transcribe
+
+        Returns:
+            Raw transcription result dictionary
+        """
+        self.logger.info(f"Starting local Whisper transcription for {audio_filepath}")
+
+        # Load model if not already loaded
+        model = self._load_model()
+
+        try:
+            # Perform transcription with word-level timestamps
+            transcribe_kwargs = {
+                "verbose": False,
+            }
+
+            # Add language if specified
+            if self.config.language:
+                transcribe_kwargs["language"] = self.config.language
+
+            self.logger.debug(f"Transcribing with options: {transcribe_kwargs}")
+            result = self._whisper_module.transcribe_timestamped(
+                model,
+                audio_filepath,
+                **transcribe_kwargs,
+            )
+
+            self.logger.info("Local Whisper transcription completed successfully")
+            return result
+
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower():
+                raise TranscriptionError(
+                    f"GPU out of memory during transcription. "
+                    "Try using a smaller model (WHISPER_MODEL_SIZE=small) "
+                    "or force CPU mode (WHISPER_DEVICE=cpu)"
+                ) from e
+            raise TranscriptionError(f"Transcription failed: {e}") from e
+        except Exception as e:
+            raise TranscriptionError(f"Transcription failed: {e}") from e
+
+    def _convert_result_format(self, raw_data: Dict[str, Any]) -> TranscriptionData:
+        """
+        Convert whisper-timestamped output to standard TranscriptionData format.
+
+        The whisper-timestamped library returns results in this format:
+        {
+            "text": "Full transcription text",
+            "segments": [
+                {
+                    "id": 0,
+                    "text": "Segment text",
+                    "start": 0.0,
+                    "end": 2.5,
+                    "words": [
+                        {"text": "word", "start": 0.0, "end": 0.5, "confidence": 0.95},
+                        ...
+                    ]
+                },
+                ...
+            ],
+            "language": "en"
+        }
+
+        Args:
+            raw_data: Raw output from whisper_timestamped.transcribe_timestamped()
+
+        Returns:
+            TranscriptionData with segments, words, and metadata
+        """
+        segments = []
+        all_words = []
+
+        for seg in raw_data.get("segments", []):
+            segment_words = []
+
+            for word_data in seg.get("words", []):
+                word = Word(
+                    id=WordUtils.generate_id(),
+                    text=word_data.get("text", "").strip(),
+                    start_time=word_data.get("start", 0.0),
+                    end_time=word_data.get("end", 0.0),
+                    confidence=word_data.get("confidence"),
+                )
+                segment_words.append(word)
+                all_words.append(word)
+
+            # Create segment with its words
+            segment = LyricsSegment(
+                id=WordUtils.generate_id(),
+                text=seg.get("text", "").strip(),
+                words=segment_words,
+                start_time=seg.get("start", 0.0),
+                end_time=seg.get("end", 0.0),
+            )
+            segments.append(segment)
+
+        return TranscriptionData(
+            segments=segments,
+            words=all_words,
+            text=raw_data.get("text", "").strip(),
+            source=self.get_name(),
+            metadata={
+                "model_size": self.config.model_size,
+                "detected_language": raw_data.get("language", "unknown"),
+                "device": self._get_device(),
+            },
+        )
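Construction of the new `LocalWhisperTranscriber` follows the signature shown above: an explicit `LocalWhisperConfig` takes precedence, otherwise the `WHISPER_*` environment variables are read, and the model itself is only loaded lazily on first use. A hypothetical usage sketch (the public transcription entry point comes from `BaseTranscriber` and is not shown in this diff, so only construction and `get_name()` are demonstrated):

```python
import os

from lyrics_transcriber.transcribers.local_whisper import (
    LocalWhisperConfig,
    LocalWhisperTranscriber,
)

# Option 1: configure via environment variables (read when no config is passed)
os.environ["WHISPER_MODEL_SIZE"] = "small"  # tiny/base/small/medium/large
os.environ["WHISPER_DEVICE"] = "cpu"        # cpu/cuda/mps; unset = auto-detect
env_transcriber = LocalWhisperTranscriber(cache_dir="/tmp/lyrics_cache")

# Option 2: pass an explicit config, which takes precedence over env vars
config = LocalWhisperConfig(model_size="medium", language="en")
transcriber = LocalWhisperTranscriber(cache_dir="/tmp/lyrics_cache", config=config)

# Cheap to construct: the Whisper model is downloaded/loaded lazily,
# the first time a transcription actually needs it.
print(transcriber.get_name())  # LocalWhisper
```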
--- a/lyrics_transcriber/correction/handlers/llm.py
+++ /dev/null
@@ -1,293 +0,0 @@
-from typing import List, Optional, Tuple, Dict, Any, Union
-import logging
-import json
-from datetime import datetime
-from pathlib import Path
-
-from lyrics_transcriber.types import GapSequence, WordCorrection
-from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler
-from lyrics_transcriber.correction.handlers.word_operations import WordOperations
-from lyrics_transcriber.correction.handlers.llm_providers import LLMProvider
-
-
-class LLMHandler(GapCorrectionHandler):
-    """Uses an LLM to analyze and correct gaps by comparing with reference lyrics."""
-
-    def __init__(
-        self, provider: LLMProvider, name: str, logger: Optional[logging.Logger] = None, cache_dir: Optional[Union[str, Path]] = None
-    ):
-        super().__init__(logger)
-        self.logger = logger or logging.getLogger(__name__)
-        self.provider = provider
-        self.name = name
-        self.cache_dir = Path(cache_dir) if cache_dir else None
-
-    def _format_prompt(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> str:
-        """Format the prompt for the LLM with context about the gap and reference lyrics."""
-        word_map = data.get("word_map", {})
-        metadata = data.get("metadata", {}) if data else {}
-
-        if not word_map:
-            self.logger.error("No word_map provided in data")
-            return ""
-
-        # Format transcribed words with their IDs
-        transcribed_words = [{"id": word_id, "text": word_map[word_id].text} for word_id in gap.transcribed_word_ids if word_id in word_map]
-
-        prompt = (
-            "You are a lyrics correction expert. You will be given transcribed lyrics that may contain errors "
-            "and reference lyrics from multiple sources. Your task is to analyze each word in the transcribed text "
-            "and suggest specific corrections based on the reference lyrics.\n\n"
-            "Each word has a unique ID. When suggesting corrections, you must specify the ID of the word being corrected. "
-            "This ensures accuracy in applying your corrections.\n\n"
-            "For each correction, specify:\n"
-            "1. The word ID being corrected\n"
-            "2. The correction type ('replace', 'split', 'combine', or 'delete')\n"
-            "3. The corrected text\n"
-            "4. Your confidence level\n"
-            "5. The reason for the correction\n\n"
-        )
-
-        # Add song context if available
-        if metadata and metadata.get("artist") and metadata.get("title"):
-            prompt += f"Song: {metadata['title']}\nArtist: {metadata['artist']}\n\n"
-
-        # Format transcribed words with IDs
-        prompt += "Transcribed words:\n"
-        for word in transcribed_words:
-            prompt += f"- ID: {word['id']}, Text: '{word['text']}'\n"
-
-        prompt += "\nReference lyrics from different sources:\n"
-
-        # Add each reference source with words and their IDs
-        for source, word_ids in gap.reference_word_ids.items():
-            reference_words = [{"id": word_id, "text": word_map[word_id].text} for word_id in word_ids if word_id in word_map]
-            prompt += f"\n{source} immediate context:\n"
-            for word in reference_words:
-                prompt += f"- ID: {word['id']}, Text: '{word['text']}'\n"
-
-            # Add full lyrics if available
-            if metadata and metadata.get("full_reference_texts", {}).get(source):
-                prompt += f"\nFull {source} lyrics:\n{metadata['full_reference_texts'][source]}\n"
-
-        # Add context about surrounding anchors if available
-        if gap.preceding_anchor_id:
-            preceding_anchor = next((a.anchor for a in data.get("anchor_sequences", []) if a.anchor.id == gap.preceding_anchor_id), None)
-            if preceding_anchor:
-                anchor_words = [
-                    {"id": word_id, "text": word_map[word_id].text}
-                    for word_id in preceding_anchor.transcribed_word_ids
-                    if word_id in word_map
-                ]
-                prompt += "\nPreceding correct words:\n"
-                for word in anchor_words:
-                    prompt += f"- ID: {word['id']}, Text: '{word['text']}'\n"
-
-        prompt += (
-            "\nProvide corrections in the following JSON format:\n"
-            "{\n"
-            '  "corrections": [\n'
-            "    {\n"
-            '      "word_id": "id_of_word_to_correct",\n'
-            '      "type": "replace|split|combine|delete",\n'
-            '      "corrected_text": "new text",\n'
-            '      "reference_word_id": "id_from_reference_lyrics",  // Optional, use when matching a specific reference word\n'
-            '      "confidence": 0.9,\n'
-            '      "reason": "explanation of correction"\n'
-            "    }\n"
-            "  ]\n"
-            "}\n\n"
-            "Important rules:\n"
-            "1. Always include the word_id for each correction\n"
-            "2. For 'split' type, corrected_text should contain the space-separated words\n"
-            "3. For 'combine' type, word_id should be the first word to combine\n"
-            "4. Include reference_word_id when the correction matches a specific reference word\n"
-            "5. Only suggest corrections when you're confident they improve the lyrics\n"
-            "6. Preserve any existing words that match the reference lyrics\n"
-            "7. Respond ONLY with the JSON object, no other text"
-        )
-
-        return prompt
-
-    def can_handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
-        """LLM handler can attempt to handle any gap with reference words."""
-        if not gap.reference_word_ids:
-            self.logger.debug("No reference words available")
-            return False, {}
-
-        return True, {}
-
-    def _write_debug_info(self, prompt: str, response: str, gap_index: int, audio_file_hash: Optional[str] = None) -> None:
-        """Write prompt and response to debug files."""
-        if not self.cache_dir:
-            self.logger.warning("No cache directory provided, skipping LLM debug output")
-            return
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        debug_dir = self.cache_dir / "llm_debug"
-        debug_dir.mkdir(exist_ok=True, parents=True)
-
-        hash_prefix = f"{audio_file_hash}_" if audio_file_hash else ""
-        filename = debug_dir / f"llm_debug_{hash_prefix}{gap_index}_{timestamp}.txt"
-
-        debug_content = "=== LLM PROMPT ===\n" f"{prompt}\n\n" "=== LLM RESPONSE ===\n" f"{response}\n"
-
-        try:
-            with open(filename, "w", encoding="utf-8") as f:
-                f.write(debug_content)
-        except IOError as e:
-            self.logger.error(f"Failed to write LLM debug file: {e}")
-
-    def handle(self, gap: GapSequence, data: Optional[Dict[str, Any]] = None) -> List[WordCorrection]:
-        """Process the gap using the LLM and create corrections based on its response."""
-        if not data or "word_map" not in data:
-            self.logger.error("No word_map provided in data")
-            return []
-
-        word_map = data["word_map"]
-        transcribed_words = [word_map[word_id].text for word_id in gap.transcribed_word_ids if word_id in word_map]
-
-        # Calculate reference positions using the centralized method
-        reference_positions = (
-            WordOperations.calculate_reference_positions(gap, anchor_sequences=data.get("anchor_sequences", [])) or {}
-        )  # Ensure empty dict if None
-
-        prompt = self._format_prompt(gap, data)
-        if not prompt:
-            return []
-
-        # Get a unique index for this gap based on its position
-        gap_index = gap.transcription_position
-
-        try:
-            self.logger.debug(f"Processing gap words: {transcribed_words}")
-            self.logger.debug(f"Reference word IDs: {gap.reference_word_ids}")
-
-            response = self.provider.generate_response(prompt)
-
-            # Write debug info to files
-            self._write_debug_info(prompt, response, gap_index, audio_file_hash=data.get("audio_file_hash"))
-
-            try:
-                corrections_data = json.loads(response)
-            except json.JSONDecodeError as e:
-                self.logger.error(f"Failed to parse LLM response as JSON: {e}")
-                self.logger.error(f"Raw response content: {response}")
-                return []
-
-            # Check if corrections exist and are non-empty
-            if not corrections_data.get("corrections"):
-                self.logger.debug("No corrections suggested by LLM")
-                return []
-
-            corrections = []
-            for correction in corrections_data["corrections"]:
-                # Validate word_id exists in gap
-                if correction["word_id"] not in gap.transcribed_word_ids:
-                    self.logger.error(f"LLM suggested correction for word_id {correction['word_id']} which is not in the gap")
-                    continue
-
-                # Get original word from word map
-                original_word = word_map[correction["word_id"]]
-                position = gap.transcription_position + gap.transcribed_word_ids.index(correction["word_id"])
-
-                self.logger.debug(f"Processing correction: {correction}")
-
-                if correction["type"] == "replace":
-                    self.logger.debug(
-                        f"Creating replacement: '{original_word.text}' -> '{correction['corrected_text']}' " f"at position {position}"
-                    )
-                    corrections.append(
-                        WordOperations.create_word_replacement_correction(
-                            original_word=original_word.text,
-                            corrected_word=correction["corrected_text"],
-                            original_position=position,
-                            source="LLM",
-                            confidence=correction["confidence"],
-                            reason=correction["reason"],
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            original_word_id=correction["word_id"],
-                            corrected_word_id=correction.get("reference_word_id"),
-                        )
-                    )
-                elif correction["type"] == "split":
-                    split_words = correction["corrected_text"].split()
-                    self.logger.debug(f"Creating split: '{original_word.text}' -> {split_words} " f"at position {position}")
-
-                    # Get reference word IDs if provided
-                    reference_word_ids = correction.get("reference_word_ids", [None] * len(split_words))
-
-                    corrections.extend(
-                        WordOperations.create_word_split_corrections(
-                            original_word=original_word.text,
-                            reference_words=split_words,
-                            original_position=position,
-                            source="LLM",
-                            confidence=correction["confidence"],
-                            reason=correction["reason"],
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            original_word_id=correction["word_id"],
-                            corrected_word_ids=reference_word_ids,
-                        )
-                    )
-                elif correction["type"] == "combine":
-                    # Get all word IDs to combine
-                    word_ids_to_combine = []
-                    current_idx = gap.transcribed_word_ids.index(correction["word_id"])
-                    words_needed = len(correction["corrected_text"].split())
-
-                    if current_idx + words_needed <= len(gap.transcribed_word_ids):
-                        word_ids_to_combine = gap.transcribed_word_ids[current_idx : current_idx + words_needed]
-                    else:
-                        self.logger.error(f"Not enough words available to combine at position {position}")
-                        continue
-
-                    words_to_combine = [word_map[word_id].text for word_id in word_ids_to_combine]
-
-                    self.logger.debug(
-                        f"Creating combine: {words_to_combine} -> '{correction['corrected_text']}' " f"at position {position}"
-                    )
-
-                    corrections.extend(
-                        WordOperations.create_word_combine_corrections(
-                            original_words=words_to_combine,
-                            reference_word=correction["corrected_text"],
-                            original_position=position,
-                            source="LLM",
-                            confidence=correction["confidence"],
-                            combine_reason=correction["reason"],
-                            delete_reason=f"Part of combining words: {correction['reason']}",
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            original_word_ids=word_ids_to_combine,
-                            corrected_word_id=correction.get("reference_word_id"),
-                        )
-                    )
-                elif correction["type"] == "delete":
-                    self.logger.debug(f"Creating deletion: '{original_word.text}' at position {position}")
-                    corrections.append(
-                        WordCorrection(
-                            original_word=original_word.text,
-                            corrected_word="",
-                            segment_index=0,
-                            original_position=position,
-                            confidence=correction["confidence"],
-                            source="LLM",
-                            reason=correction["reason"],
-                            alternatives={},
-                            is_deletion=True,
-                            handler=self.name,
-                            reference_positions=reference_positions,
-                            word_id=correction["word_id"],
-                            corrected_word_id=None,
-                        )
-                    )
-
-            self.logger.debug(f"Created {len(corrections)} corrections: {[f'{c.original_word}->{c.corrected_word}' for c in corrections]}")
-            return corrections
-
-        except Exception as e:
-            self.logger.error(f"Unexpected error in LLM handler: {e}")
-            return []
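For reference, the deleted `LLMHandler` prompt required the model to reply with a bare JSON object containing a `corrections` array in the shape shown above. A minimal sketch of parsing and validating such a payload; the helper below is hypothetical and not part of the package:

```python
import json

# Response in the shape the removed LLMHandler prompt requested.
SAMPLE_RESPONSE = """
{
  "corrections": [
    {
      "word_id": "id_of_word_to_correct",
      "type": "replace",
      "corrected_text": "new text",
      "reference_word_id": "id_from_reference_lyrics",
      "confidence": 0.9,
      "reason": "explanation of correction"
    }
  ]
}
"""

VALID_TYPES = {"replace", "split", "combine", "delete"}

def parse_corrections(response: str) -> list:
    """Hypothetical helper: parse the payload and drop malformed entries."""
    try:
        data = json.loads(response)
    except json.JSONDecodeError:
        return []  # the deleted handler logged and returned [] on unparseable responses
    return [
        c for c in data.get("corrections", [])
        if c.get("type") in VALID_TYPES and "word_id" in c
    ]

print(parse_corrections(SAMPLE_RESPONSE))
```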