lyrics-transcriber 0.14.0__tar.gz → 0.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/PKG-INFO +23 -2
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/README.md +21 -1
- lyrics_transcriber-0.16.0/lyrics_transcriber/audioshake_transcriber.py +35 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/transcriber.py +236 -83
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/cli.py +14 -1
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/subtitles.py +37 -11
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/pyproject.toml +2 -1
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/LICENSE +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/__init__.py +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/README.md +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/__init__.py +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/ass.py +0 -0
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.14.0
+Version: 0.16.0
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: Cython (>=0)
+Requires-Dist: auditok (>=0.2)
 Requires-Dist: dtw-python (>=1)
 Requires-Dist: llvmlite (>=0)
 Requires-Dist: lyricsgenius (>=3)
@@ -52,7 +53,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
 ### Prerequisites
 
 - Python 3.9 or higher
-- [Optional]
+- [Optional] Genius API token if you want to fetch lyrics from Genius
+- [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+- [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+- [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription
 
 ```
 pip install lyrics-transcriber
@@ -61,6 +65,23 @@ pip install lyrics-transcriber
 > **Warning**
 > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency from openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)
 
+## Docker
+
+You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` on Docker hub if you want, here's an example:
+
+```sh
+docker run \
+    -v `pwd`/input:/input \
+    -v `pwd`/output:/output \
+    beveradb/lyrics-transcriber:0.16.0 \
+    --log_level debug \
+    --output_dir /output \
+    --render_video \
+    --video_background_image /input/your-background-image.png \
+    --video_resolution 360p \
+    /input/song.flac
+```
+
 ## Usage 🚀
 
 ### As a standalone CLI
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/README.md
RENAMED
@@ -17,7 +17,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
 ### Prerequisites
 
 - Python 3.9 or higher
-- [Optional]
+- [Optional] Genius API token if you want to fetch lyrics from Genius
+- [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+- [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+- [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription
 
 ```
 pip install lyrics-transcriber
@@ -26,6 +29,23 @@ pip install lyrics-transcriber
 > **Warning**
 > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency from openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)
 
+## Docker
+
+You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` on Docker hub if you want, here's an example:
+
+```sh
+docker run \
+    -v `pwd`/input:/input \
+    -v `pwd`/output:/output \
+    beveradb/lyrics-transcriber:0.16.0 \
+    --log_level debug \
+    --output_dir /output \
+    --render_video \
+    --video_background_image /input/your-background-image.png \
+    --video_resolution 360p \
+    /input/song.flac
+```
+
 ## Usage 🚀
 
 ### As a standalone CLI
lyrics_transcriber-0.16.0/lyrics_transcriber/audioshake_transcriber.py
ADDED
@@ -0,0 +1,35 @@
+import logging
+import requests
+
+
+class AudioShakeTranscriber:
+    def __init__(self, api_token, log_level=logging.DEBUG):
+        self.api_token = api_token
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(log_level)
+
+    def transcribe(self, audio_filepath):
+        # This is a placeholder for the actual AudioShake API implementation
+        self.logger.info(f"Transcribing {audio_filepath} using AudioShake API")
+
+        self.logger.debug(f"AudioShake API token: {self.api_token}")
+        # TODO: Implement the actual API call to AudioShake
+        # For now, we'll return a dummy result
+        return {
+            "transcription_data_dict": {
+                "segments": [
+                    {
+                        "start": 0,
+                        "end": 5,
+                        "text": "This is a dummy transcription",
+                        "words": [
+                            {"text": "This", "start": 0, "end": 1},
+                            {"text": "is", "start": 1, "end": 2},
+                            {"text": "a", "start": 2, "end": 3},
+                            {"text": "dummy", "start": 3, "end": 4},
+                            {"text": "transcription", "start": 4, "end": 5},
+                        ],
+                    }
+                ]
+            }
+        }
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/transcriber.py
RENAMED
@@ -22,6 +22,8 @@ class LyricsTranscriber:
         audio_filepath,
         artist=None,
         title=None,
+        openai_api_key=None,
+        audioshake_api_token=None,
         genius_api_token=None,
         spotify_cookie=None,
         output_dir=None,
@@ -59,23 +61,30 @@ class LyricsTranscriber:
         self.title = title
         self.song_known = self.artist is not None and self.title is not None
 
+        self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
+        self.audioshake_api_token = os.getenv("AUDIOSHAKE_TOKEN", default=audioshake_api_token)
 
         self.transcription_model = transcription_model
         self.llm_model = llm_model
         self.llm_prompt_matching = llm_prompt_matching
         self.llm_prompt_correction = llm_prompt_correction
 
-        self.openai_client =
+        self.openai_client = None
 
-
-
-        # base_url="http://localhost:11434/v1",
-        # api_key="ollama",
-        # )
+        if self.openai_api_key:
+            self.openai_client = OpenAI(api_key=self.openai_api_key)
 
-
+            # Uncomment for local models e.g. with ollama
+            # self.openai_client = OpenAI(
+            #     base_url="http://localhost:11434/v1",
+            #     api_key="ollama",
+            # )
+
+            self.openai_client.log = self.log_level
+        else:
+            self.logger.error("No OpenAI API key found, no correction will be applied to transcription")
 
         self.render_video = render_video
         self.video_resolution = video_resolution
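Note the precedence these lookups give: because the constructor argument is only passed as the `os.getenv` default, an environment variable always wins over a value supplied in code. A standalone sketch of the same pattern:

```python
import os

# Same lookup pattern as the constructor above: the environment variable,
# if set, takes precedence; the keyword argument is only the fallback.
def resolve_token(env_var, explicit_value=None):
    return os.getenv(env_var, default=explicit_value)

os.environ["AUDIOSHAKE_TOKEN"] = "token-from-env"  # hypothetical value
print(resolve_token("AUDIOSHAKE_TOKEN", explicit_value="token-from-kwarg"))
# -> token-from-env
```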
@@ -154,8 +163,13 @@ class LyricsTranscriber:
 
         self.validate_lyrics_match_song()
 
-        self.
-
+        if self.openai_client:
+            self.write_corrected_lyrics_data_file()
+            self.write_corrected_lyrics_plain_text()
+        else:
+            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.write_corrected_lyrics_plain_text()
 
         self.calculate_singing_percentage()
 
@@ -169,7 +183,8 @@ class LyricsTranscriber:
         self.copy_files_to_output_dir()
         self.calculate_llm_costs()
 
-        self.openai_client
+        if self.openai_client:
+            self.openai_client.close()
 
         return self.outputs
 
@@ -198,41 +213,55 @@ class LyricsTranscriber:
             online_lyrics_text_key = f"{online_lyrics_source}_lyrics_text"
             online_lyrics_filepath_key = f"{online_lyrics_source}_lyrics_filepath"
 
-            if online_lyrics_text_key not in self.outputs:
+            if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
                 continue
 
-
-
-
-
-            # self.logger.debug(f"system_prompt:\n{system_prompt}\ndata_input_str:\n{data_input_str}")
-
-            self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
-            response = self.openai_client.chat.completions.create(
-                model=self.llm_model,
-                messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
-            )
+            if self.openai_client:
+                data_input_str = (
+                    f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
+                )
 
-
-
+                self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
+                response = self.openai_client.chat.completions.create(
+                    model=self.llm_model,
+                    messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
+                )
 
-
-
+                message = response.choices[0].message.content
+                finish_reason = response.choices[0].finish_reason
 
-
+                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
+                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
 
-
-
-
+                if finish_reason == "stop":
+                    if message == "Yes":
+                        self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
+                        at_least_one_online_lyrics_validated = True
+                    elif message == "No":
+                        self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
+                        self.outputs[online_lyrics_text_key] = None
+                        self.outputs[online_lyrics_filepath_key] = None
+                    else:
+                        self.logger.error(f"Unexpected response from LLM: {message}")
+                else:
+                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
+            else:
+                # Fallback primitive word matching
+                self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
+                transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
+                online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
+                common_words = transcribed_words & online_lyrics_words
+                match_percentage = len(common_words) / len(online_lyrics_words) * 100
+
+                if match_percentage >= 50:
+                    self.logger.info(
+                        f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
+                    )
                     at_least_one_online_lyrics_validated = True
-
+                else:
                     self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
                     self.outputs[online_lyrics_text_key] = None
                     self.outputs[online_lyrics_filepath_key] = None
-            else:
-                self.logger.error(f"Unexpected response from LLM: {message}")
-            else:
-                self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
 
         self.logger.info(
             f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
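When no OpenAI client is available, the new fallback path replaces the LLM check with a set-intersection word match against a 50% threshold. A standalone sketch of that calculation with toy lyrics:

```python
# Toy inputs; the real method reads both texts from self.outputs.
transcribed_lyrics_text = "you are the dancing queen young and sweet"
online_lyrics_text = "dancing queen young and sweet only seventeen"

transcribed_words = set(transcribed_lyrics_text.split())
online_lyrics_words = set(online_lyrics_text.split())
common_words = transcribed_words & online_lyrics_words
match_percentage = len(common_words) / len(online_lyrics_words) * 100

print(f"{match_percentage:.2f}% word match")  # 71.43%, above the 50% threshold
```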
@@ -242,9 +271,12 @@ class LyricsTranscriber:
             self.logger.error(
                 f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
             )
-            raise Exception("Cannot proceed without internet lyrics to validate / correct transcription")
 
     def write_corrected_lyrics_data_file(self):
+        if not self.openai_client:
+            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+            return
+
         self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")
 
         corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + "-corrected.json")
@@ -583,51 +615,57 @@ class LyricsTranscriber:
             f.write(line)
 
     def create_screens(self):
-        self.logger.debug(
+        self.logger.debug("create_screens beginning generation of screens from whisper results")
         screens: List[subtitles.LyricsScreen] = []
-        line: Optional[subtitles.LyricsLine] = None
         screen: Optional[subtitles.LyricsScreen] = None
 
-
+        max_lines_per_screen = 4
+        max_line_length = 36  # Maximum characters per line
+        self.logger.debug(f"Max lines per screen: {max_lines_per_screen}, Max line length: {max_line_length}")
+
         for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
-            self.logger.debug(f"
-            if screen is None:
-
-
-                screen.
-
-
-
-
-
-
-            for
-
+            self.logger.debug(f"Processing segment: {segment['text']}")
+            if screen is None or len(screen.lines) >= max_lines_per_screen:
+                screen = subtitles.LyricsScreen(video_size=self.video_resolution_num, line_height=self.line_height, logger=self.logger)
+                screens.append(screen)
+                self.logger.debug(f"Created new screen. Total screens: {len(screens)}")
+
+            words = segment["words"]
+            current_line = subtitles.LyricsLine()
+            current_line_text = ""
+            self.logger.debug(f"Processing {len(words)} words in segment")
+
+            for word in words:
+                self.logger.debug(f"Processing word: '{word['text']}'")
+                if len(current_line_text) + len(word["text"]) + 1 > max_line_length or (current_line_text and word["text"][0].isupper()):
+                    self.logger.debug(f"Current line would exceed max length or new capitalized word. Line: '{current_line_text}'")
+                    if current_line.segments:
+                        screen.lines.append(current_line)
+                        self.logger.debug(f"Added line to screen. Lines on current screen: {len(screen.lines)}")
+                        if len(screen.lines) >= max_lines_per_screen:
+                            screen = subtitles.LyricsScreen(
+                                video_size=self.video_resolution_num,
+                                line_height=self.line_height,
+                                logger=self.logger,
+                            )
+                            screens.append(screen)
+                            self.logger.debug(f"Screen full, created new screen. Total screens: {len(screens)}")
+                    current_line = subtitles.LyricsLine()
+                    current_line_text = ""
+                    self.logger.debug("Reset current line")
+
+                current_line_text += (" " if current_line_text else "") + word["text"]
+                lyric_segment = subtitles.LyricSegment(
                     text=word["text"], ts=timedelta(seconds=word["start"]), end_ts=timedelta(seconds=word["end"])
                 )
-
-
-                # If word is last in the line, add line to screen and start new line
-                # Before looping to the next word
-                if word_index == num_words_in_segment - 1:
-                    self.logger.debug(f"word_index is last in segment, adding line to screen and starting new line")
-                    screen.lines.append(line)
-                    lines_in_current_screen += 1
-                    line = None
-
-                # If current screen has 2 lines already, add screen to list and start new screen
-                # Before looping to the next line
-                if lines_in_current_screen == 2:
-                    self.logger.debug(f"lines_in_current_screen is 2, adding screen to list and starting new screen")
-                    screens.append(screen)
-                    screen = None
-                    lines_in_current_screen = 0
+                current_line.segments.append(lyric_segment)
+                self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")
 
-
-
-
-        screens.append(screen)  # type: ignore[arg-type]
+            if current_line.segments:
+                screen.lines.append(current_line)
+                self.logger.debug(f"Added final line of segment to screen. Lines on current screen: {len(screen.lines)}")
 
+        self.logger.debug(f"Finished creating screens. Total screens created: {len(screens)}")
         return screens
 
     def write_ass_file(self):
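The rewritten `create_screens` wraps lines greedily: a new line starts when appending the next word would push past 36 characters, or when a capitalized word appears mid-line (treated as a likely phrase boundary). A toy sketch of just that wrap rule, outside the subtitle classes:

```python
# Toy re-implementation of the wrap condition used in create_screens above.
max_line_length = 36

words = ["so", "when", "you're", "near", "me", "Darling", "can't", "you", "hear", "me"]
lines, current = [], ""
for word in words:
    # Break before the word if the line would get too long, or at a capital.
    if (len(current) + len(word) + 1 > max_line_length) or (current and word[0].isupper()):
        lines.append(current)
        current = ""
    current += (" " if current else "") + word
if current:
    lines.append(current)

print(lines)  # ["so when you're near me", "Darling can't you hear me"]
```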
@@ -760,7 +798,10 @@ class LyricsTranscriber:
 
     def write_transcribed_lyrics_plain_text(self):
         if self.outputs["transcription_data_dict"]:
-
+            transcription_cache_suffix = "-audioshake-transcribed.txt" if self.audioshake_api_token else "-whisper-transcribed.txt"
+            self.logger.debug(f"transcription_cache_suffix: {transcription_cache_suffix}")
+
+            transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + transcription_cache_suffix)
             self.outputs["transcribed_lyrics_text_filepath"] = transcribed_lyrics_text_filepath
 
             self.outputs["transcribed_lyrics_text"] = ""
@@ -773,8 +814,109 @@ class LyricsTranscriber:
         else:
             raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
 
+    def find_best_split_point(self, text, max_length):
+        self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
+        words = text.split()
+        mid_word_index = len(words) // 2
+        mid_point = len(" ".join(words[:mid_word_index]))
+        self.logger.debug(f"Mid point is at character {mid_point}")
+
+        # Check for a comma within one or two words of the middle word
+        if "," in text:
+            comma_indices = [i for i, char in enumerate(text) if char == ","]
+            self.logger.debug(f"Found commas at indices: {comma_indices}")
+            for index in comma_indices:
+                if abs(mid_point - index) < 20 and len(text[: index + 1].strip()) <= max_length:
+                    self.logger.debug(f"Choosing comma at index {index} as split point")
+                    return index + 1  # Include the comma in the first part
+
+        # Check for 'and'
+        if " and " in text:
+            and_indices = [m.start() for m in re.finditer(" and ", text)]
+            self.logger.debug(f"Found 'and' at indices: {and_indices}")
+            for index in sorted(and_indices, key=lambda x: abs(x - mid_point)):
+                if len(text[: index + len(" and ")].strip()) <= max_length:
+                    self.logger.debug(f"Choosing 'and' at index {index} as split point")
+                    return index + len(" and ")
+
+        # Check for words starting with a capital letter
+        capital_word_indices = [m.start() for m in re.finditer(r"\s[A-Z]", text)]
+        self.logger.debug(f"Found capital words at indices: {capital_word_indices}")
+        for index in sorted(capital_word_indices, key=lambda x: abs(x - mid_point)):
+            if index > 0 and len(text[:index].strip()) <= max_length:
+                self.logger.debug(f"Choosing capital word at index {index} as split point")
+                return index
+
+        # If no better split point is found, try splitting at the middle word
+        if len(words) > 2 and mid_word_index > 0:
+            split_at_middle = len(" ".join(words[:mid_word_index]))
+            if split_at_middle <= max_length:
+                self.logger.debug(f"Choosing middle word split at index {split_at_middle}")
+                return split_at_middle
+
+        # If the text is still too long, forcibly split at the maximum length
+        self.logger.debug(f"No suitable split point found, forcibly splitting at max_length {max_length}")
+        return max_length
+
+    def split_long_segments(self, segments, max_length):
+        self.logger.debug(f"Splitting long segments (max_length: {max_length})")
+        new_segments = []
+        for segment in segments:
+            text = segment["text"]
+            self.logger.debug(f"Processing segment: '{text}' (length: {len(text)})")
+            if len(text) <= max_length:
+                self.logger.debug("Segment is within max_length, keeping as is")
+                new_segments.append(segment)
+            else:
+                self.logger.debug("Segment exceeds max_length, splitting")
+                meta_words = segment["words"]
+                current_text = ""
+                current_start = segment["start"]
+                current_words = []
+
+                for i, meta in enumerate(meta_words):
+                    word = meta["text"]
+                    if current_text:
+                        current_text += " "
+                    current_text += word
+                    current_words.append(meta)
+
+                    should_split = len(current_text) > max_length or (i > 0 and word[0].isupper())
+                    if should_split:
+                        self.logger.debug(f"Splitting at: '{current_text}'")
+                        # If splitting due to capitalization, don't include the capitalized word
+                        if word[0].isupper() and len(current_text.strip()) > len(word):
+                            split_text = current_text[: -(len(word) + 1)].strip()
+                            current_words = current_words[:-1]
+                        else:
+                            split_text = current_text.strip()
+
+                        new_segment = {"text": split_text, "start": current_start, "end": current_words[-1]["end"], "words": current_words}
+                        new_segments.append(new_segment)
+                        self.logger.debug(f"Added new segment: {new_segment}")
+
+                        # Reset for next segment
+                        if word[0].isupper() and len(current_text.strip()) > len(word):
+                            current_text = word
+                            current_words = [meta]
+                        else:
+                            current_text = ""
+                            current_words = []
+                        current_start = meta["start"]
+
+                # Add any remaining text as a final segment
+                if current_text:
+                    self.logger.debug(f"Adding final segment: '{current_text}'")
+                    new_segments.append(
+                        {"text": current_text.strip(), "start": current_start, "end": segment["end"], "words": current_words}
+                    )
+
+        self.logger.debug(f"Splitting complete. Original segments: {len(segments)}, New segments: {len(new_segments)}")
+        return new_segments
+
     def transcribe(self):
-
+        transcription_cache_suffix = "-audioshake" if self.audioshake_api_token else "-whisper"
+        self.outputs["transcription_data_filepath"] = self.get_cache_filepath(f"{transcription_cache_suffix}.json")
 
         whisper_cache_filepath = self.outputs["transcription_data_filepath"]
         if os.path.isfile(whisper_cache_filepath):
@@ -783,15 +925,26 @@ class LyricsTranscriber:
                 self.outputs["transcription_data_dict"] = json.load(cache_file)
             return
 
-        self.
-
-
-        result = whisper.transcribe(model, audio, language="en")
+        if self.audioshake_api_token:
+            self.logger.debug(f"Using AudioShake API for transcription")
+            from .audioshake_transcriber import AudioShakeTranscriber
 
-
-
-
-
+            audioshake = AudioShakeTranscriber(self.audioshake_api_token, log_level=self.log_level)
+            result = audioshake.transcribe(self.audio_filepath)
+        else:
+            self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
+            audio = whisper.load_audio(self.audio_filepath)
+            model = whisper.load_model(self.transcription_model, device="cpu")
+            result = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+
+        # Remove segments with no words, only music
+        result["segments"] = [segment for segment in result["segments"] if segment["text"].strip() != "Music"]
+        self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(result['segments'])}")
+
+        # Split long segments
+        self.logger.debug("Starting to split long segments")
+        result["segments"] = self.split_long_segments(result["segments"], max_length=36)
+        self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(result['segments'])}")
 
         self.logger.debug(f"writing transcription data JSON to cache file: {whisper_cache_filepath}")
         with open(whisper_cache_filepath, "w") as cache_file:
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/cli.py
RENAMED
@@ -34,6 +34,11 @@ def main():
         default=None,
         help="Optional: song title for lyrics lookup and auto-correction",
     )
+    parser.add_argument(
+        "--audioshake_api_token",
+        default=None,
+        help="Optional: AudioShake API token for lyrics transcription and alignment. Can also be set with AUDIOSHAKE_API_TOKEN env var.",
+    )
     parser.add_argument(
         "--genius_api_token",
         default=None,
@@ -77,7 +82,7 @@ def main():
 
     parser.add_argument(
         "--video_resolution",
-        default="
+        default="360p",
         help="Optional: resolution of the karaoke video to render. Must be one of: 4k, 1080p, 720p, 360p. Default: 360p",
     )
 
@@ -93,6 +98,12 @@ def main():
         help="Optional: color to use for karaoke video background, in hex format or FFmpeg color name. Default: black",
     )
 
+    parser.add_argument(
+        "--openai_api_key",
+        default=None,
+        help="Optional: OpenAI API key for LLM model usage. Can also be set with OPENAI_API_KEY env var.",
+    )
+
     args = parser.parse_args()
 
     log_level = getattr(logging, args.log_level.upper())
@@ -114,8 +125,10 @@ def main():
 
     transcriber = LyricsTranscriber(
         args.audio_filepath,
+        audioshake_api_token=args.audioshake_api_token,
         genius_api_token=args.genius_api_token,
         spotify_cookie=args.spotify_cookie,
+        openai_api_key=args.openai_api_key,
         artist=args.artist,
         title=args.title,
         output_dir=args.output_dir,
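For library users, the CLI wiring above maps one-to-one onto constructor keyword arguments. A sketch of the equivalent call in code (import path inferred from the file layout; all values are placeholders, and the method that kicks off processing is not part of this diff):

```python
from lyrics_transcriber.transcriber import LyricsTranscriber

transcriber = LyricsTranscriber(
    "/input/song.flac",
    audioshake_api_token="audioshake-token",  # placeholder
    genius_api_token="genius-token",          # placeholder
    spotify_cookie="sp-dc-cookie-value",      # placeholder
    openai_api_key="sk-placeholder",          # placeholder
    artist="ABBA",
    title="Under Attack",
    output_dir="/output",
)
```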
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/subtitles.py
RENAMED
@@ -5,6 +5,7 @@ import json
 import itertools
 from pathlib import Path
 from enum import IntEnum
+import logging
 
 from . import ass
 
@@ -85,21 +86,19 @@ class LyricsLine:
     def __str__(self):
         return "".join([f"{{{s.text}}}" for s in self.segments])
 
-    def as_ass_event(
-        self,
-        screen_start: timedelta,
-        screen_end: timedelta,
-        style: ass.ASS.Style,
-        top_margin: int,
-    ):
+    def as_ass_event(self, screen_start: timedelta, screen_end: timedelta, style: ass.ASS.Style, y_position: int):
         e = ass.ASS.Event()
         e.type = "Dialogue"
         e.Layer = 0
         e.Style = style
         e.Start = screen_start.total_seconds()
         e.End = screen_end.total_seconds()
-        e.MarginV =
+        e.MarginV = y_position
         e.Text = self.decorate_ass_line(self.segments, screen_start)
+
+        # Set alignment to top-center
+        e.Text = "{\\an8}" + e.Text
+
         return e
 
     def decorate_ass_line(self, segments, screen_start_ts: timedelta):
@@ -137,6 +136,7 @@ class LyricsScreen:
     start_ts: Optional[timedelta] = None
     video_size: Tuple[int, int] = None
     line_height: int = None
+    logger: logging.Logger = None
 
     @property
     def end_ts(self) -> timedelta:
@@ -145,10 +145,36 @@ class LyricsScreen:
     def get_line_y(self, line_num: int) -> int:
        _, h = self.video_size
         line_count = len(self.lines)
-
+        total_height = line_count * self.line_height
+
+        # Calculate the top margin to center the lyrics block
+        top_margin = (h - total_height) / 2
+
+        # Calculate the y-position for this specific line
+        line_y = top_margin + (line_num * self.line_height)
+
+        # if self.logger:
+        #     self.logger.debug(f"Line {line_num + 1} positioning:")
+        #     self.logger.debug(f"  Video height: {h}")
+        #     self.logger.debug(f"  Total lines: {line_count}")
+        #     self.logger.debug(f"  Line height: {self.line_height}")
+        #     self.logger.debug(f"  Total lyrics height: {total_height}")
+        #     self.logger.debug(f"  Top margin: {top_margin}")
+        #     self.logger.debug(f"  Line y: {line_y}")
+
+        return int(line_y)
 
     def as_ass_events(self, style: ass.ASS.Style) -> List[ass.ASS.Event]:
-
+        events = []
+        for i, line in enumerate(self.lines):
+            y_position = self.get_line_y(i)
+
+            # if self.logger:
+            #     self.logger.debug(f"Creating ASS event for line {i + 1} at y-position: {y_position}")
+
+            event = line.as_ass_event(self.start_ts, self.end_ts, style, y_position)
+            events.append(event)
+        return events
 
     def __str__(self):
         lines = [f"{self.start_ts} - {self.end_ts}:"]
@@ -264,7 +290,7 @@ def create_styled_subtitles(
     style.BorderStyle = 1
     style.Outline = 1
     style.Shadow = 0
-    style.Alignment = ass.ASS.
+    style.Alignment = ass.ASS.ALIGN_TOP_CENTER
     style.MarginL = 0
     style.MarginR = 0
     style.MarginV = 0
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.14.0"
+version = "0.16.0"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
 license = "MIT"
@@ -28,6 +28,7 @@ syrics = ">=0"
 openai = "^1"
 openai-whisper = ">=20231117"
 transformers = ">=4"
+auditok = ">=0.2"
 whisper-timestamped = ">=1"
 # Note: after adding openai-whisper and whisper-timestamped with poetry lock, I then removed all traces of triton
 # from poetry.lock before running poetry install, as triton doesn't support macOS but isn't actually needed for whisper.