lyrics-transcriber 0.14.0__tar.gz → 0.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/PKG-INFO +23 -2
  2. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/README.md +21 -1
  3. lyrics_transcriber-0.16.0/lyrics_transcriber/audioshake_transcriber.py +35 -0
  4. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/transcriber.py +236 -83
  5. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/cli.py +14 -1
  6. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/subtitles.py +37 -11
  7. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/pyproject.toml +2 -1
  8. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/LICENSE +0 -0
  9. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/__init__.py +0 -0
  10. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/README.md +0 -0
  11. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -0
  12. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -0
  13. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -0
  14. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -0
  15. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -0
  16. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/__init__.py +0 -0
  17. {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/ass.py +0 -0
--- lyrics_transcriber-0.14.0/PKG-INFO
+++ lyrics_transcriber-0.16.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.14.0
+Version: 0.16.0
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: Cython (>=0)
+Requires-Dist: auditok (>=0.2)
 Requires-Dist: dtw-python (>=1)
 Requires-Dist: llvmlite (>=0)
 Requires-Dist: lyricsgenius (>=3)
@@ -52,7 +53,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
 ### Prerequisites
 
 - Python 3.9 or higher
-- [Optional] A Genius API token if you want to fetch lyrics from Genius
+- [Optional] Genius API token if you want to fetch lyrics from Genius
+- [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+- [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+- [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription
 
 ```
 pip install lyrics-transcriber
@@ -61,6 +65,23 @@ pip install lyrics-transcriber
 > **Warning**
 > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency of openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)
 
+## Docker
+
+You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` from Docker Hub if you want; here's an example:
+
+```sh
+docker run \
+    -v `pwd`/input:/input \
+    -v `pwd`/output:/output \
+    beveradb/lyrics-transcriber:0.16.0 \
+    --log_level debug \
+    --output_dir /output \
+    --render_video \
+    --video_background_image /input/your-background-image.png \
+    --video_resolution 360p \
+    /input/song.flac
+```
+
 ## Usage 🚀
 
 ### As a standalone CLI
--- lyrics_transcriber-0.14.0/README.md
+++ lyrics_transcriber-0.16.0/README.md
@@ -17,7 +17,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
 ### Prerequisites
 
 - Python 3.9 or higher
-- [Optional] A Genius API token if you want to fetch lyrics from Genius
+- [Optional] Genius API token if you want to fetch lyrics from Genius
+- [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+- [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+- [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription
 
 ```
 pip install lyrics-transcriber
@@ -26,6 +29,23 @@ pip install lyrics-transcriber
 > **Warning**
 > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency of openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)
 
+## Docker
+
+You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` from Docker Hub if you want; here's an example:
+
+```sh
+docker run \
+    -v `pwd`/input:/input \
+    -v `pwd`/output:/output \
+    beveradb/lyrics-transcriber:0.16.0 \
+    --log_level debug \
+    --output_dir /output \
+    --render_video \
+    --video_background_image /input/your-background-image.png \
+    --video_resolution 360p \
+    /input/song.flac
+```
+
 ## Usage 🚀
 
 ### As a standalone CLI
--- /dev/null
+++ lyrics_transcriber-0.16.0/lyrics_transcriber/audioshake_transcriber.py
@@ -0,0 +1,35 @@
+import logging
+import requests
+
+
+class AudioShakeTranscriber:
+    def __init__(self, api_token, log_level=logging.DEBUG):
+        self.api_token = api_token
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(log_level)
+
+    def transcribe(self, audio_filepath):
+        # This is a placeholder for the actual AudioShake API implementation
+        self.logger.info(f"Transcribing {audio_filepath} using AudioShake API")
+
+        self.logger.debug(f"AudioShake API token: {self.api_token}")
+        # TODO: Implement the actual API call to AudioShake
+        # For now, we'll return a dummy result
+        return {
+            "transcription_data_dict": {
+                "segments": [
+                    {
+                        "start": 0,
+                        "end": 5,
+                        "text": "This is a dummy transcription",
+                        "words": [
+                            {"text": "This", "start": 0, "end": 1},
+                            {"text": "is", "start": 1, "end": 2},
+                            {"text": "a", "start": 2, "end": 3},
+                            {"text": "dummy", "start": 3, "end": 4},
+                            {"text": "transcription", "start": 4, "end": 5},
+                        ],
+                    }
+                ]
+            }
+        }
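Note: `transcribe()` here is a stub — it logs the request and returns canned data without calling any real endpoint (`requests` is imported but unused). A quick sketch of exercising the stub as shipped; the token and file name are placeholders:

```python
from lyrics_transcriber.audioshake_transcriber import AudioShakeTranscriber

stub = AudioShakeTranscriber(api_token="dummy-token")
result = stub.transcribe("song.flac")

# The stub nests its segments under "transcription_data_dict", while
# LyricsTranscriber.transcribe() later in this diff reads result["segments"]
# directly, so the wiring presumably changes once the real API call lands.
for segment in result["transcription_data_dict"]["segments"]:
    print(f'{segment["start"]}-{segment["end"]}: {segment["text"]}')
```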
--- lyrics_transcriber-0.14.0/lyrics_transcriber/transcriber.py
+++ lyrics_transcriber-0.16.0/lyrics_transcriber/transcriber.py
@@ -22,6 +22,8 @@ class LyricsTranscriber:
         audio_filepath,
         artist=None,
         title=None,
+        openai_api_key=None,
+        audioshake_api_token=None,
        genius_api_token=None,
         spotify_cookie=None,
         output_dir=None,
@@ -59,23 +61,30 @@ class LyricsTranscriber:
         self.title = title
         self.song_known = self.artist is not None and self.title is not None
 
+        self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
+        self.audioshake_api_token = os.getenv("AUDIOSHAKE_TOKEN", default=audioshake_api_token)
 
         self.transcription_model = transcription_model
         self.llm_model = llm_model
         self.llm_prompt_matching = llm_prompt_matching
         self.llm_prompt_correction = llm_prompt_correction
 
-        self.openai_client = OpenAI()
+        self.openai_client = None
 
-        # Uncomment for local models e.g. with ollama
-        # self.openai_client = OpenAI(
-        #     base_url="http://localhost:11434/v1",
-        #     api_key="ollama",
-        # )
+        if self.openai_api_key:
+            self.openai_client = OpenAI(api_key=self.openai_api_key)
 
-        self.openai_client.log = self.log_level
+            # Uncomment for local models e.g. with ollama
+            # self.openai_client = OpenAI(
+            #     base_url="http://localhost:11434/v1",
+            #     api_key="ollama",
+            # )
+
+            self.openai_client.log = self.log_level
+        else:
+            self.logger.error("No OpenAI API key found, no correction will be applied to transcription")
 
         self.render_video = render_video
         self.video_resolution = video_resolution
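Note the precedence here: with `os.getenv(VAR, default=arg)`, an environment variable that is set wins over the value passed to the constructor. Also worth flagging: this constructor reads `AUDIOSHAKE_TOKEN`, while the new CLI flag's help text later in this diff advertises `AUDIOSHAKE_API_TOKEN`. A minimal illustration of the precedence (values hypothetical):

```python
import os

os.environ["OPENAI_API_KEY"] = "sk-from-env"
openai_api_key = "sk-from-argument"  # value a caller passed to the constructor

# Mirrors self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
resolved = os.getenv("OPENAI_API_KEY", default=openai_api_key)
print(resolved)  # "sk-from-env": the env var shadows the explicit argument
```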
@@ -154,8 +163,13 @@ class LyricsTranscriber:
 
         self.validate_lyrics_match_song()
 
-        self.write_corrected_lyrics_data_file()
-        self.write_corrected_lyrics_plain_text()
+        if self.openai_client:
+            self.write_corrected_lyrics_data_file()
+            self.write_corrected_lyrics_plain_text()
+        else:
+            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.write_corrected_lyrics_plain_text()
 
         self.calculate_singing_percentage()
 
@@ -169,7 +183,8 @@ class LyricsTranscriber:
         self.copy_files_to_output_dir()
         self.calculate_llm_costs()
 
-        self.openai_client.close()
+        if self.openai_client:
+            self.openai_client.close()
 
         return self.outputs
 
@@ -198,41 +213,55 @@ class LyricsTranscriber:
             online_lyrics_text_key = f"{online_lyrics_source}_lyrics_text"
             online_lyrics_filepath_key = f"{online_lyrics_source}_lyrics_filepath"
 
-            if online_lyrics_text_key not in self.outputs:
+            if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
                 continue
 
-            data_input_str = (
-                f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
-            )
-
-            # self.logger.debug(f"system_prompt:\n{system_prompt}\ndata_input_str:\n{data_input_str}")
-
-            self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
-            response = self.openai_client.chat.completions.create(
-                model=self.llm_model,
-                messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
-            )
+            if self.openai_client:
+                data_input_str = (
+                    f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
+                )
 
-            message = response.choices[0].message.content
-            finish_reason = response.choices[0].finish_reason
+                self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
+                response = self.openai_client.chat.completions.create(
+                    model=self.llm_model,
+                    messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
+                )
 
-            self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
-            self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
+                message = response.choices[0].message.content
+                finish_reason = response.choices[0].finish_reason
 
-            # self.logger.debug(f"LLM API response finish_reason: {finish_reason} message: \n{message}")
+                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
+                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
 
-            if finish_reason == "stop":
-                if message == "Yes":
-                    self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
+                if finish_reason == "stop":
+                    if message == "Yes":
+                        self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
+                        at_least_one_online_lyrics_validated = True
+                    elif message == "No":
+                        self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
+                        self.outputs[online_lyrics_text_key] = None
+                        self.outputs[online_lyrics_filepath_key] = None
+                    else:
+                        self.logger.error(f"Unexpected response from LLM: {message}")
+                else:
+                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
+            else:
+                # Fallback primitive word matching
+                self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
+                transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
+                online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
+                common_words = transcribed_words & online_lyrics_words
+                match_percentage = len(common_words) / len(online_lyrics_words) * 100
+
+                if match_percentage >= 50:
+                    self.logger.info(
+                        f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
+                    )
                     at_least_one_online_lyrics_validated = True
-                elif message == "No":
+                else:
                     self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
                     self.outputs[online_lyrics_text_key] = None
                     self.outputs[online_lyrics_filepath_key] = None
-                else:
-                    self.logger.error(f"Unexpected response from LLM: {message}")
-            else:
-                self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
 
         self.logger.info(
            f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
@@ -242,9 +271,12 @@ class LyricsTranscriber:
             self.logger.error(
                 f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
             )
-            raise Exception("Cannot proceed without internet lyrics to validate / correct transcription")
 
     def write_corrected_lyrics_data_file(self):
+        if not self.openai_client:
+            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+            return
+
         self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")
 
         corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + "-corrected.json")
@@ -583,51 +615,57 @@ class LyricsTranscriber:
             f.write(line)
 
     def create_screens(self):
-        self.logger.debug(f"create_screens beginning generation of screens from whisper results")
+        self.logger.debug("create_screens beginning generation of screens from whisper results")
         screens: List[subtitles.LyricsScreen] = []
-        line: Optional[subtitles.LyricsLine] = None
         screen: Optional[subtitles.LyricsScreen] = None
 
-        lines_in_current_screen = 0
+        max_lines_per_screen = 4
+        max_line_length = 36  # Maximum characters per line
+        self.logger.debug(f"Max lines per screen: {max_lines_per_screen}, Max line length: {max_line_length}")
+
         for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
-            self.logger.debug(f"lines_in_current_screen: {lines_in_current_screen} segment: {segment['text']}")
-            if screen is None:
-                self.logger.debug(f"screen is none, creating new LyricsScreen")
-                screen = subtitles.LyricsScreen()
-                screen.video_size = self.video_resolution_num
-                screen.line_height = self.line_height
-            if line is None:
-                self.logger.debug(f"line is none, creating new LyricsLine")
-                line = subtitles.LyricsLine()
-
-            num_words_in_segment = len(segment["words"])
-            for word_index, word in enumerate(segment["words"]):
-                segment = subtitles.LyricSegment(
+            self.logger.debug(f"Processing segment: {segment['text']}")
+            if screen is None or len(screen.lines) >= max_lines_per_screen:
+                screen = subtitles.LyricsScreen(video_size=self.video_resolution_num, line_height=self.line_height, logger=self.logger)
+                screens.append(screen)
+                self.logger.debug(f"Created new screen. Total screens: {len(screens)}")
+
+            words = segment["words"]
+            current_line = subtitles.LyricsLine()
+            current_line_text = ""
+            self.logger.debug(f"Processing {len(words)} words in segment")
+
+            for word in words:
+                self.logger.debug(f"Processing word: '{word['text']}'")
+                if len(current_line_text) + len(word["text"]) + 1 > max_line_length or (current_line_text and word["text"][0].isupper()):
+                    self.logger.debug(f"Current line would exceed max length or new capitalized word. Line: '{current_line_text}'")
+                    if current_line.segments:
+                        screen.lines.append(current_line)
+                        self.logger.debug(f"Added line to screen. Lines on current screen: {len(screen.lines)}")
+                        if len(screen.lines) >= max_lines_per_screen:
+                            screen = subtitles.LyricsScreen(
+                                video_size=self.video_resolution_num,
+                                line_height=self.line_height,
+                                logger=self.logger,
+                            )
+                            screens.append(screen)
+                            self.logger.debug(f"Screen full, created new screen. Total screens: {len(screens)}")
+                    current_line = subtitles.LyricsLine()
+                    current_line_text = ""
+                    self.logger.debug("Reset current line")
+
+                current_line_text += (" " if current_line_text else "") + word["text"]
+                lyric_segment = subtitles.LyricSegment(
                     text=word["text"], ts=timedelta(seconds=word["start"]), end_ts=timedelta(seconds=word["end"])
                 )
-                line.segments.append(segment)
-
-                # If word is last in the line, add line to screen and start new line
-                # Before looping to the next word
-                if word_index == num_words_in_segment - 1:
-                    self.logger.debug(f"word_index is last in segment, adding line to screen and starting new line")
-                    screen.lines.append(line)
-                    lines_in_current_screen += 1
-                    line = None
-
-                # If current screen has 2 lines already, add screen to list and start new screen
-                # Before looping to the next line
-                if lines_in_current_screen == 2:
-                    self.logger.debug(f"lines_in_current_screen is 2, adding screen to list and starting new screen")
-                    screens.append(screen)
-                    screen = None
-                    lines_in_current_screen = 0
+                current_line.segments.append(lyric_segment)
+                self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")
 
-        if line is not None:
-            screen.lines.append(line)  # type: ignore[union-attr]
-        if screen is not None and len(screen.lines) > 0:
-            screens.append(screen)  # type: ignore[arg-type]
+            if current_line.segments:
+                screen.lines.append(current_line)
+                self.logger.debug(f"Added final line of segment to screen. Lines on current screen: {len(screen.lines)}")
 
+        self.logger.debug(f"Finished creating screens. Total screens created: {len(screens)}")
         return screens
 
     def write_ass_file():
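The rewritten screen builder replaces the old fixed rule (one line per Whisper segment, two lines per screen) with greedy wrapping: a line breaks when the next word would push it past 36 characters, or when a capitalized word arrives (taken as the likely start of a new phrase), and screens now hold up to four lines. A simplified sketch of just the wrapping decision, with the screen bookkeeping stripped out:

```python
def wrap_words(words, max_line_length=36):
    # Break before a word that would overflow the line, or before any
    # capitalized word once the line is non-empty (same test as above).
    lines, current = [], ""
    for word in words:
        if current and (len(current) + 1 + len(word) > max_line_length or word[0].isupper()):
            lines.append(current)
            current = ""
        current += (" " if current else "") + word
    if current:
        lines.append(current)
    return lines

# The capitalization rule also fires on "I" and proper nouns:
print(wrap_words("well I heard there was a secret chord That David played".split()))
# ['well', 'I heard there was a secret chord', 'That', 'David played']
```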
@@ -760,7 +798,10 @@ class LyricsTranscriber:
 
     def write_transcribed_lyrics_plain_text(self):
         if self.outputs["transcription_data_dict"]:
-            transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + "-transcribed.txt")
+            transcription_cache_suffix = "-audioshake-transcribed.txt" if self.audioshake_api_token else "-whisper-transcribed.txt"
+            self.logger.debug(f"transcription_cache_suffix: {transcription_cache_suffix}")
+
+            transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + transcription_cache_suffix)
             self.outputs["transcribed_lyrics_text_filepath"] = transcribed_lyrics_text_filepath
 
             self.outputs["transcribed_lyrics_text"] = ""
@@ -773,8 +814,109 @@ class LyricsTranscriber:
         else:
             raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
 
+    def find_best_split_point(self, text, max_length):
+        self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
+        words = text.split()
+        mid_word_index = len(words) // 2
+        mid_point = len(" ".join(words[:mid_word_index]))
+        self.logger.debug(f"Mid point is at character {mid_point}")
+
+        # Check for a comma within one or two words of the middle word
+        if "," in text:
+            comma_indices = [i for i, char in enumerate(text) if char == ","]
+            self.logger.debug(f"Found commas at indices: {comma_indices}")
+            for index in comma_indices:
+                if abs(mid_point - index) < 20 and len(text[: index + 1].strip()) <= max_length:
+                    self.logger.debug(f"Choosing comma at index {index} as split point")
+                    return index + 1  # Include the comma in the first part
+
+        # Check for 'and'
+        if " and " in text:
+            and_indices = [m.start() for m in re.finditer(" and ", text)]
+            self.logger.debug(f"Found 'and' at indices: {and_indices}")
+            for index in sorted(and_indices, key=lambda x: abs(x - mid_point)):
+                if len(text[: index + len(" and ")].strip()) <= max_length:
+                    self.logger.debug(f"Choosing 'and' at index {index} as split point")
+                    return index + len(" and ")
+
+        # Check for words starting with a capital letter
+        capital_word_indices = [m.start() for m in re.finditer(r"\s[A-Z]", text)]
+        self.logger.debug(f"Found capital words at indices: {capital_word_indices}")
+        for index in sorted(capital_word_indices, key=lambda x: abs(x - mid_point)):
+            if index > 0 and len(text[:index].strip()) <= max_length:
+                self.logger.debug(f"Choosing capital word at index {index} as split point")
+                return index
+
+        # If no better split point is found, try splitting at the middle word
+        if len(words) > 2 and mid_word_index > 0:
+            split_at_middle = len(" ".join(words[:mid_word_index]))
+            if split_at_middle <= max_length:
+                self.logger.debug(f"Choosing middle word split at index {split_at_middle}")
+                return split_at_middle
+
+        # If the text is still too long, forcibly split at the maximum length
+        self.logger.debug(f"No suitable split point found, forcibly splitting at max_length {max_length}")
+        return max_length
+
+    def split_long_segments(self, segments, max_length):
+        self.logger.debug(f"Splitting long segments (max_length: {max_length})")
+        new_segments = []
+        for segment in segments:
+            text = segment["text"]
+            self.logger.debug(f"Processing segment: '{text}' (length: {len(text)})")
+            if len(text) <= max_length:
+                self.logger.debug("Segment is within max_length, keeping as is")
+                new_segments.append(segment)
+            else:
+                self.logger.debug("Segment exceeds max_length, splitting")
+                meta_words = segment["words"]
+                current_text = ""
+                current_start = segment["start"]
+                current_words = []
+
+                for i, meta in enumerate(meta_words):
+                    word = meta["text"]
+                    if current_text:
+                        current_text += " "
+                    current_text += word
+                    current_words.append(meta)
+
+                    should_split = len(current_text) > max_length or (i > 0 and word[0].isupper())
+                    if should_split:
+                        self.logger.debug(f"Splitting at: '{current_text}'")
+                        # If splitting due to capitalization, don't include the capitalized word
+                        if word[0].isupper() and len(current_text.strip()) > len(word):
+                            split_text = current_text[: -(len(word) + 1)].strip()
+                            current_words = current_words[:-1]
+                        else:
+                            split_text = current_text.strip()
+
+                        new_segment = {"text": split_text, "start": current_start, "end": current_words[-1]["end"], "words": current_words}
+                        new_segments.append(new_segment)
+                        self.logger.debug(f"Added new segment: {new_segment}")
+
+                        # Reset for next segment
+                        if word[0].isupper() and len(current_text.strip()) > len(word):
+                            current_text = word
+                            current_words = [meta]
+                        else:
+                            current_text = ""
+                            current_words = []
+                        current_start = meta["start"]
+
+                # Add any remaining text as a final segment
+                if current_text:
+                    self.logger.debug(f"Adding final segment: '{current_text}'")
+                    new_segments.append(
+                        {"text": current_text.strip(), "start": current_start, "end": segment["end"], "words": current_words}
+                    )
+
+        self.logger.debug(f"Splitting complete. Original segments: {len(segments)}, New segments: {len(new_segments)}")
+        return new_segments
+
     def transcribe(self):
-        self.outputs["transcription_data_filepath"] = self.get_cache_filepath(".json")
+        transcription_cache_suffix = "-audioshake" if self.audioshake_api_token else "-whisper"
+        self.outputs["transcription_data_filepath"] = self.get_cache_filepath(f"{transcription_cache_suffix}.json")
 
         whisper_cache_filepath = self.outputs["transcription_data_filepath"]
         if os.path.isfile(whisper_cache_filepath):
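Note: `split_long_segments` does its own greedy word-accumulation split (overflow past `max_length`, or a capitalized word after the first); `find_best_split_point`, with its comma/"and"/capital heuristics, is defined but never called anywhere in this diff. A usage sketch with an invented oversized segment (word timings are fabricated for illustration, and `transcriber` is assumed to be a constructed `LyricsTranscriber`):

```python
text = "I saw the light, and I knew right then that Everything would change"
words = text.split()

# Hypothetical word-level timings: 0.4s per word, back to back.
segment = {
    "text": text,
    "start": 0.0,
    "end": len(words) * 0.4,
    "words": [{"text": w, "start": i * 0.4, "end": (i + 1) * 0.4} for i, w in enumerate(words)],
}

for s in transcriber.split_long_segments([segment], max_length=36):
    print(f'{s["start"]:.1f}-{s["end"]:.1f}: {s["text"]}')
```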
@@ -783,15 +925,26 @@ class LyricsTranscriber:
             self.outputs["transcription_data_dict"] = json.load(cache_file)
             return
 
-        self.logger.debug(f"no cached transcription file found, running whisper transcribe with model: {self.transcription_model}")
-        audio = whisper.load_audio(self.audio_filepath)
-        model = whisper.load_model(self.transcription_model, device="cpu")
-        result = whisper.transcribe(model, audio, language="en")
+        if self.audioshake_api_token:
+            self.logger.debug(f"Using AudioShake API for transcription")
+            from .audioshake_transcriber import AudioShakeTranscriber
 
-        self.logger.debug(f"transcription complete, performing post-processing cleanup")
-
-        # Remove segments with no words, only music
-        result["segments"] = [segment for segment in result["segments"] if segment["text"].strip() != "Music"]
+            audioshake = AudioShakeTranscriber(self.audioshake_api_token, log_level=self.log_level)
+            result = audioshake.transcribe(self.audio_filepath)
+        else:
+            self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
+            audio = whisper.load_audio(self.audio_filepath)
+            model = whisper.load_model(self.transcription_model, device="cpu")
+            result = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+
+        # Remove segments with no words, only music
+        result["segments"] = [segment for segment in result["segments"] if segment["text"].strip() != "Music"]
+        self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(result['segments'])}")
+
+        # Split long segments
+        self.logger.debug("Starting to split long segments")
+        result["segments"] = self.split_long_segments(result["segments"], max_length=36)
+        self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(result['segments'])}")
 
         self.logger.debug(f"writing transcription data JSON to cache file: {whisper_cache_filepath}")
         with open(whisper_cache_filepath, "w") as cache_file:
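The extra keyword arguments on the Whisper call (`vad="auditok"`, `beam_size`, `temperature`, `best_of`) are whisper-timestamped decode options; the auditok-based voice-activity detection is what motivates the new `auditok (>=0.2)` dependency in this release. A standalone sketch of the same call, assuming the module imports `whisper_timestamped as whisper` and a model name of "medium" (the actual default isn't shown in this diff):

```python
import whisper_timestamped as whisper

audio = whisper.load_audio("song.flac")
model = whisper.load_model("medium", device="cpu")

# Same decode options as the hunk above: auditok VAD plus beam search decoding.
result = whisper.transcribe(
    model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5
)
print(len(result["segments"]), "segments")
```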
--- lyrics_transcriber-0.14.0/lyrics_transcriber/utils/cli.py
+++ lyrics_transcriber-0.16.0/lyrics_transcriber/utils/cli.py
@@ -34,6 +34,11 @@ def main():
         default=None,
         help="Optional: song title for lyrics lookup and auto-correction",
     )
+    parser.add_argument(
+        "--audioshake_api_token",
+        default=None,
+        help="Optional: AudioShake API token for lyrics transcription and alignment. Can also be set with AUDIOSHAKE_API_TOKEN env var.",
+    )
     parser.add_argument(
         "--genius_api_token",
         default=None,
@@ -77,7 +82,7 @@ def main():
 
     parser.add_argument(
         "--video_resolution",
-        default="4k",
+        default="360p",
         help="Optional: resolution of the karaoke video to render. Must be one of: 4k, 1080p, 720p, 360p. Default: 360p",
     )
 
@@ -93,6 +98,12 @@ def main():
         help="Optional: color to use for karaoke video background, in hex format or FFmpeg color name. Default: black",
     )
 
+    parser.add_argument(
+        "--openai_api_key",
+        default=None,
+        help="Optional: OpenAI API key for LLM model usage. Can also be set with OPENAI_API_KEY env var.",
+    )
+
     args = parser.parse_args()
 
     log_level = getattr(logging, args.log_level.upper())
@@ -114,8 +125,10 @@ def main():
 
     transcriber = LyricsTranscriber(
         args.audio_filepath,
+        audioshake_api_token=args.audioshake_api_token,
         genius_api_token=args.genius_api_token,
         spotify_cookie=args.spotify_cookie,
+        openai_api_key=args.openai_api_key,
         artist=args.artist,
         title=args.title,
         output_dir=args.output_dir,
--- lyrics_transcriber-0.14.0/lyrics_transcriber/utils/subtitles.py
+++ lyrics_transcriber-0.16.0/lyrics_transcriber/utils/subtitles.py
@@ -5,6 +5,7 @@ import json
 import itertools
 from pathlib import Path
 from enum import IntEnum
+import logging
 
 from . import ass
@@ -85,21 +86,19 @@ class LyricsLine:
     def __str__(self):
         return "".join([f"{{{s.text}}}" for s in self.segments])
 
-    def as_ass_event(
-        self,
-        screen_start: timedelta,
-        screen_end: timedelta,
-        style: ass.ASS.Style,
-        top_margin: int,
-    ):
+    def as_ass_event(self, screen_start: timedelta, screen_end: timedelta, style: ass.ASS.Style, y_position: int):
         e = ass.ASS.Event()
         e.type = "Dialogue"
         e.Layer = 0
         e.Style = style
         e.Start = screen_start.total_seconds()
         e.End = screen_end.total_seconds()
-        e.MarginV = top_margin
+        e.MarginV = y_position
         e.Text = self.decorate_ass_line(self.segments, screen_start)
+
+        # Set alignment to top-center
+        e.Text = "{\an8}" + e.Text
+
         return e
 
     def decorate_ass_line(self, segments, screen_start_ts: timedelta):
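`{\an8}` is a standard ASS override tag anchoring the event at numpad position 8 (top-center); combined with `MarginV` now carrying an absolute per-line y-position, each lyric line is pinned to an exact vertical offset rather than being centered as a block by the renderer. A tiny sketch of the resulting event text (the karaoke timing tags are hypothetical stand-ins for whatever `decorate_ass_line` emits):

```python
# The override tag is prepended to the already-decorated karaoke text:
decorated = r"{\kf25}Example {\kf30}lyric"  # hypothetical karaoke timing tags
event_text = r"{\an8}" + decorated
print(event_text)  # {\an8}{\kf25}Example {\kf30}lyric
```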
@@ -137,6 +136,7 @@ class LyricsScreen:
     start_ts: Optional[timedelta] = None
     video_size: Tuple[int, int] = None
     line_height: int = None
+    logger: logging.Logger = None
 
     @property
     def end_ts(self) -> timedelta:
@@ -145,10 +145,36 @@ class LyricsScreen:
     def get_line_y(self, line_num: int) -> int:
         _, h = self.video_size
         line_count = len(self.lines)
-        return (h / 2) - (line_count * self.line_height / 2) + (line_num * self.line_height)
+        total_height = line_count * self.line_height
+
+        # Calculate the top margin to center the lyrics block
+        top_margin = (h - total_height) / 2
+
+        # Calculate the y-position for this specific line
+        line_y = top_margin + (line_num * self.line_height)
+
+        # if self.logger:
+        #     self.logger.debug(f"Line {line_num + 1} positioning:")
+        #     self.logger.debug(f"  Video height: {h}")
+        #     self.logger.debug(f"  Total lines: {line_count}")
+        #     self.logger.debug(f"  Line height: {self.line_height}")
+        #     self.logger.debug(f"  Total lyrics height: {total_height}")
+        #     self.logger.debug(f"  Top margin: {top_margin}")
+        #     self.logger.debug(f"  Line y: {line_y}")
+
+        return int(line_y)
 
     def as_ass_events(self, style: ass.ASS.Style) -> List[ass.ASS.Event]:
-        return [line.as_ass_event(self.start_ts, self.end_ts, style, self.get_line_y(i)) for i, line in enumerate(self.lines)]
+        events = []
+        for i, line in enumerate(self.lines):
+            y_position = self.get_line_y(i)
+
+            # if self.logger:
+            #     self.logger.debug(f"Creating ASS event for line {i + 1} at y-position: {y_position}")
+
+            event = line.as_ass_event(self.start_ts, self.end_ts, style, y_position)
+            events.append(event)
+        return events
 
     def __str__(self):
         lines = [f"{self.start_ts} - {self.end_ts}:"]
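The new math is the same centering as before, just decomposed into named steps and truncated to `int`. A worked example with assumed numbers (360p render height; the actual `line_height` value isn't shown in this diff):

```python
h = 360           # 360p video height (assumed)
line_height = 50  # hypothetical value for illustration
line_count = 4    # matches max_lines_per_screen in create_screens

total_height = line_count * line_height  # 200
top_margin = (h - total_height) / 2      # 80.0

for line_num in range(line_count):
    line_y = top_margin + line_num * line_height
    print(line_num, int(line_y))  # 0 80, 1 130, 2 180, 3 230
```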
@@ -264,7 +290,7 @@ def create_styled_subtitles(
     style.BorderStyle = 1
     style.Outline = 1
     style.Shadow = 0
-    style.Alignment = ass.ASS.ALIGN_MIDDLE_CENTER
+    style.Alignment = ass.ASS.ALIGN_TOP_CENTER
     style.MarginL = 0
     style.MarginR = 0
     style.MarginV = 0
--- lyrics_transcriber-0.14.0/pyproject.toml
+++ lyrics_transcriber-0.16.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.14.0"
+version = "0.16.0"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
 license = "MIT"
@@ -28,6 +28,7 @@ syrics = ">=0"
 openai = "^1"
 openai-whisper = ">=20231117"
 transformers = ">=4"
+auditok = ">=0.2"
 whisper-timestamped = ">=1"
 # Note: after adding openai-whisper and whisper-timestamped with poetry lock, I then removed all traces of triton
 # from poetry.lock before running poetry install, as triton doesn't support macOS but isn't actually needed for whisper.