lyrics-transcriber 0.15.0__tar.gz → 0.16.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/PKG-INFO +22 -2
  2. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/README.md +21 -1
  3. lyrics_transcriber-0.16.1/lyrics_transcriber/audioshake_transcriber.py +35 -0
  4. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/transcriber.py +236 -86
  5. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/utils/cli.py +14 -1
  6. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/utils/subtitles.py +37 -11
  7. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/pyproject.toml +1 -1
  8. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/LICENSE +0 -0
  9. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/__init__.py +0 -0
  10. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/llm_prompts/README.md +0 -0
  11. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -0
  12. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -0
  13. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -0
  14. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -0
  15. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -0
  16. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/utils/__init__.py +0 -0
  17. {lyrics_transcriber-0.15.0 → lyrics_transcriber-0.16.1}/lyrics_transcriber/utils/ass.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: lyrics-transcriber
- Version: 0.15.0
+ Version: 0.16.1
  Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
  Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
  License: MIT
@@ -53,7 +53,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
  ### Prerequisites

  - Python 3.9 or higher
- - [Optional] A Genius API token if you want to fetch lyrics from Genius
+ - [Optional] Genius API token if you want to fetch lyrics from Genius
+ - [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+ - [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+ - [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription

  ```
  pip install lyrics-transcriber
@@ -62,6 +65,23 @@ pip install lyrics-transcriber
  > **Warning**
  > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency from openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)

+ ## Docker
+
+ You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` on Docker hub if you want, here's an example:
+
+ ```sh
+ docker run \
+     -v `pwd`/input:/input \
+     -v `pwd`/output:/output \
+     beveradb/lyrics-transcriber:0.16.0 \
+     --log_level debug \
+     --output_dir /output \
+     --render_video \
+     --video_background_image /input/your-background-image.png \
+     --video_resolution 360p \
+     /input/song.flac
+ ```
+
  ## Usage 🚀

  ### As a standalone CLI
@@ -17,7 +17,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
  ### Prerequisites

  - Python 3.9 or higher
- - [Optional] A Genius API token if you want to fetch lyrics from Genius
+ - [Optional] Genius API token if you want to fetch lyrics from Genius
+ - [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+ - [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+ - [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription

  ```
  pip install lyrics-transcriber
@@ -26,6 +29,23 @@ pip install lyrics-transcriber
  > **Warning**
  > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency from openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)

+ ## Docker
+
+ You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` on Docker hub if you want, here's an example:
+
+ ```sh
+ docker run \
+     -v `pwd`/input:/input \
+     -v `pwd`/output:/output \
+     beveradb/lyrics-transcriber:0.16.0 \
+     --log_level debug \
+     --output_dir /output \
+     --render_video \
+     --video_background_image /input/your-background-image.png \
+     --video_resolution 360p \
+     /input/song.flac
+ ```
+
  ## Usage 🚀

  ### As a standalone CLI
@@ -0,0 +1,35 @@
+ import logging
+ import requests
+
+
+ class AudioShakeTranscriber:
+     def __init__(self, api_token, log_level=logging.DEBUG):
+         self.api_token = api_token
+         self.logger = logging.getLogger(__name__)
+         self.logger.setLevel(log_level)
+
+     def transcribe(self, audio_filepath):
+         # This is a placeholder for the actual AudioShake API implementation
+         self.logger.info(f"Transcribing {audio_filepath} using AudioShake API")
+
+         self.logger.debug(f"AudioShake API token: {self.api_token}")
+         # TODO: Implement the actual API call to AudioShake
+         # For now, we'll return a dummy result
+         return {
+             "transcription_data_dict": {
+                 "segments": [
+                     {
+                         "start": 0,
+                         "end": 5,
+                         "text": "This is a dummy transcription",
+                         "words": [
+                             {"text": "This", "start": 0, "end": 1},
+                             {"text": "is", "start": 1, "end": 2},
+                             {"text": "a", "start": 2, "end": 3},
+                             {"text": "dummy", "start": 3, "end": 4},
+                             {"text": "transcription", "start": 4, "end": 5},
+                         ],
+                     }
+                 ]
+             }
+         }
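For orientation, the placeholder above fixes the result schema the rest of the pipeline consumes: a `transcription_data_dict` containing segments with word-level `start`/`end` timestamps. A minimal standalone sketch of walking that structure (not part of the package):

```python
# Walk the result shape returned by AudioShakeTranscriber.transcribe() above.
result = {
    "transcription_data_dict": {
        "segments": [
            {
                "start": 0,
                "end": 5,
                "text": "This is a dummy transcription",
                "words": [{"text": "This", "start": 0, "end": 1}, {"text": "is", "start": 1, "end": 2}],
            }
        ]
    }
}

for segment in result["transcription_data_dict"]["segments"]:
    print(f"[{segment['start']}s-{segment['end']}s] {segment['text']}")
    for word in segment["words"]:
        print(f"  {word['start']}s-{word['end']}s {word['text']}")
```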
@@ -22,6 +22,8 @@ class LyricsTranscriber:
          audio_filepath,
          artist=None,
          title=None,
+         openai_api_key=None,
+         audioshake_api_token=None,
          genius_api_token=None,
          spotify_cookie=None,
          output_dir=None,
@@ -59,23 +61,30 @@ class LyricsTranscriber:
          self.title = title
          self.song_known = self.artist is not None and self.title is not None

+         self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
          self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
          self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
+         self.audioshake_api_token = os.getenv("AUDIOSHAKE_TOKEN", default=audioshake_api_token)

          self.transcription_model = transcription_model
          self.llm_model = llm_model
          self.llm_prompt_matching = llm_prompt_matching
          self.llm_prompt_correction = llm_prompt_correction

-         self.openai_client = OpenAI()
+         self.openai_client = None

-         # Uncomment for local models e.g. with ollama
-         # self.openai_client = OpenAI(
-         #     base_url="http://localhost:11434/v1",
-         #     api_key="ollama",
-         # )
+         if self.openai_api_key:
+             self.openai_client = OpenAI(api_key=self.openai_api_key)

-         self.openai_client.log = self.log_level
+             # Uncomment for local models e.g. with ollama
+             # self.openai_client = OpenAI(
+             #     base_url="http://localhost:11434/v1",
+             #     api_key="ollama",
+             # )
+
+             self.openai_client.log = self.log_level
+         else:
+             self.logger.error("No OpenAI API key found, no correction will be applied to transcription")

          self.render_video = render_video
          self.video_resolution = video_resolution
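Note the precedence these `os.getenv(..., default=...)` lookups give: an exported environment variable always wins, and the constructor argument is only the fallback. (Also worth noticing: the code reads `AUDIOSHAKE_TOKEN`, while the CLI help text added further down in this diff refers to `AUDIOSHAKE_API_TOKEN`.) A standalone illustration:

```python
import os

# os.getenv returns the environment value when the variable is set,
# and the default otherwise, so OPENAI_API_KEY in the environment
# overrides a key passed to the constructor.
os.environ["OPENAI_API_KEY"] = "env-key"
print(os.getenv("OPENAI_API_KEY", default="constructor-key"))  # env-key

del os.environ["OPENAI_API_KEY"]
print(os.getenv("OPENAI_API_KEY", default="constructor-key"))  # constructor-key
```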
@@ -137,9 +146,6 @@ class LyricsTranscriber:
          if self.audio_filepath is None:
              raise Exception("audio_filepath must be specified as the input source to transcribe")

-         if not self.song_known:
-             raise Exception("cannot correct song lyrics without artist and title to fetch lyrics")
-
          self.create_folders()

      def generate(self):
@@ -154,8 +160,13 @@

          self.validate_lyrics_match_song()

-         self.write_corrected_lyrics_data_file()
-         self.write_corrected_lyrics_plain_text()
+         if self.openai_client:
+             self.write_corrected_lyrics_data_file()
+             self.write_corrected_lyrics_plain_text()
+         else:
+             self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+             self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+             self.write_corrected_lyrics_plain_text()

          self.calculate_singing_percentage()

@@ -169,7 +180,8 @@
          self.copy_files_to_output_dir()
          self.calculate_llm_costs()

-         self.openai_client.close()
+         if self.openai_client:
+             self.openai_client.close()

          return self.outputs

@@ -198,41 +210,55 @@
              online_lyrics_text_key = f"{online_lyrics_source}_lyrics_text"
              online_lyrics_filepath_key = f"{online_lyrics_source}_lyrics_filepath"

-             if online_lyrics_text_key not in self.outputs:
+             if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
                  continue

-             data_input_str = (
-                 f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
-             )
-
-             # self.logger.debug(f"system_prompt:\n{system_prompt}\ndata_input_str:\n{data_input_str}")
-
-             self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
-             response = self.openai_client.chat.completions.create(
-                 model=self.llm_model,
-                 messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
-             )
+             if self.openai_client:
+                 data_input_str = (
+                     f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
+                 )

-             message = response.choices[0].message.content
-             finish_reason = response.choices[0].finish_reason
+                 self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
+                 response = self.openai_client.chat.completions.create(
+                     model=self.llm_model,
+                     messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
+                 )

-             self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
-             self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
+                 message = response.choices[0].message.content
+                 finish_reason = response.choices[0].finish_reason

-             # self.logger.debug(f"LLM API response finish_reason: {finish_reason} message: \n{message}")
+                 self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
+                 self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens

-             if finish_reason == "stop":
-                 if message == "Yes":
-                     self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
+                 if finish_reason == "stop":
+                     if message == "Yes":
+                         self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
+                         at_least_one_online_lyrics_validated = True
+                     elif message == "No":
+                         self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
+                         self.outputs[online_lyrics_text_key] = None
+                         self.outputs[online_lyrics_filepath_key] = None
+                     else:
+                         self.logger.error(f"Unexpected response from LLM: {message}")
+                 else:
+                     self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
+             else:
+                 # Fallback primitive word matching
+                 self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
+                 transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
+                 online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
+                 common_words = transcribed_words & online_lyrics_words
+                 match_percentage = len(common_words) / len(online_lyrics_words) * 100
+
+                 if match_percentage >= 50:
+                     self.logger.info(
+                         f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
+                     )
                      at_least_one_online_lyrics_validated = True
-                 elif message == "No":
+                 else:
                      self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
                      self.outputs[online_lyrics_text_key] = None
                      self.outputs[online_lyrics_filepath_key] = None
-                 else:
-                     self.logger.error(f"Unexpected response from LLM: {message}")
-             else:
-                 self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")

          self.logger.info(
              f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
@@ -242,9 +268,12 @@
              self.logger.error(
                  f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
              )
-             raise Exception("Cannot proceed without internet lyrics to validate / correct transcription")

      def write_corrected_lyrics_data_file(self):
+         if not self.openai_client:
+             self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+             return
+
          self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")

          corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + "-corrected.json")
@@ -583,51 +612,57 @@
                  f.write(line)

      def create_screens(self):
-         self.logger.debug(f"create_screens beginning generation of screens from whisper results")
+         self.logger.debug("create_screens beginning generation of screens from whisper results")
          screens: List[subtitles.LyricsScreen] = []
-         line: Optional[subtitles.LyricsLine] = None
          screen: Optional[subtitles.LyricsScreen] = None

-         lines_in_current_screen = 0
+         max_lines_per_screen = 4
+         max_line_length = 36  # Maximum characters per line
+         self.logger.debug(f"Max lines per screen: {max_lines_per_screen}, Max line length: {max_line_length}")
+
          for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
-             self.logger.debug(f"lines_in_current_screen: {lines_in_current_screen} segment: {segment['text']}")
-             if screen is None:
-                 self.logger.debug(f"screen is none, creating new LyricsScreen")
-                 screen = subtitles.LyricsScreen()
-                 screen.video_size = self.video_resolution_num
-                 screen.line_height = self.line_height
-             if line is None:
-                 self.logger.debug(f"line is none, creating new LyricsLine")
-                 line = subtitles.LyricsLine()
-
-             num_words_in_segment = len(segment["words"])
-             for word_index, word in enumerate(segment["words"]):
-                 segment = subtitles.LyricSegment(
+             self.logger.debug(f"Processing segment: {segment['text']}")
+             if screen is None or len(screen.lines) >= max_lines_per_screen:
+                 screen = subtitles.LyricsScreen(video_size=self.video_resolution_num, line_height=self.line_height, logger=self.logger)
+                 screens.append(screen)
+                 self.logger.debug(f"Created new screen. Total screens: {len(screens)}")
+
+             words = segment["words"]
+             current_line = subtitles.LyricsLine()
+             current_line_text = ""
+             self.logger.debug(f"Processing {len(words)} words in segment")
+
+             for word in words:
+                 self.logger.debug(f"Processing word: '{word['text']}'")
+                 if len(current_line_text) + len(word["text"]) + 1 > max_line_length or (current_line_text and word["text"][0].isupper()):
+                     self.logger.debug(f"Current line would exceed max length or new capitalized word. Line: '{current_line_text}'")
+                     if current_line.segments:
+                         screen.lines.append(current_line)
+                         self.logger.debug(f"Added line to screen. Lines on current screen: {len(screen.lines)}")
+                         if len(screen.lines) >= max_lines_per_screen:
+                             screen = subtitles.LyricsScreen(
+                                 video_size=self.video_resolution_num,
+                                 line_height=self.line_height,
+                                 logger=self.logger,
+                             )
+                             screens.append(screen)
+                             self.logger.debug(f"Screen full, created new screen. Total screens: {len(screens)}")
+                     current_line = subtitles.LyricsLine()
+                     current_line_text = ""
+                     self.logger.debug("Reset current line")
+
+                 current_line_text += (" " if current_line_text else "") + word["text"]
+                 lyric_segment = subtitles.LyricSegment(
                      text=word["text"], ts=timedelta(seconds=word["start"]), end_ts=timedelta(seconds=word["end"])
                  )
-                 line.segments.append(segment)
-
-                 # If word is last in the line, add line to screen and start new line
-                 # Before looping to the next word
-                 if word_index == num_words_in_segment - 1:
-                     self.logger.debug(f"word_index is last in segment, adding line to screen and starting new line")
-                     screen.lines.append(line)
-                     lines_in_current_screen += 1
-                     line = None
-
-                 # If current screen has 2 lines already, add screen to list and start new screen
-                 # Before looping to the next line
-                 if lines_in_current_screen == 2:
-                     self.logger.debug(f"lines_in_current_screen is 2, adding screen to list and starting new screen")
-                     screens.append(screen)
-                     screen = None
-                     lines_in_current_screen = 0
+                 current_line.segments.append(lyric_segment)
+                 self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")

-         if line is not None:
-             screen.lines.append(line)  # type: ignore[union-attr]
-         if screen is not None and len(screen.lines) > 0:
-             screens.append(screen)  # type: ignore[arg-type]
+             if current_line.segments:
+                 screen.lines.append(current_line)
+                 self.logger.debug(f"Added final line of segment to screen. Lines on current screen: {len(screen.lines)}")

+         self.logger.debug(f"Finished creating screens. Total screens created: {len(screens)}")
          return screens

      def write_ass_file(self):
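The rewritten create_screens is a greedy word-wrapper: words accumulate until the 36-character budget would be exceeded, or until a capitalized word suggests a new phrase, and full screens roll over at four lines. A minimal sketch of just the wrapping rule, on plain strings rather than LyricsLine/LyricsScreen objects:

```python
def wrap_words(words, max_line_length=36):
    """Greedy wrap mirroring the rule above: break before a word that would
    overflow the budget, or before a capitalized word mid-line."""
    lines, current = [], ""
    for word in words:
        overflow = len(current) + len(word) + 1 > max_line_length
        new_phrase = bool(current) and word[0].isupper()
        if current and (overflow or new_phrase):
            lines.append(current)
            current = ""
        current += (" " if current else "") + word
    if current:
        lines.append(current)
    return lines

# Capitalized words force breaks, including the pronoun "I".
print(wrap_words("now I know that I can never Hold you".split()))
# ['now', 'I know that', 'I can never', 'Hold you']
```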
@@ -760,7 +795,10 @@

      def write_transcribed_lyrics_plain_text(self):
          if self.outputs["transcription_data_dict"]:
-             transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + "-transcribed.txt")
+             transcription_cache_suffix = "-audioshake-transcribed.txt" if self.audioshake_api_token else "-whisper-transcribed.txt"
+             self.logger.debug(f"transcription_cache_suffix: {transcription_cache_suffix}")
+
+             transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + transcription_cache_suffix)
              self.outputs["transcribed_lyrics_text_filepath"] = transcribed_lyrics_text_filepath

              self.outputs["transcribed_lyrics_text"] = ""
@@ -773,8 +811,109 @@
          else:
              raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")

+     def find_best_split_point(self, text, max_length):
+         self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
+         words = text.split()
+         mid_word_index = len(words) // 2
+         mid_point = len(" ".join(words[:mid_word_index]))
+         self.logger.debug(f"Mid point is at character {mid_point}")
+
+         # Check for a comma within one or two words of the middle word
+         if "," in text:
+             comma_indices = [i for i, char in enumerate(text) if char == ","]
+             self.logger.debug(f"Found commas at indices: {comma_indices}")
+             for index in comma_indices:
+                 if abs(mid_point - index) < 20 and len(text[: index + 1].strip()) <= max_length:
+                     self.logger.debug(f"Choosing comma at index {index} as split point")
+                     return index + 1  # Include the comma in the first part
+
+         # Check for 'and'
+         if " and " in text:
+             and_indices = [m.start() for m in re.finditer(" and ", text)]
+             self.logger.debug(f"Found 'and' at indices: {and_indices}")
+             for index in sorted(and_indices, key=lambda x: abs(x - mid_point)):
+                 if len(text[: index + len(" and ")].strip()) <= max_length:
+                     self.logger.debug(f"Choosing 'and' at index {index} as split point")
+                     return index + len(" and ")
+
+         # Check for words starting with a capital letter
+         capital_word_indices = [m.start() for m in re.finditer(r"\s[A-Z]", text)]
+         self.logger.debug(f"Found capital words at indices: {capital_word_indices}")
+         for index in sorted(capital_word_indices, key=lambda x: abs(x - mid_point)):
+             if index > 0 and len(text[:index].strip()) <= max_length:
+                 self.logger.debug(f"Choosing capital word at index {index} as split point")
+                 return index
+
+         # If no better split point is found, try splitting at the middle word
+         if len(words) > 2 and mid_word_index > 0:
+             split_at_middle = len(" ".join(words[:mid_word_index]))
+             if split_at_middle <= max_length:
+                 self.logger.debug(f"Choosing middle word split at index {split_at_middle}")
+                 return split_at_middle
+
+         # If the text is still too long, forcibly split at the maximum length
+         self.logger.debug(f"No suitable split point found, forcibly splitting at max_length {max_length}")
+         return max_length
+
+     def split_long_segments(self, segments, max_length):
+         self.logger.debug(f"Splitting long segments (max_length: {max_length})")
+         new_segments = []
+         for segment in segments:
+             text = segment["text"]
+             self.logger.debug(f"Processing segment: '{text}' (length: {len(text)})")
+             if len(text) <= max_length:
+                 self.logger.debug("Segment is within max_length, keeping as is")
+                 new_segments.append(segment)
+             else:
+                 self.logger.debug("Segment exceeds max_length, splitting")
+                 meta_words = segment["words"]
+                 current_text = ""
+                 current_start = segment["start"]
+                 current_words = []
+
+                 for i, meta in enumerate(meta_words):
+                     word = meta["text"]
+                     if current_text:
+                         current_text += " "
+                     current_text += word
+                     current_words.append(meta)
+
+                     should_split = len(current_text) > max_length or (i > 0 and word[0].isupper())
+                     if should_split:
+                         self.logger.debug(f"Splitting at: '{current_text}'")
+                         # If splitting due to capitalization, don't include the capitalized word
+                         if word[0].isupper() and len(current_text.strip()) > len(word):
+                             split_text = current_text[: -(len(word) + 1)].strip()
+                             current_words = current_words[:-1]
+                         else:
+                             split_text = current_text.strip()
+
+                         new_segment = {"text": split_text, "start": current_start, "end": current_words[-1]["end"], "words": current_words}
+                         new_segments.append(new_segment)
+                         self.logger.debug(f"Added new segment: {new_segment}")
+
+                         # Reset for next segment
+                         if word[0].isupper() and len(current_text.strip()) > len(word):
+                             current_text = word
+                             current_words = [meta]
+                         else:
+                             current_text = ""
+                             current_words = []
+                         current_start = meta["start"]
+
+                 # Add any remaining text as a final segment
+                 if current_text:
+                     self.logger.debug(f"Adding final segment: '{current_text}'")
+                     new_segments.append(
+                         {"text": current_text.strip(), "start": current_start, "end": segment["end"], "words": current_words}
+                     )
+
+         self.logger.debug(f"Splitting complete. Original segments: {len(segments)}, New segments: {len(new_segments)}")
+         return new_segments
+
      def transcribe(self):
-         self.outputs["transcription_data_filepath"] = self.get_cache_filepath(".json")
+         transcription_cache_suffix = "-audioshake" if self.audioshake_api_token else "-whisper"
+         self.outputs["transcription_data_filepath"] = self.get_cache_filepath(f"{transcription_cache_suffix}.json")

          whisper_cache_filepath = self.outputs["transcription_data_filepath"]
          if os.path.isfile(whisper_cache_filepath):
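find_best_split_point prefers, in order: a comma near the text's midpoint, an " and ", a capitalized word, the middle word, and finally a hard cut at max_length. A hand-traced example of the comma case (hypothetical input):

```python
text = "Hello darkness, my old friend"

# words -> mid_word_index 2, so mid_point = len("Hello darkness,") == 15.
# The comma sits at index 14: abs(15 - 14) < 20 and the first part
# (15 characters) fits max_length=20, so the method returns 14 + 1 == 15.
split = 15
print(text[:split])          # Hello darkness,
print(text[split:].strip())  # my old friend
```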
@@ -783,15 +922,26 @@
              self.outputs["transcription_data_dict"] = json.load(cache_file)
              return

-         self.logger.debug(f"no cached transcription file found, running whisper transcribe with model: {self.transcription_model}")
-         audio = whisper.load_audio(self.audio_filepath)
-         model = whisper.load_model(self.transcription_model, device="cpu")
-         result = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+         if self.audioshake_api_token:
+             self.logger.debug(f"Using AudioShake API for transcription")
+             from .audioshake_transcriber import AudioShakeTranscriber

-         self.logger.debug(f"transcription complete, performing post-processing cleanup")
-
-         # Remove segments with no words, only music
-         result["segments"] = [segment for segment in result["segments"] if segment["text"].strip() != "Music"]
+             audioshake = AudioShakeTranscriber(self.audioshake_api_token, log_level=self.log_level)
+             result = audioshake.transcribe(self.audio_filepath)
+         else:
+             self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
+             audio = whisper.load_audio(self.audio_filepath)
+             model = whisper.load_model(self.transcription_model, device="cpu")
+             result = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+
+             # Remove segments with no words, only music
+             result["segments"] = [segment for segment in result["segments"] if segment["text"].strip() != "Music"]
+             self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(result['segments'])}")
+
+             # Split long segments
+             self.logger.debug("Starting to split long segments")
+             result["segments"] = self.split_long_segments(result["segments"], max_length=36)
+             self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(result['segments'])}")

          self.logger.debug(f"writing transcription data JSON to cache file: {whisper_cache_filepath}")
          with open(whisper_cache_filepath, "w") as cache_file:
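One effect of the suffix changes: Whisper and AudioShake results now cache to different files, so switching providers cannot serve a stale transcription from the other. Illustratively (the surrounding `lyrics-<slug>` naming is an assumption here, since `get_cache_filepath` itself is not shown in this diff):

```python
from typing import Optional

def cache_filename(song_slug: str, audioshake_api_token: Optional[str]) -> str:
    # Same suffix selection as transcribe() above; the filename
    # format around it is assumed for illustration.
    suffix = "-audioshake" if audioshake_api_token else "-whisper"
    return f"lyrics-{song_slug}{suffix}.json"

print(cache_filename("abba-under-attack", None))     # lyrics-abba-under-attack-whisper.json
print(cache_filename("abba-under-attack", "token"))  # lyrics-abba-under-attack-audioshake.json
```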
@@ -34,6 +34,11 @@ def main():
          default=None,
          help="Optional: song title for lyrics lookup and auto-correction",
      )
+     parser.add_argument(
+         "--audioshake_api_token",
+         default=None,
+         help="Optional: AudioShake API token for lyrics transcription and alignment. Can also be set with AUDIOSHAKE_API_TOKEN env var.",
+     )
      parser.add_argument(
          "--genius_api_token",
          default=None,
@@ -77,7 +82,7 @@

      parser.add_argument(
          "--video_resolution",
-         default="4k",
+         default="360p",
          help="Optional: resolution of the karaoke video to render. Must be one of: 4k, 1080p, 720p, 360p. Default: 360p",
      )

@@ -93,6 +98,12 @@
          help="Optional: color to use for karaoke video background, in hex format or FFmpeg color name. Default: black",
      )

+     parser.add_argument(
+         "--openai_api_key",
+         default=None,
+         help="Optional: OpenAI API key for LLM model usage. Can also be set with OPENAI_API_KEY env var.",
+     )
+
      args = parser.parse_args()

      log_level = getattr(logging, args.log_level.upper())
@@ -114,8 +125,10 @@

      transcriber = LyricsTranscriber(
          args.audio_filepath,
+         audioshake_api_token=args.audioshake_api_token,
          genius_api_token=args.genius_api_token,
          spotify_cookie=args.spotify_cookie,
+         openai_api_key=args.openai_api_key,
          artist=args.artist,
          title=args.title,
          output_dir=args.output_dir,
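Combining the CLI wiring above with the constructor changes in transcriber.py, programmatic use would look roughly like this. The keyword arguments come from the diffs above; the import path, file paths, artist, and title are illustrative assumptions:

```python
from lyrics_transcriber.transcriber import LyricsTranscriber

# Tokens may instead come from the OPENAI_API_KEY / AUDIOSHAKE_TOKEN
# environment variables, which take precedence over these arguments.
transcriber = LyricsTranscriber(
    "/input/song.flac",
    artist="ABBA",
    title="Under Attack",
    openai_api_key="sk-...",    # optional: enables LLM correction
    audioshake_api_token=None,  # optional: None falls back to Whisper
    genius_api_token=None,
    spotify_cookie=None,
    output_dir="/output",
)
outputs = transcriber.generate()
```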
@@ -5,6 +5,7 @@ import json
  import itertools
  from pathlib import Path
  from enum import IntEnum
+ import logging

  from . import ass

@@ -85,21 +86,19 @@ class LyricsLine:
      def __str__(self):
          return "".join([f"{{{s.text}}}" for s in self.segments])

-     def as_ass_event(
-         self,
-         screen_start: timedelta,
-         screen_end: timedelta,
-         style: ass.ASS.Style,
-         top_margin: int,
-     ):
+     def as_ass_event(self, screen_start: timedelta, screen_end: timedelta, style: ass.ASS.Style, y_position: int):
          e = ass.ASS.Event()
          e.type = "Dialogue"
          e.Layer = 0
          e.Style = style
          e.Start = screen_start.total_seconds()
          e.End = screen_end.total_seconds()
-         e.MarginV = top_margin
+         e.MarginV = y_position
          e.Text = self.decorate_ass_line(self.segments, screen_start)
+
+         # Set alignment to top-center
+         e.Text = "{\\an8}" + e.Text
+
          return e

      def decorate_ass_line(self, segments, screen_start_ts: timedelta):
@@ -137,6 +136,7 @@ class LyricsScreen:
      start_ts: Optional[timedelta] = None
      video_size: Tuple[int, int] = None
      line_height: int = None
+     logger: logging.Logger = None

      @property
      def end_ts(self) -> timedelta:
@@ -145,10 +145,36 @@
      def get_line_y(self, line_num: int) -> int:
          _, h = self.video_size
          line_count = len(self.lines)
-         return (h / 2) - (line_count * self.line_height / 2) + (line_num * self.line_height)
+         total_height = line_count * self.line_height
+
+         # Calculate the top margin to center the lyrics block
+         top_margin = (h - total_height) / 2
+
+         # Calculate the y-position for this specific line
+         line_y = top_margin + (line_num * self.line_height)
+
+         # if self.logger:
+         #     self.logger.debug(f"Line {line_num + 1} positioning:")
+         #     self.logger.debug(f" Video height: {h}")
+         #     self.logger.debug(f" Total lines: {line_count}")
+         #     self.logger.debug(f" Line height: {self.line_height}")
+         #     self.logger.debug(f" Total lyrics height: {total_height}")
+         #     self.logger.debug(f" Top margin: {top_margin}")
+         #     self.logger.debug(f" Line y: {line_y}")
+
+         return int(line_y)

      def as_ass_events(self, style: ass.ASS.Style) -> List[ass.ASS.Event]:
-         return [line.as_ass_event(self.start_ts, self.end_ts, style, self.get_line_y(i)) for i, line in enumerate(self.lines)]
+         events = []
+         for i, line in enumerate(self.lines):
+             y_position = self.get_line_y(i)
+
+             # if self.logger:
+             #     self.logger.debug(f"Creating ASS event for line {i + 1} at y-position: {y_position}")
+
+             event = line.as_ass_event(self.start_ts, self.end_ts, style, y_position)
+             events.append(event)
+         return events

      def __str__(self):
          lines = [f"{self.start_ts} - {self.end_ts}:"]
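A worked example of the new get_line_y centering, with hypothetical numbers (360-pixel-high video, 50-pixel line height, four lines). Because the style is switched to top alignment below and each event now carries an `{\an8}` override, the returned value acts as an offset from the top of the frame:

```python
h, line_height, line_count = 360, 50, 4  # hypothetical values

total_height = line_count * line_height  # 200
top_margin = (h - total_height) / 2      # 80.0
for line_num in range(line_count):
    line_y = top_margin + (line_num * line_height)
    print(int(line_y))  # 80, 130, 180, 230
```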
@@ -264,7 +290,7 @@ def create_styled_subtitles(
      style.BorderStyle = 1
      style.Outline = 1
      style.Shadow = 0
-     style.Alignment = ass.ASS.ALIGN_MIDDLE_CENTER
+     style.Alignment = ass.ASS.ALIGN_TOP_CENTER
      style.MarginL = 0
      style.MarginR = 0
      style.MarginV = 0
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "lyrics-transcriber"
- version = "0.15.0"
+ version = "0.16.1"
  description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
  authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
  license = "MIT"