lyrics-transcriber 0.15.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/audioshake_transcriber.py +35 -0
- lyrics_transcriber/transcriber.py +236 -86
- lyrics_transcriber/utils/cli.py +14 -1
- lyrics_transcriber/utils/subtitles.py +37 -11
- {lyrics_transcriber-0.15.0.dist-info → lyrics_transcriber-0.16.1.dist-info}/METADATA +22 -2
- {lyrics_transcriber-0.15.0.dist-info → lyrics_transcriber-0.16.1.dist-info}/RECORD +9 -8
- {lyrics_transcriber-0.15.0.dist-info → lyrics_transcriber-0.16.1.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.15.0.dist-info → lyrics_transcriber-0.16.1.dist-info}/WHEEL +0 -0
- {lyrics_transcriber-0.15.0.dist-info → lyrics_transcriber-0.16.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
import logging
|
2
|
+
import requests
|
3
|
+
|
4
|
+
|
5
|
+
class AudioShakeTranscriber:
    """Placeholder client for transcribing audio through the AudioShake API.

    The real API call is not implemented yet: ``transcribe`` only logs what it
    was asked to do and returns a fixed dummy transcription.
    """

    def __init__(self, api_token, log_level=logging.DEBUG):
        """Store the API token and configure this module's logger.

        Args:
            api_token: AudioShake API token (a secret; it is never logged).
            log_level: Level applied to the module logger. Defaults to
                ``logging.DEBUG``.
        """
        self.api_token = api_token
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_level)

    def transcribe(self, audio_filepath):
        """Return a dummy transcription for ``audio_filepath``.

        Args:
            audio_filepath: Path of the audio file to transcribe. It is only
                logged; the file is not read by this placeholder.

        Returns:
            A dict with a single ``"transcription_data_dict"`` key holding a
            ``"segments"`` list; each segment has ``start``, ``end``, ``text``
            and a ``words`` list of ``{"text", "start", "end"}`` dicts.
        """
        # This is a placeholder for the actual AudioShake API implementation
        self.logger.info(f"Transcribing {audio_filepath} using AudioShake API")

        # Log only whether a token is configured: the token is a credential and
        # debug logs are routinely shared or persisted.
        token_state = "<redacted>" if self.api_token else "<not set>"
        self.logger.debug(f"AudioShake API token: {token_state}")

        # TODO: Implement the actual API call to AudioShake
        # NOTE(review): the visible caller (LyricsTranscriber.transcribe) reads
        # result["segments"] at the top level, while this dummy nests the
        # segments under "transcription_data_dict" - confirm the intended shape
        # before wiring in the real API.
        # For now, we'll return a dummy result
        return {
            "transcription_data_dict": {
                "segments": [
                    {
                        "start": 0,
                        "end": 5,
                        "text": "This is a dummy transcription",
                        "words": [
                            {"text": "This", "start": 0, "end": 1},
                            {"text": "is", "start": 1, "end": 2},
                            {"text": "a", "start": 2, "end": 3},
                            {"text": "dummy", "start": 3, "end": 4},
                            {"text": "transcription", "start": 4, "end": 5},
                        ],
                    }
                ]
            }
        }
|
@@ -22,6 +22,8 @@ class LyricsTranscriber:
|
|
22
22
|
audio_filepath,
|
23
23
|
artist=None,
|
24
24
|
title=None,
|
25
|
+
openai_api_key=None,
|
26
|
+
audioshake_api_token=None,
|
25
27
|
genius_api_token=None,
|
26
28
|
spotify_cookie=None,
|
27
29
|
output_dir=None,
|
@@ -59,23 +61,30 @@ class LyricsTranscriber:
|
|
59
61
|
self.title = title
|
60
62
|
self.song_known = self.artist is not None and self.title is not None
|
61
63
|
|
64
|
+
self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
|
62
65
|
self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
|
63
66
|
self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
|
67
|
+
self.audioshake_api_token = os.getenv("AUDIOSHAKE_TOKEN", default=audioshake_api_token)
|
64
68
|
|
65
69
|
self.transcription_model = transcription_model
|
66
70
|
self.llm_model = llm_model
|
67
71
|
self.llm_prompt_matching = llm_prompt_matching
|
68
72
|
self.llm_prompt_correction = llm_prompt_correction
|
69
73
|
|
70
|
-
self.openai_client =
|
74
|
+
self.openai_client = None
|
71
75
|
|
72
|
-
|
73
|
-
|
74
|
-
# base_url="http://localhost:11434/v1",
|
75
|
-
# api_key="ollama",
|
76
|
-
# )
|
76
|
+
if self.openai_api_key:
|
77
|
+
self.openai_client = OpenAI(api_key=self.openai_api_key)
|
77
78
|
|
78
|
-
|
79
|
+
# Uncomment for local models e.g. with ollama
|
80
|
+
# self.openai_client = OpenAI(
|
81
|
+
# base_url="http://localhost:11434/v1",
|
82
|
+
# api_key="ollama",
|
83
|
+
# )
|
84
|
+
|
85
|
+
self.openai_client.log = self.log_level
|
86
|
+
else:
|
87
|
+
self.logger.error("No OpenAI API key found, no correction will be applied to transcription")
|
79
88
|
|
80
89
|
self.render_video = render_video
|
81
90
|
self.video_resolution = video_resolution
|
@@ -137,9 +146,6 @@ class LyricsTranscriber:
|
|
137
146
|
if self.audio_filepath is None:
|
138
147
|
raise Exception("audio_filepath must be specified as the input source to transcribe")
|
139
148
|
|
140
|
-
if not self.song_known:
|
141
|
-
raise Exception("cannot correct song lyrics without artist and title to fetch lyrics")
|
142
|
-
|
143
149
|
self.create_folders()
|
144
150
|
|
145
151
|
def generate(self):
|
@@ -154,8 +160,13 @@ class LyricsTranscriber:
|
|
154
160
|
|
155
161
|
self.validate_lyrics_match_song()
|
156
162
|
|
157
|
-
self.
|
158
|
-
|
163
|
+
if self.openai_client:
|
164
|
+
self.write_corrected_lyrics_data_file()
|
165
|
+
self.write_corrected_lyrics_plain_text()
|
166
|
+
else:
|
167
|
+
self.logger.warning("Skipping LLM correction as no OpenAI client is available")
|
168
|
+
self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
|
169
|
+
self.write_corrected_lyrics_plain_text()
|
159
170
|
|
160
171
|
self.calculate_singing_percentage()
|
161
172
|
|
@@ -169,7 +180,8 @@ class LyricsTranscriber:
|
|
169
180
|
self.copy_files_to_output_dir()
|
170
181
|
self.calculate_llm_costs()
|
171
182
|
|
172
|
-
self.openai_client
|
183
|
+
if self.openai_client:
|
184
|
+
self.openai_client.close()
|
173
185
|
|
174
186
|
return self.outputs
|
175
187
|
|
@@ -198,41 +210,55 @@ class LyricsTranscriber:
|
|
198
210
|
online_lyrics_text_key = f"{online_lyrics_source}_lyrics_text"
|
199
211
|
online_lyrics_filepath_key = f"{online_lyrics_source}_lyrics_filepath"
|
200
212
|
|
201
|
-
if online_lyrics_text_key not in self.outputs:
|
213
|
+
if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
|
202
214
|
continue
|
203
215
|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
# self.logger.debug(f"system_prompt:\n{system_prompt}\ndata_input_str:\n{data_input_str}")
|
209
|
-
|
210
|
-
self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
|
211
|
-
response = self.openai_client.chat.completions.create(
|
212
|
-
model=self.llm_model,
|
213
|
-
messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
|
214
|
-
)
|
216
|
+
if self.openai_client:
|
217
|
+
data_input_str = (
|
218
|
+
f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
|
219
|
+
)
|
215
220
|
|
216
|
-
|
217
|
-
|
221
|
+
self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
|
222
|
+
response = self.openai_client.chat.completions.create(
|
223
|
+
model=self.llm_model,
|
224
|
+
messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
|
225
|
+
)
|
218
226
|
|
219
|
-
|
220
|
-
|
227
|
+
message = response.choices[0].message.content
|
228
|
+
finish_reason = response.choices[0].finish_reason
|
221
229
|
|
222
|
-
|
230
|
+
self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
|
231
|
+
self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
|
223
232
|
|
224
|
-
|
225
|
-
|
226
|
-
|
233
|
+
if finish_reason == "stop":
|
234
|
+
if message == "Yes":
|
235
|
+
self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
|
236
|
+
at_least_one_online_lyrics_validated = True
|
237
|
+
elif message == "No":
|
238
|
+
self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
|
239
|
+
self.outputs[online_lyrics_text_key] = None
|
240
|
+
self.outputs[online_lyrics_filepath_key] = None
|
241
|
+
else:
|
242
|
+
self.logger.error(f"Unexpected response from LLM: {message}")
|
243
|
+
else:
|
244
|
+
self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
|
245
|
+
else:
|
246
|
+
# Fallback primitive word matching
|
247
|
+
self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
|
248
|
+
transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
|
249
|
+
online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
|
250
|
+
common_words = transcribed_words & online_lyrics_words
|
251
|
+
match_percentage = len(common_words) / len(online_lyrics_words) * 100
|
252
|
+
|
253
|
+
if match_percentage >= 50:
|
254
|
+
self.logger.info(
|
255
|
+
f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
|
256
|
+
)
|
227
257
|
at_least_one_online_lyrics_validated = True
|
228
|
-
|
258
|
+
else:
|
229
259
|
self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
|
230
260
|
self.outputs[online_lyrics_text_key] = None
|
231
261
|
self.outputs[online_lyrics_filepath_key] = None
|
232
|
-
else:
|
233
|
-
self.logger.error(f"Unexpected response from LLM: {message}")
|
234
|
-
else:
|
235
|
-
self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
|
236
262
|
|
237
263
|
self.logger.info(
|
238
264
|
f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
|
@@ -242,9 +268,12 @@ class LyricsTranscriber:
|
|
242
268
|
self.logger.error(
|
243
269
|
f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
|
244
270
|
)
|
245
|
-
raise Exception("Cannot proceed without internet lyrics to validate / correct transcription")
|
246
271
|
|
247
272
|
def write_corrected_lyrics_data_file(self):
|
273
|
+
if not self.openai_client:
|
274
|
+
self.logger.warning("Skipping LLM correction as no OpenAI client is available")
|
275
|
+
return
|
276
|
+
|
248
277
|
self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")
|
249
278
|
|
250
279
|
corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + "-corrected.json")
|
@@ -583,51 +612,57 @@ class LyricsTranscriber:
|
|
583
612
|
f.write(line)
|
584
613
|
|
585
614
|
def create_screens(self):
|
586
|
-
self.logger.debug(
|
615
|
+
self.logger.debug("create_screens beginning generation of screens from whisper results")
|
587
616
|
screens: List[subtitles.LyricsScreen] = []
|
588
|
-
line: Optional[subtitles.LyricsLine] = None
|
589
617
|
screen: Optional[subtitles.LyricsScreen] = None
|
590
618
|
|
591
|
-
|
619
|
+
max_lines_per_screen = 4
|
620
|
+
max_line_length = 36 # Maximum characters per line
|
621
|
+
self.logger.debug(f"Max lines per screen: {max_lines_per_screen}, Max line length: {max_line_length}")
|
622
|
+
|
592
623
|
for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
|
593
|
-
self.logger.debug(f"
|
594
|
-
if screen is None:
|
595
|
-
|
596
|
-
|
597
|
-
screen.
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
for
|
605
|
-
|
624
|
+
self.logger.debug(f"Processing segment: {segment['text']}")
|
625
|
+
if screen is None or len(screen.lines) >= max_lines_per_screen:
|
626
|
+
screen = subtitles.LyricsScreen(video_size=self.video_resolution_num, line_height=self.line_height, logger=self.logger)
|
627
|
+
screens.append(screen)
|
628
|
+
self.logger.debug(f"Created new screen. Total screens: {len(screens)}")
|
629
|
+
|
630
|
+
words = segment["words"]
|
631
|
+
current_line = subtitles.LyricsLine()
|
632
|
+
current_line_text = ""
|
633
|
+
self.logger.debug(f"Processing {len(words)} words in segment")
|
634
|
+
|
635
|
+
for word in words:
|
636
|
+
self.logger.debug(f"Processing word: '{word['text']}'")
|
637
|
+
if len(current_line_text) + len(word["text"]) + 1 > max_line_length or (current_line_text and word["text"][0].isupper()):
|
638
|
+
self.logger.debug(f"Current line would exceed max length or new capitalized word. Line: '{current_line_text}'")
|
639
|
+
if current_line.segments:
|
640
|
+
screen.lines.append(current_line)
|
641
|
+
self.logger.debug(f"Added line to screen. Lines on current screen: {len(screen.lines)}")
|
642
|
+
if len(screen.lines) >= max_lines_per_screen:
|
643
|
+
screen = subtitles.LyricsScreen(
|
644
|
+
video_size=self.video_resolution_num,
|
645
|
+
line_height=self.line_height,
|
646
|
+
logger=self.logger,
|
647
|
+
)
|
648
|
+
screens.append(screen)
|
649
|
+
self.logger.debug(f"Screen full, created new screen. Total screens: {len(screens)}")
|
650
|
+
current_line = subtitles.LyricsLine()
|
651
|
+
current_line_text = ""
|
652
|
+
self.logger.debug("Reset current line")
|
653
|
+
|
654
|
+
current_line_text += (" " if current_line_text else "") + word["text"]
|
655
|
+
lyric_segment = subtitles.LyricSegment(
|
606
656
|
text=word["text"], ts=timedelta(seconds=word["start"]), end_ts=timedelta(seconds=word["end"])
|
607
657
|
)
|
608
|
-
|
609
|
-
|
610
|
-
# If word is last in the line, add line to screen and start new line
|
611
|
-
# Before looping to the next word
|
612
|
-
if word_index == num_words_in_segment - 1:
|
613
|
-
self.logger.debug(f"word_index is last in segment, adding line to screen and starting new line")
|
614
|
-
screen.lines.append(line)
|
615
|
-
lines_in_current_screen += 1
|
616
|
-
line = None
|
617
|
-
|
618
|
-
# If current screen has 2 lines already, add screen to list and start new screen
|
619
|
-
# Before looping to the next line
|
620
|
-
if lines_in_current_screen == 2:
|
621
|
-
self.logger.debug(f"lines_in_current_screen is 2, adding screen to list and starting new screen")
|
622
|
-
screens.append(screen)
|
623
|
-
screen = None
|
624
|
-
lines_in_current_screen = 0
|
658
|
+
current_line.segments.append(lyric_segment)
|
659
|
+
self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")
|
625
660
|
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
screens.append(screen) # type: ignore[arg-type]
|
661
|
+
if current_line.segments:
|
662
|
+
screen.lines.append(current_line)
|
663
|
+
self.logger.debug(f"Added final line of segment to screen. Lines on current screen: {len(screen.lines)}")
|
630
664
|
|
665
|
+
self.logger.debug(f"Finished creating screens. Total screens created: {len(screens)}")
|
631
666
|
return screens
|
632
667
|
|
633
668
|
def write_ass_file(self):
|
@@ -760,7 +795,10 @@ class LyricsTranscriber:
|
|
760
795
|
|
761
796
|
def write_transcribed_lyrics_plain_text(self):
|
762
797
|
if self.outputs["transcription_data_dict"]:
|
763
|
-
|
798
|
+
transcription_cache_suffix = "-audioshake-transcribed.txt" if self.audioshake_api_token else "-whisper-transcribed.txt"
|
799
|
+
self.logger.debug(f"transcription_cache_suffix: {transcription_cache_suffix}")
|
800
|
+
|
801
|
+
transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + transcription_cache_suffix)
|
764
802
|
self.outputs["transcribed_lyrics_text_filepath"] = transcribed_lyrics_text_filepath
|
765
803
|
|
766
804
|
self.outputs["transcribed_lyrics_text"] = ""
|
@@ -773,8 +811,109 @@ class LyricsTranscriber:
|
|
773
811
|
else:
|
774
812
|
raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
|
775
813
|
|
814
|
+
def find_best_split_point(self, text, max_length):
    """Return the character index at which ``text`` should be split in two.

    Candidate split points are tried in the order below. Within each group
    the candidate closest to the middle of the text is tried first, and the
    first candidate whose leading part (stripped) is at most ``max_length``
    characters long wins:

    1. just after a comma that lies within 20 characters of the middle;
    2. just after an ``" and "``;
    3. at the whitespace preceding a word that starts with ``A-Z``;
    4. at the end of the first half of the words (only for 3+ words).

    Args:
        text: The line of text to split.
        max_length: Maximum allowed length of the first part.

    Returns:
        The index to split at; ``max_length`` itself when no candidate fits.
    """
    self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
    words = text.split()
    mid_word_index = len(words) // 2
    # Length of the first half of the words joined by single spaces
    mid_point = len(" ".join(words[:mid_word_index]))
    self.logger.debug(f"Mid point is at character {mid_point}")

    def distance_from_middle(index):
        return abs(index - mid_point)

    # Check for a comma near the middle, nearest comma first (same ordering
    # as the 'and' and capital-letter checks below)
    max_comma_distance = 20
    comma_indices = [i for i, char in enumerate(text) if char == ","]
    if comma_indices:
        self.logger.debug(f"Found commas at indices: {comma_indices}")
        for index in sorted(comma_indices, key=distance_from_middle):
            if distance_from_middle(index) < max_comma_distance and len(text[: index + 1].strip()) <= max_length:
                self.logger.debug(f"Choosing comma at index {index} as split point")
                return index + 1  # Include the comma in the first part

    # Check for 'and'
    and_indices = [m.start() for m in re.finditer(" and ", text)]
    if and_indices:
        self.logger.debug(f"Found 'and' at indices: {and_indices}")
        for index in sorted(and_indices, key=distance_from_middle):
            if len(text[: index + len(" and ")].strip()) <= max_length:
                self.logger.debug(f"Choosing 'and' at index {index} as split point")
                return index + len(" and ")

    # Check for words starting with a capital letter
    capital_word_indices = [m.start() for m in re.finditer(r"\s[A-Z]", text)]
    self.logger.debug(f"Found capital words at indices: {capital_word_indices}")
    for index in sorted(capital_word_indices, key=distance_from_middle):
        if index > 0 and len(text[:index].strip()) <= max_length:
            self.logger.debug(f"Choosing capital word at index {index} as split point")
            return index

    # If no better split point is found, try splitting at the middle word
    if len(words) > 2 and mid_word_index > 0 and mid_point <= max_length:
        self.logger.debug(f"Choosing middle word split at index {mid_point}")
        return mid_point

    # If the text is still too long, forcibly split at the maximum length
    self.logger.debug(f"No suitable split point found, forcibly splitting at max_length {max_length}")
    return max_length
|
857
|
+
|
858
|
+
def split_long_segments(self, segments, max_length):
    """Split segments whose text is longer than ``max_length`` characters.

    Segments within the limit are passed through unchanged (same object).
    A longer segment is rebuilt word by word from its ``"words"`` list, and
    a new segment is started whenever

    * the next word starts with an uppercase letter, or
    * adding the next word would push the text past ``max_length``.

    The word that triggers a split always opens the next segment, so the
    emitted segments never overlap in time and stay within ``max_length``
    unless a single word is itself longer than the limit.

    Args:
        segments: List of dicts with ``"text"``, ``"start"``, ``"end"`` and
            ``"words"`` (a list of ``{"text", "start", "end"}`` dicts).
        max_length: Maximum number of characters per segment text.

    Returns:
        A new list of segment dicts.
    """
    self.logger.debug(f"Splitting long segments (max_length: {max_length})")
    new_segments = []
    for segment in segments:
        text = segment["text"]
        self.logger.debug(f"Processing segment: '{text}' (length: {len(text)})")
        if len(text) <= max_length:
            self.logger.debug("Segment is within max_length, keeping as is")
            new_segments.append(segment)
            continue

        self.logger.debug("Segment exceeds max_length, splitting")
        meta_words = segment["words"]
        if not meta_words:
            # Without word timings there is nothing to split on; keep the
            # segment rather than silently dropping its lyrics.
            new_segments.append(segment)
            continue

        current_text = ""
        current_start = segment["start"]
        current_words = []

        for meta in meta_words:
            word = meta["text"]
            candidate_text = f"{current_text} {word}" if current_text else word
            # word[:1] instead of word[0] so an empty word cannot raise IndexError
            starts_with_capital = word[:1].isupper()

            if current_words and (starts_with_capital or len(candidate_text) > max_length):
                self.logger.debug(f"Splitting at: '{current_text}'")
                new_segment = {
                    "text": current_text.strip(),
                    "start": current_start,
                    "end": current_words[-1]["end"],
                    "words": current_words,
                }
                new_segments.append(new_segment)
                self.logger.debug(f"Added new segment: {new_segment}")

                # The triggering word opens the next segment
                current_text = word
                current_words = []
                current_start = meta["start"]
            else:
                current_text = candidate_text

            current_words.append(meta)

        # Add any remaining words as a final segment
        if current_words:
            self.logger.debug(f"Adding final segment: '{current_text}'")
            new_segments.append(
                {"text": current_text.strip(), "start": current_start, "end": segment["end"], "words": current_words}
            )

    self.logger.debug(f"Splitting complete. Original segments: {len(segments)}, New segments: {len(new_segments)}")
    return new_segments
|
913
|
+
|
776
914
|
def transcribe(self):
|
777
|
-
|
915
|
+
transcription_cache_suffix = "-audioshake" if self.audioshake_api_token else "-whisper"
|
916
|
+
self.outputs["transcription_data_filepath"] = self.get_cache_filepath(f"{transcription_cache_suffix}.json")
|
778
917
|
|
779
918
|
whisper_cache_filepath = self.outputs["transcription_data_filepath"]
|
780
919
|
if os.path.isfile(whisper_cache_filepath):
|
@@ -783,15 +922,26 @@ class LyricsTranscriber:
|
|
783
922
|
self.outputs["transcription_data_dict"] = json.load(cache_file)
|
784
923
|
return
|
785
924
|
|
786
|
-
self.
|
787
|
-
|
788
|
-
|
789
|
-
result = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
|
925
|
+
if self.audioshake_api_token:
|
926
|
+
self.logger.debug(f"Using AudioShake API for transcription")
|
927
|
+
from .audioshake_transcriber import AudioShakeTranscriber
|
790
928
|
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
929
|
+
audioshake = AudioShakeTranscriber(self.audioshake_api_token, log_level=self.log_level)
|
930
|
+
result = audioshake.transcribe(self.audio_filepath)
|
931
|
+
else:
|
932
|
+
self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
|
933
|
+
audio = whisper.load_audio(self.audio_filepath)
|
934
|
+
model = whisper.load_model(self.transcription_model, device="cpu")
|
935
|
+
result = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
|
936
|
+
|
937
|
+
# Remove segments with no words, only music
|
938
|
+
result["segments"] = [segment for segment in result["segments"] if segment["text"].strip() != "Music"]
|
939
|
+
self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(result['segments'])}")
|
940
|
+
|
941
|
+
# Split long segments
|
942
|
+
self.logger.debug("Starting to split long segments")
|
943
|
+
result["segments"] = self.split_long_segments(result["segments"], max_length=36)
|
944
|
+
self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(result['segments'])}")
|
795
945
|
|
796
946
|
self.logger.debug(f"writing transcription data JSON to cache file: {whisper_cache_filepath}")
|
797
947
|
with open(whisper_cache_filepath, "w") as cache_file:
|
lyrics_transcriber/utils/cli.py
CHANGED
@@ -34,6 +34,11 @@ def main():
|
|
34
34
|
default=None,
|
35
35
|
help="Optional: song title for lyrics lookup and auto-correction",
|
36
36
|
)
|
37
|
+
# The help text names the environment variable that LyricsTranscriber actually
# reads for this setting (os.getenv("AUDIOSHAKE_TOKEN", ...)).
parser.add_argument(
    "--audioshake_api_token",
    default=None,
    help="Optional: AudioShake API token for lyrics transcription and alignment. Can also be set with AUDIOSHAKE_TOKEN env var.",
)
|
37
42
|
parser.add_argument(
|
38
43
|
"--genius_api_token",
|
39
44
|
default=None,
|
@@ -77,7 +82,7 @@ def main():
|
|
77
82
|
|
78
83
|
parser.add_argument(
|
79
84
|
"--video_resolution",
|
80
|
-
default="
|
85
|
+
default="360p",
|
81
86
|
help="Optional: resolution of the karaoke video to render. Must be one of: 4k, 1080p, 720p, 360p. Default: 360p",
|
82
87
|
)
|
83
88
|
|
@@ -93,6 +98,12 @@ def main():
|
|
93
98
|
help="Optional: color to use for karaoke video background, in hex format or FFmpeg color name. Default: black",
|
94
99
|
)
|
95
100
|
|
101
|
+
parser.add_argument(
|
102
|
+
"--openai_api_key",
|
103
|
+
default=None,
|
104
|
+
help="Optional: OpenAI API key for LLM model usage. Can also be set with OPENAI_API_KEY env var.",
|
105
|
+
)
|
106
|
+
|
96
107
|
args = parser.parse_args()
|
97
108
|
|
98
109
|
log_level = getattr(logging, args.log_level.upper())
|
@@ -114,8 +125,10 @@ def main():
|
|
114
125
|
|
115
126
|
transcriber = LyricsTranscriber(
|
116
127
|
args.audio_filepath,
|
128
|
+
audioshake_api_token=args.audioshake_api_token,
|
117
129
|
genius_api_token=args.genius_api_token,
|
118
130
|
spotify_cookie=args.spotify_cookie,
|
131
|
+
openai_api_key=args.openai_api_key,
|
119
132
|
artist=args.artist,
|
120
133
|
title=args.title,
|
121
134
|
output_dir=args.output_dir,
|
@@ -5,6 +5,7 @@ import json
|
|
5
5
|
import itertools
|
6
6
|
from pathlib import Path
|
7
7
|
from enum import IntEnum
|
8
|
+
import logging
|
8
9
|
|
9
10
|
from . import ass
|
10
11
|
|
@@ -85,21 +86,19 @@ class LyricsLine:
|
|
85
86
|
def __str__(self):
|
86
87
|
return "".join([f"{{{s.text}}}" for s in self.segments])
|
87
88
|
|
88
|
-
def as_ass_event(
|
89
|
-
self,
|
90
|
-
screen_start: timedelta,
|
91
|
-
screen_end: timedelta,
|
92
|
-
style: ass.ASS.Style,
|
93
|
-
top_margin: int,
|
94
|
-
):
|
89
|
+
def as_ass_event(self, screen_start: timedelta, screen_end: timedelta, style: ass.ASS.Style, y_position: int):
|
95
90
|
e = ass.ASS.Event()
|
96
91
|
e.type = "Dialogue"
|
97
92
|
e.Layer = 0
|
98
93
|
e.Style = style
|
99
94
|
e.Start = screen_start.total_seconds()
|
100
95
|
e.End = screen_end.total_seconds()
|
101
|
-
e.MarginV =
|
96
|
+
e.MarginV = y_position
|
102
97
|
e.Text = self.decorate_ass_line(self.segments, screen_start)
|
98
|
+
|
99
|
+
# Set alignment to top-center
|
100
|
+
e.Text = "{\\an8}" + e.Text
|
101
|
+
|
103
102
|
return e
|
104
103
|
|
105
104
|
def decorate_ass_line(self, segments, screen_start_ts: timedelta):
|
@@ -137,6 +136,7 @@ class LyricsScreen:
|
|
137
136
|
start_ts: Optional[timedelta] = None
|
138
137
|
video_size: Tuple[int, int] = None
|
139
138
|
line_height: int = None
|
139
|
+
logger: logging.Logger = None
|
140
140
|
|
141
141
|
@property
|
142
142
|
def end_ts(self) -> timedelta:
|
@@ -145,10 +145,36 @@ class LyricsScreen:
|
|
145
145
|
def get_line_y(self, line_num: int) -> int:
|
146
146
|
_, h = self.video_size
|
147
147
|
line_count = len(self.lines)
|
148
|
-
|
148
|
+
total_height = line_count * self.line_height
|
149
|
+
|
150
|
+
# Calculate the top margin to center the lyrics block
|
151
|
+
top_margin = (h - total_height) / 2
|
152
|
+
|
153
|
+
# Calculate the y-position for this specific line
|
154
|
+
line_y = top_margin + (line_num * self.line_height)
|
155
|
+
|
156
|
+
# if self.logger:
|
157
|
+
# self.logger.debug(f"Line {line_num + 1} positioning:")
|
158
|
+
# self.logger.debug(f" Video height: {h}")
|
159
|
+
# self.logger.debug(f" Total lines: {line_count}")
|
160
|
+
# self.logger.debug(f" Line height: {self.line_height}")
|
161
|
+
# self.logger.debug(f" Total lyrics height: {total_height}")
|
162
|
+
# self.logger.debug(f" Top margin: {top_margin}")
|
163
|
+
# self.logger.debug(f" Line y: {line_y}")
|
164
|
+
|
165
|
+
return int(line_y)
|
149
166
|
|
150
167
|
def as_ass_events(self, style: ass.ASS.Style) -> List[ass.ASS.Event]:
|
151
|
-
|
168
|
+
events = []
|
169
|
+
for i, line in enumerate(self.lines):
|
170
|
+
y_position = self.get_line_y(i)
|
171
|
+
|
172
|
+
# if self.logger:
|
173
|
+
# self.logger.debug(f"Creating ASS event for line {i + 1} at y-position: {y_position}")
|
174
|
+
|
175
|
+
event = line.as_ass_event(self.start_ts, self.end_ts, style, y_position)
|
176
|
+
events.append(event)
|
177
|
+
return events
|
152
178
|
|
153
179
|
def __str__(self):
|
154
180
|
lines = [f"{self.start_ts} - {self.end_ts}:"]
|
@@ -264,7 +290,7 @@ def create_styled_subtitles(
|
|
264
290
|
style.BorderStyle = 1
|
265
291
|
style.Outline = 1
|
266
292
|
style.Shadow = 0
|
267
|
-
style.Alignment = ass.ASS.
|
293
|
+
style.Alignment = ass.ASS.ALIGN_TOP_CENTER
|
268
294
|
style.MarginL = 0
|
269
295
|
style.MarginR = 0
|
270
296
|
style.MarginV = 0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lyrics-transcriber
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.16.1
|
4
4
|
Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
|
5
5
|
Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
|
6
6
|
License: MIT
|
@@ -53,7 +53,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
|
|
53
53
|
### Prerequisites
|
54
54
|
|
55
55
|
- Python 3.9 or higher
|
56
|
-
- [Optional]
|
56
|
+
- [Optional] Genius API token if you want to fetch lyrics from Genius
|
57
|
+
- [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
|
58
|
+
- [Optional] OpenAI API key if you want to use LLM correction of the transcribed lyrics
|
59
|
+
- [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription
|
57
60
|
|
58
61
|
```
|
59
62
|
pip install lyrics-transcriber
|
@@ -62,6 +65,23 @@ pip install lyrics-transcriber
|
|
62
65
|
> **Warning**
|
63
66
|
> The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency from openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)
|
64
67
|
|
68
|
+
## Docker
|
69
|
+
|
70
|
+
You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` from Docker Hub. Here's an example:
|
71
|
+
|
72
|
+
```sh
|
73
|
+
docker run \
|
74
|
+
-v `pwd`/input:/input \
|
75
|
+
-v `pwd`/output:/output \
|
76
|
+
beveradb/lyrics-transcriber:0.16.0 \
|
77
|
+
--log_level debug \
|
78
|
+
--output_dir /output \
|
79
|
+
--render_video \
|
80
|
+
--video_background_image /input/your-background-image.png \
|
81
|
+
--video_resolution 360p \
|
82
|
+
/input/song.flac
|
83
|
+
```
|
84
|
+
|
65
85
|
## Usage 🚀
|
66
86
|
|
67
87
|
### As a standalone CLI
|
@@ -1,17 +1,18 @@
|
|
1
1
|
lyrics_transcriber/__init__.py,sha256=bIRjsXAzlghS1rQxWNLU0wppZy0T_iciN9EclHLwNrQ,94
|
2
|
+
lyrics_transcriber/audioshake_transcriber.py,sha256=ZZjH47edTMuEElzoQiDMDZ1VTK-rdJA_jIULjhD49to,1340
|
2
3
|
lyrics_transcriber/llm_prompts/README.md,sha256=DPAGRDVGt9ZNcQAAoQGFhwesLY3D6hD8apL71yHP4yo,196
|
3
4
|
lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt,sha256=a3XjAYfyhWt1uCKKqm_n2Pc0STdmBdiHHtJ7ODP99Nk,4046
|
4
5
|
lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt,sha256=r6HN3DD_3gwh3B_JPd2R0I4lDXuB5iy7B90J9agOxbQ,2369
|
5
6
|
lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt,sha256=hvk2Vs3M3Q4zGQsiQnXvnpd8wXWfwsudYeqN5qFyNWs,1754
|
6
7
|
lyrics_transcriber/llm_prompts/promptfooconfig.yaml,sha256=O4YxlLV7XSUiSw_1Q9G7ELC2VAbrYUV_N5QxrPbd1jE,3735
|
7
8
|
lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt,sha256=8d-RvZtyINKUlpQLwMi-VD--Y59J-epPt7SZSqjFbPI,1690
|
8
|
-
lyrics_transcriber/transcriber.py,sha256=
|
9
|
+
lyrics_transcriber/transcriber.py,sha256=ho5MYGBESbL-LT-v7GfboQspaqyVzkOLXKBVtt5xtA0,48239
|
9
10
|
lyrics_transcriber/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
11
|
lyrics_transcriber/utils/ass.py,sha256=b8lnjgXGD1OD1ld_b1xxUmSOf4nSEfz9BpgSkh16R4g,90291
|
11
|
-
lyrics_transcriber/utils/cli.py,sha256=
|
12
|
-
lyrics_transcriber/utils/subtitles.py,sha256=
|
13
|
-
lyrics_transcriber-0.
|
14
|
-
lyrics_transcriber-0.
|
15
|
-
lyrics_transcriber-0.
|
16
|
-
lyrics_transcriber-0.
|
17
|
-
lyrics_transcriber-0.
|
12
|
+
lyrics_transcriber/utils/cli.py,sha256=8Poba_9wQw0VmOK73vuK-w-abR9QmO4y4FYDHiAQbc0,6972
|
13
|
+
lyrics_transcriber/utils/subtitles.py,sha256=_WG0pFoZMXcrGe6gbARkC9KrWzFNTMOsiqQwNL-H2lU,11812
|
14
|
+
lyrics_transcriber-0.16.1.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
|
15
|
+
lyrics_transcriber-0.16.1.dist-info/METADATA,sha256=ThpXVaCAnzp_7Q8O_8vY9jeWTeba2qrlhBpt9UvBDvc,5775
|
16
|
+
lyrics_transcriber-0.16.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
17
|
+
lyrics_transcriber-0.16.1.dist-info/entry_points.txt,sha256=lh6L-iR5CGELaNcouDK94X78eS5Ua_tK9lI4UEkza-k,72
|
18
|
+
lyrics_transcriber-0.16.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
{lyrics_transcriber-0.15.0.dist-info → lyrics_transcriber-0.16.1.dist-info}/entry_points.txt
RENAMED
File without changes
|