lyrics-transcriber 0.19.0__tar.gz → 0.19.2__tar.gz
This diff compares the publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two versions exactly as published.
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/PKG-INFO +1 -1
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/audioshake_transcriber.py +16 -7
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/transcriber.py +166 -70
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/pyproject.toml +1 -1
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/LICENSE +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/README.md +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/__init__.py +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/README.md +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/__init__.py +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/ass.py +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/cli.py +0 -0
- {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/subtitles.py +0 -0
{lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.19.0
+Version: 0.19.2
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
{lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/audioshake_transcriber.py
RENAMED
@@ -11,8 +11,9 @@ class AudioShakeTranscriber:
         self.logger = logger
         self.output_prefix = output_prefix
 
-    def transcribe(self, audio_filepath):
-        self.logger.info(f"Starting transcription for {audio_filepath} using AudioShake API")
+    def start_transcription(self, audio_filepath):
+        """Starts the transcription job and returns the job ID without waiting for completion"""
+        self.logger.info(f"Starting transcription for {audio_filepath} using AudioShake API")
 
         # Step 1: Upload the audio file
         asset_id = self._upload_file(audio_filepath)
@@ -22,6 +23,12 @@ class AudioShakeTranscriber:
         job_id = self._create_job(asset_id)
         self.logger.info(f"Job created successfully. Job ID: {job_id}")
 
+        return job_id
+
+    def get_transcription_result(self, job_id):
+        """Gets the results for a previously started job"""
+        self.logger.info(f"Getting results for job ID: {job_id}")
+
         # Step 3: Wait for the job to complete and get the results
         result = self._get_job_result(job_id)
         self.logger.info(f"Job completed. Processing results...")
@@ -29,6 +36,11 @@ class AudioShakeTranscriber:
         # Step 4: Process the result and return in the required format
         return self._process_result(result)
 
+    def transcribe(self, audio_filepath):
+        """Original method now just combines the two steps"""
+        job_id = self.start_transcription(audio_filepath)
+        return self.get_transcription_result(job_id)
+
     def _upload_file(self, filepath):
         self.logger.info(f"Uploading {filepath} to AudioShake")
         url = f"{self.base_url}/upload"
@@ -77,13 +89,10 @@ class AudioShakeTranscriber:
         output_assets = job_data.get("outputAssets", [])
         self.logger.debug(f"Output assets: {output_assets}")
 
-        output_asset = next((asset for asset in output_assets if asset["name"] == "transcription.json"), None)
-        if not output_asset:
-            self.logger.warning("'transcription.json' not found, looking for 'alignment.json'")
-            output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
+        output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
 
         if not output_asset:
-            self.logger.error("
+            self.logger.error("'alignment.json' not found in job results")
             self.logger.error(f"Available output assets: {[asset['name'] for asset in output_assets]}")
             raise Exception("Required output not found in job results")
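The split above turns the blocking `transcribe()` call into a two-phase API: `start_transcription()` uploads the file and returns a job ID immediately, and `get_transcription_result()` waits for the finished job and processes it. That lets a caller kick off the remote AudioShake job and do other work (such as a local Whisper pass, as `transcriber.py` now does) before blocking on the result. A minimal usage sketch; the token and audio path are placeholders, not values from the package:

```python
import logging

from lyrics_transcriber.audioshake_transcriber import AudioShakeTranscriber

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("audioshake-demo")

transcriber = AudioShakeTranscriber(
    api_token="YOUR_AUDIOSHAKE_API_TOKEN",  # placeholder credential
    logger=logger,
    output_prefix="Artist - Title",
)

# Phase 1: upload the audio and create the job; returns without waiting
job_id = transcriber.start_transcription("path/to/song.flac")  # placeholder path

# ... do other work here, e.g. run a local Whisper transcription ...

# Phase 2: wait for the job to complete and get the processed result
result = transcriber.get_transcription_result(job_id)

# The original one-shot behaviour is still available:
# result = transcriber.transcribe("path/to/song.flac")
```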
{lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/transcriber.py
RENAMED
@@ -66,7 +66,7 @@ class LyricsTranscriber:
         self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
-        self.audioshake_api_token = os.getenv("
+        self.audioshake_api_token = os.getenv("AUDIOSHAKE_API_TOKEN", default=audioshake_api_token)
 
         self.transcription_model = transcription_model
         self.llm_model = llm_model
@@ -102,7 +102,7 @@ class LyricsTranscriber:
 
             self.openai_client.log = self.log_level
         else:
-            self.logger.
+            self.logger.warning("No OpenAI API key found, no correction will be applied to transcription")
 
         self.render_video = render_video
         self.video_resolution = video_resolution
@@ -137,10 +137,18 @@ class LyricsTranscriber:
             raise FileNotFoundError(f"video_background is not a valid file path: {self.video_background_image}")
 
         self.outputs = {
-            "transcription_data_dict": None,
-            "transcription_data_filepath": None,
-            "transcribed_lyrics_text": None,
-            "transcribed_lyrics_text_filepath": None,
+            "transcription_data_dict_whisper": None,
+            "transcription_data_whisper_filepath": None,
+            "transcribed_lyrics_text_whisper": None,
+            "transcribed_lyrics_text_whisper_filepath": None,
+            "transcription_data_dict_audioshake": None,
+            "transcription_data_audioshake_filepath": None,
+            "transcribed_lyrics_text_audioshake": None,
+            "transcribed_lyrics_text_audioshake_filepath": None,
+            "transcription_data_dict_primary": None,
+            "transcription_data_primary_filepath": None,
+            "transcribed_lyrics_text_primary": None,
+            "transcribed_lyrics_text_primary_filepath": None,
             "genius_lyrics_text": None,
             "genius_lyrics_filepath": None,
             "spotify_lyrics_data_dict": None,
@@ -169,9 +177,12 @@ class LyricsTranscriber:
         self.output_prefix = f"{artist} - {title}"
 
     def generate(self):
+        self.logger.debug(f"Starting generate() with cache_dir: {self.cache_dir} and output_dir: {self.output_dir}")
+
         self.logger.debug(f"audio_filepath is set: {self.audio_filepath}, beginning initial whisper transcription")
 
         self.transcribe()
+
         self.write_transcribed_lyrics_plain_text()
 
         self.write_genius_lyrics_file()
@@ -185,7 +196,7 @@ class LyricsTranscriber:
                 self.write_corrected_lyrics_plain_text()
             else:
                 self.logger.warning("Skipping LLM correction as no OpenAI client is available")
-                self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+                self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
                 self.write_corrected_lyrics_plain_text()
 
         self.calculate_singing_percentage()
@@ -210,11 +221,15 @@ class LyricsTranscriber:
             self.output_dir = os.getcwd()
 
         self.logger.debug(f"copying temporary files to output dir: {self.output_dir}")
-
-        for key in self.outputs:
+        self.logger.debug("Files to copy:")
+        for key, value in self.outputs.items():
             if key.endswith("_filepath"):
-
-
+                self.logger.debug(f"  {key}: {value}")
+                if value and os.path.isfile(value):
+                    self.logger.debug(f"  File exists, copying to {self.output_dir}")
+                    shutil.copy(value, self.output_dir)
+                else:
+                    self.logger.debug(f"  File doesn't exist or is None")
 
         self.outputs["output_dir"] = self.output_dir
 
@@ -234,9 +249,7 @@ class LyricsTranscriber:
                 continue
 
             if self.openai_client:
-                data_input_str = (
-                    f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
-                )
+                data_input_str = f'Data input 1:\n{self.outputs["transcribed_lyrics_text_primary"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
 
                 self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
                 response = self.openai_client.chat.completions.create(
@@ -265,7 +278,7 @@ class LyricsTranscriber:
             else:
                 # Fallback primitive word matching
                 self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
-                transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
+                transcribed_words = set(self.outputs["transcribed_lyrics_text_primary"].split())
                 online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
                 common_words = transcribed_words & online_lyrics_words
                 match_percentage = len(common_words) / len(online_lyrics_words) * 100
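The fallback matcher in the hunk above scores overlap as the share of the online lyrics' unique words that also appear in the transcription. A self-contained sketch of the same calculation, with made-up lyric strings:

```python
# Same set-intersection scoring as the fallback above, extracted into a
# standalone function. The two input strings here are invented examples.
def primitive_match_percentage(transcribed_text: str, online_lyrics: str) -> float:
    transcribed_words = set(transcribed_text.split())
    online_lyrics_words = set(online_lyrics.split())
    common_words = transcribed_words & online_lyrics_words
    return len(common_words) / len(online_lyrics_words) * 100

# {"under", "attack"} is shared out of 5 unique online words -> 40.0
print(primitive_match_percentage("under attack we are", "under attack we're taking cover"))
```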
@@ -314,7 +327,7 @@ class LyricsTranscriber:
 
         if not reference_lyrics:
             self.logger.warning("No reference lyrics found from Genius or Spotify. Skipping LLM correction.")
-            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
             return
 
         self.logger.debug(
@@ -335,7 +348,7 @@ class LyricsTranscriber:
         self.outputs["llm_transcript"] = ""
         self.outputs["llm_transcript_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (LLM Transcript).txt"))
 
-        total_segments = len(self.outputs["transcription_data_dict"]["segments"])
+        total_segments = len(self.outputs["transcription_data_dict_primary"]["segments"])
         self.logger.info(f"Beginning correction using LLM, total segments: {total_segments}")
 
         with open(self.outputs["llm_transcript_filepath"], "a", buffering=1, encoding="utf-8") as llm_transcript_file:
@@ -345,7 +358,7 @@ class LyricsTranscriber:
             self.outputs["llm_transcript"] += llm_transcript_header
             llm_transcript_file.write(llm_transcript_header)
 
-            for segment in self.outputs["transcription_data_dict"]["segments"]:
+            for segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                 # # Don't waste OpenAI dollars when testing!
                 # if segment["id"] > 10:
                 #     continue
@@ -371,7 +384,7 @@ class LyricsTranscriber:
                     if previous_segment["id"] in (segment["id"] - 2, segment["id"] - 1):
                         previous_two_corrected_lines += previous_segment["text"].strip() + "\n"
 
-                for next_segment in self.outputs["transcription_data_dict"]["segments"]:
+                for next_segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                     if next_segment["id"] in (segment["id"] + 1, segment["id"] + 2):
                         upcoming_two_uncorrected_lines += next_segment["text"].strip() + "\n"
@@ -569,6 +582,7 @@ class LyricsTranscriber:
 
         genius_lyrics_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Genius).txt"))
 
+        # Check cache first
         if os.path.isfile(genius_lyrics_cache_filepath):
             self.logger.debug(f"found existing file at genius_lyrics_cache_filepath, reading: {genius_lyrics_cache_filepath}")
 
@@ -576,15 +590,21 @@ class LyricsTranscriber:
             self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
             self.outputs["genius_lyrics_text"] = cached_lyrics.read()
             return
-
         self.logger.debug(f"no cached lyrics found at genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}, fetching from Genius")
-
+
+        # Initialize Genius with better defaults
+        genius = lyricsgenius.Genius(
+            self.genius_api_token,
+            verbose=(self.log_level == logging.DEBUG),
+            remove_section_headers=True,
+        )
 
         try:
             song = self.fetch_genius_lyrics(genius, self.title, self.artist)
             if song is None:
                 self.logger.warning(f'Could not find lyrics on Genius for "{self.title}" by {self.artist}')
-                return
+                return None
+
             lyrics = self.clean_genius_lyrics(song.lyrics)
 
             self.logger.debug(f"writing clean lyrics to genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}")
@@ -593,6 +613,8 @@ class LyricsTranscriber:
 
             self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
             self.outputs["genius_lyrics_text"] = lyrics
+            return lyrics.split("\n")  # Return lines like write_lyrics_from_genius
+
         except requests.exceptions.RequestException as e:
             self.logger.error(f"Failed to fetch lyrics from Genius after multiple retries: {e}")
             raise
@@ -600,8 +622,13 @@ class LyricsTranscriber:
     def clean_genius_lyrics(self, lyrics):
         lyrics = lyrics.replace("\\n", "\n")
         lyrics = re.sub(r"You might also like", "", lyrics)
-
-
+        lyrics = re.sub(
+            r".*?Lyrics([A-Z])", r"\1", lyrics
+        )  # Remove the song name and word "Lyrics" if this has a non-newline char at the start
+        lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)  # Remove this example: 27 ContributorsSex Bomb Lyrics
+        lyrics = re.sub(
+            r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics
+        )  # Remove this example: See Tom Jones LiveGet tickets as low as $71
         lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)  # Remove the word "Embed" at end of line with preceding numbers if found
         lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
         lyrics = re.sub(r"^Embed$", r"", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
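The regexes added to `clean_genius_lyrics` target specific Genius page furniture: the contributor-count header fused onto the song title and the ticket-sales banner. A standalone demonstration on an invented scrape (only a subset of the method's substitutions, applied in a simplified order):

```python
import re

# Invented example of a Genius scrape with page furniture fused onto the
# lyrics text, similar to the examples quoted in the comments above.
raw = "27 ContributorsSex Bomb LyricsSee Tom Jones LiveGet tickets as low as $71My my my...42Embed"

cleaned = re.sub(r"^[0-9]* Contributors.*Lyrics", "", raw)  # strip "27 ContributorsSex Bomb Lyrics"
cleaned = re.sub(r"See.*Live.*Get tickets as low as \$[0-9]+", "", cleaned)  # strip the ticket banner
cleaned = re.sub(r"[0-9]+Embed$", "", cleaned)  # strip the trailing "42Embed"

print(cleaned)  # -> "My my my..."
```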
@@ -611,7 +638,9 @@ class LyricsTranscriber:
 
     def calculate_singing_percentage(self):
         # Calculate total seconds of singing using timings from whisper transcription results
-        total_singing_duration = sum(segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict"]["segments"])
+        total_singing_duration = sum(
+            segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict_primary"]["segments"]
+        )
 
         self.logger.debug(f"calculated total_singing_duration: {int(total_singing_duration)} seconds, now running ffprobe")
 
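`calculate_singing_percentage` now reads the primary transcription, but the arithmetic is unchanged: total singing time is the sum of each segment's `end - start`. A tiny sketch with invented segments:

```python
# Invented Whisper-style segments; the real code iterates
# self.outputs["transcription_data_dict_primary"]["segments"].
segments = [
    {"start": 12.0, "end": 15.5, "text": "first sung line"},
    {"start": 16.0, "end": 19.0, "text": "second sung line"},
]

total_singing_duration = sum(segment["end"] - segment["start"] for segment in segments)
print(int(total_singing_duration))  # -> 6 (seconds of detected singing)
```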
@@ -641,9 +670,7 @@ class LyricsTranscriber:
     # then loops over each word and writes all words with MidiCo segment start/end formatting
     # and word-level timestamps to a MidiCo-compatible LRC file
     def write_midico_lrc_file(self):
-        self.outputs["midico_lrc_filepath"] = os.path.join(
-            self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc")  # Updated suffix
-        )
+        self.outputs["midico_lrc_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc"))
 
         lrc_filename = self.outputs["midico_lrc_filepath"]
         self.logger.debug(f"writing midico formatted word timestamps to LRC file: {lrc_filename}")
@@ -660,7 +687,7 @@ class LyricsTranscriber:
             f.write(line)
 
     def create_screens(self):
-        self.logger.debug("create_screens beginning generation of screens from
+        self.logger.debug("create_screens beginning generation of screens from transcription results")
         screens: List[subtitles.LyricsScreen] = []
         screen: Optional[subtitles.LyricsScreen] = None
 
@@ -725,8 +752,8 @@ class LyricsTranscriber:
         ass_filepath = self.outputs["ass_subtitles_filepath"]
         self.logger.debug(f"writing ASS formatted subtitle file: {ass_filepath}")
 
-
-        screens = subtitles.set_segment_end_times(
+        initial_screens = self.create_screens()
+        screens = subtitles.set_segment_end_times(initial_screens, int(self.outputs["song_duration"]))
         screens = subtitles.set_screen_start_times(screens)
         lyric_subtitles_ass = subtitles.create_styled_subtitles(screens, self.video_resolution_num, self.font_size)
         lyric_subtitles_ass.write(ass_filepath)
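The hunk above makes the ASS pipeline ordering explicit: build screens, pin segment end times against the (now integer) song duration, derive screen start times, style, and write. A sketch of that call order factored into a helper; the parameter names are illustrative, and `screens` would come from `create_screens()`:

```python
from lyrics_transcriber.utils import subtitles

def build_ass_file(screens, song_duration, video_resolution_num, font_size, ass_filepath):
    # Pin each segment's end time, capped by the song duration
    # (cast to int, as the fixed code above now does)
    screens = subtitles.set_segment_end_times(screens, int(song_duration))
    # Derive when each screen should first appear
    screens = subtitles.set_screen_start_times(screens)
    # Apply resolution-dependent styling and write the .ass file
    lyric_subtitles_ass = subtitles.create_styled_subtitles(screens, video_resolution_num, font_size)
    lyric_subtitles_ass.write(ass_filepath)
```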
@@ -845,22 +872,29 @@ class LyricsTranscriber:
         return formatted_time
 
     def write_transcribed_lyrics_plain_text(self):
-        if self.outputs["
-
-            self.logger.debug(f"
-
-
-
+        if self.outputs["transcription_data_dict_whisper"]:
+            transcribed_lyrics_text_whisper_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Whisper).txt"))
+            self.logger.debug(f"Setting Whisper text filepath to: {transcribed_lyrics_text_whisper_filepath}")
+            self.outputs["transcribed_lyrics_text_whisper_filepath"] = transcribed_lyrics_text_whisper_filepath
+            self.outputs["transcribed_lyrics_text_whisper"] = ""
+
+            self.logger.debug(f"Writing Whisper lyrics to: {transcribed_lyrics_text_whisper_filepath}")
+            with open(transcribed_lyrics_text_whisper_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_whisper"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_whisper"] += segment["text"] + "\n"
+                    f.write(segment["text"].strip() + "\n")
+            self.logger.debug(f"Finished writing Whisper lyrics, file exists: {os.path.exists(transcribed_lyrics_text_whisper_filepath)}")
 
-
+        if self.outputs["transcription_data_dict_audioshake"]:
+            transcribed_lyrics_text_audioshake_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics AudioShake).txt"))
+            self.outputs["transcribed_lyrics_text_audioshake_filepath"] = transcribed_lyrics_text_audioshake_filepath
+            self.outputs["transcribed_lyrics_text_audioshake"] = ""
 
-            self.logger.debug(f"
-            with open(
-                for segment in self.outputs["
-                    self.outputs["
+            self.logger.debug(f"Writing AudioShake lyrics to: {transcribed_lyrics_text_audioshake_filepath}")
+            with open(transcribed_lyrics_text_audioshake_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_audioshake"] += segment["text"] + "\n"
                     f.write(segment["text"].strip() + "\n")
-        else:
-            raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
 
     def find_best_split_point(self, text, max_length):
         self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
@@ -963,45 +997,107 @@ class LyricsTranscriber:
         return new_segments
 
     def transcribe(self):
-
-
-
-
-
-
-
-
-
+        # Check cache first
+        transcription_cache_filepath_whisper = self.get_cache_filepath(" (Lyrics Whisper).json")
+        transcription_cache_filepath_audioshake = self.get_cache_filepath(" (Lyrics AudioShake).json")
+
+        self.logger.debug(f"Cache directory: {self.cache_dir}")
+        self.logger.debug(f"Output directory: {self.output_dir}")
+
+        if os.path.isfile(transcription_cache_filepath_whisper):
+            self.logger.debug(f"Found existing Whisper transcription, reading: {transcription_cache_filepath_whisper}")
+            with open(transcription_cache_filepath_whisper, "r") as cache_file:
+                self.outputs["transcription_data_dict_whisper"] = json.load(cache_file)
+                self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
+                self.logger.debug(f"Loaded Whisper data and set filepath to: {self.outputs['transcription_data_whisper_filepath']}")
+
+        if os.path.isfile(transcription_cache_filepath_audioshake):
+            self.logger.debug(f"Found existing AudioShake transcription, reading: {transcription_cache_filepath_audioshake}")
+            with open(transcription_cache_filepath_audioshake, "r") as cache_file:
+                self.outputs["transcription_data_dict_audioshake"] = json.load(cache_file)
+                self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
+
+        # If we have both cached transcriptions, set primary and return early
+        if self.outputs["transcription_data_dict_whisper"] and self.outputs["transcription_data_dict_audioshake"]:
+            self.set_primary_transcription()
+            return
+        # If we have Whisper cached and AudioShake isn't available, set primary and return early
+        elif self.outputs["transcription_data_dict_whisper"] and not self.audioshake_api_token:
+            self.set_primary_transcription()
+            return
 
-
-
+        # Continue with transcription for any missing data...
+        audioshake_job_id = None
+        if self.audioshake_api_token and not self.outputs["transcription_data_dict_audioshake"]:
+            self.logger.debug(f"Starting AudioShake transcription")
             from .audioshake_transcriber import AudioShakeTranscriber
 
             audioshake = AudioShakeTranscriber(api_token=self.audioshake_api_token, logger=self.logger, output_prefix=self.output_prefix)
-
-
+            audioshake_job_id = audioshake.start_transcription(self.audio_filepath)
+
+        # Run Whisper transcription if needed while AudioShake processes
+        if not self.outputs["transcription_data_dict_whisper"]:
             self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
             audio = whisper.load_audio(self.audio_filepath)
             model = whisper.load_model(self.transcription_model, device="cpu")
-
-
-            # auditok is needed for voice activity detection, but it has OS package dependencies that are hard to install on some platforms
-            # transcription_data = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+            whisper_data = whisper.transcribe(model, audio, language="en", beam_size=5, temperature=0.2, best_of=5)
 
             # Remove segments with no words, only music
-
-            self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(
+            whisper_data["segments"] = [segment for segment in whisper_data["segments"] if segment["text"].strip() != "Music"]
+            self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(whisper_data['segments'])}")
 
             # Split long segments
             self.logger.debug("Starting to split long segments")
-
-            self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(
-
-
-
+            whisper_data["segments"] = self.split_long_segments(whisper_data["segments"], max_length=36)
+            self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(whisper_data['segments'])}")
+
+            # Store Whisper results
+            self.outputs["transcription_data_dict_whisper"] = whisper_data
+            self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
+            with open(transcription_cache_filepath_whisper, "w") as cache_file:
+                json.dump(whisper_data, cache_file, indent=4)
+
+        # Now that Whisper is done, get AudioShake results if available
+        if audioshake_job_id:
+            self.logger.debug("Getting AudioShake results")
+            audioshake_data = audioshake.get_transcription_result(audioshake_job_id)
+            self.outputs["transcription_data_dict_audioshake"] = audioshake_data
+            self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
+            with open(transcription_cache_filepath_audioshake, "w") as cache_file:
+                json.dump(audioshake_data, cache_file, indent=4)
+
+        # Set the primary transcription source
+        self.set_primary_transcription()
+
+        # Write the text files
+        self.write_transcribed_lyrics_plain_text()
 
-
+    def set_primary_transcription(self):
+        """Set the primary transcription source (AudioShake if available, otherwise Whisper)"""
+        if self.outputs["transcription_data_dict_audioshake"]:
+            self.logger.info("Using AudioShake as primary transcription source")
+            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_audioshake"]
+            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_audioshake_filepath"]
+
+            # Set the primary text content
+            if "transcribed_lyrics_text_audioshake" not in self.outputs or not self.outputs["transcribed_lyrics_text_audioshake"]:
+                self.outputs["transcribed_lyrics_text_audioshake"] = "\n".join(
+                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]
+                )
+            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_audioshake"]
+            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_audioshake_filepath"]
+        else:
+            self.logger.info("Using Whisper as primary transcription source")
+            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_whisper"]
+            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_whisper_filepath"]
+
+            # Set the primary text content
+            if "transcribed_lyrics_text_whisper" not in self.outputs or not self.outputs["transcribed_lyrics_text_whisper"]:
+                self.outputs["transcribed_lyrics_text_whisper"] = "\n".join(
+                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_whisper"]["segments"]
+                )
+            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_whisper"]
+            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_whisper_filepath"]
 
     def get_cache_filepath(self, extension):
         # Instead of using slugify and hash, use the consistent naming pattern
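Taken together, the new `transcribe()` flow is: check both caches, start the AudioShake job (if a token is set), run Whisper locally while that job processes, fetch the AudioShake result, then let `set_primary_transcription()` prefer AudioShake over Whisper. A standalone sketch of just that selection rule, using invented transcription data and the same output keys:

```python
# Invented data illustrating the selection rule in set_primary_transcription():
# AudioShake wins whenever its transcription exists, otherwise Whisper is used.
outputs = {
    "transcription_data_dict_whisper": {"segments": [{"text": "la la la"}]},
    "transcription_data_dict_audioshake": None,  # e.g. no AUDIOSHAKE_API_TOKEN configured
}

source = "audioshake" if outputs["transcription_data_dict_audioshake"] else "whisper"
outputs["transcription_data_dict_primary"] = outputs[f"transcription_data_dict_{source}"]

print(source)  # -> "whisper"
```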
{lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.19.0"
+version = "0.19.2"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
 license = "MIT"
The remaining files (LICENSE, README.md, lyrics_transcriber/__init__.py, the llm_prompts prompt and test-data files, and the utils modules) are unchanged apart from the version-directory rename, as shown by the +0 -0 entries in the file list above.