lyrics-transcriber 0.19.0__tar.gz → 0.19.2__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (17)
  1. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/PKG-INFO +1 -1
  2. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/audioshake_transcriber.py +16 -7
  3. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/transcriber.py +166 -70
  4. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/pyproject.toml +1 -1
  5. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/LICENSE +0 -0
  6. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/README.md +0 -0
  7. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/__init__.py +0 -0
  8. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/README.md +0 -0
  9. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -0
  10. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -0
  11. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -0
  12. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -0
  13. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -0
  14. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/__init__.py +0 -0
  15. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/ass.py +0 -0
  16. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/cli.py +0 -0
  17. {lyrics_transcriber-0.19.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/subtitles.py +0 -0
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.19.0
+Version: 0.19.2
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
--- a/lyrics_transcriber/audioshake_transcriber.py
+++ b/lyrics_transcriber/audioshake_transcriber.py
@@ -11,8 +11,9 @@ class AudioShakeTranscriber:
         self.logger = logger
         self.output_prefix = output_prefix
 
-    def transcribe(self, audio_filepath):
-        self.logger.info(f"Transcribing {audio_filepath} using AudioShake API")
+    def start_transcription(self, audio_filepath):
+        """Starts the transcription job and returns the job ID without waiting for completion"""
+        self.logger.info(f"Starting transcription for {audio_filepath} using AudioShake API")
 
         # Step 1: Upload the audio file
         asset_id = self._upload_file(audio_filepath)
@@ -22,6 +23,12 @@ class AudioShakeTranscriber:
         job_id = self._create_job(asset_id)
         self.logger.info(f"Job created successfully. Job ID: {job_id}")
 
+        return job_id
+
+    def get_transcription_result(self, job_id):
+        """Gets the results for a previously started job"""
+        self.logger.info(f"Getting results for job ID: {job_id}")
+
         # Step 3: Wait for the job to complete and get the results
         result = self._get_job_result(job_id)
         self.logger.info(f"Job completed. Processing results...")
@@ -29,6 +36,11 @@ class AudioShakeTranscriber:
         # Step 4: Process the result and return in the required format
         return self._process_result(result)
 
+    def transcribe(self, audio_filepath):
+        """Original method now just combines the two steps"""
+        job_id = self.start_transcription(audio_filepath)
+        return self.get_transcription_result(job_id)
+
     def _upload_file(self, filepath):
        self.logger.info(f"Uploading {filepath} to AudioShake")
        url = f"{self.base_url}/upload"
@@ -77,13 +89,10 @@ class AudioShakeTranscriber:
         output_assets = job_data.get("outputAssets", [])
         self.logger.debug(f"Output assets: {output_assets}")
 
-        output_asset = next((asset for asset in output_assets if asset["name"] == "transcription.json"), None)
-        if not output_asset:
-            self.logger.warning("'transcription.json' not found, looking for 'alignment.json'")
-            output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
+        output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
 
         if not output_asset:
-            self.logger.error("Neither 'transcription.json' nor 'alignment.json' found in job results")
+            self.logger.error("'alignment.json' not found in job results")
             self.logger.error(f"Available output assets: {[asset['name'] for asset in output_assets]}")
             raise Exception("Required output not found in job results")
 
--- a/lyrics_transcriber/transcriber.py
+++ b/lyrics_transcriber/transcriber.py
@@ -66,7 +66,7 @@ class LyricsTranscriber:
         self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
-        self.audioshake_api_token = os.getenv("AUDIOSHAKE_TOKEN", default=audioshake_api_token)
+        self.audioshake_api_token = os.getenv("AUDIOSHAKE_API_TOKEN", default=audioshake_api_token)
 
         self.transcription_model = transcription_model
         self.llm_model = llm_model
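Note the renamed environment variable is a small breaking change: anyone exporting `AUDIOSHAKE_TOKEN` must now export `AUDIOSHAKE_API_TOKEN`. Because the lookup passes the constructor argument as the default, the environment variable wins whenever both are set; a quick sketch with hypothetical values:

```python
import os

# The env var, if set, takes precedence over the value passed to LyricsTranscriber(...).
os.environ["AUDIOSHAKE_API_TOKEN"] = "token-from-env"  # hypothetical value
audioshake_api_token = os.getenv("AUDIOSHAKE_API_TOKEN", default="token-from-arg")
print(audioshake_api_token)  # -> "token-from-env"
```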
@@ -102,7 +102,7 @@ class LyricsTranscriber:
 
             self.openai_client.log = self.log_level
         else:
-            self.logger.error("No OpenAI API key found, no correction will be applied to transcription")
+            self.logger.warning("No OpenAI API key found, no correction will be applied to transcription")
 
         self.render_video = render_video
         self.video_resolution = video_resolution
@@ -137,10 +137,18 @@ class LyricsTranscriber:
             raise FileNotFoundError(f"video_background is not a valid file path: {self.video_background_image}")
 
         self.outputs = {
-            "transcription_data_dict": None,
-            "transcription_data_filepath": None,
-            "transcribed_lyrics_text": None,
-            "transcribed_lyrics_text_filepath": None,
+            "transcription_data_dict_whisper": None,
+            "transcription_data_whisper_filepath": None,
+            "transcribed_lyrics_text_whisper": None,
+            "transcribed_lyrics_text_whisper_filepath": None,
+            "transcription_data_dict_audioshake": None,
+            "transcription_data_audioshake_filepath": None,
+            "transcribed_lyrics_text_audioshake": None,
+            "transcribed_lyrics_text_audioshake_filepath": None,
+            "transcription_data_dict_primary": None,
+            "transcription_data_primary_filepath": None,
+            "transcribed_lyrics_text_primary": None,
+            "transcribed_lyrics_text_primary_filepath": None,
             "genius_lyrics_text": None,
             "genius_lyrics_filepath": None,
             "spotify_lyrics_data_dict": None,
@@ -169,9 +177,12 @@ class LyricsTranscriber:
         self.output_prefix = f"{artist} - {title}"
 
     def generate(self):
+        self.logger.debug(f"Starting generate() with cache_dir: {self.cache_dir} and output_dir: {self.output_dir}")
+
         self.logger.debug(f"audio_filepath is set: {self.audio_filepath}, beginning initial whisper transcription")
 
         self.transcribe()
+
         self.write_transcribed_lyrics_plain_text()
 
         self.write_genius_lyrics_file()
@@ -185,7 +196,7 @@ class LyricsTranscriber:
             self.write_corrected_lyrics_plain_text()
         else:
             self.logger.warning("Skipping LLM correction as no OpenAI client is available")
-            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
             self.write_corrected_lyrics_plain_text()
 
         self.calculate_singing_percentage()
@@ -210,11 +221,15 @@ class LyricsTranscriber:
             self.output_dir = os.getcwd()
 
         self.logger.debug(f"copying temporary files to output dir: {self.output_dir}")
-
-        for key in self.outputs:
+        self.logger.debug("Files to copy:")
+        for key, value in self.outputs.items():
             if key.endswith("_filepath"):
-                if self.outputs[key] and os.path.isfile(self.outputs[key]):
-                    shutil.copy(self.outputs[key], self.output_dir)
+                self.logger.debug(f"  {key}: {value}")
+                if value and os.path.isfile(value):
+                    self.logger.debug(f"  File exists, copying to {self.output_dir}")
+                    shutil.copy(value, self.output_dir)
+                else:
+                    self.logger.debug(f"  File doesn't exist or is None")
 
         self.outputs["output_dir"] = self.output_dir
 
@@ -234,9 +249,7 @@ class LyricsTranscriber:
                 continue
 
             if self.openai_client:
-                data_input_str = (
-                    f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
-                )
+                data_input_str = f'Data input 1:\n{self.outputs["transcribed_lyrics_text_primary"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
 
                 self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
                 response = self.openai_client.chat.completions.create(
@@ -265,7 +278,7 @@ class LyricsTranscriber:
             else:
                 # Fallback primitive word matching
                 self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
-                transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
+                transcribed_words = set(self.outputs["transcribed_lyrics_text_primary"].split())
                 online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
                 common_words = transcribed_words & online_lyrics_words
                 match_percentage = len(common_words) / len(online_lyrics_words) * 100
@@ -314,7 +327,7 @@ class LyricsTranscriber:
 
         if not reference_lyrics:
             self.logger.warning("No reference lyrics found from Genius or Spotify. Skipping LLM correction.")
-            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
             return
 
         self.logger.debug(
  self.logger.debug(
@@ -335,7 +348,7 @@ class LyricsTranscriber:
335
348
  self.outputs["llm_transcript"] = ""
336
349
  self.outputs["llm_transcript_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (LLM Transcript).txt"))
337
350
 
338
- total_segments = len(self.outputs["transcription_data_dict"]["segments"])
351
+ total_segments = len(self.outputs["transcription_data_dict_primary"]["segments"])
339
352
  self.logger.info(f"Beginning correction using LLM, total segments: {total_segments}")
340
353
 
341
354
  with open(self.outputs["llm_transcript_filepath"], "a", buffering=1, encoding="utf-8") as llm_transcript_file:
@@ -345,7 +358,7 @@ class LyricsTranscriber:
             self.outputs["llm_transcript"] += llm_transcript_header
             llm_transcript_file.write(llm_transcript_header)
 
-            for segment in self.outputs["transcription_data_dict"]["segments"]:
+            for segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                 # # Don't waste OpenAI dollars when testing!
                 # if segment["id"] > 10:
                 #     continue
@@ -371,7 +384,7 @@ class LyricsTranscriber:
                     if previous_segment["id"] in (segment["id"] - 2, segment["id"] - 1):
                         previous_two_corrected_lines += previous_segment["text"].strip() + "\n"
 
-                for next_segment in self.outputs["transcription_data_dict"]["segments"]:
+                for next_segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                     if next_segment["id"] in (segment["id"] + 1, segment["id"] + 2):
                         upcoming_two_uncorrected_lines += next_segment["text"].strip() + "\n"
 
@@ -569,6 +582,7 @@ class LyricsTranscriber:
 
         genius_lyrics_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Genius).txt"))
 
+        # Check cache first
         if os.path.isfile(genius_lyrics_cache_filepath):
             self.logger.debug(f"found existing file at genius_lyrics_cache_filepath, reading: {genius_lyrics_cache_filepath}")
 
@@ -576,15 +590,21 @@ class LyricsTranscriber:
                 self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
                 self.outputs["genius_lyrics_text"] = cached_lyrics.read()
                 return
-
         self.logger.debug(f"no cached lyrics found at genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}, fetching from Genius")
-        genius = lyricsgenius.Genius(self.genius_api_token, verbose=(self.log_level == logging.DEBUG))
+
+        # Initialize Genius with better defaults
+        genius = lyricsgenius.Genius(
+            self.genius_api_token,
+            verbose=(self.log_level == logging.DEBUG),
+            remove_section_headers=True,
+        )
 
         try:
             song = self.fetch_genius_lyrics(genius, self.title, self.artist)
             if song is None:
                 self.logger.warning(f'Could not find lyrics on Genius for "{self.title}" by {self.artist}')
-                return
+                return None
+
             lyrics = self.clean_genius_lyrics(song.lyrics)
 
             self.logger.debug(f"writing clean lyrics to genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}")
@@ -593,6 +613,8 @@ class LyricsTranscriber:
 
             self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
             self.outputs["genius_lyrics_text"] = lyrics
+            return lyrics.split("\n")  # Return lines like write_lyrics_from_genius
+
         except requests.exceptions.RequestException as e:
             self.logger.error(f"Failed to fetch lyrics from Genius after multiple retries: {e}")
             raise
@@ -600,8 +622,13 @@ class LyricsTranscriber:
     def clean_genius_lyrics(self, lyrics):
         lyrics = lyrics.replace("\\n", "\n")
         lyrics = re.sub(r"You might also like", "", lyrics)
-        # Remove the song name and word "Lyrics" if this has a non-newline char at the start
-        lyrics = re.sub(r".*?Lyrics([A-Z])", r"\1", lyrics)
+        lyrics = re.sub(
+            r".*?Lyrics([A-Z])", r"\1", lyrics
+        )  # Remove the song name and word "Lyrics" if this has a non-newline char at the start
+        lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)  # Remove this example: 27 ContributorsSex Bomb Lyrics
+        lyrics = re.sub(
+            r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics
+        )  # Remove this example: See Tom Jones LiveGet tickets as low as $71
         lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)  # Remove the word "Embed" at end of line with preceding numbers if found
         lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
         lyrics = re.sub(r"^Embed$", r"", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
@@ -611,7 +638,9 @@ class LyricsTranscriber:
 
     def calculate_singing_percentage(self):
         # Calculate total seconds of singing using timings from whisper transcription results
-        total_singing_duration = sum(segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict"]["segments"])
+        total_singing_duration = sum(
+            segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict_primary"]["segments"]
+        )
 
         self.logger.debug(f"calculated total_singing_duration: {int(total_singing_duration)} seconds, now running ffprobe")
 
@@ -641,9 +670,7 @@ class LyricsTranscriber:
     # then loops over each word and writes all words with MidiCo segment start/end formatting
     # and word-level timestamps to a MidiCo-compatible LRC file
     def write_midico_lrc_file(self):
-        self.outputs["midico_lrc_filepath"] = os.path.join(
-            self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc")  # Updated suffix
-        )
+        self.outputs["midico_lrc_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc"))
 
         lrc_filename = self.outputs["midico_lrc_filepath"]
         self.logger.debug(f"writing midico formatted word timestamps to LRC file: {lrc_filename}")
@@ -660,7 +687,7 @@ class LyricsTranscriber:
                 f.write(line)
 
     def create_screens(self):
-        self.logger.debug("create_screens beginning generation of screens from whisper results")
+        self.logger.debug("create_screens beginning generation of screens from transcription results")
         screens: List[subtitles.LyricsScreen] = []
         screen: Optional[subtitles.LyricsScreen] = None
 
@@ -725,8 +752,8 @@ class LyricsTranscriber:
         ass_filepath = self.outputs["ass_subtitles_filepath"]
         self.logger.debug(f"writing ASS formatted subtitle file: {ass_filepath}")
 
-        intial_screens = self.create_screens()
-        screens = subtitles.set_segment_end_times(intial_screens, int(self.outputs["song_duration"]))
+        initial_screens = self.create_screens()
+        screens = subtitles.set_segment_end_times(initial_screens, int(self.outputs["song_duration"]))
         screens = subtitles.set_screen_start_times(screens)
         lyric_subtitles_ass = subtitles.create_styled_subtitles(screens, self.video_resolution_num, self.font_size)
         lyric_subtitles_ass.write(ass_filepath)
@@ -845,22 +872,29 @@ class LyricsTranscriber:
         return formatted_time
 
     def write_transcribed_lyrics_plain_text(self):
-        if self.outputs["transcription_data_dict"]:
-            transcription_cache_suffix = " (Lyrics AudioShake).txt" if self.audioshake_api_token else " (Lyrics Whisper).txt"
-            self.logger.debug(f"transcription_cache_suffix: {transcription_cache_suffix}")
-
-            transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, self.get_output_filename(transcription_cache_suffix))
-            self.outputs["transcribed_lyrics_text_filepath"] = transcribed_lyrics_text_filepath
+        if self.outputs["transcription_data_dict_whisper"]:
+            transcribed_lyrics_text_whisper_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Whisper).txt"))
+            self.logger.debug(f"Setting Whisper text filepath to: {transcribed_lyrics_text_whisper_filepath}")
+            self.outputs["transcribed_lyrics_text_whisper_filepath"] = transcribed_lyrics_text_whisper_filepath
+            self.outputs["transcribed_lyrics_text_whisper"] = ""
+
+            self.logger.debug(f"Writing Whisper lyrics to: {transcribed_lyrics_text_whisper_filepath}")
+            with open(transcribed_lyrics_text_whisper_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_whisper"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_whisper"] += segment["text"] + "\n"
+                    f.write(segment["text"].strip() + "\n")
+            self.logger.debug(f"Finished writing Whisper lyrics, file exists: {os.path.exists(transcribed_lyrics_text_whisper_filepath)}")
 
-            self.outputs["transcribed_lyrics_text"] = ""
+        if self.outputs["transcription_data_dict_audioshake"]:
+            transcribed_lyrics_text_audioshake_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics AudioShake).txt"))
+            self.outputs["transcribed_lyrics_text_audioshake_filepath"] = transcribed_lyrics_text_audioshake_filepath
+            self.outputs["transcribed_lyrics_text_audioshake"] = ""
 
-            self.logger.debug(f"writing lyrics plain text to transcribed_lyrics_text_filepath: {transcribed_lyrics_text_filepath}")
-            with open(transcribed_lyrics_text_filepath, "w", encoding="utf-8") as f:
-                for segment in self.outputs["transcription_data_dict"]["segments"]:
-                    self.outputs["transcribed_lyrics_text"] += segment["text"] + "\n"
+            self.logger.debug(f"Writing AudioShake lyrics to: {transcribed_lyrics_text_audioshake_filepath}")
+            with open(transcribed_lyrics_text_audioshake_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_audioshake"] += segment["text"] + "\n"
                     f.write(segment["text"].strip() + "\n")
-        else:
-            raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
 
     def find_best_split_point(self, text, max_length):
         self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
  self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
@@ -963,45 +997,107 @@ class LyricsTranscriber:
963
997
  return new_segments
964
998
 
965
999
  def transcribe(self):
966
- transcription_cache_suffix = " (AudioShake).json" if self.audioshake_api_token else " (Whisper).json"
967
- self.outputs["transcription_data_filepath"] = self.get_cache_filepath(transcription_cache_suffix)
968
-
969
- transcription_cache_filepath = self.outputs["transcription_data_filepath"]
970
- if os.path.isfile(transcription_cache_filepath):
971
- self.logger.debug(f"transcribe found existing file at transcription_cache_filepath, reading: {transcription_cache_filepath}")
972
- with open(transcription_cache_filepath, "r") as cache_file:
973
- self.outputs["transcription_data_dict"] = json.load(cache_file)
974
- return
1000
+ # Check cache first
1001
+ transcription_cache_filepath_whisper = self.get_cache_filepath(" (Lyrics Whisper).json")
1002
+ transcription_cache_filepath_audioshake = self.get_cache_filepath(" (Lyrics AudioShake).json")
1003
+
1004
+ self.logger.debug(f"Cache directory: {self.cache_dir}")
1005
+ self.logger.debug(f"Output directory: {self.output_dir}")
1006
+
1007
+ if os.path.isfile(transcription_cache_filepath_whisper):
1008
+ self.logger.debug(f"Found existing Whisper transcription, reading: {transcription_cache_filepath_whisper}")
1009
+ with open(transcription_cache_filepath_whisper, "r") as cache_file:
1010
+ self.outputs["transcription_data_dict_whisper"] = json.load(cache_file)
1011
+ self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
1012
+ self.logger.debug(f"Loaded Whisper data and set filepath to: {self.outputs['transcription_data_whisper_filepath']}")
1013
+
1014
+ if os.path.isfile(transcription_cache_filepath_audioshake):
1015
+ self.logger.debug(f"Found existing AudioShake transcription, reading: {transcription_cache_filepath_audioshake}")
1016
+ with open(transcription_cache_filepath_audioshake, "r") as cache_file:
1017
+ self.outputs["transcription_data_dict_audioshake"] = json.load(cache_file)
1018
+ self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
1019
+
1020
+ # If we have both cached transcriptions, set primary and return early
1021
+ if self.outputs["transcription_data_dict_whisper"] and self.outputs["transcription_data_dict_audioshake"]:
1022
+ self.set_primary_transcription()
1023
+ return
1024
+ # If we have Whisper cached and AudioShake isn't available, set primary and return early
1025
+ elif self.outputs["transcription_data_dict_whisper"] and not self.audioshake_api_token:
1026
+ self.set_primary_transcription()
1027
+ return
975
1028
 
976
- if self.audioshake_api_token:
977
- self.logger.debug(f"Using AudioShake API for transcription")
1029
+ # Continue with transcription for any missing data...
1030
+ audioshake_job_id = None
1031
+ if self.audioshake_api_token and not self.outputs["transcription_data_dict_audioshake"]:
1032
+ self.logger.debug(f"Starting AudioShake transcription")
978
1033
  from .audioshake_transcriber import AudioShakeTranscriber
979
1034
 
980
1035
  audioshake = AudioShakeTranscriber(api_token=self.audioshake_api_token, logger=self.logger, output_prefix=self.output_prefix)
981
- transcription_data = audioshake.transcribe(self.audio_filepath)
982
- else:
1036
+ audioshake_job_id = audioshake.start_transcription(self.audio_filepath)
1037
+
1038
+ # Run Whisper transcription if needed while AudioShake processes
1039
+ if not self.outputs["transcription_data_dict_whisper"]:
983
1040
  self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
984
1041
  audio = whisper.load_audio(self.audio_filepath)
985
1042
  model = whisper.load_model(self.transcription_model, device="cpu")
986
- transcription_data = whisper.transcribe(model, audio, language="en", beam_size=5, temperature=0.2, best_of=5)
987
-
988
- # auditok is needed for voice activity detection, but it has OS package dependencies that are hard to install on some platforms
989
- # transcription_data = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
1043
+ whisper_data = whisper.transcribe(model, audio, language="en", beam_size=5, temperature=0.2, best_of=5)
990
1044
 
991
1045
  # Remove segments with no words, only music
992
- transcription_data["segments"] = [segment for segment in transcription_data["segments"] if segment["text"].strip() != "Music"]
993
- self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(transcription_data['segments'])}")
1046
+ whisper_data["segments"] = [segment for segment in whisper_data["segments"] if segment["text"].strip() != "Music"]
1047
+ self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(whisper_data['segments'])}")
994
1048
 
995
1049
  # Split long segments
996
1050
  self.logger.debug("Starting to split long segments")
997
- transcription_data["segments"] = self.split_long_segments(transcription_data["segments"], max_length=36)
998
- self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(transcription_data['segments'])}")
999
-
1000
- self.logger.debug(f"writing transcription data JSON to cache file: {transcription_cache_filepath}")
1001
- with open(transcription_cache_filepath, "w") as cache_file:
1002
- json.dump(transcription_data, cache_file, indent=4)
1051
+ whisper_data["segments"] = self.split_long_segments(whisper_data["segments"], max_length=36)
1052
+ self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(whisper_data['segments'])}")
1053
+
1054
+ # Store Whisper results
1055
+ self.outputs["transcription_data_dict_whisper"] = whisper_data
1056
+ self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
1057
+ with open(transcription_cache_filepath_whisper, "w") as cache_file:
1058
+ json.dump(whisper_data, cache_file, indent=4)
1059
+
1060
+ # Now that Whisper is done, get AudioShake results if available
1061
+ if audioshake_job_id:
1062
+ self.logger.debug("Getting AudioShake results")
1063
+ audioshake_data = audioshake.get_transcription_result(audioshake_job_id)
1064
+ self.outputs["transcription_data_dict_audioshake"] = audioshake_data
1065
+ self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
1066
+ with open(transcription_cache_filepath_audioshake, "w") as cache_file:
1067
+ json.dump(audioshake_data, cache_file, indent=4)
1068
+
1069
+ # Set the primary transcription source
1070
+ self.set_primary_transcription()
1071
+
1072
+ # Write the text files
1073
+ self.write_transcribed_lyrics_plain_text()
1003
1074
 
1004
- self.outputs["transcription_data_dict"] = transcription_data
1075
+ def set_primary_transcription(self):
1076
+ """Set the primary transcription source (AudioShake if available, otherwise Whisper)"""
1077
+ if self.outputs["transcription_data_dict_audioshake"]:
1078
+ self.logger.info("Using AudioShake as primary transcription source")
1079
+ self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_audioshake"]
1080
+ self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_audioshake_filepath"]
1081
+
1082
+ # Set the primary text content
1083
+ if "transcribed_lyrics_text_audioshake" not in self.outputs or not self.outputs["transcribed_lyrics_text_audioshake"]:
1084
+ self.outputs["transcribed_lyrics_text_audioshake"] = "\n".join(
1085
+ segment["text"].strip() for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]
1086
+ )
1087
+ self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_audioshake"]
1088
+ self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_audioshake_filepath"]
1089
+ else:
1090
+ self.logger.info("Using Whisper as primary transcription source")
1091
+ self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_whisper"]
1092
+ self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_whisper_filepath"]
1093
+
1094
+ # Set the primary text content
1095
+ if "transcribed_lyrics_text_whisper" not in self.outputs or not self.outputs["transcribed_lyrics_text_whisper"]:
1096
+ self.outputs["transcribed_lyrics_text_whisper"] = "\n".join(
1097
+ segment["text"].strip() for segment in self.outputs["transcription_data_dict_whisper"]["segments"]
1098
+ )
1099
+ self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_whisper"]
1100
+ self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_whisper_filepath"]
1005
1101
 
1006
1102
  def get_cache_filepath(self, extension):
1007
1103
  # Instead of using slugify and hash, use the consistent naming pattern
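The rewritten transcribe() checks the per-source caches, starts the AudioShake job first, runs Whisper locally while that job processes, then fetches the AudioShake result and delegates to set_primary_transcription(). The selection rule itself reduces to a simple precedence, distilled here as a standalone sketch (function and variable names are illustrative, not from the package):

```python
from typing import Optional


def choose_primary(audioshake_data: Optional[dict], whisper_data: Optional[dict]) -> Optional[dict]:
    """Mirror of the selection rule above: prefer AudioShake, fall back to Whisper."""
    if audioshake_data:
        return audioshake_data
    return whisper_data


# Both sources present: AudioShake wins.
assert choose_primary({"segments": ["as"]}, {"segments": ["wh"]}) == {"segments": ["as"]}
# AudioShake unavailable (no API token, or the job returned nothing): Whisper is used.
assert choose_primary(None, {"segments": ["wh"]}) == {"segments": ["wh"]}
```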
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.19.0"
+version = "0.19.2"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
 license = "MIT"