lyrics-transcriber 0.18.0__tar.gz → 0.19.2__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/PKG-INFO +1 -1
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/audioshake_transcriber.py +24 -8
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/transcriber.py +196 -84
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/pyproject.toml +1 -1
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/LICENSE +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/README.md +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/__init__.py +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/README.md +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/__init__.py +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/ass.py +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/cli.py +0 -0
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/subtitles.py +0 -0
{lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/PKG-INFO RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.18.0
+Version: 0.19.2
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
{lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/audioshake_transcriber.py RENAMED
@@ -5,13 +5,15 @@ import json
 
 
 class AudioShakeTranscriber:
-    def __init__(self, api_token, logger):
+    def __init__(self, api_token, logger, output_prefix):
         self.api_token = api_token
         self.base_url = "https://groovy.audioshake.ai"
         self.logger = logger
+        self.output_prefix = output_prefix
 
-    def transcribe(self, audio_filepath):
-
+    def start_transcription(self, audio_filepath):
+        """Starts the transcription job and returns the job ID without waiting for completion"""
+        self.logger.info(f"Starting transcription for {audio_filepath} using AudioShake API")
 
         # Step 1: Upload the audio file
         asset_id = self._upload_file(audio_filepath)
@@ -21,6 +23,12 @@ class AudioShakeTranscriber:
         job_id = self._create_job(asset_id)
         self.logger.info(f"Job created successfully. Job ID: {job_id}")
 
+        return job_id
+
+    def get_transcription_result(self, job_id):
+        """Gets the results for a previously started job"""
+        self.logger.info(f"Getting results for job ID: {job_id}")
+
         # Step 3: Wait for the job to complete and get the results
         result = self._get_job_result(job_id)
         self.logger.info(f"Job completed. Processing results...")
@@ -28,6 +36,11 @@ class AudioShakeTranscriber:
         # Step 4: Process the result and return in the required format
         return self._process_result(result)
 
+    def transcribe(self, audio_filepath):
+        """Original method now just combines the two steps"""
+        job_id = self.start_transcription(audio_filepath)
+        return self.get_transcription_result(job_id)
+
     def _upload_file(self, filepath):
         self.logger.info(f"Uploading {filepath} to AudioShake")
         url = f"{self.base_url}/upload"
@@ -76,13 +89,10 @@ class AudioShakeTranscriber:
         output_assets = job_data.get("outputAssets", [])
         self.logger.debug(f"Output assets: {output_assets}")
 
-        output_asset = next((asset for asset in output_assets if asset["name"] == "transcription.json"), None)
-        if not output_asset:
-            self.logger.warning("'transcription.json' not found, looking for 'alignment.json'")
-            output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
+        output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
 
         if not output_asset:
-            self.logger.error("
+            self.logger.error("'alignment.json' not found in job results")
             self.logger.error(f"Available output assets: {[asset['name'] for asset in output_assets]}")
             raise Exception("Required output not found in job results")
 
@@ -103,4 +113,10 @@ class AudioShakeTranscriber:
             if "text" not in segment:
                 segment["text"] = " ".join(word["text"] for word in segment["words"])
 
+        transcription_data["output_filename"] = self.get_output_filename(" (AudioShake)")
+
         return transcription_data
+
+    def get_output_filename(self, suffix):
+        """Generate consistent filename with (Purpose) suffix pattern"""
+        return f"{self.output_prefix}{suffix}"
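The audioshake_transcriber.py changes above split the old blocking transcribe() call into a job-start step and a result-fetch step, so a caller can do other work while AudioShake processes the file remotely. A minimal usage sketch of the new call pattern (the token, logger setup, and file path are illustrative, not from the package):

    import logging

    from lyrics_transcriber.audioshake_transcriber import AudioShakeTranscriber

    logger = logging.getLogger(__name__)

    transcriber = AudioShakeTranscriber(
        api_token="your-audioshake-token",  # hypothetical token
        logger=logger,
        output_prefix="Artist - Title",  # consumed by get_output_filename()
    )

    job_id = transcriber.start_transcription("song.flac")  # returns as soon as the job is created
    # ... do other work here, e.g. a local Whisper pass ...
    result = transcriber.get_transcription_result(job_id)  # waits for the job and processes the output

The old single-call behaviour is preserved: transcribe() now just runs these two steps back to back.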
{lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/transcriber.py RENAMED

@@ -66,7 +66,7 @@ class LyricsTranscriber:
         self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
-        self.audioshake_api_token = os.getenv("
+        self.audioshake_api_token = os.getenv("AUDIOSHAKE_API_TOKEN", default=audioshake_api_token)
 
         self.transcription_model = transcription_model
         self.llm_model = llm_model
@@ -102,7 +102,7 @@ class LyricsTranscriber:
 
             self.openai_client.log = self.log_level
         else:
-            self.logger.
+            self.logger.warning("No OpenAI API key found, no correction will be applied to transcription")
 
         self.render_video = render_video
         self.video_resolution = video_resolution
@@ -137,10 +137,18 @@ class LyricsTranscriber:
             raise FileNotFoundError(f"video_background is not a valid file path: {self.video_background_image}")
 
         self.outputs = {
-            "
-            "
-            "
-            "
+            "transcription_data_dict_whisper": None,
+            "transcription_data_whisper_filepath": None,
+            "transcribed_lyrics_text_whisper": None,
+            "transcribed_lyrics_text_whisper_filepath": None,
+            "transcription_data_dict_audioshake": None,
+            "transcription_data_audioshake_filepath": None,
+            "transcribed_lyrics_text_audioshake": None,
+            "transcribed_lyrics_text_audioshake_filepath": None,
+            "transcription_data_dict_primary": None,
+            "transcription_data_primary_filepath": None,
+            "transcribed_lyrics_text_primary": None,
+            "transcribed_lyrics_text_primary_filepath": None,
             "genius_lyrics_text": None,
             "genius_lyrics_filepath": None,
             "spotify_lyrics_data_dict": None,
@@ -166,10 +174,15 @@ class LyricsTranscriber:
 
         self.create_folders()
 
+        self.output_prefix = f"{artist} - {title}"
+
     def generate(self):
+        self.logger.debug(f"Starting generate() with cache_dir: {self.cache_dir} and output_dir: {self.output_dir}")
+
         self.logger.debug(f"audio_filepath is set: {self.audio_filepath}, beginning initial whisper transcription")
 
         self.transcribe()
+
         self.write_transcribed_lyrics_plain_text()
 
         self.write_genius_lyrics_file()
@@ -183,7 +196,7 @@ class LyricsTranscriber:
             self.write_corrected_lyrics_plain_text()
         else:
             self.logger.warning("Skipping LLM correction as no OpenAI client is available")
-            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
             self.write_corrected_lyrics_plain_text()
 
         self.calculate_singing_percentage()
@@ -208,11 +221,15 @@ class LyricsTranscriber:
             self.output_dir = os.getcwd()
 
         self.logger.debug(f"copying temporary files to output dir: {self.output_dir}")
-
-        for key in self.outputs:
+        self.logger.debug("Files to copy:")
+        for key, value in self.outputs.items():
             if key.endswith("_filepath"):
-
-
+                self.logger.debug(f" {key}: {value}")
+                if value and os.path.isfile(value):
+                    self.logger.debug(f" File exists, copying to {self.output_dir}")
+                    shutil.copy(value, self.output_dir)
+                else:
+                    self.logger.debug(f" File doesn't exist or is None")
 
         self.outputs["output_dir"] = self.output_dir
 
@@ -232,9 +249,7 @@ class LyricsTranscriber:
                 continue
 
             if self.openai_client:
-                data_input_str = (
-                    f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
-                )
+                data_input_str = f'Data input 1:\n{self.outputs["transcribed_lyrics_text_primary"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
 
                 self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
                 response = self.openai_client.chat.completions.create(
@@ -263,7 +278,7 @@ class LyricsTranscriber:
             else:
                 # Fallback primitive word matching
                 self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
-                transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
+                transcribed_words = set(self.outputs["transcribed_lyrics_text_primary"].split())
                 online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
                 common_words = transcribed_words & online_lyrics_words
                 match_percentage = len(common_words) / len(online_lyrics_words) * 100
@@ -294,7 +309,7 @@ class LyricsTranscriber:
 
         self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")
 
-        corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir,
+        corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).json"))
 
         if os.path.isfile(corrected_lyrics_data_json_cache_filepath):
             self.logger.debug(
@@ -312,7 +327,7 @@ class LyricsTranscriber:
 
         if not reference_lyrics:
             self.logger.warning("No reference lyrics found from Genius or Spotify. Skipping LLM correction.")
-            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
             return
 
         self.logger.debug(
@@ -331,11 +346,9 @@ class LyricsTranscriber:
         # TODO: Possibly add a step after segment-based correct to get the LLM to self-analyse the diff
 
         self.outputs["llm_transcript"] = ""
-        self.outputs["llm_transcript_filepath"] = os.path.join(
-            self.cache_dir, "lyrics-" + self.get_song_slug() + "-llm-correction-transcript.txt"
-        )
+        self.outputs["llm_transcript_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (LLM Transcript).txt"))
 
-        total_segments = len(self.outputs["transcription_data_dict"]["segments"])
+        total_segments = len(self.outputs["transcription_data_dict_primary"]["segments"])
         self.logger.info(f"Beginning correction using LLM, total segments: {total_segments}")
 
         with open(self.outputs["llm_transcript_filepath"], "a", buffering=1, encoding="utf-8") as llm_transcript_file:
@@ -345,7 +358,7 @@ class LyricsTranscriber:
             self.outputs["llm_transcript"] += llm_transcript_header
             llm_transcript_file.write(llm_transcript_header)
 
-            for segment in self.outputs["transcription_data_dict"]["segments"]:
+            for segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                 # # Don't waste OpenAI dollars when testing!
                 # if segment["id"] > 10:
                 #     continue
@@ -371,7 +384,7 @@ class LyricsTranscriber:
                     if previous_segment["id"] in (segment["id"] - 2, segment["id"] - 1):
                         previous_two_corrected_lines += previous_segment["text"].strip() + "\n"
 
-                for next_segment in self.outputs["transcription_data_dict"]["segments"]:
+                for next_segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                     if next_segment["id"] in (segment["id"] + 1, segment["id"] + 2):
                         upcoming_two_uncorrected_lines += next_segment["text"].strip() + "\n"
 
@@ -466,7 +479,9 @@ class LyricsTranscriber:
         if self.outputs["corrected_lyrics_data_dict"]:
             self.logger.debug(f"corrected_lyrics_data_dict exists, writing plain text lyrics file")
 
-            corrected_lyrics_text_filepath = os.path.join(
+            corrected_lyrics_text_filepath = os.path.join(
+                self.cache_dir, self.get_output_filename(" (Lyrics Corrected).txt")  # Updated to use consistent naming
+            )
             self.outputs["corrected_lyrics_text_filepath"] = corrected_lyrics_text_filepath
 
             self.outputs["corrected_lyrics_text"] = ""
@@ -475,7 +490,7 @@ class LyricsTranscriber:
             with open(corrected_lyrics_text_filepath, "w", encoding="utf-8") as f:
                 for corrected_segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
                     self.outputs["corrected_lyrics_text"] += corrected_segment["text"].strip() + "\n"
-                    f.write(corrected_segment["text".strip()
+                    f.write(corrected_segment["text"].strip() + "\n")
 
     def write_spotify_lyrics_data_file(self):
         if self.spotify_cookie and self.song_known:
@@ -484,7 +499,9 @@ class LyricsTranscriber:
             self.logger.warning(f"skipping spotify fetch as not all spotify params were set")
             return
 
-        spotify_lyrics_data_json_cache_filepath = os.path.join(
+        spotify_lyrics_data_json_cache_filepath = os.path.join(
+            self.cache_dir, self.get_output_filename(" (Lyrics Spotify).json")  # Updated to use consistent naming
+        )
 
         if os.path.isfile(spotify_lyrics_data_json_cache_filepath):
             self.logger.debug(
@@ -531,7 +548,9 @@ class LyricsTranscriber:
         if self.outputs["spotify_lyrics_data_dict"]:
             self.logger.debug(f"spotify_lyrics data found, checking/writing plain text lyrics file")
 
-            spotify_lyrics_text_filepath = os.path.join(
+            spotify_lyrics_text_filepath = os.path.join(
+                self.cache_dir, self.get_output_filename(" (Lyrics Spotify).txt")  # Updated to use consistent naming
+            )
             self.outputs["spotify_lyrics_text_filepath"] = spotify_lyrics_text_filepath
 
             lines = self.outputs["spotify_lyrics_data_dict"]["lyrics"]["lines"]
@@ -561,8 +580,9 @@ class LyricsTranscriber:
             self.logger.warning(f"skipping genius fetch as not all genius params were set")
             return
 
-        genius_lyrics_cache_filepath = os.path.join(self.cache_dir,
+        genius_lyrics_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Genius).txt"))
 
+        # Check cache first
         if os.path.isfile(genius_lyrics_cache_filepath):
            self.logger.debug(f"found existing file at genius_lyrics_cache_filepath, reading: {genius_lyrics_cache_filepath}")
 
@@ -570,15 +590,21 @@ class LyricsTranscriber:
             self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
             self.outputs["genius_lyrics_text"] = cached_lyrics.read()
             return
-
         self.logger.debug(f"no cached lyrics found at genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}, fetching from Genius")
-
+
+        # Initialize Genius with better defaults
+        genius = lyricsgenius.Genius(
+            self.genius_api_token,
+            verbose=(self.log_level == logging.DEBUG),
+            remove_section_headers=True,
+        )
 
         try:
             song = self.fetch_genius_lyrics(genius, self.title, self.artist)
             if song is None:
                 self.logger.warning(f'Could not find lyrics on Genius for "{self.title}" by {self.artist}')
-                return
+                return None
+
             lyrics = self.clean_genius_lyrics(song.lyrics)
 
             self.logger.debug(f"writing clean lyrics to genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}")
@@ -587,6 +613,8 @@ class LyricsTranscriber:
 
             self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
             self.outputs["genius_lyrics_text"] = lyrics
+            return lyrics.split("\n")  # Return lines like write_lyrics_from_genius
+
         except requests.exceptions.RequestException as e:
             self.logger.error(f"Failed to fetch lyrics from Genius after multiple retries: {e}")
             raise
@@ -594,8 +622,13 @@ class LyricsTranscriber:
     def clean_genius_lyrics(self, lyrics):
         lyrics = lyrics.replace("\\n", "\n")
         lyrics = re.sub(r"You might also like", "", lyrics)
-
-
+        lyrics = re.sub(
+            r".*?Lyrics([A-Z])", r"\1", lyrics
+        )  # Remove the song name and word "Lyrics" if this has a non-newline char at the start
+        lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)  # Remove this example: 27 ContributorsSex Bomb Lyrics
+        lyrics = re.sub(
+            r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics
+        )  # Remove this example: See Tom Jones LiveGet tickets as low as $71
         lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)  # Remove the word "Embed" at end of line with preceding numbers if found
         lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
         lyrics = re.sub(r"^Embed$", r"", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
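The new substitutions extend clean_genius_lyrics() to strip more Genius page furniture (contributor counts, the repeated song title plus the word "Lyrics", and ticket ads) from the scraped text. A rough illustration of a subset of the rules on an invented input string; in the real method the non-greedy .*?Lyrics([A-Z]) rule would fire first:

    import re

    raw = "27 ContributorsSex Bomb LyricsSex bomb, sex bomb, you're my sex bomb\nYou might also like3Embed"

    cleaned = re.sub(r"You might also like", "", raw)
    cleaned = re.sub(r"^[0-9]* Contributors.*Lyrics", "", cleaned)  # strips "27 ContributorsSex Bomb Lyrics"
    cleaned = re.sub(r"[0-9]+Embed$", "", cleaned)                  # strips the trailing "3Embed"

    print(cleaned)  # -> "Sex bomb, sex bomb, you're my sex bomb\n"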
@@ -605,7 +638,9 @@ class LyricsTranscriber:
 
     def calculate_singing_percentage(self):
         # Calculate total seconds of singing using timings from whisper transcription results
-        total_singing_duration = sum(
+        total_singing_duration = sum(
+            segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict_primary"]["segments"]
+        )
 
         self.logger.debug(f"calculated total_singing_duration: {int(total_singing_duration)} seconds, now running ffprobe")
 
@@ -635,7 +670,7 @@ class LyricsTranscriber:
     # then loops over each word and writes all words with MidiCo segment start/end formatting
    # and word-level timestamps to a MidiCo-compatible LRC file
     def write_midico_lrc_file(self):
-        self.outputs["midico_lrc_filepath"] = self.
+        self.outputs["midico_lrc_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc"))
 
         lrc_filename = self.outputs["midico_lrc_filepath"]
         self.logger.debug(f"writing midico formatted word timestamps to LRC file: {lrc_filename}")
@@ -652,7 +687,7 @@ class LyricsTranscriber:
             f.write(line)
 
     def create_screens(self):
-        self.logger.debug("create_screens beginning generation of screens from
+        self.logger.debug("create_screens beginning generation of screens from transcription results")
         screens: List[subtitles.LyricsScreen] = []
         screen: Optional[subtitles.LyricsScreen] = None
 
@@ -692,9 +727,15 @@ class LyricsTranscriber:
                     self.logger.debug("Reset current line")
 
                 current_line_text += (" " if current_line_text else "") + word["text"]
+
+                # fmt: off
                 lyric_segment = subtitles.LyricSegment(
-                    text=word["text"],
+                    text=word["text"],
+                    ts=timedelta(seconds=word["start"]),
+                    end_ts=timedelta(seconds=word["end"])
                 )
+                # fmt: on
+
                 current_line.segments.append(lyric_segment)
                 self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")
 
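The # fmt: off / # fmt: on pair added here fences the LyricSegment(...) call so that an auto-formatter such as Black leaves the hand-laid-out keyword arguments alone; the directives have no effect at runtime.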
@@ -706,13 +747,13 @@ class LyricsTranscriber:
         return screens
 
     def write_ass_file(self):
-        self.outputs["ass_subtitles_filepath"] = self.
+        self.outputs["ass_subtitles_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).ass"))
 
         ass_filepath = self.outputs["ass_subtitles_filepath"]
         self.logger.debug(f"writing ASS formatted subtitle file: {ass_filepath}")
 
-
-        screens = subtitles.set_segment_end_times(
+        initial_screens = self.create_screens()
+        screens = subtitles.set_segment_end_times(initial_screens, int(self.outputs["song_duration"]))
         screens = subtitles.set_screen_start_times(screens)
         lyric_subtitles_ass = subtitles.create_styled_subtitles(screens, self.video_resolution_num, self.font_size)
         lyric_subtitles_ass.write(ass_filepath)
@@ -831,22 +872,29 @@ class LyricsTranscriber:
         return formatted_time
 
     def write_transcribed_lyrics_plain_text(self):
-        if self.outputs["
-
-        self.logger.debug(f"
-
-
-
+        if self.outputs["transcription_data_dict_whisper"]:
+            transcribed_lyrics_text_whisper_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Whisper).txt"))
+            self.logger.debug(f"Setting Whisper text filepath to: {transcribed_lyrics_text_whisper_filepath}")
+            self.outputs["transcribed_lyrics_text_whisper_filepath"] = transcribed_lyrics_text_whisper_filepath
+            self.outputs["transcribed_lyrics_text_whisper"] = ""
+
+            self.logger.debug(f"Writing Whisper lyrics to: {transcribed_lyrics_text_whisper_filepath}")
+            with open(transcribed_lyrics_text_whisper_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_whisper"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_whisper"] += segment["text"] + "\n"
+                    f.write(segment["text"].strip() + "\n")
+            self.logger.debug(f"Finished writing Whisper lyrics, file exists: {os.path.exists(transcribed_lyrics_text_whisper_filepath)}")
 
-
+        if self.outputs["transcription_data_dict_audioshake"]:
+            transcribed_lyrics_text_audioshake_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics AudioShake).txt"))
+            self.outputs["transcribed_lyrics_text_audioshake_filepath"] = transcribed_lyrics_text_audioshake_filepath
+            self.outputs["transcribed_lyrics_text_audioshake"] = ""
 
-        self.logger.debug(f"
-        with open(
-            for segment in self.outputs["
-                self.outputs["
+            self.logger.debug(f"Writing AudioShake lyrics to: {transcribed_lyrics_text_audioshake_filepath}")
+            with open(transcribed_lyrics_text_audioshake_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_audioshake"] += segment["text"] + "\n"
                     f.write(segment["text"].strip() + "\n")
-        else:
-            raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
 
     def find_best_split_point(self, text, max_length):
         self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
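Note the behavioural change in write_transcribed_lyrics_plain_text(): the old version raised an exception when no transcription data was set, while the rewrite writes a plain-text file for whichever sources (Whisper, AudioShake) are present and silently does nothing otherwise.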
@@ -949,51 +997,111 @@ class LyricsTranscriber:
         return new_segments
 
     def transcribe(self):
-
-
-
-
-
-
-
-
-
+        # Check cache first
+        transcription_cache_filepath_whisper = self.get_cache_filepath(" (Lyrics Whisper).json")
+        transcription_cache_filepath_audioshake = self.get_cache_filepath(" (Lyrics AudioShake).json")
+
+        self.logger.debug(f"Cache directory: {self.cache_dir}")
+        self.logger.debug(f"Output directory: {self.output_dir}")
+
+        if os.path.isfile(transcription_cache_filepath_whisper):
+            self.logger.debug(f"Found existing Whisper transcription, reading: {transcription_cache_filepath_whisper}")
+            with open(transcription_cache_filepath_whisper, "r") as cache_file:
+                self.outputs["transcription_data_dict_whisper"] = json.load(cache_file)
+                self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
+                self.logger.debug(f"Loaded Whisper data and set filepath to: {self.outputs['transcription_data_whisper_filepath']}")
+
+        if os.path.isfile(transcription_cache_filepath_audioshake):
+            self.logger.debug(f"Found existing AudioShake transcription, reading: {transcription_cache_filepath_audioshake}")
+            with open(transcription_cache_filepath_audioshake, "r") as cache_file:
+                self.outputs["transcription_data_dict_audioshake"] = json.load(cache_file)
+                self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
+
+        # If we have both cached transcriptions, set primary and return early
+        if self.outputs["transcription_data_dict_whisper"] and self.outputs["transcription_data_dict_audioshake"]:
+            self.set_primary_transcription()
+            return
+        # If we have Whisper cached and AudioShake isn't available, set primary and return early
+        elif self.outputs["transcription_data_dict_whisper"] and not self.audioshake_api_token:
+            self.set_primary_transcription()
+            return
 
-
-
+        # Continue with transcription for any missing data...
+        audioshake_job_id = None
+        if self.audioshake_api_token and not self.outputs["transcription_data_dict_audioshake"]:
+            self.logger.debug(f"Starting AudioShake transcription")
             from .audioshake_transcriber import AudioShakeTranscriber
 
-            audioshake = AudioShakeTranscriber(self.audioshake_api_token, logger=self.logger)
-
-
+            audioshake = AudioShakeTranscriber(api_token=self.audioshake_api_token, logger=self.logger, output_prefix=self.output_prefix)
+            audioshake_job_id = audioshake.start_transcription(self.audio_filepath)
+
+        # Run Whisper transcription if needed while AudioShake processes
+        if not self.outputs["transcription_data_dict_whisper"]:
             self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
             audio = whisper.load_audio(self.audio_filepath)
             model = whisper.load_model(self.transcription_model, device="cpu")
-
-
-            # auditok is needed for voice activity detection, but it has OS package dependencies that are hard to install on some platforms
-            # transcription_data = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+            whisper_data = whisper.transcribe(model, audio, language="en", beam_size=5, temperature=0.2, best_of=5)
 
             # Remove segments with no words, only music
-
-            self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(
+            whisper_data["segments"] = [segment for segment in whisper_data["segments"] if segment["text"].strip() != "Music"]
+            self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(whisper_data['segments'])}")
 
             # Split long segments
             self.logger.debug("Starting to split long segments")
-
-            self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(
-
-
-
-
+            whisper_data["segments"] = self.split_long_segments(whisper_data["segments"], max_length=36)
+            self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(whisper_data['segments'])}")
+
+            # Store Whisper results
+            self.outputs["transcription_data_dict_whisper"] = whisper_data
+            self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
+            with open(transcription_cache_filepath_whisper, "w") as cache_file:
+                json.dump(whisper_data, cache_file, indent=4)
+
+        # Now that Whisper is done, get AudioShake results if available
+        if audioshake_job_id:
+            self.logger.debug("Getting AudioShake results")
+            audioshake_data = audioshake.get_transcription_result(audioshake_job_id)
+            self.outputs["transcription_data_dict_audioshake"] = audioshake_data
+            self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
+            with open(transcription_cache_filepath_audioshake, "w") as cache_file:
+                json.dump(audioshake_data, cache_file, indent=4)
+
+        # Set the primary transcription source
+        self.set_primary_transcription()
+
+        # Write the text files
+        self.write_transcribed_lyrics_plain_text()
 
-
+    def set_primary_transcription(self):
+        """Set the primary transcription source (AudioShake if available, otherwise Whisper)"""
+        if self.outputs["transcription_data_dict_audioshake"]:
+            self.logger.info("Using AudioShake as primary transcription source")
+            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_audioshake"]
+            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_audioshake_filepath"]
+
+            # Set the primary text content
+            if "transcribed_lyrics_text_audioshake" not in self.outputs or not self.outputs["transcribed_lyrics_text_audioshake"]:
+                self.outputs["transcribed_lyrics_text_audioshake"] = "\n".join(
+                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]
+                )
+            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_audioshake"]
+            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_audioshake_filepath"]
+        else:
+            self.logger.info("Using Whisper as primary transcription source")
+            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_whisper"]
+            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_whisper_filepath"]
+
+            # Set the primary text content
+            if "transcribed_lyrics_text_whisper" not in self.outputs or not self.outputs["transcribed_lyrics_text_whisper"]:
+                self.outputs["transcribed_lyrics_text_whisper"] = "\n".join(
+                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_whisper"]["segments"]
+                )
+            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_whisper"]
+            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_whisper_filepath"]
 
     def get_cache_filepath(self, extension):
-
-
-        hash_value = self.get_file_hash(self.audio_filepath)
-        cache_filepath = os.path.join(self.cache_dir, filename_slug + "_" + hash_value + extension)
+        # Instead of using slugify and hash, use the consistent naming pattern
+        cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(extension))
         self.logger.debug(f"get_cache_filepath returning cache_filepath: {cache_filepath}")
         return cache_filepath
 
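The reworked transcribe() is the payoff of the two-phase AudioShake API: the remote job is started first, the local Whisper pass runs while it is in flight, and the remote result is only collected at the end. Reduced to a schematic (the function parameters here are illustrative stand-ins, not the module's code):

    def transcribe_with_overlap(start_remote, run_local, fetch_remote):
        # Kick off the slow, server-side job first; it runs while we work locally.
        job_id = start_remote()
        # Do the CPU-bound local work while the remote job is in flight.
        local_result = run_local()
        # Only now block on the remote job; much of its latency is already hidden.
        remote_result = fetch_remote(job_id) if job_id else None
        # Prefer the remote result when present, mirroring set_primary_transcription().
        return remote_result or local_result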
@@ -1014,3 +1122,7 @@ class LyricsTranscriber:
 
         if self.output_dir is not None:
             os.makedirs(self.output_dir, exist_ok=True)
+
+    def get_output_filename(self, suffix):
+        """Generate consistent filename with (Purpose) suffix pattern"""
+        return f"{self.output_prefix}{suffix}"
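Both classes now expose the same get_output_filename() helper, so every artifact, cache files included via the rewritten get_cache_filepath(), is named as the output prefix, f"{artist} - {title}", plus a "(Purpose)" suffix. For example, with the artist and title borrowed from the bundled test data (the resulting names are illustrative):

    output_prefix = "ABBA - Under Attack"  # set in __init__ as f"{artist} - {title}"

    print(output_prefix + " (Lyrics Whisper).txt")    # ABBA - Under Attack (Lyrics Whisper).txt
    print(output_prefix + " (Lyrics Genius).txt")     # ABBA - Under Attack (Lyrics Genius).txt
    print(output_prefix + " (Lyrics Corrected).lrc")  # ABBA - Under Attack (Lyrics Corrected).lrc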
{lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/pyproject.toml RENAMED

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.18.0"
+version = "0.19.2"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
 license = "MIT"
Files without changes:

- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/LICENSE
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/README.md
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/__init__.py
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/README.md
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/__init__.py
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/ass.py
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/cli.py
- {lyrics_transcriber-0.18.0 → lyrics_transcriber-0.19.2}/lyrics_transcriber/utils/subtitles.py