lyrics-transcriber 0.19.0__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- lyrics_transcriber/audioshake_transcriber.py +16 -7
- lyrics_transcriber/corrector.py +57 -0
- lyrics_transcriber/transcriber.py +234 -332
- {lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/METADATA +3 -3
- {lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/RECORD +8 -7
- {lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/WHEEL +0 -0
- {lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/entry_points.txt +0 -0
lyrics_transcriber/audioshake_transcriber.py

@@ -11,8 +11,9 @@ class AudioShakeTranscriber:
         self.logger = logger
         self.output_prefix = output_prefix
 
-    def transcribe(self, audio_filepath):
-        self.logger.info(f"Transcribing {audio_filepath} using AudioShake API")
+    def start_transcription(self, audio_filepath):
+        """Starts the transcription job and returns the job ID without waiting for completion"""
+        self.logger.info(f"Starting transcription for {audio_filepath} using AudioShake API")
 
         # Step 1: Upload the audio file
         asset_id = self._upload_file(audio_filepath)
@@ -22,6 +23,12 @@ class AudioShakeTranscriber:
         job_id = self._create_job(asset_id)
         self.logger.info(f"Job created successfully. Job ID: {job_id}")
 
+        return job_id
+
+    def get_transcription_result(self, job_id):
+        """Gets the results for a previously started job"""
+        self.logger.info(f"Getting results for job ID: {job_id}")
+
         # Step 3: Wait for the job to complete and get the results
         result = self._get_job_result(job_id)
         self.logger.info(f"Job completed. Processing results...")
@@ -29,6 +36,11 @@ class AudioShakeTranscriber:
         # Step 4: Process the result and return in the required format
         return self._process_result(result)
 
+    def transcribe(self, audio_filepath):
+        """Original method now just combines the two steps"""
+        job_id = self.start_transcription(audio_filepath)
+        return self.get_transcription_result(job_id)
+
     def _upload_file(self, filepath):
         self.logger.info(f"Uploading {filepath} to AudioShake")
         url = f"{self.base_url}/upload"
@@ -77,13 +89,10 @@ class AudioShakeTranscriber:
         output_assets = job_data.get("outputAssets", [])
         self.logger.debug(f"Output assets: {output_assets}")
 
-        output_asset = next((asset for asset in output_assets if asset["name"] == "transcription.json"), None)
-        if not output_asset:
-            self.logger.warning("'transcription.json' not found, looking for 'alignment.json'")
-            output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
+        output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
 
         if not output_asset:
-            self.logger.error("
+            self.logger.error("'alignment.json' not found in job results")
             self.logger.error(f"Available output assets: {[asset['name'] for asset in output_assets]}")
             raise Exception("Required output not found in job results")
 
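The split of transcribe() into start_transcription() and get_transcription_result() lets callers kick off the remote AudioShake job first and do other work (such as a local Whisper pass, which is exactly what transcriber.py now does) before blocking on the result. A minimal usage sketch, assuming a valid API token; the constructor arguments are the ones transcriber.py passes elsewhere in this diff, and the token and filenames are hypothetical:

    import logging

    from lyrics_transcriber.audioshake_transcriber import AudioShakeTranscriber

    audioshake = AudioShakeTranscriber(
        api_token="your-audioshake-token",  # hypothetical token value
        logger=logging.getLogger(__name__),
        output_prefix="Artist - Title",
    )

    # Kick off the remote job without waiting for it to complete
    job_id = audioshake.start_transcription("song.flac")

    # ... do other work here, e.g. run Whisper locally ...

    # Block until the job finishes and fetch the processed output
    result = audioshake.get_transcription_result(job_id)

    # The original one-shot call still works and simply combines the two steps
    result = audioshake.transcribe("song.flac")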
lyrics_transcriber/corrector.py (new file)

@@ -0,0 +1,57 @@
+import json
+import logging
+from openai import OpenAI
+from typing import Dict, Optional
+
+
+class LyricsTranscriptionCorrector:
+    def __init__(
+        self,
+        logger: Optional[logging.Logger] = None,
+    ):
+        self.logger = logger or logging.getLogger(__name__)
+
+        # Initialize instance variables for input data
+        self.spotify_lyrics_data_dict = None
+        self.spotify_lyrics_text = None
+        self.genius_lyrics_text = None
+        self.transcription_data_dict_whisper = None
+        self.transcription_data_dict_audioshake = None
+
+    def set_input_data(
+        self,
+        spotify_lyrics_data_dict: Optional[Dict] = None,
+        spotify_lyrics_text: Optional[str] = None,
+        genius_lyrics_text: Optional[str] = None,
+        transcription_data_dict_whisper: Optional[Dict] = None,
+        transcription_data_dict_audioshake: Optional[Dict] = None,
+    ) -> None:
+        """Store the input data as instance variables"""
+        self.spotify_lyrics_data_dict = spotify_lyrics_data_dict
+        self.spotify_lyrics_text = spotify_lyrics_text
+        self.genius_lyrics_text = genius_lyrics_text
+        self.transcription_data_dict_whisper = transcription_data_dict_whisper
+        self.transcription_data_dict_audioshake = transcription_data_dict_audioshake
+
+    def run_corrector(self) -> Dict:
+        """
+        Test implementation that replaces every third word with 'YOLO' in the AudioShake transcription.
+        """
+        self.logger.info("Running corrector (test implementation - replacing every 3rd word with YOLO)")
+
+        # Create a deep copy to avoid modifying the original
+        modified_data = json.loads(json.dumps(self.transcription_data_dict_audioshake))
+
+        # Process each segment
+        for segment in modified_data["segments"]:
+            # Replace every third word in the words list
+            for i in range(2, len(segment["words"]), 3):
+                segment["words"][i]["text"] = "YOLO"
+
+            # Reconstruct the segment text from the modified words
+            segment["text"] = " ".join(word["text"] for word in segment["words"])
+
+        # Reconstruct the full text from all segments
+        modified_data["text"] = "".join(segment["text"] for segment in modified_data["segments"])
+
+        return modified_data
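transcriber.py drives this class through correct_lyrics_transcription() (below), but it can also be exercised on its own. A short sketch using a hypothetical minimal transcription dict in the {"segments": [...], "text": ...} shape the corrector operates on; note that run_corrector() is explicitly a test implementation at this stage:

    import logging

    from lyrics_transcriber.corrector import LyricsTranscriptionCorrector

    corrector = LyricsTranscriptionCorrector(logger=logging.getLogger(__name__))

    # Hypothetical minimal AudioShake-style transcription data
    audioshake_data = {
        "text": "hello there world",
        "segments": [
            {
                "text": "hello there world",
                "words": [{"text": "hello"}, {"text": "there"}, {"text": "world"}],
            }
        ],
    }

    corrector.set_input_data(transcription_data_dict_audioshake=audioshake_data)
    corrected = corrector.run_corrector()
    print(corrected["segments"][0]["text"])  # -> "hello there YOLO"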
lyrics_transcriber/transcriber.py

@@ -13,9 +13,10 @@ import syrics.api
 from datetime import timedelta
 from .utils import subtitles
 from typing import List, Optional
-from openai import OpenAI
 from tenacity import retry, stop_after_delay, wait_exponential, retry_if_exception_type
 import requests
+from karaoke_lyrics_processor import KaraokeLyricsProcessor
+from .corrector import LyricsTranscriptionCorrector
 
 
 class LyricsTranscriber:
@@ -24,18 +25,15 @@ class LyricsTranscriber:
         audio_filepath,
         artist=None,
         title=None,
-        openai_api_key=None,
         audioshake_api_token=None,
         genius_api_token=None,
         spotify_cookie=None,
+        skip_transcription=False,
         output_dir=None,
         cache_dir="/tmp/lyrics-transcriber-cache/",
         log_level=logging.DEBUG,
         log_formatter=None,
         transcription_model="medium",
-        llm_model="gpt-4o",
-        llm_prompt_matching=None,
-        llm_prompt_correction=None,
         render_video=False,
         video_resolution="360p",
         video_background_image=None,
@@ -63,47 +61,11 @@ class LyricsTranscriber:
         self.title = title
         self.song_known = self.artist is not None and self.title is not None
 
-        self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
+        self.audioshake_api_token = os.getenv("AUDIOSHAKE_API_TOKEN", default=audioshake_api_token)
         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
-        self.audioshake_api_token = os.getenv("AUDIOSHAKE_TOKEN", default=audioshake_api_token)
 
         self.transcription_model = transcription_model
-        self.llm_model = llm_model
-
-        # Use package-relative paths for prompt files
-        if llm_prompt_matching is None:
-            llm_prompt_matching = os.path.join(
-                os.path.dirname(__file__), "llm_prompts", "llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt"
-            )
-        if llm_prompt_correction is None:
-            llm_prompt_correction = os.path.join(
-                os.path.dirname(__file__), "llm_prompts", "llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt"
-            )
-
-        self.llm_prompt_matching = llm_prompt_matching
-        self.llm_prompt_correction = llm_prompt_correction
-
-        if not os.path.exists(self.llm_prompt_matching):
-            raise FileNotFoundError(f"LLM prompt file not found: {self.llm_prompt_matching}")
-        if not os.path.exists(self.llm_prompt_correction):
-            raise FileNotFoundError(f"LLM prompt file not found: {self.llm_prompt_correction}")
-
-        self.openai_client = None
-
-        if self.openai_api_key:
-            self.openai_client = OpenAI(api_key=self.openai_api_key)
-
-            # Uncomment for local models e.g. with ollama
-            # self.openai_client = OpenAI(
-            #     base_url="http://localhost:11434/v1",
-            #     api_key="ollama",
-            # )
-
-            self.openai_client.log = self.log_level
-        else:
-            self.logger.error("No OpenAI API key found, no correction will be applied to transcription")
-
         self.render_video = render_video
         self.video_resolution = video_resolution
         self.video_background_image = video_background_image
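With the OpenAI client and prompt-file plumbing removed, the constructor surface is considerably smaller, and the AudioShake token now falls back to the AUDIOSHAKE_API_TOKEN environment variable (renamed from AUDIOSHAKE_TOKEN). A sketch of a 0.20.0-style instantiation, using only arguments that appear in this diff; artist, title and filename are hypothetical:

    import logging

    from lyrics_transcriber.transcriber import LyricsTranscriber

    transcriber = LyricsTranscriber(
        "song.flac",
        artist="ABBA",
        title="Under Attack",
        audioshake_api_token=None,  # None falls back to the AUDIOSHAKE_API_TOKEN env var
        genius_api_token=None,      # falls back to GENIUS_API_TOKEN
        spotify_cookie=None,        # falls back to SPOTIFY_COOKIE_SP_DC
        skip_transcription=False,   # new in 0.20.0
        log_level=logging.INFO,
    )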
@@ -137,19 +99,25 @@ class LyricsTranscriber:
             raise FileNotFoundError(f"video_background is not a valid file path: {self.video_background_image}")
 
         self.outputs = {
-            "transcription_data_dict": None,
-            "transcription_data_filepath": None,
-            "transcribed_lyrics_text": None,
-            "transcribed_lyrics_text_filepath": None,
+            "transcription_data_dict_whisper": None,
+            "transcription_data_whisper_filepath": None,
+            "transcribed_lyrics_text_whisper": None,
+            "transcribed_lyrics_text_whisper_filepath": None,
+            "transcription_data_dict_audioshake": None,
+            "transcription_data_audioshake_filepath": None,
+            "transcribed_lyrics_text_audioshake": None,
+            "transcribed_lyrics_text_audioshake_filepath": None,
+            "transcription_data_dict_primary": None,
+            "transcription_data_primary_filepath": None,
+            "transcribed_lyrics_text_primary": None,
+            "transcribed_lyrics_text_primary_filepath": None,
             "genius_lyrics_text": None,
-            "genius_lyrics_filepath": None,
+            "genius_lyrics_text_filepath": None,
+            "genius_lyrics_processed_filepath": None,
             "spotify_lyrics_data_dict": None,
             "spotify_lyrics_data_filepath": None,
             "spotify_lyrics_text_filepath": None,
-            "llm_token_usage": {"input": 0, "output": 0},
-            "llm_costs_usd": {"input": 0.0, "output": 0.0, "total": 0.0},
-            "llm_transcript": None,
-            "llm_transcript_filepath": None,
+            "spotify_lyrics_processed_filepath": None,
             "corrected_lyrics_text": None,
             "corrected_lyrics_text_filepath": None,
             "midico_lrc_filepath": None,
@@ -168,40 +136,47 @@ class LyricsTranscriber:
 
         self.output_prefix = f"{artist} - {title}"
 
+        self.skip_transcription = skip_transcription
+
     def generate(self):
-        self.logger.debug(f"
+        self.logger.debug(f"Starting generate() with cache_dir: {self.cache_dir} and output_dir: {self.output_dir}")
 
-        self.transcribe()
-        self.write_transcribed_lyrics_plain_text()
+        self.logger.debug(f"audio_filepath is set: {self.audio_filepath}, beginning initial whisper transcription")
 
-        self.write_genius_lyrics_file()
         self.write_spotify_lyrics_data_file()
         self.write_spotify_lyrics_plain_text()
+        if self.outputs["spotify_lyrics_text_filepath"]:
+            self.outputs["spotify_lyrics_processed_filepath"] = os.path.join(
+                self.cache_dir, self.get_output_filename(" (Lyrics Spotify Processed).txt")
+            )
+            self.write_processed_lyrics(self.outputs["spotify_lyrics_text_filepath"], self.outputs["spotify_lyrics_processed_filepath"])
+
+        self.write_genius_lyrics_file()
+        if self.outputs["genius_lyrics_text_filepath"]:
+            self.outputs["genius_lyrics_processed_filepath"] = os.path.join(
+                self.cache_dir, self.get_output_filename(" (Lyrics Genius Processed).txt")
+            )
+            self.write_processed_lyrics(self.outputs["genius_lyrics_text_filepath"], self.outputs["genius_lyrics_processed_filepath"])
 
-        self.validate_lyrics_match_song()
+        if not self.skip_transcription:
+            self.transcribe()
+            self.validate_lyrics_match_song()
 
-        if self.openai_client:
-            self.write_corrected_lyrics_data_file()
-            self.write_corrected_lyrics_plain_text()
-        else:
-            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
-            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
-            self.write_corrected_lyrics_plain_text()
+            self.correct_lyrics_transcription()
 
-        self.calculate_singing_percentage()
+            self.calculate_singing_percentage()
 
-        self.write_midico_lrc_file()
-        self.write_ass_file()
+            self.write_midico_lrc_file()
+            self.write_ass_file()
 
-        if self.render_video:
-            self.outputs["karaoke_video_filepath"] = self.get_cache_filepath(".mp4")
-            self.create_video()
+            if self.render_video:
+                self.outputs["karaoke_video_filepath"] = self.get_cache_filepath(".mp4")
+                self.create_video()
+        else:
+            self.outputs["corrected_lyrics_text_filepath"] = self.outputs["genius_lyrics_text_filepath"]
+            self.outputs["corrected_lyrics_text"] = self.outputs["genius_lyrics_text"]
 
         self.copy_files_to_output_dir()
-        self.calculate_llm_costs()
-
-        if self.openai_client:
-            self.openai_client.close()
 
         return self.outputs
 
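The new skip_transcription flag splits generate() into two modes: the full pipeline (transcribe, validate, correct, write LRC/ASS, optionally render video) and a lyrics-only run that fetches and processes the online lyrics but reuses the Genius text as the "corrected" output. A sketch of both, continuing the hypothetical instantiation above:

    # Full pipeline
    outputs = transcriber.generate()
    print(outputs["midico_lrc_filepath"])

    # Lyrics-only run: online lyrics are fetched and processed,
    # but Whisper/AudioShake, correction and subtitle outputs are skipped
    lyrics_only = LyricsTranscriber(
        "song.flac", artist="ABBA", title="Under Attack", skip_transcription=True
    ).generate()
    print(lyrics_only["corrected_lyrics_text"])  # falls back to the Genius lyrics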
@@ -210,20 +185,21 @@ class LyricsTranscriber:
             self.output_dir = os.getcwd()
 
         self.logger.debug(f"copying temporary files to output dir: {self.output_dir}")
-
-        for key in self.outputs:
+        self.logger.debug("Files to copy:")
+        for key, value in self.outputs.items():
             if key.endswith("_filepath"):
-                if self.outputs[key] and os.path.isfile(self.outputs[key]):
-                    shutil.copy(self.outputs[key], self.output_dir)
+                self.logger.debug(f"  {key}: {value}")
+                if value and os.path.isfile(value):
+                    self.logger.debug(f"  File exists, copying to {self.output_dir}")
+                    shutil.copy(value, self.output_dir)
+                else:
+                    self.logger.debug(f"  File doesn't exist or is None")
 
         self.outputs["output_dir"] = self.output_dir
 
     def validate_lyrics_match_song(self):
         at_least_one_online_lyrics_validated = False
 
-        with open(self.llm_prompt_matching, "r") as file:
-            llm_matching_instructions = file.read()
-
         for online_lyrics_source in ["genius", "spotify"]:
             self.logger.debug(f"validating transcribed lyrics match lyrics from {online_lyrics_source}")
 
@@ -233,52 +209,21 @@ class LyricsTranscriber:
             if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
                 continue
 
-            if self.openai_client:
-
-
-
+            self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
+            transcribed_words = set(self.outputs["transcribed_lyrics_text_primary"].split())
+            online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
+            common_words = transcribed_words & online_lyrics_words
+            match_percentage = len(common_words) / len(online_lyrics_words) * 100
 
-                response = self.openai_client.chat.completions.create(
-                    model=self.llm_model,
-
-                    messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
+            if match_percentage >= 50:
+                self.logger.info(
+                    f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
                 )
-
-                message = response.choices[0].message.content
-                finish_reason = response.choices[0].finish_reason
-
-                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
-                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
-
-                if finish_reason == "stop":
-                    if message == "Yes":
-                        self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
-                        at_least_one_online_lyrics_validated = True
-                    elif message == "No":
-                        self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
-                        self.outputs[online_lyrics_text_key] = None
-                        self.outputs[online_lyrics_filepath_key] = None
-                    else:
-                        self.logger.error(f"Unexpected response from LLM: {message}")
-                else:
-                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
+                at_least_one_online_lyrics_validated = True
             else:
-
-                self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
-                transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
-                online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
-                common_words = transcribed_words & online_lyrics_words
-                match_percentage = len(common_words) / len(online_lyrics_words) * 100
-
-                if match_percentage >= 50:
-                    self.logger.info(
-                        f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
-                    )
-                    at_least_one_online_lyrics_validated = True
-                else:
-                    self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
-                    self.outputs[online_lyrics_text_key] = None
-                    self.outputs[online_lyrics_filepath_key] = None
+                self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
+                self.outputs[online_lyrics_text_key] = None
+                self.outputs[online_lyrics_filepath_key] = None
 
         self.logger.info(
             f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
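Validation is now a pure set-overlap check: both texts are split on whitespace, deduplicated, and the intersection is measured against the online source's vocabulary with a 50% threshold. A worked example of the same arithmetic on toy data:

    transcribed = "you might think I know it all"
    online = "you may think I know it all tonight"

    transcribed_words = set(transcribed.split())             # 7 unique words
    online_lyrics_words = set(online.split())                # 8 unique words
    common_words = transcribed_words & online_lyrics_words   # 6 words overlap
    match_percentage = len(common_words) / len(online_lyrics_words) * 100
    print(f"{match_percentage:.2f}%")  # 75.00% -> passes the >= 50 check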
@@ -289,178 +234,37 @@ class LyricsTranscriber:
             f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
         )
 
-    def write_corrected_lyrics_data_file(self):
-        if not self.openai_client:
-            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
-            return
-
-        self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")
-
+    def correct_lyrics_transcription(self):
         corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).json"))
 
         if os.path.isfile(corrected_lyrics_data_json_cache_filepath):
-            self.logger.debug(
+            self.logger.info(
                 f"found existing file at corrected_lyrics_data_json_cache_filepath, reading: {corrected_lyrics_data_json_cache_filepath}"
             )
 
             with open(corrected_lyrics_data_json_cache_filepath, "r") as corrected_lyrics_data_json:
                 self.outputs["corrected_lyrics_data_filepath"] = corrected_lyrics_data_json_cache_filepath
-
-                corrected_lyrics_data_dict = json.load(corrected_lyrics_data_json)
-                self.outputs["corrected_lyrics_data_dict"] = corrected_lyrics_data_dict
+                self.outputs["corrected_lyrics_data_dict"] = json.load(corrected_lyrics_data_json)
                 return
 
-
-
-
-        self.
-
-
-
-        self.logger.debug(
-            f"no cached lyrics found at corrected_lyrics_data_json_cache_filepath: {corrected_lyrics_data_json_cache_filepath}, attempting to run correction using LLM"
+        lyrics_corrector = LyricsTranscriptionCorrector(logger=self.logger)
+        lyrics_corrector.set_input_data(
+            spotify_lyrics_data_dict=self.outputs["spotify_lyrics_data_dict"],
+            spotify_lyrics_text=self.outputs["spotify_lyrics_text"],
+            genius_lyrics_text=self.outputs["genius_lyrics_text"],
+            transcription_data_dict_whisper=self.outputs["transcription_data_dict_whisper"],
+            transcription_data_dict_audioshake=self.outputs["transcription_data_dict_audioshake"],
         )
+        self.outputs["corrected_lyrics_data_dict"] = lyrics_corrector.run_corrector()
 
-
-
-        with open(self.llm_prompt_correction, "r") as file:
-            system_prompt_template = file.read()
-
-        system_prompt = system_prompt_template.replace("{{reference_lyrics}}", reference_lyrics)
-
-        # TODO: Test if results are cleaner when using the vocal file from a background vocal audio separation model
-        # TODO: Record more info about the correction process (e.g before/after diffs for each segment) to a file for debugging
-        # TODO: Possibly add a step after segment-based correct to get the LLM to self-analyse the diff
-
-        self.outputs["llm_transcript"] = ""
-        self.outputs["llm_transcript_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (LLM Transcript).txt"))
-
-        total_segments = len(self.outputs["transcription_data_dict"]["segments"])
-        self.logger.info(f"Beginning correction using LLM, total segments: {total_segments}")
-
-        with open(self.outputs["llm_transcript_filepath"], "a", buffering=1, encoding="utf-8") as llm_transcript_file:
-            self.logger.debug(f"writing LLM chat instructions: {self.outputs['llm_transcript_filepath']}")
-
-            llm_transcript_header = f"--- SYSTEM instructions passed in for all segments ---:\n\n{system_prompt}\n"
-            self.outputs["llm_transcript"] += llm_transcript_header
-            llm_transcript_file.write(llm_transcript_header)
-
-            for segment in self.outputs["transcription_data_dict"]["segments"]:
-                # # Don't waste OpenAI dollars when testing!
-                # if segment["id"] > 10:
-                #     continue
-                # if segment["id"] < 20 or segment["id"] > 24:
-                #     continue
-
-                llm_transcript_segment = ""
-                segment_input = json.dumps(
-                    {
-                        "id": segment["id"],
-                        "start": segment["start"],
-                        "end": segment["end"],
-                        "confidence": segment["confidence"],
-                        "text": segment["text"],
-                        "words": segment["words"],
-                    }
-                )
-
-                previous_two_corrected_lines = ""
-                upcoming_two_uncorrected_lines = ""
-
-                for previous_segment in corrected_lyrics_dict["segments"]:
-                    if previous_segment["id"] in (segment["id"] - 2, segment["id"] - 1):
-                        previous_two_corrected_lines += previous_segment["text"].strip() + "\n"
-
-                for next_segment in self.outputs["transcription_data_dict"]["segments"]:
-                    if next_segment["id"] in (segment["id"] + 1, segment["id"] + 2):
-                        upcoming_two_uncorrected_lines += next_segment["text"].strip() + "\n"
-
-                llm_transcript_segment += f"--- Segment {segment['id']} / {total_segments} ---\n"
-                llm_transcript_segment += f"Previous two corrected lines:\n\n{previous_two_corrected_lines}\nUpcoming two uncorrected lines:\n\n{upcoming_two_uncorrected_lines}\nData input:\n\n{segment_input}\n"
-
-                # fmt: off
-                segment_prompt = system_prompt_template.replace(
-                    "{{previous_two_corrected_lines}}", previous_two_corrected_lines
-                ).replace(
-                    "{{upcoming_two_uncorrected_lines}}", upcoming_two_uncorrected_lines
-                ).replace(
-                    "{{segment_input}}", segment_input
-                )
-
-                self.logger.info(
-                    f'Calling completion model {self.llm_model} with instructions and data input for segment {segment["id"]} / {total_segments}:'
-                )
-
-                response = self.openai_client.chat.completions.create(
-                    model=self.llm_model,
-                    response_format={"type": "json_object"},
-                    seed=10,
-                    temperature=0.4,
-                    messages=[
-                        {
-                            "role": "user",
-                            "content": segment_prompt
-                        }
-                    ],
-                )
-                # fmt: on
-
-                message = response.choices[0].message.content
-                finish_reason = response.choices[0].finish_reason
-
-                llm_transcript_segment += f"\n--- RESPONSE for segment {segment['id']} ---:\n\n"
-                llm_transcript_segment += message
-                llm_transcript_segment += f"\n--- END segment {segment['id']} / {total_segments} ---:\n\n"
-
-                self.logger.debug(f"writing LLM chat transcript for segment to: {self.outputs['llm_transcript_filepath']}")
-                llm_transcript_file.write(llm_transcript_segment)
-                self.outputs["llm_transcript"] += llm_transcript_segment
-
-                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
-                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
-
-                # self.logger.debug(f"response finish_reason: {finish_reason} message: \n{message}")
-
-                if finish_reason == "stop":
-                    try:
-                        corrected_segment_dict = json.loads(message)
-                        corrected_lyrics_dict["segments"].append(corrected_segment_dict)
-                        self.logger.info("Successfully parsed response from GPT as JSON and appended to corrected_lyrics_dict.segments")
-                    except json.JSONDecodeError as e:
-                        raise Exception("Failed to parse response from GPT as JSON") from e
-                else:
-                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
-
-        self.logger.info(f'Successfully processed correction for all {len(corrected_lyrics_dict["segments"])} lyrics segments')
-
-        self.logger.debug(f"writing corrected lyrics data JSON filepath: {corrected_lyrics_data_json_cache_filepath}")
-        with open(corrected_lyrics_data_json_cache_filepath, "w", encoding="utf-8") as corrected_lyrics_data_json_cache_file:
-            corrected_lyrics_data_json_cache_file.write(json.dumps(corrected_lyrics_dict, indent=4))
+        # Save the corrected lyrics to output JSON file
+        self.logger.debug(f"writing corrected lyrics data JSON filepath: {corrected_lyrics_data_json_cache_filepath}")
+        with open(corrected_lyrics_data_json_cache_filepath, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.outputs["corrected_lyrics_data_dict"], indent=4))
 
         self.outputs["corrected_lyrics_data_filepath"] = corrected_lyrics_data_json_cache_filepath
-        self.outputs["corrected_lyrics_data_dict"] = corrected_lyrics_dict
-
-    def calculate_llm_costs(self):
-        price_dollars_per_1000_tokens = {
-            "gpt-3.5-turbo-1106": {
-                "input": 0.0010,
-                "output": 0.0020,
-            },
-            "gpt-4-1106-preview": {
-                "input": 0.01,
-                "output": 0.03,
-            },
-        }
 
-        input_price = price_dollars_per_1000_tokens.get(self.llm_model, {"input": 0, "output": 0})["input"]
-        output_price = price_dollars_per_1000_tokens.get(self.llm_model, {"input": 0, "output": 0})["output"]
-
-        input_cost = input_price * (self.outputs["llm_token_usage"]["input"] / 1000)
-        output_cost = output_price * (self.outputs["llm_token_usage"]["output"] / 1000)
-
-        self.outputs["llm_costs_usd"]["input"] = round(input_cost, 3)
-        self.outputs["llm_costs_usd"]["output"] = round(output_cost, 3)
-        self.outputs["llm_costs_usd"]["total"] = round(input_cost + output_cost, 3)
+        self.write_corrected_lyrics_plain_text()
 
     def write_corrected_lyrics_plain_text(self):
         if self.outputs["corrected_lyrics_data_dict"]:
@@ -569,30 +373,39 @@ class LyricsTranscriber:
 
         genius_lyrics_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Genius).txt"))
 
+        # Check cache first
         if os.path.isfile(genius_lyrics_cache_filepath):
            self.logger.debug(f"found existing file at genius_lyrics_cache_filepath, reading: {genius_lyrics_cache_filepath}")
 
             with open(genius_lyrics_cache_filepath, "r") as cached_lyrics:
-                self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
+                self.outputs["genius_lyrics_text_filepath"] = genius_lyrics_cache_filepath
                 self.outputs["genius_lyrics_text"] = cached_lyrics.read()
                 return
-
         self.logger.debug(f"no cached lyrics found at genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}, fetching from Genius")
-        genius = lyricsgenius.Genius(self.genius_api_token)
+
+        # Initialize Genius with better defaults
+        genius = lyricsgenius.Genius(
+            self.genius_api_token,
+            verbose=(self.log_level == logging.DEBUG),
+            remove_section_headers=True,
+        )
 
         try:
             song = self.fetch_genius_lyrics(genius, self.title, self.artist)
             if song is None:
                 self.logger.warning(f'Could not find lyrics on Genius for "{self.title}" by {self.artist}')
-                return
+                return None
+
             lyrics = self.clean_genius_lyrics(song.lyrics)
 
             self.logger.debug(f"writing clean lyrics to genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}")
             with open(genius_lyrics_cache_filepath, "w", encoding="utf-8") as f:
                 f.write(lyrics)
 
-            self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
+            self.outputs["genius_lyrics_text_filepath"] = genius_lyrics_cache_filepath
             self.outputs["genius_lyrics_text"] = lyrics
+            return lyrics.split("\n")  # Return lines like write_lyrics_from_genius
+
         except requests.exceptions.RequestException as e:
             self.logger.error(f"Failed to fetch lyrics from Genius after multiple retries: {e}")
             raise
@@ -600,8 +413,13 @@ class LyricsTranscriber:
     def clean_genius_lyrics(self, lyrics):
         lyrics = lyrics.replace("\\n", "\n")
         lyrics = re.sub(r"You might also like", "", lyrics)
-
-
+        lyrics = re.sub(
+            r".*?Lyrics([A-Z])", r"\1", lyrics
+        )  # Remove the song name and word "Lyrics" if this has a non-newline char at the start
+        lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)  # Remove this example: 27 ContributorsSex Bomb Lyrics
+        lyrics = re.sub(
+            r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics
+        )  # Remove this example: See Tom Jones LiveGet tickets as low as $71
         lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)  # Remove the word "Embed" at end of line with preceding numbers if found
         lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
         lyrics = re.sub(r"^Embed$", r"", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
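These substitutions strip Genius page furniture (contributor counts, ticket promos, trailing "Embed" markers) from the scraped lyrics. A quick illustration of the added patterns, reusing the examples from the code comments:

    import re

    raw = (
        "27 ContributorsSex Bomb Lyrics[Verse 1]\n"
        "Sex bomb, sex bomb, you're my sex bomb\n"
        "See Tom Jones LiveGet tickets as low as $71\n"
        "42Embed"
    )

    cleaned = re.sub(r"^[0-9]* Contributors.*Lyrics", "", raw)                   # header line
    cleaned = re.sub(r"See.*Live.*Get tickets as low as \$[0-9]+", "", cleaned)  # ticket promo
    cleaned = re.sub(r"[0-9]+Embed$", "", cleaned)                               # trailing marker
    print(cleaned)  # leaves the "[Verse 1]" line and the lyric line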
@@ -611,7 +429,9 @@ class LyricsTranscriber:
 
     def calculate_singing_percentage(self):
         # Calculate total seconds of singing using timings from whisper transcription results
-        total_singing_duration = sum(segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict"]["segments"])
+        total_singing_duration = sum(
+            segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict_primary"]["segments"]
+        )
 
         self.logger.debug(f"calculated total_singing_duration: {int(total_singing_duration)} seconds, now running ffprobe")
 
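The numerator here is simply the summed duration of the primary transcription's segments; ffprobe then supplies the denominator (total song length). A worked example with a hypothetical segments list:

    segments = [
        {"start": 12.0, "end": 18.5},
        {"start": 20.0, "end": 27.5},
        {"start": 31.0, "end": 36.0},
    ]
    total_singing_duration = sum(seg["end"] - seg["start"] for seg in segments)
    print(total_singing_duration)  # 6.5 + 7.5 + 5.0 = 19.0 seconds
    # against a 60-second song, that is roughly 31.7% singing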
@@ -641,9 +461,7 @@ class LyricsTranscriber:
     # then loops over each word and writes all words with MidiCo segment start/end formatting
     # and word-level timestamps to a MidiCo-compatible LRC file
     def write_midico_lrc_file(self):
-        self.outputs["midico_lrc_filepath"] = os.path.join(
-            self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc")  # Updated suffix
-        )
+        self.outputs["midico_lrc_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc"))
 
         lrc_filename = self.outputs["midico_lrc_filepath"]
         self.logger.debug(f"writing midico formatted word timestamps to LRC file: {lrc_filename}")
@@ -660,7 +478,7 @@ class LyricsTranscriber:
                 f.write(line)
 
     def create_screens(self):
-        self.logger.debug("create_screens beginning generation of screens from
+        self.logger.debug("create_screens beginning generation of screens from transcription results")
         screens: List[subtitles.LyricsScreen] = []
         screen: Optional[subtitles.LyricsScreen] = None
 
@@ -725,8 +543,8 @@ class LyricsTranscriber:
         ass_filepath = self.outputs["ass_subtitles_filepath"]
         self.logger.debug(f"writing ASS formatted subtitle file: {ass_filepath}")
 
-        screens = self.create_screens()
-        screens = subtitles.set_segment_end_times(screens, int(self.outputs["song_duration"]))
+        initial_screens = self.create_screens()
+        screens = subtitles.set_segment_end_times(initial_screens, int(self.outputs["song_duration"]))
         screens = subtitles.set_screen_start_times(screens)
         lyric_subtitles_ass = subtitles.create_styled_subtitles(screens, self.video_resolution_num, self.font_size)
         lyric_subtitles_ass.write(ass_filepath)
@@ -845,22 +663,29 @@ class LyricsTranscriber:
         return formatted_time
 
     def write_transcribed_lyrics_plain_text(self):
-        if self.outputs["transcription_data_dict"]:
-
-            self.logger.debug(f"
-
-
-
+        if self.outputs["transcription_data_dict_whisper"]:
+            transcribed_lyrics_text_whisper_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Whisper).txt"))
+            self.logger.debug(f"Setting Whisper text filepath to: {transcribed_lyrics_text_whisper_filepath}")
+            self.outputs["transcribed_lyrics_text_whisper_filepath"] = transcribed_lyrics_text_whisper_filepath
+            self.outputs["transcribed_lyrics_text_whisper"] = ""
+
+            self.logger.debug(f"Writing Whisper lyrics to: {transcribed_lyrics_text_whisper_filepath}")
+            with open(transcribed_lyrics_text_whisper_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_whisper"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_whisper"] += segment["text"] + "\n"
+                    f.write(segment["text"].strip() + "\n")
+            self.logger.debug(f"Finished writing Whisper lyrics, file exists: {os.path.exists(transcribed_lyrics_text_whisper_filepath)}")
 
-
+        if self.outputs["transcription_data_dict_audioshake"]:
+            transcribed_lyrics_text_audioshake_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics AudioShake).txt"))
+            self.outputs["transcribed_lyrics_text_audioshake_filepath"] = transcribed_lyrics_text_audioshake_filepath
+            self.outputs["transcribed_lyrics_text_audioshake"] = ""
 
-            self.logger.debug(f"
-            with open(transcribed_lyrics_text_filepath, "w", encoding="utf-8") as f:
-                for segment in self.outputs["transcription_data_dict"]["segments"]:
-                    self.outputs["transcribed_lyrics_text"] += segment["text"] + "\n"
+            self.logger.debug(f"Writing AudioShake lyrics to: {transcribed_lyrics_text_audioshake_filepath}")
+            with open(transcribed_lyrics_text_audioshake_filepath, "w", encoding="utf-8") as f:
+                for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]:
+                    self.outputs["transcribed_lyrics_text_audioshake"] += segment["text"] + "\n"
                     f.write(segment["text"].strip() + "\n")
-        else:
-            raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
 
     def find_best_split_point(self, text, max_length):
         self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
@@ -963,45 +788,122 @@ class LyricsTranscriber:
         return new_segments
 
     def transcribe(self):
-
-
-
-
-
-
-
-
-
+        # Check cache first
+        transcription_cache_filepath_whisper = self.get_cache_filepath(" (Lyrics Whisper).json")
+        transcription_cache_filepath_audioshake = self.get_cache_filepath(" (Lyrics AudioShake).json")
+
+        self.logger.debug(f"Cache directory: {self.cache_dir}")
+        self.logger.debug(f"Output directory: {self.output_dir}")
+
+        if os.path.isfile(transcription_cache_filepath_whisper):
+            self.logger.debug(f"Found existing Whisper transcription, reading: {transcription_cache_filepath_whisper}")
+            with open(transcription_cache_filepath_whisper, "r") as cache_file:
+                self.outputs["transcription_data_dict_whisper"] = json.load(cache_file)
+                self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
+                self.logger.debug(f"Loaded Whisper data and set filepath to: {self.outputs['transcription_data_whisper_filepath']}")
+
+        if os.path.isfile(transcription_cache_filepath_audioshake):
+            self.logger.debug(f"Found existing AudioShake transcription, reading: {transcription_cache_filepath_audioshake}")
+            with open(transcription_cache_filepath_audioshake, "r") as cache_file:
+                self.outputs["transcription_data_dict_audioshake"] = json.load(cache_file)
+                self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
+
+        # If we have both cached transcriptions, set primary and return early
+        if self.outputs["transcription_data_dict_whisper"] and self.outputs["transcription_data_dict_audioshake"]:
+            self.set_primary_transcription()
+            return
+        # If we have Whisper cached and AudioShake isn't available, set primary and return early
+        elif self.outputs["transcription_data_dict_whisper"] and not self.audioshake_api_token:
+            self.set_primary_transcription()
+            return
 
-
-
+        # Continue with transcription for any missing data...
+        audioshake_job_id = None
+        if self.audioshake_api_token and not self.outputs["transcription_data_dict_audioshake"]:
+            self.logger.debug(f"Starting AudioShake transcription")
             from .audioshake_transcriber import AudioShakeTranscriber
 
             audioshake = AudioShakeTranscriber(api_token=self.audioshake_api_token, logger=self.logger, output_prefix=self.output_prefix)
-
-
+            audioshake_job_id = audioshake.start_transcription(self.audio_filepath)
+
+        # Run Whisper transcription if needed while AudioShake processes
+        if not self.outputs["transcription_data_dict_whisper"]:
             self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
             audio = whisper.load_audio(self.audio_filepath)
             model = whisper.load_model(self.transcription_model, device="cpu")
-
-
-            # auditok is needed for voice activity detection, but it has OS package dependencies that are hard to install on some platforms
-            # transcription_data = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+            whisper_data = whisper.transcribe(model, audio, language="en", beam_size=5, temperature=0.2, best_of=5)
 
             # Remove segments with no words, only music
-            transcription_data["segments"] = [segment for segment in transcription_data["segments"] if segment["text"].strip() != "Music"]
-            self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(transcription_data['segments'])}")
+            whisper_data["segments"] = [segment for segment in whisper_data["segments"] if segment["text"].strip() != "Music"]
+            self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(whisper_data['segments'])}")
 
             # Split long segments
             self.logger.debug("Starting to split long segments")
-            transcription_data["segments"] = self.split_long_segments(transcription_data["segments"], max_length=36)
-            self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(transcription_data['segments'])}")
+            whisper_data["segments"] = self.split_long_segments(whisper_data["segments"], max_length=36)
+            self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(whisper_data['segments'])}")
+
+            # Store Whisper results
+            self.outputs["transcription_data_dict_whisper"] = whisper_data
+            self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
+            with open(transcription_cache_filepath_whisper, "w") as cache_file:
+                json.dump(whisper_data, cache_file, indent=4)
+
+        # Now that Whisper is done, get AudioShake results if available
+        if audioshake_job_id:
+            self.logger.debug("Getting AudioShake results")
+            audioshake_data = audioshake.get_transcription_result(audioshake_job_id)
+            self.outputs["transcription_data_dict_audioshake"] = audioshake_data
+            self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
+            with open(transcription_cache_filepath_audioshake, "w") as cache_file:
+                json.dump(audioshake_data, cache_file, indent=4)
+
+        # Set the primary transcription source
+        self.set_primary_transcription()
+
+        # Write the text files
+        self.write_transcribed_lyrics_plain_text()
 
-
-
-
+    def set_primary_transcription(self):
+        """Set the primary transcription source (AudioShake if available, otherwise Whisper)"""
+        if self.outputs["transcription_data_dict_audioshake"]:
+            self.logger.info("Using AudioShake as primary transcription source")
+            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_audioshake"]
+            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_audioshake_filepath"]
+
+            # Set the primary text content
+            if "transcribed_lyrics_text_audioshake" not in self.outputs or not self.outputs["transcribed_lyrics_text_audioshake"]:
+                self.outputs["transcribed_lyrics_text_audioshake"] = "\n".join(
+                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]
+                )
+            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_audioshake"]
+            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_audioshake_filepath"]
+        else:
+            self.logger.info("Using Whisper as primary transcription source")
+            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_whisper"]
+            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_whisper_filepath"]
+
+            # Set the primary text content
+            if "transcribed_lyrics_text_whisper" not in self.outputs or not self.outputs["transcribed_lyrics_text_whisper"]:
+                self.outputs["transcribed_lyrics_text_whisper"] = "\n".join(
+                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_whisper"]["segments"]
+                )
+            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_whisper"]
+            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_whisper_filepath"]
+
+    def write_processed_lyrics(self, lyrics_file, processed_lyrics_file):
+        self.logger.info(f"Processing lyrics from {lyrics_file} and writing to {processed_lyrics_file}")
+
+        processor = KaraokeLyricsProcessor(
+            log_level=self.log_level,
+            log_formatter=self.log_formatter,
+            input_filename=lyrics_file,
+            output_filename=processed_lyrics_file,
+            max_line_length=36,
+        )
+        processor.process()
+        processor.write_to_output_file()
 
-        self.
+        self.logger.info(f"Lyrics processing complete, processed lyrics written to: {processed_lyrics_file}")
 
     def get_cache_filepath(self, extension):
         # Instead of using slugify and hash, use the consistent naming pattern
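write_processed_lyrics() hands line-splitting off to the new karaoke-lyrics-processor dependency (>=0.4.1 in METADATA below), with the same 36-character limit used when splitting Whisper segments. A minimal sketch of calling the processor directly, using only the constructor arguments shown in this diff; the filenames are hypothetical:

    import logging

    from karaoke_lyrics_processor import KaraokeLyricsProcessor

    processor = KaraokeLyricsProcessor(
        log_level=logging.DEBUG,
        log_formatter=None,
        input_filename="Artist - Title (Lyrics Genius).txt",
        output_filename="Artist - Title (Lyrics Genius Processed).txt",
        max_line_length=36,
    )
    processor.process()
    processor.write_to_output_file()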
{lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/METADATA

@@ -1,21 +1,21 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.19.0
+Version: 0.20.0
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
 Author: Andrew Beveridge
 Author-email: andrew@beveridge.uk
-Requires-Python: >=3.9
+Requires-Python: >=3.9,<3.13
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: Cython (>=0)
 Requires-Dist: dtw-python (>=1)
+Requires-Dist: karaoke-lyrics-processor (>=0.4.1)
 Requires-Dist: llvmlite (>=0)
 Requires-Dist: lyricsgenius (>=3)
 Requires-Dist: numba (>=0.57)
{lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/RECORD

@@ -1,18 +1,19 @@
 lyrics_transcriber/__init__.py,sha256=bIRjsXAzlghS1rQxWNLU0wppZy0T_iciN9EclHLwNrQ,94
-lyrics_transcriber/audioshake_transcriber.py,sha256=
+lyrics_transcriber/audioshake_transcriber.py,sha256=AbIkghvguI1PV0fCMUHGRnidQwLPM_pQ96FI0Qk-aI0,5221
+lyrics_transcriber/corrector.py,sha256=LVicUYBCz2TpzzPUbzgLfNYebYJLj7yVvbERMHuXzTY,2300
 lyrics_transcriber/llm_prompts/README.md,sha256=DPAGRDVGt9ZNcQAAoQGFhwesLY3D6hD8apL71yHP4yo,196
 lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt,sha256=a3XjAYfyhWt1uCKKqm_n2Pc0STdmBdiHHtJ7ODP99Nk,4046
 lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt,sha256=r6HN3DD_3gwh3B_JPd2R0I4lDXuB5iy7B90J9agOxbQ,2369
 lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt,sha256=hvk2Vs3M3Q4zGQsiQnXvnpd8wXWfwsudYeqN5qFyNWs,1754
 lyrics_transcriber/llm_prompts/promptfooconfig.yaml,sha256=O4YxlLV7XSUiSw_1Q9G7ELC2VAbrYUV_N5QxrPbd1jE,3735
 lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt,sha256=8d-RvZtyINKUlpQLwMi-VD--Y59J-epPt7SZSqjFbPI,1690
-lyrics_transcriber/transcriber.py,sha256=
+lyrics_transcriber/transcriber.py,sha256=SrZLY4zEqSd--jgXqRUtgX6oyhM8POpL91AMas_Dpzw,47897
 lyrics_transcriber/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 lyrics_transcriber/utils/ass.py,sha256=b8lnjgXGD1OD1ld_b1xxUmSOf4nSEfz9BpgSkh16R4g,90291
 lyrics_transcriber/utils/cli.py,sha256=8Poba_9wQw0VmOK73vuK-w-abR9QmO4y4FYDHiAQbc0,6972
 lyrics_transcriber/utils/subtitles.py,sha256=_WG0pFoZMXcrGe6gbARkC9KrWzFNTMOsiqQwNL-H2lU,11812
-lyrics_transcriber-0.19.0.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
-lyrics_transcriber-0.19.0.dist-info/METADATA,sha256=
-lyrics_transcriber-0.19.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lyrics_transcriber-0.19.0.dist-info/entry_points.txt,sha256=lh6L-iR5CGELaNcouDK94X78eS5Ua_tK9lI4UEkza-k,72
-lyrics_transcriber-0.19.0.dist-info/RECORD,,
+lyrics_transcriber-0.20.0.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
+lyrics_transcriber-0.20.0.dist-info/METADATA,sha256=1mOcGn2Hb5Nw3nKH0Cc41Zv7_gp4a-H4DLDnktEeRNs,5830
+lyrics_transcriber-0.20.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lyrics_transcriber-0.20.0.dist-info/entry_points.txt,sha256=lh6L-iR5CGELaNcouDK94X78eS5Ua_tK9lI4UEkza-k,72
+lyrics_transcriber-0.20.0.dist-info/RECORD,,
{lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/LICENSE
RENAMED
File without changes

{lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/WHEEL
RENAMED
File without changes

{lyrics_transcriber-0.19.0.dist-info → lyrics_transcriber-0.20.0.dist-info}/entry_points.txt
RENAMED
File without changes