lyrics-transcriber 0.19.2__py3-none-any.whl → 0.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/__init__.py +2 -5
- lyrics_transcriber/cli/main.py +194 -0
- lyrics_transcriber/core/__init__.py +0 -0
- lyrics_transcriber/core/controller.py +283 -0
- lyrics_transcriber/core/corrector.py +56 -0
- lyrics_transcriber/core/fetcher.py +143 -0
- lyrics_transcriber/output/__init__.py +0 -0
- lyrics_transcriber/output/generator.py +210 -0
- lyrics_transcriber/storage/__init__.py +0 -0
- lyrics_transcriber/storage/dropbox.py +249 -0
- lyrics_transcriber/storage/tokens.py +116 -0
- lyrics_transcriber/{audioshake_transcriber.py → transcribers/audioshake.py} +44 -15
- lyrics_transcriber/transcribers/base.py +31 -0
- lyrics_transcriber/transcribers/whisper.py +186 -0
- {lyrics_transcriber-0.19.2.dist-info → lyrics_transcriber-0.30.0.dist-info}/METADATA +6 -17
- lyrics_transcriber-0.30.0.dist-info/RECORD +22 -0
- lyrics_transcriber-0.30.0.dist-info/entry_points.txt +3 -0
- lyrics_transcriber/llm_prompts/README.md +0 -10
- lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -55
- lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -36
- lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -19
- lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -61
- lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -48
- lyrics_transcriber/transcriber.py +0 -1128
- lyrics_transcriber/utils/cli.py +0 -179
- lyrics_transcriber-0.19.2.dist-info/RECORD +0 -18
- lyrics_transcriber-0.19.2.dist-info/entry_points.txt +0 -3
- /lyrics_transcriber/{utils → cli}/__init__.py +0 -0
- /lyrics_transcriber/{utils → output}/ass.py +0 -0
- /lyrics_transcriber/{utils → output}/subtitles.py +0 -0
- {lyrics_transcriber-0.19.2.dist-info → lyrics_transcriber-0.30.0.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.19.2.dist-info → lyrics_transcriber-0.30.0.dist-info}/WHEEL +0 -0
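The listing shows 0.30.0 splitting the monolithic transcriber.py into cli, core, output, storage and transcribers subpackages, with the AudioShake and Whisper transcribers now sitting alongside a shared transcribers/base.py. As a rough sketch of what such a base interface could look like (hypothetical: the diff shows only the new module paths and line counts, not their contents):

    # Hypothetical illustration only -- class and method names are assumed, not taken from the diff.
    from abc import ABC, abstractmethod

    class BaseTranscriber(ABC):
        """Shared contract that transcribers/audioshake.py and transcribers/whisper.py would implement."""

        @abstractmethod
        def transcribe(self, audio_filepath: str) -> dict:
            """Return transcription data: segments with text and word-level timestamps."""

The full text of the removed 0.19.2 module follows.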
@@ -1,1128 +0,0 @@ lyrics_transcriber/transcriber.py (entire file removed)
import os
import sys
import re
import json
import logging
import shutil
import hashlib
import subprocess
import slugify
import whisper_timestamped as whisper
import lyricsgenius
import syrics.api
from datetime import timedelta
from .utils import subtitles
from typing import List, Optional
from openai import OpenAI
from tenacity import retry, stop_after_delay, wait_exponential, retry_if_exception_type
import requests


class LyricsTranscriber:
    def __init__(
        self,
        audio_filepath,
        artist=None,
        title=None,
        openai_api_key=None,
        audioshake_api_token=None,
        genius_api_token=None,
        spotify_cookie=None,
        output_dir=None,
        cache_dir="/tmp/lyrics-transcriber-cache/",
        log_level=logging.DEBUG,
        log_formatter=None,
        transcription_model="medium",
        llm_model="gpt-4o",
        llm_prompt_matching=None,
        llm_prompt_correction=None,
        render_video=False,
        video_resolution="360p",
        video_background_image=None,
        video_background_color="black",
    ):
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_level)
        self.log_level = log_level
        self.log_formatter = log_formatter

        self.log_handler = logging.StreamHandler()

        if self.log_formatter is None:
            self.log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(module)s - %(message)s")

        self.log_handler.setFormatter(self.log_formatter)
        self.logger.addHandler(self.log_handler)

        self.logger.debug(f"LyricsTranscriber instantiating with input file: {audio_filepath}")

        self.cache_dir = cache_dir
        self.output_dir = output_dir
        self.audio_filepath = audio_filepath
        self.artist = artist
        self.title = title
        self.song_known = self.artist is not None and self.title is not None

        self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
        self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
        self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
        self.audioshake_api_token = os.getenv("AUDIOSHAKE_API_TOKEN", default=audioshake_api_token)

        self.transcription_model = transcription_model
        self.llm_model = llm_model

        # Use package-relative paths for prompt files
        if llm_prompt_matching is None:
            llm_prompt_matching = os.path.join(
                os.path.dirname(__file__), "llm_prompts", "llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt"
            )
        if llm_prompt_correction is None:
            llm_prompt_correction = os.path.join(
                os.path.dirname(__file__), "llm_prompts", "llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt"
            )

        self.llm_prompt_matching = llm_prompt_matching
        self.llm_prompt_correction = llm_prompt_correction

        if not os.path.exists(self.llm_prompt_matching):
            raise FileNotFoundError(f"LLM prompt file not found: {self.llm_prompt_matching}")
        if not os.path.exists(self.llm_prompt_correction):
            raise FileNotFoundError(f"LLM prompt file not found: {self.llm_prompt_correction}")

        self.openai_client = None

        if self.openai_api_key:
            self.openai_client = OpenAI(api_key=self.openai_api_key)

            # Uncomment for local models e.g. with ollama
            # self.openai_client = OpenAI(
            #     base_url="http://localhost:11434/v1",
            #     api_key="ollama",
            # )

            self.openai_client.log = self.log_level
        else:
            self.logger.warning("No OpenAI API key found, no correction will be applied to transcription")

        self.render_video = render_video
        self.video_resolution = video_resolution
        self.video_background_image = video_background_image
        self.video_background_color = video_background_color

        match video_resolution:
            case "4k":
                self.video_resolution_num = (3840, 2160)
                self.font_size = 250
                self.line_height = 250
            case "1080p":
                self.video_resolution_num = (1920, 1080)
                self.font_size = 120
                self.line_height = 120
            case "720p":
                self.video_resolution_num = (1280, 720)
                self.font_size = 100
                self.line_height = 100
            case "360p":
                self.video_resolution_num = (640, 360)
                self.font_size = 50
                self.line_height = 50
            case _:
                raise ValueError("Invalid video_resolution value. Must be one of: 4k, 1080p, 720p, 360p")

        # If a video background is provided, validate file exists
        if self.video_background_image is not None:
            if os.path.isfile(self.video_background_image):
                self.logger.debug(f"video_background is valid file path: {self.video_background_image}")
            else:
                raise FileNotFoundError(f"video_background is not a valid file path: {self.video_background_image}")

        self.outputs = {
            "transcription_data_dict_whisper": None,
            "transcription_data_whisper_filepath": None,
            "transcribed_lyrics_text_whisper": None,
            "transcribed_lyrics_text_whisper_filepath": None,
            "transcription_data_dict_audioshake": None,
            "transcription_data_audioshake_filepath": None,
            "transcribed_lyrics_text_audioshake": None,
            "transcribed_lyrics_text_audioshake_filepath": None,
            "transcription_data_dict_primary": None,
            "transcription_data_primary_filepath": None,
            "transcribed_lyrics_text_primary": None,
            "transcribed_lyrics_text_primary_filepath": None,
            "genius_lyrics_text": None,
            "genius_lyrics_filepath": None,
            "spotify_lyrics_data_dict": None,
            "spotify_lyrics_data_filepath": None,
            "spotify_lyrics_text_filepath": None,
            "llm_token_usage": {"input": 0, "output": 0},
            "llm_costs_usd": {"input": 0.0, "output": 0.0, "total": 0.0},
            "llm_transcript": None,
            "llm_transcript_filepath": None,
            "corrected_lyrics_text": None,
            "corrected_lyrics_text_filepath": None,
            "midico_lrc_filepath": None,
            "ass_subtitles_filepath": None,
            "karaoke_video_filepath": None,
            "singing_percentage": None,
            "total_singing_duration": None,
            "song_duration": None,
            "output_dir": None,
        }

        if self.audio_filepath is None:
            raise Exception("audio_filepath must be specified as the input source to transcribe")

        self.create_folders()

        self.output_prefix = f"{artist} - {title}"

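    # Top-level pipeline: transcribe, fetch and validate online lyrics, run the optional
    # LLM correction pass, then produce LRC/ASS (and optionally video) outputs.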
    def generate(self):
        self.logger.debug(f"Starting generate() with cache_dir: {self.cache_dir} and output_dir: {self.output_dir}")

        self.logger.debug(f"audio_filepath is set: {self.audio_filepath}, beginning initial whisper transcription")

        self.transcribe()

        self.write_transcribed_lyrics_plain_text()

        self.write_genius_lyrics_file()
        self.write_spotify_lyrics_data_file()
        self.write_spotify_lyrics_plain_text()

        self.validate_lyrics_match_song()

        if self.openai_client:
            self.write_corrected_lyrics_data_file()
            self.write_corrected_lyrics_plain_text()
        else:
            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
            self.write_corrected_lyrics_plain_text()

        self.calculate_singing_percentage()

        self.write_midico_lrc_file()
        self.write_ass_file()

        if self.render_video:
            self.outputs["karaoke_video_filepath"] = self.get_cache_filepath(".mp4")
            self.create_video()

        self.copy_files_to_output_dir()
        self.calculate_llm_costs()

        if self.openai_client:
            self.openai_client.close()

        return self.outputs

    def copy_files_to_output_dir(self):
        if self.output_dir is None:
            self.output_dir = os.getcwd()

        self.logger.debug(f"copying temporary files to output dir: {self.output_dir}")
        self.logger.debug("Files to copy:")
        for key, value in self.outputs.items():
            if key.endswith("_filepath"):
                self.logger.debug(f"  {key}: {value}")
                if value and os.path.isfile(value):
                    self.logger.debug(f"  File exists, copying to {self.output_dir}")
                    shutil.copy(value, self.output_dir)
                else:
                    self.logger.debug(f"  File doesn't exist or is None")

        self.outputs["output_dir"] = self.output_dir

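    # Compares the primary transcription against each fetched lyrics source, either by
    # asking the LLM for a Yes/No verdict or by a simple common-word percentage fallback.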
    def validate_lyrics_match_song(self):
        at_least_one_online_lyrics_validated = False

        with open(self.llm_prompt_matching, "r") as file:
            llm_matching_instructions = file.read()

        for online_lyrics_source in ["genius", "spotify"]:
            self.logger.debug(f"validating transcribed lyrics match lyrics from {online_lyrics_source}")

            online_lyrics_text_key = f"{online_lyrics_source}_lyrics_text"
            online_lyrics_filepath_key = f"{online_lyrics_source}_lyrics_filepath"

            if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
                continue

            if self.openai_client:
                data_input_str = f'Data input 1:\n{self.outputs["transcribed_lyrics_text_primary"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'

                self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
                response = self.openai_client.chat.completions.create(
                    model=self.llm_model,
                    messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
                )

                message = response.choices[0].message.content
                finish_reason = response.choices[0].finish_reason

                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens

                if finish_reason == "stop":
                    if message == "Yes":
                        self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
                        at_least_one_online_lyrics_validated = True
                    elif message == "No":
                        self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
                        self.outputs[online_lyrics_text_key] = None
                        self.outputs[online_lyrics_filepath_key] = None
                    else:
                        self.logger.error(f"Unexpected response from LLM: {message}")
                else:
                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
            else:
                # Fallback primitive word matching
                self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
                transcribed_words = set(self.outputs["transcribed_lyrics_text_primary"].split())
                online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
                common_words = transcribed_words & online_lyrics_words
                match_percentage = len(common_words) / len(online_lyrics_words) * 100

                if match_percentage >= 50:
                    self.logger.info(
                        f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
                    )
                    at_least_one_online_lyrics_validated = True
                else:
                    self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
                    self.outputs[online_lyrics_text_key] = None
                    self.outputs[online_lyrics_filepath_key] = None

        self.logger.info(
            f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
        )

        if not at_least_one_online_lyrics_validated:
            self.logger.error(
                f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
            )

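    # Runs the segment-by-segment LLM correction pass, caching the corrected JSON
    # and logging every prompt/response pair to a transcript file.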
    def write_corrected_lyrics_data_file(self):
        if not self.openai_client:
            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
            return

        self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")

        corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).json"))

        if os.path.isfile(corrected_lyrics_data_json_cache_filepath):
            self.logger.debug(
                f"found existing file at corrected_lyrics_data_json_cache_filepath, reading: {corrected_lyrics_data_json_cache_filepath}"
            )

            with open(corrected_lyrics_data_json_cache_filepath, "r") as corrected_lyrics_data_json:
                self.outputs["corrected_lyrics_data_filepath"] = corrected_lyrics_data_json_cache_filepath

                corrected_lyrics_data_dict = json.load(corrected_lyrics_data_json)
                self.outputs["corrected_lyrics_data_dict"] = corrected_lyrics_data_dict
                return

        reference_lyrics = self.outputs.get("genius_lyrics_text") or self.outputs.get("spotify_lyrics_text")

        if not reference_lyrics:
            self.logger.warning("No reference lyrics found from Genius or Spotify. Skipping LLM correction.")
            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
            return

        self.logger.debug(
            f"no cached lyrics found at corrected_lyrics_data_json_cache_filepath: {corrected_lyrics_data_json_cache_filepath}, attempting to run correction using LLM"
        )

        corrected_lyrics_dict = {"segments": []}

        with open(self.llm_prompt_correction, "r") as file:
            system_prompt_template = file.read()

        system_prompt = system_prompt_template.replace("{{reference_lyrics}}", reference_lyrics)

        # TODO: Test if results are cleaner when using the vocal file from a background vocal audio separation model
        # TODO: Record more info about the correction process (e.g before/after diffs for each segment) to a file for debugging
        # TODO: Possibly add a step after segment-based correct to get the LLM to self-analyse the diff

        self.outputs["llm_transcript"] = ""
        self.outputs["llm_transcript_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (LLM Transcript).txt"))

        total_segments = len(self.outputs["transcription_data_dict_primary"]["segments"])
        self.logger.info(f"Beginning correction using LLM, total segments: {total_segments}")

        with open(self.outputs["llm_transcript_filepath"], "a", buffering=1, encoding="utf-8") as llm_transcript_file:
            self.logger.debug(f"writing LLM chat instructions: {self.outputs['llm_transcript_filepath']}")

            llm_transcript_header = f"--- SYSTEM instructions passed in for all segments ---:\n\n{system_prompt}\n"
            self.outputs["llm_transcript"] += llm_transcript_header
            llm_transcript_file.write(llm_transcript_header)

            for segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                # # Don't waste OpenAI dollars when testing!
                # if segment["id"] > 10:
                #     continue
                # if segment["id"] < 20 or segment["id"] > 24:
                #     continue

                llm_transcript_segment = ""
                segment_input = json.dumps(
                    {
                        "id": segment["id"],
                        "start": segment["start"],
                        "end": segment["end"],
                        "confidence": segment["confidence"],
                        "text": segment["text"],
                        "words": segment["words"],
                    }
                )

                previous_two_corrected_lines = ""
                upcoming_two_uncorrected_lines = ""

                for previous_segment in corrected_lyrics_dict["segments"]:
                    if previous_segment["id"] in (segment["id"] - 2, segment["id"] - 1):
                        previous_two_corrected_lines += previous_segment["text"].strip() + "\n"

                for next_segment in self.outputs["transcription_data_dict_primary"]["segments"]:
                    if next_segment["id"] in (segment["id"] + 1, segment["id"] + 2):
                        upcoming_two_uncorrected_lines += next_segment["text"].strip() + "\n"

                llm_transcript_segment += f"--- Segment {segment['id']} / {total_segments} ---\n"
                llm_transcript_segment += f"Previous two corrected lines:\n\n{previous_two_corrected_lines}\nUpcoming two uncorrected lines:\n\n{upcoming_two_uncorrected_lines}\nData input:\n\n{segment_input}\n"

                # fmt: off
                segment_prompt = system_prompt_template.replace(
                    "{{previous_two_corrected_lines}}", previous_two_corrected_lines
                ).replace(
                    "{{upcoming_two_uncorrected_lines}}", upcoming_two_uncorrected_lines
                ).replace(
                    "{{segment_input}}", segment_input
                )

                self.logger.info(
                    f'Calling completion model {self.llm_model} with instructions and data input for segment {segment["id"]} / {total_segments}:'
                )

                response = self.openai_client.chat.completions.create(
                    model=self.llm_model,
                    response_format={"type": "json_object"},
                    seed=10,
                    temperature=0.4,
                    messages=[
                        {
                            "role": "user",
                            "content": segment_prompt
                        }
                    ],
                )
                # fmt: on

                message = response.choices[0].message.content
                finish_reason = response.choices[0].finish_reason

                llm_transcript_segment += f"\n--- RESPONSE for segment {segment['id']} ---:\n\n"
                llm_transcript_segment += message
                llm_transcript_segment += f"\n--- END segment {segment['id']} / {total_segments} ---:\n\n"

                self.logger.debug(f"writing LLM chat transcript for segment to: {self.outputs['llm_transcript_filepath']}")
                llm_transcript_file.write(llm_transcript_segment)
                self.outputs["llm_transcript"] += llm_transcript_segment

                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens

                # self.logger.debug(f"response finish_reason: {finish_reason} message: \n{message}")

                if finish_reason == "stop":
                    try:
                        corrected_segment_dict = json.loads(message)
                        corrected_lyrics_dict["segments"].append(corrected_segment_dict)
                        self.logger.info("Successfully parsed response from GPT as JSON and appended to corrected_lyrics_dict.segments")
                    except json.JSONDecodeError as e:
                        raise Exception("Failed to parse response from GPT as JSON") from e
                else:
                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")

        self.logger.info(f'Successfully processed correction for all {len(corrected_lyrics_dict["segments"])} lyrics segments')

        self.logger.debug(f"writing corrected lyrics data JSON filepath: {corrected_lyrics_data_json_cache_filepath}")
        with open(corrected_lyrics_data_json_cache_filepath, "w", encoding="utf-8") as corrected_lyrics_data_json_cache_file:
            corrected_lyrics_data_json_cache_file.write(json.dumps(corrected_lyrics_dict, indent=4))

        self.outputs["corrected_lyrics_data_filepath"] = corrected_lyrics_data_json_cache_filepath
        self.outputs["corrected_lyrics_data_dict"] = corrected_lyrics_dict

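    # Converts accumulated token counts into USD using a per-model price table;
    # models missing from the table (including the default "gpt-4o") are costed at zero.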
    def calculate_llm_costs(self):
        price_dollars_per_1000_tokens = {
            "gpt-3.5-turbo-1106": {
                "input": 0.0010,
                "output": 0.0020,
            },
            "gpt-4-1106-preview": {
                "input": 0.01,
                "output": 0.03,
            },
        }

        input_price = price_dollars_per_1000_tokens.get(self.llm_model, {"input": 0, "output": 0})["input"]
        output_price = price_dollars_per_1000_tokens.get(self.llm_model, {"input": 0, "output": 0})["output"]

        input_cost = input_price * (self.outputs["llm_token_usage"]["input"] / 1000)
        output_cost = output_price * (self.outputs["llm_token_usage"]["output"] / 1000)

        self.outputs["llm_costs_usd"]["input"] = round(input_cost, 3)
        self.outputs["llm_costs_usd"]["output"] = round(output_cost, 3)
        self.outputs["llm_costs_usd"]["total"] = round(input_cost + output_cost, 3)

    def write_corrected_lyrics_plain_text(self):
        if self.outputs["corrected_lyrics_data_dict"]:
            self.logger.debug(f"corrected_lyrics_data_dict exists, writing plain text lyrics file")

            corrected_lyrics_text_filepath = os.path.join(
                self.cache_dir, self.get_output_filename(" (Lyrics Corrected).txt")  # Updated to use consistent naming
            )
            self.outputs["corrected_lyrics_text_filepath"] = corrected_lyrics_text_filepath

            self.outputs["corrected_lyrics_text"] = ""

            self.logger.debug(f"writing lyrics plain text to corrected_lyrics_text_filepath: {corrected_lyrics_text_filepath}")
            with open(corrected_lyrics_text_filepath, "w", encoding="utf-8") as f:
                for corrected_segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
                    self.outputs["corrected_lyrics_text"] += corrected_segment["text"].strip() + "\n"
                    f.write(corrected_segment["text"].strip() + "\n")

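    # Fetches synced lyrics for the top Spotify search result via syrics, with JSON caching.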
    def write_spotify_lyrics_data_file(self):
        if self.spotify_cookie and self.song_known:
            self.logger.debug(f"attempting spotify fetch as spotify_cookie and song name was set")
        else:
            self.logger.warning(f"skipping spotify fetch as not all spotify params were set")
            return

        spotify_lyrics_data_json_cache_filepath = os.path.join(
            self.cache_dir, self.get_output_filename(" (Lyrics Spotify).json")  # Updated to use consistent naming
        )

        if os.path.isfile(spotify_lyrics_data_json_cache_filepath):
            self.logger.debug(
                f"found existing file at spotify_lyrics_data_json_cache_filepath, reading: {spotify_lyrics_data_json_cache_filepath}"
            )

            with open(spotify_lyrics_data_json_cache_filepath, "r") as spotify_lyrics_data_json:
                spotify_lyrics_data_dict = json.load(spotify_lyrics_data_json)
                self.outputs["spotify_lyrics_data_filepath"] = spotify_lyrics_data_json_cache_filepath
                self.outputs["spotify_lyrics_data_dict"] = spotify_lyrics_data_dict
                return

        self.logger.debug(
            f"no cached lyrics found at spotify_lyrics_data_json_cache_filepath: {spotify_lyrics_data_json_cache_filepath}, attempting to fetch from spotify"
        )

        spotify_lyrics_json = None
        spotify_lyrics_dict = None

        try:
            spotify_client = syrics.api.Spotify(self.spotify_cookie)
            spotify_search_query = f"{self.title} - {self.artist}"
            spotify_search_results = spotify_client.search(spotify_search_query, type="track", limit=5)

            spotify_top_result = spotify_search_results["tracks"]["items"][0]
            self.logger.debug(
                f"spotify_top_result: {spotify_top_result['artists'][0]['name']} - {spotify_top_result['name']} ({spotify_top_result['external_urls']['spotify']})"
            )

            spotify_lyrics_dict = spotify_client.get_lyrics(spotify_top_result["id"])
            spotify_lyrics_json = json.dumps(spotify_lyrics_dict, indent=4)

            self.logger.debug(
                f"writing lyrics data JSON to spotify_lyrics_data_json_cache_filepath: {spotify_lyrics_data_json_cache_filepath}"
            )
            with open(spotify_lyrics_data_json_cache_filepath, "w", encoding="utf-8") as f:
                f.write(spotify_lyrics_json)
        except Exception as e:
            self.logger.warning(f"caught exception while attempting to fetch from spotify: {e}")

        self.outputs["spotify_lyrics_data_filepath"] = spotify_lyrics_data_json_cache_filepath
        self.outputs["spotify_lyrics_data_dict"] = spotify_lyrics_dict

    def write_spotify_lyrics_plain_text(self):
        if self.outputs["spotify_lyrics_data_dict"]:
            self.logger.debug(f"spotify_lyrics data found, checking/writing plain text lyrics file")

            spotify_lyrics_text_filepath = os.path.join(
                self.cache_dir, self.get_output_filename(" (Lyrics Spotify).txt")  # Updated to use consistent naming
            )
            self.outputs["spotify_lyrics_text_filepath"] = spotify_lyrics_text_filepath

            lines = self.outputs["spotify_lyrics_data_dict"]["lyrics"]["lines"]

            self.outputs["spotify_lyrics_text"] = ""

            self.logger.debug(f"writing lyrics plain text to spotify_lyrics_text_filepath: {spotify_lyrics_text_filepath}")
            with open(spotify_lyrics_text_filepath, "w", encoding="utf-8") as f:
                for line in lines:
                    self.outputs["spotify_lyrics_text"] += line["words"].strip() + "\n"
                    f.write(line["words"].strip() + "\n")

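    # Genius lookups are retried with exponential backoff for up to two minutes.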
    @retry(
        stop=stop_after_delay(120),  # Stop after 2 minutes
        wait=wait_exponential(multiplier=1, min=4, max=60),  # Exponential backoff starting at 4 seconds
        retry=retry_if_exception_type(requests.exceptions.RequestException),  # Retry on request exceptions
        reraise=True,  # Reraise the last exception if all retries fail
    )
    def fetch_genius_lyrics(self, genius, title, artist):
        self.logger.debug(f"fetch_genius_lyrics attempting to fetch lyrics from Genius for {title} by {artist}")
        return genius.search_song(title, artist)

    def write_genius_lyrics_file(self):
        if self.genius_api_token and self.song_known:
            self.logger.debug(f"attempting genius fetch as genius_api_token and song name was set")
        else:
            self.logger.warning(f"skipping genius fetch as not all genius params were set")
            return

        genius_lyrics_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Genius).txt"))

        # Check cache first
        if os.path.isfile(genius_lyrics_cache_filepath):
            self.logger.debug(f"found existing file at genius_lyrics_cache_filepath, reading: {genius_lyrics_cache_filepath}")

            with open(genius_lyrics_cache_filepath, "r") as cached_lyrics:
                self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
                self.outputs["genius_lyrics_text"] = cached_lyrics.read()
                return
        self.logger.debug(f"no cached lyrics found at genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}, fetching from Genius")

        # Initialize Genius with better defaults
        genius = lyricsgenius.Genius(
            self.genius_api_token,
            verbose=(self.log_level == logging.DEBUG),
            remove_section_headers=True,
        )

        try:
            song = self.fetch_genius_lyrics(genius, self.title, self.artist)
            if song is None:
                self.logger.warning(f'Could not find lyrics on Genius for "{self.title}" by {self.artist}')
                return None

            lyrics = self.clean_genius_lyrics(song.lyrics)

            self.logger.debug(f"writing clean lyrics to genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}")
            with open(genius_lyrics_cache_filepath, "w", encoding="utf-8") as f:
                f.write(lyrics)

            self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
            self.outputs["genius_lyrics_text"] = lyrics
            return lyrics.split("\n")  # Return lines like write_lyrics_from_genius

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to fetch lyrics from Genius after multiple retries: {e}")
            raise

    def clean_genius_lyrics(self, lyrics):
        lyrics = lyrics.replace("\\n", "\n")
        lyrics = re.sub(r"You might also like", "", lyrics)
        lyrics = re.sub(
            r".*?Lyrics([A-Z])", r"\1", lyrics
        )  # Remove the song name and word "Lyrics" if this has a non-newline char at the start
        lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)  # Remove this example: 27 ContributorsSex Bomb Lyrics
        lyrics = re.sub(
            r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics
        )  # Remove this example: See Tom Jones LiveGet tickets as low as $71
        lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)  # Remove the word "Embed" at end of line with preceding numbers if found
        lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
        lyrics = re.sub(r"^Embed$", r"", lyrics)  # Remove the word "Embed" if it is on a line by itself
        lyrics = re.sub(r".*?\[.*?\].*?", "", lyrics)  # Remove lines containing square brackets
        # add any additional cleaning rules here
        return lyrics

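    # Sums segment durations and divides by the song length reported by ffprobe.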
    def calculate_singing_percentage(self):
        # Calculate total seconds of singing using timings from whisper transcription results
        total_singing_duration = sum(
            segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict_primary"]["segments"]
        )

        self.logger.debug(f"calculated total_singing_duration: {int(total_singing_duration)} seconds, now running ffprobe")

        # Calculate total song duration using ffprobe
        duration_command = [
            "ffprobe",
            "-i",
            self.audio_filepath,
            "-show_entries",
            "format=duration",
            "-v",
            "quiet",
            "-of",
            "csv=%s" % ("p=0"),
        ]
        duration_output = subprocess.check_output(duration_command, universal_newlines=True)
        song_duration = float(duration_output)

        # Calculate singing percentage
        singing_percentage = int((total_singing_duration / song_duration) * 100)

        self.outputs["singing_percentage"] = singing_percentage
        self.outputs["total_singing_duration"] = total_singing_duration
        self.outputs["song_duration"] = song_duration

    # Loops through lyrics segments (typically sentences) from whisper_timestamps JSON output,
    # then loops over each word and writes all words with MidiCo segment start/end formatting
    # and word-level timestamps to a MidiCo-compatible LRC file
    def write_midico_lrc_file(self):
        self.outputs["midico_lrc_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc"))

        lrc_filename = self.outputs["midico_lrc_filepath"]
        self.logger.debug(f"writing midico formatted word timestamps to LRC file: {lrc_filename}")
        with open(lrc_filename, "w", encoding="utf-8") as f:
            f.write("[re:MidiCo]\n")
            for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
                for i, word in enumerate(segment["words"]):
                    start_time = self.format_time_lrc(word["start"])
                    if i != len(segment["words"]) - 1:
                        if not word["text"].endswith(" "):
                            self.logger.debug(f"word '{word['text']}' does not end with a space, adding one")
                            word["text"] += " "
                    line = "[{}]1:{}{}\n".format(start_time, "/" if i == 0 else "", word["text"])
                    f.write(line)

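    # Groups corrected words into lines (max 36 chars) and screens (max 4 lines)
    # for subtitle rendering.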
    def create_screens(self):
        self.logger.debug("create_screens beginning generation of screens from transcription results")
        screens: List[subtitles.LyricsScreen] = []
        screen: Optional[subtitles.LyricsScreen] = None

        max_lines_per_screen = 4
        max_line_length = 36  # Maximum characters per line
        self.logger.debug(f"Max lines per screen: {max_lines_per_screen}, Max line length: {max_line_length}")

        for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
            self.logger.debug(f"Processing segment: {segment['text']}")
            if screen is None or len(screen.lines) >= max_lines_per_screen:
                screen = subtitles.LyricsScreen(video_size=self.video_resolution_num, line_height=self.line_height, logger=self.logger)
                screens.append(screen)
                self.logger.debug(f"Created new screen. Total screens: {len(screens)}")

            words = segment["words"]
            current_line = subtitles.LyricsLine()
            current_line_text = ""
            self.logger.debug(f"Processing {len(words)} words in segment")

            for word in words:
                self.logger.debug(f"Processing word: '{word['text']}'")
                if len(current_line_text) + len(word["text"]) + 1 > max_line_length or (current_line_text and word["text"][0].isupper()):
                    self.logger.debug(f"Current line would exceed max length or new capitalized word. Line: '{current_line_text}'")
                    if current_line.segments:
                        screen.lines.append(current_line)
                        self.logger.debug(f"Added line to screen. Lines on current screen: {len(screen.lines)}")
                        if len(screen.lines) >= max_lines_per_screen:
                            screen = subtitles.LyricsScreen(
                                video_size=self.video_resolution_num,
                                line_height=self.line_height,
                                logger=self.logger,
                            )
                            screens.append(screen)
                            self.logger.debug(f"Screen full, created new screen. Total screens: {len(screens)}")
                    current_line = subtitles.LyricsLine()
                    current_line_text = ""
                    self.logger.debug("Reset current line")

                current_line_text += (" " if current_line_text else "") + word["text"]

                # fmt: off
                lyric_segment = subtitles.LyricSegment(
                    text=word["text"],
                    ts=timedelta(seconds=word["start"]),
                    end_ts=timedelta(seconds=word["end"])
                )
                # fmt: on

                current_line.segments.append(lyric_segment)
                self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")

            if current_line.segments:
                screen.lines.append(current_line)
                self.logger.debug(f"Added final line of segment to screen. Lines on current screen: {len(screen.lines)}")

        self.logger.debug(f"Finished creating screens. Total screens created: {len(screens)}")
        return screens

    def write_ass_file(self):
        self.outputs["ass_subtitles_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).ass"))

        ass_filepath = self.outputs["ass_subtitles_filepath"]
        self.logger.debug(f"writing ASS formatted subtitle file: {ass_filepath}")

        initial_screens = self.create_screens()
        screens = subtitles.set_segment_end_times(initial_screens, int(self.outputs["song_duration"]))
        screens = subtitles.set_screen_start_times(screens)
        lyric_subtitles_ass = subtitles.create_styled_subtitles(screens, self.video_resolution_num, self.font_size)
        lyric_subtitles_ass.write(ass_filepath)

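    # Scales the background image to the target resolution with ffmpeg, caching the result.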
    def resize_background_image(self):
        self.logger.debug(
            f"resize_background_image attempting to resize background image: {self.video_background_image} to resolution: {self.video_resolution}"
        )
        background_image_resized = self.get_cache_filepath(f"-{self.video_resolution}.png")

        if os.path.isfile(background_image_resized):
            self.logger.debug(
                f"resize_background_image found existing resized background image, skipping resize: {background_image_resized}"
            )
            return background_image_resized

        resize_command = ["ffmpeg", "-i", self.video_background_image]
        resize_command += ["-vf", f"scale={self.video_resolution_num[0]}x{self.video_resolution_num[1]}"]

        resize_command += [background_image_resized]
        subprocess.check_output(resize_command, universal_newlines=True)

        if not os.path.isfile(background_image_resized):
            raise FileNotFoundError(
                f"background_image_resized was not a valid file after running ffmpeg to resize: {background_image_resized}"
            )

        return background_image_resized

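    # Assembles the ffmpeg command: image or solid-colour background, the original audio,
    # burned-in ASS subtitles, and hardware-accelerated H.264 encoding where available.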
    def create_video(self):
        self.logger.debug(f"create_video attempting to generate video file: {self.outputs['karaoke_video_filepath']}")

        audio_delay = 0
        audio_delay_ms = int(audio_delay * 1000)  # milliseconds

        video_metadata = []
        if self.artist:
            video_metadata.append("-metadata")
            video_metadata.append(f"artist={self.artist}")
        if self.title:
            video_metadata.append("-metadata")
            video_metadata.append(f"title={self.title}")

        # fmt: off
        ffmpeg_cmd = [
            "ffmpeg",
            "-r", "30",  # Set frame rate to 30 fps
        ]

        if self.video_background_image:
            self.logger.debug(f"background image set: {self.video_background_image}, resizing to resolution: {self.video_resolution}")

            background_image_resized = self.resize_background_image()

            ffmpeg_cmd += [
                # Use provided image as background
                "-loop", "1",  # Loop the image
                "-i", background_image_resized,  # Input image file
            ]

        else:
            self.logger.debug(f"background not set, using solid {self.video_background_color} background with resolution: {self.video_resolution}")
            ffmpeg_cmd += ["-f", "lavfi"]
            ffmpeg_cmd += ["-i", f"color=c={self.video_background_color}:s={self.video_resolution_num[0]}x{self.video_resolution_num[1]}:r=30"]

        # Check for hardware accelerated h.264 encoding and use if available
        video_codec = "libx264"
        ffmpeg_codes = subprocess.getoutput("ffmpeg -codecs")

        if "h264_videotoolbox" in ffmpeg_codes:
            video_codec = "h264_videotoolbox"
            self.logger.info(f"video codec set to hardware accelerated h264_videotoolbox")

        ffmpeg_cmd += [
            # Use accompaniment track as audio
            "-i", self.audio_filepath,
            # Set audio delay if needed
            # https://ffmpeg.org/ffmpeg-filters.html#adelay
            # "-af",
            # f"adelay=delays={audio_delay_ms}:all=1",
            # Re-encode audio as AAC
            "-c:a", "aac",
            # Add subtitles
            "-vf", "ass=" + self.outputs["ass_subtitles_filepath"],
            # Encode as H264 using hardware acceleration if available
            "-c:v", video_codec,
            # Increase output video quality
            "-preset", "slow",  # Use a slower preset for better compression efficiency
            # "-crf", "1",  # Lower CRF for higher quality. Adjust as needed, lower is better quality
            "-b:v", "5000k",  # Set the video bitrate, for example, 5000 kbps
            "-minrate", "5000k",  # Minimum bitrate
            "-maxrate", "20000k",  # Maximum bitrate
            "-bufsize", "10000k",  # Set the buffer size, typically 2x maxrate
            # End encoding after the shortest stream
            "-shortest",
            # Overwrite files without asking
            "-y",
            # Only encode the first 30 seconds (for testing, fast iteration when editing this)
            # "-t", "30",
            *video_metadata,
            # Output path of video
            self.outputs["karaoke_video_filepath"],
        ]
        # fmt: on

        self.logger.debug(f"running ffmpeg command to generate video: {' '.join(ffmpeg_cmd)}")
        ffmpeg_output = subprocess.check_output(ffmpeg_cmd, universal_newlines=True)
        return ffmpeg_output

    def format_time_lrc(self, duration):
        minutes = int(duration // 60)
        seconds = int(duration % 60)
        milliseconds = int((duration % 1) * 1000)
        formatted_time = f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
        return formatted_time

    def write_transcribed_lyrics_plain_text(self):
        if self.outputs["transcription_data_dict_whisper"]:
            transcribed_lyrics_text_whisper_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Whisper).txt"))
            self.logger.debug(f"Setting Whisper text filepath to: {transcribed_lyrics_text_whisper_filepath}")
            self.outputs["transcribed_lyrics_text_whisper_filepath"] = transcribed_lyrics_text_whisper_filepath
            self.outputs["transcribed_lyrics_text_whisper"] = ""

            self.logger.debug(f"Writing Whisper lyrics to: {transcribed_lyrics_text_whisper_filepath}")
            with open(transcribed_lyrics_text_whisper_filepath, "w", encoding="utf-8") as f:
                for segment in self.outputs["transcription_data_dict_whisper"]["segments"]:
                    self.outputs["transcribed_lyrics_text_whisper"] += segment["text"] + "\n"
                    f.write(segment["text"].strip() + "\n")
            self.logger.debug(f"Finished writing Whisper lyrics, file exists: {os.path.exists(transcribed_lyrics_text_whisper_filepath)}")

        if self.outputs["transcription_data_dict_audioshake"]:
            transcribed_lyrics_text_audioshake_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics AudioShake).txt"))
            self.outputs["transcribed_lyrics_text_audioshake_filepath"] = transcribed_lyrics_text_audioshake_filepath
            self.outputs["transcribed_lyrics_text_audioshake"] = ""

            self.logger.debug(f"Writing AudioShake lyrics to: {transcribed_lyrics_text_audioshake_filepath}")
            with open(transcribed_lyrics_text_audioshake_filepath, "w", encoding="utf-8") as f:
                for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]:
                    self.outputs["transcribed_lyrics_text_audioshake"] += segment["text"] + "\n"
                    f.write(segment["text"].strip() + "\n")

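    # Prefers splitting at a comma near the midpoint, then at " and ", then before a
    # capitalised word, then at the middle word, falling back to a hard cut at max_length.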
    def find_best_split_point(self, text, max_length):
        self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
        words = text.split()
        mid_word_index = len(words) // 2
        mid_point = len(" ".join(words[:mid_word_index]))
        self.logger.debug(f"Mid point is at character {mid_point}")

        # Check for a comma within one or two words of the middle word
        if "," in text:
            comma_indices = [i for i, char in enumerate(text) if char == ","]
            self.logger.debug(f"Found commas at indices: {comma_indices}")
            for index in comma_indices:
                if abs(mid_point - index) < 20 and len(text[: index + 1].strip()) <= max_length:
                    self.logger.debug(f"Choosing comma at index {index} as split point")
                    return index + 1  # Include the comma in the first part

        # Check for 'and'
        if " and " in text:
            and_indices = [m.start() for m in re.finditer(" and ", text)]
            self.logger.debug(f"Found 'and' at indices: {and_indices}")
            for index in sorted(and_indices, key=lambda x: abs(x - mid_point)):
                if len(text[: index + len(" and ")].strip()) <= max_length:
                    self.logger.debug(f"Choosing 'and' at index {index} as split point")
                    return index + len(" and ")

        # Check for words starting with a capital letter
        capital_word_indices = [m.start() for m in re.finditer(r"\s[A-Z]", text)]
        self.logger.debug(f"Found capital words at indices: {capital_word_indices}")
        for index in sorted(capital_word_indices, key=lambda x: abs(x - mid_point)):
            if index > 0 and len(text[:index].strip()) <= max_length:
                self.logger.debug(f"Choosing capital word at index {index} as split point")
                return index

        # If no better split point is found, try splitting at the middle word
        if len(words) > 2 and mid_word_index > 0:
            split_at_middle = len(" ".join(words[:mid_word_index]))
            if split_at_middle <= max_length:
                self.logger.debug(f"Choosing middle word split at index {split_at_middle}")
                return split_at_middle

        # If the text is still too long, forcibly split at the maximum length
        self.logger.debug(f"No suitable split point found, forcibly splitting at max_length {max_length}")
        return max_length

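    # Re-chunks over-long segments word by word, carrying word timings into each new segment.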
    def split_long_segments(self, segments, max_length):
        self.logger.debug(f"Splitting long segments (max_length: {max_length})")
        new_segments = []
        for segment in segments:
            text = segment["text"]
            self.logger.debug(f"Processing segment: '{text}' (length: {len(text)})")
            if len(text) <= max_length:
                self.logger.debug("Segment is within max_length, keeping as is")
                new_segments.append(segment)
            else:
                self.logger.debug("Segment exceeds max_length, splitting")
                meta_words = segment["words"]
                current_text = ""
                current_start = segment["start"]
                current_words = []

                for i, meta in enumerate(meta_words):
                    word = meta["text"]
                    if current_text:
                        current_text += " "
                    current_text += word
                    current_words.append(meta)

                    should_split = len(current_text) > max_length or (i > 0 and word[0].isupper())
                    if should_split:
                        self.logger.debug(f"Splitting at: '{current_text}'")
                        # If splitting due to capitalization, don't include the capitalized word
                        if word[0].isupper() and len(current_text.strip()) > len(word):
                            split_text = current_text[: -(len(word) + 1)].strip()
                            current_words = current_words[:-1]
                        else:
                            split_text = current_text.strip()

                        new_segment = {"text": split_text, "start": current_start, "end": current_words[-1]["end"], "words": current_words}
                        new_segments.append(new_segment)
                        self.logger.debug(f"Added new segment: {new_segment}")

                        # Reset for next segment
                        if word[0].isupper() and len(current_text.strip()) > len(word):
                            current_text = word
                            current_words = [meta]
                        else:
                            current_text = ""
                            current_words = []
                        current_start = meta["start"]

                # Add any remaining text as a final segment
                if current_text:
                    self.logger.debug(f"Adding final segment: '{current_text}'")
                    new_segments.append(
                        {"text": current_text.strip(), "start": current_start, "end": segment["end"], "words": current_words}
                    )

        self.logger.debug(f"Splitting complete. Original segments: {len(segments)}, New segments: {len(new_segments)}")
        return new_segments

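    # Loads cached transcriptions where possible; otherwise kicks off AudioShake (async)
    # and runs Whisper locally while the AudioShake job is processing.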
    def transcribe(self):
        # Check cache first
        transcription_cache_filepath_whisper = self.get_cache_filepath(" (Lyrics Whisper).json")
        transcription_cache_filepath_audioshake = self.get_cache_filepath(" (Lyrics AudioShake).json")

        self.logger.debug(f"Cache directory: {self.cache_dir}")
        self.logger.debug(f"Output directory: {self.output_dir}")

        if os.path.isfile(transcription_cache_filepath_whisper):
            self.logger.debug(f"Found existing Whisper transcription, reading: {transcription_cache_filepath_whisper}")
            with open(transcription_cache_filepath_whisper, "r") as cache_file:
                self.outputs["transcription_data_dict_whisper"] = json.load(cache_file)
                self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
                self.logger.debug(f"Loaded Whisper data and set filepath to: {self.outputs['transcription_data_whisper_filepath']}")

        if os.path.isfile(transcription_cache_filepath_audioshake):
            self.logger.debug(f"Found existing AudioShake transcription, reading: {transcription_cache_filepath_audioshake}")
            with open(transcription_cache_filepath_audioshake, "r") as cache_file:
                self.outputs["transcription_data_dict_audioshake"] = json.load(cache_file)
                self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake

        # If we have both cached transcriptions, set primary and return early
        if self.outputs["transcription_data_dict_whisper"] and self.outputs["transcription_data_dict_audioshake"]:
            self.set_primary_transcription()
            return
        # If we have Whisper cached and AudioShake isn't available, set primary and return early
        elif self.outputs["transcription_data_dict_whisper"] and not self.audioshake_api_token:
            self.set_primary_transcription()
            return

        # Continue with transcription for any missing data...
        audioshake_job_id = None
        if self.audioshake_api_token and not self.outputs["transcription_data_dict_audioshake"]:
            self.logger.debug(f"Starting AudioShake transcription")
            from .audioshake_transcriber import AudioShakeTranscriber

            audioshake = AudioShakeTranscriber(api_token=self.audioshake_api_token, logger=self.logger, output_prefix=self.output_prefix)
            audioshake_job_id = audioshake.start_transcription(self.audio_filepath)

        # Run Whisper transcription if needed while AudioShake processes
        if not self.outputs["transcription_data_dict_whisper"]:
            self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
            audio = whisper.load_audio(self.audio_filepath)
            model = whisper.load_model(self.transcription_model, device="cpu")
            whisper_data = whisper.transcribe(model, audio, language="en", beam_size=5, temperature=0.2, best_of=5)

            # Remove segments with no words, only music
            whisper_data["segments"] = [segment for segment in whisper_data["segments"] if segment["text"].strip() != "Music"]
            self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(whisper_data['segments'])}")

            # Split long segments
            self.logger.debug("Starting to split long segments")
            whisper_data["segments"] = self.split_long_segments(whisper_data["segments"], max_length=36)
            self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(whisper_data['segments'])}")

            # Store Whisper results
            self.outputs["transcription_data_dict_whisper"] = whisper_data
            self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
            with open(transcription_cache_filepath_whisper, "w") as cache_file:
                json.dump(whisper_data, cache_file, indent=4)

        # Now that Whisper is done, get AudioShake results if available
        if audioshake_job_id:
            self.logger.debug("Getting AudioShake results")
            audioshake_data = audioshake.get_transcription_result(audioshake_job_id)
            self.outputs["transcription_data_dict_audioshake"] = audioshake_data
            self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
            with open(transcription_cache_filepath_audioshake, "w") as cache_file:
                json.dump(audioshake_data, cache_file, indent=4)

        # Set the primary transcription source
        self.set_primary_transcription()

        # Write the text files
        self.write_transcribed_lyrics_plain_text()

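    # AudioShake results win over Whisper whenever both transcriptions exist.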
    def set_primary_transcription(self):
        """Set the primary transcription source (AudioShake if available, otherwise Whisper)"""
        if self.outputs["transcription_data_dict_audioshake"]:
            self.logger.info("Using AudioShake as primary transcription source")
            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_audioshake"]
            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_audioshake_filepath"]

            # Set the primary text content
            if "transcribed_lyrics_text_audioshake" not in self.outputs or not self.outputs["transcribed_lyrics_text_audioshake"]:
                self.outputs["transcribed_lyrics_text_audioshake"] = "\n".join(
                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]
                )
            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_audioshake"]
            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_audioshake_filepath"]
        else:
            self.logger.info("Using Whisper as primary transcription source")
            self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_whisper"]
            self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_whisper_filepath"]

            # Set the primary text content
            if "transcribed_lyrics_text_whisper" not in self.outputs or not self.outputs["transcribed_lyrics_text_whisper"]:
                self.outputs["transcribed_lyrics_text_whisper"] = "\n".join(
                    segment["text"].strip() for segment in self.outputs["transcription_data_dict_whisper"]["segments"]
                )
            self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_whisper"]
            self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_whisper_filepath"]

    def get_cache_filepath(self, extension):
        # Instead of using slugify and hash, use the consistent naming pattern
        cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(extension))
        self.logger.debug(f"get_cache_filepath returning cache_filepath: {cache_filepath}")
        return cache_filepath

    def get_song_slug(self):
        if not self.artist and not self.title:
            return "unknown_song_" + self.get_file_hash(self.audio_filepath)

        artist_slug = slugify.slugify(self.artist or "unknown_artist", lowercase=False)
        title_slug = slugify.slugify(self.title or "unknown_title", lowercase=False)
        return artist_slug + "-" + title_slug

    def get_file_hash(self, filepath):
        return hashlib.md5(open(filepath, "rb").read()).hexdigest()

    def create_folders(self):
        if self.cache_dir is not None:
            os.makedirs(self.cache_dir, exist_ok=True)

        if self.output_dir is not None:
            os.makedirs(self.output_dir, exist_ok=True)

    def get_output_filename(self, suffix):
        """Generate consistent filename with (Purpose) suffix pattern"""
        return f"{self.output_prefix}{suffix}"
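For reference, the removed module exposed a single entry point; a minimal usage sketch reconstructed from the constructor and generate() shown above (file paths illustrative, artist/title taken from the bundled test data):

    from lyrics_transcriber.transcriber import LyricsTranscriber  # 0.19.2 module path

    transcriber = LyricsTranscriber(
        "song.flac",  # audio_filepath is the only required argument
        artist="ABBA",
        title="Under Attack",
        render_video=True,
        video_resolution="1080p",
        output_dir="./output",
    )
    outputs = transcriber.generate()
    # generate() returns the outputs dict: artifact filepaths plus token usage, cost and timing stats
    print(outputs["midico_lrc_filepath"], outputs["karaoke_video_filepath"])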