lyrics-transcriber 0.19.2__py3-none-any.whl → 0.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. lyrics_transcriber/__init__.py +2 -5
  2. lyrics_transcriber/cli/main.py +194 -0
  3. lyrics_transcriber/core/__init__.py +0 -0
  4. lyrics_transcriber/core/controller.py +283 -0
  5. lyrics_transcriber/core/corrector.py +56 -0
  6. lyrics_transcriber/core/fetcher.py +143 -0
  7. lyrics_transcriber/output/__init__.py +0 -0
  8. lyrics_transcriber/output/generator.py +210 -0
  9. lyrics_transcriber/storage/__init__.py +0 -0
  10. lyrics_transcriber/storage/dropbox.py +249 -0
  11. lyrics_transcriber/storage/tokens.py +116 -0
  12. lyrics_transcriber/{audioshake_transcriber.py → transcribers/audioshake.py} +44 -15
  13. lyrics_transcriber/transcribers/base.py +31 -0
  14. lyrics_transcriber/transcribers/whisper.py +186 -0
  15. {lyrics_transcriber-0.19.2.dist-info → lyrics_transcriber-0.30.0.dist-info}/METADATA +6 -17
  16. lyrics_transcriber-0.30.0.dist-info/RECORD +22 -0
  17. lyrics_transcriber-0.30.0.dist-info/entry_points.txt +3 -0
  18. lyrics_transcriber/llm_prompts/README.md +0 -10
  19. lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -55
  20. lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -36
  21. lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -19
  22. lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -61
  23. lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -48
  24. lyrics_transcriber/transcriber.py +0 -1128
  25. lyrics_transcriber/utils/cli.py +0 -179
  26. lyrics_transcriber-0.19.2.dist-info/RECORD +0 -18
  27. lyrics_transcriber-0.19.2.dist-info/entry_points.txt +0 -3
  28. /lyrics_transcriber/{utils → cli}/__init__.py +0 -0
  29. /lyrics_transcriber/{utils → output}/ass.py +0 -0
  30. /lyrics_transcriber/{utils → output}/subtitles.py +0 -0
  31. {lyrics_transcriber-0.19.2.dist-info → lyrics_transcriber-0.30.0.dist-info}/LICENSE +0 -0
  32. {lyrics_transcriber-0.19.2.dist-info → lyrics_transcriber-0.30.0.dist-info}/WHEEL +0 -0
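The renames in items 12 and 28-30 translate directly into import-path changes. A minimal before/after sketch, assuming the moved modules keep their contents (AudioShakeTranscriber is the only class name confirmed by the deleted source below; the 0.30.0 paths are inferred from the file moves and not verified against the new wheel):

    # 0.19.2 (as imported by the deleted transcriber.py below)
    from lyrics_transcriber.audioshake_transcriber import AudioShakeTranscriber
    from lyrics_transcriber.utils import subtitles, ass

    # 0.30.0 equivalents implied by the renames above (assumed)
    from lyrics_transcriber.transcribers.audioshake import AudioShakeTranscriber
    from lyrics_transcriber.output import subtitles, ass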
lyrics_transcriber/transcriber.py (deleted in 0.30.0)
@@ -1,1128 +0,0 @@
- import os
- import sys
- import re
- import json
- import logging
- import shutil
- import hashlib
- import subprocess
- import slugify
- import whisper_timestamped as whisper
- import lyricsgenius
- import syrics.api
- from datetime import timedelta
- from .utils import subtitles
- from typing import List, Optional
- from openai import OpenAI
- from tenacity import retry, stop_after_delay, wait_exponential, retry_if_exception_type
- import requests
-
-
- class LyricsTranscriber:
-     def __init__(
-         self,
-         audio_filepath,
-         artist=None,
-         title=None,
-         openai_api_key=None,
-         audioshake_api_token=None,
-         genius_api_token=None,
-         spotify_cookie=None,
-         output_dir=None,
-         cache_dir="/tmp/lyrics-transcriber-cache/",
-         log_level=logging.DEBUG,
-         log_formatter=None,
-         transcription_model="medium",
-         llm_model="gpt-4o",
-         llm_prompt_matching=None,
-         llm_prompt_correction=None,
-         render_video=False,
-         video_resolution="360p",
-         video_background_image=None,
-         video_background_color="black",
-     ):
-         self.logger = logging.getLogger(__name__)
-         self.logger.setLevel(log_level)
-         self.log_level = log_level
-         self.log_formatter = log_formatter
-
-         self.log_handler = logging.StreamHandler()
-
-         if self.log_formatter is None:
-             self.log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(module)s - %(message)s")
-
-         self.log_handler.setFormatter(self.log_formatter)
-         self.logger.addHandler(self.log_handler)
-
-         self.logger.debug(f"LyricsTranscriber instantiating with input file: {audio_filepath}")
-
-         self.cache_dir = cache_dir
-         self.output_dir = output_dir
-         self.audio_filepath = audio_filepath
-         self.artist = artist
-         self.title = title
-         self.song_known = self.artist is not None and self.title is not None
-
-         self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
-         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
-         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
-         self.audioshake_api_token = os.getenv("AUDIOSHAKE_API_TOKEN", default=audioshake_api_token)
-
-         self.transcription_model = transcription_model
-         self.llm_model = llm_model
-
-         # Use package-relative paths for prompt files
-         if llm_prompt_matching is None:
-             llm_prompt_matching = os.path.join(
-                 os.path.dirname(__file__), "llm_prompts", "llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt"
-             )
-         if llm_prompt_correction is None:
-             llm_prompt_correction = os.path.join(
-                 os.path.dirname(__file__), "llm_prompts", "llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt"
-             )
-
-         self.llm_prompt_matching = llm_prompt_matching
-         self.llm_prompt_correction = llm_prompt_correction
-
-         if not os.path.exists(self.llm_prompt_matching):
-             raise FileNotFoundError(f"LLM prompt file not found: {self.llm_prompt_matching}")
-         if not os.path.exists(self.llm_prompt_correction):
-             raise FileNotFoundError(f"LLM prompt file not found: {self.llm_prompt_correction}")
-
-         self.openai_client = None
-
-         if self.openai_api_key:
-             self.openai_client = OpenAI(api_key=self.openai_api_key)
-
-             # Uncomment for local models e.g. with ollama
-             # self.openai_client = OpenAI(
-             #     base_url="http://localhost:11434/v1",
-             #     api_key="ollama",
-             # )
-
-             self.openai_client.log = self.log_level
-         else:
-             self.logger.warning("No OpenAI API key found, no correction will be applied to transcription")
-
-         self.render_video = render_video
-         self.video_resolution = video_resolution
-         self.video_background_image = video_background_image
-         self.video_background_color = video_background_color
-
-         match video_resolution:
-             case "4k":
-                 self.video_resolution_num = (3840, 2160)
-                 self.font_size = 250
-                 self.line_height = 250
-             case "1080p":
-                 self.video_resolution_num = (1920, 1080)
-                 self.font_size = 120
-                 self.line_height = 120
-             case "720p":
-                 self.video_resolution_num = (1280, 720)
-                 self.font_size = 100
-                 self.line_height = 100
-             case "360p":
-                 self.video_resolution_num = (640, 360)
-                 self.font_size = 50
-                 self.line_height = 50
-             case _:
-                 raise ValueError("Invalid video_resolution value. Must be one of: 4k, 1080p, 720p, 360p")
-
-         # If a video background is provided, validate file exists
-         if self.video_background_image is not None:
-             if os.path.isfile(self.video_background_image):
-                 self.logger.debug(f"video_background is valid file path: {self.video_background_image}")
-             else:
-                 raise FileNotFoundError(f"video_background is not a valid file path: {self.video_background_image}")
-
-         self.outputs = {
-             "transcription_data_dict_whisper": None,
-             "transcription_data_whisper_filepath": None,
-             "transcribed_lyrics_text_whisper": None,
-             "transcribed_lyrics_text_whisper_filepath": None,
-             "transcription_data_dict_audioshake": None,
-             "transcription_data_audioshake_filepath": None,
-             "transcribed_lyrics_text_audioshake": None,
-             "transcribed_lyrics_text_audioshake_filepath": None,
-             "transcription_data_dict_primary": None,
-             "transcription_data_primary_filepath": None,
-             "transcribed_lyrics_text_primary": None,
-             "transcribed_lyrics_text_primary_filepath": None,
-             "genius_lyrics_text": None,
-             "genius_lyrics_filepath": None,
-             "spotify_lyrics_data_dict": None,
-             "spotify_lyrics_data_filepath": None,
-             "spotify_lyrics_text_filepath": None,
-             "llm_token_usage": {"input": 0, "output": 0},
-             "llm_costs_usd": {"input": 0.0, "output": 0.0, "total": 0.0},
-             "llm_transcript": None,
-             "llm_transcript_filepath": None,
-             "corrected_lyrics_text": None,
-             "corrected_lyrics_text_filepath": None,
-             "midico_lrc_filepath": None,
-             "ass_subtitles_filepath": None,
-             "karaoke_video_filepath": None,
-             "singing_percentage": None,
-             "total_singing_duration": None,
-             "song_duration": None,
-             "output_dir": None,
-         }
-
-         if self.audio_filepath is None:
-             raise Exception("audio_filepath must be specified as the input source to transcribe")
-
-         self.create_folders()
-
-         self.output_prefix = f"{artist} - {title}"
-
-     def generate(self):
-         self.logger.debug(f"Starting generate() with cache_dir: {self.cache_dir} and output_dir: {self.output_dir}")
-
-         self.logger.debug(f"audio_filepath is set: {self.audio_filepath}, beginning initial whisper transcription")
-
-         self.transcribe()
-
-         self.write_transcribed_lyrics_plain_text()
-
-         self.write_genius_lyrics_file()
-         self.write_spotify_lyrics_data_file()
-         self.write_spotify_lyrics_plain_text()
-
-         self.validate_lyrics_match_song()
-
-         if self.openai_client:
-             self.write_corrected_lyrics_data_file()
-             self.write_corrected_lyrics_plain_text()
-         else:
-             self.logger.warning("Skipping LLM correction as no OpenAI client is available")
-             self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
-             self.write_corrected_lyrics_plain_text()
-
-         self.calculate_singing_percentage()
-
-         self.write_midico_lrc_file()
-         self.write_ass_file()
-
-         if self.render_video:
-             self.outputs["karaoke_video_filepath"] = self.get_cache_filepath(".mp4")
-             self.create_video()
-
-         self.copy_files_to_output_dir()
-         self.calculate_llm_costs()
-
-         if self.openai_client:
-             self.openai_client.close()
-
-         return self.outputs
-
-     def copy_files_to_output_dir(self):
-         if self.output_dir is None:
-             self.output_dir = os.getcwd()
-
-         self.logger.debug(f"copying temporary files to output dir: {self.output_dir}")
-         self.logger.debug("Files to copy:")
-         for key, value in self.outputs.items():
-             if key.endswith("_filepath"):
-                 self.logger.debug(f" {key}: {value}")
-                 if value and os.path.isfile(value):
-                     self.logger.debug(f" File exists, copying to {self.output_dir}")
-                     shutil.copy(value, self.output_dir)
-                 else:
-                     self.logger.debug(f" File doesn't exist or is None")
-
-         self.outputs["output_dir"] = self.output_dir
-
-     def validate_lyrics_match_song(self):
-         at_least_one_online_lyrics_validated = False
-
-         with open(self.llm_prompt_matching, "r") as file:
-             llm_matching_instructions = file.read()
-
-         for online_lyrics_source in ["genius", "spotify"]:
-             self.logger.debug(f"validating transcribed lyrics match lyrics from {online_lyrics_source}")
-
-             online_lyrics_text_key = f"{online_lyrics_source}_lyrics_text"
-             online_lyrics_filepath_key = f"{online_lyrics_source}_lyrics_filepath"
-
-             if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
-                 continue
-
-             if self.openai_client:
-                 data_input_str = f'Data input 1:\n{self.outputs["transcribed_lyrics_text_primary"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
-
-                 self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
-                 response = self.openai_client.chat.completions.create(
-                     model=self.llm_model,
-                     messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
-                 )
-
-                 message = response.choices[0].message.content
-                 finish_reason = response.choices[0].finish_reason
-
-                 self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
-                 self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
-
-                 if finish_reason == "stop":
-                     if message == "Yes":
-                         self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
-                         at_least_one_online_lyrics_validated = True
-                     elif message == "No":
-                         self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
-                         self.outputs[online_lyrics_text_key] = None
-                         self.outputs[online_lyrics_filepath_key] = None
-                     else:
-                         self.logger.error(f"Unexpected response from LLM: {message}")
-                 else:
-                     self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
-             else:
-                 # Fallback primitive word matching
-                 self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
-                 transcribed_words = set(self.outputs["transcribed_lyrics_text_primary"].split())
-                 online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
-                 common_words = transcribed_words & online_lyrics_words
-                 match_percentage = len(common_words) / len(online_lyrics_words) * 100
-
-                 if match_percentage >= 50:
-                     self.logger.info(
-                         f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
-                     )
-                     at_least_one_online_lyrics_validated = True
-                 else:
-                     self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
-                     self.outputs[online_lyrics_text_key] = None
-                     self.outputs[online_lyrics_filepath_key] = None
-
-         self.logger.info(
-             f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
-         )
-
-         if not at_least_one_online_lyrics_validated:
-             self.logger.error(
-                 f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
-             )
-
-     def write_corrected_lyrics_data_file(self):
-         if not self.openai_client:
-             self.logger.warning("Skipping LLM correction as no OpenAI client is available")
-             return
-
-         self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")
-
-         corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).json"))
-
-         if os.path.isfile(corrected_lyrics_data_json_cache_filepath):
-             self.logger.debug(
-                 f"found existing file at corrected_lyrics_data_json_cache_filepath, reading: {corrected_lyrics_data_json_cache_filepath}"
-             )
-
-             with open(corrected_lyrics_data_json_cache_filepath, "r") as corrected_lyrics_data_json:
-                 self.outputs["corrected_lyrics_data_filepath"] = corrected_lyrics_data_json_cache_filepath
-
-                 corrected_lyrics_data_dict = json.load(corrected_lyrics_data_json)
-                 self.outputs["corrected_lyrics_data_dict"] = corrected_lyrics_data_dict
-                 return
-
-         reference_lyrics = self.outputs.get("genius_lyrics_text") or self.outputs.get("spotify_lyrics_text")
-
-         if not reference_lyrics:
-             self.logger.warning("No reference lyrics found from Genius or Spotify. Skipping LLM correction.")
-             self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict_primary"]
-             return
-
-         self.logger.debug(
-             f"no cached lyrics found at corrected_lyrics_data_json_cache_filepath: {corrected_lyrics_data_json_cache_filepath}, attempting to run correction using LLM"
-         )
-
-         corrected_lyrics_dict = {"segments": []}
-
-         with open(self.llm_prompt_correction, "r") as file:
-             system_prompt_template = file.read()
-
-         system_prompt = system_prompt_template.replace("{{reference_lyrics}}", reference_lyrics)
-
-         # TODO: Test if results are cleaner when using the vocal file from a background vocal audio separation model
-         # TODO: Record more info about the correction process (e.g before/after diffs for each segment) to a file for debugging
-         # TODO: Possibly add a step after segment-based correct to get the LLM to self-analyse the diff
-
-         self.outputs["llm_transcript"] = ""
-         self.outputs["llm_transcript_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (LLM Transcript).txt"))
-
-         total_segments = len(self.outputs["transcription_data_dict_primary"]["segments"])
-         self.logger.info(f"Beginning correction using LLM, total segments: {total_segments}")
-
-         with open(self.outputs["llm_transcript_filepath"], "a", buffering=1, encoding="utf-8") as llm_transcript_file:
-             self.logger.debug(f"writing LLM chat instructions: {self.outputs['llm_transcript_filepath']}")
-
-             llm_transcript_header = f"--- SYSTEM instructions passed in for all segments ---:\n\n{system_prompt}\n"
-             self.outputs["llm_transcript"] += llm_transcript_header
-             llm_transcript_file.write(llm_transcript_header)
-
-             for segment in self.outputs["transcription_data_dict_primary"]["segments"]:
-                 # # Don't waste OpenAI dollars when testing!
-                 # if segment["id"] > 10:
-                 #     continue
-                 # if segment["id"] < 20 or segment["id"] > 24:
-                 #     continue
-
-                 llm_transcript_segment = ""
-                 segment_input = json.dumps(
-                     {
-                         "id": segment["id"],
-                         "start": segment["start"],
-                         "end": segment["end"],
-                         "confidence": segment["confidence"],
-                         "text": segment["text"],
-                         "words": segment["words"],
-                     }
-                 )
-
-                 previous_two_corrected_lines = ""
-                 upcoming_two_uncorrected_lines = ""
-
-                 for previous_segment in corrected_lyrics_dict["segments"]:
-                     if previous_segment["id"] in (segment["id"] - 2, segment["id"] - 1):
-                         previous_two_corrected_lines += previous_segment["text"].strip() + "\n"
-
-                 for next_segment in self.outputs["transcription_data_dict_primary"]["segments"]:
-                     if next_segment["id"] in (segment["id"] + 1, segment["id"] + 2):
-                         upcoming_two_uncorrected_lines += next_segment["text"].strip() + "\n"
-
-                 llm_transcript_segment += f"--- Segment {segment['id']} / {total_segments} ---\n"
-                 llm_transcript_segment += f"Previous two corrected lines:\n\n{previous_two_corrected_lines}\nUpcoming two uncorrected lines:\n\n{upcoming_two_uncorrected_lines}\nData input:\n\n{segment_input}\n"
-
-                 # fmt: off
-                 segment_prompt = system_prompt_template.replace(
-                     "{{previous_two_corrected_lines}}", previous_two_corrected_lines
-                 ).replace(
-                     "{{upcoming_two_uncorrected_lines}}", upcoming_two_uncorrected_lines
-                 ).replace(
-                     "{{segment_input}}", segment_input
-                 )
-
-                 self.logger.info(
-                     f'Calling completion model {self.llm_model} with instructions and data input for segment {segment["id"]} / {total_segments}:'
-                 )
-
-                 response = self.openai_client.chat.completions.create(
-                     model=self.llm_model,
-                     response_format={"type": "json_object"},
-                     seed=10,
-                     temperature=0.4,
-                     messages=[
-                         {
-                             "role": "user",
-                             "content": segment_prompt
-                         }
-                     ],
-                 )
-                 # fmt: on
-
-                 message = response.choices[0].message.content
-                 finish_reason = response.choices[0].finish_reason
-
-                 llm_transcript_segment += f"\n--- RESPONSE for segment {segment['id']} ---:\n\n"
-                 llm_transcript_segment += message
-                 llm_transcript_segment += f"\n--- END segment {segment['id']} / {total_segments} ---:\n\n"
-
-                 self.logger.debug(f"writing LLM chat transcript for segment to: {self.outputs['llm_transcript_filepath']}")
-                 llm_transcript_file.write(llm_transcript_segment)
-                 self.outputs["llm_transcript"] += llm_transcript_segment
-
-                 self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
-                 self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
-
-                 # self.logger.debug(f"response finish_reason: {finish_reason} message: \n{message}")
-
-                 if finish_reason == "stop":
-                     try:
-                         corrected_segment_dict = json.loads(message)
-                         corrected_lyrics_dict["segments"].append(corrected_segment_dict)
-                         self.logger.info("Successfully parsed response from GPT as JSON and appended to corrected_lyrics_dict.segments")
-                     except json.JSONDecodeError as e:
-                         raise Exception("Failed to parse response from GPT as JSON") from e
-                 else:
-                     self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
-
-         self.logger.info(f'Successfully processed correction for all {len(corrected_lyrics_dict["segments"])} lyrics segments')
-
-         self.logger.debug(f"writing corrected lyrics data JSON filepath: {corrected_lyrics_data_json_cache_filepath}")
-         with open(corrected_lyrics_data_json_cache_filepath, "w", encoding="utf-8") as corrected_lyrics_data_json_cache_file:
-             corrected_lyrics_data_json_cache_file.write(json.dumps(corrected_lyrics_dict, indent=4))
-
-         self.outputs["corrected_lyrics_data_filepath"] = corrected_lyrics_data_json_cache_filepath
-         self.outputs["corrected_lyrics_data_dict"] = corrected_lyrics_dict
-
-     def calculate_llm_costs(self):
-         price_dollars_per_1000_tokens = {
-             "gpt-3.5-turbo-1106": {
-                 "input": 0.0010,
-                 "output": 0.0020,
-             },
-             "gpt-4-1106-preview": {
-                 "input": 0.01,
-                 "output": 0.03,
-             },
-         }
-
-         input_price = price_dollars_per_1000_tokens.get(self.llm_model, {"input": 0, "output": 0})["input"]
-         output_price = price_dollars_per_1000_tokens.get(self.llm_model, {"input": 0, "output": 0})["output"]
-
-         input_cost = input_price * (self.outputs["llm_token_usage"]["input"] / 1000)
-         output_cost = output_price * (self.outputs["llm_token_usage"]["output"] / 1000)
-
-         self.outputs["llm_costs_usd"]["input"] = round(input_cost, 3)
-         self.outputs["llm_costs_usd"]["output"] = round(output_cost, 3)
-         self.outputs["llm_costs_usd"]["total"] = round(input_cost + output_cost, 3)
-
-     def write_corrected_lyrics_plain_text(self):
-         if self.outputs["corrected_lyrics_data_dict"]:
-             self.logger.debug(f"corrected_lyrics_data_dict exists, writing plain text lyrics file")
-
-             corrected_lyrics_text_filepath = os.path.join(
-                 self.cache_dir, self.get_output_filename(" (Lyrics Corrected).txt")  # Updated to use consistent naming
-             )
-             self.outputs["corrected_lyrics_text_filepath"] = corrected_lyrics_text_filepath
-
-             self.outputs["corrected_lyrics_text"] = ""
-
-             self.logger.debug(f"writing lyrics plain text to corrected_lyrics_text_filepath: {corrected_lyrics_text_filepath}")
-             with open(corrected_lyrics_text_filepath, "w", encoding="utf-8") as f:
-                 for corrected_segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
-                     self.outputs["corrected_lyrics_text"] += corrected_segment["text"].strip() + "\n"
-                     f.write(corrected_segment["text"].strip() + "\n")
-
-     def write_spotify_lyrics_data_file(self):
-         if self.spotify_cookie and self.song_known:
-             self.logger.debug(f"attempting spotify fetch as spotify_cookie and song name was set")
-         else:
-             self.logger.warning(f"skipping spotify fetch as not all spotify params were set")
-             return
-
-         spotify_lyrics_data_json_cache_filepath = os.path.join(
-             self.cache_dir, self.get_output_filename(" (Lyrics Spotify).json")  # Updated to use consistent naming
-         )
-
-         if os.path.isfile(spotify_lyrics_data_json_cache_filepath):
-             self.logger.debug(
-                 f"found existing file at spotify_lyrics_data_json_cache_filepath, reading: {spotify_lyrics_data_json_cache_filepath}"
-             )
-
-             with open(spotify_lyrics_data_json_cache_filepath, "r") as spotify_lyrics_data_json:
-                 spotify_lyrics_data_dict = json.load(spotify_lyrics_data_json)
-                 self.outputs["spotify_lyrics_data_filepath"] = spotify_lyrics_data_json_cache_filepath
-                 self.outputs["spotify_lyrics_data_dict"] = spotify_lyrics_data_dict
-                 return
-
-         self.logger.debug(
-             f"no cached lyrics found at spotify_lyrics_data_json_cache_filepath: {spotify_lyrics_data_json_cache_filepath}, attempting to fetch from spotify"
-         )
-
-         spotify_lyrics_json = None
-
-         try:
-             spotify_client = syrics.api.Spotify(self.spotify_cookie)
-             spotify_search_query = f"{self.title} - {self.artist}"
-             spotify_search_results = spotify_client.search(spotify_search_query, type="track", limit=5)
-
-             spotify_top_result = spotify_search_results["tracks"]["items"][0]
-             self.logger.debug(
-                 f"spotify_top_result: {spotify_top_result['artists'][0]['name']} - {spotify_top_result['name']} ({spotify_top_result['external_urls']['spotify']})"
-             )
-
-             spotify_lyrics_dict = spotify_client.get_lyrics(spotify_top_result["id"])
-             spotify_lyrics_json = json.dumps(spotify_lyrics_dict, indent=4)
-
-             self.logger.debug(
-                 f"writing lyrics data JSON to spotify_lyrics_data_json_cache_filepath: {spotify_lyrics_data_json_cache_filepath}"
-             )
-             with open(spotify_lyrics_data_json_cache_filepath, "w", encoding="utf-8") as f:
-                 f.write(spotify_lyrics_json)
-         except Exception as e:
-             self.logger.warn(f"caught exception while attempting to fetch from spotify: ", e)
-
-         self.outputs["spotify_lyrics_data_filepath"] = spotify_lyrics_data_json_cache_filepath
-         self.outputs["spotify_lyrics_data_dict"] = spotify_lyrics_dict
-
-     def write_spotify_lyrics_plain_text(self):
-         if self.outputs["spotify_lyrics_data_dict"]:
-             self.logger.debug(f"spotify_lyrics data found, checking/writing plain text lyrics file")
-
-             spotify_lyrics_text_filepath = os.path.join(
-                 self.cache_dir, self.get_output_filename(" (Lyrics Spotify).txt")  # Updated to use consistent naming
-             )
-             self.outputs["spotify_lyrics_text_filepath"] = spotify_lyrics_text_filepath
-
-             lines = self.outputs["spotify_lyrics_data_dict"]["lyrics"]["lines"]
-
-             self.outputs["spotify_lyrics_text"] = ""
-
-             self.logger.debug(f"writing lyrics plain text to spotify_lyrics_text_filepath: {spotify_lyrics_text_filepath}")
-             with open(spotify_lyrics_text_filepath, "w", encoding="utf-8") as f:
-                 for line in lines:
-                     self.outputs["spotify_lyrics_text"] += line["words"].strip() + "\n"
-                     f.write(line["words"].strip() + "\n")
-
-     @retry(
-         stop=stop_after_delay(120),  # Stop after 2 minutes
-         wait=wait_exponential(multiplier=1, min=4, max=60),  # Exponential backoff starting at 4 seconds
-         retry=retry_if_exception_type(requests.exceptions.RequestException),  # Retry on request exceptions
-         reraise=True,  # Reraise the last exception if all retries fail
-     )
-     def fetch_genius_lyrics(self, genius, title, artist):
-         self.logger.debug(f"fetch_genius_lyrics attempting to fetch lyrics from Genius for {title} by {artist}")
-         return genius.search_song(title, artist)
-
-     def write_genius_lyrics_file(self):
-         if self.genius_api_token and self.song_known:
-             self.logger.debug(f"attempting genius fetch as genius_api_token and song name was set")
-         else:
-             self.logger.warning(f"skipping genius fetch as not all genius params were set")
-             return
-
-         genius_lyrics_cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Genius).txt"))
-
-         # Check cache first
-         if os.path.isfile(genius_lyrics_cache_filepath):
-             self.logger.debug(f"found existing file at genius_lyrics_cache_filepath, reading: {genius_lyrics_cache_filepath}")
-
-             with open(genius_lyrics_cache_filepath, "r") as cached_lyrics:
-                 self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
-                 self.outputs["genius_lyrics_text"] = cached_lyrics.read()
-                 return
-         self.logger.debug(f"no cached lyrics found at genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}, fetching from Genius")
-
-         # Initialize Genius with better defaults
-         genius = lyricsgenius.Genius(
-             self.genius_api_token,
-             verbose=(self.log_level == logging.DEBUG),
-             remove_section_headers=True,
-         )
-
-         try:
-             song = self.fetch_genius_lyrics(genius, self.title, self.artist)
-             if song is None:
-                 self.logger.warning(f'Could not find lyrics on Genius for "{self.title}" by {self.artist}')
-                 return None
-
-             lyrics = self.clean_genius_lyrics(song.lyrics)
-
-             self.logger.debug(f"writing clean lyrics to genius_lyrics_cache_filepath: {genius_lyrics_cache_filepath}")
-             with open(genius_lyrics_cache_filepath, "w", encoding="utf-8") as f:
-                 f.write(lyrics)
-
-             self.outputs["genius_lyrics_filepath"] = genius_lyrics_cache_filepath
-             self.outputs["genius_lyrics_text"] = lyrics
-             return lyrics.split("\n")  # Return lines like write_lyrics_from_genius
-
-         except requests.exceptions.RequestException as e:
-             self.logger.error(f"Failed to fetch lyrics from Genius after multiple retries: {e}")
-             raise
-
-     def clean_genius_lyrics(self, lyrics):
-         lyrics = lyrics.replace("\\n", "\n")
-         lyrics = re.sub(r"You might also like", "", lyrics)
-         lyrics = re.sub(
-             r".*?Lyrics([A-Z])", r"\1", lyrics
-         )  # Remove the song name and word "Lyrics" if this has a non-newline char at the start
-         lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)  # Remove this example: 27 ContributorsSex Bomb Lyrics
-         lyrics = re.sub(
-             r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics
-         )  # Remove this example: See Tom Jones LiveGet tickets as low as $71
-         lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)  # Remove the word "Embed" at end of line with preceding numbers if found
-         lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
-         lyrics = re.sub(r"^Embed$", r"", lyrics)  # Remove the word "Embed" if it has been tacked onto a word at the end of a line
-         lyrics = re.sub(r".*?\[.*?\].*?", "", lyrics)  # Remove lines containing square brackets
-         # add any additional cleaning rules here
-         return lyrics
-
-     def calculate_singing_percentage(self):
-         # Calculate total seconds of singing using timings from whisper transcription results
-         total_singing_duration = sum(
-             segment["end"] - segment["start"] for segment in self.outputs["transcription_data_dict_primary"]["segments"]
-         )
-
-         self.logger.debug(f"calculated total_singing_duration: {int(total_singing_duration)} seconds, now running ffprobe")
-
-         # Calculate total song duration using ffprobe
-         duration_command = [
-             "ffprobe",
-             "-i",
-             self.audio_filepath,
-             "-show_entries",
-             "format=duration",
-             "-v",
-             "quiet",
-             "-of",
-             "csv=%s" % ("p=0"),
-         ]
-         duration_output = subprocess.check_output(duration_command, universal_newlines=True)
-         song_duration = float(duration_output)
-
-         # Calculate singing percentage
-         singing_percentage = int((total_singing_duration / song_duration) * 100)
-
-         self.outputs["singing_percentage"] = singing_percentage
-         self.outputs["total_singing_duration"] = total_singing_duration
-         self.outputs["song_duration"] = song_duration
-
-     # Loops through lyrics segments (typically sentences) from whisper_timestamps JSON output,
-     # then loops over each word and writes all words with MidiCo segment start/end formatting
-     # and word-level timestamps to a MidiCo-compatible LRC file
-     def write_midico_lrc_file(self):
-         self.outputs["midico_lrc_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).lrc"))
-
-         lrc_filename = self.outputs["midico_lrc_filepath"]
-         self.logger.debug(f"writing midico formatted word timestamps to LRC file: {lrc_filename}")
-         with open(lrc_filename, "w", encoding="utf-8") as f:
-             f.write("[re:MidiCo]\n")
-             for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
-                 for i, word in enumerate(segment["words"]):
-                     start_time = self.format_time_lrc(word["start"])
-                     if i != len(segment["words"]) - 1:
-                         if not word["text"].endswith(" "):
-                             self.logger.debug(f"word '{word['text']}' does not end with a space, adding one")
-                             word["text"] += " "
-                     line = "[{}]1:{}{}\n".format(start_time, "/" if i == 0 else "", word["text"])
-                     f.write(line)
-
-     def create_screens(self):
-         self.logger.debug("create_screens beginning generation of screens from transcription results")
-         screens: List[subtitles.LyricsScreen] = []
-         screen: Optional[subtitles.LyricsScreen] = None
-
-         max_lines_per_screen = 4
-         max_line_length = 36  # Maximum characters per line
-         self.logger.debug(f"Max lines per screen: {max_lines_per_screen}, Max line length: {max_line_length}")
-
-         for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
-             self.logger.debug(f"Processing segment: {segment['text']}")
-             if screen is None or len(screen.lines) >= max_lines_per_screen:
-                 screen = subtitles.LyricsScreen(video_size=self.video_resolution_num, line_height=self.line_height, logger=self.logger)
-                 screens.append(screen)
-                 self.logger.debug(f"Created new screen. Total screens: {len(screens)}")
-
-             words = segment["words"]
-             current_line = subtitles.LyricsLine()
-             current_line_text = ""
-             self.logger.debug(f"Processing {len(words)} words in segment")
-
-             for word in words:
-                 self.logger.debug(f"Processing word: '{word['text']}'")
-                 if len(current_line_text) + len(word["text"]) + 1 > max_line_length or (current_line_text and word["text"][0].isupper()):
-                     self.logger.debug(f"Current line would exceed max length or new capitalized word. Line: '{current_line_text}'")
-                     if current_line.segments:
-                         screen.lines.append(current_line)
-                         self.logger.debug(f"Added line to screen. Lines on current screen: {len(screen.lines)}")
-                         if len(screen.lines) >= max_lines_per_screen:
-                             screen = subtitles.LyricsScreen(
-                                 video_size=self.video_resolution_num,
-                                 line_height=self.line_height,
-                                 logger=self.logger,
-                             )
-                             screens.append(screen)
-                             self.logger.debug(f"Screen full, created new screen. Total screens: {len(screens)}")
-                     current_line = subtitles.LyricsLine()
-                     current_line_text = ""
-                     self.logger.debug("Reset current line")
-
-                 current_line_text += (" " if current_line_text else "") + word["text"]
-
-                 # fmt: off
-                 lyric_segment = subtitles.LyricSegment(
-                     text=word["text"],
-                     ts=timedelta(seconds=word["start"]),
-                     end_ts=timedelta(seconds=word["end"])
-                 )
-                 # fmt: on
-
-                 current_line.segments.append(lyric_segment)
-                 self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")
-
-             if current_line.segments:
-                 screen.lines.append(current_line)
-                 self.logger.debug(f"Added final line of segment to screen. Lines on current screen: {len(screen.lines)}")
-
-         self.logger.debug(f"Finished creating screens. Total screens created: {len(screens)}")
-         return screens
-
-     def write_ass_file(self):
-         self.outputs["ass_subtitles_filepath"] = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Corrected).ass"))
-
-         ass_filepath = self.outputs["ass_subtitles_filepath"]
-         self.logger.debug(f"writing ASS formatted subtitle file: {ass_filepath}")
-
-         initial_screens = self.create_screens()
-         screens = subtitles.set_segment_end_times(initial_screens, int(self.outputs["song_duration"]))
-         screens = subtitles.set_screen_start_times(screens)
-         lyric_subtitles_ass = subtitles.create_styled_subtitles(screens, self.video_resolution_num, self.font_size)
-         lyric_subtitles_ass.write(ass_filepath)
-
-     def resize_background_image(self):
-         self.logger.debug(
-             f"resize_background_image attempting to resize background image: {self.video_background_image} to resolution: {self.video_resolution}"
-         )
-         background_image_resized = self.get_cache_filepath(f"-{self.video_resolution}.png")
-
-         if os.path.isfile(background_image_resized):
-             self.logger.debug(
-                 f"resize_background_image found existing resized background image, skipping resize: {background_image_resized}"
-             )
-             return background_image_resized
-
-         resize_command = ["ffmpeg", "-i", self.video_background_image]
-         resize_command += ["-vf", f"scale={self.video_resolution_num[0]}x{self.video_resolution_num[1]}"]
-
-         resize_command += [background_image_resized]
-         subprocess.check_output(resize_command, universal_newlines=True)
-
-         if not os.path.isfile(background_image_resized):
-             raise FileNotFoundError(
-                 f"background_image_resized was not a valid file after running ffmpeg to resize: {background_image_resized}"
-             )
-
-         return background_image_resized
-
-     def create_video(self):
-         self.logger.debug(f"create_video attempting to generate video file: {self.outputs['karaoke_video_filepath']}")
-
-         audio_delay = 0
-         audio_delay_ms = int(audio_delay * 1000)  # milliseconds
-
-         video_metadata = []
-         if self.artist:
-             video_metadata.append("-metadata")
-             video_metadata.append(f"artist={self.artist}")
-         if self.title:
-             video_metadata.append("-metadata")
-             video_metadata.append(f"title={self.title}")
-
-         # fmt: off
-         ffmpeg_cmd = [
-             "ffmpeg",
-             "-r", "30",  # Set frame rate to 30 fps
-         ]
-
-         if self.video_background_image:
-             self.logger.debug(f"background image set: {self.video_background_image}, resizing to resolution: {self.video_resolution}")
-
-             background_image_resized = self.resize_background_image()
-
-             ffmpeg_cmd += [
-                 # Use provided image as background
-                 "-loop", "1",  # Loop the image
-                 "-i", background_image_resized,  # Input image file
-             ]
-
-         else:
-             self.logger.debug(f"background not set, using solid {self.video_background_color} background with resolution: {self.video_resolution}")
-             ffmpeg_cmd += ["-f", "lavfi"]
-             ffmpeg_cmd += ["-i", f"color=c={self.video_background_color}:s={self.video_resolution_num[0]}x{self.video_resolution_num[1]}:r=30"]
-
-
-         # Check for hardware acclerated h.264 encoding and use if available
-         video_codec = "libx264"
-         ffmpeg_codes = subprocess.getoutput("ffmpeg -codecs")
-
-         if "h264_videotoolbox" in ffmpeg_codes:
-             video_codec = "h264_videotoolbox"
-             self.logger.info(f"video codec set to hardware accelerated h264_videotoolbox")
-
-         ffmpeg_cmd += [
-             # Use accompaniment track as audio
-             "-i", self.audio_filepath,
-             # Set audio delay if needed
-             # https://ffmpeg.org/ffmpeg-filters.html#adelay
-             # "-af",
-             # f"adelay=delays={audio_delay_ms}:all=1",
-             # Re-encode audio as mp3
-             "-c:a", "aac",
-             # Add subtitles
-             "-vf", "ass=" + self.outputs["ass_subtitles_filepath"],
-             # Encode as H264 using hardware acceleration if available
-             "-c:v", video_codec,
-             # Increase output video quality
-             "-preset", "slow",  # Use a slower preset for better compression efficiency
-             # "-crf", "1",  # Lower CRF for higher quality. Adjust as needed, lower is better quality
-             "-b:v", "5000k",  # Set the video bitrate, for example, 5000 kbps
-             "-minrate", "5000k",  # Minimum bitrate
-             "-maxrate", "20000k",  # Maximum bitrate
-             "-bufsize", "10000k",  # Set the buffer size, typically 2x maxrate
-             # End encoding after the shortest stream
-             "-shortest",
-             # Overwrite files without asking
-             "-y",
-             # Only encode the first 30 seconds (for testing, fast iteration when editing this)
-             # "-t", "30",
-             *video_metadata,
-             # Output path of video
-             self.outputs["karaoke_video_filepath"],
-         ]
-         # fmt: on
-
-         self.logger.debug(f"running ffmpeg command to generate video: {' '.join(ffmpeg_cmd)}")
-         ffmpeg_output = subprocess.check_output(ffmpeg_cmd, universal_newlines=True)
-         return ffmpeg_output
-
-     def format_time_lrc(self, duration):
-         minutes = int(duration // 60)
-         seconds = int(duration % 60)
-         milliseconds = int((duration % 1) * 1000)
-         formatted_time = f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
-         return formatted_time
-
-     def write_transcribed_lyrics_plain_text(self):
-         if self.outputs["transcription_data_dict_whisper"]:
-             transcribed_lyrics_text_whisper_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics Whisper).txt"))
-             self.logger.debug(f"Setting Whisper text filepath to: {transcribed_lyrics_text_whisper_filepath}")
-             self.outputs["transcribed_lyrics_text_whisper_filepath"] = transcribed_lyrics_text_whisper_filepath
-             self.outputs["transcribed_lyrics_text_whisper"] = ""
-
-             self.logger.debug(f"Writing Whisper lyrics to: {transcribed_lyrics_text_whisper_filepath}")
-             with open(transcribed_lyrics_text_whisper_filepath, "w", encoding="utf-8") as f:
-                 for segment in self.outputs["transcription_data_dict_whisper"]["segments"]:
-                     self.outputs["transcribed_lyrics_text_whisper"] += segment["text"] + "\n"
-                     f.write(segment["text"].strip() + "\n")
-             self.logger.debug(f"Finished writing Whisper lyrics, file exists: {os.path.exists(transcribed_lyrics_text_whisper_filepath)}")
-
-         if self.outputs["transcription_data_dict_audioshake"]:
-             transcribed_lyrics_text_audioshake_filepath = os.path.join(self.cache_dir, self.get_output_filename(" (Lyrics AudioShake).txt"))
-             self.outputs["transcribed_lyrics_text_audioshake_filepath"] = transcribed_lyrics_text_audioshake_filepath
-             self.outputs["transcribed_lyrics_text_audioshake"] = ""
-
-             self.logger.debug(f"Writing AudioShake lyrics to: {transcribed_lyrics_text_audioshake_filepath}")
-             with open(transcribed_lyrics_text_audioshake_filepath, "w", encoding="utf-8") as f:
-                 for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]:
-                     self.outputs["transcribed_lyrics_text_audioshake"] += segment["text"] + "\n"
-                     f.write(segment["text"].strip() + "\n")
-
-     def find_best_split_point(self, text, max_length):
-         self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
-         words = text.split()
-         mid_word_index = len(words) // 2
-         mid_point = len(" ".join(words[:mid_word_index]))
-         self.logger.debug(f"Mid point is at character {mid_point}")
-
-         # Check for a comma within one or two words of the middle word
-         if "," in text:
-             comma_indices = [i for i, char in enumerate(text) if char == ","]
-             self.logger.debug(f"Found commas at indices: {comma_indices}")
-             for index in comma_indices:
-                 if abs(mid_point - index) < 20 and len(text[: index + 1].strip()) <= max_length:
-                     self.logger.debug(f"Choosing comma at index {index} as split point")
-                     return index + 1  # Include the comma in the first part
-
-         # Check for 'and'
-         if " and " in text:
-             and_indices = [m.start() for m in re.finditer(" and ", text)]
-             self.logger.debug(f"Found 'and' at indices: {and_indices}")
-             for index in sorted(and_indices, key=lambda x: abs(x - mid_point)):
-                 if len(text[: index + len(" and ")].strip()) <= max_length:
-                     self.logger.debug(f"Choosing 'and' at index {index} as split point")
-                     return index + len(" and ")
-
-         # Check for words starting with a capital letter
-         capital_word_indices = [m.start() for m in re.finditer(r"\s[A-Z]", text)]
-         self.logger.debug(f"Found capital words at indices: {capital_word_indices}")
-         for index in sorted(capital_word_indices, key=lambda x: abs(x - mid_point)):
-             if index > 0 and len(text[:index].strip()) <= max_length:
-                 self.logger.debug(f"Choosing capital word at index {index} as split point")
-                 return index
-
-         # If no better split point is found, try splitting at the middle word
-         if len(words) > 2 and mid_word_index > 0:
-             split_at_middle = len(" ".join(words[:mid_word_index]))
-             if split_at_middle <= max_length:
-                 self.logger.debug(f"Choosing middle word split at index {split_at_middle}")
-                 return split_at_middle
-
-         # If the text is still too long, forcibly split at the maximum length
-         self.logger.debug(f"No suitable split point found, forcibly splitting at max_length {max_length}")
-         return max_length
-
-     def split_long_segments(self, segments, max_length):
-         self.logger.debug(f"Splitting long segments (max_length: {max_length})")
-         new_segments = []
-         for segment in segments:
-             text = segment["text"]
-             self.logger.debug(f"Processing segment: '{text}' (length: {len(text)})")
-             if len(text) <= max_length:
-                 self.logger.debug("Segment is within max_length, keeping as is")
-                 new_segments.append(segment)
-             else:
-                 self.logger.debug("Segment exceeds max_length, splitting")
-                 meta_words = segment["words"]
-                 current_text = ""
-                 current_start = segment["start"]
-                 current_words = []
-
-                 for i, meta in enumerate(meta_words):
-                     word = meta["text"]
-                     if current_text:
-                         current_text += " "
-                     current_text += word
-                     current_words.append(meta)
-
-                     should_split = len(current_text) > max_length or (i > 0 and word[0].isupper())
-                     if should_split:
-                         self.logger.debug(f"Splitting at: '{current_text}'")
-                         # If splitting due to capitalization, don't include the capitalized word
-                         if word[0].isupper() and len(current_text.strip()) > len(word):
-                             split_text = current_text[: -(len(word) + 1)].strip()
-                             current_words = current_words[:-1]
-                         else:
-                             split_text = current_text.strip()
-
-                         new_segment = {"text": split_text, "start": current_start, "end": current_words[-1]["end"], "words": current_words}
-                         new_segments.append(new_segment)
-                         self.logger.debug(f"Added new segment: {new_segment}")
-
-                         # Reset for next segment
-                         if word[0].isupper() and len(current_text.strip()) > len(word):
-                             current_text = word
-                             current_words = [meta]
-                         else:
-                             current_text = ""
-                             current_words = []
-                         current_start = meta["start"]
-
-                 # Add any remaining text as a final segment
-                 if current_text:
-                     self.logger.debug(f"Adding final segment: '{current_text}'")
-                     new_segments.append(
-                         {"text": current_text.strip(), "start": current_start, "end": segment["end"], "words": current_words}
-                     )
-
-         self.logger.debug(f"Splitting complete. Original segments: {len(segments)}, New segments: {len(new_segments)}")
-         return new_segments
-
-     def transcribe(self):
-         # Check cache first
-         transcription_cache_filepath_whisper = self.get_cache_filepath(" (Lyrics Whisper).json")
-         transcription_cache_filepath_audioshake = self.get_cache_filepath(" (Lyrics AudioShake).json")
-
-         self.logger.debug(f"Cache directory: {self.cache_dir}")
-         self.logger.debug(f"Output directory: {self.output_dir}")
-
-         if os.path.isfile(transcription_cache_filepath_whisper):
-             self.logger.debug(f"Found existing Whisper transcription, reading: {transcription_cache_filepath_whisper}")
-             with open(transcription_cache_filepath_whisper, "r") as cache_file:
-                 self.outputs["transcription_data_dict_whisper"] = json.load(cache_file)
-                 self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
-                 self.logger.debug(f"Loaded Whisper data and set filepath to: {self.outputs['transcription_data_whisper_filepath']}")
-
-         if os.path.isfile(transcription_cache_filepath_audioshake):
-             self.logger.debug(f"Found existing AudioShake transcription, reading: {transcription_cache_filepath_audioshake}")
-             with open(transcription_cache_filepath_audioshake, "r") as cache_file:
-                 self.outputs["transcription_data_dict_audioshake"] = json.load(cache_file)
-                 self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
-
-         # If we have both cached transcriptions, set primary and return early
-         if self.outputs["transcription_data_dict_whisper"] and self.outputs["transcription_data_dict_audioshake"]:
-             self.set_primary_transcription()
-             return
-         # If we have Whisper cached and AudioShake isn't available, set primary and return early
-         elif self.outputs["transcription_data_dict_whisper"] and not self.audioshake_api_token:
-             self.set_primary_transcription()
-             return
-
-         # Continue with transcription for any missing data...
-         audioshake_job_id = None
-         if self.audioshake_api_token and not self.outputs["transcription_data_dict_audioshake"]:
-             self.logger.debug(f"Starting AudioShake transcription")
-             from .audioshake_transcriber import AudioShakeTranscriber
-
-             audioshake = AudioShakeTranscriber(api_token=self.audioshake_api_token, logger=self.logger, output_prefix=self.output_prefix)
-             audioshake_job_id = audioshake.start_transcription(self.audio_filepath)
-
-         # Run Whisper transcription if needed while AudioShake processes
-         if not self.outputs["transcription_data_dict_whisper"]:
-             self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
-             audio = whisper.load_audio(self.audio_filepath)
-             model = whisper.load_model(self.transcription_model, device="cpu")
-             whisper_data = whisper.transcribe(model, audio, language="en", beam_size=5, temperature=0.2, best_of=5)
-
-             # Remove segments with no words, only music
-             whisper_data["segments"] = [segment for segment in whisper_data["segments"] if segment["text"].strip() != "Music"]
-             self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(whisper_data['segments'])}")
-
-             # Split long segments
-             self.logger.debug("Starting to split long segments")
-             whisper_data["segments"] = self.split_long_segments(whisper_data["segments"], max_length=36)
-             self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(whisper_data['segments'])}")
-
-             # Store Whisper results
-             self.outputs["transcription_data_dict_whisper"] = whisper_data
-             self.outputs["transcription_data_whisper_filepath"] = transcription_cache_filepath_whisper
-             with open(transcription_cache_filepath_whisper, "w") as cache_file:
-                 json.dump(whisper_data, cache_file, indent=4)
-
-         # Now that Whisper is done, get AudioShake results if available
-         if audioshake_job_id:
-             self.logger.debug("Getting AudioShake results")
-             audioshake_data = audioshake.get_transcription_result(audioshake_job_id)
-             self.outputs["transcription_data_dict_audioshake"] = audioshake_data
-             self.outputs["transcription_data_audioshake_filepath"] = transcription_cache_filepath_audioshake
-             with open(transcription_cache_filepath_audioshake, "w") as cache_file:
-                 json.dump(audioshake_data, cache_file, indent=4)
-
-         # Set the primary transcription source
-         self.set_primary_transcription()
-
-         # Write the text files
-         self.write_transcribed_lyrics_plain_text()
-
-     def set_primary_transcription(self):
-         """Set the primary transcription source (AudioShake if available, otherwise Whisper)"""
-         if self.outputs["transcription_data_dict_audioshake"]:
-             self.logger.info("Using AudioShake as primary transcription source")
-             self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_audioshake"]
-             self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_audioshake_filepath"]
-
-             # Set the primary text content
-             if "transcribed_lyrics_text_audioshake" not in self.outputs or not self.outputs["transcribed_lyrics_text_audioshake"]:
-                 self.outputs["transcribed_lyrics_text_audioshake"] = "\n".join(
-                     segment["text"].strip() for segment in self.outputs["transcription_data_dict_audioshake"]["segments"]
-                 )
-             self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_audioshake"]
-             self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_audioshake_filepath"]
-         else:
-             self.logger.info("Using Whisper as primary transcription source")
-             self.outputs["transcription_data_dict_primary"] = self.outputs["transcription_data_dict_whisper"]
-             self.outputs["transcription_data_primary_filepath"] = self.outputs["transcription_data_whisper_filepath"]
-
-             # Set the primary text content
-             if "transcribed_lyrics_text_whisper" not in self.outputs or not self.outputs["transcribed_lyrics_text_whisper"]:
-                 self.outputs["transcribed_lyrics_text_whisper"] = "\n".join(
-                     segment["text"].strip() for segment in self.outputs["transcription_data_dict_whisper"]["segments"]
-                 )
-             self.outputs["transcribed_lyrics_text_primary"] = self.outputs["transcribed_lyrics_text_whisper"]
-             self.outputs["transcribed_lyrics_text_primary_filepath"] = self.outputs["transcribed_lyrics_text_whisper_filepath"]
-
-     def get_cache_filepath(self, extension):
-         # Instead of using slugify and hash, use the consistent naming pattern
-         cache_filepath = os.path.join(self.cache_dir, self.get_output_filename(extension))
-         self.logger.debug(f"get_cache_filepath returning cache_filepath: {cache_filepath}")
-         return cache_filepath
-
-     def get_song_slug(self):
-         if not self.artist and not self.title:
-             return "unknown_song_" + self.get_file_hash(self.audio_filepath)
-
-         artist_slug = slugify.slugify(self.artist or "unknown_artist", lowercase=False)
-         title_slug = slugify.slugify(self.title or "unknown_title", lowercase=False)
-         return artist_slug + "-" + title_slug
-
-     def get_file_hash(self, filepath):
-         return hashlib.md5(open(filepath, "rb").read()).hexdigest()
-
-     def create_folders(self):
-         if self.cache_dir is not None:
-             os.makedirs(self.cache_dir, exist_ok=True)
-
-         if self.output_dir is not None:
-             os.makedirs(self.output_dir, exist_ok=True)
-
-     def get_output_filename(self, suffix):
-         """Generate consistent filename with (Purpose) suffix pattern"""
-         return f"{self.output_prefix}{suffix}"
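
For reference, a minimal usage sketch of the removed 0.19.2 class, reconstructed from the deleted __init__() and generate() above. The audio path, artist/title, and tokens are placeholders; per the os.getenv() calls in __init__, keys may equally be supplied via the OPENAI_API_KEY, GENIUS_API_TOKEN, SPOTIFY_COOKIE_SP_DC, and AUDIOSHAKE_API_TOKEN environment variables:

    import logging
    from lyrics_transcriber.transcriber import LyricsTranscriber  # module deleted in 0.30.0

    transcriber = LyricsTranscriber(
        "/path/to/song.flac",        # audio_filepath is the only required argument
        artist="ABBA",               # example values; artist and title enable online lyrics fetching
        title="Under Attack",
        genius_api_token="...",      # or set GENIUS_API_TOKEN
        openai_api_key="...",        # or set OPENAI_API_KEY; omit to skip LLM correction
        output_dir="./output",
        render_video=True,
        video_resolution="720p",     # one of: 4k, 1080p, 720p, 360p
        log_level=logging.INFO,
    )

    outputs = transcriber.generate()  # returns the self.outputs dict defined in __init__ above
    print(outputs["midico_lrc_filepath"], outputs["karaoke_video_filepath"])

Note that the LRC writer above emits one line per word in the form [mm:ss.mmm]1:word, with a leading "/" on the first word of each segment, per format_time_lrc() and write_midico_lrc_file().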