lyrics-transcriber 0.14.0__tar.gz → 0.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/PKG-INFO +23 -2
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/README.md +21 -1
- lyrics_transcriber-0.16.0/lyrics_transcriber/audioshake_transcriber.py +35 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/transcriber.py +236 -83
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/cli.py +14 -1
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/subtitles.py +37 -11
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/pyproject.toml +2 -1
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/LICENSE +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/__init__.py +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/README.md +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/__init__.py +0 -0
- {lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/ass.py +0 -0
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.14.0
+Version: 0.16.0
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: Cython (>=0)
+Requires-Dist: auditok (>=0.2)
 Requires-Dist: dtw-python (>=1)
 Requires-Dist: llvmlite (>=0)
 Requires-Dist: lyricsgenius (>=3)
@@ -52,7 +53,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
 ### Prerequisites
 
 - Python 3.9 or higher
-- [Optional]
+- [Optional] Genius API token if you want to fetch lyrics from Genius
+- [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+- [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+- [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription
 
 ```
 pip install lyrics-transcriber
@@ -61,6 +65,23 @@ pip install lyrics-transcriber
 > **Warning**
 > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency from openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)
 
+## Docker
+
+You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` on Docker hub if you want, here's an example:
+
+```sh
+docker run \
+    -v `pwd`/input:/input \
+    -v `pwd`/output:/output \
+    beveradb/lyrics-transcriber:0.16.0 \
+    --log_level debug \
+    --output_dir /output \
+    --render_video \
+    --video_background_image /input/your-background-image.png \
+    --video_resolution 360p \
+    /input/song.flac
+```
+
 ## Usage 🚀
 
 ### As a standalone CLI
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/README.md
RENAMED
@@ -17,7 +17,10 @@ Automatically create synchronised lyrics files in ASS and MidiCo LRC formats wit
 ### Prerequisites
 
 - Python 3.9 or higher
-- [Optional]
+- [Optional] Genius API token if you want to fetch lyrics from Genius
+- [Optional] Spotify cookie value if you want to fetch lyrics from Spotify
+- [Optional] OpenAI API token if you want to use LLM correction of the transcribed lyrics
+- [Optional] AudioShake API token if you want to use a much higher quality (but paid) API for lyrics transcription
 
 ```
 pip install lyrics-transcriber
@@ -26,6 +29,23 @@ pip install lyrics-transcriber
 > **Warning**
 > The package published to PyPI was created by manually editing `poetry.lock` to remove [triton](https://github.com/openai/triton), as it is technically a sub-dependency from openai-whisper but is currently only supported on Linux (whisper still works fine without it, and I want this package to be usable on any platform)
 
+## Docker
+
+You can use the pre-built container image `beveradb/lyrics-transcriber:0.16.0` on Docker hub if you want, here's an example:
+
+```sh
+docker run \
+    -v `pwd`/input:/input \
+    -v `pwd`/output:/output \
+    beveradb/lyrics-transcriber:0.16.0 \
+    --log_level debug \
+    --output_dir /output \
+    --render_video \
+    --video_background_image /input/your-background-image.png \
+    --video_resolution 360p \
+    /input/song.flac
+```
+
 ## Usage 🚀
 
 ### As a standalone CLI
lyrics_transcriber-0.16.0/lyrics_transcriber/audioshake_transcriber.py
ADDED
@@ -0,0 +1,35 @@
+import logging
+import requests
+
+
+class AudioShakeTranscriber:
+    def __init__(self, api_token, log_level=logging.DEBUG):
+        self.api_token = api_token
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(log_level)
+
+    def transcribe(self, audio_filepath):
+        # This is a placeholder for the actual AudioShake API implementation
+        self.logger.info(f"Transcribing {audio_filepath} using AudioShake API")
+
+        self.logger.debug(f"AudioShake API token: {self.api_token}")
+        # TODO: Implement the actual API call to AudioShake
+        # For now, we'll return a dummy result
+        return {
+            "transcription_data_dict": {
+                "segments": [
+                    {
+                        "start": 0,
+                        "end": 5,
+                        "text": "This is a dummy transcription",
+                        "words": [
+                            {"text": "This", "start": 0, "end": 1},
+                            {"text": "is", "start": 1, "end": 2},
+                            {"text": "a", "start": 2, "end": 3},
+                            {"text": "dummy", "start": 3, "end": 4},
+                            {"text": "transcription", "start": 4, "end": 5},
+                        ],
+                    }
+                ]
+            }
+        }
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/transcriber.py
RENAMED
@@ -22,6 +22,8 @@ class LyricsTranscriber:
         audio_filepath,
         artist=None,
         title=None,
+        openai_api_key=None,
+        audioshake_api_token=None,
         genius_api_token=None,
         spotify_cookie=None,
         output_dir=None,
@@ -59,23 +61,30 @@ class LyricsTranscriber:
         self.title = title
         self.song_known = self.artist is not None and self.title is not None
 
+        self.openai_api_key = os.getenv("OPENAI_API_KEY", default=openai_api_key)
         self.genius_api_token = os.getenv("GENIUS_API_TOKEN", default=genius_api_token)
         self.spotify_cookie = os.getenv("SPOTIFY_COOKIE_SP_DC", default=spotify_cookie)
+        self.audioshake_api_token = os.getenv("AUDIOSHAKE_TOKEN", default=audioshake_api_token)
 
         self.transcription_model = transcription_model
         self.llm_model = llm_model
         self.llm_prompt_matching = llm_prompt_matching
         self.llm_prompt_correction = llm_prompt_correction
 
-        self.openai_client =
+        self.openai_client = None
 
-
-
-        # base_url="http://localhost:11434/v1",
-        # api_key="ollama",
-        # )
+        if self.openai_api_key:
+            self.openai_client = OpenAI(api_key=self.openai_api_key)
 
-
+            # Uncomment for local models e.g. with ollama
+            # self.openai_client = OpenAI(
+            #     base_url="http://localhost:11434/v1",
+            #     api_key="ollama",
+            # )
+
+            self.openai_client.log = self.log_level
+        else:
+            self.logger.error("No OpenAI API key found, no correction will be applied to transcription")
 
         self.render_video = render_video
         self.video_resolution = video_resolution
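Note the precedence these lookups give: because the constructor argument is only passed as the `os.getenv` default, an environment variable always wins over a value supplied in code. A standalone sketch of the same pattern:

```python
import os

# Same lookup pattern as the constructor above: the environment variable,
# if set, takes precedence; the keyword argument is only the fallback.
def resolve_token(env_var, explicit_value=None):
    return os.getenv(env_var, default=explicit_value)

os.environ["AUDIOSHAKE_TOKEN"] = "token-from-env"  # hypothetical value
print(resolve_token("AUDIOSHAKE_TOKEN", explicit_value="token-from-kwarg"))
# -> token-from-env
```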
@@ -154,8 +163,13 @@ class LyricsTranscriber:
 
         self.validate_lyrics_match_song()
 
-        self.
-
+        if self.openai_client:
+            self.write_corrected_lyrics_data_file()
+            self.write_corrected_lyrics_plain_text()
+        else:
+            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+            self.outputs["corrected_lyrics_data_dict"] = self.outputs["transcription_data_dict"]
+            self.write_corrected_lyrics_plain_text()
 
         self.calculate_singing_percentage()
 
@@ -169,7 +183,8 @@ class LyricsTranscriber:
         self.copy_files_to_output_dir()
         self.calculate_llm_costs()
 
-        self.openai_client
+        if self.openai_client:
+            self.openai_client.close()
 
         return self.outputs
 
@@ -198,41 +213,55 @@ class LyricsTranscriber:
             online_lyrics_text_key = f"{online_lyrics_source}_lyrics_text"
             online_lyrics_filepath_key = f"{online_lyrics_source}_lyrics_filepath"
 
-            if online_lyrics_text_key not in self.outputs:
+            if online_lyrics_text_key not in self.outputs or self.outputs[online_lyrics_text_key] is None:
                 continue
 
-
-
-
-
-            # self.logger.debug(f"system_prompt:\n{system_prompt}\ndata_input_str:\n{data_input_str}")
-
-            self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
-            response = self.openai_client.chat.completions.create(
-                model=self.llm_model,
-                messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
-            )
+            if self.openai_client:
+                data_input_str = (
+                    f'Data input 1:\n{self.outputs["transcribed_lyrics_text"]}\nData input 2:\n{self.outputs[online_lyrics_text_key]}\n'
+                )
 
-
-
+                self.logger.debug(f"making API call to LLM model {self.llm_model} to validate {online_lyrics_source} lyrics match")
+                response = self.openai_client.chat.completions.create(
+                    model=self.llm_model,
+                    messages=[{"role": "system", "content": llm_matching_instructions}, {"role": "user", "content": data_input_str}],
+                )
 
-
-
+                message = response.choices[0].message.content
+                finish_reason = response.choices[0].finish_reason
 
-
+                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
+                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
 
-
-
-
+                if finish_reason == "stop":
+                    if message == "Yes":
+                        self.logger.info(f"{online_lyrics_source} lyrics successfully validated to match transcription")
+                        at_least_one_online_lyrics_validated = True
+                    elif message == "No":
+                        self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
+                        self.outputs[online_lyrics_text_key] = None
+                        self.outputs[online_lyrics_filepath_key] = None
+                    else:
+                        self.logger.error(f"Unexpected response from LLM: {message}")
+                else:
+                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
+            else:
+                # Fallback primitive word matching
+                self.logger.debug(f"Using primitive word matching to validate {online_lyrics_source} lyrics match")
+                transcribed_words = set(self.outputs["transcribed_lyrics_text"].split())
+                online_lyrics_words = set(self.outputs[online_lyrics_text_key].split())
+                common_words = transcribed_words & online_lyrics_words
+                match_percentage = len(common_words) / len(online_lyrics_words) * 100
+
+                if match_percentage >= 50:
+                    self.logger.info(
+                        f"{online_lyrics_source} lyrics successfully validated to match transcription with {match_percentage:.2f}% word match"
+                    )
                     at_least_one_online_lyrics_validated = True
-
+                else:
                     self.logger.warning(f"{online_lyrics_source} lyrics do not match transcription, deleting that source from outputs")
                     self.outputs[online_lyrics_text_key] = None
                     self.outputs[online_lyrics_filepath_key] = None
-            else:
-                self.logger.error(f"Unexpected response from LLM: {message}")
-            else:
-                self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
 
         self.logger.info(
             f"Completed validation of transcription using online lyrics sources. Match found: {at_least_one_online_lyrics_validated}"
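When no OpenAI client is available, the new fallback path replaces the LLM check with a set-intersection word match against a 50% threshold. A standalone sketch of that calculation with toy lyrics:

```python
# Toy inputs; the real method reads both texts from self.outputs.
transcribed_lyrics_text = "you are the dancing queen young and sweet"
online_lyrics_text = "dancing queen young and sweet only seventeen"

transcribed_words = set(transcribed_lyrics_text.split())
online_lyrics_words = set(online_lyrics_text.split())
common_words = transcribed_words & online_lyrics_words
match_percentage = len(common_words) / len(online_lyrics_words) * 100

print(f"{match_percentage:.2f}% word match")  # 71.43%, above the 50% threshold
```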
@@ -242,9 +271,12 @@ class LyricsTranscriber:
             self.logger.error(
                 f"Lyrics from Genius and Spotify did not match the transcription. Please check artist and title are set correctly."
             )
-            raise Exception("Cannot proceed without internet lyrics to validate / correct transcription")
 
     def write_corrected_lyrics_data_file(self):
+        if not self.openai_client:
+            self.logger.warning("Skipping LLM correction as no OpenAI client is available")
+            return
+
         self.logger.debug("write_corrected_lyrics_data_file initiating OpenAI client")
 
         corrected_lyrics_data_json_cache_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + "-corrected.json")
@@ -583,51 +615,57 @@ class LyricsTranscriber:
             f.write(line)
 
     def create_screens(self):
-        self.logger.debug(
+        self.logger.debug("create_screens beginning generation of screens from whisper results")
         screens: List[subtitles.LyricsScreen] = []
-        line: Optional[subtitles.LyricsLine] = None
         screen: Optional[subtitles.LyricsScreen] = None
 
-
+        max_lines_per_screen = 4
+        max_line_length = 36  # Maximum characters per line
+        self.logger.debug(f"Max lines per screen: {max_lines_per_screen}, Max line length: {max_line_length}")
+
         for segment in self.outputs["corrected_lyrics_data_dict"]["segments"]:
-            self.logger.debug(f"
-            if screen is None:
-
-
-                screen.
-
-
-
-
-
-
-            for
-
+            self.logger.debug(f"Processing segment: {segment['text']}")
+            if screen is None or len(screen.lines) >= max_lines_per_screen:
+                screen = subtitles.LyricsScreen(video_size=self.video_resolution_num, line_height=self.line_height, logger=self.logger)
+                screens.append(screen)
+                self.logger.debug(f"Created new screen. Total screens: {len(screens)}")
+
+            words = segment["words"]
+            current_line = subtitles.LyricsLine()
+            current_line_text = ""
+            self.logger.debug(f"Processing {len(words)} words in segment")
+
+            for word in words:
+                self.logger.debug(f"Processing word: '{word['text']}'")
+                if len(current_line_text) + len(word["text"]) + 1 > max_line_length or (current_line_text and word["text"][0].isupper()):
+                    self.logger.debug(f"Current line would exceed max length or new capitalized word. Line: '{current_line_text}'")
+                    if current_line.segments:
+                        screen.lines.append(current_line)
+                        self.logger.debug(f"Added line to screen. Lines on current screen: {len(screen.lines)}")
+                        if len(screen.lines) >= max_lines_per_screen:
+                            screen = subtitles.LyricsScreen(
+                                video_size=self.video_resolution_num,
+                                line_height=self.line_height,
+                                logger=self.logger,
+                            )
+                            screens.append(screen)
+                            self.logger.debug(f"Screen full, created new screen. Total screens: {len(screens)}")
+                    current_line = subtitles.LyricsLine()
+                    current_line_text = ""
+                    self.logger.debug("Reset current line")
+
+                current_line_text += (" " if current_line_text else "") + word["text"]
+                lyric_segment = subtitles.LyricSegment(
                     text=word["text"], ts=timedelta(seconds=word["start"]), end_ts=timedelta(seconds=word["end"])
                 )
-
-
-                # If word is last in the line, add line to screen and start new line
-                # Before looping to the next word
-                if word_index == num_words_in_segment - 1:
-                    self.logger.debug(f"word_index is last in segment, adding line to screen and starting new line")
-                    screen.lines.append(line)
-                    lines_in_current_screen += 1
-                    line = None
-
-                # If current screen has 2 lines already, add screen to list and start new screen
-                # Before looping to the next line
-                if lines_in_current_screen == 2:
-                    self.logger.debug(f"lines_in_current_screen is 2, adding screen to list and starting new screen")
-                    screens.append(screen)
-                    screen = None
-                    lines_in_current_screen = 0
+                current_line.segments.append(lyric_segment)
+                self.logger.debug(f"Added word to current line. Current line: '{current_line_text}'")
 
-
-
-
-        screens.append(screen)  # type: ignore[arg-type]
+            if current_line.segments:
+                screen.lines.append(current_line)
+                self.logger.debug(f"Added final line of segment to screen. Lines on current screen: {len(screen.lines)}")
 
+        self.logger.debug(f"Finished creating screens. Total screens created: {len(screens)}")
         return screens
 
     def write_ass_file(self):
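The rewritten `create_screens` wraps lines greedily: a new line starts when appending the next word would push past 36 characters, or when a capitalized word appears mid-line (treated as a likely phrase boundary). A toy sketch of just that wrap rule, outside the subtitle classes:

```python
# Toy re-implementation of the wrap condition used in create_screens above.
max_line_length = 36

words = ["so", "when", "you're", "near", "me", "Darling", "can't", "you", "hear", "me"]
lines, current = [], ""
for word in words:
    # Break before the word if the line would get too long, or at a capital.
    if (len(current) + len(word) + 1 > max_line_length) or (current and word[0].isupper()):
        lines.append(current)
        current = ""
    current += (" " if current else "") + word
if current:
    lines.append(current)

print(lines)  # ["so when you're near me", "Darling can't you hear me"]
```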
@@ -760,7 +798,10 @@ class LyricsTranscriber:
 
     def write_transcribed_lyrics_plain_text(self):
         if self.outputs["transcription_data_dict"]:
-
+            transcription_cache_suffix = "-audioshake-transcribed.txt" if self.audioshake_api_token else "-whisper-transcribed.txt"
+            self.logger.debug(f"transcription_cache_suffix: {transcription_cache_suffix}")
+
+            transcribed_lyrics_text_filepath = os.path.join(self.cache_dir, "lyrics-" + self.get_song_slug() + transcription_cache_suffix)
             self.outputs["transcribed_lyrics_text_filepath"] = transcribed_lyrics_text_filepath
 
             self.outputs["transcribed_lyrics_text"] = ""
@@ -773,8 +814,109 @@ class LyricsTranscriber:
         else:
             raise Exception("Cannot write transcribed lyrics plain text as transcription_data_dict is not set")
 
+    def find_best_split_point(self, text, max_length):
+        self.logger.debug(f"Finding best split point for text: '{text}' (max_length: {max_length})")
+        words = text.split()
+        mid_word_index = len(words) // 2
+        mid_point = len(" ".join(words[:mid_word_index]))
+        self.logger.debug(f"Mid point is at character {mid_point}")
+
+        # Check for a comma within one or two words of the middle word
+        if "," in text:
+            comma_indices = [i for i, char in enumerate(text) if char == ","]
+            self.logger.debug(f"Found commas at indices: {comma_indices}")
+            for index in comma_indices:
+                if abs(mid_point - index) < 20 and len(text[: index + 1].strip()) <= max_length:
+                    self.logger.debug(f"Choosing comma at index {index} as split point")
+                    return index + 1  # Include the comma in the first part
+
+        # Check for 'and'
+        if " and " in text:
+            and_indices = [m.start() for m in re.finditer(" and ", text)]
+            self.logger.debug(f"Found 'and' at indices: {and_indices}")
+            for index in sorted(and_indices, key=lambda x: abs(x - mid_point)):
+                if len(text[: index + len(" and ")].strip()) <= max_length:
+                    self.logger.debug(f"Choosing 'and' at index {index} as split point")
+                    return index + len(" and ")
+
+        # Check for words starting with a capital letter
+        capital_word_indices = [m.start() for m in re.finditer(r"\s[A-Z]", text)]
+        self.logger.debug(f"Found capital words at indices: {capital_word_indices}")
+        for index in sorted(capital_word_indices, key=lambda x: abs(x - mid_point)):
+            if index > 0 and len(text[:index].strip()) <= max_length:
+                self.logger.debug(f"Choosing capital word at index {index} as split point")
+                return index
+
+        # If no better split point is found, try splitting at the middle word
+        if len(words) > 2 and mid_word_index > 0:
+            split_at_middle = len(" ".join(words[:mid_word_index]))
+            if split_at_middle <= max_length:
+                self.logger.debug(f"Choosing middle word split at index {split_at_middle}")
+                return split_at_middle
+
+        # If the text is still too long, forcibly split at the maximum length
+        self.logger.debug(f"No suitable split point found, forcibly splitting at max_length {max_length}")
+        return max_length
+
+    def split_long_segments(self, segments, max_length):
+        self.logger.debug(f"Splitting long segments (max_length: {max_length})")
+        new_segments = []
+        for segment in segments:
+            text = segment["text"]
+            self.logger.debug(f"Processing segment: '{text}' (length: {len(text)})")
+            if len(text) <= max_length:
+                self.logger.debug("Segment is within max_length, keeping as is")
+                new_segments.append(segment)
+            else:
+                self.logger.debug("Segment exceeds max_length, splitting")
+                meta_words = segment["words"]
+                current_text = ""
+                current_start = segment["start"]
+                current_words = []
+
+                for i, meta in enumerate(meta_words):
+                    word = meta["text"]
+                    if current_text:
+                        current_text += " "
+                    current_text += word
+                    current_words.append(meta)
+
+                    should_split = len(current_text) > max_length or (i > 0 and word[0].isupper())
+                    if should_split:
+                        self.logger.debug(f"Splitting at: '{current_text}'")
+                        # If splitting due to capitalization, don't include the capitalized word
+                        if word[0].isupper() and len(current_text.strip()) > len(word):
+                            split_text = current_text[: -(len(word) + 1)].strip()
+                            current_words = current_words[:-1]
+                        else:
+                            split_text = current_text.strip()
+
+                        new_segment = {"text": split_text, "start": current_start, "end": current_words[-1]["end"], "words": current_words}
+                        new_segments.append(new_segment)
+                        self.logger.debug(f"Added new segment: {new_segment}")
+
+                        # Reset for next segment
+                        if word[0].isupper() and len(current_text.strip()) > len(word):
+                            current_text = word
+                            current_words = [meta]
+                        else:
+                            current_text = ""
+                            current_words = []
+                        current_start = meta["start"]
+
+                # Add any remaining text as a final segment
+                if current_text:
+                    self.logger.debug(f"Adding final segment: '{current_text}'")
+                    new_segments.append(
+                        {"text": current_text.strip(), "start": current_start, "end": segment["end"], "words": current_words}
+                    )
+
+        self.logger.debug(f"Splitting complete. Original segments: {len(segments)}, New segments: {len(new_segments)}")
+        return new_segments
+
     def transcribe(self):
-
+        transcription_cache_suffix = "-audioshake" if self.audioshake_api_token else "-whisper"
+        self.outputs["transcription_data_filepath"] = self.get_cache_filepath(f"{transcription_cache_suffix}.json")
 
         whisper_cache_filepath = self.outputs["transcription_data_filepath"]
         if os.path.isfile(whisper_cache_filepath):
@@ -783,15 +925,26 @@ class LyricsTranscriber:
                 self.outputs["transcription_data_dict"] = json.load(cache_file)
             return
 
-        self.
-
-
-        result = whisper.transcribe(model, audio, language="en")
+        if self.audioshake_api_token:
+            self.logger.debug(f"Using AudioShake API for transcription")
+            from .audioshake_transcriber import AudioShakeTranscriber
 
-
-
-
-
+            audioshake = AudioShakeTranscriber(self.audioshake_api_token, log_level=self.log_level)
+            result = audioshake.transcribe(self.audio_filepath)
+        else:
+            self.logger.debug(f"Using Whisper for transcription with model: {self.transcription_model}")
+            audio = whisper.load_audio(self.audio_filepath)
+            model = whisper.load_model(self.transcription_model, device="cpu")
+            result = whisper.transcribe(model, audio, language="en", vad="auditok", beam_size=5, temperature=0.2, best_of=5)
+
+        # Remove segments with no words, only music
+        result["segments"] = [segment for segment in result["segments"] if segment["text"].strip() != "Music"]
+        self.logger.debug(f"Removed 'Music' segments. Remaining segments: {len(result['segments'])}")
+
+        # Split long segments
+        self.logger.debug("Starting to split long segments")
+        result["segments"] = self.split_long_segments(result["segments"], max_length=36)
+        self.logger.debug(f"Finished splitting segments. Total segments after splitting: {len(result['segments'])}")
 
         self.logger.debug(f"writing transcription data JSON to cache file: {whisper_cache_filepath}")
         with open(whisper_cache_filepath, "w") as cache_file:
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/cli.py
RENAMED
@@ -34,6 +34,11 @@ def main():
         default=None,
         help="Optional: song title for lyrics lookup and auto-correction",
     )
+    parser.add_argument(
+        "--audioshake_api_token",
+        default=None,
+        help="Optional: AudioShake API token for lyrics transcription and alignment. Can also be set with AUDIOSHAKE_API_TOKEN env var.",
+    )
     parser.add_argument(
         "--genius_api_token",
         default=None,
@@ -77,7 +82,7 @@ def main():
 
     parser.add_argument(
         "--video_resolution",
-        default="
+        default="360p",
         help="Optional: resolution of the karaoke video to render. Must be one of: 4k, 1080p, 720p, 360p. Default: 360p",
     )
 
@@ -93,6 +98,12 @@ def main():
         help="Optional: color to use for karaoke video background, in hex format or FFmpeg color name. Default: black",
     )
 
+    parser.add_argument(
+        "--openai_api_key",
+        default=None,
+        help="Optional: OpenAI API key for LLM model usage. Can also be set with OPENAI_API_KEY env var.",
+    )
+
     args = parser.parse_args()
 
     log_level = getattr(logging, args.log_level.upper())
@@ -114,8 +125,10 @@ def main():
 
     transcriber = LyricsTranscriber(
         args.audio_filepath,
+        audioshake_api_token=args.audioshake_api_token,
         genius_api_token=args.genius_api_token,
         spotify_cookie=args.spotify_cookie,
+        openai_api_key=args.openai_api_key,
         artist=args.artist,
         title=args.title,
         output_dir=args.output_dir,
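For library users, the CLI wiring above maps one-to-one onto constructor keyword arguments. A sketch of the equivalent call in code (import path inferred from the file layout; all values are placeholders, and the method that kicks off processing is not part of this diff):

```python
from lyrics_transcriber.transcriber import LyricsTranscriber

transcriber = LyricsTranscriber(
    "/input/song.flac",
    audioshake_api_token="audioshake-token",  # placeholder
    genius_api_token="genius-token",          # placeholder
    spotify_cookie="sp-dc-cookie-value",      # placeholder
    openai_api_key="sk-placeholder",          # placeholder
    artist="ABBA",
    title="Under Attack",
    output_dir="/output",
)
```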
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/lyrics_transcriber/utils/subtitles.py
RENAMED
@@ -5,6 +5,7 @@ import json
 import itertools
 from pathlib import Path
 from enum import IntEnum
+import logging
 
 from . import ass
 
@@ -85,21 +86,19 @@ class LyricsLine:
     def __str__(self):
         return "".join([f"{{{s.text}}}" for s in self.segments])
 
-    def as_ass_event(
-        self,
-        screen_start: timedelta,
-        screen_end: timedelta,
-        style: ass.ASS.Style,
-        top_margin: int,
-    ):
+    def as_ass_event(self, screen_start: timedelta, screen_end: timedelta, style: ass.ASS.Style, y_position: int):
         e = ass.ASS.Event()
         e.type = "Dialogue"
         e.Layer = 0
         e.Style = style
         e.Start = screen_start.total_seconds()
         e.End = screen_end.total_seconds()
-        e.MarginV =
+        e.MarginV = y_position
         e.Text = self.decorate_ass_line(self.segments, screen_start)
+
+        # Set alignment to top-center
+        e.Text = "{\\an8}" + e.Text
+
         return e
 
     def decorate_ass_line(self, segments, screen_start_ts: timedelta):
@@ -137,6 +136,7 @@ class LyricsScreen:
     start_ts: Optional[timedelta] = None
     video_size: Tuple[int, int] = None
     line_height: int = None
+    logger: logging.Logger = None
 
     @property
     def end_ts(self) -> timedelta:
@@ -145,10 +145,36 @@ class LyricsScreen:
     def get_line_y(self, line_num: int) -> int:
        _, h = self.video_size
         line_count = len(self.lines)
-
+        total_height = line_count * self.line_height
+
+        # Calculate the top margin to center the lyrics block
+        top_margin = (h - total_height) / 2
+
+        # Calculate the y-position for this specific line
+        line_y = top_margin + (line_num * self.line_height)
+
+        # if self.logger:
+        #     self.logger.debug(f"Line {line_num + 1} positioning:")
+        #     self.logger.debug(f"  Video height: {h}")
+        #     self.logger.debug(f"  Total lines: {line_count}")
+        #     self.logger.debug(f"  Line height: {self.line_height}")
+        #     self.logger.debug(f"  Total lyrics height: {total_height}")
+        #     self.logger.debug(f"  Top margin: {top_margin}")
+        #     self.logger.debug(f"  Line y: {line_y}")
+
+        return int(line_y)
 
     def as_ass_events(self, style: ass.ASS.Style) -> List[ass.ASS.Event]:
-
+        events = []
+        for i, line in enumerate(self.lines):
+            y_position = self.get_line_y(i)
+
+            # if self.logger:
+            #     self.logger.debug(f"Creating ASS event for line {i + 1} at y-position: {y_position}")
+
+            event = line.as_ass_event(self.start_ts, self.end_ts, style, y_position)
+            events.append(event)
+        return events
 
     def __str__(self):
         lines = [f"{self.start_ts} - {self.end_ts}:"]
@@ -264,7 +290,7 @@ def create_styled_subtitles(
     style.BorderStyle = 1
     style.Outline = 1
     style.Shadow = 0
-    style.Alignment = ass.ASS.
+    style.Alignment = ass.ASS.ALIGN_TOP_CENTER
     style.MarginL = 0
     style.MarginR = 0
     style.MarginV = 0
{lyrics_transcriber-0.14.0 → lyrics_transcriber-0.16.0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.14.0"
+version = "0.16.0"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <andrew@beveridge.uk>"]
 license = "MIT"
@@ -28,6 +28,7 @@ syrics = ">=0"
 openai = "^1"
 openai-whisper = ">=20231117"
 transformers = ">=4"
+auditok = ">=0.2"
 whisper-timestamped = ">=1"
 # Note: after adding openai-whisper and whisper-timestamped with poetry lock, I then removed all traces of triton
 # from poetry.lock before running poetry install, as triton doesn't support macOS but isn't actually needed for whisper.