lyrics-transcriber 0.20.0__py3-none-any.whl → 0.30.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/__init__.py +2 -5
- lyrics_transcriber/cli/cli_main.py +206 -0
- lyrics_transcriber/core/__init__.py +0 -0
- lyrics_transcriber/core/controller.py +317 -0
- lyrics_transcriber/correction/base_strategy.py +29 -0
- lyrics_transcriber/correction/corrector.py +52 -0
- lyrics_transcriber/correction/strategy_diff.py +263 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +201 -0
- lyrics_transcriber/lyrics/genius.py +70 -0
- lyrics_transcriber/lyrics/spotify.py +82 -0
- lyrics_transcriber/output/__init__.py +0 -0
- lyrics_transcriber/output/generator.py +271 -0
- lyrics_transcriber/{utils → output}/subtitles.py +12 -12
- lyrics_transcriber/storage/__init__.py +0 -0
- lyrics_transcriber/storage/dropbox.py +225 -0
- lyrics_transcriber/transcribers/audioshake.py +216 -0
- lyrics_transcriber/transcribers/base_transcriber.py +186 -0
- lyrics_transcriber/transcribers/whisper.py +321 -0
- {lyrics_transcriber-0.20.0.dist-info → lyrics_transcriber-0.30.1.dist-info}/METADATA +5 -16
- lyrics_transcriber-0.30.1.dist-info/RECORD +25 -0
- lyrics_transcriber-0.30.1.dist-info/entry_points.txt +3 -0
- lyrics_transcriber/audioshake_transcriber.py +0 -122
- lyrics_transcriber/corrector.py +0 -57
- lyrics_transcriber/llm_prompts/README.md +0 -10
- lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt +0 -55
- lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_gpt_optimised_20231119.txt +0 -36
- lyrics_transcriber/llm_prompts/llm_prompt_lyrics_matching_andrew_handwritten_20231118.txt +0 -19
- lyrics_transcriber/llm_prompts/promptfooconfig.yaml +0 -61
- lyrics_transcriber/llm_prompts/test_data/ABBA-UnderAttack-Genius.txt +0 -48
- lyrics_transcriber/transcriber.py +0 -934
- lyrics_transcriber/utils/cli.py +0 -179
- lyrics_transcriber-0.20.0.dist-info/RECORD +0 -19
- lyrics_transcriber-0.20.0.dist-info/entry_points.txt +0 -3
- /lyrics_transcriber/{utils → cli}/__init__.py +0 -0
- /lyrics_transcriber/{utils → output}/ass.py +0 -0
- {lyrics_transcriber-0.20.0.dist-info → lyrics_transcriber-0.30.1.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.20.0.dist-info → lyrics_transcriber-0.30.1.dist-info}/WHEEL +0 -0
lyrics_transcriber/transcribers/whisper.py
@@ -0,0 +1,321 @@
+#! /usr/bin/env python3
+from dataclasses import dataclass
+import os
+import json
+import requests
+import hashlib
+import tempfile
+import time
+from typing import Optional, Dict, Any, Protocol, Union
+from pathlib import Path
+from pydub import AudioSegment
+from .base_transcriber import BaseTranscriber, TranscriptionData, LyricsSegment, Word, TranscriptionError
+
+
+@dataclass
+class WhisperConfig:
+    """Configuration for Whisper transcription service."""
+
+    runpod_api_key: Optional[str] = None
+    endpoint_id: Optional[str] = None
+    dropbox_app_key: Optional[str] = None
+    dropbox_app_secret: Optional[str] = None
+    dropbox_refresh_token: Optional[str] = None
+    timeout_minutes: int = 10
+
+
+class FileStorageProtocol(Protocol):
+    """Protocol for file storage operations."""
+
+    def file_exists(self, path: str) -> bool: ...  # pragma: no cover
+    def upload_with_retry(self, file: Any, path: str) -> None: ...  # pragma: no cover
+    def create_or_get_shared_link(self, path: str) -> str: ...  # pragma: no cover
+
+
+class RunPodWhisperAPI:
+    """Handles interactions with RunPod API."""
+
+    def __init__(self, config: WhisperConfig, logger):
+        self.config = config
+        self.logger = logger
+        self._validate_config()
+
+    def _validate_config(self) -> None:
+        """Validate API configuration."""
+        if not self.config.runpod_api_key or not self.config.endpoint_id:
+            raise ValueError("RunPod API key and endpoint ID must be provided")
+
+    def submit_job(self, audio_url: str) -> str:
+        """Submit transcription job and return job ID."""
+        run_url = f"https://api.runpod.ai/v2/{self.config.endpoint_id}/run"
+        headers = {"Authorization": f"Bearer {self.config.runpod_api_key}"}
+
+        payload = {
+            "input": {
+                "audio": audio_url,
+                "word_timestamps": True,
+                "model": "medium",
+                "temperature": 0.2,
+                "best_of": 5,
+                "compression_ratio_threshold": 2.8,
+                "no_speech_threshold": 1,
+                "condition_on_previous_text": True,
+                "enable_vad": True,
+            }
+        }
+
+        self.logger.info("Submitting transcription job...")
+        response = requests.post(run_url, json=payload, headers=headers)
+
+        self.logger.debug(f"Response status code: {response.status_code}")
+
+        # Try to parse and log the JSON response
+        try:
+            response_json = response.json()
+            self.logger.debug(f"Response content: {json.dumps(response_json, indent=2)}")
+        except ValueError:
+            self.logger.debug(f"Raw response content: {response.text}")
+            # Re-raise if we can't parse the response at all
+            raise TranscriptionError(f"Invalid JSON response: {response.text}")
+
+        response.raise_for_status()
+        return response_json["id"]
+
+    def get_job_status(self, job_id: str) -> Dict[str, Any]:
+        """Get job status and results."""
+        status_url = f"https://api.runpod.ai/v2/{self.config.endpoint_id}/status/{job_id}"
+        headers = {"Authorization": f"Bearer {self.config.runpod_api_key}"}
+
+        response = requests.get(status_url, headers=headers)
+        response.raise_for_status()
+        return response.json()
+
+    def cancel_job(self, job_id: str) -> None:
+        """Cancel a running job."""
+        cancel_url = f"https://api.runpod.ai/v2/{self.config.endpoint_id}/cancel/{job_id}"
+        headers = {"Authorization": f"Bearer {self.config.runpod_api_key}"}
+
+        try:
+            response = requests.post(cancel_url, headers=headers)
+            response.raise_for_status()
+        except Exception as e:
+            self.logger.warning(f"Failed to cancel job {job_id}: {e}")
+
+    def wait_for_job_result(self, job_id: str) -> Dict[str, Any]:
+        """Poll for job completion and return results."""
+        self.logger.info(f"Getting job result for job {job_id}")
+
+        start_time = time.time()
+        last_status_log = start_time
+        timeout_seconds = self.config.timeout_minutes * 60
+
+        while True:
+            current_time = time.time()
+            elapsed_time = current_time - start_time
+
+            if elapsed_time > timeout_seconds:
+                self.cancel_job(job_id)
+                raise TranscriptionError(f"Transcription timed out after {self.config.timeout_minutes} minutes")
+
+            # Log status periodically
+            if current_time - last_status_log >= 60:
+                self.logger.info(f"Still waiting for transcription... Elapsed time: {int(elapsed_time/60)} minutes")
+                last_status_log = current_time
+
+            status_data = self.get_job_status(job_id)
+
+            if status_data["status"] == "COMPLETED":
+                return status_data["output"]
+            elif status_data["status"] == "FAILED":
+                error_msg = status_data.get("error", "Unknown error")
+                self.logger.error(f"Job failed with error: {error_msg}")
+                raise TranscriptionError(f"Transcription failed: {error_msg}")
+
+            time.sleep(5)
+
+
+class AudioProcessor:
+    """Handles audio file processing."""
+
+    def __init__(self, logger):
+        self.logger = logger
+
+    def get_file_md5(self, filepath: str) -> str:
+        """Calculate MD5 hash of a file."""
+        md5_hash = hashlib.md5()
+        with open(filepath, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                md5_hash.update(chunk)
+        return md5_hash.hexdigest()
+
+    def convert_to_flac(self, filepath: str) -> str:
+        """Convert WAV to FLAC if needed for faster upload."""
+        if not filepath.lower().endswith(".wav"):
+            return filepath
+
+        self.logger.info("Converting WAV to FLAC for faster upload...")
+        audio = AudioSegment.from_wav(filepath)
+
+        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as temp_flac:
+            flac_path = temp_flac.name
+            audio.export(flac_path, format="flac")
+
+        return flac_path
+
+
+class WhisperTranscriber(BaseTranscriber):
+    """Transcription service using Whisper API via RunPod."""
+
+    def __init__(
+        self,
+        cache_dir: Union[str, Path],
+        config: Optional[WhisperConfig] = None,
+        logger: Optional[Any] = None,
+        runpod_client: Optional[RunPodWhisperAPI] = None,
+        storage_client: Optional[FileStorageProtocol] = None,
+        audio_processor: Optional[AudioProcessor] = None,
+    ):
+        """Initialize Whisper transcriber."""
+        super().__init__(cache_dir=cache_dir, logger=logger)
+
+        # Initialize configuration
+        self.config = config or WhisperConfig(
+            runpod_api_key=os.getenv("RUNPOD_API_KEY"),
+            endpoint_id=os.getenv("WHISPER_RUNPOD_ID"),
+            dropbox_app_key=os.getenv("WHISPER_DROPBOX_APP_KEY"),
+            dropbox_app_secret=os.getenv("WHISPER_DROPBOX_APP_SECRET"),
+            dropbox_refresh_token=os.getenv("WHISPER_DROPBOX_REFRESH_TOKEN"),
+        )
+
+        # Initialize components (with dependency injection)
+        self.runpod = runpod_client or RunPodWhisperAPI(self.config, self.logger)
+        self.storage = storage_client or self._initialize_storage()
+        self.audio_processor = audio_processor or AudioProcessor(self.logger)
+
+    def _initialize_storage(self) -> FileStorageProtocol:
+        """Initialize storage client."""
+        from ..storage.dropbox import DropboxHandler, DropboxConfig
+
+        # Create config using os.getenv directly
+        config = DropboxConfig(
+            app_key=os.getenv("WHISPER_DROPBOX_APP_KEY"),
+            app_secret=os.getenv("WHISPER_DROPBOX_APP_SECRET"),
+            refresh_token=os.getenv("WHISPER_DROPBOX_REFRESH_TOKEN"),
+        )
+
+        # Log the actual config values being used
+        self.logger.debug("Initializing DropboxHandler with config")
+        return DropboxHandler(config=config)
+
+    def get_name(self) -> str:
+        return "Whisper"
+
+    def _perform_transcription(self, audio_filepath: str) -> TranscriptionData:
+        """Actually perform the whisper transcription using Whisper API."""
+        self.logger.info(f"Starting transcription for {audio_filepath}")
+
+        # Start transcription and get results
+        job_id = self.start_transcription(audio_filepath)
+        result = self.get_transcription_result(job_id)
+        return result
+
+    def start_transcription(self, audio_filepath: str) -> str:
+        """Prepare audio and start whisper transcription job."""
+        audio_url, temp_filepath = self._prepare_audio_url(audio_filepath)
+        try:
+            return self.runpod.submit_job(audio_url)
+        except Exception as e:
+            if temp_filepath:
+                self._cleanup_temporary_files(temp_filepath)
+            raise TranscriptionError(f"Failed to submit job: {str(e)}") from e
+
+    def _prepare_audio_url(self, audio_filepath: str) -> tuple[str, Optional[str]]:
+        """Process audio file and return URL for API and path to any temporary files."""
+        if audio_filepath.startswith(("http://", "https://")):
+            return audio_filepath, None
+
+        file_hash = self.audio_processor.get_file_md5(audio_filepath)
+        temp_flac_filepath = self.audio_processor.convert_to_flac(audio_filepath)
+
+        # Upload and get URL
+        dropbox_path = f"/transcription_temp/{file_hash}{os.path.splitext(temp_flac_filepath)[1]}"
+        url = self._upload_and_get_link(temp_flac_filepath, dropbox_path)
+        return url, temp_flac_filepath
+
+    def get_transcription_result(self, job_id: str) -> Dict[str, Any]:
+        """Poll for whisper job completion and return raw results."""
+        raw_data = self.runpod.wait_for_job_result(job_id)
+
+        # Add job_id to raw data for later use
+        raw_data["job_id"] = job_id
+
+        return raw_data
+
+    def _convert_result_format(self, raw_data: Dict[str, Any]) -> TranscriptionData:
+        """Convert Whisper API response to standard format."""
+        self._validate_response(raw_data)
+
+        job_id = raw_data.get("job_id")
+        all_words = []
+
+        # First collect all words from word_timestamps
+        word_list = [
+            Word(
+                text=word["word"].strip(),
+                start_time=word["start"],
+                end_time=word["end"],
+                confidence=word.get("probability"),  # Only set if provided
+            )
+            for word in raw_data.get("word_timestamps", [])
+        ]
+        all_words.extend(word_list)
+
+        # Then create segments, using the words that fall within each segment's time range
+        segments = []
+        for seg in raw_data["segments"]:
+            segment_words = [word for word in word_list if seg["start"] <= word.start_time < seg["end"]]
+            segments.append(LyricsSegment(text=seg["text"].strip(), words=segment_words, start_time=seg["start"], end_time=seg["end"]))
+
+        return TranscriptionData(
+            segments=segments,
+            words=all_words,
+            text=raw_data["transcription"],
+            source=self.get_name(),
+            metadata={
+                "language": raw_data.get("detected_language", "en"),
+                "model": raw_data.get("model"),
+                "job_id": job_id,
+            },
+        )
+
+    def _upload_and_get_link(self, filepath: str, dropbox_path: str) -> str:
+        """Upload file to storage and return shared link."""
+        if not self.storage.file_exists(dropbox_path):
+            self.logger.info("Uploading file to storage...")
+            with open(filepath, "rb") as f:
+                self.storage.upload_with_retry(f, dropbox_path)
+        else:
+            self.logger.info("File already exists in storage, skipping upload...")
+
+        audio_url = self.storage.create_or_get_shared_link(dropbox_path)
+        self.logger.debug(f"Using shared link: {audio_url}")
+        return audio_url
+
+    def _cleanup_temporary_files(self, *filepaths: Optional[str]) -> None:
+        """Clean up any temporary files that were created during transcription."""
+        for filepath in filepaths:
+            if filepath and os.path.exists(filepath):
+                try:
+                    os.remove(filepath)
+                    self.logger.debug(f"Cleaned up temporary file: {filepath}")
+                except Exception as e:
+                    self.logger.warning(f"Failed to clean up temporary file {filepath}: {e}")
+
+    def _validate_response(self, raw_data: Dict[str, Any]) -> None:
+        """Validate the response contains required fields."""
+        if not isinstance(raw_data, dict):
+            raise TranscriptionError(f"Invalid response format: {raw_data}")
+        if "segments" not in raw_data:
+            raise TranscriptionError("Response missing required 'segments' field")
+        if "transcription" not in raw_data:
+            raise TranscriptionError("Response missing required 'transcription' field")
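The new whisper.py drives transcription through RunPod and Dropbox instead of running Whisper locally. Below is a minimal sketch of how the class could be exercised on its own; it is not part of the package, and it assumes the RUNPOD_API_KEY, WHISPER_RUNPOD_ID and WHISPER_DROPBOX_* environment variables are set, that "song.flac" exists locally, and that BaseTranscriber accepts the cache_dir/logger arguments forwarded to it above.

import logging
from lyrics_transcriber.transcribers.whisper import WhisperTranscriber

logging.basicConfig(level=logging.INFO)

# Config falls back to environment variables when no WhisperConfig is passed.
transcriber = WhisperTranscriber(cache_dir="/tmp/lyrics_cache", logger=logging.getLogger("whisper"))

# Upload the audio (WAV is converted to FLAC first) and submit the RunPod job...
job_id = transcriber.start_transcription("song.flac")

# ...then poll until the job reaches COMPLETED or FAILED and fetch the raw output.
raw_result = transcriber.get_transcription_result(job_id)

# Convert the raw RunPod payload into the package's TranscriptionData structure.
data = transcriber._convert_result_format(raw_result)
print(data.text)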
{lyrics_transcriber-0.20.0.dist-info → lyrics_transcriber-0.30.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lyrics-transcriber
-Version: 0.
+Version: 0.30.1
 Summary: Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify
 Home-page: https://github.com/karaokenerds/python-lyrics-transcriber
 License: MIT
@@ -13,24 +13,13 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: karaoke-lyrics-processor (>=0.4.1)
-Requires-Dist: llvmlite (>=0)
+Requires-Dist: dropbox (>=12)
+Requires-Dist: karaoke-lyrics-processor (>=0.4)
 Requires-Dist: lyricsgenius (>=3)
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: onnx (>=1)
-Requires-Dist: onnxruntime (>=1)
-Requires-Dist: openai (>=1,<2)
-Requires-Dist: openai-whisper (>=20231117)
+Requires-Dist: pydub (>=0.25)
+Requires-Dist: python-dotenv (>=1)
 Requires-Dist: python-slugify (>=8)
 Requires-Dist: syrics (>=0)
-Requires-Dist: tenacity (>=8)
-Requires-Dist: torch (>=1)
-Requires-Dist: tqdm (>=4)
-Requires-Dist: transformers (>=4)
-Requires-Dist: whisper-timestamped (>=1)
 Project-URL: Documentation, https://github.com/karaokenerds/python-lyrics-transcriber/blob/main/README.md
 Project-URL: Repository, https://github.com/karaokenerds/python-lyrics-transcriber
 Description-Content-Type: text/markdown
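The dependency changes mirror the code changes: the local inference stack (torch, openai-whisper, whisper-timestamped, onnx, onnxruntime, transformers, openai) is dropped in favour of clients for the hosted services (dropbox, pydub, python-dotenv). The snippet below is illustrative only and not taken from the package; it shows one way the new python-dotenv dependency could supply the environment variables that WhisperConfig and DropboxConfig read, assuming a local .env file containing those keys.

import os
from dotenv import load_dotenv  # provided by the new python-dotenv dependency

load_dotenv()  # loads key/value pairs from ./.env into os.environ

# The whisper transcriber reads these names via os.getenv at construction time.
for name in ("RUNPOD_API_KEY", "WHISPER_RUNPOD_ID", "WHISPER_DROPBOX_APP_KEY",
             "WHISPER_DROPBOX_APP_SECRET", "WHISPER_DROPBOX_REFRESH_TOKEN"):
    print(name, "set" if os.getenv(name) else "missing")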
lyrics_transcriber-0.30.1.dist-info/RECORD
@@ -0,0 +1,25 @@
+lyrics_transcriber/__init__.py,sha256=Hj2HdSBAl6kmiqa5s3MDo_RobkITadzuF-81-ON3awA,180
+lyrics_transcriber/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+lyrics_transcriber/cli/cli_main.py,sha256=-h3W9E4P5lHEjIBWiDvY0v7avldhA-cfYoAVwMlv0Zo,8137
+lyrics_transcriber/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+lyrics_transcriber/core/controller.py,sha256=k_moklU2NqpHOGxWTRVyImWgX6_dv1NES0j50-FRGxw,13057
+lyrics_transcriber/correction/base_strategy.py,sha256=vEKsj19ZNZZkvHRP0J7cZamJWqjLZHbRJ9sN0AyHbAA,867
+lyrics_transcriber/correction/corrector.py,sha256=lsXJ1l5sNoZjIU65A3yWTXkOcraz7QP9KU8OUzA_UTc,2147
+lyrics_transcriber/correction/strategy_diff.py,sha256=xJTFnmVcuE18zZcitweVaRqB82jCMm9Ey29zAFB4LsI,10188
+lyrics_transcriber/lyrics/base_lyrics_provider.py,sha256=s5IDrlT6OudAA_gIlAQzeD0bPqoUFsiYftSQQm7XxOE,7518
+lyrics_transcriber/lyrics/genius.py,sha256=zDiv0t2f7wphnPdcyPH6tahXBfOnbE63Nu8eRG0nqg4,3195
+lyrics_transcriber/lyrics/spotify.py,sha256=Sic3nPFcpSWW7lE-yr3stb6D5m5WFSQXCwzWj3lW0Ls,3584
+lyrics_transcriber/output/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+lyrics_transcriber/output/ass.py,sha256=b8lnjgXGD1OD1ld_b1xxUmSOf4nSEfz9BpgSkh16R4g,90291
+lyrics_transcriber/output/generator.py,sha256=idUsuS01bnaIB5spDFZlxE0wsvJ2I071SmJfXO9BCCk,10870
+lyrics_transcriber/output/subtitles.py,sha256=JEehSPl81hxhK6cS6RK4XAC_OLentCxiMCE7UYI9B64,11851
+lyrics_transcriber/storage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+lyrics_transcriber/storage/dropbox.py,sha256=Dyam1ULTkoxD1X5trkZ5dGp5XhBGCn998moC8IS9-68,9804
+lyrics_transcriber/transcribers/audioshake.py,sha256=0sXvD1FJYXxISH72n5HaN9fnTxgmaQrqmY1W5Lb6Yu8,8631
+lyrics_transcriber/transcribers/base_transcriber.py,sha256=9XWUlBSwBCjKvz7Gs1NT7EIysMyacS-YlvDjpwlqwgI,6985
+lyrics_transcriber/transcribers/whisper.py,sha256=QE9Dsb6emGOaFcepJHrECjVdCfAJZRncGj7uXy-0mAk,12942
+lyrics_transcriber-0.30.1.dist-info/LICENSE,sha256=BiPihPDxhxIPEx6yAxVfAljD5Bhm_XG2teCbPEj_m0Y,1069
+lyrics_transcriber-0.30.1.dist-info/METADATA,sha256=c6P3R-KVxCJ10m-92bezeetdztdB7vvv5RMlTnF4Xbg,5485
+lyrics_transcriber-0.30.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lyrics_transcriber-0.30.1.dist-info/entry_points.txt,sha256=KHZMIwodpv7TQUN9z28G-0knEFsRta9ZBAcIbmBAT40,75
+lyrics_transcriber-0.30.1.dist-info/RECORD,,
lyrics_transcriber/audioshake_transcriber.py
DELETED
@@ -1,122 +0,0 @@
-import requests
-import time
-import os
-import json
-
-
-class AudioShakeTranscriber:
-    def __init__(self, api_token, logger, output_prefix):
-        self.api_token = api_token
-        self.base_url = "https://groovy.audioshake.ai"
-        self.logger = logger
-        self.output_prefix = output_prefix
-
-    def start_transcription(self, audio_filepath):
-        """Starts the transcription job and returns the job ID without waiting for completion"""
-        self.logger.info(f"Starting transcription for {audio_filepath} using AudioShake API")
-
-        # Step 1: Upload the audio file
-        asset_id = self._upload_file(audio_filepath)
-        self.logger.info(f"File uploaded successfully. Asset ID: {asset_id}")
-
-        # Step 2: Create a job for transcription and alignment
-        job_id = self._create_job(asset_id)
-        self.logger.info(f"Job created successfully. Job ID: {job_id}")
-
-        return job_id
-
-    def get_transcription_result(self, job_id):
-        """Gets the results for a previously started job"""
-        self.logger.info(f"Getting results for job ID: {job_id}")
-
-        # Step 3: Wait for the job to complete and get the results
-        result = self._get_job_result(job_id)
-        self.logger.info(f"Job completed. Processing results...")
-
-        # Step 4: Process the result and return in the required format
-        return self._process_result(result)
-
-    def transcribe(self, audio_filepath):
-        """Original method now just combines the two steps"""
-        job_id = self.start_transcription(audio_filepath)
-        return self.get_transcription_result(job_id)
-
-    def _upload_file(self, filepath):
-        self.logger.info(f"Uploading {filepath} to AudioShake")
-        url = f"{self.base_url}/upload"
-        headers = {"Authorization": f"Bearer {self.api_token}"}
-        with open(filepath, "rb") as file:
-            files = {"file": (os.path.basename(filepath), file)}
-            response = requests.post(url, headers=headers, files=files)
-
-        self.logger.info(f"Upload response status code: {response.status_code}")
-        self.logger.info(f"Upload response content: {response.text}")
-
-        response.raise_for_status()
-        return response.json()["id"]
-
-    def _create_job(self, asset_id):
-        self.logger.info(f"Creating job for asset {asset_id}")
-        url = f"{self.base_url}/job/"
-        headers = {"Authorization": f"Bearer {self.api_token}", "Content-Type": "application/json"}
-        data = {
-            "metadata": {"format": "json", "name": "alignment", "language": "en"},
-            "callbackUrl": "https://example.com/webhook/alignment",
-            "assetId": asset_id,
-        }
-        response = requests.post(url, headers=headers, json=data)
-        response.raise_for_status()
-        return response.json()["job"]["id"]
-
-    def _get_job_result(self, job_id):
-        self.logger.info(f"Getting job result for job {job_id}")
-        url = f"{self.base_url}/job/{job_id}"
-        headers = {"Authorization": f"Bearer {self.api_token}", "Content-Type": "application/json"}
-        while True:
-            response = requests.get(url, headers=headers)
-            response.raise_for_status()
-            job_data = response.json()["job"]
-            if job_data["status"] == "completed":
-                return job_data
-            elif job_data["status"] == "failed":
-                raise Exception("Job failed")
-            time.sleep(5)  # Wait 5 seconds before checking again
-
-    def _process_result(self, job_data):
-        self.logger.debug(f"Processing result for job {job_data['id']}")
-        self.logger.debug(f"Job data: {json.dumps(job_data, indent=2)}")
-
-        output_assets = job_data.get("outputAssets", [])
-        self.logger.debug(f"Output assets: {output_assets}")
-
-        output_asset = next((asset for asset in output_assets if asset["name"] == "alignment.json"), None)
-
-        if not output_asset:
-            self.logger.error("'alignment.json' found in job results")
-            self.logger.error(f"Available output assets: {[asset['name'] for asset in output_assets]}")
-            raise Exception("Required output not found in job results")
-
-        transcription_url = output_asset["link"]
-        self.logger.debug(f"Output URL: {transcription_url}")
-
-        response = requests.get(transcription_url)
-        response.raise_for_status()
-        transcription_data = response.json()
-        self.logger.debug(f"Output data: {json.dumps(transcription_data, indent=2)}")
-
-        transcription_data = {"segments": transcription_data.get("lines", []), "text": transcription_data.get("text", "")}
-
-        # Ensure each segment has the required fields
-        for segment in transcription_data["segments"]:
-            if "words" not in segment:
-                segment["words"] = []
-            if "text" not in segment:
-                segment["text"] = " ".join(word["text"] for word in segment["words"])
-
-        transcription_data["output_filename"] = self.get_output_filename(" (AudioShake)")
-
-        return transcription_data
-
-    def get_output_filename(self, suffix):
-        """Generate consistent filename with (Purpose) suffix pattern"""
-        return f"{self.output_prefix}{suffix}"
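For comparison with the new transcribers package, this is roughly how the removed 0.20.0 AudioShake class was driven. The sketch below is hypothetical and not taken from the repository; it assumes a valid AudioShake API token (placeholder shown) and a local audio file.

import logging
from lyrics_transcriber.audioshake_transcriber import AudioShakeTranscriber  # module removed in 0.30.1

logger = logging.getLogger("audioshake")
old_transcriber = AudioShakeTranscriber(api_token="...", logger=logger, output_prefix="Artist - Title")

# transcribe() uploads the file, creates an alignment job, polls until completion,
# and returns a dict with "segments", "text" and "output_filename".
result = old_transcriber.transcribe("song.wav")
print(result["output_filename"])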
lyrics_transcriber/corrector.py
DELETED
@@ -1,57 +0,0 @@
-import json
-import logging
-from openai import OpenAI
-from typing import Dict, Optional
-
-
-class LyricsTranscriptionCorrector:
-    def __init__(
-        self,
-        logger: Optional[logging.Logger] = None,
-    ):
-        self.logger = logger or logging.getLogger(__name__)
-
-        # Initialize instance variables for input data
-        self.spotify_lyrics_data_dict = None
-        self.spotify_lyrics_text = None
-        self.genius_lyrics_text = None
-        self.transcription_data_dict_whisper = None
-        self.transcription_data_dict_audioshake = None
-
-    def set_input_data(
-        self,
-        spotify_lyrics_data_dict: Optional[Dict] = None,
-        spotify_lyrics_text: Optional[str] = None,
-        genius_lyrics_text: Optional[str] = None,
-        transcription_data_dict_whisper: Optional[Dict] = None,
-        transcription_data_dict_audioshake: Optional[Dict] = None,
-    ) -> None:
-        """Store the input data as instance variables"""
-        self.spotify_lyrics_data_dict = spotify_lyrics_data_dict
-        self.spotify_lyrics_text = spotify_lyrics_text
-        self.genius_lyrics_text = genius_lyrics_text
-        self.transcription_data_dict_whisper = transcription_data_dict_whisper
-        self.transcription_data_dict_audioshake = transcription_data_dict_audioshake
-
-    def run_corrector(self) -> Dict:
-        """
-        Test implementation that replaces every third word with 'YOLO' in the AudioShake transcription.
-        """
-        self.logger.info("Running corrector (test implementation - replacing every 3rd word with YOLO)")
-
-        # Create a deep copy to avoid modifying the original
-        modified_data = json.loads(json.dumps(self.transcription_data_dict_audioshake))
-
-        # Process each segment
-        for segment in modified_data["segments"]:
-            # Replace every third word in the words list
-            for i in range(2, len(segment["words"]), 3):
-                segment["words"][i]["text"] = "YOLO"
-
-            # Reconstruct the segment text from the modified words
-            segment["text"] = " ".join(word["text"] for word in segment["words"])
-
-        # Reconstruct the full text from all segments
-        modified_data["text"] = "".join(segment["text"] for segment in modified_data["segments"])
-
-        return modified_data
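The removed corrector.py was only a placeholder; its run_corrector() rebuilt each segment after swapping every third word. A hypothetical illustration (not taken from the repository) of that behaviour on a tiny transcription dict:

from lyrics_transcriber.corrector import LyricsTranscriptionCorrector  # module removed in 0.30.1

corrector = LyricsTranscriptionCorrector()
corrector.set_input_data(
    transcription_data_dict_audioshake={
        "text": "never gonna give you up",
        "segments": [
            {
                "text": "never gonna give you up",
                "words": [{"text": w} for w in ["never", "gonna", "give", "you", "up"]],
            }
        ],
    }
)
# Every third word becomes "YOLO" and the segment/full text are rebuilt from the word list.
print(corrector.run_corrector()["segments"][0]["text"])  # "never gonna YOLO you up"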
lyrics_transcriber/llm_prompts/llm_prompt_lyrics_correction_andrew_handwritten_20231118.txt
DELETED
@@ -1,55 +0,0 @@
-You are a song lyric corrector for a karaoke video studio, responsible for reading lyrics inputs, correcting them and generating JSON-based responses containing the corrected lyrics according to predefined criteria.
-Your task is to take two lyrics data inputs with two different qualities, and use the data in one to correct the other, producing accurate lyrics which align with roughly correct timestamps in the song.
-
-Your response needs to be in JSON format and will be sent to an API endpoint. Only output the JSON, nothing else, as the response will be converted to a Python dictionary.
-
-You will be provided with reference lyrics for the song, as plain text, from an online source.
-These should be reasonably accurate, with generally correct words and phrases.
-However, they may not be perfect, and sometimes whole sections (such as a chorus or outro) may be missing or assumed to be repeated.
-
-Data input will contain one segment of an automated machine transcription of lyrics from a song, with start/end timestamps and confidence scores for every word in that segment.
-The timestamps for words are usually quite accurate, but the actual words which were heard by the transcription are typically only around 70% to 90% accurate.
-As such, it is common for there to be segments where most of the words are correct but one or two are wrong, or a single word may have been mistaken as two different words.
-
-When possible, you will also be provided with the previous 2 (corrected) lines of text, and the next 1 (un-corrected) segment text, for additional context.
-
-Carefully analyse the segment in the data input, and compare with the lyrics in the reference data, attempting to find part of the lyrics which is most likely to correspond with this segment.
-If all of the words match up correctly with words in the published lyrics, keep the entire segment from the transcription (do NOT add any additional words).
-If most of the words match up but one or two words are different (e.g. similar sounding words), correct those words.
-If there are symbols in the published lyrics, add those symbols to the closest word in the segment (NOT as a separate word). For example, parentheses are commonly used around backing vocals.
-If you need to delete a word or two in order to correct the lyrics, that's acceptable.
-
-Important: segments might not start and end at the same point as a "line" in the published lyrics, as the decision about where to split up a line into two is highly subjective.
-For example, in some published lyrics a line might be split in two (with a newline) before the word "and", but in another lyrics text that might only be one line.
-You will likely encounter situations where the words in the segment match part of the words in a published lyrics line, but not the whole line.
-
-Important: adding more words to the transcribed segment is usually not correct and should be the last resort!
-Remember, the goal is to correct mistakes (e.g. single words which were mis-heard) in the transcription rather than complete incomplete lines.
-Pay close attention to the "Context: Next (un-corrected) transcript segment" text, if this includes some of the words do NOT add those words to the current segment as this will cause duplication!
-
-The response JSON object needs to contain all of the following fields:
-
-- id: The id of the segment, from the data input
-- text: The full text of the corrected lyrics for this segment
-- words: this is a list
-  - text: The correct word
-  - start: The start timestamp for this word, estimated if not known for sure.
-  - end: The end timestamp for this word, estimated if not known for sure.
-  - confidence: Your self-assessed confidence score (from 0 to 1) of how likely it is that this word is accurate. If the word has not changed from the data input, keep the existing confidence value.
-
-Reference lyrics:
-
-{{reference_lyrics}}
-
-Previous two corrected lines:
-
-{{previous_two_corrected_lines}}
-
-Upcoming two uncorrected lines:
-
-{{upcoming_two_uncorrected_lines}}
-
-Data input:
-
-{{segment_input}}
-
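The removed prompt asked the model to return one corrected segment as JSON with the fields listed above. A hypothetical example of that response shape, with illustrative placeholder values not taken from the repository, written as the Python dictionary the prompt says the response is converted into:

corrected_segment = {
    "id": 12,                           # id of the segment, copied from the data input
    "text": "Singing in the rain",      # full corrected text for this segment
    "words": [
        {"text": "Singing", "start": 41.2, "end": 41.7, "confidence": 0.95},
        {"text": "in", "start": 41.7, "end": 41.9, "confidence": 0.88},
        {"text": "the", "start": 41.9, "end": 42.0, "confidence": 0.91},
        {"text": "rain", "start": 42.0, "end": 42.5, "confidence": 0.97},
    ],
}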