karaoke-gen 0.75.16__py3-none-any.whl → 0.75.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- karaoke_gen/audio_fetcher.py +766 -33
- karaoke_gen/audio_processor.py +4 -0
- karaoke_gen/instrumental_review/static/index.html +37 -14
- karaoke_gen/karaoke_finalise/karaoke_finalise.py +25 -1
- karaoke_gen/karaoke_gen.py +18 -14
- karaoke_gen/lyrics_processor.py +97 -6
- karaoke_gen/utils/cli_args.py +6 -5
- karaoke_gen/utils/gen_cli.py +30 -5
- karaoke_gen/utils/remote_cli.py +269 -15
- {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.75.53.dist-info}/METADATA +106 -4
- {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.75.53.dist-info}/RECORD +24 -24
- lyrics_transcriber/core/controller.py +76 -2
- lyrics_transcriber/frontend/package.json +1 -1
- lyrics_transcriber/frontend/src/App.tsx +6 -4
- lyrics_transcriber/frontend/src/api.ts +25 -10
- lyrics_transcriber/frontend/web_assets/assets/{index-COYImAcx.js → index-BECn1o8Q.js} +38 -22
- lyrics_transcriber/frontend/web_assets/assets/{index-COYImAcx.js.map → index-BECn1o8Q.js.map} +1 -1
- lyrics_transcriber/frontend/web_assets/index.html +1 -1
- lyrics_transcriber/output/countdown_processor.py +39 -0
- lyrics_transcriber/transcribers/audioshake.py +96 -7
- lyrics_transcriber/types.py +14 -12
- {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.75.53.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.75.53.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.75.16.dist-info → karaoke_gen-0.75.53.dist-info}/licenses/LICENSE +0 -0
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
<link rel="icon" type="image/png" sizes="512x512" href="/android-chrome-512x512.png" />
|
|
11
11
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
12
12
|
<title>Nomad Karaoke: Lyrics Review</title>
|
|
13
|
-
<script type="module" crossorigin src="/assets/index-COYImAcx.js"></script>
|
|
13
|
+
<script type="module" crossorigin src="/assets/index-BECn1o8Q.js"></script>
|
|
14
14
|
</head>
|
|
15
15
|
<body>
|
|
16
16
|
<div id="root"></div>
|
|
@@ -265,3 +265,42 @@ class CountdownProcessor:
|
|
|
265
265
|
|
|
266
266
|
return countdown_segment
|
|
267
267
|
|
|
268
|
+
def has_countdown(self, correction_result: CorrectionResult) -> bool:
    """
    Report whether the correction result starts with a countdown segment.

    Corrections loaded from a saved JSON file do not record whether countdown
    padding was applied, so the state is detected from the data itself: the
    text of the first corrected segment is compared with ``COUNTDOWN_TEXT``.

    Args:
        correction_result: The correction result to inspect

    Returns:
        True if the first corrected segment's text equals ``COUNTDOWN_TEXT``;
        False if it differs or if there are no corrected segments
    """
    segments = correction_result.corrected_segments
    if segments:
        # Only the leading segment is inspected; later segments are ignored.
        return segments[0].text == self.COUNTDOWN_TEXT
    return False
|
|
287
|
+
|
|
288
|
+
def create_padded_audio_only(self, audio_filepath: str) -> str:
    """
    Produce the padded audio file and leave any correction result untouched.

    Intended for corrections that were loaded with countdown timestamps
    already in place, where only the padded audio file is still needed for
    video rendering. The work is delegated entirely to
    ``_create_padded_audio``.

    Args:
        audio_filepath: Path to original audio file

    Returns:
        Path to padded audio file, as returned by ``_create_padded_audio``

    Raises:
        Whatever ``_create_padded_audio`` raises is propagated unchanged.
        NOTE(review): the previous docstring listed FileNotFoundError for a
        missing input file and RuntimeError for an ffmpeg failure - confirm
        against ``_create_padded_audio``.
    """
    padded_audio_path = self._create_padded_audio(audio_filepath)
    return padded_audio_path
|
|
306
|
+
|
|
@@ -2,12 +2,92 @@ from dataclasses import dataclass
|
|
|
2
2
|
import requests
|
|
3
3
|
import time
|
|
4
4
|
import os
|
|
5
|
-
|
|
5
|
+
import tempfile
|
|
6
|
+
from typing import Dict, Optional, Any, Union, Tuple
|
|
6
7
|
from pathlib import Path
|
|
8
|
+
from pydub import AudioSegment
|
|
7
9
|
from lyrics_transcriber.types import TranscriptionData, LyricsSegment, Word
|
|
8
10
|
from lyrics_transcriber.transcribers.base_transcriber import BaseTranscriber, TranscriptionError
|
|
9
11
|
from lyrics_transcriber.utils.word_utils import WordUtils
|
|
10
12
|
|
|
13
|
+
# Lossy formats that should be uploaded directly (transcoding would cause quality loss)
LOSSY_FORMATS = {'.mp3', '.aac', '.ogg', '.m4a', '.wma', '.opus'}
# Lossless formats that are already compressed and can be uploaded directly
LOSSLESS_COMPRESSED_FORMATS = {'.flac', '.alac'}
# Uncompressed formats that should be converted to FLAC for efficient upload
UNCOMPRESSED_FORMATS = {'.wav', '.aiff', '.aif', '.pcm'}


class AudioUploadOptimizer:
    """Optimizes audio files for upload by converting uncompressed formats to FLAC."""

    def __init__(self, logger):
        """Store the logger used for progress and warning messages."""
        self.logger = logger

    def prepare_for_upload(self, filepath: str) -> Tuple[str, Optional[str]]:
        """
        Prepare audio file for optimal upload.

        The decision is made purely from the file extension, compared
        case-insensitively against the module-level format sets.

        Args:
            filepath: Path to the audio file that is about to be uploaded

        Returns:
            Tuple of (filepath_to_upload, temp_file_to_cleanup)
            - If no conversion needed, returns (original_filepath, None)
            - If converted, returns (temp_flac_filepath, temp_flac_filepath)

        Raises:
            Any exception raised while converting an uncompressed file to
            FLAC is propagated to the caller.
        """
        ext = os.path.splitext(filepath)[1].lower()

        # Lossy formats: upload directly (transcoding would lose quality)
        if ext in LOSSY_FORMATS:
            self.logger.info(f"Uploading lossy format ({ext}) directly to preserve quality")
            return filepath, None

        # Already compressed lossless: upload directly
        if ext in LOSSLESS_COMPRESSED_FORMATS:
            self.logger.info(f"Uploading lossless compressed format ({ext}) directly")
            return filepath, None

        # Uncompressed formats: convert to FLAC for smaller upload
        if ext in UNCOMPRESSED_FORMATS:
            self.logger.info(f"Converting uncompressed format ({ext}) to FLAC for efficient upload")
            return self._convert_to_flac(filepath)

        # Unknown format: try to upload directly
        self.logger.warning(f"Unknown audio format ({ext}), uploading directly")
        return filepath, None

    def _convert_to_flac(self, filepath: str) -> Tuple[str, str]:
        """
        Convert audio file to FLAC format.

        Args:
            filepath: Path to the source audio file

        Returns:
            Tuple of (flac_path, flac_path): the file to upload and the same
            path again as the temporary file the caller must clean up

        Raises:
            Any exception raised while loading, exporting or measuring the
            audio. If the failure happens after the temporary FLAC file was
            created, that file is removed before the exception is re-raised.
        """
        ext = os.path.splitext(filepath)[1].lower()

        # Load audio based on format
        if ext == '.wav':
            audio = AudioSegment.from_wav(filepath)
        elif ext in {'.aiff', '.aif'}:
            audio = AudioSegment.from_file(filepath, format='aiff')
        else:
            audio = AudioSegment.from_file(filepath)

        # Create temp file for FLAC output. Only the reserved path is needed,
        # so the handle is closed before anything is written to that path.
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as temp_flac:
            flac_path = temp_flac.name

        try:
            audio.export(flac_path, format="flac")

            # Log size reduction
            original_size = os.path.getsize(filepath)
            flac_size = os.path.getsize(flac_path)
            # Guard against a zero-byte source so logging can never raise
            # ZeroDivisionError.
            reduction_pct = (1 - flac_size / original_size) * 100 if original_size else 0.0
            self.logger.info(f"Converted to FLAC: {original_size / 1024 / 1024:.1f}MB → {flac_size / 1024 / 1024:.1f}MB ({reduction_pct:.0f}% smaller)")
        except Exception:
            # The temp file was created with delete=False, so nobody else will
            # remove it if we fail before handing the path back to the caller.
            self.cleanup(flac_path)
            raise

        return flac_path, flac_path

    def cleanup(self, temp_filepath: Optional[str]) -> None:
        """
        Clean up temporary file if it exists.

        Args:
            temp_filepath: Path to remove; ``None`` or an empty string is a
                no-op, as is a path that no longer exists

        Other ``OSError`` failures are logged as warnings, never raised.
        """
        if not temp_filepath:
            return

        try:
            os.unlink(temp_filepath)
        except FileNotFoundError:
            # Already gone: nothing to clean up and nothing worth reporting.
            return
        except OSError as e:
            self.logger.warning(f"Failed to clean up temporary file {temp_filepath}: {e}")
        else:
            self.logger.debug(f"Cleaned up temporary file: {temp_filepath}")
|
|
90
|
+
|
|
11
91
|
|
|
12
92
|
@dataclass
|
|
13
93
|
class AudioShakeConfig:
|
|
@@ -162,11 +242,13 @@ class AudioShakeTranscriber(BaseTranscriber):
|
|
|
162
242
|
config: Optional[AudioShakeConfig] = None,
|
|
163
243
|
logger: Optional[Any] = None,
|
|
164
244
|
api_client: Optional[AudioShakeAPI] = None,
|
|
245
|
+
upload_optimizer: Optional[AudioUploadOptimizer] = None,
|
|
165
246
|
):
|
|
166
247
|
"""Initialize AudioShake transcriber."""
|
|
167
248
|
super().__init__(cache_dir=cache_dir, logger=logger)
|
|
168
249
|
self.config = config or AudioShakeConfig(api_token=os.getenv("AUDIOSHAKE_API_TOKEN"))
|
|
169
250
|
self.api = api_client or AudioShakeAPI(self.config, self.logger)
|
|
251
|
+
self.upload_optimizer = upload_optimizer or AudioUploadOptimizer(self.logger)
|
|
170
252
|
|
|
171
253
|
def get_name(self) -> str:
    """Return the fixed display name of this transcriber, ``"AudioShake"``."""
    transcriber_name = "AudioShake"
    return transcriber_name
|
|
@@ -195,14 +277,21 @@ class AudioShakeTranscriber(BaseTranscriber):
|
|
|
195
277
|
"""Starts the transcription task and returns the task ID."""
|
|
196
278
|
self.logger.debug(f"Entering start_transcription() for {audio_filepath}")
|
|
197
279
|
|
|
198
|
-
#
|
|
199
|
-
|
|
200
|
-
|
|
280
|
+
# Optimize file format for upload (convert WAV to FLAC, etc.)
|
|
281
|
+
upload_filepath, temp_filepath = self.upload_optimizer.prepare_for_upload(audio_filepath)
|
|
282
|
+
|
|
283
|
+
try:
|
|
284
|
+
# Upload file and create task
|
|
285
|
+
file_url = self.api.upload_file(upload_filepath)
|
|
286
|
+
self.logger.debug(f"File uploaded successfully. File URL: {file_url}")
|
|
201
287
|
|
|
202
|
-
|
|
203
|
-
|
|
288
|
+
task_id = self.api.create_task(file_url)
|
|
289
|
+
self.logger.debug(f"Task created successfully. Task ID: {task_id}")
|
|
204
290
|
|
|
205
|
-
|
|
291
|
+
return task_id
|
|
292
|
+
finally:
|
|
293
|
+
# Clean up any temporary file created during optimization
|
|
294
|
+
self.upload_optimizer.cleanup(temp_filepath)
|
|
206
295
|
|
|
207
296
|
def get_transcription_result(self, task_id: str) -> Dict[str, Any]:
|
|
208
297
|
"""Gets the raw results for a previously started task."""
|
lyrics_transcriber/types.py
CHANGED
|
@@ -363,30 +363,32 @@ class AnchorSequence:
|
|
|
363
363
|
def from_dict(cls, data: Dict[str, Any]) -> "AnchorSequence":
    """Build an AnchorSequence from either the new or the old dictionary layout.

    The new layout is recognised by the ``"transcribed_word_ids"`` key and
    wins even when the old ``"words"`` key is present as well; its word IDs
    are used exactly as stored. The old layout carries only ``"words"``, so
    fresh IDs are generated: one per word for the transcription, and one list
    of per-word IDs for every source in ``"reference_positions"``.

    Args:
        data: Serialized anchor sequence

    Returns:
        The constructed instance; a new ID is generated when ``"id"`` is absent

    Raises:
        ValueError: If neither ``"transcribed_word_ids"`` nor ``"words"`` is present
        KeyError: If another key required by the detected layout is missing
    """
    has_new_layout = "transcribed_word_ids" in data
    if not has_new_layout and "words" not in data:
        raise ValueError("AnchorSequence.from_dict requires either 'transcribed_word_ids' or 'words' key")

    # NOTE: values below are read in the same order as the constructor
    # arguments so that ID generation and key lookups happen in a fixed order.
    if has_new_layout:
        # New layout: reuse the stored IDs verbatim.
        anchor_id = data.get("id", WordUtils.generate_id())
        transcribed_ids = data["transcribed_word_ids"]
        position = data["transcription_position"]
        ref_positions = data["reference_positions"]
        ref_ids = data["reference_word_ids"]
        return cls(
            id=anchor_id,
            transcribed_word_ids=transcribed_ids,
            transcription_position=position,
            reference_positions=ref_positions,
            reference_word_ids=ref_ids,
            confidence=data["confidence"],
        )

    # Old layout only: synthesize IDs so the instance is in the new format.
    legacy_words = data["words"]
    anchor_id = data.get("id", WordUtils.generate_id())
    transcribed_ids = [WordUtils.generate_id() for _ in legacy_words]
    position = data["transcription_position"]
    ref_positions = data["reference_positions"]
    ref_ids = {}
    for source in data["reference_positions"].keys():
        ref_ids[source] = [WordUtils.generate_id() for _ in legacy_words]
    return cls(
        id=anchor_id,
        transcribed_word_ids=transcribed_ids,
        transcription_position=position,
        reference_positions=ref_positions,
        reference_word_ids=ref_ids,
        confidence=data["confidence"],
    )
|
|
390
392
|
|
|
391
393
|
|
|
392
394
|
@dataclass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|