karaoke-gen 0.71.27-py3-none-any.whl → 0.75.16-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- karaoke_gen/__init__.py +32 -1
- karaoke_gen/audio_fetcher.py +476 -56
- karaoke_gen/audio_processor.py +11 -3
- karaoke_gen/file_handler.py +192 -0
- karaoke_gen/instrumental_review/__init__.py +45 -0
- karaoke_gen/instrumental_review/analyzer.py +408 -0
- karaoke_gen/instrumental_review/editor.py +322 -0
- karaoke_gen/instrumental_review/models.py +171 -0
- karaoke_gen/instrumental_review/server.py +475 -0
- karaoke_gen/instrumental_review/static/index.html +1506 -0
- karaoke_gen/instrumental_review/waveform.py +409 -0
- karaoke_gen/karaoke_finalise/karaoke_finalise.py +62 -1
- karaoke_gen/karaoke_gen.py +114 -1
- karaoke_gen/lyrics_processor.py +81 -4
- karaoke_gen/utils/bulk_cli.py +3 -0
- karaoke_gen/utils/cli_args.py +9 -2
- karaoke_gen/utils/gen_cli.py +379 -2
- karaoke_gen/utils/remote_cli.py +1126 -77
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/METADATA +7 -1
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/RECORD +38 -26
- lyrics_transcriber/correction/anchor_sequence.py +226 -350
- lyrics_transcriber/frontend/package.json +1 -1
- lyrics_transcriber/frontend/src/components/Header.tsx +38 -12
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +17 -3
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
- lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
- lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +190 -542
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
- lyrics_transcriber/frontend/web_assets/assets/{index-DdJTDWH3.js → index-COYImAcx.js} +1722 -489
- lyrics_transcriber/frontend/web_assets/assets/index-COYImAcx.js.map +1 -0
- lyrics_transcriber/frontend/web_assets/index.html +1 -1
- lyrics_transcriber/review/server.py +5 -5
- lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +0 -1
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/licenses/LICENSE +0 -0
karaoke_gen/instrumental_review/waveform.py
ADDED
@@ -0,0 +1,409 @@
+"""
+Waveform visualization generator for audio files.
+
+This module provides the WaveformGenerator class which creates waveform
+images suitable for display in the instrumental review UI.
+"""
+
+import logging
+import math
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+import matplotlib.pyplot as plt
+import numpy as np
+from pydub import AudioSegment
+
+from .models import AudibleSegment, MuteRegion
+
+
+logger = logging.getLogger(__name__)
+
+
+class WaveformGenerator:
+    """
+    Generates waveform visualization images from audio files.
+
+    This class creates PNG images showing the amplitude envelope of an
+    audio file over time. It can highlight detected audible segments
+    and mute regions with different colors.
+
+    The generated images are suitable for display in web UIs and can
+    be used for interactive seeking (click-to-seek) functionality.
+
+    Attributes:
+        width: Width of the output image in pixels (default: 1200)
+        height: Height of the output image in pixels (default: 200)
+        background_color: Background color (default: "#1a1a2e")
+        waveform_color: Main waveform color (default: "#4a90d9")
+        segment_color: Color for audible segments (default: "#e94560")
+        mute_color: Color for mute regions (default: "#ff6b6b")
+        time_axis_color: Color for time axis (default: "#ffffff")
+
+    Example:
+        >>> generator = WaveformGenerator(width=1200, height=200)
+        >>> generator.generate(
+        ...     audio_path="/path/to/backing_vocals.flac",
+        ...     output_path="/path/to/waveform.png",
+        ...     segments=analysis_result.audible_segments
+        ... )
+    """
+
+    def __init__(
+        self,
+        width: int = 1200,
+        height: int = 200,
+        background_color: str = "#1a1a2e",
+        waveform_color: str = "#4a90d9",
+        segment_color: str = "#e94560",
+        mute_color: str = "#ff6b6b",
+        time_axis_color: str = "#ffffff",
+        dpi: int = 100,
+    ):
+        """
+        Initialize the waveform generator.
+
+        Args:
+            width: Width of the output image in pixels
+            height: Height of the output image in pixels
+            background_color: Background color (hex or named color)
+            waveform_color: Main waveform color
+            segment_color: Color for highlighting audible segments
+            mute_color: Color for highlighting mute regions
+            time_axis_color: Color for time axis labels
+            dpi: DPI for the output image
+        """
+        self.width = width
+        self.height = height
+        self.background_color = background_color
+        self.waveform_color = waveform_color
+        self.segment_color = segment_color
+        self.mute_color = mute_color
+        self.time_axis_color = time_axis_color
+        self.dpi = dpi
+
+    def generate(
+        self,
+        audio_path: str,
+        output_path: str,
+        segments: Optional[List[AudibleSegment]] = None,
+        mute_regions: Optional[List[MuteRegion]] = None,
+        show_time_axis: bool = True,
+        silence_threshold_db: float = -40.0,
+    ) -> str:
+        """
+        Generate a waveform image from an audio file.
+
+        Args:
+            audio_path: Path to the audio file
+            output_path: Path where the PNG image will be saved
+            segments: Optional list of audible segments to highlight
+            mute_regions: Optional list of mute regions to highlight
+            show_time_axis: Whether to show time axis labels
+            silence_threshold_db: Threshold for visual reference line
+
+        Returns:
+            Path to the generated image file
+
+        Raises:
+            FileNotFoundError: If the audio file doesn't exist
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        logger.info(f"Generating waveform for: {audio_path}")
+
+        # Load audio
+        audio = AudioSegment.from_file(audio_path)
+        duration_seconds = len(audio) / 1000.0
+
+        # Convert to mono if needed
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        # Get amplitude envelope
+        envelope = self._get_envelope(audio)
+
+        # Create the figure
+        fig, ax = self._create_figure(duration_seconds, show_time_axis)
+
+        # Draw waveform
+        self._draw_waveform(ax, envelope, duration_seconds)
+
+        # Highlight mute regions (if any) - draw first so waveform is on top
+        if mute_regions:
+            self._draw_mute_regions(ax, mute_regions, duration_seconds)
+
+        # Highlight audible segments (if any)
+        if segments:
+            self._draw_segments(ax, segments, envelope, duration_seconds)
+
+        # Draw silence threshold reference line
+        self._draw_threshold_line(ax, silence_threshold_db, duration_seconds)
+
+        # Save the figure
+        output_dir = Path(output_path).parent
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        fig.savefig(
+            output_path,
+            facecolor=self.background_color,
+            edgecolor='none',
+            bbox_inches='tight',
+            pad_inches=0.1,
+        )
+        plt.close(fig)
+
+        logger.info(f"Waveform saved to: {output_path}")
+        return output_path
+
+    def generate_data_only(
+        self,
+        audio_path: str,
+        num_points: int = 500,
+    ) -> Tuple[List[float], float]:
+        """
+        Generate waveform data without creating an image.
+
+        This is useful for sending data to a frontend that will
+        render the waveform itself (e.g., using Canvas or SVG).
+
+        Args:
+            audio_path: Path to the audio file
+            num_points: Number of data points to return
+
+        Returns:
+            Tuple of (amplitude_values, duration_seconds)
+            Amplitude values are normalized to 0.0-1.0 range.
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        audio = AudioSegment.from_file(audio_path)
+        duration_seconds = len(audio) / 1000.0
+
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        # Calculate window size to get desired number of points
+        duration_ms = len(audio)
+        window_ms = max(1, duration_ms // num_points)
+
+        amplitudes = []
+        for start_ms in range(0, duration_ms, window_ms):
+            end_ms = min(start_ms + window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            if window.rms > 0:
+                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+            else:
+                db = -100.0
+
+            # Normalize to 0-1 range (mapping -60dB to 0dB -> 0 to 1)
+            normalized = max(0.0, min(1.0, (db + 60) / 60))
+            amplitudes.append(normalized)
+
+        return amplitudes, duration_seconds
+
+    def _get_envelope(
+        self,
+        audio: AudioSegment,
+        window_ms: int = 50,
+    ) -> np.ndarray:
+        """
+        Extract amplitude envelope from audio.
+
+        Returns array of amplitude values in dB.
+        """
+        duration_ms = len(audio)
+        amplitudes = []
+
+        for start_ms in range(0, duration_ms, window_ms):
+            end_ms = min(start_ms + window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            if window.rms > 0:
+                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+            else:
+                db = -100.0
+
+            amplitudes.append(db)
+
+        return np.array(amplitudes)
+
+    def _create_figure(
+        self,
+        duration_seconds: float,
+        show_time_axis: bool,
+    ) -> Tuple[plt.Figure, plt.Axes]:
+        """
+        Create matplotlib figure and axes.
+        """
+        fig_width = self.width / self.dpi
+        fig_height = self.height / self.dpi
+
+        fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=self.dpi)
+
+        # Set background
+        fig.patch.set_facecolor(self.background_color)
+        ax.set_facecolor(self.background_color)
+
+        # Configure axes
+        ax.set_xlim(0, duration_seconds)
+        ax.set_ylim(-60, 0)  # dB range
+
+        # Remove spines
+        for spine in ax.spines.values():
+            spine.set_visible(False)
+
+        # Configure ticks
+        if show_time_axis:
+            ax.tick_params(
+                axis='x',
+                colors=self.time_axis_color,
+                labelsize=8,
+            )
+            ax.tick_params(axis='y', left=False, labelleft=False)
+
+            # Set time axis ticks
+            self._set_time_ticks(ax, duration_seconds)
+        else:
+            ax.tick_params(
+                axis='both',
+                left=False,
+                bottom=False,
+                labelleft=False,
+                labelbottom=False,
+            )
+
+        return fig, ax
+
+    def _set_time_ticks(self, ax: plt.Axes, duration_seconds: float):
+        """
+        Set appropriate time axis tick marks.
+        """
+        if duration_seconds <= 60:
+            # Under 1 minute: tick every 10 seconds
+            tick_interval = 10
+        elif duration_seconds <= 300:
+            # Under 5 minutes: tick every 30 seconds
+            tick_interval = 30
+        else:
+            # Over 5 minutes: tick every minute
+            tick_interval = 60
+
+        ticks = np.arange(0, duration_seconds + 1, tick_interval)
+        ax.set_xticks(ticks)
+
+        # Format tick labels as MM:SS
+        labels = []
+        for t in ticks:
+            minutes = int(t // 60)
+            seconds = int(t % 60)
+            labels.append(f"{minutes}:{seconds:02d}")
+        ax.set_xticklabels(labels)
+
+    def _draw_waveform(
+        self,
+        ax: plt.Axes,
+        envelope: np.ndarray,
+        duration_seconds: float,
+    ):
+        """
+        Draw the main waveform.
+        """
+        num_points = len(envelope)
+        time_points = np.linspace(0, duration_seconds, num_points)
+
+        # Draw as filled area
+        ax.fill_between(
+            time_points,
+            envelope,
+            -60,  # Bottom of range
+            color=self.waveform_color,
+            alpha=0.7,
+        )
+
+        # Draw outline
+        ax.plot(
+            time_points,
+            envelope,
+            color=self.waveform_color,
+            linewidth=0.5,
+            alpha=0.9,
+        )
+
+    def _draw_segments(
+        self,
+        ax: plt.Axes,
+        segments: List[AudibleSegment],
+        envelope: np.ndarray,
+        duration_seconds: float,
+    ):
+        """
+        Highlight audible segments on the waveform.
+        """
+        num_points = len(envelope)
+        time_points = np.linspace(0, duration_seconds, num_points)
+
+        for segment in segments:
+            # Find indices corresponding to this segment
+            start_idx = int(segment.start_seconds / duration_seconds * num_points)
+            end_idx = int(segment.end_seconds / duration_seconds * num_points)
+
+            start_idx = max(0, min(start_idx, num_points - 1))
+            end_idx = max(0, min(end_idx, num_points))
+
+            if start_idx >= end_idx:
+                continue
+
+            segment_time = time_points[start_idx:end_idx]
+            segment_envelope = envelope[start_idx:end_idx]
+
+            # Highlight this segment with a different color
+            ax.fill_between(
+                segment_time,
+                segment_envelope,
+                -60,
+                color=self.segment_color,
+                alpha=0.6,
+            )

+    def _draw_mute_regions(
+        self,
+        ax: plt.Axes,
+        mute_regions: List[MuteRegion],
+        duration_seconds: float,
+    ):
+        """
+        Draw mute region overlays.
+        """
+        for region in mute_regions:
+            ax.axvspan(
+                region.start_seconds,
+                region.end_seconds,
+                color=self.mute_color,
+                alpha=0.3,
+                zorder=0,
+            )
+
+    def _draw_threshold_line(
+        self,
+        ax: plt.Axes,
+        threshold_db: float,
+        duration_seconds: float,
+    ):
+        """
+        Draw a reference line at the silence threshold.
+        """
+        ax.axhline(
+            y=threshold_db,
+            color=self.time_axis_color,
+            linestyle='--',
+            linewidth=0.5,
+            alpha=0.3,
+        )
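A note on the envelope math above: `generate_data_only` converts each window's RMS level to dBFS (`20 * log10(rms / max_possible_amplitude)`) and then maps the -60 dB to 0 dB range onto 0.0-1.0. A standalone sketch of that arithmetic, using a made-up RMS value (the full-scale value matches pydub's `max_possible_amplitude` for 16-bit samples):

```python
import math

# Hypothetical window: 16-bit audio with RMS at 10% of full scale.
max_possible_amplitude = 32768.0  # 2 ** 15, pydub's full scale for 2-byte samples
rms = 3276.8                      # made-up example value (10% of full scale)

db = 20 * math.log10(rms / max_possible_amplitude)  # -> -20.0 dBFS
normalized = max(0.0, min(1.0, (db + 60) / 60))     # -> 0.667 on the 0-1 scale

print(f"{db:.1f} dB -> {normalized:.3f}")  # anything at or below -60 dB clamps to 0.0
```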
karaoke_gen/karaoke_finalise/karaoke_finalise.py
CHANGED
@@ -47,6 +47,7 @@ class KaraokeFinalise:
         user_youtube_credentials=None,  # Add support for pre-stored credentials
         server_side_mode=False,  # New parameter for server-side deployment
         selected_instrumental_file=None,  # Add support for pre-selected instrumental file
+        countdown_padding_seconds=None,  # Padding applied to vocals; instrumental must match
     ):
         self.log_level = log_level
         self.log_formatter = log_formatter
@@ -54,6 +55,9 @@ class KaraokeFinalise:
         if logger is None:
             self.logger = logging.getLogger(__name__)
             self.logger.setLevel(log_level)
+            # Prevent log propagation to root logger to avoid duplicate logs
+            # when external packages (like lyrics_converter) configure root logger handlers
+            self.logger.propagate = False
 
         self.log_handler = logging.StreamHandler()
 
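The `propagate = False` change above is easy to demonstrate outside the package; a minimal sketch of the duplicate-log problem it avoids (plain stdlib `logging`, nothing karaoke-gen specific):

```python
import logging

# Simulate an external package configuring a handler on the root logger.
logging.basicConfig(level=logging.INFO)

log = logging.getLogger("demo")
log.setLevel(logging.INFO)
log.addHandler(logging.StreamHandler())

log.info("hello")        # emitted twice: once by our handler, once via the root handler

log.propagate = False
log.info("hello again")  # emitted once: the record no longer reaches the root handler
```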
@@ -105,6 +109,7 @@ class KaraokeFinalise:
         self.user_youtube_credentials = user_youtube_credentials
         self.server_side_mode = server_side_mode
         self.selected_instrumental_file = selected_instrumental_file
+        self.countdown_padding_seconds = countdown_padding_seconds
 
         self.suffixes = {
             "title_mov": " (Title).mov",
@@ -421,6 +426,15 @@ class KaraokeFinalise:
         # Check if any videos were found
         if "items" in response and len(response["items"]) > 0:
             for item in response["items"]:
+                # YouTube search API sometimes returns results from other channels even with channelId filter
+                # Verify the video actually belongs to our channel
+                result_channel_id = item["snippet"]["channelId"]
+                if result_channel_id != channel_id:
+                    self.logger.debug(
+                        f"Skipping video from different channel: {item['snippet']['title']} (channel: {result_channel_id})"
+                    )
+                    continue
+
                 found_title = item["snippet"]["title"]
 
                 # In server-side mode, require an exact match to avoid false positives.
@@ -720,6 +734,32 @@ class KaraokeFinalise:
         artist, title = base_name.split(" - ", 1)
         return base_name, artist, title
 
+    def _pad_audio_file(self, input_audio, output_audio, padding_seconds):
+        """
+        Pad an audio file by prepending silence at the beginning.
+
+        Uses the same ffmpeg approach as LyricsTranscriber's CountdownProcessor
+        to ensure consistent padding behavior.
+
+        Args:
+            input_audio: Path to input audio file
+            output_audio: Path for the padded output file
+            padding_seconds: Amount of silence to prepend (in seconds)
+        """
+        self.logger.info(f"Padding audio file with {padding_seconds}s of silence")
+
+        # Use ffmpeg to prepend silence - this matches the approach in audio_processor.py
+        # adelay filter adds delay in milliseconds
+        delay_ms = int(padding_seconds * 1000)
+
+        ffmpeg_command = (
+            f'{self.ffmpeg_base_command} -i "{input_audio}" '
+            f'-af "adelay={delay_ms}|{delay_ms}" '
+            f'"{output_audio}"'
+        )
+
+        self.execute_command(ffmpeg_command, f"Padding audio with {padding_seconds}s silence")
+
     def execute_command(self, command, description):
         """Execute a shell command and log the output. For general commands (rclone, etc.)"""
         self.logger.info(f"{description}")
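For concreteness, here is roughly what `_pad_audio_file` builds for a 3-second pad (file names are hypothetical, and the real command starts with `self.ffmpeg_base_command` rather than bare `ffmpeg`; `adelay` takes one millisecond value per channel, hence the repeated `3000` for stereo):

```python
padding_seconds = 3.0
delay_ms = int(padding_seconds * 1000)  # adelay works in milliseconds -> 3000

# Hypothetical expansion of the f-string above, for a stereo file:
command = (
    f'ffmpeg -i "Artist - Title (Instrumental model).flac" '
    f'-af "adelay={delay_ms}|{delay_ms}" '
    f'"Artist - Title (Instrumental model) (Padded).flac"'
)
print(command)  # ... -af "adelay=3000|3000" ...
```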
@@ -764,11 +804,32 @@ class KaraokeFinalise:
 
     def remux_with_instrumental(self, with_vocals_file, instrumental_audio, output_file):
         """Remux the video with instrumental audio to create karaoke version"""
+        # Safety net: If countdown padding was applied to vocals, ensure instrumental is padded too
+        actual_instrumental = instrumental_audio
+        if self.countdown_padding_seconds and self.countdown_padding_seconds > 0:
+            # Check if the instrumental file is already padded (has "(Padded)" in name)
+            if "(Padded)" not in instrumental_audio:
+                self.logger.warning(
+                    f"Countdown padding ({self.countdown_padding_seconds}s) was applied to vocals, "
+                    f"but instrumental doesn't appear to be padded. Creating padded version..."
+                )
+                # Create a padded version of the instrumental
+                base, ext = os.path.splitext(instrumental_audio)
+                padded_instrumental = f"{base} (Padded){ext}"
+
+                if not os.path.exists(padded_instrumental):
+                    self._pad_audio_file(instrumental_audio, padded_instrumental, self.countdown_padding_seconds)
+                    self.logger.info(f"Created padded instrumental: {padded_instrumental}")
+
+                actual_instrumental = padded_instrumental
+            else:
+                self.logger.info(f"Using already-padded instrumental: {instrumental_audio}")
+
         # This operation is primarily I/O bound (remuxing), so hardware acceleration doesn't provide significant benefit
         # Keep the existing approach but use the new execute method
         ffmpeg_command = (
             f'{self.ffmpeg_base_command} -an -i "{with_vocals_file}" '
-            f'-vn -i "{
+            f'-vn -i "{actual_instrumental}" -c:v copy -c:a pcm_s16le "{output_file}"'
         )
         self.execute_command(ffmpeg_command, "Remuxing video with instrumental audio")
 
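Expanded with hypothetical file names, the new remux command reads as follows: `-an` before the first `-i` drops that input's audio and `-vn` before the second drops its video, so the output takes the video track from the with-vocals file (stream-copied) and the audio from the instrumental (re-encoded to 16-bit PCM):

```python
# Hypothetical expansion of the remux command above (names are placeholders):
with_vocals_file = "Artist - Title (With Vocals).mov"
actual_instrumental = "Artist - Title (Instrumental model) (Padded).flac"
output_file = "Artist - Title (Karaoke).mov"

command = (
    f'ffmpeg -an -i "{with_vocals_file}" '
    f'-vn -i "{actual_instrumental}" -c:v copy -c:a pcm_s16le "{output_file}"'
)
```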
karaoke_gen/karaoke_gen.py
CHANGED
@@ -29,7 +29,7 @@ from .audio_processor import AudioProcessor
 from .lyrics_processor import LyricsProcessor
 from .video_generator import VideoGenerator
 from .video_background_processor import VideoBackgroundProcessor
-from .audio_fetcher import create_audio_fetcher, AudioFetcherError, NoResultsError
+from .audio_fetcher import create_audio_fetcher, AudioFetcherError, NoResultsError, UserCancelledError
 
 
 class KaraokePrep:
@@ -84,6 +84,9 @@ class KaraokePrep:
         if logger is None:
             self.logger = logging.getLogger(__name__)
             self.logger.setLevel(log_level)
+            # Prevent log propagation to root logger to avoid duplicate logs
+            # when external packages (like lyrics_converter) configure root logger handlers
+            self.logger.propagate = False
 
         self.log_handler = logging.StreamHandler()
 
@@ -256,6 +259,101 @@ class KaraokePrep:
         self.artist = metadata_result["artist"]
         self.title = metadata_result["title"]
 
+    def _scan_directory_for_instrumentals(self, track_output_dir, artist_title):
+        """
+        Scan the directory for existing instrumental files and build a separated_audio structure.
+
+        This is used when transcription was skipped (existing files found) but we need to
+        pad instrumentals due to countdown padding.
+
+        Args:
+            track_output_dir: The track output directory to scan
+            artist_title: The "{artist} - {title}" string for matching files
+
+        Returns:
+            Dictionary with separated_audio structure containing found instrumental paths
+        """
+        self.logger.info(f"Scanning directory for existing instrumentals: {track_output_dir}")
+
+        separated_audio = {
+            "clean_instrumental": {},
+            "backing_vocals": {},
+            "other_stems": {},
+            "combined_instrumentals": {},
+        }
+
+        # Search patterns for instrumental files
+        # Files are named like: "{artist} - {title} (Instrumental {model}).flac"
+        # Or with backing vocals: "{artist} - {title} (Instrumental +BV {model}).flac"
+
+        # Look for files in the track output directory
+        search_dir = track_output_dir
+
+        # Find all instrumental files (not padded ones - we want the originals)
+        instrumental_pattern = os.path.join(search_dir, f"{artist_title} (Instrumental*.flac")
+        instrumental_files = glob.glob(instrumental_pattern)
+
+        # Also check for wav files
+        instrumental_pattern_wav = os.path.join(search_dir, f"{artist_title} (Instrumental*.wav")
+        instrumental_files.extend(glob.glob(instrumental_pattern_wav))
+
+        self.logger.debug(f"Found {len(instrumental_files)} instrumental files")
+
+        for filepath in instrumental_files:
+            filename = os.path.basename(filepath)
+
+            # Skip already padded files
+            if "(Padded)" in filename:
+                self.logger.debug(f"Skipping already padded file: {filename}")
+                continue
+
+            # Determine if it's a combined instrumental (+BV) or clean instrumental
+            if "+BV" in filename or "+bv" in filename.lower():
+                # Combined instrumental with backing vocals
+                # Extract model name from filename
+                # Pattern: "(Instrumental +BV {model}).flac"
+                model_match = re.search(r'\(Instrumental \+BV ([^)]+)\)', filename)
+                if model_match:
+                    model_name = model_match.group(1).strip()
+                    separated_audio["combined_instrumentals"][model_name] = filepath
+                    self.logger.info(f"Found combined instrumental: {filename}")
+            else:
+                # Clean instrumental (no backing vocals)
+                # Pattern: "(Instrumental {model}).flac"
+                model_match = re.search(r'\(Instrumental ([^)]+)\)', filename)
+                if model_match:
+                    # Use as clean instrumental if we don't have one yet
+                    if not separated_audio["clean_instrumental"].get("instrumental"):
+                        separated_audio["clean_instrumental"]["instrumental"] = filepath
+                        self.logger.info(f"Found clean instrumental: {filename}")
+                    else:
+                        # Additional clean instrumentals go to combined_instrumentals for padding
+                        model_name = model_match.group(1).strip()
+                        separated_audio["combined_instrumentals"][model_name] = filepath
+                        self.logger.info(f"Found additional instrumental: {filename}")
+
+        # Also look for backing vocals files
+        backing_vocals_pattern = os.path.join(search_dir, f"{artist_title} (Backing Vocals*.flac")
+        backing_vocals_files = glob.glob(backing_vocals_pattern)
+        backing_vocals_pattern_wav = os.path.join(search_dir, f"{artist_title} (Backing Vocals*.wav")
+        backing_vocals_files.extend(glob.glob(backing_vocals_pattern_wav))
+
+        for filepath in backing_vocals_files:
+            filename = os.path.basename(filepath)
+            model_match = re.search(r'\(Backing Vocals ([^)]+)\)', filename)
+            if model_match:
+                model_name = model_match.group(1).strip()
+                if model_name not in separated_audio["backing_vocals"]:
+                    separated_audio["backing_vocals"][model_name] = {"backing_vocals": filepath}
+                    self.logger.info(f"Found backing vocals: {filename}")
+
+        # Log summary
+        clean_count = 1 if separated_audio["clean_instrumental"].get("instrumental") else 0
+        combined_count = len(separated_audio["combined_instrumentals"])
+        self.logger.info(f"Directory scan complete: {clean_count} clean instrumental, {combined_count} combined instrumentals")
+
+        return separated_audio
+
     async def prep_single_track(self):
         # Add signal handler at the start
         loop = asyncio.get_running_loop()
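The scan above leans entirely on the stem-file naming scheme; a quick illustration of how the three regexes classify file names (the names here are invented, the patterns are the ones used in `_scan_directory_for_instrumentals`):

```python
import re

# Invented file names following the documented "{artist} - {title} (...)" scheme.
filenames = [
    "Artist - Title (Instrumental model_bs_roformer).flac",
    "Artist - Title (Instrumental +BV htdemucs).flac",
    "Artist - Title (Backing Vocals htdemucs).flac",
]

for name in filenames:
    # Check the +BV pattern first, since the plain pattern would also match it.
    if m := re.search(r'\(Instrumental \+BV ([^)]+)\)', name):
        print("combined instrumental, model:", m.group(1).strip())
    elif m := re.search(r'\(Instrumental ([^)]+)\)', name):
        print("clean instrumental, model:", m.group(1).strip())
    elif m := re.search(r'\(Backing Vocals ([^)]+)\)', name):
        print("backing vocals, model:", m.group(1).strip())
```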
@@ -419,6 +517,9 @@ class KaraokePrep:
            # No still image for audio-only downloads
            processed_track["input_still_image"] = None
 
+        except UserCancelledError:
+            # User cancelled - propagate up to CLI for graceful exit
+            raise
         except NoResultsError as e:
             self.logger.error(f"No audio found: {e}")
             return None
@@ -761,6 +862,18 @@ class KaraokePrep:
             f"Applying {padding_seconds}s padding to all instrumental files to sync with vocal countdown"
         )
 
+        # If separated_audio is empty (e.g., transcription was skipped but existing files have countdown),
+        # scan the directory for existing instrumental files
+        has_instrumentals = (
+            processed_track["separated_audio"].get("clean_instrumental", {}).get("instrumental") or
+            processed_track["separated_audio"].get("combined_instrumentals")
+        )
+        if not has_instrumentals:
+            self.logger.info("No instrumentals in separated_audio, scanning directory for existing files...")
+            processed_track["separated_audio"] = self._scan_directory_for_instrumentals(
+                track_output_dir, artist_title
+            )
+
         # Apply padding using AudioProcessor
         padded_separation_result = self.audio_processor.apply_countdown_padding_to_instrumentals(
             separation_result=processed_track["separated_audio"],