karaoke-gen 0.71.27-py3-none-any.whl → 0.75.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. karaoke_gen/__init__.py +32 -1
  2. karaoke_gen/audio_fetcher.py +476 -56
  3. karaoke_gen/audio_processor.py +11 -3
  4. karaoke_gen/file_handler.py +192 -0
  5. karaoke_gen/instrumental_review/__init__.py +45 -0
  6. karaoke_gen/instrumental_review/analyzer.py +408 -0
  7. karaoke_gen/instrumental_review/editor.py +322 -0
  8. karaoke_gen/instrumental_review/models.py +171 -0
  9. karaoke_gen/instrumental_review/server.py +475 -0
  10. karaoke_gen/instrumental_review/static/index.html +1506 -0
  11. karaoke_gen/instrumental_review/waveform.py +409 -0
  12. karaoke_gen/karaoke_finalise/karaoke_finalise.py +62 -1
  13. karaoke_gen/karaoke_gen.py +114 -1
  14. karaoke_gen/lyrics_processor.py +81 -4
  15. karaoke_gen/utils/bulk_cli.py +3 -0
  16. karaoke_gen/utils/cli_args.py +9 -2
  17. karaoke_gen/utils/gen_cli.py +379 -2
  18. karaoke_gen/utils/remote_cli.py +1126 -77
  19. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/METADATA +7 -1
  20. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/RECORD +38 -26
  21. lyrics_transcriber/correction/anchor_sequence.py +226 -350
  22. lyrics_transcriber/frontend/package.json +1 -1
  23. lyrics_transcriber/frontend/src/components/Header.tsx +38 -12
  24. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +17 -3
  25. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
  26. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
  27. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
  28. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
  29. lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
  30. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +190 -542
  31. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  32. lyrics_transcriber/frontend/web_assets/assets/{index-DdJTDWH3.js → index-COYImAcx.js} +1722 -489
  33. lyrics_transcriber/frontend/web_assets/assets/index-COYImAcx.js.map +1 -0
  34. lyrics_transcriber/frontend/web_assets/index.html +1 -1
  35. lyrics_transcriber/review/server.py +5 -5
  36. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +0 -1
  37. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/WHEEL +0 -0
  38. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/entry_points.txt +0 -0
  39. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/licenses/LICENSE +0 -0
karaoke_gen/instrumental_review/waveform.py (new file)
@@ -0,0 +1,409 @@
+"""
+Waveform visualization generator for audio files.
+
+This module provides the WaveformGenerator class which creates waveform
+images suitable for display in the instrumental review UI.
+"""
+
+import logging
+import math
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+import matplotlib.pyplot as plt
+import numpy as np
+from pydub import AudioSegment
+
+from .models import AudibleSegment, MuteRegion
+
+
+logger = logging.getLogger(__name__)
+
+
+class WaveformGenerator:
+    """
+    Generates waveform visualization images from audio files.
+
+    This class creates PNG images showing the amplitude envelope of an
+    audio file over time. It can highlight detected audible segments
+    and mute regions with different colors.
+
+    The generated images are suitable for display in web UIs and can
+    be used for interactive seeking (click-to-seek) functionality.
+
+    Attributes:
+        width: Width of the output image in pixels (default: 1200)
+        height: Height of the output image in pixels (default: 200)
+        background_color: Background color (default: "#1a1a2e")
+        waveform_color: Main waveform color (default: "#4a90d9")
+        segment_color: Color for audible segments (default: "#e94560")
+        mute_color: Color for mute regions (default: "#ff6b6b")
+        time_axis_color: Color for time axis (default: "#ffffff")
+
+    Example:
+        >>> generator = WaveformGenerator(width=1200, height=200)
+        >>> generator.generate(
+        ...     audio_path="/path/to/backing_vocals.flac",
+        ...     output_path="/path/to/waveform.png",
+        ...     segments=analysis_result.audible_segments
+        ... )
+    """
+
+    def __init__(
+        self,
+        width: int = 1200,
+        height: int = 200,
+        background_color: str = "#1a1a2e",
+        waveform_color: str = "#4a90d9",
+        segment_color: str = "#e94560",
+        mute_color: str = "#ff6b6b",
+        time_axis_color: str = "#ffffff",
+        dpi: int = 100,
+    ):
+        """
+        Initialize the waveform generator.
+
+        Args:
+            width: Width of the output image in pixels
+            height: Height of the output image in pixels
+            background_color: Background color (hex or named color)
+            waveform_color: Main waveform color
+            segment_color: Color for highlighting audible segments
+            mute_color: Color for highlighting mute regions
+            time_axis_color: Color for time axis labels
+            dpi: DPI for the output image
+        """
+        self.width = width
+        self.height = height
+        self.background_color = background_color
+        self.waveform_color = waveform_color
+        self.segment_color = segment_color
+        self.mute_color = mute_color
+        self.time_axis_color = time_axis_color
+        self.dpi = dpi
+
+    def generate(
+        self,
+        audio_path: str,
+        output_path: str,
+        segments: Optional[List[AudibleSegment]] = None,
+        mute_regions: Optional[List[MuteRegion]] = None,
+        show_time_axis: bool = True,
+        silence_threshold_db: float = -40.0,
+    ) -> str:
+        """
+        Generate a waveform image from an audio file.
+
+        Args:
+            audio_path: Path to the audio file
+            output_path: Path where the PNG image will be saved
+            segments: Optional list of audible segments to highlight
+            mute_regions: Optional list of mute regions to highlight
+            show_time_axis: Whether to show time axis labels
+            silence_threshold_db: Threshold for visual reference line
+
+        Returns:
+            Path to the generated image file
+
+        Raises:
+            FileNotFoundError: If the audio file doesn't exist
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        logger.info(f"Generating waveform for: {audio_path}")
+
+        # Load audio
+        audio = AudioSegment.from_file(audio_path)
+        duration_seconds = len(audio) / 1000.0
+
+        # Convert to mono if needed
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        # Get amplitude envelope
+        envelope = self._get_envelope(audio)
+
+        # Create the figure
+        fig, ax = self._create_figure(duration_seconds, show_time_axis)
+
+        # Draw waveform
+        self._draw_waveform(ax, envelope, duration_seconds)
+
+        # Highlight mute regions (if any) - draw first so waveform is on top
+        if mute_regions:
+            self._draw_mute_regions(ax, mute_regions, duration_seconds)
+
+        # Highlight audible segments (if any)
+        if segments:
+            self._draw_segments(ax, segments, envelope, duration_seconds)
+
+        # Draw silence threshold reference line
+        self._draw_threshold_line(ax, silence_threshold_db, duration_seconds)
+
+        # Save the figure
+        output_dir = Path(output_path).parent
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        fig.savefig(
+            output_path,
+            facecolor=self.background_color,
+            edgecolor='none',
+            bbox_inches='tight',
+            pad_inches=0.1,
+        )
+        plt.close(fig)
+
+        logger.info(f"Waveform saved to: {output_path}")
+        return output_path
+
+    def generate_data_only(
+        self,
+        audio_path: str,
+        num_points: int = 500,
+    ) -> Tuple[List[float], float]:
+        """
+        Generate waveform data without creating an image.
+
+        This is useful for sending data to a frontend that will
+        render the waveform itself (e.g., using Canvas or SVG).
+
+        Args:
+            audio_path: Path to the audio file
+            num_points: Number of data points to return
+
+        Returns:
+            Tuple of (amplitude_values, duration_seconds)
+            Amplitude values are normalized to 0.0-1.0 range.
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        audio = AudioSegment.from_file(audio_path)
+        duration_seconds = len(audio) / 1000.0
+
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        # Calculate window size to get desired number of points
+        duration_ms = len(audio)
+        window_ms = max(1, duration_ms // num_points)
+
+        amplitudes = []
+        for start_ms in range(0, duration_ms, window_ms):
+            end_ms = min(start_ms + window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            if window.rms > 0:
+                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+            else:
+                db = -100.0
+
+            # Normalize to 0-1 range (mapping -60dB to 0dB -> 0 to 1)
+            normalized = max(0.0, min(1.0, (db + 60) / 60))
+            amplitudes.append(normalized)
+
+        return amplitudes, duration_seconds
+
+    def _get_envelope(
+        self,
+        audio: AudioSegment,
+        window_ms: int = 50,
+    ) -> np.ndarray:
+        """
+        Extract amplitude envelope from audio.
+
+        Returns array of amplitude values in dB.
+        """
+        duration_ms = len(audio)
+        amplitudes = []
+
+        for start_ms in range(0, duration_ms, window_ms):
+            end_ms = min(start_ms + window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            if window.rms > 0:
+                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+            else:
+                db = -100.0
+
+            amplitudes.append(db)
+
+        return np.array(amplitudes)
+
+    def _create_figure(
+        self,
+        duration_seconds: float,
+        show_time_axis: bool,
+    ) -> Tuple[plt.Figure, plt.Axes]:
+        """
+        Create matplotlib figure and axes.
+        """
+        fig_width = self.width / self.dpi
+        fig_height = self.height / self.dpi
+
+        fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=self.dpi)
+
+        # Set background
+        fig.patch.set_facecolor(self.background_color)
+        ax.set_facecolor(self.background_color)
+
+        # Configure axes
+        ax.set_xlim(0, duration_seconds)
+        ax.set_ylim(-60, 0)  # dB range
+
+        # Remove spines
+        for spine in ax.spines.values():
+            spine.set_visible(False)
+
+        # Configure ticks
+        if show_time_axis:
+            ax.tick_params(
+                axis='x',
+                colors=self.time_axis_color,
+                labelsize=8,
+            )
+            ax.tick_params(axis='y', left=False, labelleft=False)
+
+            # Set time axis ticks
+            self._set_time_ticks(ax, duration_seconds)
+        else:
+            ax.tick_params(
+                axis='both',
+                left=False,
+                bottom=False,
+                labelleft=False,
+                labelbottom=False,
+            )
+
+        return fig, ax
+
+    def _set_time_ticks(self, ax: plt.Axes, duration_seconds: float):
+        """
+        Set appropriate time axis tick marks.
+        """
+        if duration_seconds <= 60:
+            # Under 1 minute: tick every 10 seconds
+            tick_interval = 10
+        elif duration_seconds <= 300:
+            # Under 5 minutes: tick every 30 seconds
+            tick_interval = 30
+        else:
+            # Over 5 minutes: tick every minute
+            tick_interval = 60
+
+        ticks = np.arange(0, duration_seconds + 1, tick_interval)
+        ax.set_xticks(ticks)
+
+        # Format tick labels as MM:SS
+        labels = []
+        for t in ticks:
+            minutes = int(t // 60)
+            seconds = int(t % 60)
+            labels.append(f"{minutes}:{seconds:02d}")
+        ax.set_xticklabels(labels)
+
+    def _draw_waveform(
+        self,
+        ax: plt.Axes,
+        envelope: np.ndarray,
+        duration_seconds: float,
+    ):
+        """
+        Draw the main waveform.
+        """
+        num_points = len(envelope)
+        time_points = np.linspace(0, duration_seconds, num_points)
+
+        # Draw as filled area
+        ax.fill_between(
+            time_points,
+            envelope,
+            -60,  # Bottom of range
+            color=self.waveform_color,
+            alpha=0.7,
+        )
+
+        # Draw outline
+        ax.plot(
+            time_points,
+            envelope,
+            color=self.waveform_color,
+            linewidth=0.5,
+            alpha=0.9,
+        )
+
+    def _draw_segments(
+        self,
+        ax: plt.Axes,
+        segments: List[AudibleSegment],
+        envelope: np.ndarray,
+        duration_seconds: float,
+    ):
+        """
+        Highlight audible segments on the waveform.
+        """
+        num_points = len(envelope)
+        time_points = np.linspace(0, duration_seconds, num_points)
+
+        for segment in segments:
+            # Find indices corresponding to this segment
+            start_idx = int(segment.start_seconds / duration_seconds * num_points)
+            end_idx = int(segment.end_seconds / duration_seconds * num_points)
+
+            start_idx = max(0, min(start_idx, num_points - 1))
+            end_idx = max(0, min(end_idx, num_points))
+
+            if start_idx >= end_idx:
+                continue
+
+            segment_time = time_points[start_idx:end_idx]
+            segment_envelope = envelope[start_idx:end_idx]
+
+            # Highlight this segment with a different color
+            ax.fill_between(
+                segment_time,
+                segment_envelope,
+                -60,
+                color=self.segment_color,
+                alpha=0.6,
+            )
+
+    def _draw_mute_regions(
+        self,
+        ax: plt.Axes,
+        mute_regions: List[MuteRegion],
+        duration_seconds: float,
+    ):
+        """
+        Draw mute region overlays.
+        """
+        for region in mute_regions:
+            ax.axvspan(
+                region.start_seconds,
+                region.end_seconds,
+                color=self.mute_color,
+                alpha=0.3,
+                zorder=0,
+            )
+
+    def _draw_threshold_line(
+        self,
+        ax: plt.Axes,
+        threshold_db: float,
+        duration_seconds: float,
+    ):
+        """
+        Draw a reference line at the silence threshold.
+        """
+        ax.axhline(
+            y=threshold_db,
+            color=self.time_axis_color,
+            linestyle='--',
+            linewidth=0.5,
+            alpha=0.3,
+        )
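The new module exposes two entry points that differ only in output: `generate()` renders a PNG via matplotlib, while `generate_data_only()` returns normalized amplitudes for a frontend to render itself. A minimal sketch of the data-only path (the import path matches the file added above; the audio filename is hypothetical):

```python
from karaoke_gen.instrumental_review.waveform import WaveformGenerator

generator = WaveformGenerator(width=1200, height=200)
# "backing_vocals.flac" is a placeholder input file
amplitudes, duration = generator.generate_data_only("backing_vocals.flac", num_points=500)

# Each value is dB mapped linearly from [-60, 0] onto [0, 1], so 0.5 corresponds to -30 dBFS
print(f"{duration:.1f}s of audio in {len(amplitudes)} buckets, peak bucket {max(amplitudes):.2f}")
```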
karaoke_gen/karaoke_finalise/karaoke_finalise.py
@@ -47,6 +47,7 @@ class KaraokeFinalise:
         user_youtube_credentials=None,  # Add support for pre-stored credentials
         server_side_mode=False,  # New parameter for server-side deployment
         selected_instrumental_file=None,  # Add support for pre-selected instrumental file
+        countdown_padding_seconds=None,  # Padding applied to vocals; instrumental must match
     ):
         self.log_level = log_level
         self.log_formatter = log_formatter
@@ -54,6 +55,9 @@ class KaraokeFinalise:
         if logger is None:
             self.logger = logging.getLogger(__name__)
             self.logger.setLevel(log_level)
+            # Prevent log propagation to root logger to avoid duplicate logs
+            # when external packages (like lyrics_converter) configure root logger handlers
+            self.logger.propagate = False

             self.log_handler = logging.StreamHandler()

@@ -105,6 +109,7 @@ class KaraokeFinalise:
         self.user_youtube_credentials = user_youtube_credentials
         self.server_side_mode = server_side_mode
         self.selected_instrumental_file = selected_instrumental_file
+        self.countdown_padding_seconds = countdown_padding_seconds

         self.suffixes = {
             "title_mov": " (Title).mov",
@@ -421,6 +426,15 @@ class KaraokeFinalise:
         # Check if any videos were found
         if "items" in response and len(response["items"]) > 0:
             for item in response["items"]:
+                # YouTube search API sometimes returns results from other channels even with channelId filter
+                # Verify the video actually belongs to our channel
+                result_channel_id = item["snippet"]["channelId"]
+                if result_channel_id != channel_id:
+                    self.logger.debug(
+                        f"Skipping video from different channel: {item['snippet']['title']} (channel: {result_channel_id})"
+                    )
+                    continue
+
                 found_title = item["snippet"]["title"]

                 # In server-side mode, require an exact match to avoid false positives.
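For reference, the new check reads `snippet.channelId` from each search result; a hypothetical response fragment trimmed to the fields the loop touches (`snippet.channelId` and `snippet.title` are standard YouTube Data API v3 search-result fields):

```python
# Hypothetical search().list() result item, trimmed to the fields used above
item = {
    "id": {"videoId": "abc123xyz"},
    "snippet": {
        "channelId": "UCxxxxxxxxxxxxxxxxxxxxxx",  # compared against our channel_id
        "title": "Artist - Title (Karaoke)",      # compared against the expected title
    },
}
```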
@@ -720,6 +734,32 @@ class KaraokeFinalise:
         artist, title = base_name.split(" - ", 1)
         return base_name, artist, title

+    def _pad_audio_file(self, input_audio, output_audio, padding_seconds):
+        """
+        Pad an audio file by prepending silence at the beginning.
+
+        Uses the same ffmpeg approach as LyricsTranscriber's CountdownProcessor
+        to ensure consistent padding behavior.
+
+        Args:
+            input_audio: Path to input audio file
+            output_audio: Path for the padded output file
+            padding_seconds: Amount of silence to prepend (in seconds)
+        """
+        self.logger.info(f"Padding audio file with {padding_seconds}s of silence")
+
+        # Use ffmpeg to prepend silence - this matches the approach in audio_processor.py
+        # adelay filter adds delay in milliseconds
+        delay_ms = int(padding_seconds * 1000)
+
+        ffmpeg_command = (
+            f'{self.ffmpeg_base_command} -i "{input_audio}" '
+            f'-af "adelay={delay_ms}|{delay_ms}" '
+            f'"{output_audio}"'
+        )
+
+        self.execute_command(ffmpeg_command, f"Padding audio with {padding_seconds}s silence")
+
     def execute_command(self, command, description):
         """Execute a shell command and log the output. For general commands (rclone, etc.)"""
         self.logger.info(f"{description}")
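The same padding can be reproduced outside the class with a direct ffmpeg call; a standalone sketch assuming `ffmpeg` is on PATH (the `adelay` filter takes one delay in milliseconds per channel, pipe-separated, hence the `{delay}|{delay}` form for stereo):

```python
import subprocess

def pad_with_silence(input_audio: str, output_audio: str, padding_seconds: float) -> None:
    """Prepend silence by delaying every channel with ffmpeg's adelay filter."""
    delay_ms = int(padding_seconds * 1000)
    subprocess.run(
        [
            "ffmpeg", "-y", "-i", input_audio,
            # one delay per channel; both channels of a stereo file here
            "-af", f"adelay={delay_ms}|{delay_ms}",
            output_audio,
        ],
        check=True,
    )

pad_with_silence("instrumental.flac", "instrumental_padded.flac", 3.0)  # hypothetical paths
```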
@@ -764,11 +804,32 @@ class KaraokeFinalise:

     def remux_with_instrumental(self, with_vocals_file, instrumental_audio, output_file):
         """Remux the video with instrumental audio to create karaoke version"""
+        # Safety net: If countdown padding was applied to vocals, ensure instrumental is padded too
+        actual_instrumental = instrumental_audio
+        if self.countdown_padding_seconds and self.countdown_padding_seconds > 0:
+            # Check if the instrumental file is already padded (has "(Padded)" in name)
+            if "(Padded)" not in instrumental_audio:
+                self.logger.warning(
+                    f"Countdown padding ({self.countdown_padding_seconds}s) was applied to vocals, "
+                    f"but instrumental doesn't appear to be padded. Creating padded version..."
+                )
+                # Create a padded version of the instrumental
+                base, ext = os.path.splitext(instrumental_audio)
+                padded_instrumental = f"{base} (Padded){ext}"
+
+                if not os.path.exists(padded_instrumental):
+                    self._pad_audio_file(instrumental_audio, padded_instrumental, self.countdown_padding_seconds)
+                    self.logger.info(f"Created padded instrumental: {padded_instrumental}")
+
+                actual_instrumental = padded_instrumental
+            else:
+                self.logger.info(f"Using already-padded instrumental: {instrumental_audio}")
+
         # This operation is primarily I/O bound (remuxing), so hardware acceleration doesn't provide significant benefit
         # Keep the existing approach but use the new execute method
         ffmpeg_command = (
             f'{self.ffmpeg_base_command} -an -i "{with_vocals_file}" '
-            f'-vn -i "{instrumental_audio}" -c:v copy -c:a pcm_s16le "{output_file}"'
+            f'-vn -i "{actual_instrumental}" -c:v copy -c:a pcm_s16le "{output_file}"'
         )
         self.execute_command(ffmpeg_command, "Remuxing video with instrumental audio")

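The `(Padded)` marker is inserted before the extension, which is what the `"(Padded)" not in instrumental_audio` guard keys off. A quick illustration with a hypothetical filename:

```python
import os

instrumental = "Artist - Title (Instrumental model_bs_roformer).flac"  # hypothetical
base, ext = os.path.splitext(instrumental)
padded = f"{base} (Padded){ext}"
# -> "Artist - Title (Instrumental model_bs_roformer) (Padded).flac"
```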
karaoke_gen/karaoke_gen.py
@@ -29,7 +29,7 @@ from .audio_processor import AudioProcessor
 from .lyrics_processor import LyricsProcessor
 from .video_generator import VideoGenerator
 from .video_background_processor import VideoBackgroundProcessor
-from .audio_fetcher import create_audio_fetcher, AudioFetcherError, NoResultsError
+from .audio_fetcher import create_audio_fetcher, AudioFetcherError, NoResultsError, UserCancelledError


 class KaraokePrep:
@@ -84,6 +84,9 @@ class KaraokePrep:
         if logger is None:
             self.logger = logging.getLogger(__name__)
             self.logger.setLevel(log_level)
+            # Prevent log propagation to root logger to avoid duplicate logs
+            # when external packages (like lyrics_converter) configure root logger handlers
+            self.logger.propagate = False

             self.log_handler = logging.StreamHandler()

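The duplicate-log scenario this (and the identical change in KaraokeFinalise above) guards against is easy to reproduce; a standalone sketch, with illustrative logger names:

```python
import logging

# Simulate a third-party package attaching a handler to the root logger
logging.getLogger().addHandler(logging.StreamHandler())

log = logging.getLogger("karaoke_gen")
log.addHandler(logging.StreamHandler())

log.warning("printed twice: once by our handler, once via propagation to root")
log.propagate = False
log.warning("printed once: propagation to the root handler is cut off")
```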
@@ -256,6 +259,101 @@ class KaraokePrep:
         self.artist = metadata_result["artist"]
         self.title = metadata_result["title"]

+    def _scan_directory_for_instrumentals(self, track_output_dir, artist_title):
+        """
+        Scan the directory for existing instrumental files and build a separated_audio structure.
+
+        This is used when transcription was skipped (existing files found) but we need to
+        pad instrumentals due to countdown padding.
+
+        Args:
+            track_output_dir: The track output directory to scan
+            artist_title: The "{artist} - {title}" string for matching files
+
+        Returns:
+            Dictionary with separated_audio structure containing found instrumental paths
+        """
+        self.logger.info(f"Scanning directory for existing instrumentals: {track_output_dir}")
+
+        separated_audio = {
+            "clean_instrumental": {},
+            "backing_vocals": {},
+            "other_stems": {},
+            "combined_instrumentals": {},
+        }
+
+        # Search patterns for instrumental files
+        # Files are named like: "{artist} - {title} (Instrumental {model}).flac"
+        # Or with backing vocals: "{artist} - {title} (Instrumental +BV {model}).flac"
+
+        # Look for files in the track output directory
+        search_dir = track_output_dir
+
+        # Find all instrumental files (not padded ones - we want the originals)
+        instrumental_pattern = os.path.join(search_dir, f"{artist_title} (Instrumental*.flac")
+        instrumental_files = glob.glob(instrumental_pattern)
+
+        # Also check for wav files
+        instrumental_pattern_wav = os.path.join(search_dir, f"{artist_title} (Instrumental*.wav")
+        instrumental_files.extend(glob.glob(instrumental_pattern_wav))
+
+        self.logger.debug(f"Found {len(instrumental_files)} instrumental files")
+
+        for filepath in instrumental_files:
+            filename = os.path.basename(filepath)
+
+            # Skip already padded files
+            if "(Padded)" in filename:
+                self.logger.debug(f"Skipping already padded file: {filename}")
+                continue
+
+            # Determine if it's a combined instrumental (+BV) or clean instrumental
+            if "+BV" in filename or "+bv" in filename.lower():
+                # Combined instrumental with backing vocals
+                # Extract model name from filename
+                # Pattern: "(Instrumental +BV {model}).flac"
+                model_match = re.search(r'\(Instrumental \+BV ([^)]+)\)', filename)
+                if model_match:
+                    model_name = model_match.group(1).strip()
+                    separated_audio["combined_instrumentals"][model_name] = filepath
+                    self.logger.info(f"Found combined instrumental: {filename}")
+            else:
+                # Clean instrumental (no backing vocals)
+                # Pattern: "(Instrumental {model}).flac"
+                model_match = re.search(r'\(Instrumental ([^)]+)\)', filename)
+                if model_match:
+                    # Use as clean instrumental if we don't have one yet
+                    if not separated_audio["clean_instrumental"].get("instrumental"):
+                        separated_audio["clean_instrumental"]["instrumental"] = filepath
+                        self.logger.info(f"Found clean instrumental: {filename}")
+                    else:
+                        # Additional clean instrumentals go to combined_instrumentals for padding
+                        model_name = model_match.group(1).strip()
+                        separated_audio["combined_instrumentals"][model_name] = filepath
+                        self.logger.info(f"Found additional instrumental: {filename}")
+
+        # Also look for backing vocals files
+        backing_vocals_pattern = os.path.join(search_dir, f"{artist_title} (Backing Vocals*.flac")
+        backing_vocals_files = glob.glob(backing_vocals_pattern)
+        backing_vocals_pattern_wav = os.path.join(search_dir, f"{artist_title} (Backing Vocals*.wav")
+        backing_vocals_files.extend(glob.glob(backing_vocals_pattern_wav))
+
+        for filepath in backing_vocals_files:
+            filename = os.path.basename(filepath)
+            model_match = re.search(r'\(Backing Vocals ([^)]+)\)', filename)
+            if model_match:
+                model_name = model_match.group(1).strip()
+                if model_name not in separated_audio["backing_vocals"]:
+                    separated_audio["backing_vocals"][model_name] = {"backing_vocals": filepath}
+                self.logger.info(f"Found backing vocals: {filename}")
+
+        # Log summary
+        clean_count = 1 if separated_audio["clean_instrumental"].get("instrumental") else 0
+        combined_count = len(separated_audio["combined_instrumentals"])
+        self.logger.info(f"Directory scan complete: {clean_count} clean instrumental, {combined_count} combined instrumentals")
+
+        return separated_audio
+
     async def prep_single_track(self):
         # Add signal handler at the start
         loop = asyncio.get_running_loop()
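Note the branch ordering in the scanner: the `+BV` case must be tried first, because the plain `\(Instrumental ([^)]+)\)` pattern also matches combined filenames (it would capture `+BV {model}` as the model name). A small check with hypothetical filenames:

```python
import re

names = [
    "Artist - Title (Instrumental model_a).flac",
    "Artist - Title (Instrumental +BV model_a).flac",
    "Artist - Title (Instrumental model_a) (Padded).flac",  # skipped: already padded
]
for name in names:
    if "(Padded)" in name:
        continue
    combined = re.search(r'\(Instrumental \+BV ([^)]+)\)', name)
    if combined:
        print("combined:", combined.group(1))
        continue
    clean = re.search(r'\(Instrumental ([^)]+)\)', name)
    if clean:
        print("clean:", clean.group(1))
```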
@@ -419,6 +517,9 @@ class KaraokePrep:
             # No still image for audio-only downloads
             processed_track["input_still_image"] = None

+        except UserCancelledError:
+            # User cancelled - propagate up to CLI for graceful exit
+            raise
         except NoResultsError as e:
             self.logger.error(f"No audio found: {e}")
             return None
@@ -761,6 +862,18 @@ class KaraokePrep:
             f"Applying {padding_seconds}s padding to all instrumental files to sync with vocal countdown"
         )

+        # If separated_audio is empty (e.g., transcription was skipped but existing files have countdown),
+        # scan the directory for existing instrumental files
+        has_instrumentals = (
+            processed_track["separated_audio"].get("clean_instrumental", {}).get("instrumental") or
+            processed_track["separated_audio"].get("combined_instrumentals")
+        )
+        if not has_instrumentals:
+            self.logger.info("No instrumentals in separated_audio, scanning directory for existing files...")
+            processed_track["separated_audio"] = self._scan_directory_for_instrumentals(
+                track_output_dir, artist_title
+            )
+
         # Apply padding using AudioProcessor
         padded_separation_result = self.audio_processor.apply_countdown_padding_to_instrumentals(
             separation_result=processed_track["separated_audio"],
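For orientation, the structure that `_scan_directory_for_instrumentals` rebuilds, and that `apply_countdown_padding_to_instrumentals` then consumes, has this shape (paths and model names are hypothetical):

```python
separated_audio = {
    "clean_instrumental": {"instrumental": "Artist - Title (Instrumental model_a).flac"},
    "backing_vocals": {
        "model_a": {"backing_vocals": "Artist - Title (Backing Vocals model_a).flac"},
    },
    "other_stems": {},
    "combined_instrumentals": {"model_a": "Artist - Title (Instrumental +BV model_a).flac"},
}
```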