karaoke-gen 0.71.23__py3-none-any.whl → 0.71.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,409 @@
1
+ """
2
+ Waveform visualization generator for audio files.
3
+
4
+ This module provides the WaveformGenerator class which creates waveform
5
+ images suitable for display in the instrumental review UI.
6
+ """
7
+
8
+ import logging
9
+ import math
10
+ from pathlib import Path
11
+ from typing import List, Optional, Tuple
12
+
13
+ import matplotlib
14
+ matplotlib.use('Agg') # Use non-interactive backend
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+ from pydub import AudioSegment
18
+
19
+ from .models import AudibleSegment, MuteRegion
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class WaveformGenerator:
    """
    Generates waveform visualization images from audio files.

    This class creates PNG images showing the amplitude envelope of an
    audio file over time. It can highlight detected audible segments
    and mute regions with different colors.

    The generated images are suitable for display in web UIs and can
    be used for interactive seeking (click-to-seek) functionality.

    Attributes:
        width: Width of the output image in pixels (default: 1200)
        height: Height of the output image in pixels (default: 200)
        background_color: Background color (default: "#1a1a2e")
        waveform_color: Main waveform color (default: "#4a90d9")
        segment_color: Color for audible segments (default: "#e94560")
        mute_color: Color for mute regions (default: "#ff6b6b")
        time_axis_color: Color for time axis (default: "#ffffff")

    Example:
        >>> generator = WaveformGenerator(width=1200, height=200)
        >>> generator.generate(
        ...     audio_path="/path/to/backing_vocals.flac",
        ...     output_path="/path/to/waveform.png",
        ...     segments=analysis_result.audible_segments
        ... )
    """

    def __init__(
        self,
        width: int = 1200,
        height: int = 200,
        background_color: str = "#1a1a2e",
        waveform_color: str = "#4a90d9",
        segment_color: str = "#e94560",
        mute_color: str = "#ff6b6b",
        time_axis_color: str = "#ffffff",
        dpi: int = 100,
    ):
        """
        Initialize the waveform generator.

        Args:
            width: Width of the output image in pixels
            height: Height of the output image in pixels
            background_color: Background color (hex or named color)
            waveform_color: Main waveform color
            segment_color: Color for highlighting audible segments
            mute_color: Color for highlighting mute regions
            time_axis_color: Color for time axis labels
            dpi: DPI for the output image
        """
        self.width = width
        self.height = height
        self.background_color = background_color
        self.waveform_color = waveform_color
        self.segment_color = segment_color
        self.mute_color = mute_color
        self.time_axis_color = time_axis_color
        self.dpi = dpi

    def generate(
        self,
        audio_path: str,
        output_path: str,
        segments: Optional[List[AudibleSegment]] = None,
        mute_regions: Optional[List[MuteRegion]] = None,
        show_time_axis: bool = True,
        silence_threshold_db: float = -40.0,
        audible_segments: Optional[List[AudibleSegment]] = None,
    ) -> str:
        """
        Generate a waveform image from an audio file.

        Args:
            audio_path: Path to the audio file
            output_path: Path where the PNG image will be saved
            segments: Optional list of audible segments to highlight
            mute_regions: Optional list of mute regions to highlight
            show_time_axis: Whether to show time axis labels
            silence_threshold_db: Threshold for visual reference line
            audible_segments: Backward-compatible alias for ``segments``.
                Only used when ``segments`` is not provided. Existing
                callers pass this keyword, which previously raised
                TypeError because the parameter did not exist.

        Returns:
            Path to the generated image file

        Raises:
            FileNotFoundError: If the audio file doesn't exist
        """
        # Accept the alias keyword used by callers (see run_instrumental_review).
        if segments is None and audible_segments is not None:
            segments = audible_segments

        path = Path(audio_path)
        if not path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        logger.info(f"Generating waveform for: {audio_path}")

        # Load audio; duration comes from pydub's millisecond length.
        audio = AudioSegment.from_file(audio_path)
        duration_seconds = len(audio) / 1000.0

        # Down-mix to mono so the envelope is a single channel.
        if audio.channels > 1:
            audio = audio.set_channels(1)

        # Per-window amplitude envelope in dBFS.
        envelope = self._get_envelope(audio)

        fig, ax = self._create_figure(duration_seconds, show_time_axis)

        self._draw_waveform(ax, envelope, duration_seconds)

        # Mute regions render with zorder=0, so the waveform and segment
        # highlights stay visually on top of them.
        if mute_regions:
            self._draw_mute_regions(ax, mute_regions, duration_seconds)

        if segments:
            self._draw_segments(ax, segments, envelope, duration_seconds)

        # Dashed reference line at the silence threshold.
        self._draw_threshold_line(ax, silence_threshold_db, duration_seconds)

        # Ensure the destination directory exists before saving.
        output_dir = Path(output_path).parent
        output_dir.mkdir(parents=True, exist_ok=True)

        fig.savefig(
            output_path,
            facecolor=self.background_color,
            edgecolor='none',
            bbox_inches='tight',
            pad_inches=0.1,
        )
        # Close explicitly: the Agg backend keeps figures alive otherwise.
        plt.close(fig)

        logger.info(f"Waveform saved to: {output_path}")
        return output_path

    def generate_data_only(
        self,
        audio_path: str,
        num_points: int = 500,
    ) -> Tuple[List[float], float]:
        """
        Generate waveform data without creating an image.

        This is useful for sending data to a frontend that will
        render the waveform itself (e.g., using Canvas or SVG).

        Args:
            audio_path: Path to the audio file
            num_points: Approximate number of data points to return
                (the actual count can be slightly higher because the
                window size is rounded down to a whole millisecond)

        Returns:
            Tuple of (amplitude_values, duration_seconds).
            Amplitude values are normalized to the 0.0-1.0 range,
            mapping -60 dBFS -> 0.0 and 0 dBFS -> 1.0.

        Raises:
            FileNotFoundError: If the audio file doesn't exist
        """
        path = Path(audio_path)
        if not path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        audio = AudioSegment.from_file(audio_path)
        duration_seconds = len(audio) / 1000.0

        if audio.channels > 1:
            audio = audio.set_channels(1)

        # Window size chosen so roughly num_points windows cover the file.
        window_ms = max(1, len(audio) // num_points)

        # Reuse the shared envelope extraction, then normalize dB -> 0..1.
        envelope = self._get_envelope(audio, window_ms=window_ms)
        amplitudes = np.clip((envelope + 60.0) / 60.0, 0.0, 1.0).tolist()

        return amplitudes, duration_seconds

    def _get_envelope(
        self,
        audio: AudioSegment,
        window_ms: int = 50,
    ) -> np.ndarray:
        """
        Extract the amplitude envelope from audio.

        The audio is scanned in windows of ``window_ms`` milliseconds and
        each window's RMS level is converted to dBFS. Digitally-silent
        windows (RMS == 0) get a -100.0 dB floor so log10 is never
        called on zero.

        Args:
            audio: Audio segment to analyze (callers pass mono audio)
            window_ms: Analysis window length in milliseconds

        Returns:
            Array of per-window amplitude values in dBFS.
        """
        duration_ms = len(audio)
        amplitudes = []

        for start_ms in range(0, duration_ms, window_ms):
            end_ms = min(start_ms + window_ms, duration_ms)
            window = audio[start_ms:end_ms]

            if window.rms > 0:
                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
            else:
                db = -100.0  # floor for digitally-silent windows

            amplitudes.append(db)

        return np.array(amplitudes)

    def _create_figure(
        self,
        duration_seconds: float,
        show_time_axis: bool,
    ) -> Tuple[plt.Figure, plt.Axes]:
        """
        Create the matplotlib figure and axes for the waveform.

        The x axis spans the audio duration in seconds; the y axis is a
        fixed -60..0 dB range matching the envelope values.
        """
        # Convert the requested pixel size to matplotlib's inch-based API.
        fig_width = self.width / self.dpi
        fig_height = self.height / self.dpi

        fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=self.dpi)

        # Set background
        fig.patch.set_facecolor(self.background_color)
        ax.set_facecolor(self.background_color)

        # Configure axes
        ax.set_xlim(0, duration_seconds)
        ax.set_ylim(-60, 0)  # dB range

        # Remove spines for a clean, borderless look
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Configure ticks
        if show_time_axis:
            ax.tick_params(
                axis='x',
                colors=self.time_axis_color,
                labelsize=8,
            )
            # The dB axis is internal detail; hide it from the UI.
            ax.tick_params(axis='y', left=False, labelleft=False)

            self._set_time_ticks(ax, duration_seconds)
        else:
            ax.tick_params(
                axis='both',
                left=False,
                bottom=False,
                labelleft=False,
                labelbottom=False,
            )

        return fig, ax

    def _set_time_ticks(self, ax: plt.Axes, duration_seconds: float):
        """
        Set time-axis tick marks at an interval suited to the duration,
        labelled as M:SS.
        """
        if duration_seconds <= 60:
            # Under 1 minute: tick every 10 seconds
            tick_interval = 10
        elif duration_seconds <= 300:
            # Under 5 minutes: tick every 30 seconds
            tick_interval = 30
        else:
            # Over 5 minutes: tick every minute
            tick_interval = 60

        ticks = np.arange(0, duration_seconds + 1, tick_interval)
        ax.set_xticks(ticks)

        # Format tick labels as M:SS
        labels = []
        for t in ticks:
            minutes = int(t // 60)
            seconds = int(t % 60)
            labels.append(f"{minutes}:{seconds:02d}")
        ax.set_xticklabels(labels)

    def _draw_waveform(
        self,
        ax: plt.Axes,
        envelope: np.ndarray,
        duration_seconds: float,
    ):
        """
        Draw the main waveform as a filled area plus a thin outline.
        """
        num_points = len(envelope)
        time_points = np.linspace(0, duration_seconds, num_points)

        # Filled area from the envelope down to the bottom of the dB range.
        ax.fill_between(
            time_points,
            envelope,
            -60,  # Bottom of range
            color=self.waveform_color,
            alpha=0.7,
        )

        # Outline on top of the fill for definition.
        ax.plot(
            time_points,
            envelope,
            color=self.waveform_color,
            linewidth=0.5,
            alpha=0.9,
        )

    def _draw_segments(
        self,
        ax: plt.Axes,
        segments: List[AudibleSegment],
        envelope: np.ndarray,
        duration_seconds: float,
    ):
        """
        Highlight audible segments by re-filling their portion of the
        envelope in the segment color.
        """
        num_points = len(envelope)
        time_points = np.linspace(0, duration_seconds, num_points)

        for segment in segments:
            # Map segment times onto envelope array indices.
            start_idx = int(segment.start_seconds / duration_seconds * num_points)
            end_idx = int(segment.end_seconds / duration_seconds * num_points)

            # Clamp to valid bounds; end may equal num_points (exclusive slice).
            start_idx = max(0, min(start_idx, num_points - 1))
            end_idx = max(0, min(end_idx, num_points))

            # Skip degenerate (empty) segments.
            if start_idx >= end_idx:
                continue

            segment_time = time_points[start_idx:end_idx]
            segment_envelope = envelope[start_idx:end_idx]

            # Highlight this segment with a different color
            ax.fill_between(
                segment_time,
                segment_envelope,
                -60,
                color=self.segment_color,
                alpha=0.6,
            )

    def _draw_mute_regions(
        self,
        ax: plt.Axes,
        mute_regions: List[MuteRegion],
        duration_seconds: float,
    ):
        """
        Draw mute regions as translucent full-height vertical bands.
        """
        for region in mute_regions:
            ax.axvspan(
                region.start_seconds,
                region.end_seconds,
                color=self.mute_color,
                alpha=0.3,
                zorder=0,  # behind the waveform fill/outline
            )

    def _draw_threshold_line(
        self,
        ax: plt.Axes,
        threshold_db: float,
        duration_seconds: float,
    ):
        """
        Draw a faint dashed horizontal reference line at the silence
        threshold level.
        """
        ax.axhline(
            y=threshold_db,
            color=self.time_axis_color,
            linestyle='--',
            linewidth=0.5,
            alpha=0.3,
        )
@@ -208,6 +208,11 @@ def create_parser(prog: str = "karaoke-gen") -> argparse.ArgumentParser:
208
208
  default="flac",
209
209
  help="Optional: format / file extension for instrumental track to use for remux (default: %(default)s). Example: --instrumental_format=mp3",
210
210
  )
211
+ audio_group.add_argument(
212
+ "--skip_instrumental_review",
213
+ action="store_true",
214
+ help="Optional: Skip the interactive instrumental review UI and use the old numeric selection. Example: --skip_instrumental_review",
215
+ )
211
216
 
212
217
  # Lyrics Configuration
213
218
  lyrics_group = parser.add_argument_group("Lyrics Configuration")
@@ -14,12 +14,189 @@ import sys
14
14
  import json
15
15
  import asyncio
16
16
  import time
17
+ import glob
17
18
  import pyperclip
18
19
  from karaoke_gen import KaraokePrep
19
20
  from karaoke_gen.karaoke_finalise import KaraokeFinalise
21
+ from karaoke_gen.instrumental_review import (
22
+ AudioAnalyzer,
23
+ WaveformGenerator,
24
+ InstrumentalReviewServer,
25
+ )
20
26
  from .cli_args import create_parser, process_style_overrides, is_url, is_file
21
27
 
22
28
 
29
+ def _resolve_path_for_cwd(path: str, track_dir: str) -> str:
30
+ """
31
+ Resolve a path that may have been created relative to the original working directory.
32
+
33
+ After os.chdir(track_dir), paths like './TrackDir/stems/file.flac' become invalid.
34
+ This function converts such paths to work from the new current directory.
35
+
36
+ Args:
37
+ path: The path to resolve (may be relative or absolute)
38
+ track_dir: The track directory we've chdir'd into
39
+
40
+ Returns:
41
+ A path that's valid from the current working directory
42
+ """
43
+ if os.path.isabs(path):
44
+ return path
45
+
46
+ # Normalize both paths for comparison
47
+ norm_path = os.path.normpath(path)
48
+ norm_track_dir = os.path.normpath(track_dir)
49
+
50
+ # If path starts with track_dir, strip it to get the relative path from within track_dir
51
+ # e.g., './Four Lanes Male Choir - The White Rose/stems/file.flac' -> 'stems/file.flac'
52
+ if norm_path.startswith(norm_track_dir + os.sep):
53
+ return norm_path[len(norm_track_dir) + 1:]
54
+ elif norm_path.startswith(norm_track_dir):
55
+ return norm_path[len(norm_track_dir):].lstrip(os.sep) or '.'
56
+
57
+ # If path doesn't start with track_dir, it might already be relative to track_dir
58
+ # or it's a path that doesn't need transformation
59
+ return path
60
+
61
+
62
+ def run_instrumental_review(track: dict, logger: logging.Logger) -> str | None:
63
+ """
64
+ Run the instrumental review UI to let user select the best instrumental track.
65
+
66
+ This analyzes the backing vocals, generates a waveform, and opens a browser
67
+ with an interactive UI for reviewing and selecting the instrumental.
68
+
69
+ Args:
70
+ track: The track dictionary from KaraokePrep containing separated audio info
71
+ logger: Logger instance
72
+
73
+ Returns:
74
+ Path to the selected instrumental file, or None to use the old numeric selection
75
+ """
76
+ track_dir = track.get("track_output_dir", ".")
77
+ artist = track.get("artist", "")
78
+ title = track.get("title", "")
79
+ base_name = f"{artist} - {title}"
80
+
81
+ # Get separation results
82
+ separated = track.get("separated_audio", {})
83
+ if not separated:
84
+ logger.info("No separated audio found, skipping instrumental review UI")
85
+ return None
86
+
87
+ # Find the backing vocals file
88
+ # Note: Paths in separated_audio may be relative to the original working directory,
89
+ # but we've already chdir'd into track_dir. Use _resolve_path_for_cwd to fix paths.
90
+ backing_vocals_path = None
91
+ backing_vocals_result = separated.get("backing_vocals", {})
92
+ for model, paths in backing_vocals_result.items():
93
+ if paths.get("backing_vocals"):
94
+ backing_vocals_path = _resolve_path_for_cwd(paths["backing_vocals"], track_dir)
95
+ break
96
+
97
+ if not backing_vocals_path or not os.path.exists(backing_vocals_path):
98
+ logger.info("No backing vocals file found, skipping instrumental review UI")
99
+ return None
100
+
101
+ # Find the clean instrumental file
102
+ clean_result = separated.get("clean_instrumental", {})
103
+ raw_clean_path = clean_result.get("instrumental")
104
+ clean_instrumental_path = _resolve_path_for_cwd(raw_clean_path, track_dir) if raw_clean_path else None
105
+
106
+ if not clean_instrumental_path or not os.path.exists(clean_instrumental_path):
107
+ logger.info("No clean instrumental file found, skipping instrumental review UI")
108
+ return None
109
+
110
+ # Find the combined instrumental (with backing vocals) file - these have "(Padded)" suffix if padded
111
+ combined_result = separated.get("combined_instrumentals", {})
112
+ with_backing_path = None
113
+ for model, path in combined_result.items():
114
+ resolved_path = _resolve_path_for_cwd(path, track_dir) if path else None
115
+ if resolved_path and os.path.exists(resolved_path):
116
+ with_backing_path = resolved_path
117
+ break
118
+
119
+ try:
120
+ logger.info("=== Starting Instrumental Review ===")
121
+ logger.info(f"Analyzing backing vocals: {backing_vocals_path}")
122
+
123
+ # Analyze backing vocals
124
+ analyzer = AudioAnalyzer()
125
+ analysis = analyzer.analyze(backing_vocals_path)
126
+
127
+ logger.info(f"Analysis complete:")
128
+ logger.info(f" Has audible content: {analysis.has_audible_content}")
129
+ logger.info(f" Total duration: {analysis.total_duration_seconds:.1f}s")
130
+ logger.info(f" Audible segments: {len(analysis.audible_segments)}")
131
+ logger.info(f" Recommendation: {analysis.recommended_selection.value}")
132
+
133
+ # Generate waveform
134
+ # Note: We're already in track_dir after chdir, so use current directory
135
+ logger.info("Generating waveform visualization...")
136
+ waveform_generator = WaveformGenerator()
137
+ waveform_path = f"{base_name} (Backing Vocals Waveform).png"
138
+ waveform_generator.generate(
139
+ audio_path=backing_vocals_path,
140
+ output_path=waveform_path,
141
+ audible_segments=analysis.audible_segments,
142
+ )
143
+
144
+ # Start the review server
145
+ # Note: We're already in track_dir after chdir, so output_dir is "."
146
+ logger.info("Starting instrumental review UI...")
147
+ server = InstrumentalReviewServer(
148
+ output_dir=".",
149
+ base_name=base_name,
150
+ analysis=analysis,
151
+ waveform_path=waveform_path,
152
+ backing_vocals_path=backing_vocals_path,
153
+ clean_instrumental_path=clean_instrumental_path,
154
+ with_backing_path=with_backing_path,
155
+ )
156
+
157
+ # Start server and open browser, wait for selection
158
+ server.start_and_open_browser()
159
+
160
+ logger.info("Waiting for instrumental selection in browser...")
161
+ logger.info("(Close the browser tab or press Ctrl+C to cancel)")
162
+
163
+ try:
164
+ # Wait for user selection (blocking)
165
+ server._selection_event.wait()
166
+ selection = server.get_selection()
167
+
168
+ logger.info(f"User selected: {selection}")
169
+
170
+ # Stop the server
171
+ server.stop()
172
+
173
+ # Return the selected instrumental path
174
+ if selection == "clean":
175
+ return clean_instrumental_path
176
+ elif selection == "with_backing":
177
+ return with_backing_path
178
+ elif selection == "custom":
179
+ custom_path = server.get_custom_instrumental_path()
180
+ if custom_path and os.path.exists(custom_path):
181
+ return custom_path
182
+ else:
183
+ logger.warning("Custom instrumental not found, falling back to clean")
184
+ return clean_instrumental_path
185
+ else:
186
+ logger.warning(f"Unknown selection: {selection}, falling back to numeric selection")
187
+ return None
188
+
189
+ except KeyboardInterrupt:
190
+ logger.info("Instrumental review cancelled by user")
191
+ server.stop()
192
+ return None
193
+
194
+ except Exception as e:
195
+ logger.error(f"Error during instrumental review: {e}")
196
+ logger.info("Falling back to numeric selection")
197
+ return None
198
+
199
+
23
200
  async def async_main():
24
201
  logger = logging.getLogger(__name__)
25
202
  log_handler = logging.StreamHandler()
@@ -461,6 +638,14 @@ async def async_main():
461
638
  logger.info(f"Changing to directory: {track_dir}")
462
639
  os.chdir(track_dir)
463
640
 
641
+ # Run instrumental review UI if not skipped
642
+ selected_instrumental_file = None
643
+ if not getattr(args, 'skip_instrumental_review', False):
644
+ selected_instrumental_file = run_instrumental_review(
645
+ track=track,
646
+ logger=logger,
647
+ )
648
+
464
649
  # Load CDG styles if CDG generation is enabled
465
650
  cdg_styles = None
466
651
  if args.enable_cdg:
@@ -504,6 +689,7 @@ async def async_main():
504
689
  cdg_styles=cdg_styles,
505
690
  keep_brand_code=getattr(args, 'keep_brand_code', False),
506
691
  non_interactive=args.yes,
692
+ selected_instrumental_file=selected_instrumental_file,
507
693
  )
508
694
 
509
695
  try: