karaoke-gen 0.71.27__py3-none-any.whl → 0.71.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
+"""
+Audio editor for creating custom instrumentals with muted regions.
+
+This module provides the AudioEditor class which creates custom instrumental
+tracks by muting specified regions of backing vocals audio and combining
+it with the clean instrumental.
+"""
+
+import logging
+import os
+from pathlib import Path
+from typing import List, Optional
+
+from pydub import AudioSegment
+
+from .models import CustomInstrumentalResult, MuteRegion
+
+
+logger = logging.getLogger(__name__)
+
+
+class AudioEditor:
+    """
+    Creates custom instrumentals by muting regions of backing vocals.
+
+    This class is pure Python with no cloud dependencies. It works with
+    local file paths and uses pydub for audio manipulation.
+
+    The editor takes a clean instrumental and backing vocals track,
+    applies silence to specified regions of the backing vocals, and
+    combines them to create a custom instrumental track.
+
+    Example:
+        >>> editor = AudioEditor()
+        >>> mute_regions = [
+        ...     MuteRegion(start_seconds=45.0, end_seconds=48.5),
+        ...     MuteRegion(start_seconds=120.0, end_seconds=125.0),
+        ... ]
+        >>> result = editor.create_custom_instrumental(
+        ...     clean_instrumental_path="/path/to/clean.flac",
+        ...     backing_vocals_path="/path/to/backing.flac",
+        ...     mute_regions=mute_regions,
+        ...     output_path="/path/to/custom.flac"
+        ... )
+        >>> print(f"Created: {result.output_path}")
+    """
+
+    def __init__(self, output_format: str = "flac"):
+        """
+        Initialize the audio editor.
+
+        Args:
+            output_format: Output audio format. Default is "flac".
+                Supported formats depend on ffmpeg installation.
+        """
+        self.output_format = output_format
+
+    def create_custom_instrumental(
+        self,
+        clean_instrumental_path: str,
+        backing_vocals_path: str,
+        mute_regions: List[MuteRegion],
+        output_path: str,
+    ) -> CustomInstrumentalResult:
+        """
+        Create a custom instrumental by muting regions of backing vocals.
+
+        This method:
+        1. Loads the clean instrumental and backing vocals tracks
+        2. Applies silence to the specified regions of the backing vocals
+        3. Combines the clean instrumental with the edited backing vocals
+        4. Exports the result to the specified output path
+
+        Args:
+            clean_instrumental_path: Path to the clean instrumental audio file
+            backing_vocals_path: Path to the backing vocals audio file
+            mute_regions: List of regions to mute in the backing vocals
+            output_path: Path where the output file should be saved
+
+        Returns:
+            CustomInstrumentalResult containing the output path and statistics
+
+        Raises:
+            FileNotFoundError: If input files don't exist
+            ValueError: If mute regions are invalid
+        """
+        # Validate inputs
+        if not Path(clean_instrumental_path).exists():
+            raise FileNotFoundError(
+                f"Clean instrumental not found: {clean_instrumental_path}"
+            )
+        if not Path(backing_vocals_path).exists():
+            raise FileNotFoundError(
+                f"Backing vocals not found: {backing_vocals_path}"
+            )
+
+        # Normalize and validate mute regions
+        normalized_regions = self._normalize_mute_regions(mute_regions)
+
+        logger.info(
+            f"Creating custom instrumental with {len(normalized_regions)} "
+            f"mute regions"
+        )
+
+        # Load audio files
+        logger.debug(f"Loading clean instrumental: {clean_instrumental_path}")
+        clean_instrumental = AudioSegment.from_file(clean_instrumental_path)
+
+        logger.debug(f"Loading backing vocals: {backing_vocals_path}")
+        backing_vocals = AudioSegment.from_file(backing_vocals_path)
+
+        # Ensure same duration (use shorter one)
+        clean_duration_ms = len(clean_instrumental)
+        backing_duration_ms = len(backing_vocals)
+
+        if abs(clean_duration_ms - backing_duration_ms) > 1000:
+            logger.warning(
+                f"Duration mismatch: clean={clean_duration_ms}ms, "
+                f"backing={backing_duration_ms}ms. Using shorter duration."
+            )
+
+        target_duration_ms = min(clean_duration_ms, backing_duration_ms)
+        clean_instrumental = clean_instrumental[:target_duration_ms]
+        backing_vocals = backing_vocals[:target_duration_ms]
+
+        # Apply mute regions to backing vocals
+        edited_backing = self._apply_mute_regions(
+            backing_vocals, normalized_regions
+        )
+
+        # Combine: clean instrumental + edited backing vocals
+        # The backing vocals are mixed on top of the clean instrumental
+        combined = clean_instrumental.overlay(edited_backing)
+
+        # Ensure output directory exists
+        output_dir = Path(output_path).parent
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Export
+        logger.info(f"Exporting custom instrumental to: {output_path}")
+        combined.export(output_path, format=self.output_format)
+
+        # Calculate statistics with clamping to actual audio duration
+        output_duration_seconds = len(combined) / 1000.0
+        total_muted_ms = sum(
+            (min(r.end_seconds, output_duration_seconds) - max(r.start_seconds, 0)) * 1000
+            for r in normalized_regions
+            if r.start_seconds < output_duration_seconds  # Skip regions entirely outside audio
+        )
+
+        return CustomInstrumentalResult(
+            output_path=output_path,
+            mute_regions_applied=normalized_regions,
+            total_muted_duration_seconds=max(0, total_muted_ms / 1000.0),
+            output_duration_seconds=output_duration_seconds,
+        )
+
+    def apply_mute_to_single_track(
+        self,
+        audio_path: str,
+        mute_regions: List[MuteRegion],
+        output_path: str,
+    ) -> str:
+        """
+        Apply mute regions to a single audio track.
+
+        This is useful for muting sections of backing vocals without
+        combining with the clean instrumental.
+
+        Args:
+            audio_path: Path to the input audio file
+            mute_regions: List of regions to mute
+            output_path: Path where the output file should be saved
+
+        Returns:
+            Path to the output file
+        """
+        if not Path(audio_path).exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        normalized_regions = self._normalize_mute_regions(mute_regions)
+
+        logger.info(f"Applying {len(normalized_regions)} mute regions to audio")
+
+        audio = AudioSegment.from_file(audio_path)
+        edited = self._apply_mute_regions(audio, normalized_regions)
+
+        output_dir = Path(output_path).parent
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        edited.export(output_path, format=self.output_format)
+
+        return output_path
+
+    def _normalize_mute_regions(
+        self,
+        regions: List[MuteRegion]
+    ) -> List[MuteRegion]:
+        """
+        Normalize mute regions: sort, validate, and merge overlapping ones.
+        """
+        if not regions:
+            return []
+
+        # Validate regions
+        for region in regions:
+            if region.start_seconds < 0:
+                raise ValueError(
+                    f"Mute region start cannot be negative: {region.start_seconds}"
+                )
+            if region.end_seconds <= region.start_seconds:
+                raise ValueError(
+                    f"Mute region end ({region.end_seconds}) must be after "
+                    f"start ({region.start_seconds})"
+                )
+
+        # Sort by start time
+        sorted_regions = sorted(regions, key=lambda r: r.start_seconds)
+
+        # Merge overlapping regions
+        merged = [sorted_regions[0]]
+
+        for region in sorted_regions[1:]:
+            last = merged[-1]
+
+            # Check if overlapping or adjacent
+            if region.start_seconds <= last.end_seconds:
+                # Merge
+                merged[-1] = MuteRegion(
+                    start_seconds=last.start_seconds,
+                    end_seconds=max(last.end_seconds, region.end_seconds)
+                )
+            else:
+                merged.append(region)
+
+        return merged
+
+    def _apply_mute_regions(
+        self,
+        audio: AudioSegment,
+        regions: List[MuteRegion]
+    ) -> AudioSegment:
+        """
+        Apply silence to the specified regions of an audio segment.
+        """
+        if not regions:
+            return audio
+
+        duration_ms = len(audio)
+        result = audio
+
+        for region in regions:
+            start_ms = int(region.start_seconds * 1000)
+            end_ms = int(region.end_seconds * 1000)
+
+            # Clamp to audio boundaries
+            start_ms = max(0, start_ms)
+            end_ms = min(duration_ms, end_ms)
+
+            if start_ms >= end_ms:
+                continue
+
+            # Create silence segment
+            silence_duration = end_ms - start_ms
+            silence = AudioSegment.silent(
+                duration=silence_duration,
+                frame_rate=audio.frame_rate
+            )
+
+            # Replace the region with silence
+            before = result[:start_ms]
+            after = result[end_ms:]
+            result = before + silence + after
+
+            logger.debug(
+                f"Muted region: {start_ms/1000:.2f}s - {end_ms/1000:.2f}s"
+            )
+
+        return result
+
+    def preview_with_mutes(
+        self,
+        clean_instrumental_path: str,
+        backing_vocals_path: str,
+        mute_regions: List[MuteRegion],
+        output_path: Optional[str] = None,
+    ) -> AudioSegment:
+        """
+        Create a preview of the custom instrumental (in memory).
+
+        This is useful for creating temporary previews without saving
+        to disk. If output_path is provided, the preview is also saved.
+
+        Args:
+            clean_instrumental_path: Path to the clean instrumental
+            backing_vocals_path: Path to the backing vocals
+            mute_regions: Regions to mute
+            output_path: Optional path to save the preview
+
+        Returns:
+            AudioSegment of the preview
+        """
+        clean_instrumental = AudioSegment.from_file(clean_instrumental_path)
+        backing_vocals = AudioSegment.from_file(backing_vocals_path)
+
+        # Match durations
+        target_duration = min(len(clean_instrumental), len(backing_vocals))
+        clean_instrumental = clean_instrumental[:target_duration]
+        backing_vocals = backing_vocals[:target_duration]
+
+        # Apply mutes
+        normalized = self._normalize_mute_regions(mute_regions)
+        edited_backing = self._apply_mute_regions(backing_vocals, normalized)
+
+        # Combine
+        combined = clean_instrumental.overlay(edited_backing)
+
+        if output_path:
+            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+            combined.export(output_path, format=self.output_format)
+
+        return combined
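
For reference, a minimal usage sketch of the AudioEditor added above. The import paths, file names, and durations are assumptions: the diff shows the module contents but not where the module lives inside the karaoke-gen package.

from audio_editor import AudioEditor  # assumed import path
from models import MuteRegion         # assumed import path

editor = AudioEditor(output_format="flac")

# Overlapping regions are merged by _normalize_mute_regions, so these two
# collapse into a single 40.0s-50.0s region before muting is applied.
regions = [
    MuteRegion(start_seconds=40.0, end_seconds=46.0),
    MuteRegion(start_seconds=45.0, end_seconds=50.0),
]

result = editor.create_custom_instrumental(
    clean_instrumental_path="clean.flac",  # placeholder input
    backing_vocals_path="backing.flac",    # placeholder input
    mute_regions=regions,
    output_path="output/custom.flac",      # parent dir is created if missing
)
print(result.total_muted_duration_seconds)  # 10.0, assuming both tracks exceed 50s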
@@ -0,0 +1,171 @@
+"""
+Pydantic models for the instrumental review module.
+
+These models define the data structures used for audio analysis results,
+mute regions, and recommendations. They're designed to be serializable
+to JSON for API responses.
+"""
+
+from enum import Enum
+from typing import List, Optional
+
+from pydantic import BaseModel, Field
+
+
+class RecommendedSelection(str, Enum):
+    """Recommendation for which instrumental to use."""
+
+    CLEAN = "clean"
+    """Use clean instrumental - no backing vocals detected or recommended."""
+
+    WITH_BACKING = "with_backing"
+    """Use instrumental with backing vocals - backing vocals sound good."""
+
+    REVIEW_NEEDED = "review_needed"
+    """Human review needed - backing vocals detected but quality uncertain."""
+
+
+class AudibleSegment(BaseModel):
+    """A detected segment of audible content in the backing vocals."""
+
+    start_seconds: float = Field(
+        ...,
+        ge=0,
+        description="Start time of the segment in seconds"
+    )
+    end_seconds: float = Field(
+        ...,
+        ge=0,
+        description="End time of the segment in seconds"
+    )
+    duration_seconds: float = Field(
+        ...,
+        ge=0,
+        description="Duration of the segment in seconds"
+    )
+    avg_amplitude_db: float = Field(
+        ...,
+        description="Average amplitude of the segment in dB"
+    )
+    peak_amplitude_db: float = Field(
+        default=0.0,
+        description="Peak amplitude of the segment in dB"
+    )
+
+    @property
+    def is_loud(self) -> bool:
+        """Check if this segment is relatively loud (> -20dB average)."""
+        return self.avg_amplitude_db > -20.0
+
+
+class MuteRegion(BaseModel):
+    """A region to mute in the backing vocals audio."""
+
+    start_seconds: float = Field(
+        ...,
+        ge=0,
+        description="Start time of the region to mute in seconds"
+    )
+    end_seconds: float = Field(
+        ...,
+        ge=0,
+        description="End time of the region to mute in seconds"
+    )
+
+    @property
+    def duration_seconds(self) -> float:
+        """Duration of the mute region in seconds."""
+        return self.end_seconds - self.start_seconds
+
+    def overlaps(self, other: "MuteRegion") -> bool:
+        """Check if this region overlaps with another."""
+        return not (self.end_seconds <= other.start_seconds or
+                    self.start_seconds >= other.end_seconds)
+
+    def merge(self, other: "MuteRegion") -> "MuteRegion":
+        """Merge this region with another overlapping region."""
+        return MuteRegion(
+            start_seconds=min(self.start_seconds, other.start_seconds),
+            end_seconds=max(self.end_seconds, other.end_seconds)
+        )
+
+
+class AnalysisResult(BaseModel):
+    """Result of analyzing backing vocals audio for audible content."""
+
+    has_audible_content: bool = Field(
+        ...,
+        description="Whether any audible content was detected"
+    )
+    total_duration_seconds: float = Field(
+        ...,
+        ge=0,
+        description="Total duration of the audio file in seconds"
+    )
+    audible_segments: List[AudibleSegment] = Field(
+        default_factory=list,
+        description="List of detected audible segments"
+    )
+    recommended_selection: RecommendedSelection = Field(
+        ...,
+        description="Recommended instrumental selection based on analysis"
+    )
+    silence_threshold_db: float = Field(
+        default=-40.0,
+        description="Threshold used to detect silence (in dB)"
+    )
+    total_audible_duration_seconds: float = Field(
+        default=0.0,
+        ge=0,
+        description="Total duration of all audible segments combined"
+    )
+    audible_percentage: float = Field(
+        default=0.0,
+        ge=0,
+        le=100,
+        description="Percentage of audio that is audible"
+    )
+    waveform_data: Optional[List[float]] = Field(
+        default=None,
+        description="Amplitude envelope data for waveform rendering (optional)"
+    )
+
+    @property
+    def segment_count(self) -> int:
+        """Number of audible segments detected."""
+        return len(self.audible_segments)
+
+    def get_segments_in_range(
+        self,
+        start_seconds: float,
+        end_seconds: float
+    ) -> List[AudibleSegment]:
+        """Get segments that overlap with the given time range."""
+        return [
+            seg for seg in self.audible_segments
+            if not (seg.end_seconds <= start_seconds or
+                    seg.start_seconds >= end_seconds)
+        ]
+
+
+class CustomInstrumentalResult(BaseModel):
+    """Result of creating a custom instrumental with muted regions."""
+
+    output_path: str = Field(
+        ...,
+        description="Path to the created custom instrumental file"
+    )
+    mute_regions_applied: List[MuteRegion] = Field(
+        default_factory=list,
+        description="List of mute regions that were applied"
+    )
+    total_muted_duration_seconds: float = Field(
+        default=0.0,
+        ge=0,
+        description="Total duration of muted audio in seconds"
+    )
+    output_duration_seconds: float = Field(
+        default=0.0,
+        ge=0,
+        description="Duration of the output file in seconds"
+    )