karaoke-gen 0.71.23__py3-none-any.whl → 0.71.42__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- karaoke_gen/file_handler.py +192 -0
- karaoke_gen/instrumental_review/__init__.py +45 -0
- karaoke_gen/instrumental_review/analyzer.py +408 -0
- karaoke_gen/instrumental_review/editor.py +322 -0
- karaoke_gen/instrumental_review/models.py +171 -0
- karaoke_gen/instrumental_review/server.py +1181 -0
- karaoke_gen/instrumental_review/waveform.py +409 -0
- karaoke_gen/utils/cli_args.py +5 -0
- karaoke_gen/utils/gen_cli.py +186 -0
- karaoke_gen/utils/remote_cli.py +864 -154
- {karaoke_gen-0.71.23.dist-info → karaoke_gen-0.71.42.dist-info}/METADATA +4 -1
- {karaoke_gen-0.71.23.dist-info → karaoke_gen-0.71.42.dist-info}/RECORD +16 -10
- lyrics_transcriber/correction/anchor_sequence.py +226 -350
- {karaoke_gen-0.71.23.dist-info → karaoke_gen-0.71.42.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.71.23.dist-info → karaoke_gen-0.71.42.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.71.23.dist-info → karaoke_gen-0.71.42.dist-info}/licenses/LICENSE +0 -0
karaoke_gen/file_handler.py
CHANGED
@@ -5,6 +5,12 @@ import shutil
 import tempfile
 from .utils import sanitize_filename
 
+try:
+    import yt_dlp
+    YT_DLP_AVAILABLE = True
+except ImportError:
+    YT_DLP_AVAILABLE = False
+
 
 # Placeholder class or functions for file handling
 class FileHandler:
@@ -71,6 +77,192 @@ class FileHandler:
 
         return target_path
 
+    def download_video(self, url, output_filename_no_extension, cookies_str=None):
+        """
+        Download audio from a URL (YouTube, etc.) using yt-dlp.
+
+        This method downloads the best quality audio from a URL and saves it
+        to the specified output path. It handles YouTube and other video platforms
+        supported by yt-dlp.
+
+        Args:
+            url: URL to download from (YouTube, Vimeo, etc.)
+            output_filename_no_extension: Output filename without extension
+            cookies_str: Optional cookies string for authenticated downloads
+
+        Returns:
+            Path to downloaded audio file, or None if failed
+        """
+        if not YT_DLP_AVAILABLE:
+            self.logger.error("yt-dlp is not installed. Install with: pip install yt-dlp")
+            return None
+
+        self.logger.info(f"Downloading audio from URL: {url}")
+
+        # Configure yt-dlp options
+        ydl_opts = {
+            'format': 'bestaudio/best',
+            'outtmpl': output_filename_no_extension + '.%(ext)s',
+            'postprocessors': [{
+                'key': 'FFmpegExtractAudio',
+                'preferredcodec': 'best',
+                'preferredquality': '0',  # Best quality
+            }],
+            'quiet': True,
+            'no_warnings': True,
+            'extract_flat': False,
+            # Anti-detection options
+            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'retries': 3,
+            'fragment_retries': 3,
+            'http_headers': {
+                'Accept-Language': 'en-US,en;q=0.9',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            },
+        }
+
+        # Handle cookies if provided - use safe tempfile pattern to avoid leaks
+        cookie_file_path = None
+        if cookies_str:
+            try:
+                # Use context manager to safely write cookies file
+                with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as cookie_file:
+                    cookie_file.write(cookies_str)
+                    cookie_file_path = cookie_file.name
+                ydl_opts['cookiefile'] = cookie_file_path
+            except Exception as e:
+                self.logger.warning(f"Failed to write cookies file: {e}")
+                cookie_file_path = None
+
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                # Extract info first to get actual filename
+                info = ydl.extract_info(url, download=True)
+
+                if info is None:
+                    self.logger.error("Failed to extract info from URL")
+                    return None
+
+                # Find the downloaded file
+                # The actual filename might differ from template due to post-processing
+                downloaded_file = None
+
+                # Check common extensions
+                for ext in ['m4a', 'opus', 'webm', 'mp3', 'flac', 'wav', 'ogg', 'aac']:
+                    candidate = f"{output_filename_no_extension}.{ext}"
+                    if os.path.exists(candidate):
+                        downloaded_file = candidate
+                        break
+
+                if downloaded_file is None:
+                    # Try to find any audio file with matching prefix
+                    import glob
+                    matches = glob.glob(f"{output_filename_no_extension}.*")
+                    audio_extensions = ['.m4a', '.opus', '.webm', '.mp3', '.flac', '.wav', '.ogg', '.aac']
+                    for match in matches:
+                        if any(match.endswith(ext) for ext in audio_extensions):
+                            downloaded_file = match
+                            break
+
+                if downloaded_file and os.path.exists(downloaded_file):
+                    self.logger.info(f"Successfully downloaded: {downloaded_file}")
+                    return downloaded_file
+                else:
+                    self.logger.error("Downloaded file not found after yt-dlp completed")
+                    return None
+
+        except yt_dlp.DownloadError as e:
+            self.logger.error(f"yt-dlp download error: {e}")
+            return None
+        except Exception as e:
+            self.logger.error(f"Failed to download from URL: {e}")
+            return None
+        finally:
+            # Clean up cookie file if we created one
+            if cookie_file_path is not None:
+                try:
+                    os.unlink(cookie_file_path)
+                except Exception:
+                    pass
+
+    def extract_metadata_from_url(self, url):
+        """
+        Extract metadata (artist, title) from a URL without downloading.
+
+        Uses yt-dlp to fetch video metadata including title, uploader/artist,
+        and other information that can be used for the karaoke generation.
+
+        Args:
+            url: URL to extract metadata from
+
+        Returns:
+            Dict with 'artist', 'title', 'duration', and 'raw_info', or None if failed
+        """
+        if not YT_DLP_AVAILABLE:
+            self.logger.error("yt-dlp is not installed. Install with: pip install yt-dlp")
+            return None
+
+        self.logger.info(f"Extracting metadata from URL: {url}")
+
+        ydl_opts = {
+            'quiet': True,
+            'no_warnings': True,
+            'extract_flat': False,
+            'skip_download': True,
+        }
+
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+
+                if info is None:
+                    self.logger.error("Failed to extract metadata from URL")
+                    return None
+
+                # Try to extract artist and title from various fields
+                raw_title = info.get('title', '')
+                uploader = info.get('uploader', '') or info.get('channel', '') or info.get('artist', '')
+                duration = info.get('duration', 0)
+
+                # Attempt to parse "Artist - Title" format from title
+                artist = None
+                title = raw_title
+
+                if ' - ' in raw_title:
+                    parts = raw_title.split(' - ', 1)
+                    if len(parts) == 2:
+                        artist = parts[0].strip()
+                        title = parts[1].strip()
+
+                # Fall back to uploader as artist if not found in title
+                if not artist:
+                    artist = uploader
+
+                # Clean up title (remove common suffixes like "(Official Video)")
+                title_cleanup_patterns = [
+                    '(official video)', '(official music video)', '(official audio)',
+                    '(lyric video)', '(lyrics)', '(visualizer)', '(music video)',
+                    '[official video]', '[official music video]', '[official audio]',
+                    '(hd)', '(4k)', '(remastered)', '| official video', '| official audio',
+                ]
+                title_lower = title.lower()
+                for pattern in title_cleanup_patterns:
+                    if pattern in title_lower:
+                        idx = title_lower.find(pattern)
+                        title = title[:idx].strip()
+                        title_lower = title.lower()
+
+                return {
+                    'artist': artist,
+                    'title': title,
+                    'duration': duration,
+                    'raw_info': info,
+                }
+
+        except Exception as e:
+            self.logger.error(f"Failed to extract metadata from URL: {e}")
+            return None
+
     def extract_still_image_from_video(self, input_filename, output_filename_no_extension):
         output_filename = output_filename_no_extension + ".png"
         self.logger.info(f"Extracting still image from position 30s input media")
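For context, a minimal usage sketch of the two new FileHandler methods added above. This sketch is not part of the diff: the FileHandler constructor arguments are not shown in this release, so the instantiation below is hypothetical, and the URL is a placeholder.

# Hypothetical usage sketch (not from the package): exercise the new
# extract_metadata_from_url() and download_video() methods.
import logging
from karaoke_gen.file_handler import FileHandler

logging.basicConfig(level=logging.INFO)
handler = FileHandler()  # hypothetical: constructor signature not shown in this diff

url = "https://www.youtube.com/watch?v=..."  # placeholder URL
meta = handler.extract_metadata_from_url(url)  # dict with artist/title/duration, or None
if meta:
    base_name = f"{meta['artist']} - {meta['title']}"
    audio_path = handler.download_video(url, base_name)  # path to audio file, or None
    print(audio_path)

Note that both methods return None rather than raising on failure, so callers are expected to check the return value.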
karaoke_gen/instrumental_review/__init__.py
ADDED
@@ -0,0 +1,45 @@
+"""
+Instrumental Review Module - Shared core for both local and remote CLI.
+
+This module provides audio analysis and editing functionality for instrumental
+selection in karaoke generation. It's designed to be:
+- Pure Python with no cloud dependencies (GCS, etc.)
+- Reusable by both local CLI (karaoke-gen) and remote backend (Cloud Run)
+- Easy to test without mocking cloud services
+
+Classes:
+    AudioAnalyzer: Analyzes backing vocals audio for audible content
+    AudioEditor: Creates custom instrumentals by muting regions
+    WaveformGenerator: Generates waveform visualization images
+    InstrumentalReviewServer: Local HTTP server for browser-based review
+
+Models:
+    AnalysisResult: Result of audio analysis
+    AudibleSegment: A detected segment of audible content
+    MuteRegion: A region to mute in the backing vocals
+    RecommendedSelection: Enum of selection recommendations
+"""
+
+from .models import (
+    AnalysisResult,
+    AudibleSegment,
+    MuteRegion,
+    RecommendedSelection,
+)
+from .analyzer import AudioAnalyzer
+from .editor import AudioEditor
+from .waveform import WaveformGenerator
+from .server import InstrumentalReviewServer
+
+__all__ = [
+    # Models
+    "AnalysisResult",
+    "AudibleSegment",
+    "MuteRegion",
+    "RecommendedSelection",
+    # Classes
+    "AudioAnalyzer",
+    "AudioEditor",
+    "WaveformGenerator",
+    "InstrumentalReviewServer",
+]
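A brief sketch of the import surface this new package exposes, based on the __all__ list above. Only AudioAnalyzer is instantiated here, since it is the only class whose constructor appears in this diff (see analyzer.py below); the other constructors are not shown in this release.

# Sketch of the public import surface of karaoke_gen.instrumental_review.
from karaoke_gen.instrumental_review import (
    AnalysisResult,
    AudibleSegment,
    MuteRegion,
    RecommendedSelection,
    AudioAnalyzer,
    AudioEditor,
    WaveformGenerator,
    InstrumentalReviewServer,
)

analyzer = AudioAnalyzer()  # all constructor arguments have defaults
print(analyzer.silence_threshold_db)  # -40.0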
karaoke_gen/instrumental_review/analyzer.py
ADDED
@@ -0,0 +1,408 @@
+"""
+Audio analyzer for detecting audible content in backing vocals.
+
+This module provides the AudioAnalyzer class which analyzes audio files
+to detect segments of audible content above a silence threshold. It's used
+to help determine whether backing vocals should be included in the final
+karaoke instrumental.
+"""
+
+import logging
+import math
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from pydub import AudioSegment
+
+from .models import AnalysisResult, AudibleSegment, RecommendedSelection
+
+
+logger = logging.getLogger(__name__)
+
+
+class AudioAnalyzer:
+    """
+    Analyzes audio files for backing vocals content.
+
+    This class is pure Python with no cloud dependencies. It works with
+    local file paths and uses pydub for audio loading and analysis.
+
+    The analyzer detects segments of audible content (above a silence threshold)
+    and provides recommendations for instrumental selection based on the
+    analysis results.
+
+    Attributes:
+        silence_threshold_db: Amplitude threshold below which audio is
+            considered silent (default: -40.0 dB)
+        min_segment_duration_ms: Minimum duration for a segment to be
+            considered audible (default: 100ms)
+        merge_gap_ms: Maximum gap between segments to merge them
+            (default: 500ms)
+        window_ms: Analysis window size in milliseconds (default: 50ms)
+
+    Example:
+        >>> analyzer = AudioAnalyzer(silence_threshold_db=-40.0)
+        >>> result = analyzer.analyze("/path/to/backing_vocals.flac")
+        >>> if result.has_audible_content:
+        ...     print(f"Found {result.segment_count} audible segments")
+        ...     for seg in result.audible_segments:
+        ...         print(f"  {seg.start_seconds:.1f}s - {seg.end_seconds:.1f}s")
+    """
+
+    def __init__(
+        self,
+        silence_threshold_db: float = -40.0,
+        min_segment_duration_ms: int = 100,
+        merge_gap_ms: int = 500,
+        window_ms: int = 50,
+    ):
+        """
+        Initialize the audio analyzer.
+
+        Args:
+            silence_threshold_db: Amplitude threshold below which audio is
+                considered silent. Default is -40.0 dB.
+            min_segment_duration_ms: Minimum duration for a segment to be
+                reported as audible. Segments shorter than this are ignored.
+                Default is 100ms.
+            merge_gap_ms: If two audible segments are separated by a gap
+                shorter than this, they are merged into one segment.
+                Default is 500ms.
+            window_ms: Size of the analysis window in milliseconds.
+                Smaller windows give more precise timing but slower analysis.
+                Default is 50ms.
+        """
+        self.silence_threshold_db = silence_threshold_db
+        self.min_segment_duration_ms = min_segment_duration_ms
+        self.merge_gap_ms = merge_gap_ms
+        self.window_ms = window_ms
+
+    def analyze(self, audio_path: str) -> AnalysisResult:
+        """
+        Analyze an audio file for audible content.
+
+        This method loads the audio file, calculates amplitude levels across
+        the duration, and identifies segments where the amplitude exceeds
+        the silence threshold.
+
+        Args:
+            audio_path: Path to the audio file to analyze. Supports formats
+                that pydub/ffmpeg can read (FLAC, WAV, MP3, etc.)
+
+        Returns:
+            AnalysisResult containing:
+            - has_audible_content: Whether any audible content was found
+            - total_duration_seconds: Total duration of the audio
+            - audible_segments: List of detected audible segments
+            - recommended_selection: Recommendation for which instrumental
+            - Various statistics about the audible content
+
+        Raises:
+            FileNotFoundError: If the audio file doesn't exist
+            Exception: If the audio file cannot be loaded
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        logger.info(f"Analyzing audio file: {audio_path}")
+
+        # Load audio file
+        audio = AudioSegment.from_file(audio_path)
+        total_duration_ms = len(audio)
+        total_duration_seconds = total_duration_ms / 1000.0
+
+        logger.debug(f"Audio duration: {total_duration_seconds:.2f}s, "
+                     f"channels: {audio.channels}, "
+                     f"sample_rate: {audio.frame_rate}")
+
+        # Convert to mono for consistent analysis
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        # Analyze amplitude in windows
+        audible_windows = self._find_audible_windows(audio)
+
+        # Merge adjacent windows into segments
+        raw_segments = self._windows_to_segments(audible_windows, audio)
+
+        # Merge close segments and filter short ones
+        segments = self._merge_and_filter_segments(raw_segments)
+
+        # Calculate statistics
+        total_audible_ms = sum(
+            seg.duration_seconds * 1000 for seg in segments
+        )
+        total_audible_seconds = total_audible_ms / 1000.0
+        audible_percentage = (
+            (total_audible_seconds / total_duration_seconds * 100)
+            if total_duration_seconds > 0 else 0.0
+        )
+
+        has_audible_content = len(segments) > 0
+
+        # Determine recommendation
+        recommended_selection = self._get_recommendation(
+            has_audible_content,
+            segments,
+            audible_percentage
+        )
+
+        logger.info(
+            f"Analysis complete: {len(segments)} segments, "
+            f"{audible_percentage:.1f}% audible, "
+            f"recommendation: {recommended_selection.value}"
+        )
+
+        return AnalysisResult(
+            has_audible_content=has_audible_content,
+            total_duration_seconds=total_duration_seconds,
+            audible_segments=segments,
+            recommended_selection=recommended_selection,
+            silence_threshold_db=self.silence_threshold_db,
+            total_audible_duration_seconds=total_audible_seconds,
+            audible_percentage=audible_percentage,
+        )
+
+    def get_amplitude_envelope(
+        self,
+        audio_path: str,
+        window_ms: int = 100,
+        normalize: bool = True,
+    ) -> List[float]:
+        """
+        Get the amplitude envelope for waveform visualization.
+
+        This method returns a list of amplitude values suitable for
+        rendering a waveform display. Each value represents the RMS
+        amplitude of a window of audio.
+
+        Args:
+            audio_path: Path to the audio file
+            window_ms: Size of each window in milliseconds. Smaller values
+                give more detail but larger data. Default is 100ms.
+            normalize: If True, normalize amplitudes to 0.0-1.0 range.
+                Default is True.
+
+        Returns:
+            List of amplitude values (floats). If normalize=True, values
+            are in the range [0.0, 1.0]. Otherwise, values are in dBFS.
+        """
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        audio = AudioSegment.from_file(audio_path)
+
+        # Convert to mono
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        amplitudes = []
+        duration_ms = len(audio)
+
+        for start_ms in range(0, duration_ms, window_ms):
+            end_ms = min(start_ms + window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            # Get RMS amplitude in dBFS
+            if window.rms > 0:
+                db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+            else:
+                db = -100.0  # Effectively silent
+
+            amplitudes.append(db)
+
+        if normalize and amplitudes:
+            # Normalize to 0.0 - 1.0 range
+            # Map from [silence_threshold, 0] to [0, 1]
+            min_db = self.silence_threshold_db
+            max_db = 0.0
+            amplitudes = [
+                max(0.0, min(1.0, (db - min_db) / (max_db - min_db)))
+                for db in amplitudes
+            ]
+
+        return amplitudes
+
+    def _find_audible_windows(
+        self,
+        audio: AudioSegment
+    ) -> List[Tuple[int, float, float]]:
+        """
+        Find windows with amplitude above the silence threshold.
+
+        Returns a list of tuples: (start_ms, avg_db, peak_db)
+        """
+        audible_windows = []
+        duration_ms = len(audio)
+
+        for start_ms in range(0, duration_ms, self.window_ms):
+            end_ms = min(start_ms + self.window_ms, duration_ms)
+            window = audio[start_ms:end_ms]
+
+            # Calculate RMS amplitude in dB
+            if window.rms > 0:
+                avg_db = 20 * math.log10(window.rms / window.max_possible_amplitude)
+                # Peak is approximated as max sample value
+                peak_db = window.dBFS if hasattr(window, 'dBFS') else avg_db
+            else:
+                avg_db = -100.0
+                peak_db = -100.0
+
+            if avg_db > self.silence_threshold_db:
+                audible_windows.append((start_ms, avg_db, peak_db))
+
+        return audible_windows
+
+    def _windows_to_segments(
+        self,
+        audible_windows: List[Tuple[int, float, float]],
+        audio: AudioSegment
+    ) -> List[AudibleSegment]:
+        """
+        Convert list of audible windows into contiguous segments.
+        """
+        if not audible_windows:
+            return []
+
+        segments = []
+        segment_start_ms = audible_windows[0][0]
+        segment_dbs = [audible_windows[0][1]]
+        segment_peaks = [audible_windows[0][2]]
+        last_end_ms = audible_windows[0][0] + self.window_ms
+
+        for i in range(1, len(audible_windows)):
+            start_ms, avg_db, peak_db = audible_windows[i]
+
+            # Check if this window is contiguous with the previous
+            gap_ms = start_ms - last_end_ms
+
+            if gap_ms <= self.window_ms:
+                # Extend current segment
+                segment_dbs.append(avg_db)
+                segment_peaks.append(peak_db)
+                last_end_ms = start_ms + self.window_ms
+            else:
+                # Save current segment and start a new one
+                segments.append(self._create_segment(
+                    segment_start_ms, last_end_ms, segment_dbs, segment_peaks
+                ))
+
+                segment_start_ms = start_ms
+                segment_dbs = [avg_db]
+                segment_peaks = [peak_db]
+                last_end_ms = start_ms + self.window_ms
+
+        # Don't forget the last segment
+        segments.append(self._create_segment(
+            segment_start_ms, last_end_ms, segment_dbs, segment_peaks
+        ))
+
+        return segments
+
+    def _create_segment(
+        self,
+        start_ms: int,
+        end_ms: int,
+        dbs: List[float],
+        peaks: List[float]
+    ) -> AudibleSegment:
+        """Create an AudibleSegment from window data."""
+        return AudibleSegment(
+            start_seconds=start_ms / 1000.0,
+            end_seconds=end_ms / 1000.0,
+            duration_seconds=(end_ms - start_ms) / 1000.0,
+            avg_amplitude_db=sum(dbs) / len(dbs) if dbs else -100.0,
+            peak_amplitude_db=max(peaks) if peaks else -100.0,
+        )
+
+    def _merge_and_filter_segments(
+        self,
+        segments: List[AudibleSegment]
+    ) -> List[AudibleSegment]:
+        """
+        Merge segments that are close together and filter out short ones.
+        """
+        if not segments:
+            return []
+
+        # Sort by start time
+        segments = sorted(segments, key=lambda s: s.start_seconds)
+
+        # Merge segments with small gaps
+        merged = []
+        current = segments[0]
+
+        for next_seg in segments[1:]:
+            gap_ms = (next_seg.start_seconds - current.end_seconds) * 1000
+
+            if gap_ms <= self.merge_gap_ms:
+                # Merge segments
+                combined_duration = (
+                    next_seg.end_seconds - current.start_seconds
+                )
+                # Weight average amplitude by duration
+                total_duration = (
+                    current.duration_seconds + next_seg.duration_seconds
+                )
+                weighted_avg_db = (
+                    (current.avg_amplitude_db * current.duration_seconds +
+                     next_seg.avg_amplitude_db * next_seg.duration_seconds)
+                    / total_duration
+                ) if total_duration > 0 else -100.0
+
+                current = AudibleSegment(
+                    start_seconds=current.start_seconds,
+                    end_seconds=next_seg.end_seconds,
+                    duration_seconds=combined_duration,
+                    avg_amplitude_db=weighted_avg_db,
+                    peak_amplitude_db=max(
+                        current.peak_amplitude_db,
+                        next_seg.peak_amplitude_db
+                    ),
+                )
+            else:
+                merged.append(current)
+                current = next_seg
+
+        merged.append(current)
+
+        # Filter out segments shorter than minimum duration
+        min_duration_seconds = self.min_segment_duration_ms / 1000.0
+        filtered = [
+            seg for seg in merged
+            if seg.duration_seconds >= min_duration_seconds
+        ]
+
+        return filtered
+
+    def _get_recommendation(
+        self,
+        has_audible_content: bool,
+        segments: List[AudibleSegment],
+        audible_percentage: float
+    ) -> RecommendedSelection:
+        """
+        Determine the recommended instrumental selection.
+
+        Logic:
+        - If no audible content: recommend clean instrumental
+        - If audible content covers > 20% of the audio: likely has
+          meaningful backing vocals, recommend review
+        - Otherwise: minimal content, recommend clean
+        """
+        if not has_audible_content:
+            return RecommendedSelection.CLEAN
+
+        # If there's significant audible content, recommend review
+        if audible_percentage > 20.0:
+            return RecommendedSelection.REVIEW_NEEDED
+
+        # If there are loud segments, recommend review
+        loud_segments = [seg for seg in segments if seg.is_loud]
+        if loud_segments:
+            return RecommendedSelection.REVIEW_NEEDED
+
+        # Minimal content - recommend clean
+        return RecommendedSelection.CLEAN