karaoke-gen 0.75.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of karaoke-gen might be problematic. Click here for more details.
- karaoke_gen/__init__.py +38 -0
- karaoke_gen/audio_fetcher.py +1614 -0
- karaoke_gen/audio_processor.py +790 -0
- karaoke_gen/config.py +83 -0
- karaoke_gen/file_handler.py +387 -0
- karaoke_gen/instrumental_review/__init__.py +45 -0
- karaoke_gen/instrumental_review/analyzer.py +408 -0
- karaoke_gen/instrumental_review/editor.py +322 -0
- karaoke_gen/instrumental_review/models.py +171 -0
- karaoke_gen/instrumental_review/server.py +475 -0
- karaoke_gen/instrumental_review/static/index.html +1529 -0
- karaoke_gen/instrumental_review/waveform.py +409 -0
- karaoke_gen/karaoke_finalise/__init__.py +1 -0
- karaoke_gen/karaoke_finalise/karaoke_finalise.py +1833 -0
- karaoke_gen/karaoke_gen.py +1026 -0
- karaoke_gen/lyrics_processor.py +474 -0
- karaoke_gen/metadata.py +160 -0
- karaoke_gen/pipeline/__init__.py +87 -0
- karaoke_gen/pipeline/base.py +215 -0
- karaoke_gen/pipeline/context.py +230 -0
- karaoke_gen/pipeline/executors/__init__.py +21 -0
- karaoke_gen/pipeline/executors/local.py +159 -0
- karaoke_gen/pipeline/executors/remote.py +257 -0
- karaoke_gen/pipeline/stages/__init__.py +27 -0
- karaoke_gen/pipeline/stages/finalize.py +202 -0
- karaoke_gen/pipeline/stages/render.py +165 -0
- karaoke_gen/pipeline/stages/screens.py +139 -0
- karaoke_gen/pipeline/stages/separation.py +191 -0
- karaoke_gen/pipeline/stages/transcription.py +191 -0
- karaoke_gen/resources/AvenirNext-Bold.ttf +0 -0
- karaoke_gen/resources/Montserrat-Bold.ttf +0 -0
- karaoke_gen/resources/Oswald-Bold.ttf +0 -0
- karaoke_gen/resources/Oswald-SemiBold.ttf +0 -0
- karaoke_gen/resources/Zurich_Cn_BT_Bold.ttf +0 -0
- karaoke_gen/style_loader.py +531 -0
- karaoke_gen/utils/__init__.py +18 -0
- karaoke_gen/utils/bulk_cli.py +492 -0
- karaoke_gen/utils/cli_args.py +432 -0
- karaoke_gen/utils/gen_cli.py +978 -0
- karaoke_gen/utils/remote_cli.py +3268 -0
- karaoke_gen/video_background_processor.py +351 -0
- karaoke_gen/video_generator.py +424 -0
- karaoke_gen-0.75.54.dist-info/METADATA +718 -0
- karaoke_gen-0.75.54.dist-info/RECORD +287 -0
- karaoke_gen-0.75.54.dist-info/WHEEL +4 -0
- karaoke_gen-0.75.54.dist-info/entry_points.txt +5 -0
- karaoke_gen-0.75.54.dist-info/licenses/LICENSE +21 -0
- lyrics_transcriber/__init__.py +10 -0
- lyrics_transcriber/cli/__init__.py +0 -0
- lyrics_transcriber/cli/cli_main.py +285 -0
- lyrics_transcriber/core/__init__.py +0 -0
- lyrics_transcriber/core/config.py +50 -0
- lyrics_transcriber/core/controller.py +594 -0
- lyrics_transcriber/correction/__init__.py +0 -0
- lyrics_transcriber/correction/agentic/__init__.py +9 -0
- lyrics_transcriber/correction/agentic/adapter.py +71 -0
- lyrics_transcriber/correction/agentic/agent.py +313 -0
- lyrics_transcriber/correction/agentic/feedback/aggregator.py +12 -0
- lyrics_transcriber/correction/agentic/feedback/collector.py +17 -0
- lyrics_transcriber/correction/agentic/feedback/retention.py +24 -0
- lyrics_transcriber/correction/agentic/feedback/store.py +76 -0
- lyrics_transcriber/correction/agentic/handlers/__init__.py +24 -0
- lyrics_transcriber/correction/agentic/handlers/ambiguous.py +44 -0
- lyrics_transcriber/correction/agentic/handlers/background_vocals.py +68 -0
- lyrics_transcriber/correction/agentic/handlers/base.py +51 -0
- lyrics_transcriber/correction/agentic/handlers/complex_multi_error.py +46 -0
- lyrics_transcriber/correction/agentic/handlers/extra_words.py +74 -0
- lyrics_transcriber/correction/agentic/handlers/no_error.py +42 -0
- lyrics_transcriber/correction/agentic/handlers/punctuation.py +44 -0
- lyrics_transcriber/correction/agentic/handlers/registry.py +60 -0
- lyrics_transcriber/correction/agentic/handlers/repeated_section.py +44 -0
- lyrics_transcriber/correction/agentic/handlers/sound_alike.py +126 -0
- lyrics_transcriber/correction/agentic/models/__init__.py +5 -0
- lyrics_transcriber/correction/agentic/models/ai_correction.py +31 -0
- lyrics_transcriber/correction/agentic/models/correction_session.py +30 -0
- lyrics_transcriber/correction/agentic/models/enums.py +38 -0
- lyrics_transcriber/correction/agentic/models/human_feedback.py +30 -0
- lyrics_transcriber/correction/agentic/models/learning_data.py +26 -0
- lyrics_transcriber/correction/agentic/models/observability_metrics.py +28 -0
- lyrics_transcriber/correction/agentic/models/schemas.py +46 -0
- lyrics_transcriber/correction/agentic/models/utils.py +19 -0
- lyrics_transcriber/correction/agentic/observability/__init__.py +5 -0
- lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +35 -0
- lyrics_transcriber/correction/agentic/observability/metrics.py +46 -0
- lyrics_transcriber/correction/agentic/observability/performance.py +19 -0
- lyrics_transcriber/correction/agentic/prompts/__init__.py +2 -0
- lyrics_transcriber/correction/agentic/prompts/classifier.py +227 -0
- lyrics_transcriber/correction/agentic/providers/__init__.py +6 -0
- lyrics_transcriber/correction/agentic/providers/base.py +36 -0
- lyrics_transcriber/correction/agentic/providers/circuit_breaker.py +145 -0
- lyrics_transcriber/correction/agentic/providers/config.py +73 -0
- lyrics_transcriber/correction/agentic/providers/constants.py +24 -0
- lyrics_transcriber/correction/agentic/providers/health.py +28 -0
- lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +212 -0
- lyrics_transcriber/correction/agentic/providers/model_factory.py +209 -0
- lyrics_transcriber/correction/agentic/providers/response_cache.py +218 -0
- lyrics_transcriber/correction/agentic/providers/response_parser.py +111 -0
- lyrics_transcriber/correction/agentic/providers/retry_executor.py +127 -0
- lyrics_transcriber/correction/agentic/router.py +35 -0
- lyrics_transcriber/correction/agentic/workflows/__init__.py +5 -0
- lyrics_transcriber/correction/agentic/workflows/consensus_workflow.py +24 -0
- lyrics_transcriber/correction/agentic/workflows/correction_graph.py +59 -0
- lyrics_transcriber/correction/agentic/workflows/feedback_workflow.py +24 -0
- lyrics_transcriber/correction/anchor_sequence.py +919 -0
- lyrics_transcriber/correction/corrector.py +760 -0
- lyrics_transcriber/correction/feedback/__init__.py +2 -0
- lyrics_transcriber/correction/feedback/schemas.py +107 -0
- lyrics_transcriber/correction/feedback/store.py +236 -0
- lyrics_transcriber/correction/handlers/__init__.py +0 -0
- lyrics_transcriber/correction/handlers/base.py +52 -0
- lyrics_transcriber/correction/handlers/extend_anchor.py +149 -0
- lyrics_transcriber/correction/handlers/levenshtein.py +189 -0
- lyrics_transcriber/correction/handlers/llm.py +293 -0
- lyrics_transcriber/correction/handlers/llm_providers.py +60 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +154 -0
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +85 -0
- lyrics_transcriber/correction/handlers/repeat.py +88 -0
- lyrics_transcriber/correction/handlers/sound_alike.py +259 -0
- lyrics_transcriber/correction/handlers/syllables_match.py +252 -0
- lyrics_transcriber/correction/handlers/word_count_match.py +80 -0
- lyrics_transcriber/correction/handlers/word_operations.py +187 -0
- lyrics_transcriber/correction/operations.py +352 -0
- lyrics_transcriber/correction/phrase_analyzer.py +435 -0
- lyrics_transcriber/correction/text_utils.py +30 -0
- lyrics_transcriber/frontend/.gitignore +23 -0
- lyrics_transcriber/frontend/.yarn/releases/yarn-4.7.0.cjs +935 -0
- lyrics_transcriber/frontend/.yarnrc.yml +3 -0
- lyrics_transcriber/frontend/README.md +50 -0
- lyrics_transcriber/frontend/REPLACE_ALL_FUNCTIONALITY.md +210 -0
- lyrics_transcriber/frontend/__init__.py +25 -0
- lyrics_transcriber/frontend/eslint.config.js +28 -0
- lyrics_transcriber/frontend/index.html +18 -0
- lyrics_transcriber/frontend/package.json +42 -0
- lyrics_transcriber/frontend/public/android-chrome-192x192.png +0 -0
- lyrics_transcriber/frontend/public/android-chrome-512x512.png +0 -0
- lyrics_transcriber/frontend/public/apple-touch-icon.png +0 -0
- lyrics_transcriber/frontend/public/favicon-16x16.png +0 -0
- lyrics_transcriber/frontend/public/favicon-32x32.png +0 -0
- lyrics_transcriber/frontend/public/favicon.ico +0 -0
- lyrics_transcriber/frontend/public/nomad-karaoke-logo.png +0 -0
- lyrics_transcriber/frontend/src/App.tsx +214 -0
- lyrics_transcriber/frontend/src/api.ts +254 -0
- lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +77 -0
- lyrics_transcriber/frontend/src/components/AddLyricsModal.tsx +114 -0
- lyrics_transcriber/frontend/src/components/AgenticCorrectionMetrics.tsx +204 -0
- lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +180 -0
- lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +167 -0
- lyrics_transcriber/frontend/src/components/CorrectionAnnotationModal.tsx +359 -0
- lyrics_transcriber/frontend/src/components/CorrectionDetailCard.tsx +281 -0
- lyrics_transcriber/frontend/src/components/CorrectionMetrics.tsx +162 -0
- lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +257 -0
- lyrics_transcriber/frontend/src/components/EditActionBar.tsx +68 -0
- lyrics_transcriber/frontend/src/components/EditModal.tsx +702 -0
- lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +496 -0
- lyrics_transcriber/frontend/src/components/EditWordList.tsx +379 -0
- lyrics_transcriber/frontend/src/components/FileUpload.tsx +77 -0
- lyrics_transcriber/frontend/src/components/FindReplaceModal.tsx +467 -0
- lyrics_transcriber/frontend/src/components/Header.tsx +413 -0
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +1387 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
- lyrics_transcriber/frontend/src/components/MetricsDashboard.tsx +51 -0
- lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
- lyrics_transcriber/frontend/src/components/ModeSelector.tsx +67 -0
- lyrics_transcriber/frontend/src/components/ModelSelector.tsx +23 -0
- lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +144 -0
- lyrics_transcriber/frontend/src/components/ReferenceView.tsx +268 -0
- lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +336 -0
- lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +354 -0
- lyrics_transcriber/frontend/src/components/SegmentDetailsModal.tsx +64 -0
- lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +376 -0
- lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +131 -0
- lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +256 -0
- lyrics_transcriber/frontend/src/components/WordDivider.tsx +187 -0
- lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +379 -0
- lyrics_transcriber/frontend/src/components/shared/components/SourceSelector.tsx +56 -0
- lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +87 -0
- lyrics_transcriber/frontend/src/components/shared/constants.ts +20 -0
- lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +180 -0
- lyrics_transcriber/frontend/src/components/shared/styles.ts +13 -0
- lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
- lyrics_transcriber/frontend/src/components/shared/types.ts +129 -0
- lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +177 -0
- lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
- lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +75 -0
- lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +360 -0
- lyrics_transcriber/frontend/src/components/shared/utils/timingUtils.ts +110 -0
- lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
- lyrics_transcriber/frontend/src/hooks/useManualSync.ts +435 -0
- lyrics_transcriber/frontend/src/main.tsx +17 -0
- lyrics_transcriber/frontend/src/theme.ts +177 -0
- lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
- lyrics_transcriber/frontend/src/types.js +2 -0
- lyrics_transcriber/frontend/src/types.ts +199 -0
- lyrics_transcriber/frontend/src/validation.ts +132 -0
- lyrics_transcriber/frontend/src/vite-env.d.ts +1 -0
- lyrics_transcriber/frontend/tsconfig.app.json +26 -0
- lyrics_transcriber/frontend/tsconfig.json +25 -0
- lyrics_transcriber/frontend/tsconfig.node.json +23 -0
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -0
- lyrics_transcriber/frontend/update_version.js +11 -0
- lyrics_transcriber/frontend/vite.config.d.ts +2 -0
- lyrics_transcriber/frontend/vite.config.js +10 -0
- lyrics_transcriber/frontend/vite.config.ts +11 -0
- lyrics_transcriber/frontend/web_assets/android-chrome-192x192.png +0 -0
- lyrics_transcriber/frontend/web_assets/android-chrome-512x512.png +0 -0
- lyrics_transcriber/frontend/web_assets/apple-touch-icon.png +0 -0
- lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js +43288 -0
- lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +1 -0
- lyrics_transcriber/frontend/web_assets/favicon-16x16.png +0 -0
- lyrics_transcriber/frontend/web_assets/favicon-32x32.png +0 -0
- lyrics_transcriber/frontend/web_assets/favicon.ico +0 -0
- lyrics_transcriber/frontend/web_assets/index.html +18 -0
- lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.png +0 -0
- lyrics_transcriber/frontend/yarn.lock +3752 -0
- lyrics_transcriber/lyrics/__init__.py +0 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +211 -0
- lyrics_transcriber/lyrics/file_provider.py +95 -0
- lyrics_transcriber/lyrics/genius.py +384 -0
- lyrics_transcriber/lyrics/lrclib.py +231 -0
- lyrics_transcriber/lyrics/musixmatch.py +156 -0
- lyrics_transcriber/lyrics/spotify.py +290 -0
- lyrics_transcriber/lyrics/user_input_provider.py +44 -0
- lyrics_transcriber/output/__init__.py +0 -0
- lyrics_transcriber/output/ass/__init__.py +21 -0
- lyrics_transcriber/output/ass/ass.py +2088 -0
- lyrics_transcriber/output/ass/ass_specs.txt +732 -0
- lyrics_transcriber/output/ass/config.py +180 -0
- lyrics_transcriber/output/ass/constants.py +23 -0
- lyrics_transcriber/output/ass/event.py +94 -0
- lyrics_transcriber/output/ass/formatters.py +132 -0
- lyrics_transcriber/output/ass/lyrics_line.py +265 -0
- lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
- lyrics_transcriber/output/ass/section_detector.py +89 -0
- lyrics_transcriber/output/ass/section_screen.py +106 -0
- lyrics_transcriber/output/ass/style.py +187 -0
- lyrics_transcriber/output/cdg.py +619 -0
- lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
- lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
- lyrics_transcriber/output/cdgmaker/composer.py +2260 -0
- lyrics_transcriber/output/cdgmaker/config.py +151 -0
- lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
- lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
- lyrics_transcriber/output/cdgmaker/pack.py +507 -0
- lyrics_transcriber/output/cdgmaker/render.py +346 -0
- lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
- lyrics_transcriber/output/cdgmaker/utils.py +132 -0
- lyrics_transcriber/output/countdown_processor.py +306 -0
- lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
- lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
- lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/arial.ttf +0 -0
- lyrics_transcriber/output/fonts/georgia.ttf +0 -0
- lyrics_transcriber/output/fonts/verdana.ttf +0 -0
- lyrics_transcriber/output/generator.py +257 -0
- lyrics_transcriber/output/lrc_to_cdg.py +61 -0
- lyrics_transcriber/output/lyrics_file.py +102 -0
- lyrics_transcriber/output/plain_text.py +96 -0
- lyrics_transcriber/output/segment_resizer.py +431 -0
- lyrics_transcriber/output/subtitles.py +397 -0
- lyrics_transcriber/output/video.py +544 -0
- lyrics_transcriber/review/__init__.py +0 -0
- lyrics_transcriber/review/server.py +676 -0
- lyrics_transcriber/storage/__init__.py +0 -0
- lyrics_transcriber/storage/dropbox.py +225 -0
- lyrics_transcriber/transcribers/__init__.py +0 -0
- lyrics_transcriber/transcribers/audioshake.py +379 -0
- lyrics_transcriber/transcribers/base_transcriber.py +157 -0
- lyrics_transcriber/transcribers/whisper.py +330 -0
- lyrics_transcriber/types.py +650 -0
- lyrics_transcriber/utils/__init__.py +0 -0
- lyrics_transcriber/utils/word_utils.py +27 -0
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Audio analyzer for detecting audible content in backing vocals.
|
|
3
|
+
|
|
4
|
+
This module provides the AudioAnalyzer class which analyzes audio files
|
|
5
|
+
to detect segments of audible content above a silence threshold. It's used
|
|
6
|
+
to help determine whether backing vocals should be included in the final
|
|
7
|
+
karaoke instrumental.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import math
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import List, Optional, Tuple
|
|
14
|
+
|
|
15
|
+
from pydub import AudioSegment
|
|
16
|
+
|
|
17
|
+
from .models import AnalysisResult, AudibleSegment, RecommendedSelection
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AudioAnalyzer:
    """
    Analyzes audio files for backing vocals content.

    This class is pure Python with no cloud dependencies. It works with
    local file paths and uses pydub for audio loading and analysis.

    The audio is scanned in fixed-size windows. Windows whose RMS level is
    above the silence threshold are grouped into segments, nearby segments
    are merged, very short segments are discarded, and a recommendation for
    the instrumental selection is derived from what remains.

    Attributes:
        silence_threshold_db: Amplitude threshold below which audio is
            considered silent (default: -40.0 dB)
        min_segment_duration_ms: Minimum duration for a segment to be
            considered audible (default: 100ms)
        merge_gap_ms: Maximum gap between segments to merge them
            (default: 500ms)
        window_ms: Analysis window size in milliseconds (default: 50ms)

    Example:
        >>> analyzer = AudioAnalyzer(silence_threshold_db=-40.0)
        >>> result = analyzer.analyze("/path/to/backing_vocals.flac")
        >>> if result.has_audible_content:
        ...     print(f"Found {result.segment_count} audible segments")
        ...     for seg in result.audible_segments:
        ...         print(f"  {seg.start_seconds:.1f}s - {seg.end_seconds:.1f}s")
    """

    # Level reported for a window with zero RMS, where log10 is undefined.
    _SILENT_DB = -100.0

    def __init__(
        self,
        silence_threshold_db: float = -40.0,
        min_segment_duration_ms: int = 100,
        merge_gap_ms: int = 500,
        window_ms: int = 50,
    ):
        """
        Initialize the audio analyzer.

        Args:
            silence_threshold_db: Amplitude threshold below which audio is
                considered silent. Default is -40.0 dB.
            min_segment_duration_ms: Minimum duration for a segment to be
                reported as audible. Segments shorter than this are ignored.
                Default is 100ms.
            merge_gap_ms: If two audible segments are separated by a gap
                no longer than this, they are merged into one segment.
                Default is 500ms.
            window_ms: Size of the analysis window in milliseconds.
                Smaller windows give more precise timing but slower analysis.
                Default is 50ms.

        Raises:
            ValueError: If window_ms is not positive.
        """
        # The window size is used as a range() step; zero would raise an
        # unhelpful error deep in the analysis and a negative value would
        # silently produce no windows at all, so reject both up front.
        if window_ms <= 0:
            raise ValueError(
                f"window_ms must be a positive number of milliseconds, got {window_ms}"
            )

        self.silence_threshold_db = silence_threshold_db
        self.min_segment_duration_ms = min_segment_duration_ms
        self.merge_gap_ms = merge_gap_ms
        self.window_ms = window_ms

    def analyze(self, audio_path: str) -> AnalysisResult:
        """
        Analyze an audio file for audible content.

        This method loads the audio file, calculates amplitude levels across
        the duration, and identifies segments where the amplitude exceeds
        the silence threshold.

        Args:
            audio_path: Path to the audio file to analyze. Supports formats
                that pydub/ffmpeg can read (FLAC, WAV, MP3, etc.)

        Returns:
            AnalysisResult containing:
            - has_audible_content: Whether any audible content was found
            - total_duration_seconds: Total duration of the audio
            - audible_segments: List of detected audible segments
            - recommended_selection: Recommendation for which instrumental
            - Various statistics about the audible content

        Raises:
            FileNotFoundError: If the audio file doesn't exist
            Exception: If the audio file cannot be loaded
        """
        path = Path(audio_path)
        if not path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        logger.info(f"Analyzing audio file: {audio_path}")

        # Load audio file
        audio = AudioSegment.from_file(audio_path)
        total_duration_ms = len(audio)
        total_duration_seconds = total_duration_ms / 1000.0

        logger.debug(f"Audio duration: {total_duration_seconds:.2f}s, "
                     f"channels: {audio.channels}, "
                     f"sample_rate: {audio.frame_rate}")

        # Convert to mono for consistent analysis
        if audio.channels > 1:
            audio = audio.set_channels(1)

        # Windows above threshold -> contiguous segments -> merged/filtered
        audible_windows = self._find_audible_windows(audio)
        raw_segments = self._windows_to_segments(audible_windows, audio)
        segments = self._merge_and_filter_segments(raw_segments)

        # Calculate statistics (summed in milliseconds, then converted back)
        total_audible_ms = sum(
            seg.duration_seconds * 1000 for seg in segments
        )
        total_audible_seconds = total_audible_ms / 1000.0
        audible_percentage = (
            (total_audible_seconds / total_duration_seconds * 100)
            if total_duration_seconds > 0 else 0.0
        )

        has_audible_content = len(segments) > 0

        # Determine recommendation
        recommended_selection = self._get_recommendation(
            has_audible_content,
            segments,
            audible_percentage
        )

        logger.info(
            f"Analysis complete: {len(segments)} segments, "
            f"{audible_percentage:.1f}% audible, "
            f"recommendation: {recommended_selection.value}"
        )

        return AnalysisResult(
            has_audible_content=has_audible_content,
            total_duration_seconds=total_duration_seconds,
            audible_segments=segments,
            recommended_selection=recommended_selection,
            silence_threshold_db=self.silence_threshold_db,
            total_audible_duration_seconds=total_audible_seconds,
            audible_percentage=audible_percentage,
        )

    def get_amplitude_envelope(
        self,
        audio_path: str,
        window_ms: int = 100,
        normalize: bool = True,
    ) -> List[float]:
        """
        Get the amplitude envelope for waveform visualization.

        This method returns a list of amplitude values suitable for
        rendering a waveform display. Each value represents the RMS
        amplitude of a window of audio.

        Args:
            audio_path: Path to the audio file
            window_ms: Size of each window in milliseconds. Smaller values
                give more detail but larger data. Default is 100ms.
            normalize: If True, normalize amplitudes to 0.0-1.0 range.
                Default is True.

        Returns:
            List of amplitude values (floats). If normalize=True, values
            are in the range [0.0, 1.0]. Otherwise, values are in dBFS.

        Raises:
            FileNotFoundError: If the audio file doesn't exist
            ValueError: If window_ms is not positive
        """
        path = Path(audio_path)
        if not path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # Checked before the (potentially slow) decode; a negative step
        # would otherwise silently yield an empty envelope.
        if window_ms <= 0:
            raise ValueError(
                f"window_ms must be a positive number of milliseconds, got {window_ms}"
            )

        audio = AudioSegment.from_file(audio_path)

        # Convert to mono
        if audio.channels > 1:
            audio = audio.set_channels(1)

        duration_ms = len(audio)
        amplitudes = [
            self._rms_db(audio[start_ms:min(start_ms + window_ms, duration_ms)])
            for start_ms in range(0, duration_ms, window_ms)
        ]

        if normalize:
            # Map from [silence_threshold, 0] dBFS to [0, 1]
            amplitudes = [self._normalize_db(db) for db in amplitudes]

        return amplitudes

    @classmethod
    def _rms_db(cls, window) -> float:
        """Return the RMS level of an audio window in dBFS (-100.0 if RMS is zero)."""
        rms = window.rms
        if rms > 0:
            return 20 * math.log10(rms / window.max_possible_amplitude)
        return cls._SILENT_DB

    @classmethod
    def _peak_db(cls, window, fallback: float) -> float:
        """
        Return the peak sample level of an audio window in dBFS.

        Uses the window's ``max_dBFS`` attribute when it has one and
        ``fallback`` otherwise. A non-finite peak (as reported for an
        all-zero window) is replaced by the silent level.
        """
        peak = getattr(window, "max_dBFS", fallback)
        if not math.isfinite(peak):
            return cls._SILENT_DB
        return peak

    def _normalize_db(self, db: float) -> float:
        """
        Map a dBFS value onto [0.0, 1.0] between the silence threshold and 0 dBFS.

        Values at or below the threshold map to 0.0 and values at or above
        0 dBFS map to 1.0. If the threshold is not below 0 dBFS the linear
        range is empty, so the mapping degrades to a step at the threshold
        instead of dividing by zero.
        """
        min_db = self.silence_threshold_db
        span = 0.0 - min_db
        if span <= 0:
            return 1.0 if db >= min_db else 0.0
        return max(0.0, min(1.0, (db - min_db) / span))

    def _find_audible_windows(
        self,
        audio: AudioSegment
    ) -> List[Tuple[int, float, float]]:
        """
        Find windows with amplitude above the silence threshold.

        The audio is cut into consecutive windows of ``self.window_ms``
        milliseconds (the last one may be shorter). A window is audible
        when its RMS level is strictly greater than the silence threshold.

        Returns a list of tuples: (start_ms, avg_db, peak_db)
        """
        audible_windows = []
        duration_ms = len(audio)

        for start_ms in range(0, duration_ms, self.window_ms):
            end_ms = min(start_ms + self.window_ms, duration_ms)
            window = audio[start_ms:end_ms]

            avg_db = self._rms_db(window)
            if avg_db <= self.silence_threshold_db:
                continue

            # The peak comes from the loudest sample, not from the RMS
            # level, so it can differ from the average.
            peak_db = self._peak_db(window, avg_db)
            audible_windows.append((start_ms, avg_db, peak_db))

        return audible_windows

    def _group_windows(
        self,
        audible_windows: List[Tuple[int, float, float]],
        duration_ms: Optional[int] = None,
    ) -> List[Tuple[int, int, List[float], List[float]]]:
        """
        Group audible windows into runs of (start_ms, end_ms, avg_dbs, peak_dbs).

        A window joins the current run when the gap between its start and
        the run's end is at most one window length, so a single silent
        window between two audible ones does not split the run. When
        ``duration_ms`` is given, run ends are clamped to it because the
        final window of the audio may be shorter than ``self.window_ms``.
        """
        runs: List[list] = []

        for start_ms, avg_db, peak_db in audible_windows:
            end_ms = start_ms + self.window_ms
            if duration_ms is not None:
                end_ms = min(end_ms, duration_ms)

            if runs and start_ms - runs[-1][1] <= self.window_ms:
                run = runs[-1]
                run[1] = end_ms
                run[2].append(avg_db)
                run[3].append(peak_db)
            else:
                runs.append([start_ms, end_ms, [avg_db], [peak_db]])

        return [tuple(run) for run in runs]

    def _windows_to_segments(
        self,
        audible_windows: List[Tuple[int, float, float]],
        audio: AudioSegment
    ) -> List[AudibleSegment]:
        """
        Convert list of audible windows into contiguous segments.

        Segment ends are clamped to the length of ``audio`` so that a
        segment never extends past the end of the recording.
        """
        if not audible_windows:
            return []

        duration_ms = len(audio) if audio is not None else None
        return [
            self._create_segment(start_ms, end_ms, dbs, peaks)
            for start_ms, end_ms, dbs, peaks
            in self._group_windows(audible_windows, duration_ms)
        ]

    def _create_segment(
        self,
        start_ms: int,
        end_ms: int,
        dbs: List[float],
        peaks: List[float]
    ) -> AudibleSegment:
        """
        Create an AudibleSegment from window data.

        The average amplitude is the arithmetic mean of the per-window dB
        values and the peak is the largest per-window peak; both fall back
        to -100.0 when the corresponding list is empty.
        """
        return AudibleSegment(
            start_seconds=start_ms / 1000.0,
            end_seconds=end_ms / 1000.0,
            duration_seconds=(end_ms - start_ms) / 1000.0,
            avg_amplitude_db=sum(dbs) / len(dbs) if dbs else self._SILENT_DB,
            peak_amplitude_db=max(peaks) if peaks else self._SILENT_DB,
        )

    def _merge_and_filter_segments(
        self,
        segments: List[AudibleSegment]
    ) -> List[AudibleSegment]:
        """
        Merge segments that are close together and filter out short ones.

        Segments are processed in start-time order. Two neighbours whose
        gap is at most ``self.merge_gap_ms`` are replaced by one segment
        spanning both (gap included), whose average level is the
        duration-weighted mean of the two and whose peak is the larger of
        the two. After merging, segments shorter than
        ``self.min_segment_duration_ms`` are dropped.
        """
        if not segments:
            return []

        # Sort by start time
        segments = sorted(segments, key=lambda s: s.start_seconds)

        # Merge segments with small gaps
        merged = []
        current = segments[0]

        for next_seg in segments[1:]:
            gap_ms = (next_seg.start_seconds - current.end_seconds) * 1000

            if gap_ms > self.merge_gap_ms:
                merged.append(current)
                current = next_seg
                continue

            # Weight average amplitude by the audible duration of each part
            total_duration = (
                current.duration_seconds + next_seg.duration_seconds
            )
            weighted_avg_db = (
                (current.avg_amplitude_db * current.duration_seconds +
                 next_seg.avg_amplitude_db * next_seg.duration_seconds)
                / total_duration
            ) if total_duration > 0 else self._SILENT_DB

            current = AudibleSegment(
                start_seconds=current.start_seconds,
                end_seconds=next_seg.end_seconds,
                duration_seconds=next_seg.end_seconds - current.start_seconds,
                avg_amplitude_db=weighted_avg_db,
                peak_amplitude_db=max(
                    current.peak_amplitude_db,
                    next_seg.peak_amplitude_db
                ),
            )

        merged.append(current)

        # Filter out segments shorter than minimum duration
        min_duration_seconds = self.min_segment_duration_ms / 1000.0
        return [
            seg for seg in merged
            if seg.duration_seconds >= min_duration_seconds
        ]

    def _get_recommendation(
        self,
        has_audible_content: bool,
        segments: List[AudibleSegment],
        audible_percentage: float
    ) -> RecommendedSelection:
        """
        Determine the recommended instrumental selection.

        Logic:
        - If no audible content: recommend clean instrumental
        - If audible content covers > 20% of the audio: likely has
          meaningful backing vocals, recommend review
        - If any segment reports ``is_loud``: recommend review
        - Otherwise: minimal content, recommend clean
        """
        if not has_audible_content:
            return RecommendedSelection.CLEAN

        # If there's significant audible content, recommend review
        if audible_percentage > 20.0:
            return RecommendedSelection.REVIEW_NEEDED

        # If there are loud segments, recommend review
        if any(seg.is_loud for seg in segments):
            return RecommendedSelection.REVIEW_NEEDED

        # Minimal content - recommend clean
        return RecommendedSelection.CLEAN
|