karaoke-gen 0.75.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of karaoke-gen might be problematic. Click here for more details.

Files changed (287) hide show
  1. karaoke_gen/__init__.py +38 -0
  2. karaoke_gen/audio_fetcher.py +1614 -0
  3. karaoke_gen/audio_processor.py +790 -0
  4. karaoke_gen/config.py +83 -0
  5. karaoke_gen/file_handler.py +387 -0
  6. karaoke_gen/instrumental_review/__init__.py +45 -0
  7. karaoke_gen/instrumental_review/analyzer.py +408 -0
  8. karaoke_gen/instrumental_review/editor.py +322 -0
  9. karaoke_gen/instrumental_review/models.py +171 -0
  10. karaoke_gen/instrumental_review/server.py +475 -0
  11. karaoke_gen/instrumental_review/static/index.html +1529 -0
  12. karaoke_gen/instrumental_review/waveform.py +409 -0
  13. karaoke_gen/karaoke_finalise/__init__.py +1 -0
  14. karaoke_gen/karaoke_finalise/karaoke_finalise.py +1833 -0
  15. karaoke_gen/karaoke_gen.py +1026 -0
  16. karaoke_gen/lyrics_processor.py +474 -0
  17. karaoke_gen/metadata.py +160 -0
  18. karaoke_gen/pipeline/__init__.py +87 -0
  19. karaoke_gen/pipeline/base.py +215 -0
  20. karaoke_gen/pipeline/context.py +230 -0
  21. karaoke_gen/pipeline/executors/__init__.py +21 -0
  22. karaoke_gen/pipeline/executors/local.py +159 -0
  23. karaoke_gen/pipeline/executors/remote.py +257 -0
  24. karaoke_gen/pipeline/stages/__init__.py +27 -0
  25. karaoke_gen/pipeline/stages/finalize.py +202 -0
  26. karaoke_gen/pipeline/stages/render.py +165 -0
  27. karaoke_gen/pipeline/stages/screens.py +139 -0
  28. karaoke_gen/pipeline/stages/separation.py +191 -0
  29. karaoke_gen/pipeline/stages/transcription.py +191 -0
  30. karaoke_gen/resources/AvenirNext-Bold.ttf +0 -0
  31. karaoke_gen/resources/Montserrat-Bold.ttf +0 -0
  32. karaoke_gen/resources/Oswald-Bold.ttf +0 -0
  33. karaoke_gen/resources/Oswald-SemiBold.ttf +0 -0
  34. karaoke_gen/resources/Zurich_Cn_BT_Bold.ttf +0 -0
  35. karaoke_gen/style_loader.py +531 -0
  36. karaoke_gen/utils/__init__.py +18 -0
  37. karaoke_gen/utils/bulk_cli.py +492 -0
  38. karaoke_gen/utils/cli_args.py +432 -0
  39. karaoke_gen/utils/gen_cli.py +978 -0
  40. karaoke_gen/utils/remote_cli.py +3268 -0
  41. karaoke_gen/video_background_processor.py +351 -0
  42. karaoke_gen/video_generator.py +424 -0
  43. karaoke_gen-0.75.54.dist-info/METADATA +718 -0
  44. karaoke_gen-0.75.54.dist-info/RECORD +287 -0
  45. karaoke_gen-0.75.54.dist-info/WHEEL +4 -0
  46. karaoke_gen-0.75.54.dist-info/entry_points.txt +5 -0
  47. karaoke_gen-0.75.54.dist-info/licenses/LICENSE +21 -0
  48. lyrics_transcriber/__init__.py +10 -0
  49. lyrics_transcriber/cli/__init__.py +0 -0
  50. lyrics_transcriber/cli/cli_main.py +285 -0
  51. lyrics_transcriber/core/__init__.py +0 -0
  52. lyrics_transcriber/core/config.py +50 -0
  53. lyrics_transcriber/core/controller.py +594 -0
  54. lyrics_transcriber/correction/__init__.py +0 -0
  55. lyrics_transcriber/correction/agentic/__init__.py +9 -0
  56. lyrics_transcriber/correction/agentic/adapter.py +71 -0
  57. lyrics_transcriber/correction/agentic/agent.py +313 -0
  58. lyrics_transcriber/correction/agentic/feedback/aggregator.py +12 -0
  59. lyrics_transcriber/correction/agentic/feedback/collector.py +17 -0
  60. lyrics_transcriber/correction/agentic/feedback/retention.py +24 -0
  61. lyrics_transcriber/correction/agentic/feedback/store.py +76 -0
  62. lyrics_transcriber/correction/agentic/handlers/__init__.py +24 -0
  63. lyrics_transcriber/correction/agentic/handlers/ambiguous.py +44 -0
  64. lyrics_transcriber/correction/agentic/handlers/background_vocals.py +68 -0
  65. lyrics_transcriber/correction/agentic/handlers/base.py +51 -0
  66. lyrics_transcriber/correction/agentic/handlers/complex_multi_error.py +46 -0
  67. lyrics_transcriber/correction/agentic/handlers/extra_words.py +74 -0
  68. lyrics_transcriber/correction/agentic/handlers/no_error.py +42 -0
  69. lyrics_transcriber/correction/agentic/handlers/punctuation.py +44 -0
  70. lyrics_transcriber/correction/agentic/handlers/registry.py +60 -0
  71. lyrics_transcriber/correction/agentic/handlers/repeated_section.py +44 -0
  72. lyrics_transcriber/correction/agentic/handlers/sound_alike.py +126 -0
  73. lyrics_transcriber/correction/agentic/models/__init__.py +5 -0
  74. lyrics_transcriber/correction/agentic/models/ai_correction.py +31 -0
  75. lyrics_transcriber/correction/agentic/models/correction_session.py +30 -0
  76. lyrics_transcriber/correction/agentic/models/enums.py +38 -0
  77. lyrics_transcriber/correction/agentic/models/human_feedback.py +30 -0
  78. lyrics_transcriber/correction/agentic/models/learning_data.py +26 -0
  79. lyrics_transcriber/correction/agentic/models/observability_metrics.py +28 -0
  80. lyrics_transcriber/correction/agentic/models/schemas.py +46 -0
  81. lyrics_transcriber/correction/agentic/models/utils.py +19 -0
  82. lyrics_transcriber/correction/agentic/observability/__init__.py +5 -0
  83. lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +35 -0
  84. lyrics_transcriber/correction/agentic/observability/metrics.py +46 -0
  85. lyrics_transcriber/correction/agentic/observability/performance.py +19 -0
  86. lyrics_transcriber/correction/agentic/prompts/__init__.py +2 -0
  87. lyrics_transcriber/correction/agentic/prompts/classifier.py +227 -0
  88. lyrics_transcriber/correction/agentic/providers/__init__.py +6 -0
  89. lyrics_transcriber/correction/agentic/providers/base.py +36 -0
  90. lyrics_transcriber/correction/agentic/providers/circuit_breaker.py +145 -0
  91. lyrics_transcriber/correction/agentic/providers/config.py +73 -0
  92. lyrics_transcriber/correction/agentic/providers/constants.py +24 -0
  93. lyrics_transcriber/correction/agentic/providers/health.py +28 -0
  94. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +212 -0
  95. lyrics_transcriber/correction/agentic/providers/model_factory.py +209 -0
  96. lyrics_transcriber/correction/agentic/providers/response_cache.py +218 -0
  97. lyrics_transcriber/correction/agentic/providers/response_parser.py +111 -0
  98. lyrics_transcriber/correction/agentic/providers/retry_executor.py +127 -0
  99. lyrics_transcriber/correction/agentic/router.py +35 -0
  100. lyrics_transcriber/correction/agentic/workflows/__init__.py +5 -0
  101. lyrics_transcriber/correction/agentic/workflows/consensus_workflow.py +24 -0
  102. lyrics_transcriber/correction/agentic/workflows/correction_graph.py +59 -0
  103. lyrics_transcriber/correction/agentic/workflows/feedback_workflow.py +24 -0
  104. lyrics_transcriber/correction/anchor_sequence.py +919 -0
  105. lyrics_transcriber/correction/corrector.py +760 -0
  106. lyrics_transcriber/correction/feedback/__init__.py +2 -0
  107. lyrics_transcriber/correction/feedback/schemas.py +107 -0
  108. lyrics_transcriber/correction/feedback/store.py +236 -0
  109. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  110. lyrics_transcriber/correction/handlers/base.py +52 -0
  111. lyrics_transcriber/correction/handlers/extend_anchor.py +149 -0
  112. lyrics_transcriber/correction/handlers/levenshtein.py +189 -0
  113. lyrics_transcriber/correction/handlers/llm.py +293 -0
  114. lyrics_transcriber/correction/handlers/llm_providers.py +60 -0
  115. lyrics_transcriber/correction/handlers/no_space_punct_match.py +154 -0
  116. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +85 -0
  117. lyrics_transcriber/correction/handlers/repeat.py +88 -0
  118. lyrics_transcriber/correction/handlers/sound_alike.py +259 -0
  119. lyrics_transcriber/correction/handlers/syllables_match.py +252 -0
  120. lyrics_transcriber/correction/handlers/word_count_match.py +80 -0
  121. lyrics_transcriber/correction/handlers/word_operations.py +187 -0
  122. lyrics_transcriber/correction/operations.py +352 -0
  123. lyrics_transcriber/correction/phrase_analyzer.py +435 -0
  124. lyrics_transcriber/correction/text_utils.py +30 -0
  125. lyrics_transcriber/frontend/.gitignore +23 -0
  126. lyrics_transcriber/frontend/.yarn/releases/yarn-4.7.0.cjs +935 -0
  127. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  128. lyrics_transcriber/frontend/README.md +50 -0
  129. lyrics_transcriber/frontend/REPLACE_ALL_FUNCTIONALITY.md +210 -0
  130. lyrics_transcriber/frontend/__init__.py +25 -0
  131. lyrics_transcriber/frontend/eslint.config.js +28 -0
  132. lyrics_transcriber/frontend/index.html +18 -0
  133. lyrics_transcriber/frontend/package.json +42 -0
  134. lyrics_transcriber/frontend/public/android-chrome-192x192.png +0 -0
  135. lyrics_transcriber/frontend/public/android-chrome-512x512.png +0 -0
  136. lyrics_transcriber/frontend/public/apple-touch-icon.png +0 -0
  137. lyrics_transcriber/frontend/public/favicon-16x16.png +0 -0
  138. lyrics_transcriber/frontend/public/favicon-32x32.png +0 -0
  139. lyrics_transcriber/frontend/public/favicon.ico +0 -0
  140. lyrics_transcriber/frontend/public/nomad-karaoke-logo.png +0 -0
  141. lyrics_transcriber/frontend/src/App.tsx +214 -0
  142. lyrics_transcriber/frontend/src/api.ts +254 -0
  143. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +77 -0
  144. lyrics_transcriber/frontend/src/components/AddLyricsModal.tsx +114 -0
  145. lyrics_transcriber/frontend/src/components/AgenticCorrectionMetrics.tsx +204 -0
  146. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +180 -0
  147. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +167 -0
  148. lyrics_transcriber/frontend/src/components/CorrectionAnnotationModal.tsx +359 -0
  149. lyrics_transcriber/frontend/src/components/CorrectionDetailCard.tsx +281 -0
  150. lyrics_transcriber/frontend/src/components/CorrectionMetrics.tsx +162 -0
  151. lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +257 -0
  152. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +68 -0
  153. lyrics_transcriber/frontend/src/components/EditModal.tsx +702 -0
  154. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +496 -0
  155. lyrics_transcriber/frontend/src/components/EditWordList.tsx +379 -0
  156. lyrics_transcriber/frontend/src/components/FileUpload.tsx +77 -0
  157. lyrics_transcriber/frontend/src/components/FindReplaceModal.tsx +467 -0
  158. lyrics_transcriber/frontend/src/components/Header.tsx +413 -0
  159. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +1387 -0
  160. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
  161. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
  162. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
  163. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
  164. lyrics_transcriber/frontend/src/components/MetricsDashboard.tsx +51 -0
  165. lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
  166. lyrics_transcriber/frontend/src/components/ModeSelector.tsx +67 -0
  167. lyrics_transcriber/frontend/src/components/ModelSelector.tsx +23 -0
  168. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +144 -0
  169. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +268 -0
  170. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +336 -0
  171. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +354 -0
  172. lyrics_transcriber/frontend/src/components/SegmentDetailsModal.tsx +64 -0
  173. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +376 -0
  174. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +131 -0
  175. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +256 -0
  176. lyrics_transcriber/frontend/src/components/WordDivider.tsx +187 -0
  177. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +379 -0
  178. lyrics_transcriber/frontend/src/components/shared/components/SourceSelector.tsx +56 -0
  179. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +87 -0
  180. lyrics_transcriber/frontend/src/components/shared/constants.ts +20 -0
  181. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +180 -0
  182. lyrics_transcriber/frontend/src/components/shared/styles.ts +13 -0
  183. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  184. lyrics_transcriber/frontend/src/components/shared/types.ts +129 -0
  185. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +177 -0
  186. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  187. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +75 -0
  188. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +360 -0
  189. lyrics_transcriber/frontend/src/components/shared/utils/timingUtils.ts +110 -0
  190. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  191. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +435 -0
  192. lyrics_transcriber/frontend/src/main.tsx +17 -0
  193. lyrics_transcriber/frontend/src/theme.ts +177 -0
  194. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  195. lyrics_transcriber/frontend/src/types.js +2 -0
  196. lyrics_transcriber/frontend/src/types.ts +199 -0
  197. lyrics_transcriber/frontend/src/validation.ts +132 -0
  198. lyrics_transcriber/frontend/src/vite-env.d.ts +1 -0
  199. lyrics_transcriber/frontend/tsconfig.app.json +26 -0
  200. lyrics_transcriber/frontend/tsconfig.json +25 -0
  201. lyrics_transcriber/frontend/tsconfig.node.json +23 -0
  202. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -0
  203. lyrics_transcriber/frontend/update_version.js +11 -0
  204. lyrics_transcriber/frontend/vite.config.d.ts +2 -0
  205. lyrics_transcriber/frontend/vite.config.js +10 -0
  206. lyrics_transcriber/frontend/vite.config.ts +11 -0
  207. lyrics_transcriber/frontend/web_assets/android-chrome-192x192.png +0 -0
  208. lyrics_transcriber/frontend/web_assets/android-chrome-512x512.png +0 -0
  209. lyrics_transcriber/frontend/web_assets/apple-touch-icon.png +0 -0
  210. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js +43288 -0
  211. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +1 -0
  212. lyrics_transcriber/frontend/web_assets/favicon-16x16.png +0 -0
  213. lyrics_transcriber/frontend/web_assets/favicon-32x32.png +0 -0
  214. lyrics_transcriber/frontend/web_assets/favicon.ico +0 -0
  215. lyrics_transcriber/frontend/web_assets/index.html +18 -0
  216. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.png +0 -0
  217. lyrics_transcriber/frontend/yarn.lock +3752 -0
  218. lyrics_transcriber/lyrics/__init__.py +0 -0
  219. lyrics_transcriber/lyrics/base_lyrics_provider.py +211 -0
  220. lyrics_transcriber/lyrics/file_provider.py +95 -0
  221. lyrics_transcriber/lyrics/genius.py +384 -0
  222. lyrics_transcriber/lyrics/lrclib.py +231 -0
  223. lyrics_transcriber/lyrics/musixmatch.py +156 -0
  224. lyrics_transcriber/lyrics/spotify.py +290 -0
  225. lyrics_transcriber/lyrics/user_input_provider.py +44 -0
  226. lyrics_transcriber/output/__init__.py +0 -0
  227. lyrics_transcriber/output/ass/__init__.py +21 -0
  228. lyrics_transcriber/output/ass/ass.py +2088 -0
  229. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  230. lyrics_transcriber/output/ass/config.py +180 -0
  231. lyrics_transcriber/output/ass/constants.py +23 -0
  232. lyrics_transcriber/output/ass/event.py +94 -0
  233. lyrics_transcriber/output/ass/formatters.py +132 -0
  234. lyrics_transcriber/output/ass/lyrics_line.py +265 -0
  235. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  236. lyrics_transcriber/output/ass/section_detector.py +89 -0
  237. lyrics_transcriber/output/ass/section_screen.py +106 -0
  238. lyrics_transcriber/output/ass/style.py +187 -0
  239. lyrics_transcriber/output/cdg.py +619 -0
  240. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  241. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  242. lyrics_transcriber/output/cdgmaker/composer.py +2260 -0
  243. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  244. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  245. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  246. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  247. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  248. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  249. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  250. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  251. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  252. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  253. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  254. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  255. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  256. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  257. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  258. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  259. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  260. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  261. lyrics_transcriber/output/countdown_processor.py +306 -0
  262. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  263. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  264. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  265. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  266. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  267. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  268. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  269. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  270. lyrics_transcriber/output/generator.py +257 -0
  271. lyrics_transcriber/output/lrc_to_cdg.py +61 -0
  272. lyrics_transcriber/output/lyrics_file.py +102 -0
  273. lyrics_transcriber/output/plain_text.py +96 -0
  274. lyrics_transcriber/output/segment_resizer.py +431 -0
  275. lyrics_transcriber/output/subtitles.py +397 -0
  276. lyrics_transcriber/output/video.py +544 -0
  277. lyrics_transcriber/review/__init__.py +0 -0
  278. lyrics_transcriber/review/server.py +676 -0
  279. lyrics_transcriber/storage/__init__.py +0 -0
  280. lyrics_transcriber/storage/dropbox.py +225 -0
  281. lyrics_transcriber/transcribers/__init__.py +0 -0
  282. lyrics_transcriber/transcribers/audioshake.py +379 -0
  283. lyrics_transcriber/transcribers/base_transcriber.py +157 -0
  284. lyrics_transcriber/transcribers/whisper.py +330 -0
  285. lyrics_transcriber/types.py +650 -0
  286. lyrics_transcriber/utils/__init__.py +0 -0
  287. lyrics_transcriber/utils/word_utils.py +27 -0
@@ -0,0 +1,408 @@
1
+ """
2
+ Audio analyzer for detecting audible content in backing vocals.
3
+
4
+ This module provides the AudioAnalyzer class which analyzes audio files
5
+ to detect segments of audible content above a silence threshold. It's used
6
+ to help determine whether backing vocals should be included in the final
7
+ karaoke instrumental.
8
+ """
9
+
10
+ import logging
11
+ import math
12
+ from pathlib import Path
13
+ from typing import List, Optional, Tuple
14
+
15
+ from pydub import AudioSegment
16
+
17
+ from .models import AnalysisResult, AudibleSegment, RecommendedSelection
18
+
19
+
20
+ logger = logging.getLogger(__name__)  # Module-level logger, named after this module via __name__.
21
+
22
+
23
class AudioAnalyzer:
    """
    Analyzes audio files for backing vocals content.

    The analyzer works on local file paths and uses pydub to load the audio.
    It splits the audio into fixed-size windows, measures the RMS level of
    each window, groups the windows that are louder than a silence threshold
    into segments, and derives a recommendation for instrumental selection
    from those segments.

    Attributes:
        silence_threshold_db: Level (dBFS) at or below which a window is
            considered silent (default: -40.0 dB)
        min_segment_duration_ms: Minimum duration for a segment to be
            reported as audible (default: 100ms)
        merge_gap_ms: Maximum gap between segments to merge them
            (default: 500ms)
        window_ms: Analysis window size in milliseconds (default: 50ms)

    Example:
        >>> analyzer = AudioAnalyzer(silence_threshold_db=-40.0)
        >>> result = analyzer.analyze("/path/to/backing_vocals.flac")
        >>> if result.has_audible_content:
        ...     print(f"Found {result.segment_count} audible segments")
        ...     for seg in result.audible_segments:
        ...         print(f"  {seg.start_seconds:.1f}s - {seg.end_seconds:.1f}s")
    """

    # Level reported for a window whose RMS is zero (digital silence), for
    # which a logarithmic level is undefined.
    _SILENT_DB = -100.0

    def __init__(
        self,
        silence_threshold_db: float = -40.0,
        min_segment_duration_ms: int = 100,
        merge_gap_ms: int = 500,
        window_ms: int = 50,
    ):
        """
        Initialize the audio analyzer.

        Args:
            silence_threshold_db: Level at or below which audio is
                considered silent. Default is -40.0 dB.
            min_segment_duration_ms: Minimum duration for a segment to be
                reported as audible. Shorter segments are dropped.
                Default is 100ms.
            merge_gap_ms: Two audible segments separated by a gap of at
                most this many milliseconds are merged into one.
                Default is 500ms.
            window_ms: Size of the analysis window in milliseconds. Must be
                positive; this is checked when the analysis runs rather
                than here. Default is 50ms.
        """
        self.silence_threshold_db = silence_threshold_db
        self.min_segment_duration_ms = min_segment_duration_ms
        self.merge_gap_ms = merge_gap_ms
        self.window_ms = window_ms

    def analyze(self, audio_path: str) -> "AnalysisResult":
        """
        Analyze an audio file for audible content.

        Loads the file, down-mixes it to mono, measures the level of every
        analysis window and groups the windows above the silence threshold
        into merged, length-filtered segments.

        Args:
            audio_path: Path to the audio file to analyze. It is loaded
                with pydub's ``AudioSegment.from_file``.

        Returns:
            AnalysisResult carrying the audible segments, the total and
            audible durations in seconds, the audible percentage, the
            silence threshold used and the recommended selection.

        Raises:
            FileNotFoundError: If the audio file doesn't exist.
            ValueError: If ``self.window_ms`` is not positive.
            Exception: Whatever pydub raises if the file cannot be loaded.
        """
        if not Path(audio_path).exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        logger.info(f"Analyzing audio file: {audio_path}")

        audio = AudioSegment.from_file(audio_path)
        total_duration_seconds = len(audio) / 1000.0

        logger.debug(
            f"Audio duration: {total_duration_seconds:.2f}s, "
            f"channels: {audio.channels}, "
            f"sample_rate: {audio.frame_rate}"
        )

        # Down-mix so every window is measured on a single channel.
        if audio.channels > 1:
            audio = audio.set_channels(1)

        audible_windows = self._find_audible_windows(audio)
        raw_segments = self._windows_to_segments(audible_windows, audio)
        segments = self._merge_and_filter_segments(raw_segments)

        total_audible_seconds = sum(seg.duration_seconds for seg in segments)
        # Guard against zero-length audio, which would divide by zero.
        audible_percentage = (
            total_audible_seconds / total_duration_seconds * 100
            if total_duration_seconds > 0 else 0.0
        )
        has_audible_content = bool(segments)

        recommended_selection = self._get_recommendation(
            has_audible_content,
            segments,
            audible_percentage,
        )

        logger.info(
            f"Analysis complete: {len(segments)} segments, "
            f"{audible_percentage:.1f}% audible, "
            f"recommendation: {recommended_selection.value}"
        )

        return AnalysisResult(
            has_audible_content=has_audible_content,
            total_duration_seconds=total_duration_seconds,
            audible_segments=segments,
            recommended_selection=recommended_selection,
            silence_threshold_db=self.silence_threshold_db,
            total_audible_duration_seconds=total_audible_seconds,
            audible_percentage=audible_percentage,
        )

    def get_amplitude_envelope(
        self,
        audio_path: str,
        window_ms: int = 100,
        normalize: bool = True,
    ) -> List[float]:
        """
        Get the amplitude envelope for waveform visualization.

        Each returned value is the RMS level of one window of the
        mono down-mix of the audio.

        Args:
            audio_path: Path to the audio file.
            window_ms: Size of each window in milliseconds; must be
                positive. Default is 100ms.
            normalize: If True, map levels linearly from
                ``[silence_threshold_db, 0 dBFS]`` onto ``[0.0, 1.0]`` and
                clamp values outside that range. Default is True.

        Returns:
            One float per window. With ``normalize=True`` the values lie in
            [0.0, 1.0]; otherwise they are levels in dBFS, with -100.0 used
            for windows of digital silence.

        Raises:
            FileNotFoundError: If the audio file doesn't exist.
            ValueError: If ``window_ms`` is not positive.
        """
        if not Path(audio_path).exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        window_ms = self._check_window_ms(window_ms)

        audio = AudioSegment.from_file(audio_path)
        if audio.channels > 1:
            audio = audio.set_channels(1)

        duration_ms = len(audio)
        levels_db = []
        for start_ms in range(0, duration_ms, window_ms):
            window = audio[start_ms:min(start_ms + window_ms, duration_ms)]
            levels_db.append(
                self._rms_db(window.rms, window.max_possible_amplitude)
            )

        if not normalize:
            return levels_db

        floor_db = self.silence_threshold_db
        return [self._normalize_db(db, floor_db) for db in levels_db]

    @staticmethod
    def _check_window_ms(window_ms: int) -> int:
        """Return ``window_ms`` unchanged, or raise ValueError if it is not positive."""
        if window_ms <= 0:
            raise ValueError(
                f"window_ms must be a positive number of milliseconds, "
                f"got {window_ms!r}"
            )
        return window_ms

    @classmethod
    def _rms_db(cls, rms: float, full_scale: float) -> float:
        """Convert an RMS amplitude to dB relative to ``full_scale``.

        A non-positive RMS has no logarithm, so it is reported as the
        silent floor (-100.0 dB) instead.
        """
        if rms > 0:
            return 20 * math.log10(rms / full_scale)
        return cls._SILENT_DB

    @staticmethod
    def _normalize_db(db: float, floor_db: float) -> float:
        """Map ``db`` linearly from ``[floor_db, 0]`` onto ``[0.0, 1.0]``, clamped.

        When ``floor_db`` is exactly 0 the range is empty and the linear
        formula would divide by zero; in that case the result is a step:
        1.0 for levels at or above the floor, 0.0 below it.
        """
        span = 0.0 - floor_db
        if span == 0:
            return 1.0 if db >= floor_db else 0.0
        return max(0.0, min(1.0, (db - floor_db) / span))

    def _find_audible_windows(
        self,
        audio: "AudioSegment"
    ) -> List[Tuple[int, float, float]]:
        """
        Find windows whose RMS level is above the silence threshold.

        Args:
            audio: The audio to scan, sliced in steps of ``self.window_ms``.

        Returns:
            A list of tuples ``(start_ms, avg_db, peak_db)``, one per
            audible window, in ascending start order.

        Raises:
            ValueError: If ``self.window_ms`` is not positive.
        """
        window_ms = self._check_window_ms(self.window_ms)
        duration_ms = len(audio)
        audible_windows = []

        for start_ms in range(0, duration_ms, window_ms):
            window = audio[start_ms:min(start_ms + window_ms, duration_ms)]

            # Read the RMS once; the silent check and the conversion both use it.
            rms = window.rms
            avg_db = self._rms_db(rms, window.max_possible_amplitude)
            if rms > 0:
                # True peak of the window. ``dBFS`` would just repeat the
                # RMS level; fall back to it only if no peak is available.
                peak_db = getattr(window, "max_dBFS", avg_db)
            else:
                peak_db = self._SILENT_DB

            if avg_db > self.silence_threshold_db:
                audible_windows.append((start_ms, avg_db, peak_db))

        return audible_windows

    def _windows_to_segments(
        self,
        audible_windows: List[Tuple[int, float, float]],
        audio: "AudioSegment"
    ) -> List["AudibleSegment"]:
        """
        Convert a list of audible windows into contiguous segments.

        Consecutive windows are joined while the gap between them is at
        most one window length, so a single silent window does not split a
        segment.

        Args:
            audible_windows: ``(start_ms, avg_db, peak_db)`` tuples in
                ascending start order.
            audio: The analyzed audio; its length caps segment end times
                so the final, possibly shorter, window cannot extend past
                the end of the audio.

        Returns:
            The segments in ascending start order; empty if there are no
            audible windows.
        """
        if not audible_windows:
            return []

        duration_ms = len(audio)
        window_ms = self.window_ms

        def window_end(start_ms: int) -> int:
            return min(start_ms + window_ms, duration_ms)

        segments = []
        segment_start_ms, first_avg_db, first_peak_db = audible_windows[0]
        segment_dbs = [first_avg_db]
        segment_peaks = [first_peak_db]
        segment_end_ms = window_end(segment_start_ms)

        for start_ms, avg_db, peak_db in audible_windows[1:]:
            if start_ms - segment_end_ms <= window_ms:
                # Close enough to the running segment: extend it.
                segment_dbs.append(avg_db)
                segment_peaks.append(peak_db)
            else:
                # Gap too large: close the running segment, start a new one.
                segments.append(self._create_segment(
                    segment_start_ms, segment_end_ms, segment_dbs, segment_peaks
                ))
                segment_start_ms = start_ms
                segment_dbs = [avg_db]
                segment_peaks = [peak_db]
            segment_end_ms = window_end(start_ms)

        # The loop never closes the segment that is still open at the end.
        segments.append(self._create_segment(
            segment_start_ms, segment_end_ms, segment_dbs, segment_peaks
        ))

        return segments

    def _create_segment(
        self,
        start_ms: int,
        end_ms: int,
        dbs: List[float],
        peaks: List[float]
    ) -> "AudibleSegment":
        """Create an AudibleSegment from window data.

        Times are converted from milliseconds to seconds. The average level
        is the plain mean of ``dbs`` and the peak is the maximum of
        ``peaks``; either falls back to -100.0 when its list is empty.
        """
        return AudibleSegment(
            start_seconds=start_ms / 1000.0,
            end_seconds=end_ms / 1000.0,
            duration_seconds=(end_ms - start_ms) / 1000.0,
            avg_amplitude_db=sum(dbs) / len(dbs) if dbs else self._SILENT_DB,
            peak_amplitude_db=max(peaks) if peaks else self._SILENT_DB,
        )

    def _merge_and_filter_segments(
        self,
        segments: List["AudibleSegment"]
    ) -> List["AudibleSegment"]:
        """
        Merge segments that are close together and filter out short ones.

        Segments are sorted by start time, neighbours whose gap is at most
        ``self.merge_gap_ms`` are fused, and fused segments shorter than
        ``self.min_segment_duration_ms`` are then discarded.

        Args:
            segments: Segments to process, in any order.

        Returns:
            The merged and filtered segments in ascending start order.
        """
        if not segments:
            return []

        ordered = sorted(segments, key=lambda s: s.start_seconds)

        merged = []
        current = ordered[0]
        for next_seg in ordered[1:]:
            gap_ms = (next_seg.start_seconds - current.end_seconds) * 1000
            if gap_ms > self.merge_gap_ms:
                merged.append(current)
                current = next_seg
                continue

            # Average level weighted by how long each part was audible;
            # the silent gap between the parts carries no weight.
            audible_seconds = (
                current.duration_seconds + next_seg.duration_seconds
            )
            if audible_seconds > 0:
                weighted_avg_db = (
                    current.avg_amplitude_db * current.duration_seconds
                    + next_seg.avg_amplitude_db * next_seg.duration_seconds
                ) / audible_seconds
            else:
                weighted_avg_db = self._SILENT_DB

            # max() keeps the end from moving backwards if the next segment
            # lies entirely inside the current one.
            end_seconds = max(current.end_seconds, next_seg.end_seconds)
            current = AudibleSegment(
                start_seconds=current.start_seconds,
                end_seconds=end_seconds,
                duration_seconds=end_seconds - current.start_seconds,
                avg_amplitude_db=weighted_avg_db,
                peak_amplitude_db=max(
                    current.peak_amplitude_db,
                    next_seg.peak_amplitude_db,
                ),
            )

        merged.append(current)

        min_duration_seconds = self.min_segment_duration_ms / 1000.0
        return [
            seg for seg in merged
            if seg.duration_seconds >= min_duration_seconds
        ]

    def _get_recommendation(
        self,
        has_audible_content: bool,
        segments: List["AudibleSegment"],
        audible_percentage: float
    ) -> "RecommendedSelection":
        """
        Determine the recommended instrumental selection.

        Logic:
        - No audible content: recommend clean.
        - Audible content covers more than 20% of the audio: recommend
          review.
        - Any segment reports ``is_loud``: recommend review.
        - Otherwise: minimal content, recommend clean.
        """
        if not has_audible_content:
            return RecommendedSelection.CLEAN

        if audible_percentage > 20.0:
            return RecommendedSelection.REVIEW_NEEDED

        if any(seg.is_loud for seg in segments):
            return RecommendedSelection.REVIEW_NEEDED

        return RecommendedSelection.CLEAN