@siteed/audio-studio 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +535 -0
- package/LICENSE +21 -0
- package/README.md +167 -0
- package/android/build.gradle +143 -0
- package/android/src/androidTest/assets/chorus.wav +0 -0
- package/android/src/androidTest/assets/jfk.wav +0 -0
- package/android/src/androidTest/assets/osr_us_000_0010_8k.wav +0 -0
- package/android/src/androidTest/assets/recorder_hello_world.wav +0 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/AudioProcessorInstrumentedTest.kt +197 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/AudioRecorderInstrumentedTest.kt +541 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/AudioRecorderPerformanceInstrumentedTest.kt +234 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/AudioFocusStrategyIntegrationTest.kt +332 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/BufferDurationIntegrationTest.kt +324 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/CompressedOnlyOutputTest.kt +253 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/DeviceDisconnectionFallbackTest.kt +218 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/EventEmissionIntervalTest.kt +120 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/M4aFormatTest.kt +345 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/OutputControlIntegrationTest.kt +340 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/PcmStreamingDurationTest.kt +252 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/README.md +95 -0
- package/android/src/androidTest/java/net/siteed/audiostudio/integration/run_integration_tests.sh +43 -0
- package/android/src/main/AndroidManifest.xml +30 -0
- package/android/src/main/CMakeLists.txt +29 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioAnalysisData.kt +188 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioDataEncoder.kt +9 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioDeviceManager.kt +1741 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioFeaturesNative.kt +26 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioFileHandler.kt +136 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioFormatUtils.kt +354 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioNotificationsManager.kt +439 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioProcessor.kt +2237 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioRecorderManager.kt +2163 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioRecordingService.kt +167 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioStudioModule.kt +1112 -0
- package/android/src/main/java/net/siteed/audiostudio/AudioTrimmer.kt +1099 -0
- package/android/src/main/java/net/siteed/audiostudio/Constants.kt +37 -0
- package/android/src/main/java/net/siteed/audiostudio/EventSender.kt +7 -0
- package/android/src/main/java/net/siteed/audiostudio/FFT.kt +100 -0
- package/android/src/main/java/net/siteed/audiostudio/Features.kt +98 -0
- package/android/src/main/java/net/siteed/audiostudio/LogUtils.kt +93 -0
- package/android/src/main/java/net/siteed/audiostudio/MelSpectrogramNative.kt +36 -0
- package/android/src/main/java/net/siteed/audiostudio/NotificationConfig.kt +72 -0
- package/android/src/main/java/net/siteed/audiostudio/PermissionUtils.kt +68 -0
- package/android/src/main/java/net/siteed/audiostudio/RecordingActionReceiver.kt +59 -0
- package/android/src/main/java/net/siteed/audiostudio/RecordingConfig.kt +259 -0
- package/android/src/main/java/net/siteed/audiostudio/WaveformConfig.kt +19 -0
- package/android/src/main/java/net/siteed/audiostudio/WaveformRenderer.kt +159 -0
- package/android/src/main/jni/AudioFeaturesJNI.cpp +152 -0
- package/android/src/main/jni/MelSpectrogramJNI.cpp +165 -0
- package/android/src/main/res/drawable/ic_default_action_icon.xml +16 -0
- package/android/src/main/res/drawable/ic_microphone.xml +13 -0
- package/android/src/main/res/drawable/ic_pause.xml +10 -0
- package/android/src/main/res/drawable/ic_play.xml +10 -0
- package/android/src/main/res/drawable/ic_stop.xml +10 -0
- package/android/src/main/res/layout/notification_recording.xml +37 -0
- package/android/src/test/java/net/siteed/audiostudio/AudioFileHandlerTest.kt +279 -0
- package/android/src/test/java/net/siteed/audiostudio/AudioFocusStrategyTest.kt +249 -0
- package/android/src/test/java/net/siteed/audiostudio/AudioFormatTest.kt +151 -0
- package/android/src/test/java/net/siteed/audiostudio/AudioFormatUtilsTest.kt +273 -0
- package/android/src/test/java/net/siteed/audiostudio/DeviceDisconnectionFallbackUnitTest.kt +140 -0
- package/android/src/test/resources/chorus.wav +0 -0
- package/android/src/test/resources/generate_test_audio.py +94 -0
- package/android/src/test/resources/jfk.wav +0 -0
- package/android/src/test/resources/osr_us_000_0010_8k.wav +0 -0
- package/android/src/test/resources/recorder_hello_world.wav +0 -0
- package/app.plugin.js +3 -0
- package/build/cjs/AudioAnalysis/AudioAnalysis.types.js +4 -0
- package/build/cjs/AudioAnalysis/AudioAnalysis.types.js.map +1 -0
- package/build/cjs/AudioAnalysis/audioFeaturesWasm.js +164 -0
- package/build/cjs/AudioAnalysis/audioFeaturesWasm.js.map +1 -0
- package/build/cjs/AudioAnalysis/extractAudioAnalysis.js +213 -0
- package/build/cjs/AudioAnalysis/extractAudioAnalysis.js.map +1 -0
- package/build/cjs/AudioAnalysis/extractAudioData.js +21 -0
- package/build/cjs/AudioAnalysis/extractAudioData.js.map +1 -0
- package/build/cjs/AudioAnalysis/extractMelSpectrogram.js +90 -0
- package/build/cjs/AudioAnalysis/extractMelSpectrogram.js.map +1 -0
- package/build/cjs/AudioAnalysis/extractPreview.js +28 -0
- package/build/cjs/AudioAnalysis/extractPreview.js.map +1 -0
- package/build/cjs/AudioAnalysis/extractWaveform.js +18 -0
- package/build/cjs/AudioAnalysis/extractWaveform.js.map +1 -0
- package/build/cjs/AudioAnalysis/melSpectrogramWasm.js +149 -0
- package/build/cjs/AudioAnalysis/melSpectrogramWasm.js.map +1 -0
- package/build/cjs/AudioDeviceManager.js +688 -0
- package/build/cjs/AudioDeviceManager.js.map +1 -0
- package/build/cjs/AudioRecorder.provider.js +78 -0
- package/build/cjs/AudioRecorder.provider.js.map +1 -0
- package/build/cjs/AudioStudio.native.js +8 -0
- package/build/cjs/AudioStudio.native.js.map +1 -0
- package/build/cjs/AudioStudio.types.js +11 -0
- package/build/cjs/AudioStudio.types.js.map +1 -0
- package/build/cjs/AudioStudio.web.js +708 -0
- package/build/cjs/AudioStudio.web.js.map +1 -0
- package/build/cjs/AudioStudioModule.js +718 -0
- package/build/cjs/AudioStudioModule.js.map +1 -0
- package/build/cjs/WebRecorder.web.js +865 -0
- package/build/cjs/WebRecorder.web.js.map +1 -0
- package/build/cjs/constants/platformLimitations.js +99 -0
- package/build/cjs/constants/platformLimitations.js.map +1 -0
- package/build/cjs/constants.js +20 -0
- package/build/cjs/constants.js.map +1 -0
- package/build/cjs/events.js +29 -0
- package/build/cjs/events.js.map +1 -0
- package/build/cjs/hooks/useAudioDevices.js +179 -0
- package/build/cjs/hooks/useAudioDevices.js.map +1 -0
- package/build/cjs/index.js +64 -0
- package/build/cjs/index.js.map +1 -0
- package/build/cjs/trimAudio.js +76 -0
- package/build/cjs/trimAudio.js.map +1 -0
- package/build/cjs/useAudioRecorder.js +535 -0
- package/build/cjs/useAudioRecorder.js.map +1 -0
- package/build/cjs/utils/BlobFix.js +502 -0
- package/build/cjs/utils/BlobFix.js.map +1 -0
- package/build/cjs/utils/audioProcessing.js +136 -0
- package/build/cjs/utils/audioProcessing.js.map +1 -0
- package/build/cjs/utils/cleanNativeOptions.js +22 -0
- package/build/cjs/utils/cleanNativeOptions.js.map +1 -0
- package/build/cjs/utils/concatenateBuffers.js +25 -0
- package/build/cjs/utils/concatenateBuffers.js.map +1 -0
- package/build/cjs/utils/convertPCMToFloat32.js +124 -0
- package/build/cjs/utils/convertPCMToFloat32.js.map +1 -0
- package/build/cjs/utils/crc32.js +52 -0
- package/build/cjs/utils/crc32.js.map +1 -0
- package/build/cjs/utils/encodingToBitDepth.js +17 -0
- package/build/cjs/utils/encodingToBitDepth.js.map +1 -0
- package/build/cjs/utils/getWavFileInfo.js +96 -0
- package/build/cjs/utils/getWavFileInfo.js.map +1 -0
- package/build/cjs/utils/writeWavHeader.js +88 -0
- package/build/cjs/utils/writeWavHeader.js.map +1 -0
- package/build/cjs/workers/InlineFeaturesExtractor.web.js +294 -0
- package/build/cjs/workers/InlineFeaturesExtractor.web.js.map +1 -0
- package/build/cjs/workers/inlineAudioWebWorker.web.js +190 -0
- package/build/cjs/workers/inlineAudioWebWorker.web.js.map +1 -0
- package/build/cjs/workers/wasmGlueString.web.js +27 -0
- package/build/cjs/workers/wasmGlueString.web.js.map +1 -0
- package/build/esm/AudioAnalysis/AudioAnalysis.types.js +3 -0
- package/build/esm/AudioAnalysis/AudioAnalysis.types.js.map +1 -0
- package/build/esm/AudioAnalysis/audioFeaturesWasm.js +126 -0
- package/build/esm/AudioAnalysis/audioFeaturesWasm.js.map +1 -0
- package/build/esm/AudioAnalysis/extractAudioAnalysis.js +205 -0
- package/build/esm/AudioAnalysis/extractAudioAnalysis.js.map +1 -0
- package/build/esm/AudioAnalysis/extractAudioData.js +14 -0
- package/build/esm/AudioAnalysis/extractAudioData.js.map +1 -0
- package/build/esm/AudioAnalysis/extractMelSpectrogram.js +86 -0
- package/build/esm/AudioAnalysis/extractMelSpectrogram.js.map +1 -0
- package/build/esm/AudioAnalysis/extractPreview.js +25 -0
- package/build/esm/AudioAnalysis/extractPreview.js.map +1 -0
- package/build/esm/AudioAnalysis/extractWaveform.js +11 -0
- package/build/esm/AudioAnalysis/extractWaveform.js.map +1 -0
- package/build/esm/AudioAnalysis/melSpectrogramWasm.js +111 -0
- package/build/esm/AudioAnalysis/melSpectrogramWasm.js.map +1 -0
- package/build/esm/AudioDeviceManager.js +681 -0
- package/build/esm/AudioDeviceManager.js.map +1 -0
- package/build/esm/AudioRecorder.provider.js +40 -0
- package/build/esm/AudioRecorder.provider.js.map +1 -0
- package/build/esm/AudioStudio.native.js +6 -0
- package/build/esm/AudioStudio.native.js.map +1 -0
- package/build/esm/AudioStudio.types.js +8 -0
- package/build/esm/AudioStudio.types.js.map +1 -0
- package/build/esm/AudioStudio.web.js +704 -0
- package/build/esm/AudioStudio.web.js.map +1 -0
- package/build/esm/AudioStudioModule.js +713 -0
- package/build/esm/AudioStudioModule.js.map +1 -0
- package/build/esm/WebRecorder.web.js +861 -0
- package/build/esm/WebRecorder.web.js.map +1 -0
- package/build/esm/constants/platformLimitations.js +90 -0
- package/build/esm/constants/platformLimitations.js.map +1 -0
- package/build/esm/constants.js +17 -0
- package/build/esm/constants.js.map +1 -0
- package/build/esm/events.js +21 -0
- package/build/esm/events.js.map +1 -0
- package/build/esm/hooks/useAudioDevices.js +176 -0
- package/build/esm/hooks/useAudioDevices.js.map +1 -0
- package/build/esm/index.js +23 -0
- package/build/esm/index.js.map +1 -0
- package/build/esm/trimAudio.js +69 -0
- package/build/esm/trimAudio.js.map +1 -0
- package/build/esm/useAudioRecorder.js +529 -0
- package/build/esm/useAudioRecorder.js.map +1 -0
- package/build/esm/utils/BlobFix.js +498 -0
- package/build/esm/utils/BlobFix.js.map +1 -0
- package/build/esm/utils/audioProcessing.js +133 -0
- package/build/esm/utils/audioProcessing.js.map +1 -0
- package/build/esm/utils/cleanNativeOptions.js +19 -0
- package/build/esm/utils/cleanNativeOptions.js.map +1 -0
- package/build/esm/utils/concatenateBuffers.js +21 -0
- package/build/esm/utils/concatenateBuffers.js.map +1 -0
- package/build/esm/utils/convertPCMToFloat32.js +120 -0
- package/build/esm/utils/convertPCMToFloat32.js.map +1 -0
- package/build/esm/utils/crc32.js +50 -0
- package/build/esm/utils/crc32.js.map +1 -0
- package/build/esm/utils/encodingToBitDepth.js +13 -0
- package/build/esm/utils/encodingToBitDepth.js.map +1 -0
- package/build/esm/utils/getWavFileInfo.js +92 -0
- package/build/esm/utils/getWavFileInfo.js.map +1 -0
- package/build/esm/utils/writeWavHeader.js +84 -0
- package/build/esm/utils/writeWavHeader.js.map +1 -0
- package/build/esm/workers/InlineFeaturesExtractor.web.js +291 -0
- package/build/esm/workers/InlineFeaturesExtractor.web.js.map +1 -0
- package/build/esm/workers/inlineAudioWebWorker.web.js +187 -0
- package/build/esm/workers/inlineAudioWebWorker.web.js.map +1 -0
- package/build/esm/workers/wasmGlueString.web.js +24 -0
- package/build/esm/workers/wasmGlueString.web.js.map +1 -0
- package/build/types/AudioAnalysis/AudioAnalysis.types.d.ts +198 -0
- package/build/types/AudioAnalysis/AudioAnalysis.types.d.ts.map +1 -0
- package/build/types/AudioAnalysis/audioFeaturesWasm.d.ts +24 -0
- package/build/types/AudioAnalysis/audioFeaturesWasm.d.ts.map +1 -0
- package/build/types/AudioAnalysis/extractAudioAnalysis.d.ts +74 -0
- package/build/types/AudioAnalysis/extractAudioAnalysis.d.ts.map +1 -0
- package/build/types/AudioAnalysis/extractAudioData.d.ts +3 -0
- package/build/types/AudioAnalysis/extractAudioData.d.ts.map +1 -0
- package/build/types/AudioAnalysis/extractMelSpectrogram.d.ts +20 -0
- package/build/types/AudioAnalysis/extractMelSpectrogram.d.ts.map +1 -0
- package/build/types/AudioAnalysis/extractPreview.d.ts +11 -0
- package/build/types/AudioAnalysis/extractPreview.d.ts.map +1 -0
- package/build/types/AudioAnalysis/extractWaveform.d.ts +8 -0
- package/build/types/AudioAnalysis/extractWaveform.d.ts.map +1 -0
- package/build/types/AudioAnalysis/melSpectrogramWasm.d.ts +16 -0
- package/build/types/AudioAnalysis/melSpectrogramWasm.d.ts.map +1 -0
- package/build/types/AudioDeviceManager.d.ts +187 -0
- package/build/types/AudioDeviceManager.d.ts.map +1 -0
- package/build/types/AudioRecorder.provider.d.ts +11 -0
- package/build/types/AudioRecorder.provider.d.ts.map +1 -0
- package/build/types/AudioStudio.native.d.ts +3 -0
- package/build/types/AudioStudio.native.d.ts.map +1 -0
- package/build/types/AudioStudio.types.d.ts +760 -0
- package/build/types/AudioStudio.types.d.ts.map +1 -0
- package/build/types/AudioStudio.web.d.ts +96 -0
- package/build/types/AudioStudio.web.d.ts.map +1 -0
- package/build/types/AudioStudioModule.d.ts +3 -0
- package/build/types/AudioStudioModule.d.ts.map +1 -0
- package/build/types/WebRecorder.web.d.ts +208 -0
- package/build/types/WebRecorder.web.d.ts.map +1 -0
- package/build/types/constants/platformLimitations.d.ts +40 -0
- package/build/types/constants/platformLimitations.d.ts.map +1 -0
- package/build/types/constants.d.ts +14 -0
- package/build/types/constants.d.ts.map +1 -0
- package/build/types/events.d.ts +29 -0
- package/build/types/events.d.ts.map +1 -0
- package/build/types/hooks/useAudioDevices.d.ts +15 -0
- package/build/types/hooks/useAudioDevices.d.ts.map +1 -0
- package/build/types/index.d.ts +21 -0
- package/build/types/index.d.ts.map +1 -0
- package/build/types/trimAudio.d.ts +25 -0
- package/build/types/trimAudio.d.ts.map +1 -0
- package/build/types/useAudioRecorder.d.ts +22 -0
- package/build/types/useAudioRecorder.d.ts.map +1 -0
- package/build/types/utils/BlobFix.d.ts +9 -0
- package/build/types/utils/BlobFix.d.ts.map +1 -0
- package/build/types/utils/audioProcessing.d.ts +24 -0
- package/build/types/utils/audioProcessing.d.ts.map +1 -0
- package/build/types/utils/cleanNativeOptions.d.ts +15 -0
- package/build/types/utils/cleanNativeOptions.d.ts.map +1 -0
- package/build/types/utils/concatenateBuffers.d.ts +8 -0
- package/build/types/utils/concatenateBuffers.d.ts.map +1 -0
- package/build/types/utils/convertPCMToFloat32.d.ts +13 -0
- package/build/types/utils/convertPCMToFloat32.d.ts.map +1 -0
- package/build/types/utils/crc32.d.ts +7 -0
- package/build/types/utils/crc32.d.ts.map +1 -0
- package/build/types/utils/encodingToBitDepth.d.ts +5 -0
- package/build/types/utils/encodingToBitDepth.d.ts.map +1 -0
- package/build/types/utils/getWavFileInfo.d.ts +26 -0
- package/build/types/utils/getWavFileInfo.d.ts.map +1 -0
- package/build/types/utils/writeWavHeader.d.ts +34 -0
- package/build/types/utils/writeWavHeader.d.ts.map +1 -0
- package/build/types/workers/InlineFeaturesExtractor.web.d.ts +2 -0
- package/build/types/workers/InlineFeaturesExtractor.web.d.ts.map +1 -0
- package/build/types/workers/inlineAudioWebWorker.web.d.ts +2 -0
- package/build/types/workers/inlineAudioWebWorker.web.d.ts.map +1 -0
- package/build/types/workers/wasmGlueString.web.d.ts +2 -0
- package/build/types/workers/wasmGlueString.web.d.ts.map +1 -0
- package/cpp/AudioFeatures.cpp +274 -0
- package/cpp/AudioFeatures.h +85 -0
- package/cpp/AudioFeaturesBridge.cpp +146 -0
- package/cpp/AudioFeaturesBridge.h +47 -0
- package/cpp/MelSpectrogram.cpp +227 -0
- package/cpp/MelSpectrogram.h +82 -0
- package/cpp/MelSpectrogramBridge.cpp +112 -0
- package/cpp/MelSpectrogramBridge.h +33 -0
- package/cpp/kiss_fft/COPYING +11 -0
- package/cpp/kiss_fft/_kiss_fft_guts.h +167 -0
- package/cpp/kiss_fft/kiss_fft.c +424 -0
- package/cpp/kiss_fft/kiss_fft.h +160 -0
- package/cpp/kiss_fft/kiss_fft_log.h +36 -0
- package/cpp/kiss_fft/kiss_fftr.c +155 -0
- package/cpp/kiss_fft/kiss_fftr.h +54 -0
- package/expo-module.config.json +10 -0
- package/ios/AudioAnalysisData.swift +74 -0
- package/ios/AudioDeviceManager.swift +670 -0
- package/ios/AudioFeaturesWrapper.h +21 -0
- package/ios/AudioFeaturesWrapper.mm +63 -0
- package/ios/AudioNotificationManager.swift +154 -0
- package/ios/AudioProcessingHelpers.swift +797 -0
- package/ios/AudioProcessor.swift +1191 -0
- package/ios/AudioStreamError.swift +7 -0
- package/ios/AudioStreamManager.swift +2369 -0
- package/ios/AudioStreamManagerDelegate.swift +16 -0
- package/ios/AudioStudio.podspec +39 -0
- package/ios/AudioStudioModule.swift +1111 -0
- package/ios/AudioStudioTests/AudioFileHandlerTests.swift +338 -0
- package/ios/AudioStudioTests/AudioFormatUtilsTests.swift +331 -0
- package/ios/AudioStudioTests/AudioTestHelpers.swift +130 -0
- package/ios/AudioStudioTests/CompressedOnlyOutputTests.swift +294 -0
- package/ios/AudioStudioTests/EventEmissionIntervalTests.swift +105 -0
- package/ios/AudioStudioTests/Info.plist +22 -0
- package/ios/AudioStudioTests/README.md +39 -0
- package/ios/AudioStudioTests/SimpleAudioTest.swift +98 -0
- package/ios/AudioStudioTests/TestAudioGenerator.swift +75 -0
- package/ios/DataPoint.swift +54 -0
- package/ios/DecodingConfig.swift +59 -0
- package/ios/FFT.swift +62 -0
- package/ios/Features.swift +95 -0
- package/ios/ISSUE_IOS.md +68 -0
- package/ios/Logger.swift +39 -0
- package/ios/MelSpectrogramWrapper.h +30 -0
- package/ios/MelSpectrogramWrapper.mm +97 -0
- package/ios/NotificationExtension.swift +15 -0
- package/ios/RecordingResult.swift +22 -0
- package/ios/RecordingSettings.swift +311 -0
- package/ios/WaveformExtractor.swift +105 -0
- package/ios/tests/README.md +41 -0
- package/ios/tests/integration/buffer_and_fallback_test.swift +178 -0
- package/ios/tests/integration/buffer_duration_test.swift +185 -0
- package/ios/tests/integration/compressed_only_output_test.swift +271 -0
- package/ios/tests/integration/output_control_test.swift +322 -0
- package/ios/tests/integration/run_integration_tests.sh +37 -0
- package/ios/tests/opus_support_test_macos.swift +154 -0
- package/ios/tests/standalone/audio_processing_test.swift +144 -0
- package/ios/tests/standalone/audio_recording_test.swift +277 -0
- package/ios/tests/standalone/audio_streaming_test.swift +249 -0
- package/ios/tests/standalone/standalone_test.swift +144 -0
- package/package.json +146 -0
- package/plugin/build/index.cjs +194 -0
- package/plugin/build/index.d.cts +22 -0
- package/plugin/build/index.js +194 -0
- package/plugin/src/index.ts +285 -0
- package/plugin/tsconfig.json +10 -0
- package/plugin/tsconfig.tsbuildinfo +1 -0
- package/prebuilt/wasm/mel-spectrogram.js +18 -0
- package/src/AudioAnalysis/AudioAnalysis.types.ts +226 -0
- package/src/AudioAnalysis/audio-features-wasm.d.ts +37 -0
- package/src/AudioAnalysis/audioFeaturesWasm.ts +200 -0
- package/src/AudioAnalysis/extractAudioAnalysis.ts +350 -0
- package/src/AudioAnalysis/extractAudioData.ts +17 -0
- package/src/AudioAnalysis/extractMelSpectrogram.ts +140 -0
- package/src/AudioAnalysis/extractPreview.ts +34 -0
- package/src/AudioAnalysis/extractWaveform.ts +22 -0
- package/src/AudioAnalysis/mel-spectrogram-wasm.d.ts +48 -0
- package/src/AudioAnalysis/melSpectrogramWasm.ts +179 -0
- package/src/AudioDeviceManager.ts +800 -0
- package/src/AudioRecorder.provider.tsx +57 -0
- package/src/AudioStudio.native.ts +6 -0
- package/src/AudioStudio.types.ts +899 -0
- package/src/AudioStudio.web.ts +911 -0
- package/src/AudioStudioModule.ts +984 -0
- package/src/WebRecorder.web.ts +1114 -0
- package/src/constants/platformLimitations.ts +118 -0
- package/src/constants.ts +21 -0
- package/src/events.ts +63 -0
- package/src/hooks/useAudioDevices.ts +213 -0
- package/src/index.ts +67 -0
- package/src/trimAudio.ts +94 -0
- package/src/types/crc-32.d.ts +9 -0
- package/src/useAudioRecorder.tsx +784 -0
- package/src/utils/BlobFix.ts +561 -0
- package/src/utils/audioProcessing.ts +205 -0
- package/src/utils/cleanNativeOptions.ts +18 -0
- package/src/utils/concatenateBuffers.ts +24 -0
- package/src/utils/convertPCMToFloat32.ts +170 -0
- package/src/utils/crc32.ts +59 -0
- package/src/utils/encodingToBitDepth.ts +18 -0
- package/src/utils/getWavFileInfo.ts +132 -0
- package/src/utils/writeWavHeader.ts +115 -0
- package/src/workers/InlineFeaturesExtractor.web.tsx +291 -0
- package/src/workers/inlineAudioWebWorker.web.tsx +186 -0
- package/src/workers/wasmGlueString.web.ts +23 -0
|
@@ -0,0 +1,2237 @@
|
|
|
1
|
+
package net.siteed.audiostudio
|
|
2
|
+
|
|
3
|
+
import java.nio.ByteBuffer
|
|
4
|
+
import java.nio.ByteOrder
|
|
5
|
+
import kotlin.math.*
|
|
6
|
+
import android.util.Log
|
|
7
|
+
import java.io.File
|
|
8
|
+
import java.util.concurrent.atomic.AtomicLong
|
|
9
|
+
import kotlin.system.measureTimeMillis
|
|
10
|
+
import android.media.MediaExtractor
|
|
11
|
+
import android.media.MediaFormat
|
|
12
|
+
import android.media.MediaCodec
|
|
13
|
+
import java.io.FileInputStream
|
|
14
|
+
import java.io.RandomAccessFile
|
|
15
|
+
import java.util.zip.CRC32
|
|
16
|
+
import net.siteed.audiostudio.LogUtils
|
|
17
|
+
|
|
18
|
+
/**
 * Options describing the PCM format requested when decoding audio.
 *
 * @property targetSampleRate optional target sample rate; `null` leaves it unspecified.
 * @property targetChannels optional target number of channels; `null` leaves it unspecified.
 * @property targetBitDepth target PCM bit depth; defaults to 16-bit.
 * @property normalizeAudio whether audio levels should be normalized; off by default.
 */
data class DecodingConfig(
    val targetSampleRate: Int? = null,
    val targetChannels: Int? = null,
    val targetBitDepth: Int = 16,
    val normalizeAudio: Boolean = false,
)
|
|
24
|
+
|
|
25
|
+
/**
 * A spectrogram together with the axes needed to interpret it.
 *
 * Because every property is an array, the compiler-generated `equals` would compare
 * them by reference. [equals] and [hashCode] are overridden so that two instances
 * holding the same values compare equal.
 *
 * @property spectrogram 2D array indexed as `[time, frequency]`.
 * @property timeStamps time (in seconds) for each frame.
 * @property frequencies frequencies (in Hz) for each mel bin.
 */
data class SpectrogramData(
    val spectrogram: Array<FloatArray>,
    val timeStamps: FloatArray,
    val frequencies: FloatArray
) {
    /** Content-based equality over all three arrays. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is SpectrogramData) return false
        return spectrogram.contentDeepEquals(other.spectrogram) &&
            timeStamps.contentEquals(other.timeStamps) &&
            frequencies.contentEquals(other.frequencies)
    }

    /** Content-based hash, consistent with [equals]. */
    override fun hashCode(): Int {
        var result = spectrogram.contentDeepHashCode()
        result = 31 * result + timeStamps.contentHashCode()
        result = 31 * result + frequencies.contentHashCode()
        return result
    }
}
|
|
30
|
+
|
|
31
|
+
class AudioProcessor(private val filesDir: File) {
|
|
32
|
+
companion object {
    // NOTE(review): presumably the divisor applied under a square root in a DCT
    // normalisation step; its use is outside this view — confirm.
    const val DCT_SQRT_DIVISOR = 2.0

    // NOTE(review): presumably the FFT window size and the number of chroma bins;
    // their uses are outside this view — confirm.
    private const val N_FFT = 1024
    private const val N_CHROMA = 12

    // Tag passed to LogUtils for every log line emitted by this class.
    private const val CLASS_NAME = "AudioProcessor"

    // Lives on the companion (not the instance) so the value is maintained
    // during pause/resume cycles.
    private val uniqueIdCounter = AtomicLong(0L)

    /** Sets the shared unique-id counter back to zero. */
    fun resetUniqueIdCounter() = uniqueIdCounter.set(0L)
}
|
|
44
|
+
|
|
45
|
+
/**
 * Raw PCM audio plus the format information needed to interpret it.
 *
 * [data] is a `ByteArray`, which the compiler-generated `equals` would compare by
 * reference; [equals] and [hashCode] are overridden to compare the bytes themselves.
 *
 * @property data the PCM bytes.
 * @property sampleRate sample rate of [data].
 * @property bitDepth bits per sample.
 * @property channels number of channels.
 * @property durationMs duration in milliseconds; defaults to 0 when not supplied.
 */
data class AudioData(
    val data: ByteArray,
    val sampleRate: Int,
    val bitDepth: Int,
    val channels: Int,
    val durationMs: Long = 0
) {
    /** Content-based equality: the byte payload and every format field must match. */
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is AudioData) return false
        return sampleRate == other.sampleRate &&
            bitDepth == other.bitDepth &&
            channels == other.channels &&
            durationMs == other.durationMs &&
            data.contentEquals(other.data)
    }

    /** Content-based hash, consistent with [equals]. */
    override fun hashCode(): Int {
        var result = data.contentHashCode()
        result = 31 * result + sampleRate
        result = 31 * result + bitDepth
        result = 31 * result + channels
        result = 31 * result + durationMs.hashCode()
        return result
    }
}
|
|
46
|
+
|
|
47
|
+
// Running minimum amplitude; starts at Float.MAX_VALUE so any observed value is smaller.
// NOTE(review): the code that updates this is outside this view — confirm it accumulates across calls.
private var cumulativeMinAmplitude = Float.MAX_VALUE
|
|
48
|
+
// Running maximum amplitude; starts at negative infinity so any observed value is larger.
// NOTE(review): the code that updates this is outside this view — confirm it accumulates across calls.
private var cumulativeMaxAmplitude = Float.NEGATIVE_INFINITY
|
|
49
|
+
|
|
50
|
+
/**
 * Loads a PCM WAV file fully into memory.
 *
 * The path may carry a `file://` prefix. If it does not resolve to an existing file,
 * a file with the same name is looked up inside [filesDir].
 *
 * The RIFF chunk list is walked until the first `data` chunk: the `fmt ` chunk supplies
 * channel count, sample rate and bit depth, and any other chunk is skipped. If the
 * declared data size is non-positive or exceeds what the file holds, the remainder of
 * the file after the data header is used instead.
 *
 * @param filePath path or `file://` URI of the WAV file.
 * @return the audio bytes and format, or `null` when the file is missing, is not a
 *   RIFF/WAVE file, lacks a `fmt ` or `data` chunk, declares a bit depth other than
 *   8, 16 or 32, declares a non-positive channel count or sample rate, is too large
 *   to fit in a single array, or cannot be read.
 */
private fun loadAudioFile(filePath: String): AudioData? {
    try {
        val fileUri = filePath.removePrefix("file://")
        LogUtils.d(CLASS_NAME, "Processing WAV file: $fileUri")

        // Try the path as given first, then fall back to a same-named file inside filesDir.
        val file = File(fileUri).takeIf { it.exists() }
            ?: File(filesDir, File(fileUri).name).takeIf { it.exists() }
            ?: run {
                LogUtils.e(CLASS_NAME, "File not found: $fileUri")
                return null
            }

        // `use` closes the file on every exit path, including the early `return null`s below.
        return RandomAccessFile(file, "r").use { raf ->
            val fileSize = raf.length()

            // Read RIFF header
            val riffHeader = ByteArray(4).apply { raf.readFully(this) }
            if (String(riffHeader) != "RIFF") {
                LogUtils.e(CLASS_NAME, "Invalid RIFF header")
                return null
            }

            // Bytes 4..7 hold the RIFF chunk size; skip them so the next read lands on the form type.
            raf.skipBytes(4)

            // Read WAVE header
            val waveHeader = ByteArray(4).apply { raf.readFully(this) }
            if (String(waveHeader) != "WAVE") {
                LogUtils.e(CLASS_NAME, "Invalid WAVE header")
                return null
            }

            var fmtChunkFound = false
            var dataChunkFound = false
            var sampleRate = 0
            var channels = 0
            var bitDepth = 0
            var dataOffset = 0L
            var dataSize = 0L

            // Parse chunks: each one is a 4-byte id, a 4-byte little-endian size, then the body.
            chunks@ while (raf.filePointer < fileSize - 8) {
                val chunkId = ByteArray(4).apply { raf.readFully(this) }.toString(Charsets.UTF_8)
                val chunkSizeBytes = ByteArray(4).apply { raf.readFully(this) }
                // The size field is unsigned 32-bit; widen to Long and mask off the sign extension.
                val chunkSize = ByteBuffer.wrap(chunkSizeBytes).order(ByteOrder.LITTLE_ENDIAN).int.toLong() and 0xFFFFFFFFL

                LogUtils.d(CLASS_NAME, "Found chunk: $chunkId ($chunkSize bytes)")

                val chunkBodyStart = raf.filePointer
                // RIFF chunks are word-aligned: an odd-sized body is followed by one pad byte.
                // Clamp to the file size so a bogus size cannot seek past the end.
                val nextChunkStart = minOf(chunkBodyStart + chunkSize + (chunkSize and 1L), fileSize)

                when (chunkId) {
                    "fmt " -> {
                        if (chunkSize < 16) {
                            LogUtils.e(CLASS_NAME, "Invalid fmt chunk size")
                            return null
                        }

                        val formatData = ByteArray(16)
                        raf.readFully(formatData)
                        val formatBuffer = ByteBuffer.wrap(formatData).order(ByteOrder.LITTLE_ENDIAN)

                        val audioFormat = formatBuffer.short // Only logged, not validated
                        channels = formatBuffer.short.toInt() and 0xFFFF
                        sampleRate = formatBuffer.int
                        val byteRate = formatBuffer.int
                        val blockAlign = formatBuffer.short
                        bitDepth = formatBuffer.short.toInt() and 0xFFFF

                        LogUtils.d(CLASS_NAME, "Raw format data: ${formatData.joinToString(", ")}")
                        LogUtils.d(CLASS_NAME, "Format chunk: audioFormat=$audioFormat, channels=$channels, sampleRate=$sampleRate, bitDepth=$bitDepth, byteRate=$byteRate, blockAlign=$blockAlign")

                        if (bitDepth !in listOf(8, 16, 32)) {
                            LogUtils.e(CLASS_NAME, "Invalid bit depth: $bitDepth")
                            return null
                        }

                        // Skip any extension bytes beyond the 16-byte base format block.
                        raf.seek(nextChunkStart)
                        fmtChunkFound = true
                    }
                    "data" -> {
                        dataOffset = chunkBodyStart
                        dataSize = chunkSize
                        dataChunkFound = true
                        break@chunks
                    }
                    else -> {
                        // Skip unknown chunks
                        raf.seek(nextChunkStart)
                    }
                }
            }

            if (!fmtChunkFound || !dataChunkFound) {
                LogUtils.e(CLASS_NAME, "Missing essential chunks (fmt=$fmtChunkFound, data=$dataChunkFound)")
                return null
            }

            // Reject values that would otherwise cause a division by zero or a negative duration below.
            if (channels <= 0 || sampleRate <= 0) {
                LogUtils.e(CLASS_NAME, "Invalid format: channels=$channels, sampleRate=$sampleRate")
                return null
            }

            // Calculate actual data size if it seems wrong
            if (dataSize <= 0 || dataSize > fileSize - dataOffset) {
                dataSize = fileSize - dataOffset
                LogUtils.d(CLASS_NAME, "Adjusted data size to: $dataSize")
            }

            // A ByteArray cannot hold more than Int.MAX_VALUE bytes; fail clearly instead of overflowing.
            if (dataSize > Int.MAX_VALUE) {
                LogUtils.e(CLASS_NAME, "WAV data too large to load into memory: $dataSize bytes")
                return null
            }

            LogUtils.d(CLASS_NAME, "Reading PCM data: offset=$dataOffset, size=$dataSize")

            val wavData = ByteArray(dataSize.toInt())
            raf.seek(dataOffset)
            raf.readFully(wavData)

            // Calculate duration in ms
            // Each sample is bitsPerSample/8 bytes, and we have 'channels' samples per frame
            val bytesPerFrame = channels * (bitDepth / 8)
            val numFrames = wavData.size / bytesPerFrame
            val durationMs = (numFrames * 1000L) / sampleRate

            LogUtils.d(CLASS_NAME, "WAV duration calculation: size=${wavData.size}, bytesPerFrame=$bytesPerFrame, numFrames=$numFrames, sampleRate=$sampleRate, duration=${durationMs}ms")

            AudioData(
                data = wavData,
                sampleRate = sampleRate,
                channels = channels,
                bitDepth = bitDepth,
                durationMs = durationMs
            )
        }
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to load WAV file: ${e.message}", e)
        return null
    }
}
|
|
180
|
+
|
|
181
|
+
/**
|
|
182
|
+
* Processes the audio data and extracts features.
|
|
183
|
+
* @param data The audio data in bytes.
|
|
184
|
+
* @param config The recording configuration.
|
|
185
|
+
* @return AudioAnalysisData containing the extracted features.
|
|
186
|
+
*/
|
|
187
|
+
fun processAudioData(data: ByteArray, config: RecordingConfig): AudioAnalysisData {
|
|
188
|
+
if (data.isEmpty()) {
|
|
189
|
+
LogUtils.e(CLASS_NAME, "Received empty audio data")
|
|
190
|
+
return AudioAnalysisData(
|
|
191
|
+
segmentDurationMs = config.segmentDurationMs,
|
|
192
|
+
durationMs = 0,
|
|
193
|
+
bitDepth = 16,
|
|
194
|
+
numberOfChannels = config.channels,
|
|
195
|
+
sampleRate = config.sampleRate,
|
|
196
|
+
samples = 0,
|
|
197
|
+
dataPoints = emptyList(),
|
|
198
|
+
amplitudeRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
|
|
199
|
+
rmsRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
|
|
200
|
+
extractionTimeMs = 0f,
|
|
201
|
+
)
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
val sampleRate = config.sampleRate.toFloat()
|
|
205
|
+
val bitDepth = when (config.encoding) {
|
|
206
|
+
"pcm_8bit" -> 8
|
|
207
|
+
"pcm_16bit" -> 16
|
|
208
|
+
"pcm_32bit" -> 32
|
|
209
|
+
else -> throw IllegalArgumentException("Unsupported encoding: ${config.encoding}")
|
|
210
|
+
}
|
|
211
|
+
val channelData = convertToFloatArray(data, bitDepth)
|
|
212
|
+
val featureOptions = config.features
|
|
213
|
+
|
|
214
|
+
val totalSamples = channelData.size
|
|
215
|
+
// Update samplesPerSegment calculation to use proper formula
|
|
216
|
+
val samplesPerSegment = ((config.segmentDurationMs / 1000.0) * sampleRate).toInt()
|
|
217
|
+
val totalPoints = ceil(totalSamples.toDouble() / samplesPerSegment).toInt()
|
|
218
|
+
|
|
219
|
+
LogUtils.d(CLASS_NAME, "Extracting waveform totalSize=${data.size} with $totalSamples samples --> $totalPoints points")
|
|
220
|
+
LogUtils.d(CLASS_NAME, "segmentDuration: ${config.segmentDurationMs}ms, samplesPerSegment: $samplesPerSegment")
|
|
221
|
+
|
|
222
|
+
// Remove expectedPoints calculation since it used pointsPerSecond
|
|
223
|
+
val samplesPerPoint = ceil(channelData.size / totalPoints.toDouble()).toInt()
|
|
224
|
+
LogUtils.d(CLASS_NAME, "Extracting waveform with samplesPerPoints=$samplesPerPoint")
|
|
225
|
+
|
|
226
|
+
val dataPoints = mutableListOf<DataPoint>()
|
|
227
|
+
var minAmplitude = Float.MAX_VALUE
|
|
228
|
+
var maxAmplitude = Float.NEGATIVE_INFINITY
|
|
229
|
+
var minRms = Float.MAX_VALUE
|
|
230
|
+
var maxRms = Float.NEGATIVE_INFINITY
|
|
231
|
+
// Calculate total duration in milliseconds based on sample rate and total samples
|
|
232
|
+
val durationMs = (totalSamples.toFloat() / sampleRate * 1000).toInt()
|
|
233
|
+
|
|
234
|
+
// Measure the time taken for audio processing
|
|
235
|
+
val extractionTimeMs = measureTimeMillis {
|
|
236
|
+
for (i in 0 until totalPoints) {
|
|
237
|
+
val start = i * samplesPerSegment
|
|
238
|
+
val end = min(start + samplesPerSegment, totalSamples)
|
|
239
|
+
val segmentData = channelData.sliceArray(start until end)
|
|
240
|
+
|
|
241
|
+
var sumSquares = 0f
|
|
242
|
+
var zeroCrossings = 0
|
|
243
|
+
var prevValue = 0f
|
|
244
|
+
var localMinAmplitude = Float.MAX_VALUE
|
|
245
|
+
var localMaxAmplitude = Float.MIN_VALUE
|
|
246
|
+
|
|
247
|
+
for (value in segmentData) {
|
|
248
|
+
sumSquares += value * value
|
|
249
|
+
if (prevValue != 0f && value * prevValue < 0) zeroCrossings += 1
|
|
250
|
+
prevValue = value
|
|
251
|
+
|
|
252
|
+
val absValue = abs(value)
|
|
253
|
+
localMinAmplitude = min(localMinAmplitude, absValue)
|
|
254
|
+
localMaxAmplitude = max(localMaxAmplitude, absValue)
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
val features = computeFeatures(
|
|
258
|
+
segmentData = segmentData,
|
|
259
|
+
sampleRate = sampleRate,
|
|
260
|
+
sumSquares = sumSquares,
|
|
261
|
+
zeroCrossings = zeroCrossings,
|
|
262
|
+
segmentLength = segmentData.size,
|
|
263
|
+
featureOptions = featureOptions,
|
|
264
|
+
minAmplitude = localMinAmplitude,
|
|
265
|
+
maxAmplitude = localMaxAmplitude
|
|
266
|
+
)
|
|
267
|
+
val rms = features.rms
|
|
268
|
+
val silent = rms < 0.01
|
|
269
|
+
val dB = 20 * log10(rms.toDouble()).toFloat()
|
|
270
|
+
minAmplitude = min(minAmplitude, localMinAmplitude)
|
|
271
|
+
maxAmplitude = max(maxAmplitude, localMaxAmplitude)
|
|
272
|
+
minRms = min(minRms, rms)
|
|
273
|
+
maxRms = max(maxRms, rms)
|
|
274
|
+
|
|
275
|
+
val bytesPerSample = bitDepth / 8
|
|
276
|
+
val startPosition = start * bytesPerSample * config.channels
|
|
277
|
+
val endPosition = end * bytesPerSample * config.channels
|
|
278
|
+
|
|
279
|
+
// Update cumulative amplitude range
|
|
280
|
+
cumulativeMinAmplitude = min(cumulativeMinAmplitude, localMinAmplitude)
|
|
281
|
+
cumulativeMaxAmplitude = max(cumulativeMaxAmplitude, localMaxAmplitude)
|
|
282
|
+
|
|
283
|
+
val dataPoint = DataPoint(
|
|
284
|
+
id = uniqueIdCounter.getAndIncrement(),
|
|
285
|
+
amplitude = localMaxAmplitude, // Always use peak amplitude
|
|
286
|
+
rms = rms, // Always include RMS
|
|
287
|
+
dB = dB,
|
|
288
|
+
silent = silent,
|
|
289
|
+
features = features,
|
|
290
|
+
speech = SpeechFeatures(isActive = !silent),
|
|
291
|
+
startTime = startPosition / (sampleRate * bytesPerSample * config.channels),
|
|
292
|
+
endTime = endPosition / (sampleRate * bytesPerSample * config.channels),
|
|
293
|
+
startPosition = startPosition,
|
|
294
|
+
endPosition = endPosition,
|
|
295
|
+
samples = segmentData.size
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
dataPoints.add(dataPoint)
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
return AudioAnalysisData(
|
|
303
|
+
segmentDurationMs = config.segmentDurationMs,
|
|
304
|
+
durationMs = durationMs,
|
|
305
|
+
bitDepth = bitDepth,
|
|
306
|
+
numberOfChannels = config.channels,
|
|
307
|
+
sampleRate = config.sampleRate, // Use config.sampleRate instead of sampleRate
|
|
308
|
+
samples = totalSamples, // Use totalSamples instead of samplesInRange
|
|
309
|
+
dataPoints = dataPoints,
|
|
310
|
+
amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
|
|
311
|
+
rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
|
|
312
|
+
extractionTimeMs = extractionTimeMs.toFloat()
|
|
313
|
+
)
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/**
 * Puts the running amplitude bounds back to their starting sentinels so that the next
 * processed segment re-establishes both extremes from scratch.
 */
fun resetCumulativeAmplitudeRange() {
    // NOTE(review): Float.MIN_VALUE is the smallest *positive* float, not the most negative one.
    // It presumably works as a seed for "max" because the values folded into it are absolute
    // amplitudes (>= 0) — confirm against the field's declaration before changing it.
    cumulativeMaxAmplitude = Float.MIN_VALUE
    cumulativeMinAmplitude = Float.MAX_VALUE
}
|
|
320
|
+
|
|
321
|
+
/**
 * Converts raw little-endian PCM bytes into normalized floating-point samples.
 *
 * - 8-bit input is treated as unsigned PCM (0..255 with 128 as the zero level) and mapped to [-1, 1).
 * - 16-bit input is read as signed shorts and divided by 32768.
 * - 32-bit input is read as signed ints and divided by [Int.MAX_VALUE].
 *
 * Trailing bytes that do not form a complete sample are ignored.
 *
 * @param data The audio data in bytes.
 * @param bitDepth The bit depth of the audio data (8, 16 or 32).
 * @return One float per PCM sample, in the same order as the input.
 * @throws IllegalArgumentException if [bitDepth] is not 8, 16 or 32.
 */
private fun convertToFloatArray(data: ByteArray, bitDepth: Int): FloatArray {
    return when (bitDepth) {
        16 -> {
            val shorts = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
            // Fill the result directly instead of going through a boxed List<Float>.
            FloatArray(shorts.remaining()) { index -> shorts.get(index) / 32768.0f }
        }
        // Kotlin bytes are signed; mask to recover the unsigned sample before removing the 128 offset.
        8 -> FloatArray(data.size) { index -> ((data[index].toInt() and 0xFF) - 128) / 128.0f }
        32 -> {
            val ints = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer()
            val fullScale = Int.MAX_VALUE.toFloat()
            FloatArray(ints.remaining()) { index -> ints.get(index) / fullScale }
        }
        else -> throw IllegalArgumentException("Unsupported bit depth: $bitDepth")
    }
}
|
|
345
|
+
|
|
346
|
+
/**
 * Builds the [Features] record for a single analysis segment.
 *
 * RMS is always derived from [sumSquares]. Every other feature is computed only when its key in
 * [featureOptions] is `true`; otherwise it keeps a neutral value (`0f`, an empty list, or `null`
 * for the CRC). Exceptions thrown by the native frame analysis and by the mel-spectrogram, tempo,
 * HNR, spectral-contrast and tonnetz extractors are logged and replaced by that neutral value.
 * Pitch estimation is not wrapped and propagates its exceptions.
 *
 * @param segmentData The segment data.
 * @param sampleRate The sample rate of the audio data.
 * @param minAmplitude The minimum amplitude, copied into the result.
 * @param maxAmplitude The maximum amplitude, copied into the result.
 * @param sumSquares The sum of squares of the segment's samples.
 * @param zeroCrossings The zero crossings counted for the segment.
 * @param segmentLength The length of the segment; used as the divisor for RMS and ZCR.
 * @param featureOptions The feature options to compute.
 * @return The computed features.
 */
private fun computeFeatures(
    segmentData: FloatArray,
    sampleRate: Float,
    minAmplitude: Float,
    maxAmplitude: Float,
    sumSquares: Float,
    zeroCrossings: Int,
    segmentLength: Int,
    featureOptions: Map<String, Boolean>
): Features {
    // A feature is on only when its flag is present and true.
    fun enabled(option: String): Boolean = featureOptions[option] == true

    val rms = sqrt(sumSquares / segmentLength)
    val energy = if (enabled("energy")) sumSquares else 0f
    val zcr = if (enabled("zcr")) zeroCrossings / segmentLength.toFloat() else 0f

    // The spectral values, MFCC and chromagram all come from one native (JNI) frame analysis.
    val needSpectral = listOf(
        "spectralCentroid",
        "spectralFlatness",
        "spectralRolloff",
        "spectralBandwidth"
    ).any { enabled(it) }
    val needMfcc = enabled("mfcc")
    val needChroma = enabled("chromagram")

    var spectralCentroid = 0f
    var spectralFlatness = 0f
    var spectralRolloff = 0f
    var spectralBandwidth = 0f
    var mfcc: List<Float> = emptyList()
    var chroma: List<Float> = emptyList()

    if (needSpectral || needMfcc || needChroma) {
        try {
            val nativeFrame = AudioFeaturesNative.computeFrame(
                segmentData,
                sampleRate.toInt(),
                N_FFT,
                13, // nMfcc
                26, // nMelFilters
                needMfcc,
                needChroma
            )
            if (needSpectral) {
                spectralCentroid = (nativeFrame["spectralCentroid"] as? Float) ?: 0f
                spectralFlatness = (nativeFrame["spectralFlatness"] as? Float) ?: 0f
                spectralRolloff = (nativeFrame["spectralRolloff"] as? Float) ?: 0f
                spectralBandwidth = (nativeFrame["spectralBandwidth"] as? Float) ?: 0f
            }
            if (needMfcc) {
                mfcc = (nativeFrame["mfcc"] as? FloatArray)?.toList() ?: emptyList()
            }
            if (needChroma) {
                chroma = (nativeFrame["chromagram"] as? FloatArray)?.toList() ?: emptyList()
            }
        } catch (e: Exception) {
            LogUtils.e(CLASS_NAME, "Failed to compute C++ audio features: ${e.message}", e)
        }
    }

    val melSpectrogram = if (enabled("melSpectrogram")) {
        try {
            computeMelSpectrogram(segmentData, sampleRate)
        } catch (e: Exception) {
            LogUtils.e(CLASS_NAME, "Failed to compute mel spectrogram: ${e.message}", e)
            emptyList()
        }
    } else {
        emptyList()
    }

    val tempo = if (enabled("tempo")) {
        try {
            extractTempo(segmentData, sampleRate)
        } catch (e: Exception) {
            LogUtils.e(CLASS_NAME, "Failed to extract tempo: ${e.message}", e)
            0f
        }
    } else {
        0f
    }

    val hnr = if (enabled("hnr")) {
        try {
            extractHNR(segmentData)
        } catch (e: Exception) {
            LogUtils.e(CLASS_NAME, "Failed to extract HNR: ${e.message}", e)
            0f
        }
    } else {
        0f
    }

    val spectralContrast = if (enabled("spectralContrast")) {
        try {
            computeSpectralContrast(segmentData, sampleRate)
        } catch (e: Exception) {
            LogUtils.e(CLASS_NAME, "Failed to compute spectral contrast: ${e.message}", e)
            emptyList()
        }
    } else {
        emptyList()
    }

    val tonnetz = if (enabled("tonnetz")) {
        try {
            computeTonnetz(segmentData, sampleRate)
        } catch (e: Exception) {
            LogUtils.e(CLASS_NAME, "Failed to compute tonnetz: ${e.message}", e)
            emptyList()
        }
    } else {
        emptyList()
    }

    val pitch = if (enabled("pitch")) estimatePitch(segmentData, sampleRate) else 0.0f

    // The checksum covers the float samples serialized as little-endian 32-bit values.
    val crc32Value = if (enabled("crc32")) {
        val packed = ByteBuffer.allocate(segmentData.size * 4).order(ByteOrder.LITTLE_ENDIAN)
        for (sample in segmentData) {
            packed.putFloat(sample)
        }
        CRC32().apply { update(packed.array()) }.value
    } else {
        null
    }

    return Features(
        energy = energy,
        mfcc = mfcc,
        rms = rms,
        minAmplitude = minAmplitude,
        maxAmplitude = maxAmplitude,
        zcr = zcr,
        spectralCentroid = spectralCentroid,
        spectralFlatness = spectralFlatness,
        spectralRolloff = spectralRolloff,
        spectralBandwidth = spectralBandwidth,
        tempo = tempo,
        hnr = hnr,
        melSpectrogram = melSpectrogram,
        chromagram = chroma,
        spectralContrast = spectralContrast,
        tonnetz = tonnetz,
        pitch = pitch,
        crc32 = crc32Value
    )
}
|
|
486
|
+
|
|
487
|
+
/**
 * Estimates a tempo in beats per minute from the spectral-flux onset envelope of [segmentData].
 *
 * The signal is cut into full frames of 2048 samples taken every 512 samples. For each frame the
 * positive magnitude increase relative to the previous frame is summed (the first frame is
 * compared against an all-zero spectrum). Local maxima of that envelope are treated as onsets and
 * the mean distance between consecutive onsets is converted to BPM.
 *
 * @param segmentData The samples to analyse.
 * @param sampleRate The sample rate of [segmentData].
 * @return The estimated tempo, or `120f` when fewer than two onset peaks are found.
 */
private fun extractTempo(segmentData: FloatArray, sampleRate: Float): Float {
    val hopLength = 512
    val frameLength = 2048

    // The transform size never changes, so one FFT instance serves every frame.
    val fft = FFT(frameLength)
    val onsetEnvelope = mutableListOf<Float>()
    var previousSpectrum = FloatArray(frameLength / 2)

    // Inclusive upper bound: a frame starting exactly at size - frameLength is still a full frame.
    // The range is empty when the segment is shorter than one frame.
    for (frameStart in 0..(segmentData.size - frameLength) step hopLength) {
        val fftData = segmentData.copyOfRange(frameStart, frameStart + frameLength)
        fft.realForward(fftData)

        // NOTE(review): assumes realForward leaves interleaved (re, im) pairs in fftData — confirm
        // against the FFT class, in particular how bin 0 / Nyquist are packed.
        val magnitudes = FloatArray(frameLength / 2) { bin ->
            val re = fftData[2 * bin]
            val im = fftData[2 * bin + 1]
            sqrt(re * re + im * im)
        }

        // Spectral flux: only increases in magnitude count towards an onset.
        var flux = 0f
        for (bin in magnitudes.indices) {
            flux += maxOf(magnitudes[bin] - previousSpectrum[bin], 0f)
        }
        onsetEnvelope.add(flux)
        previousSpectrum = magnitudes
    }

    // An onset is a strict local maximum of the envelope.
    val peaks = (1 until onsetEnvelope.size - 1).filter { i ->
        onsetEnvelope[i] > onsetEnvelope[i - 1] && onsetEnvelope[i] > onsetEnvelope[i + 1]
    }

    return if (peaks.size > 1) {
        val averageInterval = peaks.zipWithNext { a, b -> b - a }.average().toFloat()
        // averageInterval is in hops; hopLength / sampleRate seconds per hop.
        60f * sampleRate / (hopLength * averageInterval)
    } else {
        120f // Default tempo if no clear peaks found
    }
}
|
|
536
|
+
|
|
537
|
+
/**
 * Computes spectral centroid, flatness, roll-off and bandwidth for one block of samples.
 *
 * Only the first [N_FFT] samples are analysed. The (possibly truncated) input is Hann-windowed and
 * then zero-padded up to [N_FFT] before the transform.
 *
 * @param samples The samples to analyse.
 * @param sampleRate The sample rate of [samples].
 * @return The four spectral descriptors.
 */
private fun extractSpectralFeatures(samples: FloatArray, sampleRate: Float): SpectralFeatures {
    // Truncate to the fixed FFT size first so the window is applied to what is actually analysed.
    val analysisInput = if (samples.size > N_FFT) samples.copyOf(N_FFT) else samples
    val windowed = applyHannWindow(analysisInput)

    // A fresh FloatArray is already zero-filled, which provides the padding.
    val fftBuffer = FloatArray(N_FFT)
    windowed.copyInto(fftBuffer, 0, 0, minOf(windowed.size, N_FFT))

    FFT(N_FFT).realForward(fftBuffer)

    // N_FFT / 2 + 1 bins: DC up to and including the Nyquist bin.
    // NOTE(review): the last bin reads fftBuffer[1] as the Nyquist component, while bin 0 also uses
    // fftBuffer[1] as its imaginary part. Both cannot be right for one packing — confirm against
    // the FFT class.
    val half = N_FFT / 2
    val magnitudeSpectrum = FloatArray(half + 1) { bin ->
        if (bin < half) {
            val re = fftBuffer[2 * bin]
            val im = fftBuffer[2 * bin + 1]
            sqrt(re * re + im * im)
        } else {
            abs(fftBuffer[1])
        }
    }

    // Flatness is defined on power, the other descriptors on magnitude.
    val powerSpectrum = FloatArray(magnitudeSpectrum.size) { bin ->
        magnitudeSpectrum[bin] * magnitudeSpectrum[bin]
    }

    val centroid = computeSpectralCentroid(magnitudeSpectrum, sampleRate)
    val flatness = computeSpectralFlatness(powerSpectrum)
    val rollOff = computeSpectralRollOff(magnitudeSpectrum, sampleRate)
    val bandwidth = computeSpectralBandwidth(magnitudeSpectrum, sampleRate, centroid)

    return SpectralFeatures(
        centroid = centroid,
        flatness = flatness,
        rollOff = rollOff,
        bandwidth = bandwidth
    )
}
|
|
586
|
+
|
|
587
|
+
/**
 * Returns the magnitude-weighted mean frequency of [magnitudeSpectrum].
 *
 * Bin `i` is mapped to the frequency `i * sampleRate / N_FFT`. The result is `0f` when the
 * magnitudes sum to exactly zero.
 */
private fun computeSpectralCentroid(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
    val totalMagnitude = magnitudeSpectrum.sum()
    if (totalMagnitude == 0f) return 0f

    val binWidth = sampleRate / N_FFT
    var weightedTotal = 0f
    for ((bin, magnitude) in magnitudeSpectrum.withIndex()) {
        weightedTotal += bin * binWidth * magnitude
    }

    return weightedTotal / totalMagnitude
}
|
|
597
|
+
|
|
598
|
+
/**
 * Returns the spectral flatness of [powerSpectrum]: geometric mean divided by arithmetic mean.
 *
 * The geometric mean is evaluated in log space, with `1e-10f` added to every bin so that empty
 * bins do not produce `ln(0)`. The result is `0f` when the arithmetic mean is exactly zero.
 */
private fun computeSpectralFlatness(powerSpectrum: FloatArray): Float {
    val binCount = powerSpectrum.size

    // exp(mean(ln(x))) is the geometric mean without multiplying many small numbers together.
    val logTotal = powerSpectrum.fold(0.0f) { acc, power -> acc + ln(power + 1e-10f) }
    val geometricMean = exp(logTotal / binCount)

    val arithmeticMean = powerSpectrum.sum() / binCount

    return if (arithmeticMean == 0f) 0f else geometricMean / arithmeticMean
}
|
|
611
|
+
|
|
612
|
+
/**
 * Returns the frequency of the first bin at which the running sum of magnitudes reaches 85% of
 * the total sum.
 *
 * Bin `i` is mapped to `i * sampleRate / N_FFT`. Returns `0f` when no bin reaches the threshold.
 */
private fun computeSpectralRollOff(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
    val rollOffThreshold = magnitudeSpectrum.sum() * 0.85f

    var runningTotal = 0f
    magnitudeSpectrum.forEachIndexed { bin, magnitude ->
        runningTotal += magnitude
        if (runningTotal >= rollOffThreshold) {
            return bin * (sampleRate / N_FFT)
        }
    }

    return 0f
}
|
|
626
|
+
|
|
627
|
+
/**
 * Returns the magnitude-weighted standard deviation of bin frequencies around [centroid].
 *
 * Bin `i` is mapped to `i * sampleRate / (2 * magnitudeSpectrum.size)`. The result is `0f` when
 * the magnitudes sum to exactly zero.
 *
 * @param magnitudeSpectrum The magnitude of each frequency bin.
 * @param sampleRate The sample rate of the analysed audio.
 * @param centroid The spectral centroid to measure the spread around.
 */
private fun computeSpectralBandwidth(
    magnitudeSpectrum: FloatArray,
    sampleRate: Float,
    centroid: Float
): Float {
    val totalMagnitude = magnitudeSpectrum.sum()
    if (totalMagnitude == 0f) return 0f

    // Match iOS frequency calculation
    val binCount = magnitudeSpectrum.size
    var weightedSpread = 0f
    for (bin in 0 until binCount) {
        val frequency = bin * sampleRate / (2 * binCount)
        weightedSpread += magnitudeSpectrum[bin] * (frequency - centroid).pow(2)
    }

    return sqrt(weightedSpread / totalMagnitude)
}
|
|
643
|
+
|
|
644
|
+
/**
 * Value holder for the four spectral descriptors of one analysed block.
 * Every field defaults to `0f`.
 */
private data class SpectralFeatures(
    /** Spectral centroid. */
    val centroid: Float = 0f,
    /** Spectral flatness. */
    val flatness: Float = 0f,
    /** Spectral roll-off. */
    val rollOff: Float = 0f,
    /** Spectral bandwidth. */
    val bandwidth: Float = 0f
)
|
|
650
|
+
|
|
651
|
+
/**
 * Resets the per-segment accumulators through the supplied setters and empties the sample list.
 *
 * The setters are invoked in the order: sum of squares, zero crossings, local minimum, local
 * maximum; the list is cleared last.
 *
 * @param sumSquaresUpdater Receives `0f` as the new sum of squares.
 * @param zeroCrossingsUpdater Receives `0` as the new zero-crossing count.
 * @param localMinAmplitudeUpdater Receives [Float.MAX_VALUE] as the new local minimum seed.
 * @param localMaxAmplitudeUpdater Receives [Float.MIN_VALUE] as the new local maximum seed.
 * @param segmentData The segment data list to clear.
 */
private fun resetSegmentData(
    sumSquaresUpdater: (Float) -> Unit,
    zeroCrossingsUpdater: (Int) -> Unit,
    localMinAmplitudeUpdater: (Float) -> Unit,
    localMaxAmplitudeUpdater: (Float) -> Unit,
    segmentData: MutableList<Float>
) {
    // Running totals start over at zero.
    sumSquaresUpdater.invoke(0f)
    zeroCrossingsUpdater.invoke(0)

    // Extremes start from sentinels so the first sample replaces them.
    localMinAmplitudeUpdater.invoke(Float.MAX_VALUE)
    localMaxAmplitudeUpdater.invoke(Float.MIN_VALUE)

    segmentData.clear()
}
|
|
672
|
+
|
|
673
|
+
/**
 * Computes 13 MFCCs (Mel-Frequency Cepstral Coefficients) from the audio data.
 *
 * The power spectrum from `prepareFFT` is projected onto 26 mel filters, the filter energies are
 * log-compressed (floored at `1e-10f`), and a cosine transform scaled by `sqrt(2 / 26)` produces
 * the coefficients.
 *
 * @return The 13 coefficients, or an empty list when a filter's length does not match the power
 *   spectrum.
 */
private fun computeMFCC(samples: FloatArray, sampleRate: Float): List<Float> {
    val filterCount = 26
    val coefficientCount = 13

    val (powerSpectrum, _) = prepareFFT(samples, sampleRate)
    val melFilters = computeMelFilterbank(
        numFilters = filterCount,
        powerSpectrumSize = powerSpectrum.size,
        sampleRate = sampleRate
    )

    val sizeMismatch = melFilters.any { it.size != powerSpectrum.size }
    if (sizeMismatch) {
        LogUtils.e(CLASS_NAME, "Mel filter size (${melFilters[0].size}) does not match power spectrum size (${powerSpectrum.size})")
        return emptyList()
    }

    // Log energy collected by each triangular filter.
    val melEnergies = FloatArray(filterCount) { filter ->
        var energy = 0f
        for (bin in powerSpectrum.indices) {
            energy += powerSpectrum[bin] * melFilters[filter][bin]
        }
        ln(maxOf(energy, 1e-10f))
    }

    // Cosine transform of the log energies.
    val scale = sqrt(2f / filterCount)
    return List(coefficientCount) { k ->
        var acc = 0f
        for (m in melEnergies.indices) {
            acc += melEnergies[m] * cos(PI * k * (2 * m + 1) / (2 * filterCount)).toFloat()
        }
        acc * scale
    }
}
|
|
707
|
+
|
|
708
|
+
/**
 * Builds a bank of triangular filters whose edges are equally spaced on the mel scale between
 * 0 Hz and `sampleRate / 2`.
 *
 * Each filter rises linearly from 0 at its start bin to (just below) 1 before its centre bin, then
 * falls linearly from 1 at the centre bin towards 0 at its end bin. Bin indices are clamped to
 * `powerSpectrumSize - 1`.
 *
 * @param numFilters The number of Mel filters.
 * @param powerSpectrumSize The size of the power spectrum.
 * @param sampleRate The sample rate of the audio data.
 * @return A list of Mel filters, each of length [powerSpectrumSize].
 */
private fun computeMelFilterbank(numFilters: Int, powerSpectrumSize: Int, sampleRate: Float): Array<FloatArray> {
    val melMin = hzToMel(0f)
    val melMax = hzToMel(sampleRate / 2)
    val melStep = (melMax - melMin) / (numFilters + 1)

    // numFilters + 2 edge points: every filter uses three consecutive ones (start, centre, end).
    val hzPoints = List(numFilters + 2) { point -> melToHz(melMin + point * melStep) }

    // NOTE(review): the Hz -> bin mapping scales by powerSpectrumSize / sampleRate, so sampleRate / 2
    // lands on bin powerSpectrumSize / 2. Whether that is intended depends on how many bins the
    // caller's power spectrum has — confirm.
    val bins = hzPoints.map { hz ->
        minOf((hz * powerSpectrumSize / sampleRate).roundToInt(), powerSpectrumSize - 1)
    }

    val filterbank = Array(numFilters) { FloatArray(powerSpectrumSize) }

    bins.windowed(3).forEachIndexed { filter, (startBin, centerBin, endBin) ->
        val weights = filterbank[filter]

        // Left slope (ascending); an empty range when start and centre coincide.
        for (bin in startBin until centerBin) {
            weights[bin] = (bin - startBin).toFloat() / (centerBin - startBin).toFloat()
        }
        // Right slope (descending); an empty range when centre and end coincide.
        for (bin in centerBin until endBin) {
            weights[bin] = (endBin - bin).toFloat() / (endBin - centerBin).toFloat()
        }
    }

    return filterbank
}
|
|
763
|
+
|
|
764
|
+
/**
 * Computes the Discrete Cosine Transform (DCT) of the log energies.
 *
 * Coefficient `k` is `sum_j(logEnergies[j] * cos(PI * k * (j + 0.5) / n))` divided by
 * `sqrt(DCT_SQRT_DIVISOR * n)`, where `n` is the number of log energies.
 *
 * @param logEnergies The log energies.
 * @param numCoefficients The number of coefficients to compute.
 * @return A list of MFCC coefficients.
 */
private fun computeDCT(logEnergies: List<Float>, numCoefficients: Int): List<Float> {
    val n = logEnergies.size
    val normalizer = sqrt(DCT_SQRT_DIVISOR * n)

    return List(numCoefficients) { k ->
        var acc = 0.0
        logEnergies.forEachIndexed { j, logEnergy ->
            acc += logEnergy * cos(PI * k * (j + 0.5) / n)
        }
        (acc / normalizer).toFloat()
    }
}
|
|
784
|
+
|
|
785
|
+
/**
 * Extracts the HNR (Harmonics-to-Noise Ratio) from the audio data, in decibels.
 *
 * The autocorrelation of the segment is computed for every lag (quadratic in the segment length).
 * The first autocorrelation peak whose prominence is at least 10% of the maximum autocorrelation
 * is taken as the harmonic energy, and the zero-lag value minus that peak as the noise energy.
 *
 * @param segmentData The segment data.
 * @return `10 * log10(harmonic / noise)`, or `0f` when no peak qualifies or when either energy is
 *   not strictly positive (the ratio would otherwise yield NaN or an infinity).
 */
private fun extractHNR(segmentData: FloatArray): Float {
    val frameSize = segmentData.size

    // autocorrelation[lag] = sum over j of x[j] * x[j + lag]
    val autocorrelation = FloatArray(frameSize) { lag ->
        var sum = 0f
        for (j in 0 until frameSize - lag) {
            sum += segmentData[j] * segmentData[j + lag]
        }
        sum
    }

    // Find peaks with minimum prominence
    val maxAutocorrelation = autocorrelation.maxOrNull() ?: 0f
    val peaks = findPeaks(autocorrelation, minProminence = 0.1f * maxAutocorrelation)

    // Lag 0 is the signal energy itself, so only peaks at a positive lag count.
    val firstPeakIndex = peaks.firstOrNull { it > 0 } ?: return 0f

    val harmonicEnergy = autocorrelation[firstPeakIndex]
    val noiseEnergy = autocorrelation[0] - harmonicEnergy

    // A prominent peak can still sit at or below zero; log10 needs a strictly positive ratio.
    return if (harmonicEnergy > 0f && noiseEnergy > 0f) {
        10 * log10(harmonicEnergy / noiseEnergy)
    } else {
        0f
    }
}
|
|
818
|
+
|
|
819
|
+
/**
 * Returns the indices of the interior elements of [data] that are strict local maxima and exceed
 * their higher neighbour by at least [minProminence].
 *
 * The first and last elements are never reported. Indices are returned in ascending order.
 */
private fun findPeaks(data: FloatArray, minProminence: Float): List<Int> =
    (1 until data.size - 1).filter { i ->
        val left = data[i - 1]
        val right = data[i + 1]
        val isLocalMaximum = data[i] > left && data[i] > right
        // Prominence here is simply the height above the taller immediate neighbour.
        isLocalMaximum && data[i] - maxOf(left, right) >= minProminence
    }
|
|
831
|
+
|
|
832
|
+
/**
 * Loads an audio file as PCM, trying [MediaExtractor] first and the WAV parser second.
 *
 * The path is used as given (minus a `file://` prefix); when no file exists there, a file with the
 * same name under `filesDir` is tried. When [decodingConfig] is supplied the decoded PCM is passed
 * through `processAudio` and the returned metadata reports the configured targets, falling back to
 * the source values where a target is `null`.
 *
 * NOTE(review): `targetBitDepth` is reported in the result but is not passed to `processAudio`
 * here — confirm the data really has that bit depth.
 *
 * @param fileUri Path or `file://` URI of the audio file.
 * @param decodingConfig Optional target sample rate / channels / normalization settings.
 * @return The decoded audio, or `null` when the file is missing or could not be decoded.
 */
fun loadAudioFromAnyFormat(fileUri: String, decodingConfig: DecodingConfig? = null): AudioData? {
    val cleanUri = fileUri.removePrefix("file://")

    val requested = File(cleanUri)
    val file = if (requested.exists()) {
        requested
    } else {
        File(filesDir, requested.name).takeIf { it.exists() }
    }
    if (file == null) {
        LogUtils.e(CLASS_NAME, "File not found in any location: $cleanUri")
        return null
    }

    // First try MediaExtractor
    val extractor = MediaExtractor()
    try {
        LogUtils.d(CLASS_NAME, "Attempting MediaExtractor with path: ${file.absolutePath}")
        extractor.setDataSource(file.absolutePath)

        // Find the first audio track
        val audioTrackIndex = (0 until extractor.trackCount).firstOrNull { track ->
            extractor.getTrackFormat(track).getString(MediaFormat.KEY_MIME)?.startsWith("audio/") == true
        }

        if (audioTrackIndex != null) {
            val format = extractor.getTrackFormat(audioTrackIndex)
            extractor.selectTrack(audioTrackIndex)

            // Get original audio properties
            val originalSampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE)
            val originalChannels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT)

            // The duration is read as a long first, and as a string (default "-1") when that throws.
            val totalDurationUs = try {
                format.getLong(MediaFormat.KEY_DURATION)
            } catch (e: Exception) {
                (format.getString(MediaFormat.KEY_DURATION) ?: "-1").toLong()
            }
            LogUtils.d(CLASS_NAME, "Raw duration from format: ${totalDurationUs}us")

            val totalDurationMs = totalDurationUs / 1000
            LogUtils.d(CLASS_NAME, "Final duration: ${totalDurationMs}ms")

            // Process using MediaExtractor
            val pcmData = decodeAudioToPCM(extractor, format)
            val processedData = if (decodingConfig == null) {
                pcmData
            } else {
                processAudio(
                    pcmData,
                    originalSampleRate,
                    decodingConfig.targetSampleRate,
                    originalChannels,
                    decodingConfig.targetChannels,
                    decodingConfig.normalizeAudio
                )
            }

            return AudioData(
                data = processedData,
                sampleRate = decodingConfig?.targetSampleRate ?: originalSampleRate,
                bitDepth = decodingConfig?.targetBitDepth ?: 16,
                channels = decodingConfig?.targetChannels ?: originalChannels,
                durationMs = totalDurationMs // Pass through the duration
            )
        }
    } catch (e: Exception) {
        // Any failure above falls through to the WAV fallback below.
        LogUtils.d(CLASS_NAME, "MediaExtractor failed, attempting WAV parser: ${e.message}")
    } finally {
        extractor.release()
    }

    // Only .wav files get a second chance through the WAV parser.
    if (!file.name.lowercase().endsWith(".wav")) {
        LogUtils.e(CLASS_NAME, "Failed to process audio file with both MediaExtractor and WAV parser")
        return null
    }

    LogUtils.d(CLASS_NAME, "Falling back to WAV parser")
    val wavData = loadAudioFile(file.absolutePath) ?: return null
    if (decodingConfig == null) return wavData

    val processedData = processAudio(
        wavData.data,
        wavData.sampleRate,
        decodingConfig.targetSampleRate,
        wavData.channels,
        decodingConfig.targetChannels,
        decodingConfig.normalizeAudio
    )
    return AudioData(
        data = processedData,
        sampleRate = decodingConfig.targetSampleRate ?: wavData.sampleRate,
        bitDepth = decodingConfig.targetBitDepth,
        channels = decodingConfig.targetChannels ?: wavData.channels,
        durationMs = wavData.durationMs // Pass through the duration
    )
}
|
|
925
|
+
|
|
926
|
+
/**
 * Decodes the track currently selected on [extractor] into raw PCM bytes with a [MediaCodec]
 * decoder created for the MIME type of [format].
 *
 * Compressed samples are fed until the extractor is exhausted, then an end-of-stream marker is
 * queued. Output keeps being drained until the decoder itself signals end of stream, so frames
 * still buffered inside the codec when the input ends are included in the result. The decoder is
 * always stopped and released, even on failure.
 *
 * @param extractor Extractor with the audio track already selected.
 * @param format The format of that track; must carry a MIME type.
 * @return The concatenated decoder output.
 * @throws IllegalArgumentException if [format] has no MIME type.
 */
private fun decodeAudioToPCM(extractor: MediaExtractor, format: MediaFormat): ByteArray {
    val timeoutUs = 10000L
    // Safety net: stop waiting if the codec stays silent this many polls after input ended (~1 s),
    // so a decoder that never reports end of stream cannot hang the caller.
    val maxIdlePollsAfterInputEnd = 100

    var decoder: MediaCodec? = null

    try {
        val mime = requireNotNull(format.getString(MediaFormat.KEY_MIME)) {
            "Audio track format has no MIME type"
        }
        val codec = MediaCodec.createDecoderByType(mime)
        decoder = codec
        codec.configure(format, null, null, 0)
        codec.start()

        val info = MediaCodec.BufferInfo()
        // A byte stream avoids boxing every decoded byte into a List<Byte>.
        val pcmData = java.io.ByteArrayOutputStream()

        var inputDone = false
        var outputDone = false
        var idlePolls = 0

        while (!outputDone) {
            if (!inputDone) {
                val inputBufferId = codec.dequeueInputBuffer(timeoutUs)
                if (inputBufferId >= 0) {
                    val inputBuffer = checkNotNull(codec.getInputBuffer(inputBufferId)) {
                        "Decoder returned no input buffer for index $inputBufferId"
                    }
                    val sampleSize = extractor.readSampleData(inputBuffer, 0)

                    if (sampleSize < 0) {
                        // No more compressed data: tell the codec so it can flush what it holds.
                        codec.queueInputBuffer(inputBufferId, 0, 0, 0, MediaCodec.BUFFER_FLAG_END_OF_STREAM)
                        inputDone = true
                    } else {
                        codec.queueInputBuffer(inputBufferId, 0, sampleSize, extractor.sampleTime, 0)
                        extractor.advance()
                    }
                }
            }

            val outputBufferId = codec.dequeueOutputBuffer(info, timeoutUs)
            if (outputBufferId >= 0) {
                idlePolls = 0
                val reachedEndOfStream = (info.flags and MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0
                if (info.size > 0) {
                    val outputBuffer = checkNotNull(codec.getOutputBuffer(outputBufferId)) {
                        "Decoder returned no output buffer for index $outputBufferId"
                    }
                    val chunk = ByteArray(info.size)
                    outputBuffer.get(chunk)
                    pcmData.write(chunk, 0, chunk.size)
                }
                codec.releaseOutputBuffer(outputBufferId, false)
                if (reachedEndOfStream) {
                    outputDone = true
                }
            } else if (inputDone && outputBufferId == MediaCodec.INFO_TRY_AGAIN_LATER) {
                idlePolls += 1
                if (idlePolls >= maxIdlePollsAfterInputEnd) {
                    LogUtils.w(CLASS_NAME, "Decoder did not signal end of stream; returning decoded data so far")
                    outputDone = true
                }
            }
        }

        return pcmData.toByteArray()
    } finally {
        // stop() throws if the codec never started; that must not mask the original failure.
        try {
            decoder?.stop()
        } catch (e: Exception) {
            LogUtils.w(CLASS_NAME, "Error stopping decoder: ${e.message}")
        }
        try {
            decoder?.release()
        } catch (e: Exception) {
            LogUtils.w(CLASS_NAME, "Error releasing decoder: ${e.message}")
        }
    }
}
|
|
977
|
+
|
|
978
|
+
/**
 * Resamples 16-bit little-endian PCM to [targetSampleRate] using linear interpolation.
 *
 * Input with more than one channel is first mixed down with `convertToMono`, so the result is
 * mono whenever [originalChannels] is greater than 1. The output length is the mono sample count
 * multiplied by `targetSampleRate / originalSampleRate`, truncated.
 *
 * @param pcmData 16-bit little-endian PCM bytes.
 * @param originalSampleRate The sample rate of [pcmData].
 * @param targetSampleRate The sample rate to convert to.
 * @param originalChannels The number of interleaved channels in [pcmData].
 * @return The resampled audio as 16-bit little-endian PCM bytes.
 */
private fun resampleAudio(
    pcmData: ByteArray,
    originalSampleRate: Int,
    targetSampleRate: Int,
    originalChannels: Int
): ByteArray {
    // Bytes -> 16-bit samples
    val interleaved = ShortArray(pcmData.size / 2)
    ByteBuffer.wrap(pcmData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(interleaved)

    val mono = if (originalChannels > 1) convertToMono(interleaved, originalChannels) else interleaved

    val resampleRatio = targetSampleRate.toDouble() / originalSampleRate
    val lastSourceIndex = mono.size - 1

    val resampled = ShortArray((mono.size * resampleRatio).toInt()) { outIndex ->
        // Position of this output sample on the source timeline.
        val sourcePosition = outIndex / resampleRatio
        val lower = sourcePosition.toInt()
        // Clamp so the final output samples interpolate against the last source sample.
        val upper = minOf(lower + 1, lastSourceIndex)

        linearInterpolate(
            mono[lower].toDouble(),
            mono[upper].toDouble(),
            sourcePosition - lower
        ).toInt().toShort()
    }

    // 16-bit samples -> bytes
    return ByteBuffer.allocate(resampled.size * 2)
        .order(ByteOrder.LITTLE_ENDIAN)
        .apply { asShortBuffer().put(resampled) }
        .array()
}
|
|
1021
|
+
|
|
1022
|
+
/**
 * Downmixes interleaved multi-channel 16-bit samples to mono by averaging each frame.
 *
 * The average uses integer division, so it truncates toward zero. Trailing samples
 * that do not form a complete frame are dropped.
 *
 * @param stereoData interleaved samples (despite the name, any channel count is accepted)
 * @param channels number of interleaved channels per frame
 * @return one averaged sample per frame
 */
private fun convertToMono(stereoData: ShortArray, channels: Int): ShortArray {
    val frameCount = stereoData.size / channels
    return ShortArray(frameCount) { frame ->
        val base = frame * channels
        var total = 0
        repeat(channels) { offset -> total += stereoData[base + offset] }
        (total / channels).toShort()
    }
}
|
|
1036
|
+
|
|
1037
|
+
/**
 * Linearly interpolates between [a] and [b].
 *
 * @param fraction position between the two values; 0.0 yields [a], 1.0 yields [b]
 */
private fun linearInterpolate(a: Double, b: Double, fraction: Double): Double =
    a + (b - a) * fraction
|
|
1040
|
+
|
|
1041
|
+
/**
 * Applies optional resampling, channel conversion and peak normalization to 16-bit PCM.
 *
 * [resampleAudio] always returns mono data when the input has more than one channel.
 * This function therefore tracks the channel count the buffer really has after
 * resampling and hands that count to [convertChannels]. Previously the stale
 * [originalChannels] value was used, which made the converter treat mono data as
 * interleaved multi-channel data and halved (or otherwise corrupted) the audio.
 *
 * The returned buffer has [targetChannels] channels when specified, otherwise
 * [originalChannels] (a buffer that was collapsed to mono by resampling is expanded back).
 *
 * @param pcmData interleaved 16-bit little-endian PCM
 * @param originalSampleRate sample rate of [pcmData] in Hz
 * @param targetSampleRate desired sample rate, or null to keep the original
 * @param originalChannels channel count of [pcmData]
 * @param targetChannels desired channel count, or null to keep the original
 * @param normalize whether to scale the signal so its peak reaches full scale
 * @return the processed PCM bytes
 */
fun processAudio(
    pcmData: ByteArray,
    originalSampleRate: Int,
    targetSampleRate: Int?,
    originalChannels: Int,
    targetChannels: Int?,
    normalize: Boolean
): ByteArray {
    var processedData = pcmData
    // Channel count of processedData at the current stage of the pipeline.
    var currentChannels = originalChannels

    // Only resample if target sample rate is explicitly specified and different
    if (targetSampleRate != null && originalSampleRate != targetSampleRate) {
        processedData = resampleAudio(processedData, originalSampleRate, targetSampleRate, originalChannels)
        // resampleAudio downmixes multi-channel input to mono as part of resampling.
        if (originalChannels > 1) {
            currentChannels = 1
        }
    }

    // Convert whenever the buffer's actual layout differs from the layout the caller expects.
    val desiredChannels = targetChannels ?: originalChannels
    if (currentChannels != desiredChannels) {
        processedData = convertChannels(processedData, currentChannels, desiredChannels)
    }

    // Only normalize if explicitly requested
    if (normalize) {
        processedData = normalizeAudio(processedData)
    }

    return processedData
}
|
|
1068
|
+
|
|
1069
|
+
/**
 * Peak-normalizes 16-bit little-endian PCM so the loudest sample reaches [Short.MAX_VALUE].
 *
 * All-zero input is returned unchanged (as a fresh array); a trailing odd byte is dropped.
 *
 * @param pcmData 16-bit little-endian PCM bytes
 * @return normalized 16-bit little-endian PCM bytes
 */
private fun normalizeAudio(pcmData: ByteArray): ByteArray {
    val sampleCount = pcmData.size / 2
    val reader = ByteBuffer.wrap(pcmData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
    val samples = ShortArray(sampleCount) { reader.get(it) }

    // Largest absolute sample value; 0 for empty or silent input.
    val peak = samples.maxOfOrNull { abs(it.toInt()) } ?: 0
    val gain = if (peak > 0) Short.MAX_VALUE.toFloat() / peak else null

    val output = ByteBuffer.allocate(sampleCount * 2).order(ByteOrder.LITTLE_ENDIAN)
    for (sample in samples) {
        output.putShort(if (gain != null) (sample * gain).toInt().toShort() else sample)
    }
    return output.array()
}
|
|
1093
|
+
|
|
1094
|
+
/**
 * Converts PCM between channel layouts by delegating to [AudioFormatUtils.convertChannels].
 *
 * The data is assumed to be 16-bit, which is the bit depth used by the rest of this
 * processing pipeline.
 */
private fun convertChannels(pcmData: ByteArray, originalChannels: Int, targetChannels: Int): ByteArray =
    AudioFormatUtils.convertChannels(pcmData, originalChannels, targetChannels, 16)
|
|
1099
|
+
|
|
1100
|
+
/**
 * Logs the first 44 bytes of [file] as hex, as ASCII and as decoded canonical WAV header
 * fields. Intended for debugging only: any failure (including a file shorter than
 * 44 bytes) is logged and swallowed.
 */
private fun debugWavHeader(file: File) {
    try {
        // Standard WAV header size
        val header = ByteArray(44)
        RandomAccessFile(file, "r").use { it.readFully(header) }

        val hexDump = header.joinToString(", ") { byte -> String.format("%02X", byte) }
        LogUtils.d(CLASS_NAME, "WAV Header Bytes: $hexDump")

        val asciiDump = header.map { it.toInt().toChar() }.joinToString("")
        LogUtils.d(CLASS_NAME, "ASCII: $asciiDump")

        // Numeric header fields are stored little-endian.
        val fields = ByteBuffer.wrap(header).order(ByteOrder.LITTLE_ENDIAN)
        LogUtils.d(CLASS_NAME, """
            RIFF header: ${String(header, 0, 4)}
            File size: ${fields.getInt(4)}
            WAVE header: ${String(header, 8, 4)}
            fmt header: ${String(header, 12, 4)}
            Chunk size: ${fields.getInt(16)}
            Audio format: ${fields.getShort(20)}
            Channels: ${fields.getShort(22)}
            Sample rate: ${fields.getInt(24)}
            Byte rate: ${fields.getInt(28)}
            Block align: ${fields.getShort(32)}
            Bits per sample: ${fields.getShort(34)}
        """.trimIndent())
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to debug WAV header: ${e.message}", e)
    }
}
|
|
1128
|
+
|
|
1129
|
+
/**
 * Builds a fixed-size amplitude/RMS preview of [audioData] over an optional time range.
 *
 * The range is divided into up to [numberOfPoints] equally sized segments of whole
 * audio frames. Working in frames (one sample per channel) and converting to byte
 * offsets only when slicing keeps every slice aligned to sample boundaries. The
 * previous implementation used frame indices directly as byte offsets, which produced
 * misaligned slices for 16/32-bit audio and covered only part of the requested range.
 *
 * Each [DataPoint] carries the peak amplitude, RMS, dB of the peak (which is
 * `-Infinity` for a fully silent segment), the segment time span in milliseconds and
 * its byte span within `audioData.data`.
 *
 * @param audioData decoded PCM and its format
 * @param numberOfPoints number of points requested; must be positive
 * @param startTimeMs optional range start in milliseconds (defaults to 0)
 * @param endTimeMs optional range end in milliseconds (clamped to the audio duration)
 * @param config recording configuration; only `segmentDurationMs` is read
 * @return the preview data
 * @throws IllegalArgumentException if the arguments are invalid or the range holds no samples
 * @throws IllegalStateException if a segment cannot be processed or no points were produced
 */
fun generatePreview(
    audioData: AudioData,
    numberOfPoints: Int,
    startTimeMs: Long? = null,
    endTimeMs: Long? = null,
    config: RecordingConfig
): AudioAnalysisData {
    require(numberOfPoints > 0) { "numberOfPoints must be positive, got: $numberOfPoints" }

    val totalDurationMs = audioData.durationMs

    LogUtils.d(CLASS_NAME, "Total audio duration: ${totalDurationMs}ms")

    // Validate time range
    if (startTimeMs != null) {
        require(startTimeMs >= 0) { "startTime must be non-negative, got: $startTimeMs" }
        require(startTimeMs <= totalDurationMs) { "startTime ($startTimeMs) is beyond audio duration ($totalDurationMs)" }
    }

    if (endTimeMs != null) {
        require(endTimeMs >= 0) { "endTime must be non-negative, got: $endTimeMs" }
        if (endTimeMs > totalDurationMs) {
            LogUtils.w(CLASS_NAME, "endTime ($endTimeMs) is beyond audio duration ($totalDurationMs), clamping to duration")
        }
        if (startTimeMs != null) {
            require(startTimeMs < endTimeMs) { "startTime ($startTimeMs) must be less than endTime ($endTimeMs)" }
        }
    }

    // Calculate effective range
    val effectiveStartMs = startTimeMs ?: 0L
    val effectiveEndMs = (endTimeMs ?: totalDurationMs).coerceAtMost(totalDurationMs)
    val durationMs = effectiveEndMs - effectiveStartMs

    LogUtils.d(CLASS_NAME, "Preview range: ${effectiveStartMs}ms to ${effectiveEndMs}ms (${durationMs}ms)")

    // Frame geometry. The byte width mirrors the converter chosen per bit depth below.
    val bytesPerSample = when (audioData.bitDepth) {
        16 -> 2
        32 -> 4
        else -> 1
    }
    val frameSize = bytesPerSample * audioData.channels.coerceAtLeast(1)
    val totalFrames = (audioData.data.size / frameSize).toLong()

    // Calculate the frame range, clamped to the data that is actually available.
    val startFrame = ((effectiveStartMs * audioData.sampleRate) / 1000).coerceAtMost(totalFrames).toInt()
    val endFrame = ((effectiveEndMs * audioData.sampleRate) / 1000).coerceAtMost(totalFrames).toInt()
    val framesInRange = endFrame - startFrame

    if (framesInRange <= 0) {
        throw IllegalArgumentException("Invalid sample range: contains no samples")
    }

    val framesPerPoint = (framesInRange / numberOfPoints).coerceAtLeast(1)

    val dataPoints = mutableListOf<DataPoint>()
    // Amplitude and RMS are non-negative, so these seeds are replaced by the first point.
    var minAmplitude = Float.MAX_VALUE
    var maxAmplitude = -Float.MAX_VALUE
    var minRms = Float.MAX_VALUE
    var maxRms = -Float.MAX_VALUE

    val extractionTimeMs = measureTimeMillis {
        for (i in 0 until numberOfPoints) {
            val pointStartFrame = startFrame + (i * framesPerPoint)
            val pointEndFrame = minOf(pointStartFrame + framesPerPoint, endFrame)

            if (pointStartFrame >= pointEndFrame) break

            try {
                // Byte offsets are whole multiples of the frame size, so samples stay aligned.
                val pointStartByte = pointStartFrame * frameSize
                val pointEndByte = pointEndFrame * frameSize
                val segmentBytes = audioData.data.copyOfRange(pointStartByte, pointEndByte)

                // Convert PCM bytes to float samples with proper bit depth handling
                val segmentData = when (audioData.bitDepth) {
                    16 -> convert16BitPcmToFloat(segmentBytes)
                    32 -> convert32BitPcmToFloat(segmentBytes)
                    else -> convert8BitPcmToFloat(segmentBytes)
                }

                // Time points in milliseconds, derived from the frame index and sample rate
                val startTimePoint = ((pointStartFrame * 1000L) / audioData.sampleRate).toFloat()
                val endTimePoint = ((pointEndFrame * 1000L) / audioData.sampleRate).toFloat()

                val meanSquare = segmentData.sumOf { (it * it).toDouble() } / segmentData.size
                val rms = sqrt(meanSquare.toFloat())
                val amplitude = segmentData.maxOf { abs(it) } // Always use peak amplitude

                minAmplitude = minOf(minAmplitude, amplitude)
                maxAmplitude = maxOf(maxAmplitude, amplitude)
                minRms = minOf(minRms, rms)
                maxRms = maxOf(maxRms, rms)

                dataPoints.add(DataPoint(
                    id = i.toLong(),
                    amplitude = amplitude, // Peak amplitude
                    rms = rms, // RMS value
                    dB = 20 * log10(amplitude.toDouble()).toFloat(),
                    silent = amplitude < 0.01,
                    features = null,
                    speech = null,
                    startTime = startTimePoint,
                    endTime = endTimePoint,
                    startPosition = pointStartByte,
                    endPosition = pointEndByte,
                    samples = segmentData.size
                ))
            } catch (e: Exception) {
                LogUtils.e(CLASS_NAME, "Error processing segment $i: ${e.message}")
                throw IllegalStateException("Failed to process audio segment: ${e.message}", e)
            }
        }
    }

    if (dataPoints.isEmpty()) {
        throw IllegalStateException("No data points were generated")
    }

    return AudioAnalysisData(
        segmentDurationMs = config.segmentDurationMs,
        durationMs = durationMs.toInt(),
        bitDepth = audioData.bitDepth,
        numberOfChannels = audioData.channels,
        sampleRate = audioData.sampleRate,
        samples = framesInRange,
        dataPoints = dataPoints,
        amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
        rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
        extractionTimeMs = extractionTimeMs.toFloat()
    )
}
|
|
1248
|
+
|
|
1249
|
+
// Add these conversion helpers
|
|
1250
|
+
/**
 * Decodes 16-bit little-endian PCM into floats scaled by [Short.MAX_VALUE]
 * (roughly the range -1.0 to 1.0). A trailing odd byte is ignored.
 */
private fun convert16BitPcmToFloat(bytes: ByteArray): FloatArray {
    val reader = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    return FloatArray(bytes.size / 2) { index ->
        reader.getShort(index * 2).toFloat() / Short.MAX_VALUE
    }
}
|
|
1255
|
+
|
|
1256
|
+
/**
 * Decodes 32-bit little-endian integer PCM into floats scaled by [Int.MAX_VALUE]
 * (roughly the range -1.0 to 1.0). Trailing bytes that do not form a full sample are ignored.
 */
private fun convert32BitPcmToFloat(bytes: ByteArray): FloatArray {
    val reader = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    return FloatArray(bytes.size / 4) { index ->
        reader.getInt(index * 4).toFloat() / Int.MAX_VALUE
    }
}
|
|
1261
|
+
|
|
1262
|
+
/**
 * Decodes unsigned 8-bit PCM (silence at 128) into floats in roughly -1.0 to 1.0.
 *
 * Kotlin's [Byte] is signed, so each byte is masked to its unsigned 0..255 value before
 * the 128 midpoint is subtracted. Without the mask every sample came out negative
 * (for example, silence decoded to about -2.0 instead of 0.0).
 */
private fun convert8BitPcmToFloat(bytes: ByteArray): FloatArray =
    FloatArray(bytes.size) { index ->
        ((bytes[index].toInt() and 0xFF) - 128).toFloat() / 127f
    }
|
|
1265
|
+
|
|
1266
|
+
/**
 * Loads the audio between [startTimeMs] and [endTimeMs] from [fileUri].
 *
 * Files with a `.wav` extension and a header accepted by `getWavHeaderSize` are read
 * directly through [loadWavRange]; everything else goes through
 * [loadCompressedAudioRange].
 *
 * @param config decoding options; when null, defaults of 16-bit output with no
 *   resampling, channel conversion or normalization are used
 * @return the loaded audio, or null if loading failed (the error is logged)
 */
fun loadAudioRange(fileUri: String, startTimeMs: Long, endTimeMs: Long, config: DecodingConfig? = null): AudioData? {
    return try {
        // Use default config if none provided
        val effectiveConfig = config ?: DecodingConfig(
            targetSampleRate = null,
            targetChannels = null,
            targetBitDepth = 16,
            normalizeAudio = false
        )

        // The header is only inspected for files that claim to be WAV by extension.
        val hasWavExtension = fileUri.lowercase().endsWith(".wav")
        val headerSize = if (hasWavExtension) getWavHeaderSize(fileUri) else null

        if (headerSize != null) {
            LogUtils.d(CLASS_NAME, "Loading WAV range with header size: $headerSize bytes")
            loadWavRange(fileUri, startTimeMs, endTimeMs, effectiveConfig, headerSize)
        } else {
            if (hasWavExtension) {
                LogUtils.w(CLASS_NAME, "File has .wav extension but invalid header, falling back to compressed loader")
            }
            LogUtils.d(CLASS_NAME, "Loading compressed audio range")
            loadCompressedAudioRange(fileUri, startTimeMs, endTimeMs, effectiveConfig)
        }
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to load audio range: ${e.message}", e)
        null
    }
}
|
|
1300
|
+
|
|
1301
|
+
/**
 * Reads the PCM payload of a WAV file between [startTimeMs] and [endTimeMs].
 *
 * Changes compared with the earlier implementation:
 * - byte offsets are rounded down to whole frames so a range never starts or ends in
 *   the middle of a sample;
 * - the range is clamped to the bytes present after the header, instead of returning
 *   a zero-padded buffer;
 * - the data is read with `seek` + `readFully`, because `InputStream.skip` and `read`
 *   may process fewer bytes than requested.
 *
 * @param fileUri path or `file://` URI; also resolved by file name against `filesDir`
 * @param startTimeMs range start in milliseconds (negative values are treated as 0)
 * @param endTimeMs range end in milliseconds
 * @param config decoding options; only `targetBitDepth` is applied here
 * @param headerSize number of bytes preceding the PCM payload
 * @return the loaded audio, or null on failure (the error is logged)
 */
private fun loadWavRange(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig,
    headerSize: Int
): AudioData? {
    try {
        val file = File(fileUri.removePrefix("file://")).takeIf { it.exists() }
            ?: File(filesDir, File(fileUri).name).takeIf { it.exists() }
            ?: throw IllegalArgumentException("File not found: $fileUri")

        // Use existing method to get audio format
        val format = getAudioFormat(fileUri) ?: throw IllegalArgumentException("Could not determine audio format")

        val frameSize = format.channels * (format.bitDepth / 8)
        val bytesPerSecond = format.sampleRate.toLong() * frameSize
        require(frameSize > 0 && bytesPerSecond > 0) {
            "Unsupported audio format: ${format.sampleRate}Hz, ${format.channels} channels, ${format.bitDepth}-bit"
        }

        // Bytes available after the header, trimmed to a whole number of frames.
        val payloadBytes = (file.length() - headerSize).coerceAtLeast(0L)
        val alignedPayloadBytes = payloadBytes - payloadBytes % frameSize
        // Upper bound on useful timestamps; also prevents overflow in the multiplication below.
        val maxTimeMs = payloadBytes * 1000 / bytesPerSecond + 1

        fun frameAlignedOffset(timeMs: Long): Long {
            val rawOffset = timeMs.coerceIn(0L, maxTimeMs) * bytesPerSecond / 1000
            return (rawOffset - rawOffset % frameSize).coerceAtMost(alignedPayloadBytes)
        }

        val startByteOffset = frameAlignedOffset(startTimeMs)
        val endByteOffset = frameAlignedOffset(endTimeMs).coerceAtLeast(startByteOffset)
        val rangeLength = endByteOffset - startByteOffset
        require(rangeLength <= Int.MAX_VALUE) { "Requested range is too large to load: $rangeLength bytes" }

        val startByte = headerSize + startByteOffset
        val endByte = headerSize + endByteOffset

        LogUtils.d(CLASS_NAME, """
            Loading WAV range:
            - headerSize: $headerSize
            - startByte: $startByte
            - endByte: $endByte
            - bytesPerSecond: $bytesPerSecond
        """.trimIndent())

        var audioDataBytes = ByteArray(rangeLength.toInt())
        RandomAccessFile(file, "r").use { raf ->
            raf.seek(startByte)
            raf.readFully(audioDataBytes)
        }

        // Report the requested duration unless clamping shortened the range.
        val loadedDurationMs = (rangeLength * 1000 + bytesPerSecond - 1) / bytesPerSecond
        val durationMs = minOf(endTimeMs - startTimeMs, loadedDurationMs).coerceAtLeast(0L)

        // Apply bit depth conversion if needed
        var effectiveBitDepth = format.bitDepth
        if (config.targetBitDepth != format.bitDepth) {
            audioDataBytes = AudioFormatUtils.convertBitDepth(
                audioDataBytes,
                format.bitDepth,
                config.targetBitDepth
            )
            effectiveBitDepth = config.targetBitDepth
            LogUtils.d(CLASS_NAME, "Converted bit depth from ${format.bitDepth} to ${config.targetBitDepth}")
        }

        return AudioData(
            data = audioDataBytes,
            sampleRate = format.sampleRate,
            channels = format.channels,
            bitDepth = effectiveBitDepth,
            durationMs = durationMs
        )
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to load WAV range: ${e.message}", e)
        return null
    }
}
|
|
1361
|
+
|
|
1362
|
+
/**
 * Decodes the audio between [startTimeMs] and [endTimeMs] from a compressed file using
 * [MediaExtractor] and [MediaCodec].
 *
 * Changes compared with the earlier implementation:
 * - input and output end-of-stream are tracked separately, so frames still buffered in
 *   the codec when the input ends are drained instead of dropped;
 * - decoded bytes go to a growable stream capped at the byte count expected for the
 *   requested duration, instead of a fixed buffer that threw `BufferOverflowException`
 *   when the decoder produced more than estimated (for example after seeking to an
 *   earlier sync frame);
 * - the result is labeled with the sample rate and channel count the decoder produced.
 *   No resampling or channel conversion is done here, so labeling the data with
 *   `config.targetSampleRate` / `config.targetChannels` described audio that did not exist;
 * - a non-16-bit `targetBitDepth` is applied through `AudioFormatUtils.convertBitDepth`,
 *   matching [loadWavRange];
 * - each cleanup step is guarded so a failing `stop()` cannot skip releasing the
 *   codec or the extractor.
 *
 * NOTE(review): this assumes the decoder emits 16-bit PCM, the Android default when no
 * PCM encoding is requested — confirm for all supported formats.
 *
 * @return the decoded audio, or null on failure (the error is logged)
 */
private fun loadCompressedAudioRange(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig
): AudioData? {
    val extractor = MediaExtractor()
    var decoder: MediaCodec? = null

    try {
        extractor.setDataSource(fileUri.removePrefix("file://"))
        val format = extractor.getTrackFormat(0)
        extractor.selectTrack(0)

        // Start from the container's values; updated if the codec reports a different output format.
        var decodedSampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE)
        var decodedChannels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT)
        val totalDurationUs = try {
            format.getLong(MediaFormat.KEY_DURATION)
        } catch (e: Exception) {
            (format.getString(MediaFormat.KEY_DURATION) ?: "-1").toLong()
        }
        LogUtils.d(CLASS_NAME, "Raw duration from format: ${totalDurationUs}us")

        val totalDurationMs = totalDurationUs / 1000
        LogUtils.d(CLASS_NAME, "Final duration: ${totalDurationMs}ms")

        // Calculate valid time range
        val validStartMs = startTimeMs.coerceIn(0L, totalDurationMs)
        val validEndMs = endTimeMs.coerceIn(validStartMs, totalDurationMs)
        val effectiveDurationMs = validEndMs - validStartMs
        val validEndUs = validEndMs * 1000

        // Initialize decoder
        val mime = requireNotNull(format.getString(MediaFormat.KEY_MIME)) { "Track 0 has no MIME type" }
        val codec = MediaCodec.createDecoderByType(mime)
        decoder = codec
        codec.configure(format, null, null, 0)
        codec.start()

        // Seek to start position if needed
        if (validStartMs > 0) {
            extractor.seekTo(validStartMs * 1000, MediaExtractor.SEEK_TO_CLOSEST_SYNC)
        }

        val decoderBitDepth = 16
        val targetBitDepth = config.targetBitDepth ?: decoderBitDepth

        // Largest number of decoded bytes to keep: the requested duration in whole frames.
        fun byteBudget(): Long {
            val frameSize = decodedChannels.toLong() * (decoderBitDepth / 8)
            if (frameSize <= 0) return 0L
            val bytes = effectiveDurationMs * decodedSampleRate * frameSize / 1000
            return (bytes - bytes % frameSize).coerceIn(0L, Int.MAX_VALUE.toLong())
        }

        LogUtils.d(CLASS_NAME, """
            Loading audio range:
            - start: ${validStartMs}ms
            - end: ${validEndMs}ms
            - duration: ${effectiveDurationMs}ms
            - bytes: ${byteBudget()}
            - format: ${decodedSampleRate}Hz, $decodedChannels channels, $targetBitDepth-bit
        """.trimIndent())

        val pcmStream = java.io.ByteArrayOutputStream()
        val bufferInfo = MediaCodec.BufferInfo()
        var inputDone = false
        var outputDone = false
        // Guards against a codec that never signals end-of-stream after the input ended.
        var idlePollsAfterInput = 0
        val maxIdlePollsAfterInput = 100

        while (!outputDone) {
            // Handle input
            if (!inputDone) {
                val inputBufferId = codec.dequeueInputBuffer(10000)
                if (inputBufferId >= 0) {
                    val inputBuffer = checkNotNull(codec.getInputBuffer(inputBufferId)) {
                        "Decoder returned no input buffer for index $inputBufferId"
                    }
                    val sampleSize = extractor.readSampleData(inputBuffer, 0)

                    if (sampleSize < 0 || extractor.sampleTime > validEndUs) {
                        // End of file or past the requested range: tell the codec to flush.
                        codec.queueInputBuffer(inputBufferId, 0, 0, 0, MediaCodec.BUFFER_FLAG_END_OF_STREAM)
                        inputDone = true
                    } else {
                        codec.queueInputBuffer(inputBufferId, 0, sampleSize, extractor.sampleTime, 0)
                        extractor.advance()
                    }
                }
            }

            // Handle output
            val outputBufferId = codec.dequeueOutputBuffer(bufferInfo, 10000)
            when {
                outputBufferId >= 0 -> {
                    idlePollsAfterInput = 0
                    if (bufferInfo.size > 0) {
                        val decodedBuffer = codec.getOutputBuffer(outputBufferId)
                        val remaining = (byteBudget() - pcmStream.size()).coerceAtLeast(0L)
                        val bytesToCopy = minOf(bufferInfo.size.toLong(), remaining).toInt()
                        if (decodedBuffer != null && bytesToCopy > 0) {
                            // Set buffer position and limit based on the decoded data
                            decodedBuffer.position(bufferInfo.offset)
                            decodedBuffer.limit(bufferInfo.offset + bytesToCopy)
                            val chunk = ByteArray(bytesToCopy)
                            decodedBuffer.get(chunk)
                            pcmStream.write(chunk, 0, bytesToCopy)
                        }
                    }
                    codec.releaseOutputBuffer(outputBufferId, false)

                    val reachedEndOfStream = (bufferInfo.flags and MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0
                    if (reachedEndOfStream || pcmStream.size() >= byteBudget()) {
                        outputDone = true
                    }
                }
                outputBufferId == MediaCodec.INFO_OUTPUT_FORMAT_CHANGED -> {
                    // The codec's real output format can differ from the container metadata.
                    val outputFormat = codec.outputFormat
                    if (outputFormat.containsKey(MediaFormat.KEY_SAMPLE_RATE)) {
                        decodedSampleRate = outputFormat.getInteger(MediaFormat.KEY_SAMPLE_RATE)
                    }
                    if (outputFormat.containsKey(MediaFormat.KEY_CHANNEL_COUNT)) {
                        decodedChannels = outputFormat.getInteger(MediaFormat.KEY_CHANNEL_COUNT)
                    }
                }
                inputDone -> {
                    idlePollsAfterInput++
                    if (idlePollsAfterInput >= maxIdlePollsAfterInput) {
                        LogUtils.w(CLASS_NAME, "Decoder did not signal end of stream, stopping drain")
                        outputDone = true
                    }
                }
            }
        }

        // Prepare the final byte array
        var audioData = pcmStream.toByteArray()
        if (targetBitDepth != decoderBitDepth) {
            audioData = AudioFormatUtils.convertBitDepth(audioData, decoderBitDepth, targetBitDepth)
            LogUtils.d(CLASS_NAME, "Converted bit depth from $decoderBitDepth to $targetBitDepth")
        }

        val requestedSampleRate = config.targetSampleRate
        val requestedChannels = config.targetChannels
        if ((requestedSampleRate != null && requestedSampleRate != decodedSampleRate) ||
            (requestedChannels != null && requestedChannels != decodedChannels)
        ) {
            LogUtils.w(
                CLASS_NAME,
                "Requested ${requestedSampleRate}Hz/$requestedChannels channels but returning decoded " +
                    "format ${decodedSampleRate}Hz/$decodedChannels channels"
            )
        }

        return AudioData(
            data = audioData,
            sampleRate = decodedSampleRate,
            channels = decodedChannels,
            bitDepth = targetBitDepth,
            durationMs = endTimeMs - startTimeMs // Use the actual time range
        ).also {
            LogUtils.d(CLASS_NAME, "Loaded compressed audio with duration: ${effectiveDurationMs}ms")
        }
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to load compressed audio range: ${e.message}", e)
        return null
    } finally {
        // Guard each step so one failure cannot prevent the remaining resources from being released.
        try {
            decoder?.stop()
        } catch (e: Exception) {
            LogUtils.w(CLASS_NAME, "Error stopping decoder: ${e.message}")
        }
        try {
            decoder?.release()
        } catch (e: Exception) {
            LogUtils.w(CLASS_NAME, "Error releasing decoder: ${e.message}")
        }
        try {
            extractor.release()
        } catch (e: Exception) {
            LogUtils.w(CLASS_NAME, "Error releasing extractor: ${e.message}")
        }
    }
}
|
|
1491
|
+
|
|
1492
|
+
// Future audio editing methods
|
|
1493
|
+
/**
 * Extracts the audio between [startTimeMs] and [endTimeMs] and writes it as a PCM WAV file.
 *
 * The 44-byte header is assembled in a little-endian buffer. The earlier version wrote
 * several fields with `RandomAccessFile.writeInt` / `writeShort`, which are big-endian,
 * so the RIFF size, fmt chunk size, byte rate, block align, bits-per-sample and data
 * size were stored byte-swapped and the file was not a valid WAV. The output file is
 * also truncated first so a longer pre-existing file cannot leave stale bytes behind.
 *
 * @param fileUri source audio file
 * @param startTimeMs range start in milliseconds
 * @param endTimeMs range end in milliseconds
 * @param config decoding options; a default [DecodingConfig] is used when null
 * @param outputFileName file name inside `filesDir`; generated from the current time when null
 * @return the trimmed audio, or null if loading or writing failed (the error is logged)
 */
fun trimAudio(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig? = null,
    outputFileName: String? = null
): AudioData? {
    try {
        // Load the specified range
        val audioData = loadAudioRange(fileUri, startTimeMs, endTimeMs, config ?: DecodingConfig())
            ?: return null

        // Generate output filename if not provided
        val outputFile = if (outputFileName != null) {
            File(filesDir, outputFileName)
        } else {
            val timestamp = System.currentTimeMillis()
            File(filesDir, "trimmed_${timestamp}.wav")
        }

        val durationMs = (endTimeMs - startTimeMs).toInt()

        LogUtils.d(CLASS_NAME, """
            Trimming audio:
            - start: ${startTimeMs}ms
            - end: ${endTimeMs}ms
            - duration: ${durationMs}ms
            - output: ${outputFile.name}
        """.trimIndent())

        val dataSize = audioData.data.size
        val bytesPerSample = audioData.bitDepth / 8
        val blockAlign = audioData.channels * bytesPerSample
        val byteRate = audioData.sampleRate * blockAlign

        // Canonical 44-byte PCM WAV header; every numeric field is little-endian.
        val header = ByteBuffer.allocate(44).order(ByteOrder.LITTLE_ENDIAN).apply {
            put("RIFF".toByteArray(Charsets.US_ASCII))
            putInt(dataSize + 36) // File size minus the 8-byte RIFF preamble
            put("WAVE".toByteArray(Charsets.US_ASCII))
            put("fmt ".toByteArray(Charsets.US_ASCII))
            putInt(16) // Subchunk1Size (16 for PCM)
            putShort(1) // AudioFormat (1 for PCM)
            putShort(audioData.channels.toShort())
            putInt(audioData.sampleRate)
            putInt(byteRate)
            putShort(blockAlign.toShort())
            putShort(audioData.bitDepth.toShort()) // BitsPerSample
            put("data".toByteArray(Charsets.US_ASCII))
            putInt(dataSize) // Subchunk2Size
        }.array()

        RandomAccessFile(outputFile, "rw").use { raf ->
            // Discard any previous content of an existing file.
            raf.setLength(0)
            raf.write(header)
            raf.write(audioData.data)
        }

        // Debug WAV header to verify
        debugWavHeader(outputFile)

        // Return the trimmed audio data
        return AudioData(
            data = audioData.data,
            sampleRate = audioData.sampleRate,
            channels = audioData.channels,
            bitDepth = audioData.bitDepth,
            durationMs = audioData.durationMs
        )
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to trim audio: ${e.message}", e)
        return null
    }
}
|
|
1576
|
+
|
|
1577
|
+
/**
 * Placeholder for removing the section between [startTimeMs] and [endTimeMs].
 *
 * TODO: Implement removing a section by concatenating before and after ranges.
 * This will use loadAudioRange to get two sections and join them.
 *
 * @return always null until implemented
 */
fun removeSection(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig? = null
): AudioData? = null
|
|
1587
|
+
|
|
1588
|
+
/**
 * Placeholder for concatenating several audio sections into one.
 *
 * TODO: Implement joining multiple audio sections.
 * This will be used by removeSection and other future editing features.
 *
 * @return always null until implemented
 */
fun joinAudioSections(
    sections: List<AudioData>,
    config: DecodingConfig? = null
): AudioData? = null
|
|
1596
|
+
|
|
1597
|
+
// Helper method for future editing features
|
|
1598
|
+
/**
 * Placeholder for converting [audioData] to another sample rate, channel count or bit depth.
 *
 * TODO: Implement audio format conversion.
 * This will help ensure consistent format when joining sections.
 *
 * @return [audioData] unchanged until implemented
 */
private fun convertAudioFormat(
    audioData: AudioData,
    targetSampleRate: Int? = null,
    targetChannels: Int? = null,
    targetBitDepth: Int? = null
): AudioData = audioData
|
|
1608
|
+
|
|
1609
|
+
// Add new function to process entire file
|
|
1610
|
+
/**
 * Computes basic features for the whole of [audioData], treating it as one segment.
 *
 * The sum of squares, zero-crossing count and minimum/maximum sample values are
 * computed here and passed to `computeFeatures` with an empty feature-option map, so
 * no optional (more expensive) features are requested.
 */
fun processEntireFile(audioData: AudioData): Features {
    val allSamples = convertToFloatArray(audioData.data, audioData.bitDepth)

    // Accumulate the energy in double precision before narrowing to Float.
    var energy = 0.0
    for (sample in allSamples) {
        energy += sample * sample.toDouble()
    }

    return computeFeatures(
        segmentData = allSamples,
        sampleRate = audioData.sampleRate.toFloat(),
        sumSquares = energy.toFloat(),
        zeroCrossings = countZeroCrossings(allSamples),
        segmentLength = allSamples.size,
        minAmplitude = allSamples.minOrNull() ?: 0f,
        maxAmplitude = allSamples.maxOrNull() ?: 0f,
        featureOptions = mapOf() // Dont compute complex features
    )
}
|
|
1632
|
+
|
|
1633
|
+
/**
 * Counts sign changes between consecutive samples.
 *
 * A pair counts only when the product of the two samples is strictly negative, so a
 * sample that is exactly zero never contributes a crossing.
 */
private fun countZeroCrossings(data: FloatArray): Int =
    (1 until data.size).count { index -> data[index - 1] * data[index] < 0 }
|
|
1640
|
+
|
|
1641
|
+
/** Converts a frequency in Hz to the mel scale using `2595 * log10(1 + hz / 700)`. */
private fun hzToMel(hz: Float): Float = 2595f * log10(hz / 700f + 1f)
|
|
1644
|
+
|
|
1645
|
+
/** Converts a mel-scale value back to Hz using `700 * (10^(mel / 2595) - 1)`. */
private fun melToHz(mel: Float): Float = (10f.pow(mel / 2595f) - 1f) * 700f
|
|
1648
|
+
|
|
1649
|
+
/**
 * Returns a copy of [samples] multiplied by a symmetric Hann window of the same length.
 *
 * A single-sample input is returned unchanged (window value 1). The general formula
 * divides by `size - 1`, which is zero for that case and previously produced NaN.
 * Empty input yields an empty array.
 */
private fun applyHannWindow(samples: FloatArray): FloatArray {
    if (samples.size == 1) return samples.copyOf()

    val lastIndex = samples.size - 1
    return FloatArray(samples.size) { i ->
        val multiplier = 0.5f * (1f - cos(2f * PI.toFloat() * i / lastIndex))
        samples[i] * multiplier
    }
}
|
|
1657
|
+
|
|
1658
|
+
// Generate a Hann window of a specific size (new, avoids modifying applyHannWindow)
|
|
1659
|
+
/**
 * Generates a symmetric Hann window with [size] coefficients.
 *
 * A window of size 1 is `[1]`. The general formula divides by `size - 1`, which is
 * zero for that case and previously produced NaN. A size of 0 yields an empty array.
 */
private fun generateHannWindow(size: Int): FloatArray {
    if (size == 1) return floatArrayOf(1f)

    return FloatArray(size) { i ->
        0.5f * (1f - cos(2f * PI.toFloat() * i / (size - 1)))
    }
}
|
|
1664
|
+
|
|
1665
|
+
// Main function to extract mel spectrogram (uses shared C++ implementation via JNI)
|
|
1666
|
+
/**
 * Computes a mel spectrogram for [audioData] by delegating to `MelSpectrogramNative.compute`.
 *
 * Window and hop lengths are given in milliseconds and converted to whole samples
 * (truncating) using the audio's sample rate. Frame timestamps are reported in
 * seconds as `frameIndex * hopLengthMs / 1000`.
 *
 * @param audioData decoded PCM and its format
 * @param windowSizeMs analysis window length in milliseconds
 * @param hopLengthMs hop between successive frames in milliseconds
 * @param nMels number of mel bins
 * @param fftLength FFT size
 * @param fMin lowest frequency of the mel filterbank in Hz
 * @param fMax highest frequency of the mel filterbank in Hz (defaults to half the sample rate)
 * @param windowType `"hann"` or `"hamming"` (case-insensitive)
 * @param logScaling forwarded to the native implementation as `logScale`
 * @param normalize forwarded to the native implementation as `normalize`
 * @return the spectrogram with per-frame timestamps and per-bin frequencies
 * @throws IllegalArgumentException if [windowType] is not supported
 */
fun extractMelSpectrogram(
    audioData: AudioData,
    windowSizeMs: Float = 25f, // Default 25ms window
    hopLengthMs: Float = 10f, // Default 10ms hop
    nMels: Int = 128, // Number of mel bins
    fftLength: Int = 2048, // FFT size
    fMin: Float = 0f, // Minimum frequency
    fMax: Float = audioData.sampleRate.toFloat() / 2, // Nyquist frequency
    windowType: String = "hann",
    logScaling: Boolean = true, // Apply log scaling
    normalize: Boolean = false // Normalize output
): SpectrogramData {
    val rate = audioData.sampleRate.toFloat()
    val pcm = convertToFloatArray(audioData.data, audioData.bitDepth)

    // Millisecond durations expressed as whole sample counts.
    val windowSamples = (windowSizeMs * rate / 1000).toInt()
    val hopSamples = (hopLengthMs * rate / 1000).toInt()

    // Integer codes understood by the native implementation.
    val windowCode = when (windowType.lowercase()) {
        "hann" -> 0
        "hamming" -> 1
        else -> throw IllegalArgumentException("Unsupported windowType: $windowType")
    }

    // Call shared C++ implementation via JNI
    val mel = MelSpectrogramNative.compute(
        samples = pcm,
        sampleRate = rate.toInt(),
        fftLength = fftLength,
        windowSizeSamples = windowSamples,
        hopLengthSamples = hopSamples,
        nMels = nMels,
        fMin = fMin,
        fMax = fMax,
        windowType = windowCode,
        logScale = logScaling,
        normalize = normalize
    )

    // Metadata: one timestamp per frame and one frequency per mel bin.
    val frameTimes = FloatArray(mel.size) { frame -> frame * hopLengthMs / 1000f }
    val binFrequencies = melFrequencies(nMels, fMin, fMax)

    return SpectrogramData(mel, frameTimes, binFrequencies)
}
|
|
1713
|
+
|
|
1714
|
+
/**
 * Computes a power spectrogram of [samples] with a short-time Fourier transform.
 *
 * Every frame takes up to [windowSize] samples starting at a multiple of [hopLength],
 * multiplies them element-wise by [window], zero-pads the result to [fftLength] and
 * stores `re * re + im * im` for bins `0..fftLength / 2`.
 *
 * @param samples Input signal.
 * @param fftLength FFT size; each frame is zero-padded to this length.
 * @param windowSize Number of samples taken per frame.
 * @param hopLength Distance in samples between consecutive frame starts; must be positive.
 * @param window Window coefficients, indexed from 0 within each frame.
 * @return One row per frame holding `fftLength / 2 + 1` power values; empty when the
 *         input is too short to yield a frame.
 * @throws IllegalArgumentException if [hopLength] is not positive.
 */
private fun computeSTFT(
    samples: FloatArray,
    fftLength: Int,
    windowSize: Int,
    hopLength: Int,
    window: FloatArray
): Array<FloatArray> {
    require(hopLength > 0) { "hopLength must be positive, was $hopLength" }

    // The raw count turns negative when the signal is shorter than the window by a hop
    // or more; clamp it so a short input yields an empty result instead of a
    // NegativeArraySizeException.
    val numFrames = (((samples.size - windowSize) / hopLength) + 1).coerceAtLeast(0)
    val binCount = fftLength / 2 + 1
    val stft = Array(numFrames) { FloatArray(binCount) }
    if (numFrames == 0) return stft

    val fft = FFT(fftLength)
    for (frameIdx in 0 until numFrames) {
        val start = frameIdx * hopLength
        val end = minOf(start + windowSize, samples.size)

        // Windowed copy of the frame; the tail stays zero (zero-padding).
        val frame = FloatArray(fftLength)
        for (i in start until end) {
            frame[i - start] = samples[i] * window[i - start]
        }

        // The FFT result is read as interleaved (re, im) pairs; pairs beyond the end of
        // the returned array are treated as zero.
        val fftResult = fft.processSegment(frame)
        val row = stft[frameIdx]
        for (i in 0 until binCount) {
            val real = if (2 * i < fftResult.size) fftResult[2 * i] else 0f
            val imag = if (2 * i + 1 < fftResult.size) fftResult[2 * i + 1] else 0f
            row[i] = real * real + imag * imag
        }
    }
    return stft
}
|
|
1747
|
+
|
|
1748
|
+
/**
 * Projects a power spectrogram onto a bank of triangular mel filters.
 *
 * @param stft Spectrogram rows (one per frame); all rows are read with the bin count
 *             of the first row.
 * @param sampleRate Sampling rate in Hz, forwarded to the filterbank construction.
 * @param nMels Number of mel bands to produce per frame.
 * @param fftLength FFT size the spectrogram was computed with.
 * @param fMin Lower edge of the filterbank in Hz.
 * @param fMax Upper edge of the filterbank in Hz.
 * @return One row per input frame with [nMels] values; empty when [stft] is empty.
 */
private fun applyMelFilterbank(
    stft: Array<FloatArray>,
    sampleRate: Float,
    nMels: Int,
    fftLength: Int,
    fMin: Float,
    fMax: Float
): Array<FloatArray> {
    // An empty spectrogram has no first row to take the bin count from.
    if (stft.isEmpty()) return emptyArray()

    val numBins = stft[0].size
    val melFilters = createMelFilterbank(sampleRate, fftLength, nMels, fMin, fMax)

    return Array(stft.size) { frame ->
        val spectrum = stft[frame]
        FloatArray(nMels) { melBin ->
            val filter = melFilters[melBin]
            var sum = 0f
            for (bin in 0 until numBins) {
                sum += spectrum[bin] * filter[bin]
            }
            sum
        }
    }
}
|
|
1773
|
+
|
|
1774
|
+
/**
 * Builds the triangular mel filterbank matrix.
 *
 * Band `k` rises linearly from the `k`-th to the `(k + 1)`-th mel-spaced edge frequency
 * and falls linearly to the `(k + 2)`-th; bins outside that span get weight zero.
 *
 * @param sampleRate Sampling rate in Hz.
 * @param fftLength FFT size; the matrix has `fftLength / 2 + 1` columns.
 * @param nMels Number of filters (rows).
 * @param fMin Lowest edge frequency in Hz.
 * @param fMax Highest edge frequency in Hz.
 * @return An `nMels x (fftLength / 2 + 1)` weight matrix.
 */
private fun createMelFilterbank(
    sampleRate: Float,
    fftLength: Int,
    nMels: Int,
    fMin: Float,
    fMax: Float
): Array<FloatArray> {
    val binCount = fftLength / 2 + 1
    // Centre frequency of every FFT bin.
    val binFrequencies = FloatArray(binCount) { it * sampleRate / fftLength }
    // nMels filters need nMels + 2 edge points (left foot, peak, right foot).
    val edges = melFrequencies(nMels + 2, fMin, fMax)

    return Array(nMels) { band ->
        val lower = edges[band]
        val center = edges[band + 1]
        val upper = edges[band + 2]

        FloatArray(binCount) { bin ->
            val freq = binFrequencies[bin]
            if (freq < lower || freq > upper) {
                0f
            } else if (freq <= center) {
                (freq - lower) / (center - lower)
            } else {
                (upper - freq) / (upper - center)
            }
        }
    }
}
|
|
1802
|
+
|
|
1803
|
+
/**
 * Generates [nMels] frequencies in Hz that are evenly spaced on the mel scale between
 * [fMin] and [fMax] (both inclusive when `nMels >= 2`).
 *
 * @param nMels Number of points to generate.
 * @param fMin Lowest frequency in Hz.
 * @param fMax Highest frequency in Hz.
 * @return The generated frequencies; a single point is placed at [fMin].
 */
private fun melFrequencies(nMels: Int, fMin: Float, fMax: Float): FloatArray {
    val melMin = hzToMel(fMin)
    val melMax = hzToMel(fMax)

    // With one point the spacing divisor (nMels - 1) is zero and the original formula
    // evaluated 0 / 0 = NaN; return the lower edge instead.
    if (nMels == 1) return floatArrayOf(melToHz(melMin))

    return FloatArray(nMels) { i ->
        melToHz(melMin + i * (melMax - melMin) / (nMels - 1))
    }
}
|
|
1813
|
+
|
|
1814
|
+
/**
 * Computes a single 128-band mel frame for [samples] through the native implementation.
 *
 * The native processor is (re)initialised on every call with a 2048-point FFT, a Hann
 * window, and a window/hop equal to the segment length capped at the FFT size.
 *
 * @param samples Audio segment to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return The mel band values, or an empty list when the native call reports failure.
 */
private fun computeMelSpectrogram(samples: FloatArray, sampleRate: Float): List<Float> {
    val melBands = 128
    val fftSize = 2048
    // Single frame: window and hop are the same length.
    val frameLength = minOf(samples.size, fftSize)

    MelSpectrogramNative.init(
        sampleRate = sampleRate.toInt(),
        fftLength = fftSize,
        windowSizeSamples = frameLength,
        hopLengthSamples = frameLength,
        nMels = melBands,
        fMin = 0f,
        fMax = sampleRate / 2f,
        windowType = 0 // Hann
    )

    val melFrame = FloatArray(melBands)
    if (!MelSpectrogramNative.computeFrame(samples, melFrame)) {
        return emptyList()
    }
    return melFrame.toList()
}
|
|
1835
|
+
|
|
1836
|
+
/**
 * Accumulates spectral magnitude into pitch-class bins (semitone offset from 440 Hz,
 * modulo 12).
 *
 * The magnitude spectrum returned by `prepareFFT` holds one magnitude per FFT bin
 * (`fftLength / 2 + 1` entries), so the bin width is derived from its size rather than
 * from a separate constant, and each entry is used directly as the bin magnitude.
 * The DC and Nyquist bins are skipped.
 *
 * @param samples Audio segment to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return `N_CHROMA` accumulated magnitudes (presumably 12 — confirm against the constant).
 */
private fun computeChroma(samples: FloatArray, sampleRate: Float): List<Float> {
    val (_, magnitudeSpectrum) = prepareFFT(samples, sampleRate)
    val chroma = FloatArray(N_CHROMA) { 0f }
    if (magnitudeSpectrum.size < 2) return chroma.toList()

    // magnitudeSpectrum has fftLength / 2 + 1 entries.
    val fftLength = (magnitudeSpectrum.size - 1) * 2
    val hzPerBin = sampleRate / fftLength

    for (bin in 1 until magnitudeSpectrum.size - 1) {
        val freq = bin * hzPerBin
        if (freq <= 0f) continue

        // Floor + positive modulo so that frequencies below 440 Hz (negative semitone
        // offsets) wrap into 0..11 instead of being discarded.
        val semitones = kotlin.math.floor(12 * log2(freq / 440.0)).toInt()
        val pitchClass = ((semitones % 12) + 12) % 12
        if (pitchClass < chroma.size) {
            chroma[pitchClass] += magnitudeSpectrum[bin]
        }
    }

    return chroma.toList()
}
|
|
1856
|
+
|
|
1857
|
+
/**
 * Placeholder for spectral-contrast extraction; the feature is not implemented.
 *
 * The parameters are accepted for interface compatibility but are not read. No FFT is
 * computed here, since the result would be discarded.
 *
 * @param samples Audio segment (currently unused).
 * @param sampleRate Sampling rate in Hz (currently unused).
 * @return Always an empty list.
 */
private fun computeSpectralContrast(samples: FloatArray, sampleRate: Float): List<Float> {
    // TODO: implement spectral contrast on top of the magnitude spectrum.
    return emptyList() // Placeholder
}
|
|
1863
|
+
|
|
1864
|
+
/**
 * Derives six tonal features from the chroma vector by summing fixed groups of three
 * pitch classes.
 *
 * Row `k` of the weight matrix selects pitch classes `k`, `k + 4` and `k + 7`
 * (modulo 12). Every row has exactly 12 columns so each weight addresses an existing
 * chroma bin.
 *
 * @param samples Audio segment to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return Six weighted sums, one per matrix row.
 */
private fun computeTonnetz(samples: FloatArray, sampleRate: Float): List<Float> {
    // First compute chroma features
    val chroma = computeChroma(samples, sampleRate)

    // Transformation matrix (6x12). Previously two rows had 13 and 16 columns, and the
    // third weight of row 4 sat at column 14 where no chroma bin exists, so it was
    // silently dropped; it is placed at column 11 to follow the k, k+4, k+7 pattern.
    val tonnetzMatrix = arrayOf(
        floatArrayOf(1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f, 0f, 0f), // classes 0, 4, 7
        floatArrayOf(0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f, 0f), // classes 1, 5, 8
        floatArrayOf(0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f, 0f), // classes 2, 6, 9
        floatArrayOf(0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f, 0f), // classes 3, 7, 10
        floatArrayOf(0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f, 1f), // classes 4, 8, 11
        floatArrayOf(1f, 0f, 0f, 0f, 0f, 1f, 0f, 0f, 0f, 1f, 0f, 0f)  // classes 5, 9, 0
    )

    // Weighted sum per row; chroma bins that do not exist count as zero.
    return tonnetzMatrix.map { row ->
        var sum = 0f
        for (i in row.indices) {
            sum += row[i] * (chroma.getOrNull(i) ?: 0f)
        }
        sum
    }
}
|
|
1890
|
+
|
|
1891
|
+
/**
 * Returns the smallest power of two that is greater than or equal to [n].
 *
 * Values of [n] at or below 1 (including negatives) yield 1.
 *
 * @param n Lower bound for the result.
 * @return The smallest power of two `>= n`.
 * @throws IllegalArgumentException if [n] exceeds 2^30, the largest power of two that
 *         fits in an `Int` (the previous doubling loop overflowed and never terminated
 *         for such inputs).
 */
private fun nextPowerOfTwo(n: Int): Int {
    if (n <= 1) return 1

    val largestPower = 1 shl 30
    require(n <= largestPower) { "No Int power of two is >= $n" }

    // highestOneBit(n - 1) is the largest power of two strictly below n (for n >= 2),
    // so doubling it gives the smallest power of two that is >= n.
    return Integer.highestOneBit(n - 1) shl 1
}
|
|
1898
|
+
|
|
1899
|
+
/**
 * Estimates the pitch of [segment] from its FFT-based autocorrelation.
 *
 * The segment is Hann-windowed, zero-padded, transformed, converted to a power
 * spectrum and inverse-transformed; the strongest local maximum of the normalised
 * autocorrelation whose lag corresponds to 50-500 Hz and whose value exceeds 0.3
 * determines the result.
 *
 * @param segment Audio samples to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return `sampleRate / lag` for the selected peak, or 0 when the segment has fewer
 *         than two samples, carries no energy, an FFT step fails, or no peak qualifies.
 */
private fun estimatePitch(segment: FloatArray, sampleRate: Float): Float {
    if (segment.size < 2) return 0.0f

    // Apply Hann window
    val windowed = applyHannWindow(segment)

    // Zero-pad to a power of two of at least twice the segment length.
    val fftLength = nextPowerOfTwo(segment.size * 2)
    val padded = FloatArray(fftLength)
    windowed.copyInto(padded)

    // Forward FFT (in place on `padded`).
    val fft = FFT(fftLength)
    try {
        fft.realForward(padded)
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "FFT forward transform failed: ${e.message}")
        return 0.0f
    }

    // Power spectrum. padded[0] and padded[1] are read as the DC and Nyquist values;
    // the remaining entries are read as interleaved (re, im) pairs.
    val powerSpectrum = FloatArray(fftLength)
    try {
        powerSpectrum[0] = padded[0] * padded[0]
        powerSpectrum[fftLength / 2] = padded[1] * padded[1]

        for (i in 1 until fftLength / 2) {
            val re = padded[2 * i]
            val im = padded[2 * i + 1]
            powerSpectrum[i] = re * re + im * im
            powerSpectrum[fftLength - i] = powerSpectrum[i] // Mirror for inverse FFT
        }
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Power spectrum computation failed: ${e.message}")
        return 0.0f
    }

    // Inverse FFT of the power spectrum gives the autocorrelation.
    val autocorrelation = FloatArray(fftLength)
    try {
        fft.realInverse(powerSpectrum, autocorrelation)
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "FFT inverse transform failed: ${e.message}")
        return 0.0f
    }

    // A zero, negative, NaN or infinite zero-lag value (e.g. digital silence) would
    // turn the normalisation below into Inf/NaN arithmetic; there is no pitch to report.
    val zeroLag = autocorrelation[0]
    if (!(zeroLag > 0f) || zeroLag.isInfinite()) return 0.0f

    // Normalise by the zero-lag autocorrelation.
    val normFactor = 1.0f / zeroLag
    for (i in autocorrelation.indices) {
        autocorrelation[i] *= normFactor
    }

    // Lag range for pitches between 500 Hz (shortest lag) and 50 Hz (longest lag).
    val minLag = (sampleRate / 500.0f).toInt().coerceAtLeast(1)
    val maxLag = (sampleRate / 50.0f).toInt().coerceAtMost(autocorrelation.size - 1)

    val threshold = 0.3f // Correlation threshold
    var maxCorr = -1.0f
    var pitchLag = 0

    for (lag in minLag..maxLag) {
        // Both neighbours must exist for the local-maximum test.
        if (lag > 0 && lag < autocorrelation.size - 1) {
            val value = autocorrelation[lag]
            val isPeak = value > autocorrelation[lag - 1] &&
                value > autocorrelation[lag + 1] &&
                value > threshold

            if (isPeak && value > maxCorr) {
                maxCorr = value
                pitchLag = lag
            }
        }
    }

    return if (pitchLag > 0) sampleRate / pitchLag else 0.0f
}
|
|
1980
|
+
|
|
1981
|
+
/**
 * Applies a Hann window, zero-pads (or truncates) to [fftLength], runs a forward real
 * FFT and derives the magnitude and power spectra.
 *
 * The transformed buffer is read with `padded[0]` as the DC value, `padded[1]` as the
 * Nyquist value and `padded[2 * i]`, `padded[2 * i + 1]` as the real and imaginary parts
 * of bin `i` (this matches how the Nyquist bin was already read here — confirm against
 * the FFT implementation). Bin 0 therefore uses `padded[0]` only, instead of mixing in
 * the Nyquist value.
 *
 * @param samples Input audio samples
 * @param sampleRate Sampling rate in Hz (not used by the computation)
 * @param fftLength FFT size (must be power of 2); defaults to the next power of two
 *                  that is at least `max(samples.size, 2048)`
 * @return Pair of power spectrum and magnitude spectrum, each with
 *         `fftLength / 2 + 1` entries
 */
private fun prepareFFT(
    samples: FloatArray,
    sampleRate: Float,
    fftLength: Int = nextPowerOfTwo(samples.size.coerceAtLeast(2048))
): Pair<FloatArray, FloatArray> {
    val windowed = applyHannWindow(samples)
    val padded = windowed.copyOf(fftLength)
    val fft = FFT(fftLength)
    fft.realForward(padded)

    val nyquistBin = fftLength / 2
    val magnitudeSpectrum = FloatArray(nyquistBin + 1)

    // DC and Nyquist are purely real and stored in the first two slots.
    magnitudeSpectrum[0] = abs(padded[0])
    for (i in 1 until nyquistBin) {
        val re = padded[2 * i]
        val im = padded[2 * i + 1]
        magnitudeSpectrum[i] = sqrt(re * re + im * im)
    }
    magnitudeSpectrum[nyquistBin] = abs(padded[1])

    val powerSpectrum = FloatArray(magnitudeSpectrum.size) { bin ->
        magnitudeSpectrum[bin] * magnitudeSpectrum[bin]
    }
    return Pair(powerSpectrum, magnitudeSpectrum)
}
|
|
2005
|
+
|
|
2006
|
+
/**
 * Basic description of an audio stream's PCM layout.
 *
 * @property sampleRate Sampling rate in Hz.
 * @property channels Number of audio channels.
 * @property bitDepth Bits per sample.
 */
data class AudioFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitDepth: Int
)
|
|
2011
|
+
|
|
2012
|
+
/**
 * Reads the sample rate and channel count of an audio file.
 *
 * The path is tried as given (after stripping a `file://` prefix) and then by file
 * name inside `filesDir`. The first track whose MIME type starts with `audio/` is
 * used, so containers that list another track first are handled the same way as in
 * `decodeAudioRangeToPCM`.
 *
 * @param fileUri Path or `file://` URI of the audio file.
 * @return The detected format with a fixed bit depth of 16, or null when the file is
 *         missing, contains no audio track, or cannot be parsed.
 */
fun getAudioFormat(fileUri: String): AudioFormat? {
    val cleanUri = fileUri.removePrefix("file://")
    val file = File(cleanUri).takeIf { it.exists() }
        ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            LogUtils.e(CLASS_NAME, "File not found: $cleanUri")
            return null
        }

    val extractor = MediaExtractor()
    try {
        extractor.setDataSource(file.absolutePath)

        // Track 0 is not guaranteed to be audio; pick the first audio track.
        val audioTrack = (0 until extractor.trackCount).firstOrNull { index ->
            extractor.getTrackFormat(index).getString(MediaFormat.KEY_MIME)?.startsWith("audio/") == true
        }
        if (audioTrack == null) {
            LogUtils.e(CLASS_NAME, "No audio track found in: $cleanUri")
            return null
        }

        val format = extractor.getTrackFormat(audioTrack)
        return AudioFormat(
            sampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE),
            channels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT),
            bitDepth = 16 // Most compressed formats decode to 16-bit PCM
        )
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to get audio format: ${e.message}", e)
        return null
    } finally {
        extractor.release()
    }
}
|
|
2036
|
+
|
|
2037
|
+
/**
 * Gets the size of the audio file header.
 * For WAV files, this includes the RIFF header and all metadata chunks before the data chunk.
 * For other formats, this will return null as header size handling is format-specific.
 *
 * @param fileUri The URI of the audio file to analyze
 * @return The size of the header in bytes (the offset of the first audio byte), or null if:
 *         - The file is not a WAV file
 *         - The file cannot be read
 *         - The file format is invalid
 *         - The data chunk cannot be found
 *
 * WAV File Structure:
 * - RIFF header (12 bytes)
 *   - "RIFF" identifier (4 bytes)
 *   - File size (4 bytes)
 *   - "WAVE" identifier (4 bytes)
 * - Format chunk ("fmt ") (24 bytes typically)
 * - Optional metadata chunks (variable size)
 *   - LIST (metadata like artist, title)
 *   - JUNK (padding)
 *   - fact (additional format info)
 *   - cue (cue points)
 * - Data chunk
 *   - "data" identifier (4 bytes)
 *   - Chunk size (4 bytes)
 *   - Actual audio data
 *
 * Chunk sizes are little-endian unsigned 32-bit values, and a chunk with an odd size is
 * followed by one pad byte that is not included in its size field.
 */
fun getWavHeaderSize(fileUri: String): Int? {
    // Reads exactly `count` bytes into `target`; returns false if the stream ends first.
    fun readExactly(input: FileInputStream, target: ByteArray, count: Int): Boolean {
        var filled = 0
        while (filled < count) {
            val read = input.read(target, filled, count - filled)
            if (read < 0) return false
            filled += read
        }
        return true
    }

    val cleanUri = fileUri.removePrefix("file://")
    val file = File(cleanUri).takeIf { it.exists() }
        ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            LogUtils.e(CLASS_NAME, "File not found: $cleanUri")
            return null
        }

    return try {
        // `use` closes the stream on every exit path, including early returns.
        FileInputStream(file).use { input ->
            val buffer = ByteArray(12)

            // Read RIFF header
            if (!readExactly(input, buffer, 12)) {
                LogUtils.e(CLASS_NAME, "Failed to read RIFF header")
                return null
            }

            // Verify RIFF header
            val riffId = String(buffer, 0, 4, Charsets.US_ASCII)
            val waveId = String(buffer, 8, 4, Charsets.US_ASCII)
            if (riffId != "RIFF" || waveId != "WAVE") {
                LogUtils.e(CLASS_NAME, "Invalid WAV file format")
                return null
            }

            var headerSize = 12L

            // Walk the chunk list until the data chunk is found.
            while (true) {
                if (!readExactly(input, buffer, 8)) {
                    // No data chunk: report failure rather than a partial offset.
                    LogUtils.e(CLASS_NAME, "Unexpected end of file while reading chunks")
                    return null
                }

                val chunkId = String(buffer, 0, 4, Charsets.US_ASCII)
                // Little-endian unsigned 32-bit size, kept in a Long so it cannot go negative.
                val chunkSize = (buffer[4].toLong() and 0xFF) or
                    ((buffer[5].toLong() and 0xFF) shl 8) or
                    ((buffer[6].toLong() and 0xFF) shl 16) or
                    ((buffer[7].toLong() and 0xFF) shl 24)
                LogUtils.d(CLASS_NAME, "Found chunk: $chunkId, size: $chunkSize")

                headerSize += 8 // Chunk header (id + size)
                if (chunkId == "data") {
                    LogUtils.d(CLASS_NAME, "Found data chunk at offset: $headerSize")
                    break
                }

                // Odd-sized chunks are followed by a pad byte.
                val paddedSize = chunkSize + (chunkSize and 1L)
                var remaining = paddedSize
                while (remaining > 0) {
                    // skip() may advance by fewer bytes than requested.
                    val skipped = input.skip(remaining)
                    if (skipped <= 0) {
                        LogUtils.e(CLASS_NAME, "Unexpected end of file while reading chunks")
                        return null
                    }
                    remaining -= skipped
                }
                headerSize += paddedSize
            }

            if (headerSize > Int.MAX_VALUE) {
                LogUtils.e(CLASS_NAME, "Invalid WAV file format")
                return null
            }

            LogUtils.d(CLASS_NAME, "Total WAV header size: $headerSize bytes")
            headerSize.toInt()
        }
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Error calculating WAV header size: ${e.message}", e)
        null
    }
}
|
|
2126
|
+
|
|
2127
|
+
/**
 * Decodes a specific time range of an audio file directly to PCM data.
 * This is more efficient than decoding the entire file when only a portion is needed.
 *
 * Decoding starts at the sync sample preceding [startTimeMs], stops feeding input once
 * a sample past [endTimeMs] is reached, drains the decoder until it signals end of
 * stream, and finally trims the decoded audio to the requested range on whole-frame
 * boundaries (one frame = one 16-bit sample per channel).
 *
 * @param fileUri Location of the audio file, passed to `MediaExtractor.setDataSource`.
 * @param startTimeMs Start of the range in milliseconds.
 * @param endTimeMs End of the range in milliseconds.
 * @return The decoded range as 16-bit PCM, or null when the file has no audio track,
 *         nothing could be decoded, or decoding fails.
 */
fun decodeAudioRangeToPCM(fileUri: String, startTimeMs: Long, endTimeMs: Long): AudioData? {
    val extractor = MediaExtractor()
    var decoder: android.media.MediaCodec? = null

    try {
        extractor.setDataSource(fileUri)
        val trackIndex = (0 until extractor.trackCount).find {
            extractor.getTrackFormat(it).getString(MediaFormat.KEY_MIME)?.startsWith("audio/") == true
        } ?: return null

        extractor.selectTrack(trackIndex)
        val format = extractor.getTrackFormat(trackIndex)
        val mime = format.getString(MediaFormat.KEY_MIME) ?: return null

        val sampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE)
        val channels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT)
        if (sampleRate <= 0 || channels <= 0) return null

        val codec = android.media.MediaCodec.createDecoderByType(mime)
        decoder = codec
        codec.configure(format, null, null, 0)
        codec.start()

        extractor.seekTo(startTimeMs * 1000, MediaExtractor.SEEK_TO_PREVIOUS_SYNC)

        // Primitive byte sink; a List<Byte> would box every decoded byte.
        val pcmStream = java.io.ByteArrayOutputStream()
        val bufferInfo = android.media.MediaCodec.BufferInfo()
        var inputDone = false
        var outputDone = false
        var firstBufferTimeUs: Long? = null
        // Safety valve: consecutive empty polls tolerated after input has ended.
        val maxIdlePolls = 100
        var idlePolls = 0

        while (!outputDone) {
            if (!inputDone) {
                val inputBufferId = codec.dequeueInputBuffer(10000)
                if (inputBufferId >= 0) {
                    val inputBuffer = codec.getInputBuffer(inputBufferId)
                    val sampleSize = if (inputBuffer != null) extractor.readSampleData(inputBuffer, 0) else -1
                    if (sampleSize < 0 || extractor.sampleTime > endTimeMs * 1000) {
                        codec.queueInputBuffer(
                            inputBufferId, 0, 0, 0,
                            android.media.MediaCodec.BUFFER_FLAG_END_OF_STREAM
                        )
                        inputDone = true
                    } else {
                        codec.queueInputBuffer(inputBufferId, 0, sampleSize, extractor.sampleTime, 0)
                        extractor.advance()
                    }
                }
            }

            // Keep draining after input ends: the decoder may still hold frames that
            // belong to the requested range.
            val outputBufferId = codec.dequeueOutputBuffer(bufferInfo, 10000)
            if (outputBufferId >= 0) {
                idlePolls = 0
                val outputBuffer = codec.getOutputBuffer(outputBufferId)
                if (outputBuffer != null && bufferInfo.size > 0) {
                    if (firstBufferTimeUs == null) firstBufferTimeUs = bufferInfo.presentationTimeUs
                    // The valid payload is [offset, offset + size) of the buffer.
                    outputBuffer.position(bufferInfo.offset)
                    outputBuffer.limit(bufferInfo.offset + bufferInfo.size)
                    val chunk = ByteArray(bufferInfo.size)
                    outputBuffer.get(chunk)
                    pcmStream.write(chunk, 0, chunk.size)
                }
                codec.releaseOutputBuffer(outputBufferId, false)
                if ((bufferInfo.flags and android.media.MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) {
                    outputDone = true
                }
            } else if (inputDone && outputBufferId == android.media.MediaCodec.INFO_TRY_AGAIN_LATER) {
                idlePolls++
                if (idlePolls >= maxIdlePolls) {
                    LogUtils.w(CLASS_NAME, "Decoder did not signal end of stream; using decoded data so far")
                    outputDone = true
                }
            }
        }

        // If we didn't get any data or first buffer time, return null
        val pcmBytes = pcmStream.toByteArray()
        val firstTimeUs = firstBufferTimeUs
        if (pcmBytes.isEmpty() || firstTimeUs == null) {
            return null
        }

        // Trim PCM data to the exact time range, in whole frames so that interleaved
        // channels stay aligned.
        val bytesPerSample = 2 // 16-bit PCM
        val bytesPerFrame = bytesPerSample * channels
        val totalFrames = pcmBytes.size / bytesPerFrame
        val usPerFrame = 1_000_000.0 / sampleRate // Duration of one frame in microseconds

        val startFrame = ((startTimeMs * 1000 - firstTimeUs) / usPerFrame).toInt().coerceIn(0, totalFrames)
        val endFrame = ((endTimeMs * 1000 - firstTimeUs) / usPerFrame).toInt().coerceIn(startFrame, totalFrames)

        val trimmedBytes = pcmBytes.copyOfRange(startFrame * bytesPerFrame, endFrame * bytesPerFrame)

        return AudioData(
            data = trimmedBytes,
            sampleRate = sampleRate,
            channels = channels,
            bitDepth = 16, // MediaCodec typically decodes to 16-bit PCM
            durationMs = endTimeMs - startTimeMs
        )
    } catch (e: Exception) {
        LogUtils.e(CLASS_NAME, "Failed to decode audio range: ${e.message}", e)
        return null
    } finally {
        // stop() throws if the codec never started; release() must still run.
        decoder?.let { codec ->
            try {
                codec.stop()
            } catch (e: Exception) {
                LogUtils.w(CLASS_NAME, "Error stopping decoder: ${e.message}")
            }
            try {
                codec.release()
            } catch (e: Exception) {
                LogUtils.w(CLASS_NAME, "Error releasing decoder: ${e.message}")
            }
        }

        try {
            extractor.release()
        } catch (e: Exception) {
            LogUtils.w(CLASS_NAME, "Error releasing extractor: ${e.message}")
        }
    }
}
|
|
2237
|
+
}
|