@siteed/expo-audio-stream 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187):
  1. package/README.md +40 -222
  2. package/build/index.d.ts +11 -15
  3. package/build/index.js +44 -14
  4. package/package.json +49 -110
  5. package/src/index.ts +18 -32
  6. package/CHANGELOG.md +0 -206
  7. package/android/build.gradle +0 -105
  8. package/android/src/main/AndroidManifest.xml +0 -27
  9. package/android/src/main/java/net/siteed/audiostream/AudioAnalysisData.kt +0 -166
  10. package/android/src/main/java/net/siteed/audiostream/AudioDataEncoder.kt +0 -9
  11. package/android/src/main/java/net/siteed/audiostream/AudioFileHandler.kt +0 -131
  12. package/android/src/main/java/net/siteed/audiostream/AudioFormatUtils.kt +0 -103
  13. package/android/src/main/java/net/siteed/audiostream/AudioNotificationsManager.kt +0 -435
  14. package/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt +0 -2235
  15. package/android/src/main/java/net/siteed/audiostream/AudioRecorderManager.kt +0 -1437
  16. package/android/src/main/java/net/siteed/audiostream/AudioRecordingService.kt +0 -152
  17. package/android/src/main/java/net/siteed/audiostream/AudioTrimmer.kt +0 -1099
  18. package/android/src/main/java/net/siteed/audiostream/Constants.kt +0 -21
  19. package/android/src/main/java/net/siteed/audiostream/EventSender.kt +0 -7
  20. package/android/src/main/java/net/siteed/audiostream/ExpoAudioStreamModule.kt +0 -739
  21. package/android/src/main/java/net/siteed/audiostream/FFT.kt +0 -99
  22. package/android/src/main/java/net/siteed/audiostream/Features.kt +0 -98
  23. package/android/src/main/java/net/siteed/audiostream/NotificationConfig.kt +0 -70
  24. package/android/src/main/java/net/siteed/audiostream/PermissionUtils.kt +0 -59
  25. package/android/src/main/java/net/siteed/audiostream/RecordingActionReceiver.kt +0 -59
  26. package/android/src/main/java/net/siteed/audiostream/RecordingConfig.kt +0 -205
  27. package/android/src/main/java/net/siteed/audiostream/WaveformConfig.kt +0 -19
  28. package/android/src/main/java/net/siteed/audiostream/WaveformRenderer.kt +0 -159
  29. package/android/src/main/res/drawable/ic_default_action_icon.xml +0 -16
  30. package/android/src/main/res/drawable/ic_microphone.xml +0 -13
  31. package/android/src/main/res/drawable/ic_pause.xml +0 -10
  32. package/android/src/main/res/drawable/ic_play.xml +0 -10
  33. package/android/src/main/res/drawable/ic_stop.xml +0 -10
  34. package/android/src/main/res/layout/notification_recording.xml +0 -37
  35. package/android/src/main/test/java/net/siteed/audiostream/AudioProcessorTest.kt +0 -56
  36. package/app.plugin.js +0 -1
  37. package/build/AudioAnalysis/AudioAnalysis.types.d.ts +0 -179
  38. package/build/AudioAnalysis/AudioAnalysis.types.d.ts.map +0 -1
  39. package/build/AudioAnalysis/AudioAnalysis.types.js +0 -3
  40. package/build/AudioAnalysis/AudioAnalysis.types.js.map +0 -1
  41. package/build/AudioAnalysis/extractAudioAnalysis.d.ts +0 -68
  42. package/build/AudioAnalysis/extractAudioAnalysis.d.ts.map +0 -1
  43. package/build/AudioAnalysis/extractAudioAnalysis.js +0 -203
  44. package/build/AudioAnalysis/extractAudioAnalysis.js.map +0 -1
  45. package/build/AudioAnalysis/extractAudioData.d.ts +0 -3
  46. package/build/AudioAnalysis/extractAudioData.d.ts.map +0 -1
  47. package/build/AudioAnalysis/extractAudioData.js +0 -5
  48. package/build/AudioAnalysis/extractAudioData.js.map +0 -1
  49. package/build/AudioAnalysis/extractMelSpectrogram.d.ts +0 -14
  50. package/build/AudioAnalysis/extractMelSpectrogram.d.ts.map +0 -1
  51. package/build/AudioAnalysis/extractMelSpectrogram.js +0 -85
  52. package/build/AudioAnalysis/extractMelSpectrogram.js.map +0 -1
  53. package/build/AudioAnalysis/extractPreview.d.ts +0 -11
  54. package/build/AudioAnalysis/extractPreview.d.ts.map +0 -1
  55. package/build/AudioAnalysis/extractPreview.js +0 -25
  56. package/build/AudioAnalysis/extractPreview.js.map +0 -1
  57. package/build/AudioAnalysis/extractWaveform.d.ts +0 -8
  58. package/build/AudioAnalysis/extractWaveform.d.ts.map +0 -1
  59. package/build/AudioAnalysis/extractWaveform.js +0 -11
  60. package/build/AudioAnalysis/extractWaveform.js.map +0 -1
  61. package/build/AudioRecorder.provider.d.ts +0 -11
  62. package/build/AudioRecorder.provider.d.ts.map +0 -1
  63. package/build/AudioRecorder.provider.js +0 -37
  64. package/build/AudioRecorder.provider.js.map +0 -1
  65. package/build/ExpoAudioStream.native.d.ts +0 -3
  66. package/build/ExpoAudioStream.native.d.ts.map +0 -1
  67. package/build/ExpoAudioStream.native.js +0 -6
  68. package/build/ExpoAudioStream.native.js.map +0 -1
  69. package/build/ExpoAudioStream.types.d.ts +0 -532
  70. package/build/ExpoAudioStream.types.d.ts.map +0 -1
  71. package/build/ExpoAudioStream.types.js +0 -2
  72. package/build/ExpoAudioStream.types.js.map +0 -1
  73. package/build/ExpoAudioStream.web.d.ts +0 -59
  74. package/build/ExpoAudioStream.web.d.ts.map +0 -1
  75. package/build/ExpoAudioStream.web.js +0 -285
  76. package/build/ExpoAudioStream.web.js.map +0 -1
  77. package/build/ExpoAudioStreamModule.d.ts +0 -3
  78. package/build/ExpoAudioStreamModule.d.ts.map +0 -1
  79. package/build/ExpoAudioStreamModule.js +0 -693
  80. package/build/ExpoAudioStreamModule.js.map +0 -1
  81. package/build/WebRecorder.web.d.ts +0 -119
  82. package/build/WebRecorder.web.d.ts.map +0 -1
  83. package/build/WebRecorder.web.js +0 -436
  84. package/build/WebRecorder.web.js.map +0 -1
  85. package/build/constants.d.ts +0 -11
  86. package/build/constants.d.ts.map +0 -1
  87. package/build/constants.js +0 -14
  88. package/build/constants.js.map +0 -1
  89. package/build/events.d.ts +0 -26
  90. package/build/events.d.ts.map +0 -1
  91. package/build/events.js +0 -21
  92. package/build/events.js.map +0 -1
  93. package/build/index.d.ts.map +0 -1
  94. package/build/index.js.map +0 -1
  95. package/build/trimAudio.d.ts +0 -25
  96. package/build/trimAudio.d.ts.map +0 -1
  97. package/build/trimAudio.js +0 -67
  98. package/build/trimAudio.js.map +0 -1
  99. package/build/useAudioRecorder.d.ts +0 -21
  100. package/build/useAudioRecorder.d.ts.map +0 -1
  101. package/build/useAudioRecorder.js +0 -427
  102. package/build/useAudioRecorder.js.map +0 -1
  103. package/build/utils/BlobFix.d.ts +0 -9
  104. package/build/utils/BlobFix.d.ts.map +0 -1
  105. package/build/utils/BlobFix.js +0 -498
  106. package/build/utils/BlobFix.js.map +0 -1
  107. package/build/utils/audioProcessing.d.ts +0 -24
  108. package/build/utils/audioProcessing.d.ts.map +0 -1
  109. package/build/utils/audioProcessing.js +0 -133
  110. package/build/utils/audioProcessing.js.map +0 -1
  111. package/build/utils/concatenateBuffers.d.ts +0 -8
  112. package/build/utils/concatenateBuffers.d.ts.map +0 -1
  113. package/build/utils/concatenateBuffers.js +0 -21
  114. package/build/utils/concatenateBuffers.js.map +0 -1
  115. package/build/utils/convertPCMToFloat32.d.ts +0 -13
  116. package/build/utils/convertPCMToFloat32.d.ts.map +0 -1
  117. package/build/utils/convertPCMToFloat32.js +0 -120
  118. package/build/utils/convertPCMToFloat32.js.map +0 -1
  119. package/build/utils/encodingToBitDepth.d.ts +0 -5
  120. package/build/utils/encodingToBitDepth.d.ts.map +0 -1
  121. package/build/utils/encodingToBitDepth.js +0 -13
  122. package/build/utils/encodingToBitDepth.js.map +0 -1
  123. package/build/utils/getWavFileInfo.d.ts +0 -26
  124. package/build/utils/getWavFileInfo.d.ts.map +0 -1
  125. package/build/utils/getWavFileInfo.js +0 -92
  126. package/build/utils/getWavFileInfo.js.map +0 -1
  127. package/build/utils/writeWavHeader.d.ts +0 -49
  128. package/build/utils/writeWavHeader.d.ts.map +0 -1
  129. package/build/utils/writeWavHeader.js +0 -91
  130. package/build/utils/writeWavHeader.js.map +0 -1
  131. package/build/workers/InlineFeaturesExtractor.web.d.ts +0 -2
  132. package/build/workers/InlineFeaturesExtractor.web.d.ts.map +0 -1
  133. package/build/workers/InlineFeaturesExtractor.web.js +0 -828
  134. package/build/workers/InlineFeaturesExtractor.web.js.map +0 -1
  135. package/build/workers/inlineAudioWebWorker.web.d.ts +0 -2
  136. package/build/workers/inlineAudioWebWorker.web.d.ts.map +0 -1
  137. package/build/workers/inlineAudioWebWorker.web.js +0 -157
  138. package/build/workers/inlineAudioWebWorker.web.js.map +0 -1
  139. package/expo-module.config.json +0 -9
  140. package/ios/AudioAnalysisData.swift +0 -74
  141. package/ios/AudioNotificationManager.swift +0 -135
  142. package/ios/AudioProcessingHelpers.swift +0 -743
  143. package/ios/AudioProcessor.swift +0 -1313
  144. package/ios/AudioStreamError.swift +0 -7
  145. package/ios/AudioStreamManager.swift +0 -1708
  146. package/ios/AudioStreamManagerDelegate.swift +0 -16
  147. package/ios/DataPoint.swift +0 -54
  148. package/ios/DecodingConfig.swift +0 -47
  149. package/ios/ExpoAudioStream.podspec +0 -27
  150. package/ios/ExpoAudioStreamModule.swift +0 -805
  151. package/ios/FFT.swift +0 -62
  152. package/ios/Features.swift +0 -95
  153. package/ios/Logger.swift +0 -7
  154. package/ios/NotificationExtension.swift +0 -15
  155. package/ios/RecordingResult.swift +0 -22
  156. package/ios/RecordingSettings.swift +0 -265
  157. package/ios/WaveformExtractor.swift +0 -105
  158. package/plugin/build/index.d.ts +0 -21
  159. package/plugin/build/index.js +0 -191
  160. package/plugin/src/index.ts +0 -278
  161. package/plugin/tsconfig.json +0 -10
  162. package/plugin/tsconfig.tsbuildinfo +0 -1
  163. package/src/AudioAnalysis/AudioAnalysis.types.ts +0 -202
  164. package/src/AudioAnalysis/extractAudioAnalysis.ts +0 -333
  165. package/src/AudioAnalysis/extractAudioData.ts +0 -6
  166. package/src/AudioAnalysis/extractMelSpectrogram.ts +0 -144
  167. package/src/AudioAnalysis/extractPreview.ts +0 -34
  168. package/src/AudioAnalysis/extractWaveform.ts +0 -22
  169. package/src/AudioRecorder.provider.tsx +0 -54
  170. package/src/ExpoAudioStream.native.ts +0 -6
  171. package/src/ExpoAudioStream.types.ts +0 -641
  172. package/src/ExpoAudioStream.web.ts +0 -359
  173. package/src/ExpoAudioStreamModule.ts +0 -967
  174. package/src/WebRecorder.web.ts +0 -580
  175. package/src/constants.ts +0 -18
  176. package/src/events.ts +0 -60
  177. package/src/trimAudio.ts +0 -90
  178. package/src/useAudioRecorder.tsx +0 -620
  179. package/src/utils/BlobFix.ts +0 -559
  180. package/src/utils/audioProcessing.ts +0 -205
  181. package/src/utils/concatenateBuffers.ts +0 -24
  182. package/src/utils/convertPCMToFloat32.ts +0 -170
  183. package/src/utils/encodingToBitDepth.ts +0 -18
  184. package/src/utils/getWavFileInfo.ts +0 -132
  185. package/src/utils/writeWavHeader.ts +0 -114
  186. package/src/workers/InlineFeaturesExtractor.web.tsx +0 -827
  187. package/src/workers/inlineAudioWebWorker.web.tsx +0 -156
@@ -1,2235 +0,0 @@
1
- // packages/expo-audio-stream/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt
2
- // packages/expo-audio-stream/android/src/main/java/net/siteed/audiostream/AudioProcessor.kt
3
- package net.siteed.audiostream
4
-
5
- import java.nio.ByteBuffer
6
- import java.nio.ByteOrder
7
- import kotlin.math.*
8
- import android.util.Log
9
- import java.io.File
10
- import java.util.concurrent.atomic.AtomicLong
11
- import kotlin.system.measureTimeMillis
12
- import android.media.MediaExtractor
13
- import android.media.MediaFormat
14
- import android.media.MediaCodec
15
- import java.io.FileInputStream
16
- import java.io.RandomAccessFile
17
- import java.util.zip.CRC32
18
-
19
/**
 * Options describing the PCM format audio should be decoded to.
 *
 * @property targetSampleRate Optional target sample rate; `null` when not set.
 * @property targetChannels Optional target number of channels; `null` when not set.
 * @property targetBitDepth Target PCM bit depth; defaults to 16-bit.
 * @property normalizeAudio Whether to normalize audio levels; off by default.
 */
data class DecodingConfig(
    val targetSampleRate: Int? = null,
    val targetChannels: Int? = null,
    val targetBitDepth: Int = 16,
    val normalizeAudio: Boolean = false
)
25
-
26
/**
 * Result of a spectrogram computation.
 *
 * Equality and hashing are content-based: the generated data-class versions
 * would compare the array properties by reference, so two instances holding
 * identical values would never be equal.
 *
 * @property spectrogram 2D array indexed as `[time, frequency]`.
 * @property timeStamps Time (in seconds) for each frame.
 * @property frequencies Frequencies (in Hz) for each mel bin.
 */
data class SpectrogramData(
    val spectrogram: Array<FloatArray>,
    val timeStamps: FloatArray,
    val frequencies: FloatArray
) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is SpectrogramData) return false
        return spectrogram.contentDeepEquals(other.spectrogram) &&
            timeStamps.contentEquals(other.timeStamps) &&
            frequencies.contentEquals(other.frequencies)
    }

    override fun hashCode(): Int {
        var result = spectrogram.contentDeepHashCode()
        result = 31 * result + timeStamps.contentHashCode()
        result = 31 * result + frequencies.contentHashCode()
        return result
    }
}
31
-
32
- class AudioProcessor(private val filesDir: File) {
33
companion object {
    private const val N_CHROMA = 12
    private const val N_FFT = 1024
    const val DCT_SQRT_DIVISOR = 2.0

    // Held on the companion object so the counter is maintained during pause/resume cycles.
    private val uniqueIdCounter = AtomicLong(0L)

    /** Sets the shared data-point id counter back to zero. */
    fun resetUniqueIdCounter() = uniqueIdCounter.set(0L)
}
44
-
45
/**
 * Raw PCM bytes together with the format needed to interpret them.
 *
 * Equality and hashing compare [data] by content; the generated data-class
 * versions would compare the byte array by reference.
 *
 * @property data PCM payload bytes.
 * @property sampleRate Sample rate in Hz.
 * @property bitDepth Bits per sample.
 * @property channels Number of channels.
 * @property durationMs Duration in milliseconds; defaults to 0.
 */
data class AudioData(
    val data: ByteArray,
    val sampleRate: Int,
    val bitDepth: Int,
    val channels: Int,
    val durationMs: Long = 0
) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is AudioData) return false
        return sampleRate == other.sampleRate &&
            bitDepth == other.bitDepth &&
            channels == other.channels &&
            durationMs == other.durationMs &&
            data.contentEquals(other.data)
    }

    override fun hashCode(): Int {
        var result = data.contentHashCode()
        result = 31 * result + sampleRate
        result = 31 * result + bitDepth
        result = 31 * result + channels
        result = 31 * result + durationMs.hashCode()
        return result
    }
}
46
-
47
// Running amplitude bounds: narrowed/widened by processAudioData for every segment
// and restored to these sentinels by resetCumulativeAmplitudeRange.
private var cumulativeMinAmplitude: Float = Float.MAX_VALUE
private var cumulativeMaxAmplitude: Float = Float.NEGATIVE_INFINITY
49
-
50
/**
 * Reads a WAV file and returns its raw PCM payload together with the format
 * fields parsed from the `fmt ` chunk.
 *
 * The path may carry a `file://` prefix. If nothing exists at the given path,
 * a file with the same name is looked up under [filesDir].
 *
 * @param filePath Path or `file://` URI of the WAV file.
 * @return The loaded [AudioData], or `null` when the file is missing, is not a
 *   RIFF/WAVE file, lacks a `fmt ` or `data` chunk, declares a bit depth other
 *   than 8/16/32, declares a non-positive channel count or sample rate, or
 *   cannot be read.
 */
private fun loadAudioFile(filePath: String): AudioData? {
    try {
        val fileUri = filePath.removePrefix("file://")
        Log.d("AudioProcessor", "Processing WAV file: $fileUri")

        val file = File(fileUri).takeIf { it.exists() }
            ?: File(filesDir, File(fileUri).name).takeIf { it.exists() }
            ?: run {
                Log.e("AudioProcessor", "File not found: $fileUri")
                return null
            }

        // `use` closes the handle on every exit path, including the early returns below.
        return RandomAccessFile(file, "r").use { raf ->
            val fileSize = raf.length()

            // Read RIFF header
            val riffHeader = ByteArray(4).apply { raf.readFully(this) }
            if (String(riffHeader) != "RIFF") {
                Log.e("AudioProcessor", "Invalid RIFF header")
                return null
            }

            // The RIFF tag is followed by a 4-byte chunk size; skip it so the next
            // read lands on the "WAVE" form type instead of on the size field.
            raf.readFully(ByteArray(4))

            // Read WAVE header
            val waveHeader = ByteArray(4).apply { raf.readFully(this) }
            if (String(waveHeader) != "WAVE") {
                Log.e("AudioProcessor", "Invalid WAVE header")
                return null
            }

            var fmtChunkFound = false
            var dataChunkFound = false
            var sampleRate = 0
            var channels = 0
            var bitDepth = 0
            var dataOffset = 0L
            var dataSize = 0L

            // Parse chunks; each chunk header is 8 bytes (id + little-endian size).
            chunks@ while (raf.filePointer <= fileSize - 8) {
                val chunkId = ByteArray(4).apply { raf.readFully(this) }.toString(Charsets.UTF_8)
                val chunkSizeBytes = ByteArray(4).apply { raf.readFully(this) }
                // Mask to treat the 32-bit size as unsigned.
                val chunkSize = ByteBuffer.wrap(chunkSizeBytes).order(ByteOrder.LITTLE_ENDIAN).int.toLong() and 0xFFFFFFFFL
                // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
                val paddedChunkSize = chunkSize + (chunkSize and 1L)

                Log.d("AudioProcessor", "Found chunk: $chunkId ($chunkSize bytes)")

                when (chunkId) {
                    "fmt " -> {
                        if (chunkSize < 16) {
                            Log.e("AudioProcessor", "Invalid fmt chunk size")
                            return null
                        }

                        val formatData = ByteArray(16)
                        raf.readFully(formatData)
                        val formatBuffer = ByteBuffer.wrap(formatData).order(ByteOrder.LITTLE_ENDIAN)

                        val audioFormat = formatBuffer.short
                        channels = formatBuffer.short.toInt() and 0xFFFF
                        sampleRate = formatBuffer.int
                        val byteRate = formatBuffer.int
                        val blockAlign = formatBuffer.short
                        bitDepth = formatBuffer.short.toInt() and 0xFFFF

                        Log.d("AudioProcessor", "Raw format data: ${formatData.joinToString(", ")}")
                        Log.d("AudioProcessor", "Format chunk: audioFormat=$audioFormat, channels=$channels, sampleRate=$sampleRate, bitDepth=$bitDepth, byteRate=$byteRate, blockAlign=$blockAlign")

                        if (bitDepth !in listOf(8, 16, 32)) {
                            Log.e("AudioProcessor", "Invalid bit depth: $bitDepth")
                            return null
                        }
                        // Both values are divisors in the duration calculation below.
                        if (channels <= 0 || sampleRate <= 0) {
                            Log.e("AudioProcessor", "Invalid format: channels=$channels, sampleRate=$sampleRate")
                            return null
                        }

                        // Skip any format extension bytes (and the pad byte), never past EOF.
                        val remainingFmtBytes = paddedChunkSize - 16
                        if (remainingFmtBytes > 0) {
                            raf.seek(minOf(raf.filePointer + remainingFmtBytes, fileSize))
                        }
                        fmtChunkFound = true
                    }
                    "data" -> {
                        dataOffset = raf.filePointer
                        dataSize = chunkSize
                        dataChunkFound = true
                        break@chunks
                    }
                    else -> {
                        // Skip unknown chunks, clamped to the end of the file.
                        raf.seek(minOf(raf.filePointer + paddedChunkSize, fileSize))
                    }
                }
            }

            if (!fmtChunkFound || !dataChunkFound) {
                Log.e("AudioProcessor", "Missing essential chunks (fmt=$fmtChunkFound, data=$dataChunkFound)")
                return null
            }

            // Calculate actual data size if it seems wrong
            if (dataSize <= 0 || dataSize > fileSize - dataOffset) {
                dataSize = fileSize - dataOffset
                Log.d("AudioProcessor", "Adjusted data size to: $dataSize")
            }
            // A single ByteArray cannot hold more than Int.MAX_VALUE bytes.
            if (dataSize > Int.MAX_VALUE) {
                Log.e("AudioProcessor", "WAV data too large to load: $dataSize bytes")
                return null
            }

            Log.d("AudioProcessor", "Reading PCM data: offset=$dataOffset, size=$dataSize")

            val wavData = ByteArray(dataSize.toInt())
            raf.seek(dataOffset)
            raf.readFully(wavData)

            // Each sample is bitDepth/8 bytes, and a frame holds one sample per channel.
            val bytesPerFrame = channels * (bitDepth / 8)
            val numFrames = wavData.size / bytesPerFrame
            val durationMs = (numFrames * 1000L) / sampleRate

            Log.d(Constants.TAG, "WAV duration calculation: size=${wavData.size}, bytesPerFrame=$bytesPerFrame, numFrames=$numFrames, sampleRate=$sampleRate, duration=${durationMs}ms")

            AudioData(
                data = wavData,
                sampleRate = sampleRate,
                channels = channels,
                bitDepth = bitDepth,
                durationMs = durationMs
            )
        }
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to load WAV file: ${e.message}")
        return null
    }
}
180
-
181
/**
 * Splits PCM audio into fixed-duration segments and extracts features for each.
 *
 * The bytes are converted to normalized floats according to `config.encoding`,
 * then cut into segments of `config.segmentDurationMs`. Every segment yields one
 * [DataPoint] carrying peak amplitude, RMS, dB, a silence flag and the features
 * enabled in `config.features`. The instance-level cumulative amplitude range is
 * updated as a side effect.
 *
 * @param data The audio data in bytes.
 * @param config The recording configuration.
 * @return AudioAnalysisData containing the extracted features; an empty result
 *   (no data points) when [data] is empty.
 * @throws IllegalArgumentException if `config.encoding` is not one of
 *   `pcm_8bit`, `pcm_16bit` or `pcm_32bit`.
 */
fun processAudioData(data: ByteArray, config: RecordingConfig): AudioAnalysisData {
    if (data.isEmpty()) {
        Log.e("AudioProcessor", "Received empty audio data")
        return AudioAnalysisData(
            segmentDurationMs = config.segmentDurationMs,
            durationMs = 0,
            bitDepth = 16,
            numberOfChannels = config.channels,
            sampleRate = config.sampleRate,
            samples = 0,
            dataPoints = emptyList(),
            amplitudeRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
            rmsRange = AudioAnalysisData.AmplitudeRange(0f, 0f),
            extractionTimeMs = 0f,
        )
    }

    val sampleRate = config.sampleRate.toFloat()
    val bitDepth = when (config.encoding) {
        "pcm_8bit" -> 8
        "pcm_16bit" -> 16
        "pcm_32bit" -> 32
        else -> throw IllegalArgumentException("Unsupported encoding: ${config.encoding}")
    }
    val channelData = convertToFloatArray(data, bitDepth)
    val featureOptions = config.features

    val totalSamples = channelData.size
    // Clamp to at least one sample: a zero-length segment would make the point
    // count overflow to Int.MAX_VALUE and the loop below spin over empty slices.
    val samplesPerSegment = ((config.segmentDurationMs / 1000.0) * sampleRate).toInt().coerceAtLeast(1)
    val totalPoints = ceil(totalSamples.toDouble() / samplesPerSegment).toInt()

    Log.d("AudioProcessor", "Extracting waveform totalSize=${data.size} with $totalSamples samples --> $totalPoints points")
    Log.d("AudioProcessor", "segmentDuration: ${config.segmentDurationMs}ms, samplesPerSegment: $samplesPerSegment")

    val samplesPerPoint = ceil(channelData.size / totalPoints.toDouble()).toInt()
    Log.d("AudioProcessor", "Extracting waveform with samplesPerPoints=$samplesPerPoint")

    val dataPoints = mutableListOf<DataPoint>()
    var minAmplitude = Float.MAX_VALUE
    var maxAmplitude = Float.NEGATIVE_INFINITY
    var minRms = Float.MAX_VALUE
    var maxRms = Float.NEGATIVE_INFINITY
    // Total duration in milliseconds based on sample rate and total samples.
    val durationMs = (totalSamples.toFloat() / sampleRate * 1000).toInt()

    // Measure the time taken for audio processing
    val extractionTimeMs = measureTimeMillis {
        for (i in 0 until totalPoints) {
            val start = i * samplesPerSegment
            val end = min(start + samplesPerSegment, totalSamples)
            val segmentData = channelData.copyOfRange(start, end)

            var sumSquares = 0f
            var zeroCrossings = 0
            var prevValue = 0f
            var localMinAmplitude = Float.MAX_VALUE
            // Float.MIN_VALUE is the smallest *positive* float, so using it here made
            // an all-zero segment report a peak of 1.4e-45 instead of 0.
            var localMaxAmplitude = Float.NEGATIVE_INFINITY

            for (value in segmentData) {
                sumSquares += value * value
                // A sign change between consecutive non-zero samples counts as a crossing.
                if (prevValue != 0f && value * prevValue < 0) zeroCrossings += 1
                prevValue = value

                val absValue = abs(value)
                localMinAmplitude = min(localMinAmplitude, absValue)
                localMaxAmplitude = max(localMaxAmplitude, absValue)
            }

            val features = computeFeatures(
                segmentData = segmentData,
                sampleRate = sampleRate,
                sumSquares = sumSquares,
                zeroCrossings = zeroCrossings,
                segmentLength = segmentData.size,
                featureOptions = featureOptions,
                minAmplitude = localMinAmplitude,
                maxAmplitude = localMaxAmplitude
            )
            val rms = features.rms
            val silent = rms < 0.01
            val dB = 20 * log10(rms.toDouble()).toFloat()
            minAmplitude = min(minAmplitude, localMinAmplitude)
            maxAmplitude = max(maxAmplitude, localMaxAmplitude)
            minRms = min(minRms, rms)
            maxRms = max(maxRms, rms)

            // Byte offsets of this segment, scaled by sample width and channel count.
            val bytesPerSample = bitDepth / 8
            val startPosition = start * bytesPerSample * config.channels
            val endPosition = end * bytesPerSample * config.channels

            // Update cumulative amplitude range
            cumulativeMinAmplitude = min(cumulativeMinAmplitude, localMinAmplitude)
            cumulativeMaxAmplitude = max(cumulativeMaxAmplitude, localMaxAmplitude)

            val dataPoint = DataPoint(
                id = uniqueIdCounter.getAndIncrement(),
                amplitude = localMaxAmplitude, // Always use peak amplitude
                rms = rms, // Always include RMS
                dB = dB,
                silent = silent,
                features = features,
                speech = SpeechFeatures(isActive = !silent),
                startTime = startPosition / (sampleRate * bytesPerSample * config.channels),
                endTime = endPosition / (sampleRate * bytesPerSample * config.channels),
                startPosition = startPosition,
                endPosition = endPosition,
                samples = segmentData.size
            )

            dataPoints.add(dataPoint)
        }
    }

    return AudioAnalysisData(
        segmentDurationMs = config.segmentDurationMs,
        durationMs = durationMs,
        bitDepth = bitDepth,
        numberOfChannels = config.channels,
        sampleRate = config.sampleRate,
        samples = totalSamples,
        dataPoints = dataPoints,
        amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
        rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
        extractionTimeMs = extractionTimeMs.toFloat()
    )
}
315
-
316
/**
 * Restores the cumulative amplitude bounds to the sentinels they are declared
 * with, so the next processed segment re-establishes the range from scratch.
 */
fun resetCumulativeAmplitudeRange() {
    cumulativeMinAmplitude = Float.MAX_VALUE
    // Float.MIN_VALUE is the smallest *positive* float, not the lowest value;
    // NEGATIVE_INFINITY matches the property's initial value and is a true lower bound.
    cumulativeMaxAmplitude = Float.NEGATIVE_INFINITY
}
320
-
321
/**
 * Converts little-endian PCM bytes to a float array.
 *
 * - 8-bit samples are read as unsigned bytes with 128 as the zero level.
 * - 16-bit samples are signed shorts divided by 32768.
 * - 32-bit samples are signed ints divided by `Int.MAX_VALUE`.
 *
 * Trailing bytes that do not form a whole 16/32-bit sample are ignored.
 *
 * @param data The audio data in bytes.
 * @param bitDepth The bit depth of the audio data (8, 16 or 32).
 * @return The converted float array, one element per sample.
 * @throws IllegalArgumentException if [bitDepth] is not 8, 16 or 32.
 */
private fun convertToFloatArray(data: ByteArray, bitDepth: Int): FloatArray {
    return when (bitDepth) {
        16 -> {
            val samples = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
            FloatArray(samples.remaining()) { index -> samples.get(index) / 32768.0f }
        }
        // Kotlin bytes are signed; mask to recover the unsigned 0..255 sample before
        // removing the 128 offset, otherwise silence (0x80) would map to -2.0.
        8 -> FloatArray(data.size) { index -> ((data[index].toInt() and 0xFF) - 128) / 128.0f }
        32 -> {
            val samples = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer()
            val scale = Int.MAX_VALUE.toFloat()
            FloatArray(samples.remaining()) { index -> samples.get(index) / scale }
        }
        else -> throw IllegalArgumentException("Unsupported bit depth: $bitDepth")
    }
}
345
-
346
/**
 * Builds the [Features] record for one audio segment.
 *
 * RMS and the min/max amplitudes are always filled in. Every other feature is
 * computed only when its key in [featureOptions] maps to `true`; otherwise it
 * keeps a neutral default (0, an empty list, or `null` for the CRC).
 *
 * @param segmentData The segment samples.
 * @param sampleRate The sample rate of the audio data.
 * @param minAmplitude The minimum amplitude of the segment.
 * @param maxAmplitude The maximum amplitude of the segment.
 * @param sumSquares The sum of squared samples.
 * @param zeroCrossings The number of zero crossings.
 * @param segmentLength The number of samples in the segment.
 * @param featureOptions Flags selecting which features to compute.
 * @return The computed features.
 */
private fun computeFeatures(
    segmentData: FloatArray,
    sampleRate: Float,
    minAmplitude: Float,
    maxAmplitude: Float,
    sumSquares: Float,
    zeroCrossings: Int,
    segmentLength: Int,
    featureOptions: Map<String, Boolean>
): Features {
    // True only when the flag is present and set.
    fun wants(feature: String): Boolean = featureOptions[feature] == true

    val rms = sqrt(sumSquares / segmentLength)
    val energy = if (wants("energy")) sumSquares else 0f
    val zcr = if (wants("zcr")) zeroCrossings / segmentLength.toFloat() else 0f

    val mfcc = try {
        if (wants("mfcc")) computeMFCC(segmentData, sampleRate) else emptyList()
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to extract MFCC: ${e.message}", e)
        emptyList()
    }

    val melSpectrogram = try {
        if (wants("melSpectrogram")) computeMelSpectrogram(segmentData, sampleRate) else emptyList()
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to compute mel spectrogram: ${e.message}", e)
        emptyList()
    }

    val chroma = try {
        if (wants("chromagram")) computeChroma(segmentData, sampleRate) else emptyList()
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to compute chroma: ${e.message}", e)
        emptyList()
    }

    // The four spectral descriptors share one extraction pass.
    val spectralRequested = wants("spectralCentroid") ||
        wants("spectralFlatness") ||
        wants("spectralRollOff") ||
        wants("spectralBandwidth")
    val spectralFeatures = when {
        spectralRequested -> extractSpectralFeatures(segmentData, sampleRate)
        else -> SpectralFeatures()
    }

    val tempo = try {
        if (wants("tempo")) extractTempo(segmentData, sampleRate) else 0f
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to extract tempo: ${e.message}", e)
        0f
    }

    val hnr = try {
        if (wants("hnr")) extractHNR(segmentData) else 0f
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to extract HNR: ${e.message}", e)
        0f
    }

    val spectralContrast = try {
        if (wants("spectralContrast")) computeSpectralContrast(segmentData, sampleRate) else emptyList()
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to compute spectral contrast: ${e.message}", e)
        emptyList()
    }

    val tonnetz = try {
        if (wants("tonnetz")) computeTonnetz(segmentData, sampleRate) else emptyList()
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to compute tonnetz: ${e.message}", e)
        emptyList()
    }

    val pitch = if (wants("pitch")) estimatePitch(segmentData, sampleRate) else 0.0f

    // Checksum over the little-endian float encoding of the segment samples.
    val crc32Value = if (wants("crc32")) {
        val raw = ByteBuffer.allocate(segmentData.size * 4).order(ByteOrder.LITTLE_ENDIAN)
        for (sample in segmentData) {
            raw.putFloat(sample)
        }
        CRC32().apply { update(raw.array()) }.value
    } else null

    return Features(
        energy = energy,
        mfcc = mfcc,
        rms = rms,
        minAmplitude = minAmplitude,
        maxAmplitude = maxAmplitude,
        zcr = zcr,
        spectralCentroid = spectralFeatures.centroid,
        spectralFlatness = spectralFeatures.flatness,
        spectralRollOff = spectralFeatures.rollOff,
        spectralBandwidth = spectralFeatures.bandwidth,
        tempo = tempo,
        hnr = hnr,
        melSpectrogram = melSpectrogram,
        chromagram = chroma,
        spectralContrast = spectralContrast,
        tonnetz = tonnetz,
        pitch = pitch,
        crc32 = crc32Value
    )
}
465
-
466
/**
 * Estimates tempo (beats per minute) from the spacing of onset peaks.
 *
 * The segment is cut into 2048-sample frames with a 512-sample hop. The
 * positive spectral flux between consecutive frames forms an onset envelope;
 * the mean distance between its local maxima is converted to BPM.
 *
 * @param segmentData Samples to analyse.
 * @param sampleRate Sample rate of [segmentData] in Hz.
 * @return Estimated tempo in BPM, or 120 when fewer than two onset peaks are found.
 */
private fun extractTempo(segmentData: FloatArray, sampleRate: Float): Float {
    val hopLength = 512
    val frameLength = 2048
    val binCount = frameLength / 2

    // One transform instance serves every frame.
    // NOTE(review): assumes FFT.realForward keeps no per-call state — confirm against FFT.kt.
    val fft = FFT(frameLength)

    // Onset strength signal computed as spectral flux.
    val onsetEnvelope = mutableListOf<Float>()
    var previousSpectrum = FloatArray(binCount)

    // `<=` so the last complete frame (ending exactly at the segment end) is included.
    var frameStart = 0
    while (frameStart + frameLength <= segmentData.size) {
        val fftData = segmentData.copyOfRange(frameStart, frameStart + frameLength)
        fft.realForward(fftData)

        // Magnitude spectrum from interleaved (re, im) pairs.
        val magnitudes = FloatArray(binCount) { bin ->
            val re = fftData[2 * bin]
            val im = fftData[2 * bin + 1]
            sqrt(re * re + im * im)
        }

        // Spectral flux: sum of positive magnitude increases since the previous frame.
        var flux = 0f
        for (bin in magnitudes.indices) {
            flux += maxOf(magnitudes[bin] - previousSpectrum[bin], 0f)
        }
        onsetEnvelope.add(flux)
        previousSpectrum = magnitudes

        frameStart += hopLength
    }

    // Local maxima of the onset envelope.
    val peaks = mutableListOf<Int>()
    for (i in 1 until onsetEnvelope.size - 1) {
        if (onsetEnvelope[i] > onsetEnvelope[i - 1] && onsetEnvelope[i] > onsetEnvelope[i + 1]) {
            peaks.add(i)
        }
    }

    // Convert the mean peak spacing (in hops) to beats per minute.
    return if (peaks.size > 1) {
        val averageInterval = peaks.zipWithNext { a, b -> b - a }.average().toFloat()
        60f * sampleRate / (hopLength * averageInterval)
    } else {
        120f // Default tempo if no clear peaks found
    }
}
515
-
516
/**
 * Computes spectral centroid, flatness, roll-off and bandwidth for a segment.
 *
 * Only the first `N_FFT` samples are analysed; shorter input is zero-padded to
 * `N_FFT` after Hann windowing.
 *
 * @param samples Samples to analyse.
 * @param sampleRate Sample rate of [samples] in Hz.
 * @return The four spectral descriptors bundled in a [SpectralFeatures].
 */
private fun extractSpectralFeatures(samples: FloatArray, sampleRate: Float): SpectralFeatures {
    // The FFT works on a fixed-size buffer, so longer input is truncated to N_FFT first.
    val windowed = applyHannWindow(if (samples.size > N_FFT) samples.copyOf(N_FFT) else samples)

    // A fresh FloatArray is zero-filled, which provides the zero padding.
    val paddedSamples = FloatArray(N_FFT)
    windowed.copyInto(paddedSamples, 0, 0, minOf(windowed.size, N_FFT))

    FFT(N_FFT).realForward(paddedSamples)

    // Magnitude spectrum for bins 0..N_FFT/2 (DC through Nyquist).
    val magnitudeSpectrum = FloatArray(N_FFT / 2 + 1)
    // Slot 1 of the transformed buffer is read as the Nyquist term below, so it
    // must not also be folded into the DC bin as an imaginary part.
    // NOTE(review): relies on FFT.realForward using that packed layout — confirm against FFT.kt.
    magnitudeSpectrum[0] = abs(paddedSamples[0])
    for (i in 1 until N_FFT / 2) {
        val re = paddedSamples[2 * i]
        val im = paddedSamples[2 * i + 1]
        magnitudeSpectrum[i] = sqrt(re * re + im * im)
    }
    magnitudeSpectrum[N_FFT / 2] = abs(paddedSamples[1])

    // Power spectrum feeds the flatness measure.
    val powerSpectrum = FloatArray(magnitudeSpectrum.size) { magnitudeSpectrum[it] * magnitudeSpectrum[it] }

    val centroid = computeSpectralCentroid(magnitudeSpectrum, sampleRate)
    val flatness = computeSpectralFlatness(powerSpectrum)
    val rollOff = computeSpectralRollOff(magnitudeSpectrum, sampleRate)
    val bandwidth = computeSpectralBandwidth(magnitudeSpectrum, sampleRate, centroid)

    return SpectralFeatures(
        centroid = centroid,
        flatness = flatness,
        rollOff = rollOff,
        bandwidth = bandwidth
    )
}
565
-
566
/**
 * Magnitude-weighted mean frequency of a spectrum.
 *
 * Bin `k` is mapped to `k * (sampleRate / N_FFT)` Hz.
 *
 * @param magnitudeSpectrum Magnitude per frequency bin.
 * @param sampleRate Sample rate in Hz.
 * @return The centroid in Hz, or 0 when the magnitudes sum to exactly zero.
 */
private fun computeSpectralCentroid(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
    val totalMagnitude = magnitudeSpectrum.sum()
    if (totalMagnitude == 0f) return 0f

    var weightedFrequencies = 0f
    for (bin in magnitudeSpectrum.indices) {
        weightedFrequencies += bin * (sampleRate / N_FFT) * magnitudeSpectrum[bin]
    }
    return weightedFrequencies / totalMagnitude
}
576
-
577
/**
 * Spectral flatness: the geometric mean of the power spectrum divided by its arithmetic mean.
 *
 * @param powerSpectrum Power per frequency bin.
 * @return The flatness ratio, or 0 when the arithmetic mean is exactly zero.
 */
private fun computeSpectralFlatness(powerSpectrum: FloatArray): Float {
    val binCount = powerSpectrum.size

    // Geometric mean via the mean of logs; the epsilon keeps ln() away from zero.
    val logSum = powerSpectrum.fold(0.0f) { acc, power -> acc + ln(power + 1e-10f) }
    val geometricMean = exp(logSum / binCount)

    val arithmeticMean = powerSpectrum.sum() / binCount
    if (arithmeticMean == 0f) return 0f
    return geometricMean / arithmeticMean
}
590
-
591
/**
 * Frequency below which 85% of the summed spectrum magnitude lies.
 *
 * @param magnitudeSpectrum Magnitude per frequency bin.
 * @param sampleRate Sample rate in Hz; bin `k` maps to `k * (sampleRate / N_FFT)` Hz.
 * @return The roll-off frequency in Hz, or 0 when no bin reaches the threshold.
 */
private fun computeSpectralRollOff(magnitudeSpectrum: FloatArray, sampleRate: Float): Float {
    val threshold = magnitudeSpectrum.sum() * 0.85f
    var runningTotal = 0f

    magnitudeSpectrum.forEachIndexed { bin, magnitude ->
        runningTotal += magnitude
        // First bin at which the running total crosses the threshold wins.
        if (runningTotal >= threshold) return bin * (sampleRate / N_FFT)
    }

    return 0f
}
605
-
606
/**
 * Magnitude-weighted standard deviation of frequency around [centroid].
 *
 * Bin `k` is mapped to `k * sampleRate / (2 * magnitudeSpectrum.size)` Hz; the original
 * comment states this mirrors the iOS implementation.
 *
 * @param magnitudeSpectrum Magnitude per frequency bin.
 * @param sampleRate Sample rate in Hz.
 * @param centroid Spectral centroid in Hz to measure the spread around.
 * @return The bandwidth in Hz, or 0 when the magnitudes sum to exactly zero.
 */
private fun computeSpectralBandwidth(
    magnitudeSpectrum: FloatArray,
    sampleRate: Float,
    centroid: Float
): Float {
    val totalMagnitude = magnitudeSpectrum.sum()
    if (totalMagnitude == 0f) return 0f

    var weightedSpread = 0f
    for ((bin, magnitude) in magnitudeSpectrum.withIndex()) {
        val frequency = bin * sampleRate / (2 * magnitudeSpectrum.size)
        weightedSpread += magnitude * (frequency - centroid).pow(2)
    }

    return sqrt(weightedSpread / totalMagnitude)
}
622
-
623
/**
 * Spectral descriptors of a single analysis frame. Every field defaults to 0.
 *
 * @property centroid Spectral centroid.
 * @property flatness Spectral flatness.
 * @property rollOff Spectral roll-off.
 * @property bandwidth Spectral bandwidth.
 */
private data class SpectralFeatures(
    val centroid: Float = 0f,
    val flatness: Float = 0f,
    val rollOff: Float = 0f,
    val bandwidth: Float = 0f,
)
629
-
630
/**
 * Resets the per-segment accumulators to their initial state.
 *
 * Each updater is invoked exactly once with the reset value, then [segmentData] is cleared.
 *
 * @param sumSquaresUpdater Receives `0f`.
 * @param zeroCrossingsUpdater Receives `0`.
 * @param localMinAmplitudeUpdater Receives [Float.MAX_VALUE], so any sample lowers the minimum.
 * @param localMaxAmplitudeUpdater Receives `-Float.MAX_VALUE`, so any sample raises the maximum.
 * @param segmentData The segment sample list to empty.
 */
private fun resetSegmentData(
    sumSquaresUpdater: (Float) -> Unit,
    zeroCrossingsUpdater: (Int) -> Unit,
    localMinAmplitudeUpdater: (Float) -> Unit,
    localMaxAmplitudeUpdater: (Float) -> Unit,
    segmentData: MutableList<Float>
) {
    sumSquaresUpdater(0f)
    zeroCrossingsUpdater(0)
    localMinAmplitudeUpdater(Float.MAX_VALUE)
    // Float.MIN_VALUE is the smallest *positive* float, not the most negative one, so it is
    // not a valid seed for a running maximum (an all-zero segment would report ~1.4e-45).
    localMaxAmplitudeUpdater(-Float.MAX_VALUE)
    segmentData.clear()
}
651
-
652
/**
 * Computes 13 MFCCs (Mel-Frequency Cepstral Coefficients) from a frame of audio.
 *
 * The power spectrum from `prepareFFT` is projected onto 26 mel filters, each filter energy
 * is log-compressed (floored at 1e-10), and a cosine transform of the log energies yields
 * the coefficients.
 *
 * @param samples Time-domain samples of the frame.
 * @param sampleRate Sample rate in Hz.
 * @return The 13 coefficients, or an empty list when a filter's length differs from the
 *         power spectrum's.
 */
private fun computeMFCC(samples: FloatArray, sampleRate: Float): List<Float> {
    val filterCount = 26
    val coefficientCount = 13

    val (powerSpectrum, _) = prepareFFT(samples, sampleRate)
    val melFilters = computeMelFilterbank(
        numFilters = filterCount,
        powerSpectrumSize = powerSpectrum.size,
        sampleRate = sampleRate
    )

    val hasSizeMismatch = melFilters.any { it.size != powerSpectrum.size }
    if (hasSizeMismatch) {
        Log.e("AudioProcessor", "Mel filter size (${melFilters[0].size}) does not match power spectrum size (${powerSpectrum.size})")
        return emptyList()
    }

    // Log energy captured by each mel filter.
    val melEnergies = FloatArray(filterCount)
    for (filter in 0 until filterCount) {
        var energy = 0f
        for (bin in powerSpectrum.indices) {
            energy += powerSpectrum[bin] * melFilters[filter][bin]
        }
        melEnergies[filter] = ln(maxOf(energy, 1e-10f))
    }

    // Cosine transform of the log energies, scaled by sqrt(2 / filterCount).
    val scale = sqrt(2f / filterCount)
    val coefficients = FloatArray(coefficientCount)
    for (k in 0 until coefficientCount) {
        var acc = 0f
        for (m in melEnergies.indices) {
            acc += melEnergies[m] * cos(PI * k * (2 * m + 1) / (2 * filterCount)).toFloat()
        }
        coefficients[k] = acc * scale
    }

    return coefficients.toList()
}
686
-
687
/**
 * Builds a bank of triangular filters whose edges are equally spaced on the mel scale
 * between 0 Hz and `sampleRate / 2`.
 *
 * NOTE(review): edge frequencies are converted to bins with `hz * powerSpectrumSize / sampleRate`,
 * which places `sampleRate / 2` at bin `powerSpectrumSize / 2`. If `powerSpectrumSize` is a
 * half-spectrum length, the filters cover only its lower half — confirm against `prepareFFT`.
 *
 * @param numFilters Number of mel filters (rows of the result).
 * @param powerSpectrumSize Length of the power spectrum (columns of the result).
 * @param sampleRate Sample rate of the audio in Hz.
 * @return `numFilters` filters, each of length `powerSpectrumSize`.
 */
private fun computeMelFilterbank(numFilters: Int, powerSpectrumSize: Int, sampleRate: Float): Array<FloatArray> {
    val melMin = hzToMel(0f)
    val melMax = hzToMel(sampleRate / 2)

    // numFilters + 2 edge points, evenly spaced in mel.
    val melStep = (melMax - melMin) / (numFilters + 1)
    val melPoints = FloatArray(numFilters + 2) { melMin + it * melStep }

    // Mel -> Hz -> bin index, clamped to the last valid bin.
    val hzPoints = melPoints.map { melToHz(it) }
    val bins = hzPoints.map { hz ->
        minOf((hz * powerSpectrumSize / sampleRate).roundToInt(), powerSpectrumSize - 1)
    }

    val filterbank = Array(numFilters) { FloatArray(powerSpectrumSize) }

    for (filter in 0 until numFilters) {
        val startBin = bins[filter]
        val centerBin = bins[filter + 1]
        val endBin = bins[filter + 2]
        val row = filterbank[filter]

        // Rising edge; the range is empty when centerBin <= startBin.
        for (bin in startBin until centerBin) {
            row[bin] = (bin - startBin).toFloat() / (centerBin - startBin).toFloat()
        }
        // Falling edge; the range is empty when endBin <= centerBin.
        for (bin in centerBin until endBin) {
            row[bin] = (endBin - bin).toFloat() / (endBin - centerBin).toFloat()
        }
    }

    return filterbank
}
742
-
743
/**
 * Computes a cosine transform of the given log energies.
 *
 * Coefficient `i` is `sum_j logEnergies[j] * cos(PI * i * (j + 0.5) / n)` divided by
 * `sqrt(DCT_SQRT_DIVISOR * n)`, where `n` is the number of log energies.
 *
 * @param logEnergies The log energies.
 * @param numCoefficients The number of coefficients to compute.
 * @return The coefficients, in order.
 */
private fun computeDCT(logEnergies: List<Float>, numCoefficients: Int): List<Float> {
    val n = logEnergies.size

    return FloatArray(numCoefficients) { k ->
        var acc = 0.0
        logEnergies.forEachIndexed { j, energy ->
            acc += energy * cos(PI * k * (j + 0.5) / n)
        }
        (acc / sqrt(DCT_SQRT_DIVISOR * n)).toFloat()
    }.toList()
}
763
-
764
/**
 * Estimates the HNR (Harmonics-to-Noise Ratio) of a segment from its autocorrelation.
 *
 * The first autocorrelation peak whose prominence is at least 10% of the autocorrelation
 * maximum is taken as the harmonic energy; the lag-0 value minus that peak is taken as the
 * noise energy.
 *
 * @param segmentData The segment samples.
 * @return `10 * log10(harmonic / noise)`, or 0 when no suitable peak exists or either
 *         energy is not strictly positive (the ratio would be NaN or infinite).
 */
private fun extractHNR(segmentData: FloatArray): Float {
    val frameSize = segmentData.size

    // Autocorrelation for every lag in [0, frameSize).
    val autocorrelation = FloatArray(frameSize) { lag ->
        var sum = 0f
        for (j in 0 until frameSize - lag) {
            sum += segmentData[j] * segmentData[j + lag]
        }
        sum
    }

    val maxAutocorrelation = autocorrelation.maxOrNull() ?: 0f
    val peaks = findPeaks(autocorrelation, minProminence = 0.1f * maxAutocorrelation)

    val firstPeakIndex = peaks.firstOrNull { it > 0 } ?: return 0f
    val harmonicEnergy = autocorrelation[firstPeakIndex]
    val noiseEnergy = autocorrelation[0] - harmonicEnergy

    // A peak can be non-positive; log10 of a non-positive ratio is NaN or -Infinity,
    // so both energies must be strictly positive before taking the ratio.
    return if (harmonicEnergy > 0f && noiseEnergy > 0f) {
        10 * log10(harmonicEnergy / noiseEnergy)
    } else {
        0f
    }
}
797
-
798
/**
 * Returns the indices of interior local maxima of [data].
 *
 * An index qualifies when its value is strictly greater than both neighbours and exceeds
 * the larger neighbour by at least [minProminence]. The first and last elements are never
 * reported.
 *
 * @param data Values to scan.
 * @param minProminence Minimum height above the larger neighbour.
 * @return Qualifying indices in ascending order.
 */
private fun findPeaks(data: FloatArray, minProminence: Float): List<Int> =
    (1 until data.size - 1).filter { index ->
        val left = data[index - 1]
        val centre = data[index]
        val right = data[index + 1]
        centre > left && centre > right && centre - maxOf(left, right) >= minProminence
    }
810
-
811
/**
 * Loads an audio file as PCM, trying `MediaExtractor` first and a WAV parser second.
 *
 * The file is looked up at the given path (with any `file://` prefix removed) and, failing
 * that, by file name inside `filesDir`. When [decodingConfig] is given, the decoded PCM is
 * passed through [processAudio] with the config's target sample rate, target channels and
 * normalisation flag.
 *
 * @param fileUri Path or `file://` URI of the audio file.
 * @param decodingConfig Optional target format; `null` returns the data as decoded.
 * @return The loaded audio, or `null` when the file is missing or neither loader succeeds.
 */
fun loadAudioFromAnyFormat(fileUri: String, decodingConfig: DecodingConfig? = null): AudioData? {
    val cleanUri = fileUri.removePrefix("file://")
    // Try the path as given, then the same file name under filesDir.
    val file = File(cleanUri).takeIf { it.exists() } ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            Log.e("AudioProcessor", "File not found in any location: $cleanUri")
            return null
        }

    // First try MediaExtractor
    val extractor = MediaExtractor()
    try {
        Log.d("AudioProcessor", "Attempting MediaExtractor with path: ${file.absolutePath}")
        extractor.setDataSource(file.absolutePath)

        // Find the first audio track
        val audioTrackIndex = (0 until extractor.trackCount)
            .find { extractor.getTrackFormat(it).getString(MediaFormat.KEY_MIME)?.startsWith("audio/") == true }

        // When no audio track is found, control falls through to the WAV fallback below.
        if (audioTrackIndex != null) {
            val format = extractor.getTrackFormat(audioTrackIndex)
            extractor.selectTrack(audioTrackIndex)

            // Get original audio properties
            val originalSampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE)
            val originalChannels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT)
            // If the duration cannot be read as a long, retry as a string, defaulting to -1.
            val totalDurationUs = try {
                format.getLong(MediaFormat.KEY_DURATION)
            } catch (e: Exception) {
                (format.getString(MediaFormat.KEY_DURATION) ?: "-1").toLong()
            }
            Log.d("AudioProcessor", "Raw duration from format: ${totalDurationUs}us")

            val totalDurationMs = totalDurationUs / 1000
            Log.d("AudioProcessor", "Final duration: ${totalDurationMs}ms")

            // Process using MediaExtractor
            val pcmData = decodeAudioToPCM(extractor, format)
            val processedData = if (decodingConfig != null) {
                processAudio(
                    pcmData,
                    originalSampleRate,
                    decodingConfig.targetSampleRate,
                    originalChannels,
                    decodingConfig.targetChannels,
                    decodingConfig.normalizeAudio
                )
            } else {
                pcmData
            }

            // NOTE(review): processAudio receives no bit-depth argument, so when targetBitDepth
            // is set the data is presumably still in the decoder's sample format while the
            // returned bitDepth reports the target — confirm.
            return AudioData(
                data = processedData,
                sampleRate = decodingConfig?.targetSampleRate ?: originalSampleRate,
                bitDepth = decodingConfig?.targetBitDepth ?: 16,
                channels = decodingConfig?.targetChannels ?: originalChannels,
                durationMs = totalDurationMs // Pass through the duration
            )
        }
    } catch (e: Exception) {
        // Any failure in the extractor path is only logged; the WAV fallback runs next.
        Log.d("AudioProcessor", "MediaExtractor failed, attempting WAV parser: ${e.message}")
    } finally {
        extractor.release()
    }

    // If MediaExtractor failed and file is WAV, try WAV parser
    if (file.name.lowercase().endsWith(".wav")) {
        Log.d("AudioProcessor", "Falling back to WAV parser")
        return loadAudioFile(file.absolutePath)?.let { wavData ->
            if (decodingConfig != null) {
                val processedData = processAudio(
                    wavData.data,
                    wavData.sampleRate,
                    decodingConfig.targetSampleRate,
                    wavData.channels,
                    decodingConfig.targetChannels,
                    decodingConfig.normalizeAudio
                )
                // NOTE(review): as above, bitDepth reports the target although no bit-depth
                // conversion is visible on this path — confirm.
                AudioData(
                    data = processedData,
                    sampleRate = decodingConfig.targetSampleRate ?: wavData.sampleRate,
                    bitDepth = decodingConfig.targetBitDepth,
                    channels = decodingConfig.targetChannels ?: wavData.channels,
                    durationMs = wavData.durationMs // Pass through the duration
                )
            } else {
                wavData
            }
        }
    }

    Log.e("AudioProcessor", "Failed to process audio file with both MediaExtractor and WAV parser")
    return null
}
904
-
905
/**
 * Decodes the extractor's selected track to raw PCM bytes with a synchronous `MediaCodec`.
 *
 * Input samples are fed until the extractor is exhausted, then the decoder is drained until
 * it signals end-of-stream on its output, so trailing frames still buffered inside the
 * codec are not lost. The codec is always released, including on failure.
 *
 * @param extractor Extractor positioned on an already-selected track.
 * @param format Format of that track; its MIME type selects the decoder.
 * @return The decoded PCM bytes, in decoder output order.
 * @throws IllegalArgumentException if [format] carries no MIME type.
 */
private fun decodeAudioToPCM(extractor: MediaExtractor, format: MediaFormat): ByteArray {
    val mime = requireNotNull(format.getString(MediaFormat.KEY_MIME)) { "Track format has no MIME type" }
    val decoder = MediaCodec.createDecoderByType(mime)
    // Growable primitive byte sink; avoids boxing every decoded byte into a List<Byte>.
    val pcmStream = java.io.ByteArrayOutputStream()
    var started = false

    try {
        decoder.configure(format, null, null, 0)
        decoder.start()
        started = true

        val info = MediaCodec.BufferInfo()
        var inputDone = false
        var outputDone = false

        while (!outputDone) {
            if (!inputDone) {
                val inputBufferId = decoder.dequeueInputBuffer(10000)
                if (inputBufferId >= 0) {
                    val inputBuffer = decoder.getInputBuffer(inputBufferId)!!
                    val sampleSize = extractor.readSampleData(inputBuffer, 0)

                    if (sampleSize < 0) {
                        // No more input: tell the codec, but keep draining its output below.
                        decoder.queueInputBuffer(inputBufferId, 0, 0, 0, MediaCodec.BUFFER_FLAG_END_OF_STREAM)
                        inputDone = true
                    } else {
                        decoder.queueInputBuffer(inputBufferId, 0, sampleSize, extractor.sampleTime, 0)
                        extractor.advance()
                    }
                }
            }

            val outputBufferId = decoder.dequeueOutputBuffer(info, 10000)
            if (outputBufferId >= 0) {
                if (info.size > 0) {
                    val outputBuffer = decoder.getOutputBuffer(outputBufferId)!!
                    // Valid data occupies [offset, offset + size) of the codec buffer.
                    outputBuffer.position(info.offset)
                    outputBuffer.limit(info.offset + info.size)
                    val chunk = ByteArray(info.size)
                    outputBuffer.get(chunk)
                    pcmStream.write(chunk, 0, chunk.size)
                }
                decoder.releaseOutputBuffer(outputBufferId, false)

                if ((info.flags and MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) {
                    outputDone = true
                }
            }
        }
    } finally {
        if (started) {
            try {
                decoder.stop()
            } catch (e: IllegalStateException) {
                // The codec may already be in an error state; release() below still applies.
                Log.w("AudioProcessor", "Decoder stop failed: ${e.message}")
            }
        }
        decoder.release()
    }

    return pcmStream.toByteArray()
}
944
-
945
/**
 * Resamples 16-bit little-endian interleaved PCM with linear interpolation.
 *
 * Each channel is resampled independently and the channel layout is preserved, so the
 * result is still `originalChannels`-interleaved. The callers visible in this file rely on
 * that: they pass the result on as having `originalChannels` channels.
 *
 * @param pcmData 16-bit little-endian PCM; a trailing odd byte is ignored.
 * @param originalSampleRate Sample rate of [pcmData] in Hz; must be positive.
 * @param targetSampleRate Desired sample rate in Hz; must be positive.
 * @param originalChannels Number of interleaved channels; values below 1 are treated as 1.
 * @return The resampled PCM, 16-bit little-endian, with the same channel layout as the input.
 * @throws IllegalArgumentException if either sample rate is not positive.
 */
private fun resampleAudio(
    pcmData: ByteArray,
    originalSampleRate: Int,
    targetSampleRate: Int,
    originalChannels: Int
): ByteArray {
    require(originalSampleRate > 0 && targetSampleRate > 0) {
        "Sample rates must be positive, got $originalSampleRate -> $targetSampleRate"
    }
    val channels = originalChannels.coerceAtLeast(1)

    // Bytes -> 16-bit samples.
    val samples = ShortArray(pcmData.size / 2)
    ByteBuffer.wrap(pcmData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(samples)

    val sourceFrames = samples.size / channels
    val resampleRatio = targetSampleRate.toDouble() / originalSampleRate
    val targetFrames = (sourceFrames * resampleRatio).toInt()
    val resampled = ShortArray(targetFrames * channels)

    for (frame in 0 until targetFrames) {
        // Fractional position of this output frame on the source timeline.
        val sourcePosition = frame / resampleRatio
        val baseFrame = sourcePosition.toInt().coerceAtMost(sourceFrames - 1)
        val nextFrame = minOf(baseFrame + 1, sourceFrames - 1)
        val fraction = sourcePosition - baseFrame

        for (channel in 0 until channels) {
            val a = samples[baseFrame * channels + channel].toDouble()
            val b = samples[nextFrame * channels + channel].toDouble()
            // Linear interpolation between the two neighbouring source frames.
            resampled[frame * channels + channel] = (a + fraction * (b - a)).toInt().toShort()
        }
    }

    // 16-bit samples -> bytes.
    val resultBuffer = ByteBuffer.allocate(resampled.size * 2)
    resultBuffer.order(ByteOrder.LITTLE_ENDIAN)
    resultBuffer.asShortBuffer().put(resampled)
    return resultBuffer.array()
}
988
-
989
/**
 * Down-mixes interleaved multi-channel samples to mono by averaging each frame.
 *
 * The average uses integer division. Samples beyond the last complete frame are ignored.
 *
 * @param stereoData Interleaved samples (any channel count, despite the name).
 * @param channels Number of interleaved channels.
 * @return One averaged sample per frame.
 */
private fun convertToMono(stereoData: ShortArray, channels: Int): ShortArray {
    val frameCount = stereoData.size / channels

    return ShortArray(frameCount) { frame ->
        val base = frame * channels
        var total = 0
        for (offset in 0 until channels) {
            total += stereoData[base + offset]
        }
        (total / channels).toShort()
    }
}
1003
-
1004
/**
 * Linear interpolation between [a] and [b].
 *
 * @param fraction Position between the endpoints; 0 yields [a], 1 yields [b].
 */
private fun linearInterpolate(a: Double, b: Double, fraction: Double): Double =
    a + (b - a) * fraction
1007
-
1008
/**
 * Applies the optional post-decoding steps to PCM data, in this order: resampling,
 * channel conversion, normalisation.
 *
 * @param pcmData Source PCM bytes.
 * @param originalSampleRate Sample rate of [pcmData] in Hz.
 * @param targetSampleRate Desired sample rate; resampling is skipped when `null` or equal
 *        to [originalSampleRate].
 * @param originalChannels Channel count of [pcmData].
 * @param targetChannels Desired channel count; conversion is skipped when `null` or equal
 *        to [originalChannels].
 * @param normalize Whether to normalise the result.
 * @return The processed PCM, or [pcmData] itself when no step applies.
 */
fun processAudio(
    pcmData: ByteArray,
    originalSampleRate: Int,
    targetSampleRate: Int?,
    originalChannels: Int,
    targetChannels: Int?,
    normalize: Boolean
): ByteArray {
    val resampled = if (targetSampleRate != null && originalSampleRate != targetSampleRate) {
        resampleAudio(pcmData, originalSampleRate, targetSampleRate, originalChannels)
    } else {
        pcmData
    }

    val remixed = if (targetChannels != null && originalChannels != targetChannels) {
        convertChannels(resampled, originalChannels, targetChannels)
    } else {
        resampled
    }

    return if (normalize) normalizeAudio(remixed) else remixed
}
1035
-
1036
/**
 * Peak-normalises 16-bit little-endian PCM so the loudest sample reaches `Short.MAX_VALUE`.
 *
 * All-zero (or empty) input is returned unscaled. A trailing odd byte is dropped.
 *
 * @param pcmData 16-bit little-endian PCM bytes.
 * @return A new array with the scaled samples.
 */
private fun normalizeAudio(pcmData: ByteArray): ByteArray {
    val samples = ShortArray(pcmData.size / 2)
    ByteBuffer.wrap(pcmData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer().get(samples)

    // Largest absolute sample value; 0 for empty or silent input.
    val peak = samples.maxOfOrNull { abs(it.toInt()) } ?: 0

    if (peak > 0) {
        val gain = Short.MAX_VALUE.toFloat() / peak
        samples.indices.forEach { idx ->
            samples[idx] = (samples[idx] * gain).toInt().toShort()
        }
    }

    return ByteBuffer.allocate(samples.size * 2)
        .order(ByteOrder.LITTLE_ENDIAN)
        .also { it.asShortBuffer().put(samples) }
        .array()
}
1060
-
1061
/**
 * Converts 16-bit little-endian interleaved PCM from one channel count to another.
 *
 * Conversion is done frame by frame:
 * - to mono: the frame's channels are averaged (integer division);
 * - otherwise: output channel `c` copies input channel `min(c, originalChannels - 1)`, so
 *   mono input is duplicated, surplus input channels are dropped, and missing channels
 *   repeat the last input channel.
 *
 * The previous implementation looped once per output *byte* while copying `targetChannels`
 * samples per iteration, which overran the buffers on any non-trivial input.
 *
 * @param pcmData 16-bit little-endian interleaved PCM; an incomplete trailing frame is ignored.
 * @param originalChannels Channel count of [pcmData]; must be positive.
 * @param targetChannels Desired channel count; must be positive.
 * @return The converted PCM with `targetChannels` interleaved channels.
 * @throws IllegalArgumentException if either channel count is not positive.
 */
private fun convertChannels(pcmData: ByteArray, originalChannels: Int, targetChannels: Int): ByteArray {
    require(originalChannels > 0 && targetChannels > 0) {
        "Channel counts must be positive, got $originalChannels -> $targetChannels"
    }

    val inputBuffer = ByteBuffer.wrap(pcmData).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
    val frameCount = inputBuffer.remaining() / originalChannels

    val result = ByteArray(frameCount * targetChannels * 2)
    val outputBuffer = ByteBuffer.wrap(result).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()

    val frame = ShortArray(originalChannels)
    repeat(frameCount) {
        inputBuffer.get(frame)
        if (targetChannels == 1) {
            var sum = 0
            for (sample in frame) sum += sample
            outputBuffer.put((sum / originalChannels).toShort())
        } else {
            for (channel in 0 until targetChannels) {
                outputBuffer.put(frame[minOf(channel, originalChannels - 1)])
            }
        }
    }

    return result
}
1076
-
1077
/**
 * Logs the first 44 bytes of [file] as hex, as ASCII, and decoded as little-endian WAV
 * header fields. Diagnostic only; any failure is logged and swallowed.
 *
 * @param file The file whose header should be dumped.
 */
private fun debugWavHeader(file: File) {
    try {
        // 44 bytes is the size of a standard WAV header.
        val bytes = ByteArray(44)
        RandomAccessFile(file, "r").use { it.readFully(bytes) }

        val hexDump = bytes.joinToString(", ") { String.format("%02X", it) }
        Log.d("AudioProcessor", "WAV Header Bytes: $hexDump")

        val asciiDump = bytes.map { it.toInt().toChar() }.joinToString("")
        Log.d("AudioProcessor", "ASCII: $asciiDump")

        val buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
        val fieldSummary = """
            RIFF header: ${String(bytes, 0, 4)}
            File size: ${buffer.getInt(4)}
            WAVE header: ${String(bytes, 8, 4)}
            fmt header: ${String(bytes, 12, 4)}
            Chunk size: ${buffer.getInt(16)}
            Audio format: ${buffer.getShort(20)}
            Channels: ${buffer.getShort(22)}
            Sample rate: ${buffer.getInt(24)}
            Byte rate: ${buffer.getInt(28)}
            Block align: ${buffer.getShort(32)}
            Bits per sample: ${buffer.getShort(34)}
        """.trimIndent()
        Log.d("AudioProcessor", fieldSummary)
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Failed to debug WAV header: ${e.message}")
    }
}
1105
-
1106
/**
 * Builds a reduced amplitude/RMS preview of [audioData] with up to [numberOfPoints] points.
 *
 * The requested time range is validated and clamped to the audio duration, split into
 * equally sized slices, and each slice is converted to float samples according to
 * `audioData.bitDepth` (16, 32, anything else is treated as 8-bit) to derive its peak
 * amplitude, RMS and dB value.
 *
 * NOTE(review): the range is computed as `ms * sampleRate / 1000` (a per-channel sample
 * count) but is then used to slice `audioData.data`, a byte array, and the time points
 * divide by `sampleRate * channels`. These units only agree for 8-bit mono data — confirm
 * the intended indexing before relying on positions or times for other formats.
 *
 * @param audioData Decoded PCM audio and its format.
 * @param numberOfPoints Number of preview points requested; must be positive.
 * @param startTimeMs Optional range start in ms; defaults to 0.
 * @param endTimeMs Optional range end in ms; defaults to, and is clamped to, the duration.
 * @param config Recording configuration; only `segmentDurationMs` is read here.
 * @return The preview data points together with amplitude and RMS ranges.
 * @throws IllegalArgumentException if [numberOfPoints] is not positive, the time range is
 *         invalid, or the range contains no samples.
 * @throws IllegalStateException if a slice cannot be processed or no points are produced.
 */
fun generatePreview(
    audioData: AudioData,
    numberOfPoints: Int,
    startTimeMs: Long? = null,
    endTimeMs: Long? = null,
    config: RecordingConfig
): AudioAnalysisData {
    // Guards the integer division below, which would otherwise fail with ArithmeticException.
    require(numberOfPoints > 0) { "numberOfPoints must be positive, got: $numberOfPoints" }

    val totalDurationMs = audioData.durationMs

    Log.d(Constants.TAG, "Total audio duration: ${totalDurationMs}ms")

    // Validate time range
    if (startTimeMs != null) {
        require(startTimeMs >= 0) { "startTime must be non-negative, got: $startTimeMs" }
        require(startTimeMs <= totalDurationMs) { "startTime ($startTimeMs) is beyond audio duration ($totalDurationMs)" }
    }

    if (endTimeMs != null) {
        require(endTimeMs >= 0) { "endTime must be non-negative, got: $endTimeMs" }
        if (endTimeMs > totalDurationMs) {
            Log.w(Constants.TAG, "endTime ($endTimeMs) is beyond audio duration ($totalDurationMs), clamping to duration")
        }
        if (startTimeMs != null) {
            require(startTimeMs < endTimeMs) { "startTime ($startTimeMs) must be less than endTime ($endTimeMs)" }
        }
    }

    // Calculate effective range
    val effectiveStartMs = startTimeMs ?: 0L
    val effectiveEndMs = (endTimeMs ?: totalDurationMs).coerceAtMost(totalDurationMs)
    val durationMs = effectiveEndMs - effectiveStartMs

    Log.d(Constants.TAG, "Preview range: ${effectiveStartMs}ms to ${effectiveEndMs}ms (${durationMs}ms)")

    // Calculate sample range
    val startSampleIndex = ((effectiveStartMs * audioData.sampleRate) / 1000).toInt()
    val endSampleIndex = ((effectiveEndMs * audioData.sampleRate) / 1000).toInt().coerceAtMost(audioData.data.size)
    val samplesInRange = endSampleIndex - startSampleIndex

    if (samplesInRange <= 0) {
        throw IllegalArgumentException("Invalid sample range: contains no samples")
    }

    val samplesPerPoint = (samplesInRange / numberOfPoints).coerceAtLeast(1)

    val dataPoints = mutableListOf<DataPoint>()
    // Seed the maxima with the most negative float; Float.MIN_VALUE is the smallest
    // *positive* float and would be reported as the maximum for fully silent audio.
    var minAmplitude = Float.MAX_VALUE
    var maxAmplitude = -Float.MAX_VALUE
    var minRms = Float.MAX_VALUE
    var maxRms = -Float.MAX_VALUE

    val extractionTimeMs = measureTimeMillis {
        for (i in 0 until numberOfPoints) {
            val pointStartSample = startSampleIndex + (i * samplesPerPoint)
            val pointEndSample = minOf(startSampleIndex + ((i + 1) * samplesPerPoint), endSampleIndex)

            // The range is exhausted before all requested points were produced.
            if (pointStartSample >= pointEndSample) break

            try {
                val segmentBytes = audioData.data.sliceArray(pointStartSample until pointEndSample)

                // Convert PCM bytes to float samples with proper bit depth handling
                val segmentData = when (audioData.bitDepth) {
                    16 -> convert16BitPcmToFloat(segmentBytes)
                    32 -> convert32BitPcmToFloat(segmentBytes)
                    else -> convert8BitPcmToFloat(segmentBytes)
                }

                // Calculate time points based on actual sample rate
                val startTimePoint = ((pointStartSample * 1000L) / (audioData.sampleRate * audioData.channels)).toFloat()
                val endTimePoint = ((pointEndSample * 1000L) / (audioData.sampleRate * audioData.channels)).toFloat()

                val rms = sqrt(segmentData.map { it * it }.average().toFloat())
                val amplitude = segmentData.maxOf { abs(it) } // Always use peak amplitude

                minAmplitude = minOf(minAmplitude, amplitude)
                maxAmplitude = maxOf(maxAmplitude, amplitude)
                minRms = minOf(minRms, rms)
                maxRms = maxOf(maxRms, rms)

                dataPoints.add(DataPoint(
                    id = i.toLong(),
                    amplitude = amplitude, // Peak amplitude
                    rms = rms, // RMS value
                    dB = 20 * log10(amplitude.toDouble()).toFloat(),
                    silent = amplitude < 0.01,
                    features = null,
                    speech = null,
                    startTime = startTimePoint,
                    endTime = endTimePoint,
                    startPosition = pointStartSample,
                    endPosition = pointEndSample,
                    samples = segmentData.size
                ))
            } catch (e: Exception) {
                Log.e(Constants.TAG, "Error processing segment $i: ${e.message}")
                throw IllegalStateException("Failed to process audio segment: ${e.message}", e)
            }
        }
    }

    if (dataPoints.isEmpty()) {
        throw IllegalStateException("No data points were generated")
    }

    return AudioAnalysisData(
        segmentDurationMs = config.segmentDurationMs,
        durationMs = durationMs.toInt(),
        bitDepth = audioData.bitDepth,
        numberOfChannels = audioData.channels,
        sampleRate = audioData.sampleRate,
        samples = samplesInRange,
        dataPoints = dataPoints,
        amplitudeRange = AudioAnalysisData.AmplitudeRange(minAmplitude, maxAmplitude),
        rmsRange = AudioAnalysisData.AmplitudeRange(minRms, maxRms),
        extractionTimeMs = extractionTimeMs.toFloat()
    )
}
1225
-
1226
- // Add these conversion helpers
1227
/**
 * Converts 16-bit little-endian PCM bytes to floats by dividing each sample by
 * `Short.MAX_VALUE`. A trailing odd byte is ignored.
 *
 * @param bytes 16-bit little-endian PCM bytes.
 * @return One float per 16-bit sample.
 */
private fun convert16BitPcmToFloat(bytes: ByteArray): FloatArray {
    val samples = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
    return FloatArray(bytes.size / 2) { index ->
        samples.get(index).toFloat() / Short.MAX_VALUE
    }
}
1232
-
1233
/**
 * Converts 32-bit little-endian integer PCM bytes to floats by dividing each sample by
 * `Int.MAX_VALUE`. Trailing bytes that do not fill a whole sample are ignored.
 *
 * @param bytes 32-bit little-endian PCM bytes.
 * @return One float per 32-bit sample.
 */
private fun convert32BitPcmToFloat(bytes: ByteArray): FloatArray {
    val samples = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asIntBuffer()
    return FloatArray(bytes.size / 4) { index ->
        samples.get(index).toFloat() / Int.MAX_VALUE
    }
}
1238
-
1239
/**
 * Converts unsigned 8-bit PCM bytes to floats centred on zero.
 *
 * Each byte is read as an unsigned value in `0..255`, shifted by 128 and divided by 127,
 * so `0x80` maps to 0 and `0xFF` maps to 1.
 *
 * @param bytes Unsigned 8-bit PCM bytes.
 * @return One float per byte.
 */
private fun convert8BitPcmToFloat(bytes: ByteArray): FloatArray {
    // Kotlin's Byte is signed; mask to recover the unsigned sample before removing the
    // 128 offset, otherwise the midpoint 0x80 would be read as -128 and map to about -2.
    return FloatArray(bytes.size) { index ->
        ((bytes[index].toInt() and 0xFF) - 128).toFloat() / 127f
    }
}
1242
-
1243
/**
 * Loads the `[startTimeMs, endTimeMs]` portion of an audio file.
 *
 * Files whose name ends in `.wav` and for which `getWavHeaderSize` returns a header size
 * are read by the WAV range loader; everything else goes through the compressed-audio
 * loader.
 *
 * @param fileUri Path or URI of the audio file.
 * @param startTimeMs Range start in milliseconds.
 * @param endTimeMs Range end in milliseconds.
 * @param config Optional decoding configuration; when `null`, a default with no target
 *        sample rate or channels, 16-bit depth and no normalisation is used.
 * @return The loaded range, or `null` when loading fails for any reason.
 */
fun loadAudioRange(fileUri: String, startTimeMs: Long, endTimeMs: Long, config: DecodingConfig? = null): AudioData? {
    return try {
        val effectiveConfig = config ?: DecodingConfig(
            targetSampleRate = null,
            targetChannels = null,
            targetBitDepth = 16,
            normalizeAudio = false
        )

        // The extension decides whether the header is inspected at all.
        val looksLikeWav = fileUri.lowercase().endsWith(".wav")
        val headerSize = if (looksLikeWav) getWavHeaderSize(fileUri) else null

        when {
            looksLikeWav && headerSize != null -> {
                Log.d(Constants.TAG, "Loading WAV range with header size: $headerSize bytes")
                loadWavRange(fileUri, startTimeMs, endTimeMs, effectiveConfig, headerSize)
            }
            else -> {
                if (looksLikeWav) {
                    Log.w(Constants.TAG, "File has .wav extension but invalid header, falling back to compressed loader")
                }
                Log.d(Constants.TAG, "Loading compressed audio range")
                loadCompressedAudioRange(fileUri, startTimeMs, endTimeMs, effectiveConfig)
            }
        }
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to load audio range: ${e.message}", e)
        null
    }
}
1277
-
1278
/**
 * Reads the PCM bytes for `[startTimeMs, endTimeMs]` straight out of a WAV file.
 *
 * Byte offsets are derived from the format reported by `getAudioFormat`, aligned down to a
 * whole frame (channels x bytes per sample) so samples are never split, and clamped to the
 * bytes actually present after the header. The data is then converted to
 * `config.targetBitDepth` when that differs from the file's bit depth.
 *
 * NOTE(review): the available payload is taken as `file.length() - headerSize`; if the data
 * chunk is followed by other chunks they would be read as audio — confirm.
 *
 * @param fileUri Path or `file://` URI of the WAV file.
 * @param startTimeMs Range start in milliseconds.
 * @param endTimeMs Range end in milliseconds.
 * @param config Decoding configuration; only `targetBitDepth` is applied here.
 * @param headerSize Size in bytes of the header preceding the PCM data.
 * @return The loaded range, or `null` when the file or its format cannot be read.
 */
private fun loadWavRange(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig,
    headerSize: Int
): AudioData? {
    try {
        val file = File(fileUri.removePrefix("file://")).takeIf { it.exists() }
            ?: File(filesDir, File(fileUri).name).takeIf { it.exists() }
            ?: throw IllegalArgumentException("File not found: $fileUri")

        // Use existing method to get audio format
        val format = getAudioFormat(fileUri) ?: throw IllegalArgumentException("Could not determine audio format")

        val bytesPerSecond = format.sampleRate * format.channels * (format.bitDepth / 8)
        val bytesPerFrame = (format.channels * (format.bitDepth / 8)).coerceAtLeast(1)
        val availableBytes = (file.length() - headerSize).coerceAtLeast(0L)

        // Time -> byte offset inside the payload: clamped to what the file holds, then
        // aligned down to a frame boundary so multi-byte samples stay intact.
        val toFrameAlignedOffset = { timeMs: Long ->
            val raw = (timeMs.coerceAtLeast(0L) * bytesPerSecond) / 1000
            val clamped = raw.coerceIn(0L, availableBytes)
            clamped - clamped % bytesPerFrame
        }
        val startByteOffset = toFrameAlignedOffset(startTimeMs)
        val endByteOffset = toFrameAlignedOffset(endTimeMs)

        val startByte = headerSize + startByteOffset
        val endByte = headerSize + endByteOffset

        Log.d(Constants.TAG, """
            Loading WAV range:
            - headerSize: $headerSize
            - startByte: $startByte
            - endByte: $endByte
            - bytesPerSecond: $bytesPerSecond
        """.trimIndent())

        val rangeLength = (endByte - startByte).coerceAtLeast(0L)
        require(rangeLength <= Int.MAX_VALUE) { "Requested range is too large: $rangeLength bytes" }

        var audioDataBytes = ByteArray(rangeLength.toInt())
        // seek + readFully either fills the whole array or throws; InputStream.skip/read
        // may legally process fewer bytes than requested.
        RandomAccessFile(file, "r").use { raf ->
            raf.seek(startByte)
            raf.readFully(audioDataBytes)
        }

        // Apply bit depth conversion if needed
        var effectiveBitDepth = format.bitDepth
        if (config.targetBitDepth != format.bitDepth) {
            audioDataBytes = AudioFormatUtils.convertBitDepth(
                audioDataBytes,
                format.bitDepth,
                config.targetBitDepth
            )
            effectiveBitDepth = config.targetBitDepth
            Log.d(Constants.TAG, "Converted bit depth from ${format.bitDepth} to ${config.targetBitDepth}")
        }

        return AudioData(
            data = audioDataBytes,
            sampleRate = format.sampleRate,
            channels = format.channels,
            bitDepth = effectiveBitDepth,
            durationMs = endTimeMs - startTimeMs
        )
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to load WAV range: ${e.message}", e)
        return null
    }
}
1338
-
1339
/**
 * Decodes the `[startTimeMs, endTimeMs]` portion of a compressed audio file to PCM.
 *
 * Track 0 is decoded with a synchronous `MediaCodec`, starting from the sync point closest
 * to the start time. Decoding stops when the decoder signals end-of-stream or when the
 * output buffer, sized from the requested duration and the target format, is full.
 *
 * NOTE(review): track 0 is used without checking that it is an audio track, and the decoded
 * PCM is neither resampled nor remixed here although the returned sample rate, channels
 * and bit depth are the *target* values — confirm that callers only pass targets matching
 * the source format.
 *
 * @param fileUri Path or `file://` URI of the audio file.
 * @param startTimeMs Range start in milliseconds; clamped to the track duration.
 * @param endTimeMs Range end in milliseconds; clamped to the track duration.
 * @param config Decoding configuration supplying the target format.
 * @return The decoded range, or `null` when decoding fails.
 */
private fun loadCompressedAudioRange(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig
): AudioData? {
    val extractor = MediaExtractor()
    var decoder: MediaCodec? = null
    var decoderStarted = false

    try {
        extractor.setDataSource(fileUri.removePrefix("file://"))
        val format = extractor.getTrackFormat(0)
        extractor.selectTrack(0)

        val originalSampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE)
        val originalChannels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT)
        // If the duration cannot be read as a long, retry as a string, defaulting to -1.
        val totalDurationUs = try {
            format.getLong(MediaFormat.KEY_DURATION)
        } catch (e: Exception) {
            (format.getString(MediaFormat.KEY_DURATION) ?: "-1").toLong()
        }
        Log.d("AudioProcessor", "Raw duration from format: ${totalDurationUs}us")

        val totalDurationMs = totalDurationUs / 1000
        Log.d("AudioProcessor", "Final duration: ${totalDurationMs}ms")

        // Calculate valid time range. A negative duration means "unknown"; coerceIn would
        // throw on an inverted range, so no upper bound is applied in that case.
        val upperBoundMs = if (totalDurationMs >= 0) totalDurationMs else Long.MAX_VALUE
        val validStartMs = startTimeMs.coerceIn(0L, upperBoundMs)
        val validEndMs = endTimeMs.coerceIn(validStartMs, upperBoundMs)
        val effectiveDurationMs = validEndMs - validStartMs

        // Initialize decoder
        val mime = requireNotNull(format.getString(MediaFormat.KEY_MIME)) { "Track format has no MIME type" }
        val codec = MediaCodec.createDecoderByType(mime)
        decoder = codec
        codec.configure(format, null, null, 0)
        codec.start()
        decoderStarted = true

        // Seek to start position if needed
        if (validStartMs > 0) {
            extractor.seekTo(validStartMs * 1000, MediaExtractor.SEEK_TO_CLOSEST_SYNC)
        }

        // Calculate buffer sizes
        val targetSampleRate = config.targetSampleRate ?: originalSampleRate
        val targetChannels = config.targetChannels ?: originalChannels
        val targetBitDepth = config.targetBitDepth ?: 16
        val bytesPerSample = targetBitDepth / 8
        val samplesPerSecond = targetSampleRate * targetChannels
        val totalBytes = (effectiveDurationMs * samplesPerSecond * bytesPerSample) / 1000
        require(totalBytes in 0..Int.MAX_VALUE.toLong()) { "Requested range is too large: $totalBytes bytes" }

        Log.d(Constants.TAG, """
            Loading audio range:
            - start: ${validStartMs}ms
            - end: ${validEndMs}ms
            - duration: ${effectiveDurationMs}ms
            - bytes: $totalBytes
            - format: ${targetSampleRate}Hz, $targetChannels channels, $targetBitDepth-bit
        """.trimIndent())

        val outputBuffer = ByteBuffer.allocate(totalBytes.toInt())
        val bufferInfo = MediaCodec.BufferInfo()
        var inputDone = false
        // Nothing to collect when the requested range is empty.
        var outputDone = !outputBuffer.hasRemaining()

        while (!outputDone) {
            // Handle input
            if (!inputDone) {
                val inputBufferId = codec.dequeueInputBuffer(10000)
                if (inputBufferId >= 0) {
                    val inputBuffer = codec.getInputBuffer(inputBufferId)!!
                    val sampleSize = extractor.readSampleData(inputBuffer, 0)

                    when {
                        // Source exhausted, or the next sample lies past the requested end:
                        // signal EOS but keep draining the decoder's pending output below.
                        sampleSize < 0 || extractor.sampleTime > validEndMs * 1000 -> {
                            codec.queueInputBuffer(inputBufferId, 0, 0, 0, MediaCodec.BUFFER_FLAG_END_OF_STREAM)
                            inputDone = true
                        }
                        else -> {
                            codec.queueInputBuffer(inputBufferId, 0, sampleSize, extractor.sampleTime, 0)
                            extractor.advance()
                        }
                    }
                }
            }

            // Handle output
            val outputBufferId = codec.dequeueOutputBuffer(bufferInfo, 10000)
            if (outputBufferId >= 0) {
                val decodedBuffer = codec.getOutputBuffer(outputBufferId)!!
                if (bufferInfo.size > 0) {
                    // Never copy more than still fits: the decoder can yield more than the
                    // estimate, and an unbounded put() would throw BufferOverflowException.
                    val writable = minOf(bufferInfo.size, outputBuffer.remaining())
                    decodedBuffer.position(bufferInfo.offset)
                    decodedBuffer.limit(bufferInfo.offset + writable)
                    outputBuffer.put(decodedBuffer)
                }
                codec.releaseOutputBuffer(outputBufferId, false)

                // Done once the decoder is drained or the requested amount was collected.
                if ((bufferInfo.flags and MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0 || !outputBuffer.hasRemaining()) {
                    outputDone = true
                }
            }
        }

        // Prepare the final byte array
        outputBuffer.flip()
        val audioData = ByteArray(outputBuffer.remaining())
        outputBuffer.get(audioData)

        return AudioData(
            data = audioData,
            sampleRate = targetSampleRate,
            channels = targetChannels,
            bitDepth = targetBitDepth,
            durationMs = endTimeMs - startTimeMs // Use the actual time range
        ).also {
            Log.d(Constants.TAG, "Loaded compressed audio with duration: ${effectiveDurationMs}ms")
        }
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to load compressed audio range: ${e.message}", e)
        return null
    } finally {
        decoder?.let { codec ->
            if (decoderStarted) {
                try {
                    codec.stop()
                } catch (e: IllegalStateException) {
                    // The codec may already be in an error state; release() still applies.
                    Log.w(Constants.TAG, "Decoder stop failed: ${e.message}")
                }
            }
            codec.release()
        }
        extractor.release()
    }
}
1468
-
1469
- // Future audio editing methods
1470
/**
 * Extracts a time range from an audio file and saves it as a 16-byte-fmt PCM WAV file.
 *
 * The range is decoded through [loadAudioRange]. The result is written into `filesDir`
 * under [outputFileName], or under a generated `trimmed_<timestamp>.wav` name when no
 * name is supplied.
 *
 * @param fileUri URI of the source audio file.
 * @param startTimeMs Start of the range to keep, in milliseconds.
 * @param endTimeMs End of the range to keep, in milliseconds.
 * @param config Decoding options; a default [DecodingConfig] is used when null.
 * @param outputFileName Optional name of the output file inside `filesDir`.
 * @return The trimmed audio, or null when loading or writing fails.
 */
fun trimAudio(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig? = null,
    outputFileName: String? = null
): AudioData? {
    try {
        // Load the specified range
        val audioData = loadAudioRange(fileUri, startTimeMs, endTimeMs, config ?: DecodingConfig())
            ?: return null

        // Generate output filename if not provided
        val outputFile = File(filesDir, outputFileName ?: "trimmed_${System.currentTimeMillis()}.wav")

        val durationMs = (endTimeMs - startTimeMs).toInt()

        Log.d(Constants.TAG, """
            Trimming audio:
            - start: ${startTimeMs}ms
            - end: ${endTimeMs}ms
            - duration: ${durationMs}ms
            - output: ${outputFile.name}
        """.trimIndent())

        // Every multi-byte WAV header field is little-endian. RandomAccessFile.writeInt and
        // writeShort emit big-endian values, so the whole 44-byte header is assembled in a
        // little-endian ByteBuffer and written in one call.
        val bytesPerSample = audioData.bitDepth / 8
        val header = ByteBuffer.allocate(44).order(ByteOrder.LITTLE_ENDIAN).apply {
            // RIFF header
            put("RIFF".toByteArray())
            putInt(audioData.data.size + 36) // File size minus the 8-byte RIFF preamble
            put("WAVE".toByteArray())

            // fmt chunk
            put("fmt ".toByteArray())
            putInt(16) // Subchunk1Size (16 for PCM)
            putShort(1) // AudioFormat (1 for PCM)
            putShort(audioData.channels.toShort())
            putInt(audioData.sampleRate)
            putInt(audioData.sampleRate * audioData.channels * bytesPerSample) // ByteRate
            putShort((audioData.channels * bytesPerSample).toShort()) // BlockAlign
            putShort(audioData.bitDepth.toShort()) // BitsPerSample

            // data chunk
            put("data".toByteArray())
            putInt(audioData.data.size) // Subchunk2Size
        }.array()

        RandomAccessFile(outputFile, "rw").use { raf ->
            // Drop stale content when overwriting a longer existing file.
            raf.setLength(0)
            raf.write(header)
            raf.write(audioData.data)
        }

        // Debug WAV header to verify
        debugWavHeader(outputFile)

        // Return the trimmed audio data
        return AudioData(
            data = audioData.data,
            sampleRate = audioData.sampleRate,
            channels = audioData.channels,
            bitDepth = audioData.bitDepth
        )
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to trim audio: ${e.message}", e)
        return null
    }
}
1553
-
1554
/**
 * Placeholder for cutting a time range out of an audio file.
 *
 * Not implemented yet: every call returns null and the arguments are ignored.
 *
 * TODO: Implement removing a section by concatenating before and after ranges.
 * This will use loadAudioRange to get two sections and join them.
 */
fun removeSection(
    fileUri: String,
    startTimeMs: Long,
    endTimeMs: Long,
    config: DecodingConfig? = null
): AudioData? = null
1564
-
1565
/**
 * Placeholder for concatenating several audio sections into one.
 *
 * Not implemented yet: every call returns null and the arguments are ignored.
 *
 * TODO: Implement joining multiple audio sections.
 * This will be used by removeSection and other future editing features.
 */
fun joinAudioSections(
    sections: List<AudioData>,
    config: DecodingConfig? = null
): AudioData? = null
1573
-
1574
- // Helper method for future editing features
1575
/**
 * Placeholder for sample-rate / channel / bit-depth conversion.
 *
 * Not implemented yet: the input is handed back unchanged and the target values are ignored.
 *
 * TODO: Implement audio format conversion.
 * This will help ensure consistent format when joining sections.
 */
private fun convertAudioFormat(
    audioData: AudioData,
    targetSampleRate: Int? = null,
    targetChannels: Int? = null,
    targetBitDepth: Int? = null
): AudioData = audioData
1585
-
1586
- // Add new function to process entire file
1587
/**
 * Computes the basic feature set for a whole recording treated as one segment.
 *
 * The PCM bytes are converted to floats, the summary statistics are gathered, and
 * everything is delegated to [computeFeatures] with an empty option map so that none of
 * the optional features is requested.
 *
 * @param audioData Decoded audio to analyse.
 * @return The features computed over the complete sample buffer.
 */
fun processEntireFile(audioData: AudioData): Features {
    val pcm = convertToFloatArray(audioData.data, audioData.bitDepth)

    // Energy is accumulated in double precision and narrowed once at the end.
    val energy = pcm.sumOf { it * it.toDouble() }.toFloat()
    val lowest = pcm.minOrNull() ?: 0f
    val highest = pcm.maxOrNull() ?: 0f

    return computeFeatures(
        segmentData = pcm,
        sampleRate = audioData.sampleRate.toFloat(),
        sumSquares = energy,
        zeroCrossings = countZeroCrossings(pcm),
        segmentLength = pcm.size,
        minAmplitude = lowest,
        maxAmplitude = highest,
        featureOptions = mapOf() // Dont compute complex features
    )
}
1609
-
1610
/**
 * Counts how many adjacent sample pairs have strictly opposite signs.
 *
 * A pair where either sample is exactly zero is not counted, because the product of the
 * two samples is then not negative.
 */
private fun countZeroCrossings(data: FloatArray): Int =
    (1 until data.size).count { index -> data[index - 1] * data[index] < 0 }
1617
-
1618
/** Converts a frequency in Hz to the mel scale using `2595 * log10(1 + hz / 700)`. */
private fun hzToMel(hz: Float): Float = 2595f * log10(1f + hz / 700f)
1621
-
1622
/** Converts a mel value back to Hz using `700 * (10^(mel / 2595) - 1)`. */
private fun melToHz(mel: Float): Float = 700f * (10f.pow(mel / 2595f) - 1f)
1625
-
1626
/**
 * Returns a copy of [samples] multiplied element-wise by a symmetric Hann window.
 *
 * Inputs shorter than two samples are returned as an unmodified copy: the window
 * formula divides by `size - 1`, which would evaluate 0/0 and turn a single sample
 * into NaN.
 *
 * @param samples Samples to window; the array itself is not modified.
 * @return A new array of the same length holding the windowed samples.
 */
private fun applyHannWindow(samples: FloatArray): FloatArray {
    val count = samples.size
    if (count < 2) return samples.copyOf()

    val lastIndex = count - 1
    return FloatArray(count) { i ->
        val multiplier = 0.5f * (1f - cos(2f * PI.toFloat() * i / lastIndex))
        samples[i] * multiplier
    }
}
1634
-
1635
- // Generate a Hann window of a specific size (new, avoids modifying applyHannWindow)
1636
/**
 * Builds the coefficients of a symmetric Hann window.
 *
 * A one-point window is defined as `[1]`; the general formula divides by `size - 1`
 * and would produce NaN for that size.
 *
 * @param size Number of coefficients to generate.
 * @return The window coefficients; empty when [size] is 0.
 */
private fun generateHannWindow(size: Int): FloatArray {
    if (size == 1) return floatArrayOf(1f)
    return FloatArray(size) { i ->
        0.5f * (1f - cos(2f * PI.toFloat() * i / (size - 1)))
    }
}
1641
-
1642
- // Main function to extract mel spectrogram
1643
/**
 * Computes a mel spectrogram for the given audio.
 *
 * Steps: PCM bytes to floats, windowed STFT power spectrum, mel filterbank, then
 * optional natural-log scaling and optional min-max normalization to `[0, 1]`.
 *
 * @param audioData Decoded audio to analyse.
 * @param windowSizeMs Analysis window length in milliseconds.
 * @param hopLengthMs Step between consecutive frames in milliseconds.
 * @param nMels Number of mel bands.
 * @param fftLength FFT size in samples.
 * @param fMin Lowest frequency of the filterbank in Hz.
 * @param fMax Highest frequency of the filterbank in Hz.
 * @param windowType Either "hann" or "hamming" (case-insensitive).
 * @param logScaling When true, each value `v` is replaced by `ln(max(1e-10, v))`.
 * @param normalize When true, the result is rescaled so its minimum is 0 and maximum 1.
 * @return The spectrogram with per-frame timestamps (seconds) and band frequencies (Hz).
 * @throws IllegalArgumentException if [windowType] is not supported.
 */
fun extractMelSpectrogram(
    audioData: AudioData,
    windowSizeMs: Float = 25f, // Default 25ms window
    hopLengthMs: Float = 10f, // Default 10ms hop
    nMels: Int = 128, // Number of mel bins
    fftLength: Int = 2048, // FFT size
    fMin: Float = 0f, // Minimum frequency
    fMax: Float = audioData.sampleRate.toFloat() / 2, // Nyquist frequency
    windowType: String = "hann", // Add parameter
    logScaling: Boolean = true, // Apply log scaling
    normalize: Boolean = false // Normalize output
): SpectrogramData {
    val sampleRate = audioData.sampleRate.toFloat()
    val samples = convertToFloatArray(audioData.data, audioData.bitDepth)

    // Convert ms to samples
    val windowSizeSamples = (windowSizeMs * sampleRate / 1000).toInt()
    val hopLengthSamples = (hopLengthMs * sampleRate / 1000).toInt()

    val window = when (windowType.lowercase()) {
        "hann" -> generateHannWindow(windowSizeSamples)
        "hamming" -> FloatArray(windowSizeSamples) { i ->
            0.54f - 0.46f * cos(2f * PI.toFloat() * i / (windowSizeSamples - 1))
        }
        else -> throw IllegalArgumentException("Unsupported windowType: $windowType")
    }

    // Compute STFT
    val stft = computeSTFT(samples, fftLength, windowSizeSamples, hopLengthSamples, window)

    // Apply mel filterbank
    val melSpectrogram = applyMelFilterbank(stft, sampleRate, nMels, fftLength, fMin, fMax)

    // Post-processing: log scaling and normalization
    if (logScaling) {
        for (row in melSpectrogram) {
            for (j in row.indices) {
                row[j] = ln(max(1e-10f, row[j]))
            }
        }
    }
    if (normalize) {
        // Float.MIN_VALUE is the smallest POSITIVE float, not the most negative one. Using
        // it as the initial maximum breaks normalization whenever every value is negative,
        // which is the normal case after log scaling. Start from the infinities instead.
        var minVal = Float.POSITIVE_INFINITY
        var maxVal = Float.NEGATIVE_INFINITY

        for (row in melSpectrogram) {
            for (value in row) {
                if (value < minVal) minVal = value
                if (value > maxVal) maxVal = value
            }
        }

        // An empty or constant spectrogram has no positive range and is left untouched.
        val range = maxVal - minVal
        if (range > 0) {
            for (row in melSpectrogram) {
                for (j in row.indices) {
                    row[j] = (row[j] - minVal) / range
                }
            }
        }
    }

    // Compute timestamps and frequencies for metadata
    val numFrames = melSpectrogram.size
    val timeStamps = FloatArray(numFrames) { it * hopLengthMs / 1000f }
    val frequencies = melFrequencies(nMels, fMin, fMax)

    return SpectrogramData(melSpectrogram, timeStamps, frequencies)
}
1715
-
1716
- // Compute Short-Time Fourier Transform
1717
/**
 * Computes the power spectrogram (squared magnitude STFT) of [samples].
 *
 * Each frame takes up to [windowSize] samples starting at `frameIdx * hopLength`,
 * multiplies them by [window], zero-pads to [fftLength] and runs the FFT.
 *
 * @param samples Input signal.
 * @param fftLength FFT size; each output row has `fftLength / 2 + 1` bins.
 * @param windowSize Number of samples taken per frame; must not exceed [fftLength].
 * @param hopLength Step between frame starts; must be positive.
 * @param window Window coefficients; must hold at least [windowSize] values.
 * @return One row of power values per frame; empty when the input is too short for a frame.
 * @throws IllegalArgumentException if the size arguments are inconsistent.
 */
private fun computeSTFT(
    samples: FloatArray,
    fftLength: Int,
    windowSize: Int,
    hopLength: Int,
    window: FloatArray
): Array<FloatArray> {
    // Fail with a readable message instead of an ArithmeticException or an
    // ArrayIndexOutOfBoundsException from deep inside the frame loop.
    require(hopLength > 0) { "hopLength must be positive, was $hopLength" }
    require(windowSize <= fftLength) { "windowSize ($windowSize) must not exceed fftLength ($fftLength)" }
    require(window.size >= windowSize) { "window has ${window.size} coefficients, need $windowSize" }

    val fft = FFT(fftLength)
    val numBins = fftLength / 2 + 1

    // Integer division truncates toward zero, so inputs much shorter than one window give
    // a negative count; clamp it so that such inputs yield an empty spectrogram.
    val numFrames = (((samples.size - windowSize) / hopLength) + 1).coerceAtLeast(0)

    return Array(numFrames) { frameIdx ->
        val start = frameIdx * hopLength
        val end = minOf(start + windowSize, samples.size)
        val frame = FloatArray(fftLength)

        // Extract and window the frame; the remainder stays zero-padded.
        for (i in start until end) {
            frame[i - start] = samples[i] * window[i - start]
        }

        // Compute FFT and power spectrum.
        // NOTE(review): assumes processSegment returns interleaved (re, im) pairs; bins
        // beyond the returned length are treated as zero, as in the original code.
        val fftResult = fft.processSegment(frame)
        FloatArray(numBins) { bin ->
            val real = if (2 * bin < fftResult.size) fftResult[2 * bin] else 0f
            val imag = if (2 * bin + 1 < fftResult.size) fftResult[2 * bin + 1] else 0f
            real * real + imag * imag
        }
    }
}
1749
-
1750
- // Apply mel filterbank to STFT
1751
/**
 * Projects every STFT frame onto a triangular mel filterbank.
 *
 * @param stft Power spectrogram, one row per frame.
 * @param sampleRate Sampling rate in Hz.
 * @param nMels Number of mel bands to produce.
 * @param fftLength FFT size used to build [stft].
 * @param fMin Lowest filterbank frequency in Hz.
 * @param fMax Highest filterbank frequency in Hz.
 * @return One row of [nMels] band energies per frame; empty when [stft] has no frames.
 */
private fun applyMelFilterbank(
    stft: Array<FloatArray>,
    sampleRate: Float,
    nMels: Int,
    fftLength: Int,
    fMin: Float,
    fMax: Float
): Array<FloatArray> {
    // Nothing to project; reading stft[0] below would throw on an empty spectrogram.
    if (stft.isEmpty()) return emptyArray()

    val numBins = stft[0].size
    val melFilters = createMelFilterbank(sampleRate, fftLength, nMels, fMin, fMax)

    return Array(stft.size) { frame ->
        val spectrum = stft[frame]
        FloatArray(nMels) { melBin ->
            val weights = melFilters[melBin]
            var sum = 0f
            for (bin in 0 until numBins) {
                sum += spectrum[bin] * weights[bin]
            }
            sum
        }
    }
}
1775
-
1776
- // Create mel filterbank matrix
1777
/**
 * Builds a bank of triangular filters spaced evenly on the mel scale.
 *
 * Filter `m` rises linearly from edge `m` to edge `m + 1` and falls back to zero at
 * edge `m + 2`, where the `nMels + 2` edges come from [melFrequencies].
 *
 * @return `nMels` rows, each holding one weight per FFT bin (`fftLength / 2 + 1` bins).
 */
private fun createMelFilterbank(
    sampleRate: Float,
    fftLength: Int,
    nMels: Int,
    fMin: Float,
    fMax: Float
): Array<FloatArray> {
    val binCount = fftLength / 2 + 1
    // Centre frequency of every FFT bin in Hz.
    val binFrequencies = FloatArray(binCount) { it * sampleRate / fftLength }
    val edges = melFrequencies(nMels + 2, fMin, fMax)

    return Array(nMels) { band ->
        val lower = edges[band]
        val peak = edges[band + 1]
        val upper = edges[band + 2]

        FloatArray(binCount) { bin ->
            val hz = binFrequencies[bin]
            if (hz < lower || hz > upper) {
                0f
            } else if (hz <= peak) {
                (hz - lower) / (peak - lower)
            } else {
                (upper - hz) / (upper - peak)
            }
        }
    }
}
1804
-
1805
- // Generate mel-spaced frequencies
1806
/**
 * Returns [nMels] frequencies in Hz that are evenly spaced on the mel scale between
 * [fMin] and [fMax], both inclusive.
 *
 * A single requested point is placed at [fMin]; the general spacing formula divides by
 * `nMels - 1` and would produce NaN for that count.
 */
private fun melFrequencies(nMels: Int, fMin: Float, fMax: Float): FloatArray {
    val melMin = hzToMel(fMin)
    val melMax = hzToMel(fMax)
    if (nMels == 1) return floatArrayOf(melToHz(melMin))

    return FloatArray(nMels) { i ->
        val mel = melMin + i * (melMax - melMin) / (nMels - 1)
        melToHz(mel)
    }
}
1815
-
1816
/**
 * Computes 128 log mel-band energies for a single block of samples.
 *
 * The power spectrum from [prepareFFT] is weighted by each filter of
 * [computeMelFilterbank]; each band energy is floored at 1e-10 before the natural log.
 */
private fun computeMelSpectrogram(samples: FloatArray, sampleRate: Float): List<Float> {
    val powerSpectrum = prepareFFT(samples, sampleRate).first
    val filterbank = computeMelFilterbank(
        numFilters = 128,
        powerSpectrumSize = powerSpectrum.size,
        sampleRate = sampleRate
    )

    // Apply Mel filters to power spectrum
    return filterbank.map { weights ->
        var bandEnergy = 0f
        powerSpectrum.forEachIndexed { bin, power ->
            bandEnergy += power * weights[bin]
        }
        kotlin.math.ln(maxOf(bandEnergy, 1e-10f))
    }
}
1833
-
1834
/**
 * Accumulates spectral magnitude into pitch classes.
 *
 * [prepareFFT] returns one magnitude per frequency bin (`fftLength / 2 + 1` values), so
 * bins are read directly rather than as interleaved real/imaginary pairs, and the bin
 * spacing is derived from that array's length. Pitch classes are counted in semitones
 * relative to 440 Hz, wrapped into `0..11` for frequencies both above and below 440 Hz.
 *
 * @param samples Block of samples to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return `N_CHROMA` accumulated magnitudes.
 */
private fun computeChroma(samples: FloatArray, sampleRate: Float): List<Float> {
    val (_, magnitudeSpectrum) = prepareFFT(samples, sampleRate)
    val chroma = FloatArray(N_CHROMA) { 0f }

    // Recover the FFT length actually used so that bin frequencies are correct.
    val fftLength = (magnitudeSpectrum.size - 1) * 2
    if (fftLength <= 0) return chroma.toList()
    val freqsPerBin = sampleRate / fftLength

    // Bin 0 is DC and has no pitch, so start at bin 1.
    for (bin in 1 until magnitudeSpectrum.size) {
        val freq = bin * freqsPerBin
        if (freq <= 0f) continue

        val semitonesFrom440 = kotlin.math.floor(12 * log2(freq / 440.0)).toInt()
        // Kotlin's % keeps the sign of the dividend; shift the result into 0..11.
        val pitchClass = ((semitonesFrom440 % 12) + 12) % 12
        if (pitchClass < chroma.size) {
            chroma[pitchClass] += magnitudeSpectrum[bin]
        }
    }

    return chroma.toList()
}
1854
-
1855
/**
 * Placeholder for spectral contrast.
 *
 * The magnitude spectrum is computed through [prepareFFT] but not consumed yet; the
 * function currently always returns an empty list.
 */
private fun computeSpectralContrast(samples: FloatArray, sampleRate: Float): List<Float> {
    val magnitudeSpectrum = prepareFFT(samples, sampleRate).second
    // ... rest of spectral contrast computation using magnitudeSpectrum ...
    // Implementation depends on your specific requirements
    return emptyList() // Placeholder
}
1861
-
1862
/**
 * Derives six tonal features by summing selected chroma bins.
 *
 * Each output value is the sum of three pitch-class energies from [computeChroma].
 * Missing chroma bins count as zero.
 *
 * @param samples Block of samples to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return Six sums, one per pitch-class group.
 */
private fun computeTonnetz(samples: FloatArray, sampleRate: Float): List<Float> {
    // First compute chroma features
    val chroma = computeChroma(samples, sampleRate)

    // Pitch classes combined by each of the six rows of the former 6x12 weight matrix.
    // That matrix had rows of 13 and 16 entries, and its fifth row put a weight at
    // index 14, outside the 12 chroma bins, so that weight never contributed.
    // NOTE(review): index 11 is used here because it continues the +1 progression of
    // the first four rows (0/4/7, 1/5/8, 2/6/9, 3/7/10) - confirm the intended value.
    val pitchClassGroups = arrayOf(
        intArrayOf(0, 4, 7),
        intArrayOf(1, 5, 8),
        intArrayOf(2, 6, 9),
        intArrayOf(3, 7, 10),
        intArrayOf(4, 8, 11),
        intArrayOf(0, 5, 9)
    )

    // Compute tonnetz features
    return pitchClassGroups.map { group ->
        var sum = 0f
        for (pitchClass in group) {
            sum += chroma.getOrNull(pitchClass) ?: 0f
        }
        sum
    }
}
1888
-
1889
/**
 * Returns the smallest power of two that is greater than or equal to [n].
 *
 * Values of [n] up to 1 yield 1.
 *
 * @throws IllegalArgumentException if [n] exceeds 2^30, the largest power of two an Int
 * can hold. A doubling loop would overflow to zero there and never terminate.
 */
private fun nextPowerOfTwo(n: Int): Int {
    if (n <= 1) return 1
    require(n <= (1 shl 30)) { "No Int power of two is >= $n" }
    // The highest set bit of n - 1, doubled, is the next power of two at or above n.
    return Integer.highestOneBit(n - 1) shl 1
}
1896
-
1897
/**
 * Estimates the fundamental frequency of [segment] through FFT-based autocorrelation.
 *
 * The segment is Hann-windowed, zero-padded to a power of two of at least twice its
 * length, transformed, turned into a power spectrum and transformed back. The strongest
 * local maximum of the normalized autocorrelation above 0.3 within the 50-500 Hz lag
 * range determines the pitch.
 *
 * @param segment Samples to analyse.
 * @param sampleRate Sampling rate in Hz.
 * @return The estimated pitch in Hz, or 0 when the segment is shorter than two samples,
 * carries no energy, contains no qualifying peak, or a transform step fails.
 */
private fun estimatePitch(segment: FloatArray, sampleRate: Float): Float {
    if (segment.size < 2) return 0.0f

    // Apply Hann window
    val windowed = applyHannWindow(segment)

    // Pad for FFT - ensure length is power of 2 and sufficient for autocorrelation
    val fftLength = nextPowerOfTwo(segment.size * 2)
    val padded = FloatArray(fftLength) // Initialize with zeros
    windowed.copyInto(padded)

    // Perform forward FFT
    val fft = FFT(fftLength)
    try {
        fft.realForward(padded)
    } catch (e: Exception) {
        Log.e("AudioProcessor", "FFT forward transform failed: ${e.message}")
        return 0.0f
    }

    // Compute power spectrum
    val powerSpectrum = FloatArray(fftLength)
    try {
        // NOTE(review): assumes realForward packs the DC term in index 0 and the Nyquist
        // term in index 1, followed by (re, im) pairs - confirm against the FFT class.
        powerSpectrum[0] = padded[0] * padded[0]
        powerSpectrum[fftLength / 2] = padded[1] * padded[1]

        // Handle remaining frequencies
        for (i in 1 until fftLength / 2) {
            val re = padded[2 * i]
            val im = padded[2 * i + 1]
            powerSpectrum[i] = re * re + im * im
            powerSpectrum[fftLength - i] = powerSpectrum[i] // Mirror for inverse FFT
        }
    } catch (e: Exception) {
        Log.e("AudioProcessor", "Power spectrum computation failed: ${e.message}")
        return 0.0f
    }

    // Inverse FFT to get autocorrelation
    val autocorrelation = FloatArray(fftLength)
    try {
        fft.realInverse(powerSpectrum, autocorrelation)
    } catch (e: Exception) {
        Log.e("AudioProcessor", "FFT inverse transform failed: ${e.message}")
        return 0.0f
    }

    // A silent segment has zero energy at lag 0. Dividing by it would fill the
    // autocorrelation with NaN/Infinity and make the peak search meaningless.
    val zeroLag = autocorrelation[0]
    if (zeroLag == 0f || zeroLag.isNaN() || zeroLag.isInfinite()) return 0.0f

    // Normalize by the zero-lag autocorrelation
    val normFactor = 1.0f / zeroLag
    for (i in autocorrelation.indices) {
        autocorrelation[i] *= normFactor
    }

    // Lag range for pitches between 500 Hz (shortest lag) and 50 Hz (longest lag)
    val minLag = (sampleRate / 500.0f).toInt().coerceAtLeast(1)
    val maxLag = (sampleRate / 50.0f).toInt().coerceAtMost(autocorrelation.size - 1)
    // A peak needs a neighbour on both sides, so the last array index is excluded.
    val lastCandidate = minOf(maxLag, autocorrelation.size - 2)

    val threshold = 0.3f // Correlation threshold
    var maxCorr = -1.0f
    var pitchLag = 0

    for (lag in minLag..lastCandidate) {
        val value = autocorrelation[lag]
        val isPeak = value > autocorrelation[lag - 1] &&
            value > autocorrelation[lag + 1] &&
            value > threshold

        if (isPeak && value > maxCorr) {
            maxCorr = value
            pitchLag = lag
        }
    }

    return if (pitchLag > 0) sampleRate / pitchLag else 0.0f
}
1978
-
1979
/**
 * Applies a Hann window, zero-pads (or truncates) to [fftLength], runs a real forward FFT
 * and returns both the power and the magnitude spectrum.
 *
 * @param samples Input audio samples
 * @param sampleRate Sampling rate in Hz (currently not used by the computation)
 * @param fftLength FFT size (must be power of 2); defaults to the next power of two of
 * the sample count, with a minimum of 2048
 * @return Pair of power spectrum and magnitude spectrum, each with `fftLength / 2 + 1`
 * entries indexed by frequency bin
 */
private fun prepareFFT(samples: FloatArray, sampleRate: Float, fftLength: Int = nextPowerOfTwo(samples.size.coerceAtLeast(2048))): Pair<FloatArray, FloatArray> {
    val windowed = applyHannWindow(samples)
    val padded = windowed.copyOf(fftLength)
    val fft = FFT(fftLength)
    fft.realForward(padded)

    val nyquistBin = fftLength / 2
    val magnitudeSpectrum = FloatArray(nyquistBin + 1)

    // The Nyquist assignment below reads padded[1] as the real Nyquist term, so index 1
    // is not the imaginary part of bin 0. The DC magnitude must use padded[0] alone.
    // NOTE(review): relies on FFT.realForward using that packed layout - confirm.
    magnitudeSpectrum[0] = abs(padded[0])
    for (i in 1 until nyquistBin) {
        val re = padded[2 * i]
        val im = padded[2 * i + 1]
        magnitudeSpectrum[i] = sqrt(re * re + im * im)
    }
    if (nyquistBin > 0) {
        magnitudeSpectrum[nyquistBin] = abs(padded[1])
    }

    val powerSpectrum = FloatArray(magnitudeSpectrum.size) { magnitudeSpectrum[it] * magnitudeSpectrum[it] }
    return Pair(powerSpectrum, magnitudeSpectrum)
}
2003
-
2004
/**
 * Basic PCM description of an audio stream.
 *
 * @property sampleRate Sampling rate in Hz.
 * @property channels Number of audio channels.
 * @property bitDepth Bits per sample.
 */
data class AudioFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitDepth: Int,
)
2009
-
2010
/**
 * Reads the sample rate and channel count of an audio file.
 *
 * The path is tried as given (without a `file://` prefix) and then by file name inside
 * `filesDir`. The first track whose MIME type starts with `audio/` is inspected, so
 * containers whose first track is not audio are handled as well.
 *
 * @param fileUri Path or `file://` URI of the audio file.
 * @return The format with a fixed bit depth of 16, or null when the file is missing, has
 * no audio track, or cannot be read.
 */
fun getAudioFormat(fileUri: String): AudioFormat? {
    val cleanUri = fileUri.removePrefix("file://")
    val file = File(cleanUri).takeIf { it.exists() }
        ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            Log.e(Constants.TAG, "File not found: $cleanUri")
            return null
        }

    val extractor = MediaExtractor()
    try {
        extractor.setDataSource(file.absolutePath)

        // Same audio-track lookup as decodeAudioRangeToPCM, instead of assuming track 0.
        val format = (0 until extractor.trackCount)
            .asSequence()
            .map { extractor.getTrackFormat(it) }
            .firstOrNull { it.getString(MediaFormat.KEY_MIME)?.startsWith("audio/") == true }
            ?: run {
                Log.e(Constants.TAG, "No audio track found: $cleanUri")
                return null
            }

        return AudioFormat(
            sampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE),
            channels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT),
            bitDepth = 16 // Most compressed formats decode to 16-bit PCM
        )
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to get audio format: ${e.message}")
        return null
    } finally {
        extractor.release()
    }
}
2034
-
2035
/**
 * Gets the size of the audio file header.
 * For WAV files, this includes the RIFF header and all metadata chunks before the data chunk.
 * For other formats, this will return null as header size handling is format-specific.
 *
 * @param fileUri The URI of the audio file to analyze
 * @return The size of the header in bytes (the offset of the first audio byte), or null if:
 * - The file is not a WAV file
 * - The file cannot be read
 * - The file format is invalid
 * - The data chunk cannot be found
 *
 * WAV File Structure:
 * - RIFF header (12 bytes)
 *   - "RIFF" identifier (4 bytes)
 *   - File size (4 bytes)
 *   - "WAVE" identifier (4 bytes)
 * - Format chunk ("fmt ") (24 bytes typically)
 * - Optional metadata chunks (variable size; odd-sized chunks are followed by one pad byte)
 *   - LIST (metadata like artist, title)
 *   - JUNK (padding)
 *   - fact (additional format info)
 *   - cue (cue points)
 * - Data chunk
 *   - "data" identifier (4 bytes)
 *   - Chunk size (4 bytes)
 *   - Actual audio data
 */
fun getWavHeaderSize(fileUri: String): Int? {
    val cleanUri = fileUri.removePrefix("file://")
    val file = File(cleanUri).takeIf { it.exists() }
        ?: File(filesDir, File(cleanUri).name).takeIf { it.exists() }
        ?: run {
            Log.e(Constants.TAG, "File not found: $cleanUri")
            return null
        }

    try {
        // `use` closes the stream on every exit path, including the early returns below.
        FileInputStream(file).use { inputStream ->
            val buffer = ByteArray(12) // Holds the RIFF header, then each 8-byte chunk header

            // Reads exactly `count` bytes into the start of `buffer`; false on end of file.
            fun readExactly(count: Int): Boolean {
                var filled = 0
                while (filled < count) {
                    val read = inputStream.read(buffer, filled, count - filled)
                    if (read < 0) return false
                    filled += read
                }
                return true
            }

            // Read RIFF header
            if (!readExactly(12)) {
                Log.e(Constants.TAG, "Failed to read RIFF header")
                return null
            }

            // Verify RIFF header
            if (String(buffer, 0, 4) != "RIFF" || String(buffer, 8, 4) != "WAVE") {
                Log.e(Constants.TAG, "Invalid WAV file format")
                return null
            }

            var headerSize = 12L

            // Read chunks until we find the data chunk
            while (true) {
                if (!readExactly(8)) {
                    // The file ended before any data chunk: there is no valid header size.
                    Log.e(Constants.TAG, "Unexpected end of file while reading chunks")
                    return null
                }

                // Chunk size is a little-endian 32-bit value.
                val chunkSize = ((buffer[7].toInt() and 0xFF) shl 24) or
                    ((buffer[6].toInt() and 0xFF) shl 16) or
                    ((buffer[5].toInt() and 0xFF) shl 8) or
                    (buffer[4].toInt() and 0xFF)

                val chunkId = String(buffer, 0, 4)
                Log.d(Constants.TAG, "Found chunk: $chunkId, size: $chunkSize")

                headerSize += 8 // Add chunk header size
                if (chunkId == "data") {
                    if (headerSize > Int.MAX_VALUE) return null
                    Log.d(Constants.TAG, "Found data chunk at offset: $headerSize")
                    Log.d(Constants.TAG, "Total WAV header size: $headerSize bytes")
                    return headerSize.toInt()
                }

                if (chunkSize < 0) {
                    // Sizes of 2 GiB or more do not fit the Int result of this function.
                    Log.e(Constants.TAG, "Invalid WAV file format")
                    return null
                }

                // RIFF chunks are word-aligned: an odd-sized chunk has one trailing pad byte.
                val paddedSize = chunkSize.toLong() + (chunkSize and 1)
                var remaining = paddedSize
                while (remaining > 0) {
                    // skip() may skip fewer bytes than requested, so loop until done.
                    val skipped = inputStream.skip(remaining)
                    if (skipped <= 0) {
                        Log.e(Constants.TAG, "Unexpected end of file while reading chunks")
                        return null
                    }
                    remaining -= skipped
                }
                headerSize += paddedSize
            }
        }
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Error calculating WAV header size: ${e.message}")
        return null
    }
}
2124
-
2125
/**
 * Decodes a specific time range of an audio file directly to PCM data.
 * This is more efficient than decoding the entire file when only a portion is needed.
 *
 * Decoding starts at the sync sample preceding [startTimeMs]; the decoded PCM is then
 * cut to the requested range on whole-frame boundaries (one frame = one sample per channel).
 *
 * @param fileUri Data source understood by `MediaExtractor.setDataSource`.
 * @param startTimeMs Start of the range in milliseconds.
 * @param endTimeMs End of the range in milliseconds.
 * @return The decoded range, or null when there is no audio track, nothing was decoded,
 * or decoding fails.
 */
fun decodeAudioRangeToPCM(fileUri: String, startTimeMs: Long, endTimeMs: Long): AudioData? {
    val extractor = MediaExtractor()
    var decoder: android.media.MediaCodec? = null

    try {
        extractor.setDataSource(fileUri)
        val trackIndex = (0 until extractor.trackCount).find {
            extractor.getTrackFormat(it).getString(MediaFormat.KEY_MIME)?.startsWith("audio/") == true
        } ?: return null

        extractor.selectTrack(trackIndex)
        val format = extractor.getTrackFormat(trackIndex)

        val sampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE)
        val channels = format.getInteger(MediaFormat.KEY_CHANNEL_COUNT)
        val mime = format.getString(MediaFormat.KEY_MIME) ?: return null

        val codec = android.media.MediaCodec.createDecoderByType(mime)
        decoder = codec
        codec.configure(format, null, null, 0)
        codec.start()

        val startTimeUs = startTimeMs * 1000
        val endTimeUs = endTimeMs * 1000
        extractor.seekTo(startTimeUs, MediaExtractor.SEEK_TO_PREVIOUS_SYNC)

        // A byte stream avoids boxing every PCM byte into a MutableList<Byte>.
        val pcmStream = java.io.ByteArrayOutputStream()
        val bufferInfo = android.media.MediaCodec.BufferInfo()
        var inputDone = false
        var outputDone = false
        var firstBufferTimeUs: Long? = null

        // Input and output are tracked separately: after end-of-stream has been queued on
        // the input side, the decoder still holds output that must be drained. Stopping
        // as soon as input ends would drop the tail of the requested range.
        while (!outputDone) {
            if (!inputDone) {
                val inputBufferId = codec.dequeueInputBuffer(10000)
                if (inputBufferId >= 0) {
                    val inputBuffer = checkNotNull(codec.getInputBuffer(inputBufferId)) {
                        "Decoder returned no input buffer"
                    }
                    val sampleSize = extractor.readSampleData(inputBuffer, 0)
                    if (sampleSize < 0 || extractor.sampleTime > endTimeUs) {
                        codec.queueInputBuffer(
                            inputBufferId, 0, 0, 0,
                            android.media.MediaCodec.BUFFER_FLAG_END_OF_STREAM
                        )
                        inputDone = true
                    } else {
                        codec.queueInputBuffer(inputBufferId, 0, sampleSize, extractor.sampleTime, 0)
                        extractor.advance()
                    }
                }
            }

            val outputBufferId = codec.dequeueOutputBuffer(bufferInfo, 10000)
            if (outputBufferId >= 0) {
                if (bufferInfo.size > 0) {
                    val outputBuffer = checkNotNull(codec.getOutputBuffer(outputBufferId)) {
                        "Decoder returned no output buffer"
                    }
                    if (firstBufferTimeUs == null) firstBufferTimeUs = bufferInfo.presentationTimeUs

                    // Restrict the view to the valid region reported by the decoder.
                    outputBuffer.position(bufferInfo.offset)
                    outputBuffer.limit(bufferInfo.offset + bufferInfo.size)
                    val chunk = ByteArray(bufferInfo.size)
                    outputBuffer.get(chunk)
                    pcmStream.write(chunk, 0, chunk.size)
                }
                codec.releaseOutputBuffer(outputBufferId, false)

                if ((bufferInfo.flags and android.media.MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) {
                    outputDone = true
                }
            }
        }

        // If we didn't get any data or first buffer time, return null
        val pcm = pcmStream.toByteArray()
        val firstUs = firstBufferTimeUs ?: return null
        if (pcm.isEmpty()) return null

        // Trim PCM data to the exact time range. Offsets are computed in frames and then
        // scaled by the frame size, so interleaved multi-channel audio is cut at the right
        // position and never in the middle of a frame.
        val bytesPerSample = 2 // 16-bit PCM
        val bytesPerFrame = bytesPerSample * channels
        val totalFrames = (pcm.size / bytesPerFrame).toLong()

        val startFrame = ((startTimeUs - firstUs) * sampleRate / 1_000_000L)
            .coerceIn(0L, totalFrames)
        val endFrame = ((endTimeUs - firstUs) * sampleRate / 1_000_000L)
            .coerceIn(startFrame, totalFrames)

        val trimmedBytes = pcm.copyOfRange(
            (startFrame * bytesPerFrame).toInt(),
            (endFrame * bytesPerFrame).toInt()
        )

        return AudioData(
            data = trimmedBytes,
            sampleRate = sampleRate,
            channels = channels,
            bitDepth = 16, // MediaCodec typically decodes to 16-bit PCM
            durationMs = endTimeMs - startTimeMs
        )
    } catch (e: Exception) {
        Log.e(Constants.TAG, "Failed to decode audio range: ${e.message}", e)
        return null
    } finally {
        try {
            decoder?.stop()
            decoder?.release()
        } catch (e: Exception) {
            Log.w(Constants.TAG, "Error releasing decoder: ${e.message}")
        }

        try {
            extractor.release()
        } catch (e: Exception) {
            Log.w(Constants.TAG, "Error releasing extractor: ${e.message}")
        }
    }
}
2235
- }